Hi all,
I started looking at the modules in the Flan-T5 model that I have downloaded, trying to understand how multi-head attention is implemented there. All I can see are self-attention layers with plain q/k/v/o Linear projections, and nothing that looks like separate heads. Can anyone please explain where the multiple heads are in the printout below?
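For reference, this is roughly how I'm loading the model (I'm guessing the small checkpoint from the dimensions, since d_model is 512; substitute the local path to your own download if needed):

from transformers import T5ForConditionalGeneration

# assuming the flan-t5-small checkpoint; replace with the path to your local download
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")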
print(model.encoder)
T5Stack(
  (embed_tokens): Embedding(32128, 512)
  (block): ModuleList(
    (0): T5Block(
      (layer): ModuleList(
        (0): T5LayerSelfAttention(
          (SelfAttention): T5Attention(
            (q): Linear(in_features=512, out_features=384, bias=False)
            (k): Linear(in_features=512, out_features=384, bias=False)
            (v): Linear(in_features=512, out_features=384, bias=False)
            (o): Linear(in_features=384, out_features=512, bias=False)
            (relative_attention_bias): Embedding(32, 6)
          )
          (layer_norm): T5LayerNorm()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (1): T5LayerFF(
          (DenseReluDense): T5DenseGatedActDense(
            (wi_0): Linear(in_features=512, out_features=1024, bias=False)
            (wi_1): Linear(in_features=512, out_features=1024, bias=False)
            (wo): Linear(in_features=1024, out_features=512, bias=False)
            (dropout): Dropout(p=0.1, inplace=False)
            (act): NewGELUActivation()
          )
          (layer_norm): T5LayerNorm()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (1-7): 7 x T5Block(
      (layer): ModuleList(
        (0): T5LayerSelfAttention(
          (SelfAttention): T5Attention(
            (q): Linear(in_features=512, out_features=384, bias=False)
            (k): Linear(in_features=512, out_features=384, bias=False)
            (v): Linear(in_features=512, out_features=384, bias=False)
            (o): Linear(in_features=384, out_features=512, bias=False)
          )
          (layer_norm): T5LayerNorm()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (1): T5LayerFF(
          (DenseReluDense): T5DenseGatedActDense(
            (wi_0): Linear(in_features=512, out_features=1024, bias=False)
            (wi_1): Linear(in_features=512, out_features=1024, bias=False)
            (wo): Linear(in_features=1024, out_features=512, bias=False)
            (dropout): Dropout(p=0.1, inplace=False)
            (act): NewGELUActivation()
          )
          (layer_norm): T5LayerNorm()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (final_layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
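To make the question more concrete, here is my rough guess at what is going on (just a sketch, assuming num_heads=6 and d_kv=64 from the small config, which would explain out_features=384 = 6 * 64 on q/k/v). Is the multi-head part simply a reshape of these packed projections inside T5Attention, rather than a separate module per head?

import torch

# assumed config values for flan-t5-small: d_model=512, num_heads=6, d_kv=64
print(model.config.num_heads, model.config.d_kv)       # expecting: 6 64

attn = model.encoder.block[0].layer[0].SelfAttention   # the T5Attention printed above
hidden = torch.randn(1, 10, 512)                       # (batch, seq_len, d_model)

q = attn.q(hidden)                                     # (1, 10, 384): all 6 heads packed into one Linear
q_heads = q.view(1, 10, 6, 64).transpose(1, 2)         # (1, 6, 10, 64): one 64-dim slice per head

print(q_heads.shape)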