The architecture, as printed by PyTorch: 512-dimensional image features are projected into a 256-dimensional embedding space, caption tokens are embedded from a 1004-word vocabulary, two Transformer decoder layers apply masked self-attention over the caption and cross-attention over the projected image features, and a final linear layer maps back to vocabulary scores.
CaptioningTransformer(
  (visual_projection): Linear(in_features=512, out_features=256, bias=True)
  (embedding): Embedding(1004, 256, padding_idx=0)
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiHeadAttention(
          (key): Linear(in_features=256, out_features=256, bias=True)
          (query): Linear(in_features=256, out_features=256, bias=True)
          (value): Linear(in_features=256, out_features=256, bias=True)
          (proj): Linear(in_features=256, out_features=256, bias=True)
          (attn_drop): Dropout(p=0.1, inplace=False)
        )
        (multihead_attn): MultiHeadAttention(
          (key): Linear(in_features=256, out_features=256, bias=True)
          (query): Linear(in_features=256, out_features=256, bias=True)
          (value): Linear(in_features=256, out_features=256, bias=True)
          (proj): Linear(in_features=256, out_features=256, bias=True)
          (attn_drop): Dropout(p=0.1, inplace=False)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (dropout3): Dropout(p=0.1, inplace=False)
        (activation): ReLU()
      )
    )
  )
  (output): Linear(in_features=256, out_features=1004, bias=True)
)
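For reference, here is a minimal sketch of a model that produces essentially this printout, built from standard torch.nn parts. Two details are assumptions not recoverable from the printout: the number of attention heads (nhead=4 here) and the sinusoidal positional encoding internals. Note also that the printout's custom MultiHeadAttention has separate key/query/value linears, while nn.TransformerDecoderLayer fuses them into one nn.MultiheadAttention, so its repr differs in that detail even though the shapes and computation line up.

    # Sketch only: nhead, max_len, and the sinusoidal encoding are assumptions.
    import math
    import torch
    import torch.nn as nn

    class PositionalEncoding(nn.Module):
        # Sinusoidal positional encoding followed by dropout, matching the
        # (positional_encoding) -> (dropout) submodule in the printout.
        def __init__(self, d_model, dropout=0.1, max_len=5000):
            super().__init__()
            self.dropout = nn.Dropout(p=dropout)
            pos = torch.arange(max_len).unsqueeze(1)
            div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
            pe = torch.zeros(1, max_len, d_model)
            pe[0, :, 0::2] = torch.sin(pos * div)
            pe[0, :, 1::2] = torch.cos(pos * div)
            self.register_buffer("pe", pe)

        def forward(self, x):  # x: (N, T, d_model)
            return self.dropout(x + self.pe[:, : x.size(1)])

    class CaptioningTransformer(nn.Module):
        def __init__(self, input_dim=512, embed_dim=256, vocab_size=1004,
                     num_layers=2, nhead=4, dim_feedforward=2048):
            super().__init__()
            self.visual_projection = nn.Linear(input_dim, embed_dim)
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
            self.positional_encoding = PositionalEncoding(embed_dim)
            layer = nn.TransformerDecoderLayer(
                d_model=embed_dim, nhead=nhead,
                dim_feedforward=dim_feedforward, batch_first=True)
            self.transformer = nn.TransformerDecoder(layer, num_layers=num_layers)
            self.output = nn.Linear(embed_dim, vocab_size)

        def forward(self, features, captions):
            # features: (N, 512) image features; captions: (N, T) token ids.
            memory = self.visual_projection(features).unsqueeze(1)    # (N, 1, 256)
            tgt = self.positional_encoding(self.embedding(captions))  # (N, T, 256)
            # Additive causal mask: -inf above the diagonal blocks attention
            # to future caption positions.
            T = captions.size(1)
            causal = torch.triu(torch.full((T, T), float("-inf")), diagonal=1)
            out = self.transformer(tgt, memory, tgt_mask=causal)      # (N, T, 256)
            return self.output(out)                                   # (N, T, 1004)

    model = CaptioningTransformer()
    scores = model(torch.randn(4, 512), torch.randint(1, 1004, (4, 16)))
    print(scores.shape)  # torch.Size([4, 16, 1004])

The single projected image feature acts as a one-element memory sequence for cross-attention; at training time the whole caption is fed in at once and the causal mask enforces left-to-right prediction.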