I’m trying to run the VisualBERT multiple-choice (MCQ) example as given on the Hugging Face model docs page. I generated the visual embeddings with ResNet, but the model’s forward pass produces a RuntimeError. Please help.
import torch
from torch import nn
from torchvision import models, transforms
from PIL import Image as img
# Define pre-trained ResNet model and freeze convolutional layers
resnet_model = models.resnet18(pretrained=True)
for param in resnet_model.parameters():
    param.requires_grad = False
# Define transformation for image pre-processing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
# Function to generate embedding for a single image
def get_visual_embeddings(image):
    # Preprocess image and forward through ResNet
    image = transform(image)
    image = image.unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        embedding = resnet_model(image)[0]  # Extract features from output
    return embedding
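Out of curiosity, I checked what this function returns for a single image (the file name below is just a placeholder; as far as I understand, ResNet-18’s final fc layer gives 1000 class logits per image):

test_embedding = get_visual_embeddings(img.open("some_image.jpg"))  # placeholder path, any RGB image
print(test_embedding.shape)  # torch.Size([1000]) for me -- a single 1-D vector per image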
# Assumption: *get_visual_embeddings(image)* gets the visual embeddings of the image in the batch.
from transformers import AutoTokenizer, VisualBertForMultipleChoice
import torch
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForMultipleChoice.from_pretrained("uclanlp/visualbert-vcr")
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
visual_embeds = get_visual_embeddings(image)
# (batch_size, num_choices, visual_seq_length, visual_embedding_dim)
visual_embeds = visual_embeds.expand(1, 2, *visual_embeds.shape)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors="pt", padding=True)
# batch size is 1
inputs_dict = {k: v.unsqueeze(0) for k, v in encoding.items()}
inputs_dict.update(
    {
        "visual_embeds": visual_embeds,
        "visual_attention_mask": visual_attention_mask,
        "visual_token_type_ids": visual_token_type_ids,
        "labels": labels,
    }
)
outputs = model(**inputs_dict)
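For reference, these are the shapes I end up passing to the model (seq_len stands for whatever length the tokenizer pads the two prompt/choice pairs to; I haven’t noted the exact number):

print(inputs_dict["input_ids"].shape)              # torch.Size([1, 2, seq_len])
print(inputs_dict["attention_mask"].shape)         # torch.Size([1, 2, seq_len])
print(inputs_dict["visual_embeds"].shape)          # torch.Size([1, 2, 1000]) -- only 3-D, since my embedding is a 1-D vector
print(inputs_dict["visual_attention_mask"].shape)  # torch.Size([1, 2])
print(inputs_dict["visual_token_type_ids"].shape)  # torch.Size([1, 2])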
But when I do the forward pass, it produces the error “RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 2 but got size 1 for tensor number 1 in the list.” inside a torch.cat call. I can’t work out where it is going wrong. Please help.
Here is the whole stack trace:
RuntimeError Traceback (most recent call last)
Cell In[58], line 1
----> 1 outputs = model(**inputs_dict)
File ~/miniconda3/envs/blip_vqa_base_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File ~/miniconda3/envs/blip_vqa_base_env/lib/python3.8/site-packages/transformers/models/visual_bert/modeling_visual_bert.py:1131, in VisualBertForMultipleChoice.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, visual_embeds, visual_attention_mask, visual_token_type_ids, image_text_alignment, output_attentions, output_hidden_states, return_dict, labels)
1120 visual_attention_mask = (
1121 visual_attention_mask.view(-1, visual_attention_mask.size(-1))
1122 if visual_attention_mask is not None
1123 else None
1124 )
1125 visual_token_type_ids = (
1126 visual_token_type_ids.view(-1, visual_token_type_ids.size(-1))
1127 if visual_token_type_ids is not None
1128 else None
1129 )
-> 1131 outputs = self.visual_bert(
1132 input_ids,
1133 attention_mask=attention_mask,
1134 token_type_ids=token_type_ids,
1135 position_ids=position_ids,
1136 head_mask=head_mask,
1137 inputs_embeds=inputs_embeds,
1138 visual_embeds=visual_embeds,
1139 visual_attention_mask=visual_attention_mask,
1140 visual_token_type_ids=visual_token_type_ids,
1141 image_text_alignment=image_text_alignment,
1142 output_attentions=output_attentions,
1143 output_hidden_states=output_hidden_states,
1144 return_dict=return_dict,
1145 )
1147 _, pooled_output = outputs[0], outputs[1]
1149 pooled_output = self.dropout(pooled_output)
File ~/miniconda3/envs/blip_vqa_base_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File ~/miniconda3/envs/blip_vqa_base_env/lib/python3.8/site-packages/transformers/models/visual_bert/modeling_visual_bert.py:796, in VisualBertModel.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, visual_embeds, visual_attention_mask, visual_token_type_ids, image_text_alignment, output_attentions, output_hidden_states, return_dict)
793 # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
794 # ourselves in which case we just need to make it broadcastable to all heads.
795 if visual_embeds is not None:
--> 796 combined_attention_mask = torch.cat((attention_mask, visual_attention_mask), dim=-1)
797 extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
798 combined_attention_mask, (batch_size, input_shape + visual_input_shape)
799 )
801 else:
RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 2 but got size 1 for tensor number 1 in the list.
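From the trace, it looks like the failure happens when the text and visual attention masks are concatenated after the model flattens them. A rough standalone sketch of what I think it ends up doing (seq_len is again just a placeholder for the padded text length, so this is my guess at the shapes, not a verified reproduction):

import torch
seq_len = 20                                 # placeholder for the tokenizer's padded length
text_mask = torch.ones(2, seq_len)           # attention_mask after view(-1, seq_len): batch_size * num_choices = 2 rows
visual_mask = torch.ones(1, 2)               # visual_attention_mask after view(-1, 2): still only 1 row
torch.cat((text_mask, visual_mask), dim=-1)  # raises the same "Expected size 2 but got size 1" RuntimeError for me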
Please let me know where I’m going wrong.