ViLT model raising RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu) after updating transformers
I have already placed the model on the GPU before running the code below.
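For context, the model is loaded and moved to the GPU roughly as sketched below; the checkpoint name is a stand-in for the one I actually fine-tune from:

```python
import torch
from transformers import ViltForQuestionAnswering

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Stand-in checkpoint; substitute the actual ViLT VQA checkpoint being fine-tuned
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model.to(device)
```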
The same code runs fine in my other environments, but in this environment I did a fresh install of the Hugging Face transformers library, and since then I have been hitting issues in code that previously worked.
Please help, it would mean a lot to me!
Here is the fine-tuning code:
```python
import torch
from tqdm import tqdm

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
torch.set_grad_enabled(True)  # context manager; enables grad globally here

model.train()
epochList, accList = [], []
for epoch in tqdm(range(20)):
    print(f"Epoch: {epoch}")
    for idx, batch in enumerate(train_dataloader):
        # Move every tensor in the batch onto the model's device
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        print(idx, "-> Loss:", loss.item())
        loss.backward()
        optimizer.step()
        # Evaluate every 200 steps
        if (idx != 0) and (idx % 200 == 0):
            model.eval()
            acc_score_test = calculateAccuracyTest()
            acc_score_val = calculateAccuracyVal()
            print(f"\nValidation Accuracy: {acc_score_val}, Test Accuracy: {acc_score_test}\n")
            epochList.append((epoch * tot_number_of_steps) + idx)
            accList.append((acc_score_test, acc_score_val))
            model.train()
```
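A quick way to verify that the model and the batch actually share a device right before the failing forward call (a sketch against the variables in my loop):

```python
# Sketch: print the device of the model parameters and of each batch tensor
print("model device:", next(model.parameters()).device)
for k, v in batch.items():
    print(k, "->", v.device)
```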
Here is the stack trace:

```
0%  0/20 [00:00<?, ?it/s]
Epoch: 0
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[145], line 18
14 batch = {k:v.to(device) for k,v in batch.items()}
16 optimizer.zero_grad()
---> 18 outputs = model(**batch)
19 loss = outputs.loss
21 print(idx,"-> Loss:", loss.item())
File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None
File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/transformers/models/vilt/modeling_vilt.py:1127, in ViltForQuestionAnswering.forward(self, input_ids, attention_mask, token_type_ids, pixel_values, pixel_mask, head_mask, inputs_embeds, image_embeds, labels, output_attentions, output_hidden_states, return_dict)
1093 r"""
1094 labels (`torch.FloatTensor` of shape `(batch_size, num_labels)`, *optional*):
1095 Labels for computing the visual question answering loss. This tensor must be either a one-hot encoding of
(...)
1123 Predicted answer: 2
1124 ```"""
1125 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1127 outputs = self.vilt(
1128 input_ids,
1129 attention_mask=attention_mask,
1130 token_type_ids=token_type_ids,
1131 pixel_values=pixel_values,
1132 pixel_mask=pixel_mask,
1133 head_mask=head_mask,
1134 inputs_embeds=inputs_embeds,
1135 image_embeds=image_embeds,
1136 output_attentions=output_attentions,
1137 output_hidden_states=output_hidden_states,
1138 return_dict=return_dict,
1139 )
1141 pooler_output = outputs.pooler_output if return_dict else outputs[1]
1143 logits = self.classifier(pooler_output)
File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None
File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/transformers/models/vilt/modeling_vilt.py:829, in ViltModel.forward(self, input_ids, attention_mask, token_type_ids, pixel_values, pixel_mask, head_mask, inputs_embeds, image_embeds, image_token_type_idx, output_attentions, output_hidden_states, return_dict)
822 # Prepare head mask if needed
823 # 1.0 in head_mask indicate we keep the head
824 # attention_probs has shape bsz x n_heads x N x N
825 # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
826 # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
827 head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
--> 829 embedding_output, attention_mask = self.embeddings(
830 input_ids,
831 attention_mask,
832 token_type_ids,
833 pixel_values,
834 pixel_mask,
835 inputs_embeds,
836 image_embeds,
837 image_token_type_idx=image_token_type_idx,
838 )
840 # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
841 # ourselves in which case we just need to make it broadcastable to all heads.
842 extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None
File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/transformers/models/vilt/modeling_vilt.py:219, in ViltEmbeddings.forward(self, input_ids, attention_mask, token_type_ids, pixel_values, pixel_mask, inputs_embeds, image_embeds, image_token_type_idx)
217 # PART 2: patch embeddings (with interpolated position encodings)
218 if image_embeds is None:
--> 219 image_embeds, image_masks, patch_index = self.visual_embed(
220 pixel_values, pixel_mask, max_image_length=self.config.max_image_length
221 )
222 else:
223 image_masks = pixel_mask.flatten(1)
File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/transformers/models/vilt/modeling_vilt.py:186, in ViltEmbeddings.visual_embed(self, pixel_values, pixel_mask, max_image_length)
184 x = x[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)
185 x_mask = x_mask[select[:, 0], select[:, 1]].view(batch_size, -1)
--> 186 patch_index = patch_index[select[:, 0], select[:, 1]].view(batch_size, -1, 2)
187 pos_embed = pos_embed[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)
189 cls_tokens = self.cls_token.expand(batch_size, -1, -1)
RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)
```