ViLT RuntimeError after updating transformers: indices should be either on cpu or on the same device as the indexed tensor (cpu)

The ViLT model raises RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu) after I updated the transformers library.

I have already placed the model on the GPU and am running the code below.
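For context, the model placement looks roughly like this (a sketch from memory; the actual checkpoint name and setup may differ):

import torch
from transformers import ViltForQuestionAnswering

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# hypothetical checkpoint name, shown only for illustration
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model.to(device)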

The same code runs fine in my other environments, but in my current environment I freshly installed the Hugging Face transformers library, and since then I have been running into issues with the same code.
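Since the only thing that changed is the freshly installed library, here is a quick check of the installed versions (standard version attributes, nothing environment-specific):

import torch
import transformers

# print the library versions in the current environment
print("transformers:", transformers.__version__)
print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())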

Please help; it would mean a lot to me.

Here is the fine-tuning code (torch, tqdm, model, device, train_dataloader, tot_number_of_steps, and the accuracy helpers are defined in earlier cells):

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
torch.set_grad_enabled(True)  # make sure gradient tracking is on
model.train()

epochList, accList = [], []  # step indices and (test, val) accuracy pairs


for epoch in tqdm(range(20)):

    print(f"Epoch: {epoch}")

    for idx, batch in enumerate(train_dataloader):

        batch = {k: v.to(device) for k, v in batch.items()}  # move all batch tensors to the model's device

        optimizer.zero_grad()

        outputs = model(**batch)
        loss = outputs.loss

        print(idx, "-> Loss:", loss.item())

        loss.backward()
        optimizer.step()


        if (idx != 0) and (idx % 200 == 0):  # evaluate every 200 steps

            model.eval()

            acc_score_test = calculateAccuracyTest()
            acc_score_val = calculateAccuracyVal()

            print(f"\nValidation Accuracy: {acc_score_val}, Test Accuracy: {acc_score_test}\n")

            epochList.append((epoch * tot_number_of_steps) + idx)
            accList.append((acc_score_test, acc_score_val))

            model.train()
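As a sanity check (a debugging sketch, not part of the original training code), I can confirm that every tensor in a batch and the model parameters sit on the same device:

# grab one batch and print where each tensor and the model live
sample = next(iter(train_dataloader))
for k, v in sample.items():
    print(k, getattr(v, "device", "not a tensor"))
print("model is on:", next(model.parameters()).device)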

The stack trace is:

0% | 0/20 [00:00<?, ?it/s]

Epoch: 0

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[145], line 18
     14 batch = {k:v.to(device) for k,v in batch.items()}
     16 optimizer.zero_grad()
---> 18 outputs = model(**batch)
     19 loss = outputs.loss
     21 print(idx,"-> Loss:", loss.item())

File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/transformers/models/vilt/modeling_vilt.py:1127, in ViltForQuestionAnswering.forward(self, input_ids, attention_mask, token_type_ids, pixel_values, pixel_mask, head_mask, inputs_embeds, image_embeds, labels, output_attentions, output_hidden_states, return_dict)
   1093 r"""
   1094 labels (`torch.FloatTensor` of shape `(batch_size, num_labels)`, *optional*):
   1095     Labels for computing the visual question answering loss. This tensor must be either a one-hot encoding of
   (...)
   1123 Predicted answer: 2
   1124 ```"""
   1125 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1127 outputs = self.vilt(
   1128     input_ids,
   1129     attention_mask=attention_mask,
   1130     token_type_ids=token_type_ids,
   1131     pixel_values=pixel_values,
   1132     pixel_mask=pixel_mask,
   1133     head_mask=head_mask,
   1134     inputs_embeds=inputs_embeds,
   1135     image_embeds=image_embeds,
   1136     output_attentions=output_attentions,
   1137     output_hidden_states=output_hidden_states,
   1138     return_dict=return_dict,
   1139 )
   1141 pooler_output = outputs.pooler_output if return_dict else outputs[1]
   1143 logits = self.classifier(pooler_output)

File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/transformers/models/vilt/modeling_vilt.py:829, in ViltModel.forward(self, input_ids, attention_mask, token_type_ids, pixel_values, pixel_mask, head_mask, inputs_embeds, image_embeds, image_token_type_idx, output_attentions, output_hidden_states, return_dict)
    822 # Prepare head mask if needed
    823 # 1.0 in head_mask indicate we keep the head
    824 # attention_probs has shape bsz x n_heads x N x N
    825 # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
    826 # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
    827 head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
--> 829 embedding_output, attention_mask = self.embeddings(
    830     input_ids,
    831     attention_mask,
    832     token_type_ids,
    833     pixel_values,
    834     pixel_mask,
    835     inputs_embeds,
    836     image_embeds,
    837     image_token_type_idx=image_token_type_idx,
    838 )
    840 # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
    841 # ourselves in which case we just need to make it broadcastable to all heads.
    842 extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)

File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/transformers/models/vilt/modeling_vilt.py:219, in ViltEmbeddings.forward(self, input_ids, attention_mask, token_type_ids, pixel_values, pixel_mask, inputs_embeds, image_embeds, image_token_type_idx)
    217 # PART 2: patch embeddings (with interpolated position encodings)
    218 if image_embeds is None:
--> 219     image_embeds, image_masks, patch_index = self.visual_embed(
    220         pixel_values, pixel_mask, max_image_length=self.config.max_image_length
    221     )
    222 else:
    223     image_masks = pixel_mask.flatten(1)

File ~/miniconda3/envs/yolo/lib/python3.11/site-packages/transformers/models/vilt/modeling_vilt.py:186, in ViltEmbeddings.visual_embed(self, pixel_values, pixel_mask, max_image_length)
    184 x = x[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)
    185 x_mask = x_mask[select[:, 0], select[:, 1]].view(batch_size, -1)
--> 186 patch_index = patch_index[select[:, 0], select[:, 1]].view(batch_size, -1, 2)
    187 pos_embed = pos_embed[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)
    189 cls_tokens = self.cls_token.expand(batch_size, -1, -1)

RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)
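For reference, the failing line indexes patch_index with select, and PyTorch raises this exact message when a tensor living on the CPU is indexed with CUDA index tensors. A minimal sketch that reproduces the message (assuming a CUDA device is available):

import torch

x = torch.arange(6).reshape(2, 3)           # indexed tensor stays on the CPU
idx = torch.tensor([0, 1], device="cuda")   # index tensor lives on the GPU

try:
    _ = x[idx]
except RuntimeError as e:
    # indices should be either on cpu or on the same device as the indexed tensor (cpu)
    print(e)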