[LayoutLMv3] index out of range in self inside outputs = model(**encoding)

Thanks @nielsr for the implementation!

I’m trying to fine-tune the model on my own dataset, following this notebook: Google Colab

Training works fine, but at inference time this cell fails:

import torch

# Grab one example from the test split
example = dataset["test"][0]
print(example.keys())

image = example["image"]
words = example["tokens"]
boxes = example["bboxes"]
word_labels = example["ner_tags"]

# The processor tokenizes the words and aligns boxes/labels to the subword tokens
encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
for k, v in encoding.items():
  print(k, v.shape)

with torch.no_grad():
  outputs = model(**encoding)

and the error shown is this:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
\LayoutLMv3.ipynb Cell 42 in <cell line: 1>()
      1 with torch.no_grad():
----> 2   outputs = model(**encoding)

File \.venv\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File \.venv\lib\site-packages\transformers\models\layoutlmv3\modeling_layoutlmv3.py:1025, in LayoutLMv3ForTokenClassification.forward(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, pixel_values)
    995 r"""
    996 labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    997     Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
   (...)
   1021 >>> logits = outputs.logits
   1022 ```"""
   1023 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1025 outputs = self.layoutlmv3(
   1026     input_ids,
   1027     bbox=bbox,
   1028     attention_mask=attention_mask,
   1029     token_type_ids=token_type_ids,
   1030     position_ids=position_ids,
   1031     head_mask=head_mask,
   1032     inputs_embeds=inputs_embeds,
   1033     output_attentions=output_attentions,
   1034     output_hidden_states=output_hidden_states,
   1035     return_dict=return_dict,
   1036     pixel_values=pixel_values,
   1037 )
   1038 if input_ids is not None:
   1039     input_shape = input_ids.size()

File \.venv\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File \.venv\lib\site-packages\transformers\models\layoutlmv3\modeling_layoutlmv3.py:833, in LayoutLMv3Model.forward(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, pixel_values, output_attentions, output_hidden_states, return_dict)
    830     if bbox is None:
    831         bbox = torch.zeros(tuple(list(input_shape) + [4]), dtype=torch.long, device=device)
--> 833     embedding_output = self.embeddings(
    834         input_ids=input_ids,
    835         bbox=bbox,
    836         position_ids=position_ids,
    837         token_type_ids=token_type_ids,
    838         inputs_embeds=inputs_embeds,
    839     )
    841 final_bbox = final_position_ids = None
    842 patch_height = patch_width = None

File \.venv\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File \.venv\lib\site-packages\transformers\models\layoutlmv3\modeling_layoutlmv3.py:261, in LayoutLMv3TextEmbeddings.forward(self, input_ids, bbox, token_type_ids, position_ids, inputs_embeds)
    258 token_type_embeddings = self.token_type_embeddings(token_type_ids)
    260 embeddings = inputs_embeds + token_type_embeddings
--> 261 position_embeddings = self.position_embeddings(position_ids)
    262 embeddings += position_embeddings
    264 spatial_position_embeddings = self.calculate_spatial_position_embeddings(bbox)

File \.venv\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File \.venv\lib\site-packages\torch\nn\modules\sparse.py:158, in Embedding.forward(self, input)
    157 def forward(self, input: Tensor) -> Tensor:
--> 158     return F.embedding(
    159         input, self.weight, self.padding_idx, self.max_norm,
    160         self.norm_type, self.scale_grad_by_freq, self.sparse)

File \.venv\lib\site-packages\torch\nn\functional.py:2199, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   2193     # Note [embedding_renorm set_grad_enabled]
   2194     # XXX: equivalent to
   2195     # with torch.no_grad():
   2196     #   torch.embedding_renorm_
   2197     # remove once script supports set_grad_enabled
   2198     _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2199 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

IndexError: index out of range in self
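
For what it’s worth, the frame that actually fails is `position_embeddings = self.position_embeddings(position_ids)`, which is a plain `nn.Embedding` lookup, so this IndexError means some position id is at least as large as the position-embedding table. A minimal sketch that reproduces the same error (the 512/768 sizes here are illustrative only, not read from the model):

import torch
import torch.nn as nn

# Illustrative sizes only: a 512-row embedding table indexed past its end
position_embeddings = nn.Embedding(512, 768)
position_ids = torch.arange(600).unsqueeze(0)  # sequence longer than the table

position_embeddings(position_ids)  # IndexError: index out of range in self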

NOTE: I built my own dataset generator.
2nd NOTE: each image can have 200-400 labeled tokens.
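
The second note may be the culprit: the processor splits each word into one or more subword tokens, so 200-400 words can easily expand past the tokenizer’s 512-token limit, which would index past the position-embedding table exactly as reproduced above. If that is the cause, a sketch of a workaround (assuming these kwargs are forwarded to the tokenizer, as the processor normally does) is to truncate and pad in the processor call:

encoding = processor(
    image,
    words,
    boxes=boxes,
    word_labels=word_labels,
    truncation=True,          # drop tokens beyond max_length
    padding="max_length",     # pad shorter sequences up to max_length
    max_length=512,
    return_tensors="pt",
)
print(encoding["input_ids"].shape)  # should now be (1, 512)

Truncation loses the trailing tokens, so for long documents a sliding window (return_overflowing_tokens=True with a stride) may be preferable, but truncating is the quickest way to confirm the diagnosis.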

I hope there is a way to solve this.
Thank you in advance!

@Fully
Were you able to solve this issue? If so, could you share your solution for the above error?