[LayoutLMv3] index out of range in self inside outputs = model(**encoding)

Thanks @nielsr for the implementation!

I’m trying to fine-tune the model on my own dataset following this: Google Colab

Training works fine, but when running inference, this cell doesn’t work:

example = dataset["test"][0]
print(example.keys())

image = example["image"]
words = example["tokens"]
boxes = example["bboxes"]
word_labels = example["ner_tags"]

encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
for k,v in encoding.items():
  print(k,v.shape)

with torch.no_grad():
  outputs = model(**encoding)

and the error shown is this:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
\LayoutLMv3.ipynb Cell 42 in <cell line: 1>()
      1 with torch.no_grad():
----> 2   outputs = model(**encoding)

File \.venv\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File \.venv\lib\site-packages\transformers\models\layoutlmv3\modeling_layoutlmv3.py:1025, in LayoutLMv3ForTokenClassification.forward(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, pixel_values)
    995 r"""
    996 labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    997     Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
   (...)
   1021 >>> logits = outputs.logits
   1022 ```"""
   1023 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1025 outputs = self.layoutlmv3(
   1026     input_ids,
   1027     bbox=bbox,
   1028     attention_mask=attention_mask,
   1029     token_type_ids=token_type_ids,
   1030     position_ids=position_ids,
   1031     head_mask=head_mask,
   1032     inputs_embeds=inputs_embeds,
   1033     output_attentions=output_attentions,
   1034     output_hidden_states=output_hidden_states,
   1035     return_dict=return_dict,
   1036     pixel_values=pixel_values,
   1037 )
   1038 if input_ids is not None:
   1039     input_shape = input_ids.size()

File \.venv\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File \.venv\lib\site-packages\transformers\models\layoutlmv3\modeling_layoutlmv3.py:833, in LayoutLMv3Model.forward(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, pixel_values, output_attentions, output_hidden_states, return_dict)
    830     if bbox is None:
    831         bbox = torch.zeros(tuple(list(input_shape) + [4]), dtype=torch.long, device=device)
--> 833     embedding_output = self.embeddings(
    834         input_ids=input_ids,
    835         bbox=bbox,
    836         position_ids=position_ids,
    837         token_type_ids=token_type_ids,
    838         inputs_embeds=inputs_embeds,
    839     )
    841 final_bbox = final_position_ids = None
    842 patch_height = patch_width = None

File \.venv\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File \.venv\lib\site-packages\transformers\models\layoutlmv3\modeling_layoutlmv3.py:261, in LayoutLMv3TextEmbeddings.forward(self, input_ids, bbox, token_type_ids, position_ids, inputs_embeds)
    258 token_type_embeddings = self.token_type_embeddings(token_type_ids)
    260 embeddings = inputs_embeds + token_type_embeddings
--> 261 position_embeddings = self.position_embeddings(position_ids)
    262 embeddings += position_embeddings
    264 spatial_position_embeddings = self.calculate_spatial_position_embeddings(bbox)

File \.venv\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File \.venv\lib\site-packages\torch\nn\modules\sparse.py:158, in Embedding.forward(self, input)
    157 def forward(self, input: Tensor) -> Tensor:
--> 158     return F.embedding(
    159         input, self.weight, self.padding_idx, self.max_norm,
    160         self.norm_type, self.scale_grad_by_freq, self.sparse)

File \.venv\lib\site-packages\torch\nn\functional.py:2199, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   2193     # Note [embedding_renorm set_grad_enabled]
   2194     # XXX: equivalent to
   2195     # with torch.no_grad():
   2196     #   torch.embedding_renorm_
   2197     # remove once script supports set_grad_enabled
   2198     _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2199 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

IndexError: index out of range in self

NOTE: I built my own dataset generator.
2nd NOTE: Each image can contain 200-400 labeled tokens.
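
Since 200-400 words can easily tokenize into more than 512 subword tokens, one thing worth comparing (a small debugging sketch, using the encoding and model from the cells above) is the sequence length against the model's position-embedding table:

seq_len = encoding["input_ids"].shape[1]
max_pos = model.config.max_position_embeddings  # 514 for layoutlmv3-base: 512 text positions + 2 reserved padding positions
print(f"sequence length {seq_len} vs. {max_pos} position embeddings")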

I hope there is a way to solve this.
Thank you in advance!

@Fully
Were you able to solve this issue? Can you share your solution for the error above?

Hi @Fully, the embedding layer in the model is not accepting the input IDs in your data sample. This generally happens when a data sample is longer than 512 tokens; you have to set the truncation parameter to True so that the sequence length never exceeds 512.
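
For example, the original inference cell could pass the truncation arguments through the processor. This is just a sketch that reuses the processor, model, and example variables from the notebook above; truncation, padding and max_length are the standard tokenizer keyword arguments:

import torch

example = dataset["test"][0]
encoding = processor(
    example["image"],
    example["tokens"],
    boxes=example["bboxes"],
    word_labels=example["ner_tags"],
    truncation=True,        # drop subword tokens beyond max_length
    padding="max_length",   # optional: pad shorter sequences up to max_length
    max_length=512,         # LayoutLMv3's maximum text sequence length
    return_tensors="pt",
)

with torch.no_grad():
    outputs = model(**encoding)

Note that truncation simply discards the words past the 512-token limit, so very long pages may need to be split into overlapping chunks instead of being truncated.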

Hi @purnasai, I’m trying the test model “hf-tiny-model-private/tiny-random-LayoutLMv3ForQuestionAnswering” and facing the same error.

My processor call is:
encoding = processor(image, question, words, boxes=boxes, max_length=512, padding="max_length", truncation=True, return_tensors="pt")

I already tried truncation=True, but I still get the error.
My words and boxes output is shown below; the same parameters work fine with the “rubentito/layoutlmv3-base-mpdocvqa” model.

Could you help me with this error?
Thanks
Length of Words: 190
Length of Boxes: 190
boxes: [[653, 24, 731, 49], [667, 43, 885, 75], [646, 93, 923, 111], [399, 121, 594, 142], [200, 184, 337, 199], [443, 184, 511, 196], [200, 201, 271, 213], [443, 202, 483, 212], [200, 218, 271, 230], [443, 219, 535, 231], [200, 235, 295, 247], [443, 235, 504, 247], [200, 252, 358, 268], [443, 252, 519, 264], [202, 271, 295, 282], [443, 269, 612, 285], [110, 346, 181, 355], [233, 345, 299, 357], [354, 345, 420, 357], [467, 345, 526, 355], [571, 345, 629, 355], [718, 345, 756, 355], [855, 346, 891, 355], [253, 357, 281, 366], [375, 357, 404, 366], [477, 357, 516, 368], [581, 357, 620, 368], [725, 357, 749, 369], [862, 357, 886, 369], [232, 369, 303, 378], [351, 367, 422, 381], [460, 369, 533, 378], [563, 367, 635, 381], [722, 371, 748, 378], [844, 369, 902, 378], [474, 381, 541, 390], [119, 392, 141, 402], [175, 393, 196, 402], [241, 392, 262, 402], [306, 393, 326, 402], [361, 392, 384, 402], [417, 393, 438, 402], [476, 392, 516, 402], [579, 392, 620, 402], [655, 393, 668, 401], [729, 392, 768, 402], [874, 392, 900, 402], [121, 404, 144, 413], [175, 405, 196, 413], [242, 404, 264, 413], [306, 405, 326, 414], [363, 404, 386, 413], [417, 405, 438, 413], [477, 404, 516, 414], [579, 404, 620, 414], [729, 404, 767, 413], [874, 404, 900, 413], [122, 416, 139, 426], [175, 417, 196, 425], [244, 416, 259, 425], [306, 417, 326, 425], [364, 416, 381, 425], [417, 416, 438, 425], [483, 416, 508, 425], [587, 416, 613, 426], [654, 417, 669, 424], [735, 416, 760, 426], [829, 416, 845, 425], [871, 416, 904, 426], [123, 428, 139, 437], [175, 428, 196, 437], [245, 428, 261, 437], [306, 428, 326, 437], [363, 428, 383, 437], [417, 428, 438, 437], [483, 428, 508, 437], [586, 428, 613, 438], [654, 428, 668, 436], [736, 428, 760, 437], [829, 428, 845, 436], [872, 428, 903, 437], [127, 441, 133, 448], [175, 440, 196, 448], [247, 441, 255, 448], [307, 440, 327, 449], [369, 441, 377, 448], [417, 440, 438, 448], [476, 439, 516, 449], [581, 439, 620, 449], [729, 439, 768, 449], [874, 439, 899, 449], [128, 452, 136, 460], [175, 452, 196, 460], [250, 452, 257, 460], [307, 452, 326, 461], [366, 451, 379, 460], [417, 452, 438, 460], [476, 451, 516, 461], [579, 451, 620, 461], [654, 452, 671, 460], [730, 451, 768, 461], [874, 451, 900, 461], [176, 463, 195, 475], [308, 463, 327, 475], [344, 463, 400, 473], [418, 464, 437, 473], [467, 463, 525, 473], [572, 463, 627, 473], [652, 463, 675, 472], [720, 463, 775, 473], [825, 463, 849, 472], [859, 463, 915, 473], [177, 475, 193, 484], [308, 475, 325, 484], [344, 474, 400, 485], [418, 475, 437, 484], [466, 474, 525, 485], [572, 474, 627, 485], [652, 475, 675, 483], [720, 474, 775, 485], [824, 475, 849, 483], [859, 474, 915, 485], [462, 485, 552, 498], [122, 498, 137, 508], [243, 498, 259, 508], [364, 498, 381, 508], [478, 498, 516, 508], [587, 498, 613, 508], [656, 500, 668, 506], [729, 498, 768, 508], [875, 498, 902, 508], [123, 509, 140, 520], [245, 509, 261, 520], [366, 510, 383, 520], [477, 510, 515, 520], [588, 510, 612, 520], [655, 511, 672, 519], [729, 510, 767, 520], [875, 510, 900, 520], [120, 521, 140, 532], [241, 522, 260, 531], [362, 521, 383, 532], [477, 521, 516, 532], [587, 521, 613, 532], [729, 521, 768, 532], [882, 522, 899, 531], [121, 534, 141, 543], [239, 534, 264, 544], [361, 534, 384, 544], [476, 534, 515, 543], [588, 534, 612, 543], [729, 534, 767, 543], [875, 534, 899, 543], [119, 546, 141, 555], [175, 546, 195, 555], [239, 546, 263, 555], [361, 546, 384, 555], [476, 545, 516, 555], [588, 546, 612, 555], [727, 545, 770, 556], [872, 545, 906, 556], [120, 
557, 144, 566], [175, 558, 195, 566], [242, 557, 264, 566], [306, 557, 327, 566], [362, 557, 386, 566], [417, 558, 438, 566], [476, 557, 515, 566], [587, 557, 612, 566], [729, 557, 767, 566], [871, 557, 904, 567], [88, 640, 600, 658], [89, 658, 715, 672], [719, 660, 778, 670], [90, 675, 494, 692], [89, 691, 801, 709], [89, 711, 753, 726], [750, 709, 840, 727], [90, 728, 165, 739], [89, 744, 458, 760], [89, 762, 235, 776], [237, 766, 245, 772], [246, 762, 350, 776], [356, 762, 580, 777], [598, 763, 638, 774], [91, 809, 170, 823], [200, 807, 276, 820], [331, 825, 441, 840], [305, 545, 328, 555], [418, 546, 439, 555]]
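
One way to narrow this down (a debugging sketch, assuming the model and encoding variables from your script): any input id, position index, or bbox value that is not smaller than the corresponding embedding table raises exactly this IndexError, and tiny-random test checkpoints may use much smaller tables than the base model.

cfg = model.config
print("sequence length:", encoding["input_ids"].shape[1], "| max_position_embeddings:", cfg.max_position_embeddings)
print("max input id:", encoding["input_ids"].max().item(), "| vocab_size:", cfg.vocab_size)
print("max bbox value:", encoding["bbox"].max().item(), "| max_2d_position_embeddings:", cfg.max_2d_position_embeddings)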

For some reason, the coordinates are in a different order than LayoutLMv3 expects. Adding these lines fixed the issue for me:

import torch

def reorder_bbox_cols(bbox):
    # Swap the two coordinate pairs of each box: (c0, c1, c2, c3) -> (c2, c3, c0, c1)
    return torch.index_select(bbox, 2, torch.LongTensor([2, 3, 0, 1]))

encoding['bbox'] = reorder_bbox_cols(encoding['bbox'])
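
Related to that: the same IndexError can also come from bbox values outside the range the 2D position embeddings accept (indices must be non-negative and below config.max_2d_position_embeddings, 1024 by default, with boxes conventionally normalized to 0-1000). A small sanity check, where check_bboxes is just a hypothetical helper:

import torch

def check_bboxes(bbox, max_2d=1024):
    # bbox: LongTensor of shape (batch, seq_len, 4) in [x_min, y_min, x_max, y_max] order
    assert bbox.min() >= 0 and bbox.max() < max_2d, (
        f"bbox values must lie in [0, {max_2d - 1}], got {bbox.min().item()}..{bbox.max().item()}"
    )
    x0, y0, x1, y1 = bbox.unbind(-1)
    assert torch.all(x1 >= x0) and torch.all(y1 >= y0), "expected [x_min, y_min, x_max, y_max] ordering"

check_bboxes(encoding["bbox"], max_2d=model.config.max_2d_position_embeddings)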