Thanks @nielsr for the implementation!
I’m trying to fine-tune the model on my own dataset, following this notebook: Google Colab
Training works fine, but at inference time this cell fails:
example = dataset["test"][0]
print(example.keys())

image = example["image"]
words = example["tokens"]
boxes = example["bboxes"]
word_labels = example["ner_tags"]

encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
for k, v in encoding.items():
    print(k, v.shape)

with torch.no_grad():
    outputs = model(**encoding)
and this is the error I get:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
\LayoutLMv3.ipynb Cell 42 in <cell line: 1>()
1 with torch.no_grad():
----> 2 outputs = model(**encoding)
File \.venv\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File \.venv\lib\site-packages\transformers\models\layoutlmv3\modeling_layoutlmv3.py:1025, in LayoutLMv3ForTokenClassification.forward(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, pixel_values)
995 r"""
996 labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
997 Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
(...)
1021 >>> logits = outputs.logits
1022 ```"""
1023 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1025 outputs = self.layoutlmv3(
1026 input_ids,
1027 bbox=bbox,
1028 attention_mask=attention_mask,
1029 token_type_ids=token_type_ids,
1030 position_ids=position_ids,
1031 head_mask=head_mask,
1032 inputs_embeds=inputs_embeds,
1033 output_attentions=output_attentions,
1034 output_hidden_states=output_hidden_states,
1035 return_dict=return_dict,
1036 pixel_values=pixel_values,
1037 )
1038 if input_ids is not None:
1039 input_shape = input_ids.size()
File \.venv\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File \.venv\lib\site-packages\transformers\models\layoutlmv3\modeling_layoutlmv3.py:833, in LayoutLMv3Model.forward(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, pixel_values, output_attentions, output_hidden_states, return_dict)
830 if bbox is None:
831 bbox = torch.zeros(tuple(list(input_shape) + [4]), dtype=torch.long, device=device)
--> 833 embedding_output = self.embeddings(
834 input_ids=input_ids,
835 bbox=bbox,
836 position_ids=position_ids,
837 token_type_ids=token_type_ids,
838 inputs_embeds=inputs_embeds,
839 )
841 final_bbox = final_position_ids = None
842 patch_height = patch_width = None
File \.venv\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File \.venv\lib\site-packages\transformers\models\layoutlmv3\modeling_layoutlmv3.py:261, in LayoutLMv3TextEmbeddings.forward(self, input_ids, bbox, token_type_ids, position_ids, inputs_embeds)
258 token_type_embeddings = self.token_type_embeddings(token_type_ids)
260 embeddings = inputs_embeds + token_type_embeddings
--> 261 position_embeddings = self.position_embeddings(position_ids)
262 embeddings += position_embeddings
264 spatial_position_embeddings = self.calculate_spatial_position_embeddings(bbox)
File \.venv\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File \.venv\lib\site-packages\torch\nn\modules\sparse.py:158, in Embedding.forward(self, input)
157 def forward(self, input: Tensor) -> Tensor:
--> 158 return F.embedding(
159 input, self.weight, self.padding_idx, self.max_norm,
160 self.norm_type, self.scale_grad_by_freq, self.sparse)
File \.venv\lib\site-packages\torch\nn\functional.py:2199, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2193 # Note [embedding_renorm set_grad_enabled]
2194 # XXX: equivalent to
2195 # with torch.no_grad():
2196 # torch.embedding_renorm_
2197 # remove once script supports set_grad_enabled
2198 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2199 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self
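Since it fails inside the text position embeddings, I added a quick check of my own (just a debugging sketch built on the encoding and model objects from the cell above, not something taken from the tutorial) to see whether the encoding exceeds the model's configured limits:

# Debugging sketch: compare the encoded sequence against the model's limits.
print("sequence length:", encoding["input_ids"].shape[1])
print("max position embeddings:", model.config.max_position_embeddings)
print("max input id:", encoding["input_ids"].max().item(),
      "| vocab size:", model.config.vocab_size)
# bbox coordinates are expected to be normalized into the 0-1000 range
print("bbox min/max:", encoding["bbox"].min().item(), encoding["bbox"].max().item())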
Note: I built my own dataset generator.
Second note: the number of labeled tokens per image can reach 200-400.
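Since 200-400 words per page can easily exceed 512 subword tokens after tokenization, I suspect the sequence ends up longer than the model's position-embedding table, which would explain the IndexError in position_embeddings. Below is a minimal sketch of what I plan to try, re-encoding the same example with truncation enabled (truncation, padding and max_length are the standard tokenizer kwargs passed through the processor; I haven't confirmed yet that this fixes the error):

import torch

example = dataset["test"][0]
encoding = processor(
    example["image"],
    example["tokens"],
    boxes=example["bboxes"],
    word_labels=example["ner_tags"],
    truncation=True,          # cut sequences longer than max_length
    padding="max_length",     # pad shorter sequences to a fixed shape
    max_length=512,
    return_tensors="pt",
)

with torch.no_grad():
    outputs = model(**encoding)

If I need to keep every word instead of truncating, I would look into return_overflowing_tokens with a stride, but I haven't tried that yet.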
I hope there is a way to solve this.
Thank you in advance!