Hi, I am fine-tuning Whisper and ran into a Trainer error, and I don't know what to do:
RuntimeError: The size of tensor a (462) must match the size of tensor b (448) at non-singleton dimension 1
The trace goes:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
File <timed exec>:28
File ~/.local/lib/python3.9/site-packages/transformers/trainer.py:1515, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1510 self.model_wrapped = self.model
1512 inner_training_loop = find_executable_batch_size(
1513 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1514 )
-> 1515 return inner_training_loop(
1516 args=args,
1517 resume_from_checkpoint=resume_from_checkpoint,
1518 trial=trial,
1519 ignore_keys_for_eval=ignore_keys_for_eval,
1520 )
File ~/.local/lib/python3.9/site-packages/transformers/trainer.py:1763, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1761 tr_loss_step = self.training_step(model, inputs)
1762 else:
-> 1763 tr_loss_step = self.training_step(model, inputs)
1765 if (
1766 args.logging_nan_inf_filter
1767 and not is_torch_tpu_available()
1768 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1769 ):
1770 # if loss is nan or inf simply add the average of previous logged losses
1771 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File ~/.local/lib/python3.9/site-packages/transformers/trainer.py:2522, in Trainer.training_step(self, model, inputs)
2519 return loss_mb.reduce_mean().detach().to(self.args.device)
2521 with self.compute_loss_context_manager():
-> 2522 loss = self.compute_loss(model, inputs)
2524 if self.args.n_gpu > 1:
2525 loss = loss.mean() # mean() to average on multi-gpu parallel training
File ~/.local/lib/python3.9/site-packages/transformers/trainer.py:2554, in Trainer.compute_loss(self, model, inputs, return_outputs)
2552 else:
2553 labels = None
-> 2554 outputs = model(**inputs)
2555 # Save past state if it exists
2556 # TODO: this needs to be fixed and made cleaner later.
2557 if self.args.past_index >= 0:
File ~/.local/lib/python3.9/site-packages/torch/nn/modules/module.py:1102, in Module._call_impl(self, *input, **kwargs)
1098 # If we don't have any hooks, we want to skip the rest of the logic in
1099 # this function, and just call forward.
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
File ~/.local/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py:1192, in WhisperForConditionalGeneration.forward(self, input_features, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, past_key_values, decoder_inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
1187 if decoder_input_ids is None and decoder_inputs_embeds is None:
1188 decoder_input_ids = shift_tokens_right(
1189 labels, self.config.pad_token_id, self.config.decoder_start_token_id
1190 )
-> 1192 outputs = self.model(
1193 input_features,
1194 decoder_input_ids=decoder_input_ids,
1195 encoder_outputs=encoder_outputs,
1196 decoder_attention_mask=decoder_attention_mask,
1197 head_mask=head_mask,
1198 decoder_head_mask=decoder_head_mask,
1199 cross_attn_head_mask=cross_attn_head_mask,
1200 past_key_values=past_key_values,
1201 decoder_inputs_embeds=decoder_inputs_embeds,
1202 use_cache=use_cache,
1203 output_attentions=output_attentions,
1204 output_hidden_states=output_hidden_states,
1205 return_dict=return_dict,
1206 )
1207 lm_logits = self.proj_out(outputs[0])
1209 loss = None
File ~/.local/lib/python3.9/site-packages/torch/nn/modules/module.py:1102, in Module._call_impl(self, *input, **kwargs)
1098 # If we don't have any hooks, we want to skip the rest of the logic in
1099 # this function, and just call forward.
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
File ~/.local/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py:1061, in WhisperModel.forward(self, input_features, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, past_key_values, decoder_inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
1054 encoder_outputs = BaseModelOutput(
1055 last_hidden_state=encoder_outputs[0],
1056 hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
1057 attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
1058 )
1060 # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
-> 1061 decoder_outputs = self.decoder(
1062 input_ids=decoder_input_ids,
1063 attention_mask=decoder_attention_mask,
1064 encoder_hidden_states=encoder_outputs[0],
1065 head_mask=decoder_head_mask,
1066 cross_attn_head_mask=cross_attn_head_mask,
1067 past_key_values=past_key_values,
1068 inputs_embeds=decoder_inputs_embeds,
1069 use_cache=use_cache,
1070 output_attentions=output_attentions,
1071 output_hidden_states=output_hidden_states,
1072 return_dict=return_dict,
1073 )
1075 if not return_dict:
1076 return decoder_outputs + encoder_outputs
File ~/.local/lib/python3.9/site-packages/torch/nn/modules/module.py:1102, in Module._call_impl(self, *input, **kwargs)
1098 # If we don't have any hooks, we want to skip the rest of the logic in
1099 # this function, and just call forward.
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
File ~/.local/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py:868, in WhisperDecoder.forward(self, input_ids, attention_mask, encoder_hidden_states, head_mask, cross_attn_head_mask, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
865 # embed positions
866 positions = self.embed_positions(input_ids, past_key_values_length=past_key_values_length)
--> 868 hidden_states = inputs_embeds + positions
869 hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
871 # decoder layers
RuntimeError: The size of tensor a (462) must match the size of tensor b (448) at non-singleton dimension 1
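For what it's worth, 448 matches Whisper's decoder max_target_positions, so my guess is that one Telugu transcript tokenizes to 462 label tokens, which overflows the positional embeddings in WhisperDecoder. A rough check I'm thinking of running (just a sketch, assuming the google/fleurs "te_in" split and a whisper-small processor; substitute your own checkpoint and columns):

from datasets import load_dataset
from transformers import WhisperProcessor, WhisperConfig

# Assumed setup for illustration: FLEURS Telugu + whisper-small
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small", language="telugu", task="transcribe"
)
config = WhisperConfig.from_pretrained("openai/whisper-small")
max_len = config.max_target_positions  # 448 for Whisper checkpoints

dataset = load_dataset("google/fleurs", "te_in", split="train")

# Tokenize each reference transcription the same way the data collator would
too_long = []
for i, example in enumerate(dataset):
    labels = processor.tokenizer(example["transcription"]).input_ids
    if len(labels) > max_len:
        too_long.append((i, len(labels)))

print(f"{len(too_long)} examples exceed {max_len} label tokens:", too_long[:10])

If that turns up over-length transcripts, would filtering them out (or truncating the labels) before training be the right fix here?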
I find this strange because I am training on the Fleurs training set across several languages; the error only occurs for the Telugu dataset, and only right at the 24th step, as shown below: