Tensor size mismatch error while fine-tuning Whisper

I am fine-tuning Whisper on my custom dataset. While running `trainer.train()`, I am getting the following error:

```
RuntimeError: The size of tensor a (517) must match the size of tensor b (448) at non-singleton dimension 1
```

Please note that I am using the Whisper medium model; my training arguments are shown below:

```python
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-medium-hi",  # change to a repo name of your choice
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)
```
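
For context, these arguments go into a `Seq2SeqTrainer` set up roughly as in the standard Hugging Face Whisper fine-tuning recipe. The sketch below is only illustrative: the model, dataset, data collator, metric function, and processor names are placeholders for the objects created earlier in my notebook, not my exact code.

```python
from transformers import Seq2SeqTrainer

# Sketch of the trainer setup (standard Whisper fine-tuning recipe).
# `model`, `common_voice`, `data_collator`, `compute_metrics`, and `processor`
# are placeholders for objects defined earlier in the notebook.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,                          # WhisperForConditionalGeneration ("openai/whisper-medium")
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,          # pads audio features and label ids per batch
    compute_metrics=compute_metrics,      # word error rate (WER)
    tokenizer=processor.feature_extractor,
)
```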

Below is the error stack trace:

```
use_cache = True is incompatible with gradient checkpointing. Setting use_cache = False

[1001/4000 1:31:27 < 4:34:34, 0.18 it/s, Epoch 1.76/8]
Step Training Loss Validation Loss
[151/228 11:43 < 06:01, 0.21 it/s]

RuntimeError Traceback (most recent call last)
in <cell line: 1>()
----> 1 trainer.train()

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1578 # Disable progress bars when uploading models during checkpoints to avoid polluting stdout
1579 hf_hub_utils.disable_progress_bars()
---> 1580 return inner_training_loop(
1581 args=args,
1582 resume_from_checkpoint=resume_from_checkpoint,

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1980 self.control = self.callback_handler.on_step_end(args, self.state, self.control)
1981
---> 1982 self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
1983 else:
1984 self.control = self.callback_handler.on_substep_end(args, self.state, self.control)

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval)
2324 metrics.update(dataset_metrics)
2325 else:
---> 2326 metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
2327 self._report_to_hp_search(trial, self.state.global_step, metrics)
2328

/usr/local/lib/python3.10/dist-packages/transformers/trainer_seq2seq.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix, **gen_kwargs)
163 self._gen_kwargs = gen_kwargs
164
---> 165 return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
166
167 def predict(

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
3062
3063 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
---> 3064 output = eval_loop(
3065 eval_dataloader,
3066 description="Evaluation",

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
3251
3252 # Prediction step
---> 3253 loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
3254 main_input_name = getattr(self.model, "main_input_name", "input_ids")
3255 inputs_decode = self._prepare_input(inputs[main_input_name]) if args.include_inputs_for_metrics else None

/usr/local/lib/python3.10/dist-packages/transformers/trainer_seq2seq.py in prediction_step(self, model, inputs, prediction_loss_only, ignore_keys, **gen_kwargs)
310 if has_labels:
311 with self.compute_loss_context_manager():
---> 312 outputs = model(**inputs)
313 if self.label_smoother is not None:
314 loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
---> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py in forward(*args, **kwargs)
634
635 def forward(*args, **kwargs):
---> 636 return model_forward(*args, **kwargs)
637
638 # To act like a decorator so that it can be popped when doing extract_model_from_parallel

/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py in call(self, *args, **kwargs)
622
623 def call(self, *args, **kwargs):
---> 624 return convert_to_fp32(self.model_forward(*args, **kwargs))
625
626 def getstate(self):

/usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py in decorate_autocast(*args, **kwargs)
12 def decorate_autocast(*args, **kwargs):
13 with autocast_instance:
---> 14 return func(*args, **kwargs)
15 decorate_autocast.__script_unsupported = '@autocast() decorator is not supported in script mode' # type: ignore[attr-defined]
16 return decorate_autocast

/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py in forward(self, input_features, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, past_key_values, decoder_inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
1484 )
1485
---> 1486 outputs = self.model(
1487 input_features,
1488 attention_mask=attention_mask,

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
---> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py in forward(self, input_features, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, past_key_values, decoder_inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
1360
1361 # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
---> 1362 decoder_outputs = self.decoder(
1363 input_ids=decoder_input_ids,
1364 attention_mask=decoder_attention_mask,

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
---> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py in forward(self, input_ids, attention_mask, encoder_hidden_states, head_mask, cross_attn_head_mask, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
1124 positions = self.embed_positions(inputs_embeds, past_key_values_length=past_key_values_length)
1125
---> 1126 hidden_states = inputs_embeds + positions
1127 hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
1128

RuntimeError: The size of tensor a (517) must match the size of tensor b (448) at non-singleton dimension 1
```
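
In case it helps with diagnosing this: 448 is Whisper's `max_target_positions` (the size of the decoder's positional embedding table), and the failure happens at `inputs_embeds + positions` in the decoder, so it looks like at least one example in my eval split has a tokenized label sequence longer than 448 tokens (517 here). Below is a sketch of the check/filter I am considering; it assumes the processed dataset is a `datasets.Dataset` with a `labels` column produced by the Whisper tokenizer, which may not match everyone's preprocessing.

```python
from transformers import WhisperConfig

def drop_overlong_labels(dataset, checkpoint="openai/whisper-medium"):
    """Report and remove examples whose tokenized labels exceed the decoder limit.

    Assumes `dataset` is a datasets.Dataset whose "labels" column was produced
    by the Whisper tokenizer during preprocessing.
    """
    max_len = WhisperConfig.from_pretrained(checkpoint).max_target_positions  # 448 for Whisper

    too_long = sum(len(ex["labels"]) > max_len for ex in dataset)
    print(f"{too_long} examples have more than {max_len} label tokens")

    # Dropping these examples avoids the positional-embedding size mismatch;
    # the alternative would be splitting the long audio/transcript pairs into shorter segments.
    return dataset.filter(lambda ex: len(ex["labels"]) <= max_len)
```

I would run this on both the train and eval splits before constructing the trainer.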

Did you solve it?