Hi! I ran into this bug when running Seq2SeqTrainer and don't know how to tackle it. Can someone help me look into it a bit? Thank you so much!
# Training configuration for Whisper fine-tuning (notebook cell).
from transformers import Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
output_dir="/home/sivan/whisper_base_fl_ch",
per_device_train_batch_size=128,
gradient_accumulation_steps=8,  # effective train batch = 128 * 8 = 1024, matching the log line below
learning_rate=1e-5,
warmup_steps=500,
max_steps=4000,  # step-based run: the huge "Num Epochs = 1334" in the log is derived from this, not set directly
gradient_checkpointing=True,
fp16=True,  # NOTE(review): the TypeError below fires inside torch's autocast setup
            # (`torch.set_autocast_cache_enabled(self._cache_enabled)` gets None).
            # This looks like a torch/transformers version mismatch — transformers
            # passes `cache_enabled` that older torch (< 1.10) does not handle.
            # Upgrading torch, or setting fp16=False as a workaround, should be
            # verified first. TODO confirm installed torch version.
group_by_length=True,  # presumably requires length metadata on the dataset — verify it exists
evaluation_strategy="steps",
per_device_eval_batch_size=8,
predict_with_generate=True,  # run generate() during eval so WER can be computed on decoded text
generation_max_length=225,
save_steps=1000,
eval_steps=1000,  # eval and save aligned so load_best_model_at_end can pick a saved checkpoint
logging_steps=25,
report_to=["tensorboard"],
load_best_model_at_end=True,
metric_for_best_model="wer",
greater_is_better=False,  # lower WER is better
push_to_hub=False,
disable_tqdm=True,
)
#%%
# Assemble the trainer. `model`, `fleurs_ch`, `data_collator`,
# `compute_metrics`, and `processor` are defined in earlier notebook cells
# (not visible in this snippet).
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
args=training_args,
model=model,
train_dataset=fleurs_ch["train"],
eval_dataset=fleurs_ch["test"],
data_collator=data_collator,
compute_metrics=compute_metrics,
tokenizer=processor.feature_extractor,  # presumably passed so the feature extractor is saved with checkpoints — verify
)
#%%
# Launch training — this is the call that raises the TypeError shown in the
# traceback below (it crashes on the very first training step, inside the
# fp16 autocast context manager, before any loss is computed).
trainer.train()
And the error output is:
***** Running training *****
Num examples = 3246
Num Epochs = 1334
Instantaneous batch size per device = 128
Total train batch size (w. parallel, distributed & accumulation) = 1024
Gradient Accumulation steps = 8
Total optimization steps = 4000
Number of trainable parameters = 72593920
TypeError Traceback (most recent call last)
Cell In [49], line 1
----> 1 trainer.train()
File ~/.local/lib/python3.9/site-packages/transformers/trainer.py:1515, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1510 self.model_wrapped = self.model
1512 inner_training_loop = find_executable_batch_size(
1513 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1514 )
-> 1515 return inner_training_loop(
1516 args=args,
1517 resume_from_checkpoint=resume_from_checkpoint,
1518 trial=trial,
1519 ignore_keys_for_eval=ignore_keys_for_eval,
1520 )
File ~/.local/lib/python3.9/site-packages/transformers/trainer.py:1763, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1761 tr_loss_step = self.training_step(model, inputs)
1762 else:
-> 1763 tr_loss_step = self.training_step(model, inputs)
1765 if (
1766 args.logging_nan_inf_filter
1767 and not is_torch_tpu_available()
1768 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1769 ):
1770 # if loss is nan or inf simply add the average of previous logged losses
1771 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File ~/.local/lib/python3.9/site-packages/transformers/trainer.py:2521, in Trainer.training_step(self, model, inputs)
2518 loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
2519 return loss_mb.reduce_mean().detach().to(self.args.device)
-> 2521 with self.compute_loss_context_manager():
2522 loss = self.compute_loss(model, inputs)
2524 if self.args.n_gpu > 1:
File ~/.local/lib/python3.9/site-packages/transformers/utils/generic.py:333, in ContextManagers.__enter__(self)
331 def __enter__(self):
332 for context_manager in self.context_managers:
--> 333 self.stack.enter_context(context_manager)
File /opt/conda/envs/pytorch_env/lib/python3.9/contextlib.py:448, in _BaseExitStack.enter_context(self, cm)
446 _cm_type = type(cm)
447 _exit = _cm_type.__exit__
--> 448 result = _cm_type.__enter__(cm)
449 self._push_cm_exit(cm, _exit)
450 return result
File ~/.local/lib/python3.9/site-packages/torch/autocast_mode.py:177, in autocast.__enter__(self)
175 torch.set_autocast_enabled(self._enabled)
176 torch.autocast_increment_nesting()
--> 177 torch.set_autocast_cache_enabled(self._cache_enabled)
TypeError: enabled must be a bool (got NoneType)