I’m trying to fine-tune an AraBERT2GPT2 model built with the EncoderDecoderModel class on a relatively small dataset. I train for only one epoch and evaluate afterwards, but a few samples into evaluation I hit a CUDA out-of-memory error, even after reducing the evaluation batch size to 1. What could be causing this, and how can I fix it?
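For context, here is a minimal sketch of my setup. The checkpoint names and most hyperparameter values below are placeholders (they do not appear in the traceback); only the Seq2SeqTrainer call mirrors the one shown in the traceback below, and the dataset preparation and compute_metrics function are defined earlier in my notebook.

```python
from transformers import (
    EncoderDecoderModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# AraBERT encoder + GPT-2 decoder tied together as a seq2seq model.
# Checkpoint names are illustrative placeholders; special-token / decoder
# config (pad_token_id, decoder_start_token_id, ...) is omitted here.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "arabert-checkpoint",  # encoder (placeholder)
    "gpt2-checkpoint",     # decoder (placeholder)
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./arabert2gpt2",
    num_train_epochs=1,              # train for a single epoch
    per_device_train_batch_size=8,   # placeholder value
    per_device_eval_batch_size=1,    # already reduced to 1
    evaluation_strategy="epoch",     # evaluate after the epoch
)

# train_dataset and compute_metrics are built earlier in the notebook
# and omitted from this sketch.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset["train"],
    eval_dataset=train_dataset["test"],
)

trainer.train()
```

The full traceback from trainer.train() is below.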
RuntimeError Traceback (most recent call last)
<ipython-input-4-581ee471b86c> in <module>()
18 trainer = Seq2SeqTrainer(model=model, args=training_args, compute_metrics=compute_metrics, train_dataset = train_dataset['train'], eval_dataset=train_dataset['test'])
19
---> 20 trainer.train()
21 torch.cuda.empty_cache()
9 frames
/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1319 resume_from_checkpoint=resume_from_checkpoint,
1320 trial=trial,
-> 1321 ignore_keys_for_eval=ignore_keys_for_eval,
1322 )
1323
/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1642
1643 self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
-> 1644 self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
1645
1646 if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval)
1794 metrics = None
1795 if self.control.should_evaluate:
-> 1796 metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
1797 self._report_to_hp_search(trial, epoch, metrics)
1798
/usr/local/lib/python3.7/dist-packages/transformers/trainer_seq2seq.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix, max_length, num_beams)
68 self._max_length = max_length if max_length is not None else self.args.generation_max_length
69 self._num_beams = num_beams if num_beams is not None else self.args.generation_num_beams
---> 70 return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
71
72 def predict(
/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
2463 prediction_loss_only=True if self.compute_metrics is None else None,
2464 ignore_keys=ignore_keys,
-> 2465 metric_key_prefix=metric_key_prefix,
2466 )
2467
/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
2660 if self.preprocess_logits_for_metrics is not None:
2661 logits = self.preprocess_logits_for_metrics(logits, labels)
-> 2662 preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
2663 self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
2664
/usr/local/lib/python3.7/dist-packages/transformers/trainer_pt_utils.py in nested_concat(tensors, new_tensors, padding_index)
112 ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
113 if isinstance(tensors, (list, tuple)):
--> 114 return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
115 elif isinstance(tensors, torch.Tensor):
116 return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
/usr/local/lib/python3.7/dist-packages/transformers/trainer_pt_utils.py in <genexpr>(.0)
112 ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
113 if isinstance(tensors, (list, tuple)):
--> 114 return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
115 elif isinstance(tensors, torch.Tensor):
116 return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
/usr/local/lib/python3.7/dist-packages/transformers/trainer_pt_utils.py in nested_concat(tensors, new_tensors, padding_index)
114 return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
115 elif isinstance(tensors, torch.Tensor):
--> 116 return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
117 elif isinstance(tensors, np.ndarray):
118 return numpy_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
/usr/local/lib/python3.7/dist-packages/transformers/trainer_pt_utils.py in torch_pad_and_concatenate(tensor1, tensor2, padding_index)
73
74 if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]:
---> 75 return torch.cat((tensor1, tensor2), dim=0)
76
77 # Let's figure out the new shape
RuntimeError: CUDA out of memory. Tried to allocate 86.00 MiB (GPU 0; 15.90 GiB total capacity; 14.10 GiB already allocated; 79.75 MiB free; 14.78 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
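For completeness, the allocator hint at the end of the error message refers to the PYTORCH_CUDA_ALLOC_CONF environment variable. I have not confirmed whether it is relevant here; if it is, I assume it would be set before the first CUDA allocation, along these lines (the 128 MiB split size is just an example value):

```python
import os

# Allocator setting suggested by the error message; must be set before
# the first CUDA allocation. The 128 MiB value is an arbitrary example.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
```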