OutOfMemoryError: CUDA out of memory

Hi :slight_smile:
I am writing to seek assistance with an error I encountered while running a custom BERT-based classifier.

I am currently working with a relatively small dataset, and I have set the batch sizes as follows:

  • train_batch_size: 4
  • val_batch_size: 4

The validation set consists of fewer than 200 examples, while the training set has approximately 600 examples.
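
I am training with the Trainer API; the snippet below is a simplified sketch of how these values are passed in (the output directory, epoch count and other arguments are placeholders and may differ slightly from my actual script):

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",           # placeholder path
    per_device_train_batch_size=4,    # train_batch_size
    per_device_eval_batch_size=4,     # val_batch_size
    num_train_epochs=3,               # illustrative value
    evaluation_strategy="epoch",      # evaluation runs at the end of each epoch, as in the traceback below
)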

I would greatly appreciate your help in resolving this issue.

Here is the code snippet that I'm working with:

import torch
from torch.nn import BCEWithLogitsLoss
from transformers import BertModel, BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput


class BERTClassifier(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(0.3)
        # 768 is the BERT-base hidden size; if extra features are concatenated
        # in forward(), this input dimension has to grow by their size.
        self.linear = torch.nn.Linear(768, self.num_labels)
        self.post_init()

    def forward(self, input_ids, attention_mask=None,
                token_type_ids=None, labels=None,
                features=None, output_attentions=False):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=True,  # attention weights are returned for every batch
        )
        # Classify from the pooled [CLS] representation after dropout.
        dropout_output = self.dropout(outputs.pooler_output)
        if features is not None:
            # Optionally append extra features to the pooled representation.
            dropout_output = torch.cat((dropout_output, features), dim=1)
        logits = self.linear(dropout_output)
        loss = None
        if labels is not None:
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


model = BERTClassifier.from_pretrained('bert-base-uncased')
model.to(device)
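
For completeness, this is roughly how the model is handed to the Trainer (train_dataset, val_dataset and compute_metrics are placeholders for the objects in my notebook, and training_args holds the batch sizes listed above):

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,   # ~600 examples
    eval_dataset=val_dataset,      # <200 examples
    compute_metrics=compute_metrics,
)
trainer.train()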

I encountered the following error message:

Traceback (most recent call last):

/tmp/ipykernel_29/4032920361.py:1 in <module>
    [Errno 2] No such file or directory: '/tmp/ipykernel_29/4032920361.py'

/opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1645 in train
    return inner_training_loop(

/opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2035 in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_…

/opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2321 in _maybe_log_save_evaluate
    metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)

/opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3053 in evaluate
    output = eval_loop(

/opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3270 in evaluation_loop
    preds_host = logits if preds_host is None else nested_concat(preds_host, …

/opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:114 in nested_concat
    return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n i…

/opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:114 in <genexpr>
    return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n i…

/opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:114 in nested_concat
    return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n i…

/opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:114 in <genexpr>
    return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n i…

/opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:116 in nested_concat
    return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_ind…

/opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:75 in torch_pad_and_concatenate
    return torch.cat((tensor1, tensor2), dim=0)
OutOfMemoryError: CUDA out of memory. Tried to allocate 288.00 MiB (GPU 0; 15.90 GiB total capacity; 14.14 GiB 
already allocated; 201.75 MiB free; 14.82 GiB reserved in total by PyTorch) If reserved memory is >> allocated 
memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and 
PYTORCH_CUDA_ALLOC_CONF
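
I have not yet tried the max_split_size_mb hint at the end of the message; as far as I understand, it is set through an environment variable before the first CUDA allocation (the value below is just an illustrative guess):

import os

# Must run before anything allocates on the GPU (e.g. in the first cell of the notebook).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"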

Thank you