OutOfMemoryError: CUDA out of memory

Hi :slight_smile:
I am writing to seek assistance with a CUDA out-of-memory error I encountered while training a custom BERT-based classifier with the Hugging Face Trainer.

I am currently working with a relatively small dataset, and I have set the batch sizes as follows:

  • train_batch_size: 4
  • val_batch_size: 4

The validation set consists of fewer than 200 examples, while the training set has approximately 600; a sketch of how I run the training is below.
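
For context, here is a minimal sketch of how these sizes feed into my run (output_dir, train_dataset, and val_dataset are placeholders from my setup, model is the classifier shown further down, and I am assuming my train_batch_size/val_batch_size correspond to per_device_train_batch_size/per_device_eval_batch_size):

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",           # placeholder path
    per_device_train_batch_size=4,    # my train_batch_size
    per_device_eval_batch_size=4,     # my val_batch_size
    num_train_epochs=3,               # arbitrary example value
    evaluation_strategy="epoch",      # evaluate at the end of each epoch
)

trainer = Trainer(
    model=model,                      # defined further down
    args=training_args,
    train_dataset=train_dataset,      # ~600 examples
    eval_dataset=val_dataset,         # <200 examples
)
trainer.train()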

I would greatly appreciate your help in resolving this issue.

Here is the code snippet that I’m working with:

import torch
from torch.nn import BCEWithLogitsLoss
from transformers import BertModel, BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput

class BERTClassifier(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(0.3)
        # If extra features are concatenated in forward(), the input size
        # here must be 768 + the feature dimension.
        self.linear = torch.nn.Linear(768, self.num_labels)
        self.post_init()

    def forward(self, input_ids, attention_mask=None,
                token_type_ids=None, labels=None,
                features=None, output_attentions=False):
        # output_attentions=True is hard-coded below, so per-layer attention
        # tensors are returned on every forward pass and the output_attentions
        # argument above is effectively ignored.
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=True,
        )
        # Apply dropout to the pooled [CLS] representation.
        dropout_output = self.dropout(outputs.pooler_output)
        if features is not None:
            dropout_output = torch.cat((dropout_output, features), dim=1)
        logits = self.linear(dropout_output)
        loss = None
        if labels is not None:
            # BCEWithLogitsLoss expects float targets of shape (batch, num_labels).
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BERTClassifier.from_pretrained('bert-base-uncased')
model.to(device)
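
For reference, a single forward pass of the kind the Trainer performs looks like this on my side; the sentence is a made-up placeholder, and the float label shape follows from BCEWithLogitsLoss expecting (batch_size, num_labels) targets:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Made-up example input; my real batches come from the datasets above.
batch = tokenizer(["an example sentence"], return_tensors="pt", padding=True).to(device)
labels = torch.zeros((1, model.num_labels), device=device)  # float multi-hot targets

output = model(**batch, labels=labels)
print(output.loss, output.logits.shape)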

The run fails during the end-of-epoch evaluation with the following error:

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /tmp/ipykernel_29/4032920361.py:1 in <module>                                                    │
│                                                                                                  │
│ [Errno 2] No such file or directory: '/tmp/ipykernel_29/4032920361.py'                           │
│                                                                                                  │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1645 in train                    │
│                                                                                                  │
│   1642 │   │   inner_training_loop = find_executable_batch_size(                                 │
│   1643 │   │   │   self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size  │
│   1644 │   │   )                                                                                 │
│ ā± 1645 │   │   return inner_training_loop(                                                       │
│   1646 │   │   │   args=args,                                                                    │
│   1647 │   │   │   resume_from_checkpoint=resume_from_checkpoint,                                │
│   1648 │   │   │   trial=trial,                                                                  │
│                                                                                                  │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2035 in _inner_training_loop     │
│                                                                                                  │
│   2032 │   │   │   │   self.control.should_training_stop = True                                  │
│   2033 │   │   │                                                                                 │
│   2034 │   │   │   self.control = self.callback_handler.on_epoch_end(args, self.state, self.con  │
│ ā± 2035 │   │   │   self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_  │
│   2036 │   │   │                                                                                 │
│   2037 │   │   │   if DebugOption.TPU_METRICS_DEBUG in self.args.debug:                          │
│   2038 │   │   │   │   if is_torch_tpu_available():                                              │
│                                                                                                  │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2321 in _maybe_log_save_evaluate │
│                                                                                                  │
│   2318 │   │   │   │   │   )                                                                     │
│   2319 │   │   │   │   │   metrics.update(dataset_metrics)                                       │
│   2320 │   │   │   else:                                                                         │
│ ā± 2321 │   │   │   │   metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)                 │
│   2322 │   │   │   self._report_to_hp_search(trial, self.state.global_step, metrics)             │
│   2323 │   │   │                                                                                 │
│   2324 │   │   │   # Run delayed LR scheduler now that metrics are populated                     │
│                                                                                                  │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3053 in evaluate                 │
│                                                                                                  │
│   3050 │   │   start_time = time.time()                                                          │
│   3051 │   │                                                                                     │
│   3052 │   │   eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else se  │
│ ā± 3053 │   │   output = eval_loop(                                                               │
│   3054 │   │   │   eval_dataloader,                                                              │
│   3055 │   │   │   description="Evaluation",                                                     │
│   3056 │   │   │   # No point gathering the predictions if there are no metrics, otherwise we d  │
│                                                                                                  │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3270 in evaluation_loop          │
│                                                                                                  │
│   3267 │   │   │   │   if self.preprocess_logits_for_metrics is not None:                        │
│   3268 │   │   │   │   │   logits = self.preprocess_logits_for_metrics(logits, labels)           │
│   3269 │   │   │   │   logits = self._nested_gather(logits)                                      │
│ ā± 3270 │   │   │   │   preds_host = logits if preds_host is None else nested_concat(preds_host,  │
│   3271 │   │   │   if labels is not None:                                                        │
│   3272 │   │   │   │   labels = self._nested_gather(labels)                                      │
│   3273 │   │   │   │   labels_host = labels if labels_host is None else nested_concat(labels_ho  │
│                                                                                                  │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:114 in nested_concat    │
│                                                                                                  │
│    111 │   │   new_tensors                                                                       │
│    112 │   ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tens  │
│    113 │   if isinstance(tensors, (list, tuple)):                                                │
│ ā±  114 │   │   return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n i  │
│    115 │   elif isinstance(tensors, torch.Tensor):                                               │
│    116 │   │   return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_ind  │
│    117 │   elif isinstance(tensors, Mapping):                                                    │
│                                                                                                  │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:114 in <genexpr>        │
│                                                                                                  │
│    111 │   │   new_tensors                                                                       │
│    112 │   ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tens  │
│    113 │   if isinstance(tensors, (list, tuple)):                                                │
│ ā±  114 │   │   return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n i  │
│    115 │   elif isinstance(tensors, torch.Tensor):                                               │
│    116 │   │   return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_ind  │
│    117 │   elif isinstance(tensors, Mapping):                                                    │
│                                                                                                  │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:114 in nested_concat    │
│                                                                                                  │
│    111 │   │   new_tensors                                                                       │
│    112 │   ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tens  │
│    113 │   if isinstance(tensors, (list, tuple)):                                                │
│ ā±  114 │   │   return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n i  │
│    115 │   elif isinstance(tensors, torch.Tensor):                                               │
│    116 │   │   return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_ind  │
│    117 │   elif isinstance(tensors, Mapping):                                                    │
│                                                                                                  │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:114 in <genexpr>        │
│                                                                                                  │
│    111 │   │   new_tensors                                                                       │
│    112 │   ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tens  │
│    113 │   if isinstance(tensors, (list, tuple)):                                                │
│ ā±  114 │   │   return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n i  │
│    115 │   elif isinstance(tensors, torch.Tensor):                                               │
│    116 │   │   return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_ind  │
│    117 │   elif isinstance(tensors, Mapping):                                                    │
│                                                                                                  │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:116 in nested_concat    │
│                                                                                                  │
│    113 │   if isinstance(tensors, (list, tuple)):                                                │
│    114 │   │   return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n i  │
│    115 │   elif isinstance(tensors, torch.Tensor):                                               │
│ ā±  116 │   │   return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_ind  │
│    117 │   elif isinstance(tensors, Mapping):                                                    │
│    118 │   │   return type(tensors)(                                                             │
│    119 │   │   │   {k: nested_concat(t, new_tensors[k], padding_index=padding_index) for k, t i  │
│                                                                                                  │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:75 in                   │
│ torch_pad_and_concatenate                                                                        │
│                                                                                                  │
│     72 │   tensor2 = atleast_1d(tensor2)                                                         │
│     73 │                                                                                         │
│     74 │   if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]:                   │
│ ā±   75 │   │   return torch.cat((tensor1, tensor2), dim=0)                                       │
│     76 │                                                                                         │
│     77 │   # Let's figure out the new shape                                                      │
│     78 │   new_shape = (tensor1.shape[0] + tensor2.shape[0], max(tensor1.shape[1], tensor2.shap  │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
OutOfMemoryError: CUDA out of memory. Tried to allocate 288.00 MiB (GPU 0; 15.90 GiB total capacity; 14.14 GiB 
already allocated; 201.75 MiB free; 14.82 GiB reserved in total by PyTorch) If reserved memory is >> allocated 
memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and 
PYTORCH_CUDA_ALLOC_CONF
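
The message suggests max_split_size_mb, and the traceback shows the allocation failing while the Trainer accumulates evaluation predictions in nested_concat, so this is what I plan to try next (the value 128 is an arbitrary example and keep_logits_only is a helper I would add myself); I am not sure whether it addresses the root cause:

import os

# Allocator hint from the error message; must be set before CUDA initializes.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Because forward() always returns per-layer attentions, the Trainer gathers
# them alongside the logits during evaluation; keeping only the logits should
# shrink what accumulates on the GPU.
def keep_logits_only(logits, labels):
    return logits[0] if isinstance(logits, tuple) else logits

trainer = Trainer(
    model=model,
    args=training_args,               # same placeholders as above
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    preprocess_logits_for_metrics=keep_logits_only,
)

# Alternatively, eval_accumulation_steps=1 in TrainingArguments moves the
# accumulated prediction tensors to the CPU after every evaluation step.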

Thank you