Hi
I am writing to seek assistance with an error I encountered while training a custom BERT-based classifier.
I am currently working with a relatively small dataset, and I have set the batch sizes as follows:
train_batch_size: 4
val_batch_size: 4
The validation set consists of fewer than 200 examples, while the training set has approximately 600 examples.
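In case the setup matters, this is roughly how those batch sizes are passed in; apart from the two batch-size values, the other arguments (output_dir, evaluation_strategy, num_train_epochs) are just placeholders for what I actually use:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="outputs",            # placeholder
    per_device_train_batch_size=4,   # train_batch_size
    per_device_eval_batch_size=4,    # val_batch_size
    evaluation_strategy="epoch",     # evaluation runs at the end of each epoch
    num_train_epochs=3,              # placeholder
)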
I would greatly appreciate your help in resolving this issue.
Here is the code snippet that I'm working with:
import torch
from torch.nn import BCEWithLogitsLoss
from transformers import BertModel, BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput

class BERTClassifier(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, self.num_labels)
        self.post_init()

    def forward(self, input_ids, attention_mask=None,
                token_type_ids=None, labels=None,
                features=None, output_attentions=False):
        # Run the BERT encoder; attention weights are requested on every call
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=True
        )
        # Pooled [CLS] representation followed by dropout
        dropout_output = self.dropout(outputs.pooler_output)
        if features is not None:
            # Optionally append extra features to the pooled output
            dropout_output = torch.cat((dropout_output, features), dim=1)
        logits = self.linear(dropout_output)
        loss = None
        if labels is not None:
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
model = BERTClassifier.from_pretrained('bert-base-uncased')
model.to(device)
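The model is then handed to the Trainer and trained in the usual way; the sketch below is how I launch it, with the dataset variables standing in for my own tokenized train and validation sets:

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # ~600 tokenized examples (placeholder name)
    eval_dataset=val_dataset,     # <200 tokenized examples (placeholder name)
)
trainer.train()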
I encountered the following error message:
Traceback (most recent call last):

  /tmp/ipykernel_29/4032920361.py:1 in <module>
    [Errno 2] No such file or directory: '/tmp/ipykernel_29/4032920361.py'

  /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1645 in train
    1645         return inner_training_loop(

  /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2035 in _inner_training_loop
    2035             self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_...

  /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2321 in _maybe_log_save_evaluate
    2321                 metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)

  /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3053 in evaluate
    3053         output = eval_loop(

  /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3270 in evaluation_loop
    3270                 preds_host = logits if preds_host is None else nested_concat(preds_host, ...

  /opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:114 in nested_concat
    114         return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n i...

  /opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:114 in <genexpr>
  /opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:114 in nested_concat
  /opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:114 in <genexpr>

  /opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:116 in nested_concat
    116         return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_ind...

  /opt/conda/lib/python3.10/site-packages/transformers/trainer_pt_utils.py:75 in torch_pad_and_concatenate
    75         return torch.cat((tensor1, tensor2), dim=0)
OutOfMemoryError: CUDA out of memory. Tried to allocate 288.00 MiB (GPU 0; 15.90 GiB total capacity; 14.14 GiB
already allocated; 201.75 MiB free; 14.82 GiB reserved in total by PyTorch) If reserved memory is >> allocated
memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and
PYTORCH_CUDA_ALLOC_CONF
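If I understand the hint at the end correctly, it amounts to setting the allocator configuration before anything touches the GPU, something like the sketch below (the value 128 is just an example I picked, not something from the message):

import os
# Must run before the first CUDA allocation, e.g. at the top of the notebook
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

I have not verified whether this actually helps in my case, so any guidance on the underlying cause would be welcome.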
Thank you