Hello everyone!
I thought I’d post this here first, as I am not sure if it is a bug or if I am doing something wrong.
I’m using the Hugging Face transformers library to train an XLM-R token classifier. I originally wrote the training routine myself, which worked quite well, but I wanted to switch to the Trainer for more advanced features like early stopping and easier setting of training arguments.
To prototype my code, I usually run it on a free Google Colab account. While the training process works, I’ve had the code crash several times because the disk space of the compute environment runs out. This is NOT my Google Drive space, but a separate disk of around 60 GB. I have observed that during training the used space keeps growing, but I have no idea where or what exactly is writing data. Once the disk is full, the code crashes (see the traceback in the edit below).
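For reference, this is roughly how I have been watching the space disappear during a run. It is only a quick sketch: it assumes the Colab disk is mounted at `/` and that the checkpoints end up in the `./checkpoints` directory configured below.

```python
import os
import shutil

def report_disk_usage(path="./checkpoints"):
    # Overall disk usage of the Colab VM (assumes the disk is mounted at '/')
    total, used, free = shutil.disk_usage("/")
    print(f"disk used: {used / 1e9:.1f} GB / {total / 1e9:.1f} GB (free: {free / 1e9:.1f} GB)")

    # Size of the checkpoint directory, if it exists yet
    if os.path.isdir(path):
        size = sum(
            os.path.getsize(os.path.join(root, name))
            for root, _, files in os.walk(path)
            for name in files
        )
        print(f"{path}: {size / 1e9:.2f} GB")

report_disk_usage()
```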
These are the callbacks and training arguments I have defined:
## Define Callbacks
from transformers import Trainer, TrainerCallback, TrainingArguments

class PrinterCallback(TrainerCallback):
    def on_train_begin(self, args, state, control, **kwargs):
        print('\033[1m' + '=' * 25 + " Model Training " + '=' * 25 + '\033[0m')

    def on_epoch_begin(self, args, state, control, **kwargs):
        # use the state/args passed to the callback instead of the global trainer
        print('\n' + '\033[1m' + '=' * 25 + ' Epoch {:} / {:} '.format(int(state.epoch) + 1, int(args.num_train_epochs)) + '=' * 25)
## Training parameters
# training arguments
training_args = TrainingArguments(
    output_dir='./checkpoints',          # output directory
    num_train_epochs=5,                  # total number of training epochs
    per_device_train_batch_size=32,      # batch size per device during training
    per_device_eval_batch_size=32,       # batch size for evaluation
    warmup_steps=0,                      # number of warmup steps for learning rate scheduler
    weight_decay=0,                      # strength of weight decay
    learning_rate=2e-5,
    logging_dir='./logs',                # directory for storing logs
    evaluation_strategy="epoch",         # "steps", "epoch", or "no"
    #eval_steps=100,
    save_total_limit=1,
    load_best_model_at_end=False,        # loads the model with the best evaluation score
    metric_for_best_model="weightedF1",
    greater_is_better=True
)
## Start training
# initialize huggingface trainer
trainer = Trainer(
    model=xlmr_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=xlmr_tokenizer,
    compute_metrics=validate,
    callbacks=[PrinterCallback]
)
trainer.train()
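For a next run I am planning to attach something like the following callback to see how much the output directory grows after every checkpoint save. This is just a sketch (the `CheckpointSizeCallback` name and its logic are mine; it only relies on the `output_dir` set in the training arguments above), and it would be passed alongside `PrinterCallback` in the `callbacks` list:

```python
import os

class CheckpointSizeCallback(TrainerCallback):
    # Sketch: print the total size of output_dir after every checkpoint save
    def on_save(self, args, state, control, **kwargs):
        size = sum(
            os.path.getsize(os.path.join(root, name))
            for root, _, files in os.walk(args.output_dir)
            for name in files
        )
        print(f"\n[step {state.global_step}] {args.output_dir} now holds {size / 1e9:.2f} GB")
```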
Any idea what is going wrong here?
Edit: Here is the error as text from another run. Apparently torch is continuously writing something to disk, but why, and what exactly is it? (A rough estimate of the expected checkpoint size is sketched below, after the traceback.)
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/torch/serialization.py in save(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization)
371 with _open_zipfile_writer(opened_file) as opened_zipfile:
--> 372 _save(obj, opened_zipfile, pickle_module, pickle_protocol)
373 return
6 frames
/usr/local/lib/python3.7/dist-packages/torch/serialization.py in _save(obj, zip_file, pickle_module, pickle_protocol)
490 num_bytes = storage.size() * storage.element_size()
--> 491 zip_file.write_record(name, storage.data_ptr(), num_bytes)
492
OSError: [Errno 28] No space left on device
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
<ipython-input-36-3435b262f1ae> in <module>()
----> 1 trainer.train()
/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, **kwargs)
1170 self.control = self.callback_handler.on_step_end(self.args, self.state, self.control)
1171
-> 1172 self._maybe_log_save_evaluate(tr_loss, model, trial, epoch)
1173
1174 if self.control.should_epoch_stop or self.control.should_training_stop:
/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch)
1267
1268 if self.control.should_save:
-> 1269 self._save_checkpoint(model, trial, metrics=metrics)
1270 self.control = self.callback_handler.on_save(self.args, self.state, self.control)
1271
/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in _save_checkpoint(self, model, trial, metrics)
1317 elif self.is_world_process_zero() and not self.deepspeed:
1318 # deepspeed.save_checkpoint above saves model/optim/sched
-> 1319 torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
1320 with warnings.catch_warnings(record=True) as caught_warnings:
1321 torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
/usr/local/lib/python3.7/dist-packages/torch/serialization.py in save(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization)
371 with _open_zipfile_writer(opened_file) as opened_zipfile:
372 _save(obj, opened_zipfile, pickle_module, pickle_protocol)
--> 373 return
374 _legacy_save(obj, opened_file, pickle_module, pickle_protocol)
375
/usr/local/lib/python3.7/dist-packages/torch/serialization.py in __exit__(self, *args)
257
258 def __exit__(self, *args) -> None:
--> 259 self.file_like.write_end_of_file()
260 self.buffer.flush()
261
RuntimeError: [enforce fail at inline_container.cc:274] . unexpected pos 2212230208 vs 2212230096
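Looking at the traceback, the failing call is the `torch.save` of `optimizer.pt`, i.e. the optimizer state of a checkpoint. As a rough sanity check on how big a single checkpoint should be, I estimated it like this (sketch only; it assumes float32 weights and an AdamW-style optimizer that keeps two extra float32 tensors per trainable parameter, and it ignores the tokenizer/scheduler files):

```python
def estimate_checkpoint_size_gb(model):
    # Rough estimate: float32 model weights + two float32 AdamW moment
    # tensors (exp_avg, exp_avg_sq) per trainable parameter
    n_params = sum(p.numel() for p in model.parameters())
    n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    model_bytes = n_params * 4
    optimizer_bytes = n_trainable * 2 * 4
    return (model_bytes + optimizer_bytes) / 1e9

print(f"~{estimate_checkpoint_size_gb(xlmr_model):.1f} GB per checkpoint")
```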