Hi,
I am fine-tuning a BartForConditionalGeneration model using the Trainer class from the library, so nothing fancy. I have 2 GPUs and can fit a batch size of 8 or even 16 during training, but after the first epoch, when evaluation starts, I always get a CUDA out-of-memory error. I find this strange because my evaluation batch size is 1.
Below is my code, which is actually quite short.
import torch
import argparse
import os
import sys
import numpy as np
import torch.nn.functional as F
sys.path.append('..')
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments, BartForConditionalGeneration, FSMTForConditionalGeneration
from data_reader import GetDataAsPython
from sklearn.model_selection import train_test_split
from prepare_data import create_data, create_dataset, get_test_results, extract_warning_types
from transformers import T5Tokenizer, BartTokenizer, FSMTTokenizer
from datetime import datetime
parser = argparse.ArgumentParser()
parser.add_argument('-e', '--epochs', type=int, default=100)
parser.add_argument('-bs', '--batch-size', type=int, default=1)
parser.add_argument('-lr', '--learning-rate', type=float, default=1e-4)
parser.add_argument('-gcv', '--gradient-clip-val', type=float, default=0.0)
parser.add_argument('-wd', '--weight-decay', type=float, default=0.01)
parser.add_argument('-mn', '--model-name', type=str, choices=['t5-small', 't5-base', 't5-large', 'bart-base'], required=True)
args = parser.parse_args()
data = GetDataAsPython('../data_large2.json')
data_eslint = GetDataAsPython('../data_eslint.json')
data += data_eslint
all_warning_types = extract_warning_types(data)
all_warning_types = ['generator-star-spacing', 'no-array-constructor', 'no-extra-bind', 'no-debugger', 'no-extra-boolean-cast', 'no-extra-semi', 'no-useless-escape']
model_name = args.model_name
if 't5' in model_name:
    tokenizer = T5Tokenizer.from_pretrained(model_name)
elif 'bart' in model_name:
    tokenizer = BartTokenizer.from_pretrained('facebook/' + model_name)
else:
    raise ValueError('Unrecognized model')
tokenizer.add_tokens(['{', '}'])
now = datetime.now()
dt_string = now.strftime("%d-%m-%Y_%H-%M-%S")
model_directory = 't5global' + '_' + dt_string
model_directory = model_name + '_global_' + dt_string
os.system('mkdir ' + model_directory)
with open(model_directory + '/commandline_args.txt', 'w') as f:
    f.write('\n'.join(sys.argv[1:]))
tokenizer.save_pretrained(model_directory)
train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels = create_data(data, all_warning_types, include_warning=True, model_name=model_name)
train_dataset = create_dataset(train_inputs, train_labels, tokenizer, pad_truncate=True)
val_dataset = create_dataset(val_inputs, val_labels, tokenizer, pad_truncate=True)
test_dataset = create_dataset(test_inputs, test_labels, tokenizer, pad_truncate=True)
training_args = TrainingArguments(
    output_dir=model_directory,
    num_train_epochs=args.epochs,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=args.weight_decay,
    logging_dir=model_directory,
    logging_steps=100,
    do_eval=True,
    evaluation_strategy='epoch',
    learning_rate=args.learning_rate,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)
if 't5' in model_name:
    model = T5ForConditionalGeneration.from_pretrained(model_name, return_dict=False)
elif 'bart' in model_name:
    model = BartForConditionalGeneration.from_pretrained('facebook/' + model_name)
model.resize_token_embeddings(len(tokenizer))
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(torch.optim.Adam(params=model.parameters(), lr=args.learning_rate), None),
    tokenizer=tokenizer,
)
trainer.train()
trainer.save_model()
output = get_test_results(model, tokenizer, test_inputs, test_labels, False)
print(output)
with open(model_name + 'allrules_results.txt', 'w') as output_file:
    output_file.write(output)
And here is the stack trace:
{'loss': 7.759439697265625, 'learning_rate': 2e-05, 'epoch': 0.28328611898017}
{'loss': 1.2010345458984375, 'learning_rate': 4e-05, 'epoch': 0.56657223796034}
{'loss': 0.3362786865234375, 'learning_rate': 6e-05, 'epoch': 0.8498583569405099}
3%| 353/10590 [02:50<1:22:07, 2.08it/s]
Traceback (most recent call last):
  File "transformers_global.py", line 87, in <module>
    trainer.train()
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer.py", line 792, in train
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch)
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer.py", line 843, in _maybe_log_save_evaluate
    metrics = self.evaluate()
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer.py", line 1251, in evaluate
    output = self.prediction_loop(eval_dataloader, description="Evaluation")
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer.py", line 1353, in prediction_loop
    preds_host = logits if preds_host is None else nested_concat(preds_host, logits, dim=0)
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer_pt_utils.py", line 47, in nested_concat
    return type(tensors)(nested_concat(t, n, dim) for t, n in zip(tensors, new_tensors))
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer_pt_utils.py", line 47, in <genexpr>
    return type(tensors)(nested_concat(t, n, dim) for t, n in zip(tensors, new_tensors))
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer_pt_utils.py", line 49, in nested_concat
    return torch.cat((tensors, new_tensors), dim=dim)
RuntimeError: CUDA out of memory. Tried to allocate 2.63 GiB (GPU 0; 10.76 GiB total capacity; 4.74 GiB already allocated; 2.53 GiB free; 7.27 GiB reserved in total by PyTorch)
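From the trace, the OOM does not seem to come from the forward pass itself but from nested_concat, i.e. from the Trainer accumulating the prediction logits of every evaluation step on the GPU. If I read the TrainingArguments docs correctly, the sketch below (reusing the variables from my script above, values untested) should either move the accumulated predictions to the CPU every few steps or skip logit accumulation entirely, since I only use eval_loss as the metric. Would something like this be the right fix, or is the problem elsewhere in my setup?

# Sketch only, not tested: the eval-related options I am considering adding.
# eval_accumulation_steps and prediction_loss_only are standard TrainingArguments
# fields; the value 10 below is an arbitrary guess, not a tested setting.
training_args = TrainingArguments(
    output_dir=model_directory,
    num_train_epochs=args.epochs,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=1,
    evaluation_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    # move accumulated eval predictions from GPU to CPU every 10 steps
    # instead of keeping all logits on the GPU until evaluation ends
    eval_accumulation_steps=10,
    # alternative: compute only the eval loss and never accumulate logits,
    # which should be enough since eval_loss is my best-model metric
    # prediction_loss_only=True,
)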