CUDA out of memory during evaluation, but training is fine

Hi,

I am fine-tuning a BartForConditionalGeneration model. I am using the Trainer from the library, so nothing fancy. I have 2 GPUs and can fit a batch size of 8 or even 16 during training, but after the first epoch I always get a CUDA out-of-memory error. I find this strange because my evaluation batch size is 1.

Below is my code, which is actually quite short.

import torch
import argparse
import os
import sys
import numpy as np
import torch.nn.functional as F
sys.path.append('..')
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments, BartForConditionalGeneration, FSMTForConditionalGeneration
from data_reader import GetDataAsPython
from sklearn.model_selection import train_test_split
from prepare_data import create_data, create_dataset, get_test_results, extract_warning_types
from transformers import T5Tokenizer, BartTokenizer, FSMTTokenizer
from datetime import datetime

parser = argparse.ArgumentParser()
parser.add_argument('-e', '--epochs', type=int, default=100)
parser.add_argument('-bs', '--batch-size', type=int, default=1)
parser.add_argument('-lr', '--learning-rate', type=float, default=1e-4)
parser.add_argument('-gcv', '--gradient-clip-val', type=float, default=0.0)
parser.add_argument('-wd', '--weight-decay', type=float, default=0.01)
parser.add_argument('-mn', '--model-name', type=str, choices=['t5-small', 't5-base', 't5-large', 'bart-base'], required=True)
args = parser.parse_args()

data = GetDataAsPython('../data_large2.json')
data_eslint = GetDataAsPython('../data_eslint.json')
data += data_eslint

all_warning_types = extract_warning_types(data)
all_warning_types = ['generator-star-spacing', 'no-array-constructor', 'no-extra-bind', 'no-debugger', 'no-extra-boolean-cast', 'no-extra-semi', 'no-useless-escape']

model_name = args.model_name
if 't5' in model_name:
    tokenizer = T5Tokenizer.from_pretrained(model_name)
elif 'bart' in model_name:
    tokenizer = BartTokenizer.from_pretrained('facebook/' + model_name)
else:
    raise "Unrecognized model"
tokenizer.add_tokens(['{', '}'])


now = datetime.now()
dt_string = now.strftime("%d-%m-%Y_%H-%M-%S")
model_directory = model_name + '_global_' + dt_string
os.makedirs(model_directory, exist_ok=True)
with open(model_directory + '/commandline_args.txt', 'w') as f:
    f.write('\n'.join(sys.argv[1:]))
tokenizer.save_pretrained(model_directory)

train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels = create_data(data, all_warning_types, include_warning=True, model_name=model_name)
train_dataset = create_dataset(train_inputs, train_labels, tokenizer, pad_truncate=True)
val_dataset = create_dataset(val_inputs, val_labels, tokenizer, pad_truncate=True)
test_dataset = create_dataset(test_inputs, test_labels, tokenizer, pad_truncate=True)

training_args = TrainingArguments(
    output_dir=model_directory,          
    num_train_epochs=args.epochs,              
    per_device_train_batch_size=args.batch_size,  
    per_device_eval_batch_size=1,   
    warmup_steps=500,                   
    weight_decay=args.weight_decay,               
    logging_dir=model_directory,
    logging_steps=100,
    do_eval=True,
    evaluation_strategy='epoch',
    learning_rate=args.learning_rate,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

if 't5' in model_name:
    model = T5ForConditionalGeneration.from_pretrained(model_name, return_dict=False)
elif 'bart' in model_name:
    model = BartForConditionalGeneration.from_pretrained('facebook/' + model_name)
model.resize_token_embeddings(len(tokenizer))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(torch.optim.Adam(params=model.parameters(), lr=args.learning_rate), None),
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model()
output = get_test_results(model, tokenizer, test_inputs, test_labels, False)
print(output)

with open(model_name + 'allrules_results.txt', 'w+') as output_file:
    output_file.write(output)

and here is the stack trace

{'loss': 7.759439697265625, 'learning_rate': 2e-05, 'epoch': 0.28328611898017}                                                                                                                              

{'loss': 1.2010345458984375, 'learning_rate': 4e-05, 'epoch': 0.56657223796034}
{'loss': 0.3362786865234375, 'learning_rate': 6e-05, 'epoch': 0.8498583569405099}
  3%| 353/10590 [02:50<1:22:07, 2.08it/s]
      40/41 [00:09<00:00, 3.72it/s]
Traceback (most recent call last):
  File "transformers_global.py", line 87, in <module>
    trainer.train()
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer.py", line 792, in train
    self._maybe_log_save_evalute(tr_loss, model, trial, epoch)
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer.py", line 843, in _maybe_log_save_evalute
    metrics = self.evaluate()
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer.py", line 1251, in evaluate
    output = self.prediction_loop(eval_dataloader, description="Evaluation")
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer.py", line 1353, in prediction_loop
    preds_host = logits if preds_host is None else nested_concat(preds_host, logits, dim=0)
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer_pt_utils.py", line 47, in nested_concat
    return type(tensors)(nested_concat(t, n, dim) for t, n in zip(tensors, new_tensors))
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer_pt_utils.py", line 47, in <genexpr>
    return type(tensors)(nested_concat(t, n, dim) for t, n in zip(tensors, new_tensors))
  File "/home/berkay/model/lib/python3.8/site-packages/transformers/trainer_pt_utils.py", line 49, in nested_concat
    return torch.cat((tensors, new_tensors), dim=dim)
RuntimeError: CUDA out of memory. Tried to allocate 2.63 GiB (GPU 0; 10.76 GiB total capacity; 4.74 GiB already allocated; 2.53 GiB free; 7.27 GiB reserved in total by PyTorch)

To avoid that, you need to add eval_accumulation_steps in your TrainingArguments. By default the Trainer accumulates all predictions on the GPU before sending them to the CPU (because it is faster), but if you run OOM, set that argument to a small value (for instance 20 or 10) to trigger the copy more frequently and free GPU memory.
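For the script in the original post, that would look roughly like this (a sketch only; the value 10 is just an example, everything else is unchanged from the post above):

training_args = TrainingArguments(
    output_dir=model_directory,
    num_train_epochs=args.epochs,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=10,  # copy accumulated predictions off the GPU every 10 eval steps
    warmup_steps=500,
    weight_decay=args.weight_decay,
    logging_dir=model_directory,
    logging_steps=100,
    do_eval=True,
    evaluation_strategy='epoch',
    learning_rate=args.learning_rate,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)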


Hi,

I'm running into the same memory issue, although I'm using the finetune.py script from the seq2seq examples directory and therefore a PyTorch Lightning trainer. As I am a bit unfamiliar with Lightning, is it still possible to achieve the same result with that script, or will I need to switch to the built-in trainer from finetune_trainer.py instead?

I have no idea how to do this using PyTorch Lightning.

To be able to use the eval_accumulation_steps argument you should use finetune_trainer.py; this is currently not possible with the PyTorch Lightning version.

Thanks, I switched over to finetune_trainer.py and it works great now.

I'm facing the same issue in version 4.7.0. Using eval_accumulation_steps = 2 eventually ends in a RAM overflow that kills the process (with a vocabulary size of about 40K, a sequence length of 512, and 15,000 samples, that is about 3e11 float logits).
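Spelling that arithmetic out (a quick sanity check using only the numbers above):

vocab_size = 40_000   # approximate vocabulary size
seq_len = 512         # sequence length
n_samples = 15_000    # evaluation samples
n_logits = vocab_size * seq_len * n_samples   # ~3.1e11 float values
print(f"{n_logits:.2e} logits, ~{n_logits * 4 / 1e12:.1f} TB as float32")
# -> 3.07e+11 logits, ~1.2 TB as float32, far more than fits in RAM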

As a workaround I've added logits = [l.argmax(-1) for l in logits] immediately after prediction_step in the evaluation_loop function in trainer.py. That helps for evaluation, though I'm not sure I haven't broken something else.
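A less invasive alternative, if your transformers version is recent enough to have it (it is not in 4.7.0): the Trainer accepts a preprocess_logits_for_metrics callable that is applied before predictions are accumulated, so the same argmax reduction can be done without editing trainer.py. A sketch, assuming the hook's (logits, labels) signature:

def keep_predicted_ids(logits, labels):
    # reduce (batch, seq_len, vocab) logits to (batch, seq_len) token ids
    # before they are accumulated for metrics
    if isinstance(logits, tuple):  # some models return extra tensors alongside the logits
        logits = logits[0]
    return logits.argmax(dim=-1)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    preprocess_logits_for_metrics=keep_predicted_ids,
)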