I’m trying to fine-tune the Microsoft/BioGPT model (approx. 350 million parameters) on an AWS ml.g4dn.xlarge instance with a 16 GB T4 GPU.
Training itself goes absolutely fine, but when evaluation runs after each epoch, the GPU runs out of memory. I tried shifting evaluation to the CPU, but then it exhausts the system RAM instead. If I omit evaluation completely there are no problems, but I want to keep it so I can check for overfitting and use early stopping.
I was thinking of adding a second GPU instance and shifting evaluation onto it, but since memory keeps filling up during evaluation, the second GPU would fill up after some steps as well. I’m not sure how to fix this and would appreciate any insights or advice. Thanks.
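To make "the memory keeps filling up" concrete: a quick helper like the one below (illustrative only, not part of my trainer; the name is mine), printed at the start of each prediction step, shows allocated memory climbing step after step during evaluation instead of staying flat.

import torch

def log_gpu_memory(tag: str):
    # Print currently allocated and peak GPU memory in GB.
    alloc = torch.cuda.memory_allocated() / 1024**3
    peak = torch.cuda.max_memory_allocated() / 1024**3
    print(f"[{tag}] allocated: {alloc:.2f} GB | peak: {peak:.2f} GB")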
Below I share my custom Trainer and TrainingArguments for reference:
# My training arguments
import torch
from transformers import Trainer, TrainingArguments

train_args = TrainingArguments(
    learning_rate=1e-5,
    lr_scheduler_type="inverse_sqrt",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    warmup_steps=34,  # Computed from the train set size, batch size, epochs and gradient accumulation.
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="steps",  # For testing purposes
    eval_steps=5,
    save_strategy="steps",
    save_steps=5,
    save_total_limit=1,
    logging_steps=10,
    logging_dir=args.logging_dir,
    load_best_model_at_end=True,
    metric_for_best_model="roc_auc",
    greater_is_better=True,
    gradient_accumulation_steps=32,
    fp16=True
)
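For context, metric_for_best_model="roc_auc" refers to the key returned by my compute_metrics. It is roughly the standard pattern below (a simplified sketch, not my exact code; the softmax over two classes matches the binary setup in compute_loss further down):

import numpy as np
from sklearn.metrics import roc_auc_score

def compute_metrics(eval_pred):
    # eval_pred.predictions: logits of shape (num_samples, 2); eval_pred.label_ids: (num_samples,)
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    # Softmax over the two classes, keep the probability of the positive class
    exp = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probs = exp / exp.sum(axis=-1, keepdims=True)
    return {"roc_auc": roc_auc_score(labels, probs[:, 1])}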
# My custom trainer
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Move the batch to the same device as the model
        # (re-stacks the per-sample tensors into a batch on that device)
        inputs_on_device = {
            k: torch.stack(tuple(sample.to(model.device) for sample in v)).to(model.device)
            for k, v in inputs.items()
        }
        labels = inputs_on_device.get("labels")
        # Forward pass
        outputs = model(**inputs_on_device)
        logits = outputs.logits
        # Compute custom (class-weighted) loss
        loss_fct = torch.nn.CrossEntropyLoss(weight=torch.tensor([1.0, 1.0]).to(model.device))
        loss = loss_fct(logits.view(-1, 2), labels.view(-1))
        # Tried to clear memory (didn't work)
        del inputs
        del inputs_on_device
        torch.cuda.empty_cache()
        return (loss, outputs) if return_outputs else loss
    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix: str = "eval"):
        # Tried to clear memory (didn't work)
        torch.cuda.empty_cache()
        # Run evaluation on the CPU
        eval_device = torch.device("cpu")
        print(f"Moving model to evaluation device: {eval_device}")
        self.model.to(eval_device)
        self.eval_device_override = eval_device
        # Run default evaluation logic
        metrics = super().evaluate(eval_dataset=eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
        print(f"Evaluation done. Metrics: {metrics}")
        # Restore model to GPU for training
        self.model.to("cuda")
        return metrics
    def move_to_device(self, dataset, device_name):
        # Move every tensor in the batch to device_name
        new_dataset = dict()
        for k, v in dataset.items():
            if isinstance(v, torch.Tensor):
                new_dataset[k] = v.to(device_name)
            else:
                new_dataset[k] = v
        return new_dataset
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        # Force the inputs onto the eval device (CPU during evaluate, otherwise the model's device)
        eval_device = getattr(self, "eval_device_override", model.device)
        new_inputs = self.move_to_device(inputs, eval_device)
        return super().prediction_step(model, new_inputs, prediction_loss_only, ignore_keys)
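And this is roughly how I wire everything together (dataset variable names are placeholders; the EarlyStoppingCallback is the early stopping I mentioned above, which is why I need evaluation to keep running):

from transformers import AutoModelForSequenceClassification, EarlyStoppingCallback

# train_ds / eval_ds are my tokenized datasets (placeholder names).
model = AutoModelForSequenceClassification.from_pretrained("microsoft/biogpt", num_labels=2)

trainer = CustomTrainer(
    model=model,
    args=train_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()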