Hey all,
I am using a local HPC cluster to experiment with training LLMs. I have been able to train GPT-2 and other smaller models without any problems, but now I am trying to train EleutherAI/gpt-neo-2.7B and I seem to need quite a bit more VRAM.
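My rough back-of-the-envelope estimate (my own numbers: full fine-tuning in mixed precision with AdamW, i.e. roughly 16 bytes per parameter for weights, gradients and optimizer state, ignoring activations) already lands right at the 40 GB mark:

# Rough VRAM estimate for fully fine-tuning gpt-neo-2.7B with mixed-precision AdamW.
# Assumption: ~16 bytes/param = fp16 weights (2) + fp16 grads (2) + fp32 master
# weights (4) + two fp32 Adam moments (4 + 4); activations not included.
n_params = 2.7e9
bytes_per_param = 2 + 2 + 4 + 4 + 4
print(f"~{n_params * bytes_per_param / 2**30:.1f} GiB for model + optimizer state")
# prints ~40.2 GiB, so it will not fit on a single 40 GB card once activations are added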
Well okay, I will use a system with multiple GPUs! I have limited access to a node with a few NVIDIA A100-SXM4-40GB cards, so I wrote the following Python script:
import subprocess
from transformers import AutoTokenizer, GPTNeoForCausalLM, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TextDataset
from accelerate import Accelerator
import torch
import os
# Set CUDA_LAUNCH_BLOCKING=1 to get more detailed error messages
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
def print_gpu_memory():
    result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE)
    print(result.stdout.decode('utf-8'))
# Initialize the accelerator
accelerator = Accelerator()
# Print initial GPU memory usage
print("Initial GPU memory usage:")
print_gpu_memory()
# Load the model and tokenizer
model_name = "EleutherAI/gpt-neo-2.7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name, gradient_checkpointing=True)
# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))
# Print GPU memory usage after loading model and tokenizer
print("After loading model and tokenizer:")
print_gpu_memory()
# Load the text from the file
file_path = "cleaned.txt"
# Load dataset
print("Just before loading dataset")
print_gpu_memory()
train_dataset = TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=512)
print(f"Number of samples in the dataset: {len(train_dataset)}")
# Print GPU memory usage after loading dataset
print("After loading dataset:")
print_gpu_memory()
# Define data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # Accumulate gradients to simulate a larger batch size
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=1e-4,
    dataloader_num_workers=4,
    fp16=True
)
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)
# Prepare everything with accelerator
model, train_dataset, training_args = accelerator.prepare(
    model, train_dataset, training_args
)
# Print GPU memory usage after preparing with accelerator
print("After preparing with accelerator:")
print_gpu_memory()
torch.cuda.empty_cache()
# Train the model
print("Just before training")
print_gpu_memory()
training_successful = False
try:
    # sync GPUs
    accelerator.wait_for_everyone()
    trainer.train()
    training_successful = True
except RuntimeError as e:
    training_successful = False
    print(f"Training failed with error: {e}")
# Print GPU memory usage after training
print("After training:")
print_gpu_memory()
# Save the fine-tuned model and tokenizer
if training_successful:
    model.save_pretrained('./trained_models/neo_2_epochs')
    tokenizer.save_pretrained('./trained_models/neo_2_epochs')
# Print GPU memory usage after saving the model
print("After saving the model:")
print_gpu_memory()
But nvidia-smi shows that everything is loaded onto just one GPU while the other sits idle:
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.06 Driver Version: 555.42.06 CUDA Version: 12.5 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA A100-SXM4-40GB On | 00000000:31:00.0 Off | Off |
| N/A 32C P0 70W / 400W | 40445MiB / 40960MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA A100-SXM4-40GB On | 00000000:32:00.0 Off | Off |
| N/A 29C P0 65W / 400W | 5MiB / 40960MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
And then training dies with:
Training failed with error: CUDA out of memory. Tried to allocate 100.00 MiB. GPU
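I am not sure the second GPU is ever touched. Would a quick check like the one below (just a debugging snippet I would add near the top of the script, not output from my actual run) be the right way to verify how many devices and processes the training job actually sees?

# Debugging snippet (hypothetical addition, not from my actual run): how many GPUs
# does PyTorch see, and how many processes is Accelerate actually running?
print("torch.cuda.device_count():", torch.cuda.device_count())
print("accelerator.num_processes:", accelerator.num_processes)
print("accelerator.device:", accelerator.device)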
Can anyone please tell me what I am doing wrong?
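Or am I approaching multi-GPU training the wrong way entirely? Do I need to start the script with accelerate launch (with a multi-GPU config) so that Accelerate spawns more than one process, or should the model itself be sharded across the cards at load time? Something like the following is my untested guess at the latter:

# Untested idea (my guess at the API, not something I have run): let transformers
# shard the model across all visible GPUs at load time instead of keeping it on one.
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-neo-2.7B",
    device_map="auto",          # split layers across the available A100s
    torch_dtype=torch.float16,  # halve the weight memory
)

Any pointers would be much appreciated!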