Am I doing multi-GPU training right?

Hey all,

I am using a local HPC cluster to train LLMs, purely as a test. I have been able to train GPT-2 and smaller models without problems, but now I am trying to train EleutherAI/gpt-neo-2.7B and I seem to need more VRAM than one card has.
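Quick back-of-the-envelope math on why (my rough assumptions: fp16 mixed precision with Adam, which keeps fp32 master weights plus two fp32 moment buffers, so about 16 bytes per parameter before activations):

# Rough training-memory estimate for a 2.7B-parameter model under
# fp16 mixed precision with Adam:
#   2 B fp16 weights + 4 B fp32 master weights
# + 8 B fp32 Adam moments + 2 B fp16 grads = 16 B per parameter
params = 2.7e9
print(f"~{params * 16 / 2**30:.0f} GiB before activations")  # ~40 GiB

So a single 40 GB card is right at the limit even before activation memory.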
Well okay, then I will use multiple GPUs! I have limited access to a node with a few NVIDIA A100-SXM4-40GB cards, so I wrote the following Python script:

import subprocess
from transformers import AutoTokenizer, GPTNeoForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TextDataset
from accelerate import Accelerator
import torch
import os

# Set CUDA_LAUNCH_BLOCKING=1 to get more detailed error messages
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

def print_gpu_memory():
    result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE)
    print(result.stdout.decode('utf-8'))

# Initialize the accelerator
accelerator = Accelerator()

# Print initial GPU memory usage
print("Initial GPU memory usage:")
print_gpu_memory()

# Load the model and tokenizer
model_name = "EleutherAI/gpt-neo-2.7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)
model.gradient_checkpointing_enable()  # supported replacement for the deprecated gradient_checkpointing kwarg

# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))


# Print GPU memory usage after loading model and tokenizer
print("After loading model and tokenizer:")
print_gpu_memory()

# Load the text from the file
file_path = "cleaned.txt"

# Load dataset
print("Just before loading dataset")
print_gpu_memory()

train_dataset = TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=512)
print(f"Number of samples in the dataset: {len(train_dataset)}")

# Print GPU memory usage after loading dataset
print("After loading dataset:")
print_gpu_memory()

# Define data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # Accumulate gradients to simulate a larger batch size
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=1e-4,
    dataloader_num_workers=4,
    fp16=True
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Prepare everything with accelerator
model, train_dataset, training_args = accelerator.prepare(
    model, train_dataset, training_args
)

# Print GPU memory usage after preparing with accelerator
print("After preparing with accelerator:")
print_gpu_memory()

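# Release cached allocator blocks so the next nvidia-smi reading
# reflects live tensors rather than PyTorch's cache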
torch.cuda.empty_cache()

# Train the model
print("Just before training")
print_gpu_memory()

training_successful = False

try:
    # sync GPUs
    accelerator.wait_for_everyone()
    trainer.train()
    training_successful = True
except RuntimeError as e:
    training_successful = False
    print(f"Training failed with error: {e}")

# Print GPU memory usage after training
print("After training:")
print_gpu_memory()

# Save the fine-tuned model and tokenizer
if training_successful:
    model.save_pretrained('./trained_models/neo_2_epochs')
    tokenizer.save_pretrained('./trained_models/neo_2_epochs')

# Print GPU memory usage after saving the model
print("After saving the model:")
print_gpu_memory()

But what I see is that everything is loaded on just one GPU:

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA A100-SXM4-40GB          On  |   00000000:31:00.0 Off |                  Off |
| N/A   32C    P0             70W /  400W |   40445MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A100-SXM4-40GB          On  |   00000000:32:00.0 Off |                  Off |
| N/A   29C    P0             65W /  400W |       5MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
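In case it's relevant: nvidia-smi clearly sees both cards, and a quick check along these lines (just a sketch) is what I'd use to confirm the Python process sees them too:

import torch

# Sanity check: how many GPUs can this process actually see?
# If the scheduler or CUDA_VISIBLE_DEVICES hid one of them, this would report 1.
print(torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))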

And it dies with:

Training failed with error: CUDA out of memory. Tried to allocate 100.00 MiB. GPU

Can anyone please tell me what I am doing wrong?
