Hello,
I have been trying for several days to train a model (tiiuae/falcon-7b) on a dataset (clips/mqa) across my 3 GPUs, but I keep running into a CUDA out of memory error.
When I use model.to("cuda:0"), the GPU with id 0 reaches 100% utilization and memory usage. The same happens when I target GPU 1 or 2 individually.
Then I found that you can pass device_ids directly to nn.DataParallel(model, device_ids=[0, 1, 2]). When I pass only one GPU, training runs on it, but as soon as I pass 2 or 3, the training happens on the first one only.
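For context, here is how I understand nn.DataParallel is meant to be called (a minimal sketch with a stand-in model, not my actual code; device_ids is a keyword argument and the module has to live on the first listed device):

import torch
import torch.nn as nn

# Minimal sketch: DataParallel splits each input batch along dim 0 across
# device_ids and gathers the outputs back on device_ids[0].
model = nn.Linear(16, 2).to("cuda:0")     # stand-in model, placed on the first device
model = nn.DataParallel(model, device_ids=[0, 1, 2])

x = torch.randn(30, 16, device="cuda:0")  # a batch of 30 splits to ~10 per GPU
out = model(x)                            # forward runs replicas on all three GPUs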
I also checked what the Trainer detects:

torch.cuda.empty_cache()
training_args = TrainingArguments(
    "test-trainer",
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
)
print("parallel_mode: ", training_args.parallel_mode)
print("n_gpus: ", training_args.n_gpu)

which prints:

parallel_mode: NOT_DISTRIBUTED
n_gpu: 3
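If I understand correctly, with parallel_mode at NOT_DISTRIBUTED and n_gpu at 3, the Trainer should wrap the model in nn.DataParallel by itself, so wrapping it manually should not even be necessary. To rule out a visibility problem, a quick sanity check (generic PyTorch, nothing specific to my setup) confirms what the process can see:

import torch

# Sanity check: make sure CUDA_VISIBLE_DEVICES is not hiding GPUs 1 and 2
# from the training process.
print(torch.cuda.device_count())  # expected: 3
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))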
Here is my full code:
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer, AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
import os
import torch
from GPUtil import showUtilization as gpu_usage
import torch.nn as nn
def tokenize_function(dataset):
    # Pair each question ("name") with the text of its first answer.
    answers = []
    for answer in dataset["answers"]:
        answers.append(answer[0]["text"])
    return tokenizer(dataset["name"], answers, truncation=True)
try:
    torch.cuda.empty_cache()

    training_args = TrainingArguments(
        "test-trainer",
        per_device_train_batch_size=10,
        per_device_eval_batch_size=10,
    )
    print("parallel_mode: ", training_args.parallel_mode)
    print("n_gpus: ", training_args.n_gpu)

    checkpoint = "tiiuae/falcon-7b"
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    raw_datasets = load_dataset("clips/mqa", language="fr")
    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    gpu_usage()

    # model = model.to(torch.device("cuda"))  # Move model to the first GPU
    # model = nn.DataParallel(model)          # Wrap model with nn.DataParallel

    trainer = Trainer(
        # model.module,
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        # eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        # n_gpu=3,
    )
    trainer.train()
    gpu_usage()
    trainer.save_model("test")
except Exception as e:
    print("in error handler")
    gpu_usage()
    print("\033[91mError in trainer.train(): {}\033[0m".format(e))
Has anyone already run into this issue?
Thank you for your help,
Paul