Training using multiple GPUs

I would like to train some models on multiple GPUs.
Let's suppose that I use a model from the HF library, but with my own trainers, dataloaders, collators, etc.

Where should I focus to implement multi-GPU training? Do I only need to make changes in the Trainer class? If so, can you give me a brief description?

Thank you in advance.


The Trainer class automatically handles multi-GPU training; you don’t have to do anything special.
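For context, here is a minimal sketch of what that looks like in practice (the model, dataset, and argument values are placeholders; the point is that the same script runs unchanged on one or several GPUs):

from datasets import load_dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# Tiny slice of a public dataset, just to have something to train on.
dataset = load_dataset("imdb", split="train[:1%]")
dataset = dataset.map(
    lambda ex: tokenizer(ex["text"], truncation=True, max_length=128),
    batched=True)

args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=8,  # per GPU; the effective batch size grows with the GPU count
)

trainer = Trainer(model=model, args=args, train_dataset=dataset, tokenizer=tokenizer)
trainer.train()

# Launched with plain `python train.py`, Trainer wraps the model in
# torch.nn.DataParallel when more than one GPU is visible; launched with
# `torchrun --nproc_per_node=2 train.py` it uses DistributedDataParallel.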


Yeah, but I am using a Trainer class that is implemented from scratch. That’s why I am asking…

Here is an example of the Trainer class for EncoderDecoder models:

import math
import os
import time

import torch
from tqdm import tqdm

# `to_device` is a small helper (defined elsewhere in my code) that moves a
# tensor, or a tuple/list of tensors, onto the given device.


class EncoderDecoderTransformerTrainer:

    def __init__(self, model,
                 optimizer,
                 patience,
                 scheduler=None,
                 checkpoint_dir=None,
                 clip=None,
                 device='cpu'):

        self.model = model.to(device)
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.checkpoint_dir = checkpoint_dir
        self.clip = clip
        self.device = device
        self.patience = patience


    def calc_val_loss(self, val_loader):

        self.model.eval()
        with torch.no_grad():
            avg_val_loss = 0

            for index, batch in enumerate(tqdm(val_loader)):
                inputs = to_device(batch[0], device=self.device)
                inputs_att = to_device(batch[1], device=self.device)
                padded_targets = to_device(batch[2], device=self.device)
                replaced_targets = to_device(batch[3], device=self.device)
                targets_att = to_device(batch[4], device=self.device)

                outputs = self.model(input_ids=inputs,
                                     attention_mask=inputs_att,
                                     decoder_input_ids=padded_targets,
                                     decoder_attention_mask=targets_att,
                                     labels=replaced_targets)
                lm_loss = outputs[0]
                pred_scores = outputs[1]
                last_hidden = outputs[2]
                avg_val_loss += lm_loss.item()

            avg_val_loss = avg_val_loss / len(val_loader)
            return avg_val_loss

    def print_epoch(self, epoch, avg_train_epoch_loss, avg_val_epoch_loss,
                    cur_patience, strt):

        print("Epoch {}:".format(epoch+1))
        print("Train loss: {} | Train PPL: {}".format(
            avg_train_epoch_loss, math.exp(avg_train_epoch_loss)))
        print("Val loss: {} | Val PPL: {}".format(avg_val_epoch_loss,
              math.exp(avg_val_epoch_loss)))
        print("Patience left: {}".format(self.patience-cur_patience))
        print("Time: {} mins".format((time.time() - strt) / 60.0))
        print("++++++++++++++++++")

    def save_epoch(self, epoch, loss=None):

        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        # torch.save(self.model.state_dict(), os.path.join(
        #     self.checkpoint_dir, '{}_{}.pth'.format(epoch, 'model_checkpoint')))

        # we use the recommended method for saving an EncoderDecoder model
        self.model.save_pretrained(os.path.join(self.checkpoint_dir,'model_checkpoint'))
        torch.save(self.optimizer.state_dict(), os.path.join(
            self.checkpoint_dir,'optimizer_checkpoint'))


    def train_step(self, batch):
        self.model.train()
        self.optimizer.zero_grad()

        inputs = to_device(batch[0], device=self.device)
        inputs_att = to_device(batch[1], device=self.device)
        padded_targets = to_device(batch[2], device=self.device)
        replaced_targets = to_device(batch[3], device=self.device)
        targets_att = to_device(batch[4], device=self.device)
        print(inputs.shape)
        print(padded_targets.shape)
        # also, I am not sure what I should pass as
        # decoder_input_ids (the input ids or the padded_targets??)

        outputs = self.model(input_ids=inputs,
                             attention_mask=inputs_att,
                             decoder_input_ids=padded_targets,
                             decoder_attention_mask=targets_att,
                             labels=replaced_targets)

        lm_loss = outputs[0]
        # print(lm_loss)
        pred_scores = outputs[1]
        last_hidden = outputs[2]
        return lm_loss, last_hidden

    def train_epochs(self, n_epochs, train_loader, val_loader):

        best_val_loss, cur_patience  = 10000, 0

        print("Training model....")
        self.model.train()

        for epoch in range(n_epochs):
            if cur_patience == self.patience:
                break

            avg_train_loss = 0
            strt = time.time()

            for index, sample_batch in enumerate(tqdm(train_loader)):

                loss, _ = self.train_step(sample_batch)
                avg_train_loss += loss.item()
                loss.backward(retain_graph=False)
                if self.clip is not None:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.clip)
                self.optimizer.step()
            avg_train_loss = avg_train_loss / len(train_loader)
            avg_val_loss = self.calc_val_loss(val_loader)

            if avg_val_loss < best_val_loss:
                self.save_epoch(epoch)
                best_val_loss = avg_val_loss
                cur_patience = 0
            else:
                cur_patience += 1
            self.print_epoch(epoch, avg_train_loss, avg_val_loss,
                             cur_patience, strt)

    def fit(self, train_loader, val_loader, epochs):
        self.train_epochs(epochs, train_loader, val_loader)

That’s a generic PyTorch training loop in that case. We’ve implemented our Trainer class to help users easily train their models and leverage GPUs/TPUs, and we’re happy to help if you encounter bugs with it, but if you want to build your own loop, you should ask on the PyTorch forums.
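For what it's worth, here is a minimal sketch of how a hand-rolled loop like the one above could lean on the Accelerate library for multi-GPU support, assuming model, optimizer, train_loader and val_loader are the same objects the trainer above is built from (a sketch, not a drop-in replacement):

from accelerate import Accelerator

# Accelerator handles device placement and gradient synchronisation, so the
# explicit .to(device) / to_device calls in the trainer become unnecessary.
accelerator = Accelerator()
model, optimizer, train_loader, val_loader = accelerator.prepare(
    model, optimizer, train_loader, val_loader)

for batch in train_loader:
    optimizer.zero_grad()
    outputs = model(input_ids=batch[0],
                    attention_mask=batch[1],
                    decoder_input_ids=batch[2],
                    decoder_attention_mask=batch[4],
                    labels=batch[3])
    accelerator.backward(outputs[0])  # replaces loss.backward()
    optimizer.step()

# The script is then started with `accelerate launch my_script.py`, which
# takes care of spawning one process per GPU.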


@sgugger I am using Trainer classes but not seeing any major speedup in training when I use a multi-GPU setup. In nvidia-smi and the W&B dashboard, I can see that both GPUs are being used. I then launched the training script on a single-GPU machine for comparison. The training commands are exactly the same on both machines.

I do not see any significant speedup in training. The training lasts for hours and I didn’t wait till the end, but the tqdm estimates are pretty much the same on both machines. The progress should be reflected properly in tqdm, right? Any suggestions for further diagnosis?


2 GPUs don’t bring a lot of speedup compared to one since you add all those synchronization operations. The main speedup is that you automatically get double the batch size, so fewer iterations (unless you used max_steps in your command; as always, it’s hard to help you with what’s wrong without seeing the code you run).
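(To put hypothetical numbers on it: with per_device_train_batch_size=32, one GPU processes 32 samples per step while two GPUs process 64, so an epoch needs half as many steps and each step takes roughly the same time, which is why the tqdm per-step estimates look similar. If max_steps is fixed, both runs do the same number of steps, but the two-GPU run has seen twice as many samples by the end.)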


Riiight, excellent. I used max_steps and didn’t notice that the number of epochs doubles with 2 GPUs. Awesome.

While we are at it: I read Thomas’ article about GPU training, where he advocates computing the loss in a parallel fashion, but I don’t see support for it in the Trainer. Wasn’t it worth it?


The Trainer lets you compute the loss how you want by subclassing and overriding compute_loss (see an example here). By default we use the basic loss since that’s the use case of most users.
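For illustration, the subclassing pattern looks roughly like this (the class name and the cross-entropy loss are just placeholders for whatever you want to compute):

import torch
from transformers import Trainer

class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # `inputs` is the dict produced by the data collator.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Put any custom loss here; plain cross-entropy shown as an example.
        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss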


@sgugger I am trying to test multi-GPU training with the HF Trainer, but for training a third-party PyTorch model. I have already overridden compute_loss, and Trainer.train() runs without a problem on single-GPU machines. On a 4-GPU EC2 machine I get the following error:

TrainerCallback
0%| | 0/20000 [00:00<?, ?it/s]Traceback (most recent call last):
  File "train_hf_mlm_encoder_single_gpu.py", line 222, in <module>
    main(params_dict)
  File "train_hf_mlm_encoder_single_gpu.py", line 218, in main
    custom_trainer.train()
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/transformers/trainer.py", line 1053, in train
    tr_loss += self.training_step(model, inputs)
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/transformers/trainer.py", line 1443, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/a204311-DataScientist/projects/trlabs_routing_transformer/routing_sum/mlm_pretrain/train_and_eval.py", line 121, in compute_loss
    return_loss=True
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 154, in forward
    replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 159, in replicate
    return replicate(module, device_ids, not torch.is_grad_enabled())
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 104, in replicate
    buffer_copies_not_rg = _broadcast_coalesced_reshape(buffers_not_rg, devices, detach=True)
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 68, in _broadcast_coalesced_reshape
    return comm.broadcast_coalesced(tensors, devices)
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/torch/cuda/comm.py", line 39, in broadcast_coalesced
    return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: Unconvertible NCCL type
0%|

Any hints as to what may be causing this? I was under the impression that multi-GPU training should work out of the box with the Hugging Face Trainer. Thank you for your help.

Since you are not providing the code of that model, I can’t really help you with what is going wrong.

@sgugger The model is the routing transformer language model (RoutingTransformerLM). The source code is here:

I would guess that this model simply does not support multiple GPUs, given that your training runs fine on one GPU. I’m afraid you will have to ask the author of that library on GitHub.
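(As a side note, not from the reply above: the traceback shows nn.DataParallel failing while broadcasting the model's registered buffers to the other GPUs, and "Unconvertible NCCL type" usually means some buffer has a dtype that NCCL cannot broadcast, such as bool. A quick, purely diagnostic check, assuming model is the instantiated RoutingTransformerLM:)

# Print every registered buffer and its dtype; anything that is not a float,
# half, or integer tensor is a likely culprit for the NCCL broadcast error.
for name, buf in model.named_buffers():
    print(name, buf.dtype, tuple(buf.shape))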

I am fine-tuning a T5 model on SageMaker with 4 GPUs, but only one GPU is being used. What do you suggest?

Hi @Sahajtomar, thanks for reaching out on the forum!

Hi @OlivierCR, I am using distributed training, but somehow model training does not start at all. I am using T5-large on 8 GPUs.

Hi @sgugger,

Is there any special parameter that needs to be passed to the Trainer class for it to work with multiple GPUs?
Please have a look at Not able to scale Trainer code to single node multi GPU - 🤗 Transformers - Hugging Face Forums

Hi, I’m confused about how to use the Trainer class on 2 GPUs. Are there any arguments I should set? And how should I handle model.to(device)?

Hi @sgugger, I’m trying to fine-tune the “meta-llama/Llama-2-7b” model in a Kaggle notebook with 2 T4 GPUs, and I’m noticing that only one GPU is being used.

Sun Feb 25 14:06:28 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   76C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla T4                       Off | 00000000:00:05.0 Off |                    0 |
| N/A   77C    P8              12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from random import randrange
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM, prepare_model_for_int8_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv("train.csv")
train = Dataset.from_pandas(df)
model_id = "meta-llama/Llama-2-7b"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             load_in_4bit = True,
                                             torch_dtype = torch.float16,
                                             device_map="auto")
model.resize_token_embeddings(len(tokenizer))
model = prepare_model_for_int8_training(model)
peft_config = LoraConfig(
                          lora_alpha=16,
                          lora_dropout=0.1,
                          r=64,
                          bias="none",
                          task_type="CAUSAL_LM"
                        )
model = get_peft_model(model, peft_config)

args = TrainingArguments(
    output_dir='custom_domain_test',
    num_train_epochs=5,
    per_device_train_batch_size=8, 
    optim = "adamw_torch",
    logging_steps = 100,
    save_total_limit = 2,
    save_strategy = "no",
    load_best_model_at_end=False,
    learning_rate=2e-4,
    fp16=True,
    seed=42,
    warmup_ratio = 0.1,
    lr_scheduler_type = "linear",
    report_to="none",
    dataloader_num_workers = 4
)

# Create the trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train,
    # eval_dataset=test,
    dataset_text_field='text',
    peft_config=peft_config,
    max_seq_length=1042,
    tokenizer=tokenizer,
    args=args,
    packing=True,
)

# train
trainer.train()

Can you please tell me how to utilise both GPUs and increase GPU and CPU utilisation using the Hugging Face Trainer?

Regarding training models on multiple GPUs, refer to the Alignment Handbook, which uses DeepSpeed ZeRO-3 to run training on multiple GPUs: alignment-handbook/scripts at main · huggingface/alignment-handbook · GitHub.

This is handled by the Accelerate library as the backend (which the Trainer uses under the hood). One needs to define a configuration as done here: alignment-handbook/recipes/accelerate_configs/deepspeed_zero3.yaml at main · huggingface/alignment-handbook · GitHub, and then pass it when running the script.
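For example, assuming the deepspeed_zero3.yaml config linked above has been saved locally and your_training_script.py is a placeholder for a script that uses the Trainer, the launch looks roughly like:

accelerate launch --config_file deepspeed_zero3.yaml your_training_script.py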

@sgugger In my case, I can see it is using only one GPU. I’m using Kaggle’s 2 T4 GPUs.