Training using multiple GPUs

I would like to train some models on multiple GPUs.
Let’s suppose that I use a model from the HF library, but I am using my own trainers, dataloaders, collators, etc.

Where should I focus to implement multi-GPU training? Do I only need to make changes in the Trainer class? If so, can you give me a brief description?

Thank you in advance.

The Trainer class automatically handles multi-GPU training; you don’t have to do anything special.
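
As a rough illustration (model, train_dataset and eval_dataset below are placeholders for your own objects, not part of the thread), the built-in Trainer picks up every GPU that PyTorch can see without any extra code:

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=8,   # per GPU; the effective batch is 8 * number_of_gpus
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,                     # any model following the HF conventions
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()                      # wraps the model for multi-GPU training automatically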

Yeah, but I am using a Trainer class which is implemented from scratch. That’s why I am asking…

Here is an example of the Trainer class for EncoderDecoder models:

import math
import os
import time

import torch
from tqdm import tqdm


class EncoderDecoderTransformerTrainer:

    def __init__(self, model,
                 optimizer,
                 patience,
                 scheduler=None,
                 checkpoint_dir=None,
                 clip=None,
                 device='cpu'):

        self.model = model.to(device)
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.checkpoint_dir = checkpoint_dir
        self.clip = clip
        self.device = device
        self.patience = patience


    def calc_val_loss(self, val_loader):

        self.model.eval()
        with torch.no_grad():
            avg_val_loss = 0

            for index, batch in enumerate(tqdm(val_loader)):
                # to_device is the author's own helper that moves a tensor to self.device
                inputs = to_device(batch[0], device=self.device)
                inputs_att = to_device(batch[1], device=self.device)
                padded_targets = to_device(batch[2], device=self.device)
                replaced_targets = to_device(batch[3], device=self.device)
                targets_att = to_device(batch[4], device=self.device)

                outputs = self.model(input_ids=inputs,
                                     attention_mask=inputs_att,
                                     decoder_input_ids=padded_targets,
                                     decoder_attention_mask=targets_att,
                                     labels=replaced_targets)
                lm_loss = outputs[0]
                pred_scores = outputs[1]
                last_hidden = outputs[2]
                avg_val_loss += lm_loss.item()

            avg_val_loss = avg_val_loss / len(val_loader)
            return avg_val_loss

    def print_epoch(self, epoch, avg_train_epoch_loss, avg_val_epoch_loss,
                    cur_patience, strt):

        print("Epoch {}:".format(epoch+1))
        print("Train loss: {} | Train PPL: {}".format(
            avg_train_epoch_loss, math.exp(avg_train_epoch_loss)))
        print("Val loss: {} | Val PPL: {}".format(avg_val_epoch_loss,
              math.exp(avg_val_epoch_loss)))
        print("Patience left: {}".format(self.patience-cur_patience))
        print("Time: {} mins".format((time.time() - strt) / 60.0))
        print("++++++++++++++++++")

    def save_epoch(self, epoch, loss=None):

        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        # torch.save(self.model.state_dict(), os.path.join(
        #     self.checkpoint_dir, '{}_{}.pth'.format(epoch, 'model_checkpoint')))

        # we use the proposed method for saving EncoderDecoder model
        self.model.save_pretrained(os.path.join(self.checkpoint_dir,'model_checkpoint'))
        torch.save(self.optimizer.state_dict(), os.path.join(
            self.checkpoint_dir,'optimizer_checkpoint'))


    def train_step(self, batch):
        self.model.train()
        self.optimizer.zero_grad()

        inputs = to_device(batch[0], device=self.device)
        inputs_att = to_device(batch[1], device=self.device)
        padded_targets = to_device(batch[2], device=self.device)
        replaced_targets = to_device(batch[3], device=self.device)
        targets_att = to_device(batch[4], device=self.device)
        # debug prints to inspect the batch shapes
        print(inputs.shape)
        print(padded_targets.shape)
        # also, I am not sure what I should pass as
        # decoder_input_ids (the input ids or the padded_targets??)

        outputs = self.model(input_ids=inputs,
                             attention_mask=inputs_att,
                             decoder_input_ids=padded_targets,
                             decoder_attention_mask=targets_att,
                             labels=replaced_targets)

        lm_loss = outputs[0]
        # print(lm_loss)
        pred_scores = outputs[1]
        last_hidden = outputs[2]
        return lm_loss, last_hidden

    def train_epochs(self, n_epochs, train_loader, val_loader):

        best_val_loss, cur_patience = 10000, 0

        print("Training model....")
        self.model.train()

        for epoch in range(n_epochs):
            if cur_patience == self.patience:
                break

            avg_train_loss = 0
            strt = time.time()

            for index, sample_batch in enumerate(tqdm(train_loader)):

                loss, _ = self.train_step(sample_batch)
                avg_train_loss += loss.item()
                loss.backward(retain_graph=False)
                if self.clip is not None:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.clip)
                self.optimizer.step()
            avg_train_loss = avg_train_loss / len(train_loader)
            avg_val_loss = self.calc_val_loss(val_loader)

            if avg_val_loss < best_val_loss:
                self.save_epoch(epoch)
                best_val_loss = avg_val_loss
                cur_patience = 0
            else:
                cur_patience += 1
            self.print_epoch(epoch, avg_train_loss, avg_val_loss,
                             cur_patience, strt)

    def fit(self, train_loader, val_loader, epochs):
        self.train_epochs(epochs, train_loader, val_loader)

That’s a generic PyTorch training loop in that case. We’ve implemented our Trainer class to help users easily train their models and leverage GPUs/TPUs, and we’re happy to help if you encounter bugs with it, but if you want to build your own loop, you should try the PyTorch forum.
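
For completeness, a minimal sketch of one way to add multi-GPU support to a hand-written trainer like the one above on a single machine, using torch.nn.DataParallel (wrap_for_multi_gpu is a made-up helper for illustration, not part of any library):

import torch
from torch import nn

def wrap_for_multi_gpu(model: nn.Module) -> nn.Module:
    """Wrap a model in DataParallel when more than one GPU is visible.

    DataParallel splits every input batch along dim 0, runs one replica
    of the model per GPU, and gathers the outputs on the default device.
    """
    model = model.to("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    return model

In the trainer above that would mean wrapping self.model in __init__, reducing the returned loss with lm_loss.mean() (DataParallel gathers one loss per replica), and saving with self.model.module.save_pretrained(...) once the model is wrapped. For multiple nodes, or for better scaling on a single node, torch.nn.parallel.DistributedDataParallel plus a DistributedSampler is the usual next step, but it requires launching one process per GPU.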


@sgugger I am using the Trainer classes but not seeing any major speedup in training when I use a multi-GPU setup. In nvidia-smi and the W&B dashboard I can see that both GPUs are being used. I then launched the training script on a single-GPU machine for comparison. The training commands are exactly the same on both machines.

I do not see any significant speedup in training. The training lasts for hours; I didn’t wait until the end, but the tqdm estimates are pretty much the same on both machines. The progress should be reflected properly in tqdm, right? Any suggestions for further diagnosis?

2 GPUs don’t bring a lot of speedup compared to one, since you add all those synchronization operations. The main speedup is that you automatically get double the batch size, so fewer iterations per epoch (unless you used max_steps in your command; as always, it’s hard to help with what’s wrong without seeing the code you run).

Riiight, excellent. I used max_steps and didn’t notice that the number of epochs doubles with 2 GPUs. Awesome.
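
To spell that arithmetic out (the numbers below are made up, purely for illustration): every optimizer step consumes n_gpus * per_device_train_batch_size samples, so with max_steps fixed a 2-GPU run covers twice as much data for the same number of steps.

# hypothetical numbers, just to illustrate the effective batch size
dataset_size = 100_000
per_device_train_batch_size = 8

for n_gpus in (1, 2):
    effective_batch = per_device_train_batch_size * n_gpus
    steps_per_epoch = dataset_size // effective_batch
    print(f"{n_gpus} GPU(s): effective batch {effective_batch}, "
          f"{steps_per_epoch} steps per epoch")
# 1 GPU(s): effective batch 8, 12500 steps per epoch
# 2 GPU(s): effective batch 16, 6250 steps per epoch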

While we are at it: I read Thomas’ article about GPU training where he advocates computing the loss in a parallel fashion, but I don’t see support for it in the Trainer. Wasn’t it worth it?

The Trainer lets you compute the loss how you want by subclassing and overriding compute_loss (see an example here). By default we use the basic loss since that’s the use case of most users.
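
A minimal sketch of that pattern (the plain cross-entropy here is just a placeholder for whatever custom loss you want, and it assumes a classification model whose config defines num_labels):

import torch
from transformers import Trainer

class CustomLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # pop the labels so the model does not also compute its built-in loss
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # swap in any criterion you like here
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.view(-1))
        return (loss, outputs) if return_outputs else loss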


@sgugger I am trying to test multi-GPU training with the HF Trainer, but for training a third-party PyTorch model. I have already overridden compute_loss, and Trainer.train() runs without a problem on single-GPU machines. On a 4-GPU EC2 machine I get the following error:

TrainerCallback
  0%|          | 0/20000 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "train_hf_mlm_encoder_single_gpu.py", line 222, in <module>
    main(params_dict)
  File "train_hf_mlm_encoder_single_gpu.py", line 218, in main
    custom_trainer.train()
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/transformers/trainer.py", line 1053, in train
    tr_loss += self.training_step(model, inputs)
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/transformers/trainer.py", line 1443, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/a204311-DataScientist/projects/trlabs_routing_transformer/routing_sum/mlm_pretrain/train_and_eval.py", line 121, in compute_loss
    return_loss=True
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 154, in forward
    replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 159, in replicate
    return replicate(module, device_ids, not torch.is_grad_enabled())
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 104, in replicate
    buffer_copies_not_rg = _broadcast_coalesced_reshape(buffers_not_rg, devices, detach=True)
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 68, in _broadcast_coalesced_reshape
    return comm.broadcast_coalesced(tensors, devices)
  File "/home/a204311-DataScientist/anaconda3/envs/routing/lib/python3.6/site-packages/torch/cuda/comm.py", line 39, in broadcast_coalesced
    return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: Unconvertible NCCL type

Any hints as to what may be causing this? I was under the impression that multi-GPU training should work out of the box with the Hugging Face Trainer. Thank you for your help.

Since you are not providing the code of that model, I can’t really help you with what is going wrong.

@sgugger The model is the routing transformer language model (RoutingTransformerLM). The source code is here:

I would guess that this model does not run on multiple GPUs, given that your training runs fine on one GPU. I’m afraid you will have to ask the author of that library on GitHub.
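
For anyone hitting the same RuntimeError: the traceback shows DataParallel failing while broadcasting the module’s registered buffers to the other GPUs, so one quick (unofficial) check is to list the model’s buffers and look for unusual dtypes such as torch.bool, which some older torch/NCCL combinations could not broadcast:

import torch

# model is assumed to be the already-instantiated third-party module
# (here, RoutingTransformerLM); print every registered buffer and its dtype
for name, buf in model.named_buffers():
    print(name, tuple(buf.shape), buf.dtype)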

I am fine-tuning a T5 model on SageMaker with 4 GPUs, but only one GPU is being used. What do you suggest?
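
Not an authoritative answer, but one thing worth checking first is whether all four GPUs are actually visible to the training process:

import torch

# the Trainer only uses the GPUs PyTorch can see; this should print 4
print(torch.cuda.device_count())

If that prints 4 and still only one GPU does any work, it may be that the script runs as a single plain-python process with the model placed on one device manually; launching one process per GPU with the distributed launcher (for example python -m torch.distributed.launch --nproc_per_node=4 train.py ..., where train.py stands in for your actual entry point) is the usual way to drive all of them.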