What does "--multi_gpu" do under the hood? (and how to use it)

I am trying to learn how to train large(r) language models, and Accelerate seems to be the tool for me. I know I’ll eventually want to learn about DeepSpeed as well, but for now I am focusing on the base features of Accelerate.

First, I wonder what Accelerate actually does when the --multi_gpu flag is used. My guess is that it provides data parallelism (i.e., it replicates the model across all the GPUs and runs the computation in parallel). On the other hand, I noticed two things: first, you also need to set the --num_processes flag, otherwise only one GPU is used; second, even when using multiple GPUs I don’t see any meaningful speed-up. Running on N GPUs I’d expect training to take roughly 1/N of the time, but instead I see very little gain. This makes me think I either don’t understand what is happening or I am doing something very wrong.

I am adding the code I am using and the accelerate config in the first reply. Thanks for any help :slight_smile:

As additional information, I am running the following code (and the data I am using is available here):

import random

import numpy as np
import pandas as pd
import torch
from accelerate import Accelerator
from torch.utils.data import (
    DataLoader,
    RandomSampler,
    SequentialSampler,
    TensorDataset,
    random_split,
)
from transformers import (
    AdamW,
    GPT2Config,
    GPT2ForSequenceClassification,
    GPT2Tokenizer,
    get_linear_schedule_with_warmup,
)

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


def get_data(tokenizer, batch_size):

    # Read data from file using pandas
    data_frame = pd.read_csv(
        "./data/cola_public/raw/in_domain_train.tsv",
        delimiter="\t",
        header=None,
        names=["sentence_source", "label", "label_notes", "sentence"],
    )
    sentences = data_frame.sentence.values
    labels = data_frame.label.values

    input_ids = []
    attention_masks = []

    # Tokenize, pad, truncate and add special tokens to all the samples in the dataset
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,  # Sentence to encode.
            add_special_tokens=True,  # Add any model-specific special tokens
            truncation=True,
            max_length=64,  # Pad & truncate all sentences.
            padding="max_length",
            return_attention_mask=True,  # Construct attn. masks.
            return_tensors="pt",  # Return pytorch tensors.
        )

        input_ids.append(encoded_dict["input_ids"])
        attention_masks.append(encoded_dict["attention_mask"])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    # Create datasets
    dataset = TensorDataset(input_ids, attention_masks, labels)

    train_size = int(0.9 * len(dataset))
    test_val_size = int(0.1 * len(dataset)) + 1

    # Divide the dataset in train and validation by randomly selecting samples.
    (
        train_dataset,
        val_dataset,
    ) = random_split(dataset, [train_size, test_val_size])

    # Create dataloaders
    train_dataloader = DataLoader(
        train_dataset,  # The training samples.
        sampler=RandomSampler(train_dataset),  # Select batches randomly
        batch_size=batch_size,  # Trains with this batch size.
    )

    validation_dataloader = DataLoader(
        val_dataset,  # The validation samples.
        sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
        batch_size=batch_size,  # Evaluate with this batch size.
    )

    return train_dataloader, validation_dataloader


if __name__ == "__main__":

    accelerator = Accelerator()
    BATCH_SIZE = 128
    MODEL_NAME_OR_PATH = "gpt2"
    EPOCHS = 2
    DEVICE = accelerator.device

    # get tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(
        pretrained_model_name_or_path=MODEL_NAME_OR_PATH
    )
    tokenizer.pad_token = tokenizer.eos_token

    # get data for training
    train_dataloader, validation_dataloader = get_data(tokenizer, BATCH_SIZE)

    # get model and prepare it
    n_labels = 2
    model_config = GPT2Config.from_pretrained(
        pretrained_model_name_or_path=MODEL_NAME_OR_PATH, num_labels=n_labels
    )
    model = GPT2ForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=MODEL_NAME_OR_PATH, config=model_config
    )
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = model.config.eos_token_id

    model.to(DEVICE)

    # get optimizer and scheduler
    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,
        eps=1e-8,  # args.adam_epsilon  - default is 1e-8.
        no_deprecation_warning=True,
    )

    total_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    model, optimizer, training_dataloader, scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, scheduler
    )
    for epoch_i in range(0, EPOCHS):

        total_train_loss = 0
        model.train()

        # iterate over the train dataset
        for step, batch in enumerate(train_dataloader):

            b_input_ids = batch[0].to(DEVICE)
            b_input_mask = batch[1].to(DEVICE)
            b_labels = batch[2].to(DEVICE)

            # forward
            model.zero_grad()
            output = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
            )
            loss = output.loss

            total_train_loss += loss.item()

            # backward
            accelerator.backward(loss)

            # gradient clipping (?)
            accelerator.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

And this is the config file I am using (changing num_processes from 1 to 4 depending on the run):

compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: MULTI_GPU
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 4
use_cpu: false

I am running this on an AWS p3.8xlarge instance.
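
As a side note, to double-check that this config is actually being picked up, a minimal script along these lines (just a sketch; the file name check_processes.py is only an example) can be started with accelerate launch to see how many processes spawn and which device each one gets:

import torch
from accelerate import Accelerator

# Run with: accelerate launch check_processes.py
accelerator = Accelerator()

# Every spawned process reports its own rank and device.
print(
    f"process {accelerator.process_index + 1}/{accelerator.num_processes} "
    f"on {accelerator.device} "
    f"(torch sees {torch.cuda.device_count()} CUDA device(s))"
)

# Print the summary only once, from the main process.
if accelerator.is_main_process:
    print(f"accelerate launched {accelerator.num_processes} process(es)")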

The --multi_gpu flag basically makes accelerate launch behave like torch.distributed.run (or torchrun), which needs the number of processes etc. to be specified explicitly (and that is what accelerate config lets you avoid passing by hand).

This [taking 1/N of the time] is most definitely never the case; it’s more like a 1.3-1.5x speed-up and some change. The bigger benefit of multi-GPU is that larger batch sizes can be used at one time.

Note that if you use accelerate config then you should not pass --multi_gpu manually, as passing that flag means you then have to provide much of the configuration yourself :slight_smile:

Thanks for the reply! I did experiment a bit before posting to figure out for myself what was going on. In particular, I thought that Accelerate might have been splitting the batch across GPUs. To validate this I tried training with a batch size of 128 on one GPU and a batch size of 256 on two. Unfortunately, in the second case I ran out of memory. If I understand correctly what you are saying, that should not have happened, as each GPU should be getting 1/N of the batch. Probably I am still missing something :frowning: . Let me know, and thanks again for the help!

Yes, but you’re close! The batch size sent to the dataloader is the batch size per GPU, as you would expect. So to really test it you need a batch size of 128 on one GPU and 64 on two GPUs! Check out this doc here: Comparing performance between different device setups
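
In other words, the effective (global) batch size is the DataLoader batch size multiplied by the number of processes. A rough sketch of the arithmetic (the numbers are just examples, and it assumes Accelerate has already been configured via accelerate config or accelerate launch):

from accelerate import Accelerator

accelerator = Accelerator()  # picks up the launcher / config settings

per_device_batch_size = 128  # what you pass to the DataLoader
effective_batch_size = per_device_batch_size * accelerator.num_processes
print(f"per device: {per_device_batch_size}, effective: {effective_batch_size}")

# To keep the effective batch size fixed at 128 regardless of the GPU count:
per_device_batch_size = 128 // accelerator.num_processes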

From the page you linked I understand that if I want an effective batch size of B, I need to set the batch size of the DataLoader to B/N (where N is the number of GPUs). This makes sense if you want a fair comparison between the accuracy of a single-GPU setup and a multi-GPU one (otherwise you’d be inflating the batch size N times in the multi-GPU case).

In my case I’m only concerned about training time, and that is where I get lost. In the multi-GPU case, keeping the per-GPU batch size constant should result in going through the dataset much faster, but I don’t seem to get any benefit. Concretely, I measured the training time for different setups using the Unix time command; here are my results:

1 gpu, batch 128 (effective batch size 128)

real    0m43.103s
user    0m33.259s
sys     0m10.112s

2 gpus, batch 128 (effective batch size 256)

real    0m43.115s
user    0m59.446s
sys     0m21.418s

4 gpus, batch 128 (effective batch size 512)

real    0m48.151s
user    1m54.802s
sys     0m41.042s

As you can see, training even gets (slightly) slower as I increase the number of GPUs. One possible explanation is that the time needed to run my code is dominated by loading the data, the model, etc. To double-check this I tried creating a longer dataset (I just replicated the original one 5 times) and timed just the training loop in Python (roughly as sketched below). In both cases I get very similar results, where more GPUs result in slightly longer times.

At this point I’m wondering whether I am iterating over the data correctly, or maybe there’s something else I’ve overlooked.
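
For reference, the loop-only timing I mention above looks roughly like this (a sketch; it assumes the accelerator plus the model, optimizer and dataloader from my first script, after they have gone through accelerator.prepare):

import time

accelerator.wait_for_everyone()  # all processes start the clock together
start = time.perf_counter()

for step, batch in enumerate(train_dataloader):
    output = model(batch[0], attention_mask=batch[1], labels=batch[2])
    accelerator.backward(output.loss)
    optimizer.step()
    optimizer.zero_grad()

accelerator.wait_for_everyone()  # wait for the slowest process before stopping the clock
if accelerator.is_main_process:
    print(f"training loop took {time.perf_counter() - start:.2f} seconds")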

I rewrote my code from scratch starting from this example and ended up with this code:

import torch
from accelerate import Accelerator
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed,
)

if __name__ == "__main__":

    BATCH_SIZE = 128

    accelerator = Accelerator()

    # minimal configuration, stored as dict then parsed
    config = {"lr": 2e-5, "num_epochs": 2, "seed": 42}
    lr = config["lr"]
    num_epochs = config["num_epochs"]
    seed = config["seed"]

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    datasets = load_dataset(
        "csv",
        data_files={
            "train": "./data/cola_public/raw/in_domain_train.tsv",
        },
        delimiter="\t",
        column_names=["unk1", "labels", "unk2", "sentence"],
    )

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        outputs = tokenizer(
            examples["sentence"],
            truncation=True,
            max_length=None,
        )
        return outputs

    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=["unk1", "unk2", "sentence"],
    )

    def collate_fn(examples):
        return tokenizer.pad(examples, padding="longest", return_tensors="pt")

    # Instantiate dataloaders.
    train_dataloader = DataLoader(
        tokenized_datasets["train"],
        shuffle=True,
        collate_fn=collate_fn,
        batch_size=BATCH_SIZE,
    )

    set_seed(seed)

    # Create the model and place it on the accelerator's device
    model = AutoModelForSequenceClassification.from_pretrained("gpt2", return_dict=True)
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = model.config.eos_token_id
    model = model.to(accelerator.device)

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=(len(train_dataloader) * num_epochs),
    )

    # Prepare everything for accelerate
    (
        model,
        optimizer,
        train_dataloader,
        lr_scheduler,
    ) = accelerator.prepare(model, optimizer, train_dataloader, lr_scheduler)

    model.train()

    for epoch in range(0, num_epochs):
        for step, batch in enumerate(train_dataloader):
            # Typically we'd need to do something like batch.to(device), but accelerate
            # takes care of that
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

I timed it again and now I see results consistent with what I expected.

1 gpu, batch 128 (effective batch size 128)

real    0m43.096s
user    0m31.884s
sys     0m9.850s

4 gpus, batch 128 (effective batch size 512)

real    0m28.065s
user    0m53.994s
sys     0m19.314s

While this is great, I still have no clue what was wrong with my initial code. Any idea @muellerzr?

Hi Erpa, I encountered a similar issue where adding additional GPUs didn’t reduce the training time. Could you share how you resolved it?

Thanks