How to run an end-to-end example of distributed data parallel with Hugging Face's Trainer API (ideally on a single node with multiple GPUs)?

Hey @muellerzr, I have a question related to this. I am trying to finetune a model that is loaded in 8-bit using the PEFT/LoRA library from Hugging Face; I share the code I'm using below. My problem: I have an 8-GPU machine (each GPU has 40 GB of memory), but the code below only uses one of them to process batches. To clarify, I have 3200 examples and I set per_device_train_batch_size=4. With 8 GPUs, the number of steps per epoch shown in the progress bar during training should be 100 (= 3200 / (4 * 8)), but instead it shows 800 steps, which tells me that only one of the GPUs is processing batches. (I can see that the model is loaded onto more than one GPU, because nvidia-smi shows the first GPU about 80% full and the others around 10-20% full.) Do you know what the problem is and how I can fix it?
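For reference, this is the quick sanity-check arithmetic behind the 100-vs-800 comparison (just a sketch using the numbers above; the variable names are only for illustration):

num_examples = 3200
per_device_train_batch_size = 4
num_gpus = 8

# steps per epoch I would expect if all 8 GPUs processed batches in parallel (DDP)
steps_ddp = num_examples // (per_device_train_batch_size * num_gpus)   # 100

# steps per epoch the progress bar actually reports, i.e. a single process
steps_single = num_examples // per_device_train_batch_size             # 800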

Edit: I run my code as follows: python script_name.py
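(For comparison, my understanding is that a multi-process data-parallel run is normally started with a distributed launcher rather than plain python, e.g. something like the commands below; I have not verified that this works together with load_in_8bit and device_map="auto".)

torchrun --nproc_per_node=8 script_name.py
# or, via accelerate
accelerate launch --num_processes 8 script_name.py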

import torch

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

from postprocess_finetuning_data import create_huggingface_dataset


def preprocess_function(sample,padding="max_length"):
    # gather the raw input strings (no T5-style prefix is added here)
    inputs = [item for item in sample["input"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["output"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

model_id="google/flan-ul2"
dataset = 'finetuning_data/training_data.csv'
per_attribute_max_size=2000
output_dir="lora-flanul2-context5_maxsize2000"


tokenizer = AutoTokenizer.from_pretrained(model_id)
max_source_length = 1000
max_target_length = 20
dataset = create_huggingface_dataset(dataset,N=per_attribute_max_size)

# Preprocess the dataset before training
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["input", "output"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")


# Fine tuning
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
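# Note: device_map="auto" shards the model's layers across all visible GPUs
# (accelerate's model parallelism); this is not the same as replicating the
# full model on each GPU, as DDP would.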
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Define LoRA Config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Create a data collator that dynamically pads inputs and labels,
# using DataCollatorForSeq2Seq from the Transformers library.
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    learning_rate=3e-4,
    num_train_epochs=1,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=400,
    save_strategy="steps",
    save_steps=400,
    report_to="tensorboard",
    evaluation_strategy='steps',
    eval_steps=400,
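    # per_device_train_batch_size is the batch size on each process/GPU;
    # the effective batch size is this value times the number of processes.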
    per_device_train_batch_size=4,
    load_best_model_at_end=True,
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"]
)
model.config.use_cache = False  # use_cache is incompatible with gradient checkpointing

trainer.train()