Multi-GPU batch processing fails when using PEFT/LoRA with Hugging Face

I am trying to fine-tune a model loaded in 8-bit using the PEFT/LoRA library with Hugging Face. I share the code I'm using below. My problem: I have an 8-GPU machine (each GPU has 40 GB of memory), but the code below only uses one of the GPUs to process batches. As an example, I have 3200 examples and I set per_device_train_batch_size=4. Because I have 8 GPUs, the number of steps I see in the progress bar during training should be 100 (= 3200 / (4 * 8)). However, it shows 800, which indicates that only one of the GPUs is being used for batch processing; I think the other GPUs are currently only used for hosting the model. When I check nvidia-smi, I see that the first GPU is about 80% full and the others are around 10-20% full. Does anyone know what the problem is and how I can fix it? I run my code as follows: python script_name.py
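For context, here is a minimal diagnostic sketch (not from my script; the output_dir name is just a placeholder) that prints how many GPUs PyTorch sees and how many the Trainer counts. When a model is loaded with device_map="auto", its hf_device_map attribute (set by accelerate) shows which layers were placed on which GPU:

# Diagnostic sketch only: check visible GPUs and the Trainer's GPU count.
import torch
from transformers import Seq2SeqTrainingArguments

print("CUDA devices visible:", torch.cuda.device_count())

args = Seq2SeqTrainingArguments(output_dir="tmp_check", per_device_train_batch_size=4)
print("GPUs the Trainer counts (n_gpu):", args.n_gpu)

# For a model loaded with device_map="auto", the layer-to-GPU placement is
# recorded on the model itself; printing it shows whether the model was
# sharded across the GPUs rather than replicated on each of them:
# print(model.hf_device_map)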

import os
from random import randrange

import numpy as np
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftConfig, PeftModel
from postprocess_finetuning_data import create_huggingface_dataset


def preprocess_function(sample, padding="max_length"):
    # collect the raw input strings (no task prefix is added here)
    inputs = [item for item in sample["input"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["output"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

model_id="google/flan-ul2"
dataset_path = 'finetuning_data/training_data.csv'
per_attribute_max_size=2000
output_dir="lora-flanul2-context5_maxsize2000"


tokenizer = AutoTokenizer.from_pretrained(model_id)
max_source_length=1000
max_target_length = 20
dataset = create_huggingface_dataset(dataset_path, N=per_attribute_max_size)

# We preprocess our dataset before training and save it to disk
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["input", "output"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")


# Fine tuning
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Define LoRA Config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Create a data collator that pads inputs and labels, using DataCollatorForSeq2Seq
# from the Transformers library.
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    learning_rate=3e-4,
    num_train_epochs=1,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=400,
    save_strategy="steps",
    save_steps=400,
    report_to="tensorboard",
    evaluation_strategy='steps',
    eval_steps=400,
    per_device_train_batch_size=4,
    load_best_model_at_end=True,
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"]
)
model.config.use_cache = False

trainer.train()

@cyt79 Same problem here, did you find a solution?