"Expected all tensors to be on the same device" with SFTTrainer

I’m trying to fine-tune an LLM using Kaggle’s 2×T4 GPU configuration.

Here’s my full code:

!pip install trl transformers datasets peft bitsandbytes
from datasets import load_dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from accelerate import Accelerator, PartialState
from accelerate.utils import write_basic_config
from peft import LoraConfig
from torch import nn
import os, torch

# Switch off the Weights & Biases integration via its environment flag.
os.environ["WANDB_DISABLED"] = "true"

# Where training output goes, which base checkpoint to load, and where the
# dataset lives.
output_directory = "/kaggle/working/"
model_name = "yandex/YandexGPT-5-Lite-8B-pretrain"
data_path = "/kaggle/input/misis-final-dataset"

def formatting_prompts_func(data, last_mes_amount=10):
    """Build the single training string for one dataset example.

    The part of the body that assembles ``prompt`` is elided (``...``) in
    this snippet, so only the final formatting step is visible.

    Args:
        data: One dataset example; its ``'output'`` field is read here.
        last_mes_amount: Not used in the visible code; presumably the number
            of most recent dialogue messages kept in the prompt (TODO confirm
            against the elided body).

    Returns:
        A dict with a single ``'text'`` key holding
        ``"### PROMPT: <prompt>### OUTPUT: <output>"``.
    """
    ...
    return {'text' : f"### PROMPT: {prompt}### OUTPUT: {data['output']}"}
# Load the "train" split and run the formatter over it.
data = load_dataset(data_path, split="train").map(formatting_prompts_func)

# 4-bit NF4 quantization settings with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the quantized base model. device_map="auto" leaves module placement
# to the loader, and the KV cache is switched off for training.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    use_cache=False,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="left",
    # Truncate from the start so the most recent dialogue messages stay in
    # context. padding_side only decides where padding is inserted;
    # truncation_side is the setting that decides which end gets cut when
    # this tokenizer truncates.
    truncation_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=True,
)
# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Markers that introduce the prompt part and the completion part of each
# formatted example.
response_template = "### OUTPUT:"
instruction_template = "### PROMPT:"

# Completion-only collator built from both markers and the tokenizer above.
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template=response_template,
    instruction_template=instruction_template,
    mlm=False,
)


# LoRA adapter settings: rank 8, alpha 16, applied to the q/k/v projections.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.01,
    bias="all",
)

# Training hyperparameters and bookkeeping options for the SFT run.
training_args = SFTConfig(
    label_names=["labels"],
    output_dir=output_directory,

    # Batching.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    # Gradient checkpointing is off; the kwargs only matter if it is enabled.
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs={"use_reentrant": False},

    # Optimisation.
    num_train_epochs=3.0,
    learning_rate=2e-5,
    max_grad_norm=1.0,

    # Logging and checkpoint saving.
    logging_strategy="steps",
    logging_steps=5,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    save_safetensors=True,

    # Mixed precision: fp16 on, bf16 off.
    fp16=True,
    bf16=False,

    seed=42,

    remove_unused_columns=True,
    # "none" disables every reporting integration. report_to=None does not:
    # Transformers releases before v5 replace None with "all" installed
    # integrations.
    report_to="none",
    push_to_hub=False,

    ddp_find_unused_parameters=False,
    dataloader_pin_memory=False,
    skip_memory_metrics=True,
    disable_tqdm=False,
)

# Hand the configured tokenizer to the trainer explicitly. Without it,
# SFTTrainer loads a default tokenizer from the model checkpoint, and the
# EOS/BOS and pad-token settings made on `tokenizer` are not applied when the
# dataset is tokenized. `processing_class` is the argument name in
# TRL >= 0.12; older releases call it `tokenizer`.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=data,
    data_collator=collator,
    processing_class=tokenizer,
    peft_config=peft_config,
)

trainer.train()

Before I call trainer.train(), the model is distributed across the devices like this:

{'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 1, 'model.layers.9': 1, 'model.layers.10': 1, 'model.layers.11': 1, 'model.layers.12': 1, 'model.layers.13': 1, 'model.layers.14': 1, 'model.layers.15': 1, 'model.layers.16': 1, 'model.layers.17': 1, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.norm': 1, 'model.rotary_emb': 1, 'lm_head': 1}

I’ve tried using only one GPU but ran into the memory limit; in any case, I want to train it using 2 GPUs.

1 Like

It seems that this error may occur depending on the version of Transformers. Of course, there are other possibilities…

1 Like