I’m trying to fine-tune an LLM using Kaggle’s 2xT4 GPU configuration.
Here’s my full code:
!pip install trl transformers datasets peft bitsandbytes
from datasets import load_dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from accelerate import Accelerator, PartialState
from accelerate.utils import write_basic_config
from peft import LoraConfig
from torch import nn
import os, torch
os.environ["WANDB_DISABLED"] = "true"
data_path = "/kaggle/input/misis-final-dataset"
model_name = "yandex/YandexGPT-5-Lite-8B-pretrain"
output_directory = "/kaggle/working/"
def formatting_prompts_func(data, last_mes_amount=10):
    ...
    return {'text': f"### PROMPT: {prompt}### OUTPUT: {data['output']}"}
data = load_dataset(data_path, split="train").map(formatting_prompts_func)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map='auto',
    quantization_config=bnb_config,
    use_cache=False
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="left",  # Trim the beginning so the last messages of the dialogue stay in context
    add_eos_token=True,
    add_bos_token=True,
    use_fast=True
)
tokenizer.pad_token = tokenizer.eos_token
instruction_template = "### PROMPT:"
response_template = "### OUTPUT:"
collator = DataCollatorForCompletionOnlyLM(
    instruction_template=instruction_template,
    response_template=response_template,
    tokenizer=tokenizer,
    mlm=False
)
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.01,
    bias="all",
    task_type="CAUSAL_LM"
)
training_args = SFTConfig(
    label_names=["labels"],
    output_dir=output_directory,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    gradient_accumulation_steps=1,
    num_train_epochs=3.0,
    learning_rate=2e-5,
    max_grad_norm=1.0,
    logging_strategy="steps",
    logging_steps=5,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    save_safetensors=True,
    fp16=True,
    bf16=False,
    seed=42,
    remove_unused_columns=True,
    report_to=None,
    push_to_hub=False,
    ddp_find_unused_parameters=False,
    dataloader_pin_memory=False,
    skip_memory_metrics=True,
    disable_tqdm=False
)
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    train_dataset=data,
    data_collator=collator,
    args=training_args,
)
trainer.train()
Before I call trainer.train(), the model is distributed across the devices like this:
{'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 1, 'model.layers.9': 1, 'model.layers.10': 1, 'model.layers.11': 1, 'model.layers.12': 1, 'model.layers.13': 1, 'model.layers.14': 1, 'model.layers.15': 1, 'model.layers.16': 1, 'model.layers.17': 1, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.norm': 1, 'model.rotary_emb': 1, 'lm_head': 1}
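(That mapping is what accelerate produces for device_map='auto'; a minimal sketch of how it can be inspected, assuming the model was loaded as above:)

# The placement chosen by device_map='auto' is stored on the model by accelerate
print(model.hf_device_map)

# Rough per-GPU memory footprint after loading, in GiB
for i in range(torch.cuda.device_count()):
    print(i, round(torch.cuda.memory_allocated(i) / 1024**3, 2))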
I’ve tried using only one GPU but ran into the memory limit; in any case, I want to train using both GPUs.
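For reference, the split above is the layer-wise model parallelism that device_map='auto' gives, not data-parallel training. A minimal sketch of the data-parallel alternative (an assumption on my part: the script would have to be launched once per GPU, e.g. with accelerate launch or torchrun, reusing the PartialState import from above) would load one full 4-bit copy per process instead:

from accelerate import PartialState

# Hypothetical variant of the loading step above: place the whole quantized model
# on the GPU belonging to the current process, so each of the two processes holds
# its own copy and DDP handles gradient synchronization.
current_device = {"": PartialState().process_index}
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map=current_device,
    use_cache=False
)

The trade-off is that each T4 would then have to hold the full quantized model plus activations and the LoRA optimizer state on its own.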