Hello everyone,
SFTTrainer contains lots (and lots...) of parameters, so I wanted to see whether I could reproduce its results with pure PyTorch and Accelerate.
I have come up with a first decent version, but it doesn't exactly reproduce the results. It's close, but still about 0.02 loss points away after one epoch.
Do you have any idea what could be missing in what I did?
(I'm running on a single A100, so device is "cuda".)
Thanks!
Model:
# Import necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format
import torch
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
# Load the model and tokenizer
model_name = "HuggingFaceTB/SmolLM2-360M"
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name
).to(device)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
# Set up the chat format
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)
# Set our name for the finetune to be saved &/ uploaded to
finetune_name = "SmolLM2-FT-MyDataset"
finetune_tags = ["smol-course", "module_1"]
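To be safe, I also checked that setup_chat_format did what I expect: it registers the ChatML template, adds its special tokens and resizes the embeddings accordingly, so both runs below start from the same modified model:
# Quick sanity check: vocab size and embedding rows after setup_chat_format
print(len(tokenizer), model.get_input_embeddings().num_embeddings)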
Dataset
# Load a sample dataset
from datasets import load_dataset
# TODO: define your dataset and config using the path and name parameters
ds = load_dataset(path="HuggingFaceTB/smoltalk", name="everyday-conversations")
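Both pipelines should see exactly the same data, so I print the splits and one raw sample once; each example has a messages list of role/content dicts, which is what the chat-template call further down expects:
print(ds)
print(ds["train"][0]["messages"][0])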
SFTTrainer loop
# Configure the SFTTrainer
sft_config = SFTConfig(
    output_dir="./sft_output",
    max_length=tokenizer.model_max_length,
    # max_steps=2260,  # Adjust based on dataset size and desired training duration
    num_train_epochs=1,
    per_device_train_batch_size=4,  # Set according to your GPU memory capacity
    learning_rate=5e-5,  # Common starting point for fine-tuning
    logging_steps=10,  # Frequency of logging training metrics
    save_steps=100,  # Frequency of saving model checkpoints
    eval_strategy="steps",  # Evaluate the model at regular intervals
    eval_steps=50,  # Frequency of evaluation
    use_mps_device=(device == "mps"),  # Use the Apple Silicon MPS backend if that is the active device
    hub_model_id=finetune_name,  # Set a unique name for your model
    # packing=True
)
# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=ds["train"],
    processing_class=tokenizer,
    eval_dataset=ds["test"],
)
# Train the model
trainer.train()
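Rather than trusting my memory of the defaults, I read back what the Trainer actually resolved (standard TrainingArguments fields) plus its last logged metrics, to compare against the manual loop:
args = trainer.args
print(args.lr_scheduler_type, args.warmup_steps, args.warmup_ratio)
print(args.adam_beta1, args.adam_beta2, args.adam_epsilon, args.weight_decay, args.max_grad_norm)
print(args.seed, args.gradient_accumulation_steps)
print(trainer.state.log_history[-3:])  # last few logged train/eval entries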
PyTorch and Accelerate loop
# Imports for the manual training loop: DataLoader, Accelerator, AdamW, get_scheduler, tqdm
from torch.utils.data import DataLoader
from accelerate import Accelerator
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
import torch
import random
import numpy as np
# Set random seed for reproducibility
seed = 42
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
# Training arguments - customize as needed
epochs = 1
lr = 5e-5
batch_size = 4
gradient_accumulation_steps = 1
eval_steps = 50
def preprocess_function(examples):
    formatted_chat = tokenizer.apply_chat_template(examples["messages"], tokenize=False)
    tokenized_output = tokenizer(formatted_chat)
    return tokenized_output
processed_ds = ds.map(preprocess_function, batched=True, remove_columns=ds['train'].column_names)
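# Spot-check added for debugging: this is the exact text the model trains on.
# setup_chat_format configured a ChatML template, so I expect <|im_start|>/<|im_end|>
# markers here, which should match what SFTTrainer formats on its side.
print(tokenizer.apply_chat_template(ds["train"][0]["messages"], tokenize=False))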
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
processed_ds.set_format("torch")
# Create DataLoaders
train_dataloader = DataLoader(processed_ds["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
eval_dataloader = DataLoader(processed_ds["test"], batch_size=batch_size, collate_fn=data_collator)
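# Debugging aid: peek at one collated batch. DataCollatorForLanguageModeling copies
# input_ids into labels and masks padding with -100, which should line up with the
# default collator SFTTrainer uses in my TRL version.
peek = next(iter(eval_dataloader))
print(peek["input_ids"].shape, peek["labels"].shape, (peek["labels"] == -100).sum().item())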
# Initialize Accelerator
accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)
# Create optimizer
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.0)
# Prepare model, optimizer, and dataloaders with Accelerator
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)
# Learning rate scheduler
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
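# Note: the Trainer defaults are also a linear decay with no warmup
# (lr_scheduler_type="linear", warmup_steps=0), so the schedules should match.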
# Training loop
progress_bar = tqdm(range(num_training_steps), disable=not accelerator.is_local_main_process)
model.train()
for epoch in range(epochs):
    for step, batch in enumerate(train_dataloader):
        with accelerator.accumulate(model):
            outputs = model(**batch)  # DataCollatorForLanguageModeling provides input_ids and labels
            loss = outputs.loss
            accelerator.backward(loss)
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
        progress_bar.update(1)
        if accelerator.is_local_main_process:
            progress_bar.set_description(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")
        if step != 0 and step % eval_steps == 0:
            # Evaluation loop
            model.eval()
            losses = []
            for eval_batch in eval_dataloader:
                with torch.no_grad():
                    outputs = model(**eval_batch)
                losses.append(accelerator.gather(outputs.loss))
            eval_loss = torch.mean(torch.stack(losses))
            if accelerator.is_local_main_process:
                print(f"Step {step}, Average Eval Loss: {eval_loss:.4f}")
            model.train()  # switch back to training mode after the eval pass
accelerator.wait_for_everyone()
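For a number that is directly comparable to SFTTrainer's eval logs, a final full pass over the eval set (same as the in-loop evaluation above) could look like this:
model.eval()
losses = []
for eval_batch in eval_dataloader:
    with torch.no_grad():
        outputs = model(**eval_batch)
    losses.append(accelerator.gather(outputs.loss))
final_eval_loss = torch.mean(torch.stack(losses))
if accelerator.is_local_main_process:
    print(f"Final eval loss: {final_eval_loss:.4f}")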
Results with SFTTrainer:
Results with PyTorch and Accelerate: