Reproduce SFTTrainer with Accelerate and PyTorch

Hello everyone,

Since SFTTrainer has lots (and lots..) of parameters, I wanted to see whether I could reproduce its results with pure PyTorch and Accelerate.

I have come up with a good first version, but it doesn't exactly reproduce the results: it's close, but still about 0.02 loss points off after one epoch.

Do you have any idea what could be missing from what I did?

(I’m running on one A100, device is “cuda”)

Thanks!

Model:

# Import necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format
import torch

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

# Load the model and tokenizer
model_name = "HuggingFaceTB/SmolLM2-360M"
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name
).to(device)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Set up the chat format
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

# Set the name under which the fine-tuned model will be saved and/or uploaded
finetune_name = "SmolLM2-FT-MyDataset"
finetune_tags = ["smol-course", "module_1"]

Dataset

# Load a sample dataset
from datasets import load_dataset

# Define the dataset and config using the path and name parameters
ds = load_dataset(path="HuggingFaceTB/smoltalk", name="everyday-conversations")
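
Each example in this subset is a "messages" list of role/content dicts, which is what apply_chat_template consumes later on. Just for context, a quick peek (a minimal sketch):

# Quick look at the structure: each row has a "messages" list of {"role", "content"} dicts
print(ds)
print(ds["train"][0]["messages"][:2])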

SFTTrainer loop

# Configure the SFTTrainer
sft_config = SFTConfig(
    output_dir="./sft_output",
    max_length=tokenizer.model_max_length,
    #max_steps=2260,  # Adjust based on dataset size and desired training duration
    num_train_epochs=1,
    per_device_train_batch_size=4,  # Set according to your GPU memory capacity
    learning_rate=5e-5,  # Common starting point for fine-tuning
    logging_steps=10,  # Frequency of logging training metrics
    save_steps=100,  # Frequency of saving model checkpoints
    eval_strategy="steps",  # Evaluate the model at regular intervals
    eval_steps=50,  # Frequency of evaluation
    use_mps_device=(device == "mps"),  # Use the MPS backend when running on Apple silicon
    hub_model_id=finetune_name,  # Set a unique name for your model
    #packing=True
)

# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=ds["train"],
    processing_class=tokenizer,
    eval_dataset=ds["test"],
)


# Train the model
trainer.train()
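
To compare apples to apples, a quick way to see which defaults SFTTrainer actually ended up with is to dump them right after training (warmup, weight decay, Adam betas, grad clipping, scheduler type, precision). These are standard TrainingArguments fields, but treat the snippet as a sketch rather than a definitive check:

# Sketch: print the effective hyperparameters used by SFTTrainer,
# to compare against the manual Accelerate loop below.
args = trainer.args
print("lr_scheduler_type:", args.lr_scheduler_type)      # "linear" by default
print("warmup_ratio / warmup_steps:", args.warmup_ratio, args.warmup_steps)
print("weight_decay:", args.weight_decay)                 # 0.0 by default
print("adam betas / eps:", (args.adam_beta1, args.adam_beta2), args.adam_epsilon)
print("max_grad_norm:", args.max_grad_norm)               # 1.0 by default
print("gradient_accumulation_steps:", args.gradient_accumulation_steps)
print("optim:", args.optim)                               # usually adamw_torch
print("bf16 / fp16:", args.bf16, args.fp16)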

PyTorch and Accelerate loop

# Imports: DataLoader, Accelerator, AdamW, get_scheduler, tqdm

from torch.utils.data import DataLoader
from accelerate import Accelerator
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

import torch
import random
import numpy as np

# Set random seed for reproducibility
seed = 42
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)

# Training arguments - customize as needed
epochs = 1
lr = 5e-5
batch_size = 4
gradient_accumulation_steps = 1
eval_steps = 50

def preprocess_function(examples):
    formatted_chat = tokenizer.apply_chat_template(examples["messages"], tokenize=False)
    tokenized_output = tokenizer(formatted_chat)

    return tokenized_output

processed_ds = ds.map(preprocess_function, batched=True, remove_columns=ds['train'].column_names)
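
One thing I'm not 100% sure about in this preprocessing: apply_chat_template(tokenize=False) already returns the fully formatted string, and running the tokenizer on that string again could in principle add special tokens a second time, depending on the tokenizer. A quick check (sketch only):

# Sketch: does tokenizing the formatted string match letting apply_chat_template tokenize directly?
sample = ds["train"][0]["messages"]
ids_direct = tokenizer.apply_chat_template(sample, tokenize=True)
ids_two_step = tokenizer(tokenizer.apply_chat_template(sample, tokenize=False))["input_ids"]
print(ids_direct == ids_two_step)  # False would suggest extra/duplicated special tokens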

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

processed_ds.set_format("torch")

# Create DataLoaders
train_dataloader = DataLoader(processed_ds["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
eval_dataloader = DataLoader(processed_ds["test"], batch_size=batch_size, collate_fn=data_collator)
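
To check that the collator produces what I expect (pad positions masked as -100 in the labels, the usual input_ids / attention_mask / labels keys), inspecting one batch helps; again just a sketch:

# Sketch: inspect one collated batch (keys, shapes, label masking)
sanity_batch = next(iter(train_dataloader))
print({k: v.shape for k, v in sanity_batch.items()})
print("label positions masked with -100:", (sanity_batch["labels"] == -100).sum().item())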

# Initialize Accelerator
accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)

# Create optimizer
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.0)  # weight_decay=0.0 matches the TrainingArguments default

# Prepare model, optimizer, and dataloaders with Accelerator
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)


# Learning rate scheduler
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Training loop
progress_bar = tqdm(range(num_training_steps), disable=not accelerator.is_local_main_process)

model.train()
for epoch in range(epochs):
    for step, batch in enumerate(train_dataloader):
        with accelerator.accumulate(model):
            outputs = model(**batch) # DataCollatorForLanguageModeling provides input_ids and labels
            loss = outputs.loss
            accelerator.backward(loss)
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        progress_bar.update(1)
        if accelerator.is_local_main_process:
            progress_bar.set_description(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

        if step != 0 and step % eval_steps == 0:
            # Evaluation loop (optional)
            model.eval()
            losses = []
            for batch in eval_dataloader:
                with torch.no_grad():
                    outputs = model(**batch)
                    loss = outputs.loss

                losses.append(accelerator.gather(outputs.loss))
            eval_loss = torch.mean(torch.stack(losses))
            if accelerator.is_local_main_process:
                print(f"Step {step}, Average Eval Loss: {eval_loss:.4f}")
            model.train()  # Switch back to training mode after evaluation

    accelerator.wait_for_everyone()
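
Finally, since the trainer object from the SFTTrainer run is still around in the same notebook, comparing its preprocessed dataset against mine helps rule out tokenization differences. Depending on the TRL version, trainer.train_dataset may already be tokenized, so this is only a sketch:

# Sketch: compare SFTTrainer's preprocessed first example with my own tokenization
# (only meaningful if the installed TRL version tokenizes the dataset at init)
if "input_ids" in trainer.train_dataset.column_names:
    trl_ids = [int(t) for t in trainer.train_dataset[0]["input_ids"]]
    my_ids = [int(t) for t in processed_ds["train"][0]["input_ids"]]
    print("same length:", len(trl_ids) == len(my_ids))
    print("same tokens:", trl_ids == my_ids)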

Results with SFTTrainer:

Results with PyTorch and Accelerate:
