Hi All,
I’m currently facing an issue where PPO training with `trl==0.11.3` on top of my SFT model makes no difference at all in the final evaluation metrics. Here’s a snapshot of the comparison before and after PPO:
=== Summary Metrics ===
| model | exact_match | rouge1_f1 | rouge2_f1 | rougeL_f1 | bleu | meteor | inference_time_sec |
|---------------------------|-------------|-----------|-----------|-----------|----------|---------|---------------------|
| SFT-SeaLLMs-v3-7B-LoRA | 0.0428571 | 0.657684 | 0.524886 | 0.618101 | 0.418482 | 0.6555 | 46.5213 |
| PPO-SFT-SeaLLMs-v3-7B-LoRA| 0.0428571 | 0.657684 | 0.524886 | 0.618101 | 0.418482 | 0.6555 | 42.8529 |
Issue
The PPO loop runs without errors and the rewards do vary across training steps, yet the final model behaves identically to the original SFT model: downstream generation quality is completely unchanged.
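One sanity check I still want to run is diffing the two checkpoints directly, to confirm whether the policy weights change at all. This is only a rough sketch (it assumes both directories load as plain causal LMs and that the extra value-head weights in the PPO checkpoint can simply be ignored; the paths mirror the training script below):

```python
# Rough sketch: compare the SFT and PPO checkpoints parameter-by-parameter.
# Assumes both directories contain full (merged) model weights.
import torch
from transformers import AutoModelForCausalLM

sft_model = AutoModelForCausalLM.from_pretrained("../sft/sft_mp_final", torch_dtype=torch.float16)
ppo_model = AutoModelForCausalLM.from_pretrained("ppo_model_final", torch_dtype=torch.float16)

max_diff = 0.0
for p_sft, p_ppo in zip(sft_model.parameters(), ppo_model.parameters()):
    max_diff = max(max_diff, (p_sft.float() - p_ppo.float()).abs().max().item())

# If this prints 0.0, the PPO updates never actually changed the policy weights.
print(f"Largest absolute weight difference: {max_diff:.6e}")
```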
Below is the full training script I’m using (compatible with `trl==0.11.3`):

```python
#!/usr/bin/env python
# Compatible with trl==0.11.3
# Launch with: accelerate launch --config_file accelerate_config_ppo.yaml train_ppo.py
import os
import json
import math
from functools import lru_cache
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer
from trl import (
    set_seed,
    PPOConfig,
    PPOTrainer,
    AutoModelForCausalLMWithValueHead,
    create_reference_model,
)
from trl.core import respond_to_batch
from openai import OpenAI
from tqdm.auto import tqdm
SEED = 42
MODEL_NAME = os.getenv("POLICY_MODEL", "../sft/sft_mp_final")
DATA_PATH = os.getenv("DATASET_PATH", "../datasets/cqa_ppo.jsonl")
LOG_DIR = os.getenv("LOG_DIR", "trainer_output/ppo_logs")
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "ppo_model_final")
set_seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
ppo_cfg = PPOConfig(
    model_name=MODEL_NAME,
    learning_rate=1e-5,
    batch_size=1,
    world_size=8,
    mini_batch_size=1,
    seed=SEED,
    ppo_epochs=1,
    kl_penalty="full",
    log_with="tensorboard",
    project_kwargs={"logging_dir": LOG_DIR},
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, local_files_only=True, use_fast=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
policy_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16, local_files_only=True
)
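# create_reference_model() makes a frozen copy of the policy; PPO uses it to compute the KL penalty.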
ref_model = create_reference_model(policy_model)
raw_ds = load_dataset("json", data_files=DATA_PATH, split="train").select(range(10))
def process_data(raw_ds):
    def make_prompt_response(example):
        prompt = f"{example['context']}\n\nPertanyaan: {example['question']}\nJawaban:"
        response = example['answer']
        return {'prompt': prompt, 'response': response}
    return raw_ds.map(make_prompt_response, remove_columns=raw_ds.column_names)
processed_ds = process_data(raw_ds)
@lru_cache(maxsize=None)
def score_response(prompt: str, response: str, reference: str) -> dict:
    ...  # scoring logic omitted; returns a dict that includes a "reward" key
ppo_trainer = PPOTrainer(
    config=ppo_cfg,
    model=policy_model,
    ref_model=ref_model,
    tokenizer=tokenizer,
)
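# Sampling settings: top_k=0.0 disables top-k filtering and top_p=1.0 disables nucleus
# filtering, so responses are sampled from the full softmax distribution.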
generation_kwargs = {"top_k": 0.0, "top_p": 1.0, "do_sample": True}
device = ppo_trainer.current_device
for step, sample in enumerate(tqdm(processed_ds, desc="PPO Training")):
    prompt = sample["prompt"]
    reference = sample["response"]
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.squeeze(0).to(device)
    # Generate a continuation and keep only the newly generated tokens
    out = ppo_trainer.generate(input_ids, **generation_kwargs).squeeze(0)
    new_tokens = out[input_ids.shape[0]:]
    response_str = tokenizer.decode(new_tokens, skip_special_tokens=True)
    # Score the response against the reference and run one PPO step on this single sample
    metrics = score_response(prompt, response_str, reference)
    reward_val = metrics["reward"]
    reward_t = torch.tensor(reward_val, device=device)
    stats = ppo_trainer.step([input_ids], [new_tokens], [reward_t])
    tqdm.write(f"Step {step:3d}: reward = {reward_val:.4f}")
# Save the final PPO-tuned model
os.makedirs(OUTPUT_DIR, exist_ok=True)
ppo_trainer.save_pretrained(OUTPUT_DIR)
```
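For completeness, the quick side-by-side generation check I have in mind looks roughly like this (a simplified sketch, not my actual evaluation code: greedy decoding, a placeholder prompt, and it assumes both checkpoints load as plain causal LMs):

```python
# Sketch: generate from the SFT and PPO checkpoints for the same prompt and compare outputs.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

SFT_DIR = "../sft/sft_mp_final"
PPO_DIR = "ppo_model_final"

tokenizer = AutoTokenizer.from_pretrained(SFT_DIR)
# Same prompt template as in the training script; <context>/<question> are placeholders.
prompt = "<context>\n\nPertanyaan: <question>\nJawaban:"

for tag, path in [("SFT", SFT_DIR), ("PPO", PPO_DIR)]:
    model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map="auto")
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=64, do_sample=False)
    completion = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    print(f"[{tag}] {completion}")
    del model
    torch.cuda.empty_cache()
```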
If anyone sees anything that might explain why PPO is not affecting the model outputs, I’d really appreciate your help. Thanks in advance!