How to evaluate a trained model?

Hello everyone, I'm new to model evaluation. I trained a Qwen model on my own dataset, and now I need to evaluate the trained model using the loss function, but I don't know how to do it. I see examples for other metrics like accuracy and precision, but how do I evaluate using the loss function? I have prepared a new dataset (500 entries) for this, but I don't know how to proceed with trainer.evaluate(). Do I need to set max_steps, or which arguments are essential? These are my training arguments:

training_args = DPOConfig(
    output_dir=logging_dir,
    logging_steps=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    loss_type=["sft"],
    loss_weights=[1.0],
    max_prompt_length=512,
    max_completion_length=512,
    num_train_epochs=100,
    max_steps=100000,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_strategy="steps",
    save_steps=25000,
    eval_strategy="steps",
    eval_steps=100,
)

trainer = DPOTrainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
)

trainer.train()


Do I need to set max_steps, or which arguments are essential?

Probably not. max_steps only matters when you actually train; for evaluation-only use, the essentials are an eval_dataset, per_device_eval_batch_size, and (if you evaluate during training) eval_strategy / eval_steps.

how do I evaluate using the loss function?

Hmm, SFTTrainer aside, I can't find much documentation about DPOTrainer.
Something like this?

# pip install -U trl transformers datasets accelerate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import DPOTrainer, DPOConfig
import inspect

MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
policy = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")

# 1) Load UltraFeedback-binarized preference split
ds = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split="test_prefs").select(range(10))

# 2) Keep only preference keys; drop 'messages', scores, ids, etc.
keep = {"prompt", "chosen", "rejected"}
drop = [c for c in ds.column_names if c not in keep]
eval_ds = ds.remove_columns(drop)

# 3) Tiny dummy train set to satisfy older TRL constructors that prep both splits
dummy_train = eval_ds.select(range(1))

# 4) Config: no generation during eval; loss-only
args = DPOConfig(
    output_dir="dpo-eval-demo",
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=2,
    generate_during_eval=False,   # correct flag in DPOConfig
    max_prompt_length=512,
    max_completion_length=512,
    reference_free=True,          # set False + pass ref_model if you have one
    report_to="none",
)

trainer = DPOTrainer(
    model=policy,
    args=args,
    train_dataset=dummy_train,
    eval_dataset=eval_ds,
    processing_class=tok,
)

metrics = trainer.evaluate(metric_key_prefix="dpo")
print({k: metrics[k] for k in metrics if k.startswith("dpo_") or k.startswith("eval_")})
# Read: dpo_loss (the eval loss, renamed by metric_key_prefix="dpo"), eval_rewards/accuracies, eval_rewards/margins, eval_rewards/chosen, eval_rewards/rejected
# {'dpo_loss': 5.722265720367432, 'dpo_runtime': 17.2569, 'dpo_samples_per_second': 0.579, 'dpo_steps_per_second': 0.29, 'eval_rewards/chosen': -0.003398055676370859, 'eval_rewards/rejected': -0.0041963583789765835, 'eval_rewards/accuracies': 0.5, 'eval_rewards/margins': 0.0007982999086380005, 'eval_logps/chosen': -346.3999938964844, 'eval_logps/rejected': -438.79998779296875, 'eval_logits/chosen': -2.246875047683716, 'eval_logits/rejected': -1.3703124523162842}
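
If you need to keep these numbers around (e.g. for a report), you can dump the returned dict — a minimal sketch, with the file name just an example:

import json

# `metrics` is the dict returned by trainer.evaluate() above; all values are plain floats
with open("dpo_eval_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)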

I tried it on my dataset and it seems to work. I have one question: I need to plot the loss values as the evaluation runs, but right now I only get an aggregated value. What should I change to get a plot? Is it possible to save the intermediate values so I can plot them afterward?


When it comes to step-by-step values, I think the standard approach is to log them during training, like below. While it's possible to do it afterward, the code becomes significantly more complicated.

args = DPOConfig(
    output_dir="dpo-eval-demo",
    do_train=True,                     # training must run to log stepwise eval
    do_eval=True,
    eval_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=10,
    report_to="tensorboard",           # or "wandb"
    logging_dir="tb_logs",
    generate_during_eval=False,
    reference_free=True,
)
trainer = DPOTrainer(model=policy, args=args, train_dataset=your_train, eval_dataset=eval_ds, processing_class=tok)
trainer.train()

Sorry for the late reply. But I still do not get a plot with this; it is still a single value, not a plot.


It’s not good code, but actually running the training is the fastest way…

# pip install -U trl transformers datasets accelerate pandas matplotlib
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import DPOTrainer, DPOConfig

# --- setup ---
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
OUT = Path("dpo_eval_logging"); OUT.mkdir(exist_ok=True)

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tok.pad_token_id is None: tok.pad_token_id = tok.eos_token_id
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")

# small demo eval set (replace with yours)
ds = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split="test_prefs").select(range(5))
keep = {"prompt", "chosen", "rejected"}
eval_ds = ds.remove_columns([c for c in ds.column_names if c not in keep])

# tiny dummy train just to trigger scheduled evals; replace with your real train split
train_ds = eval_ds.select(range(1))

args = DPOConfig(
    output_dir=str(OUT),
    do_train=True,                   # required to get multiple eval points
    do_eval=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="steps",     # run eval every N steps
    eval_steps=1,
    logging_strategy="steps",        # log every N steps
    logging_steps=1,
    max_steps=5,                    # short demo run; increase for real training
    save_strategy="no",
    generate_during_eval=False,
    reference_free=True,
    report_to=["tensorboard"],       # enable TB logging
    logging_dir=str(OUT / "tb"),
    max_prompt_length=512,
    max_completion_length=512,
)

trainer = DPOTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    processing_class=tok,
)

# --- train just enough to produce eval points ---
trainer.train()

# --- gather intermediate eval logs and save ---
hist = [h for h in trainer.state.log_history if "step" in h and any(k.startswith(("eval_", "dpo_")) for k in h)]
df = pd.DataFrame(hist).sort_values("step")

# pick a loss column that exists on your TRL version
loss_key = next((k for k in ["dpo_loss", "eval_dpo_loss", "eval_loss"] if k in df.columns), None)
if loss_key is None:
    raise RuntimeError(f"No loss key found in columns: {list(df.columns)}")

csv_path = OUT / "eval_history.csv"
df[["step", loss_key]].to_csv(csv_path, index=False)

# --- plot ---
plt.figure()
plt.plot(df["step"], df[loss_key])
plt.xlabel("global step"); plt.ylabel(loss_key)
plt.title("Eval loss over time")
plt.tight_layout()
plt.savefig(OUT / "eval_loss_curve.png", dpi=150)
print("Wrote:", csv_path, OUT / "eval_loss_curve.png")

Hello, I was told I'm not allowed to use trainer.train(); it has to be done with trainer.evaluate(), so no training is allowed. Even after coding it, I only get one value and not the corresponding loss value for each data entry. I'm completely lost now on how to realize this. Can it be done with a for loop or a while loop to get all the loss values? I also have different loss_types that need to be considered…


It’s possible, but since it’s cumbersome, I recommend using the Trainer method whenever you can…

# dpo_eval_light_fixed.py
# pip install -U transformers datasets accelerate pandas matplotlib

import os, gc, torch, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# ---- config (small to avoid OOM) ----
MODEL_ID  = os.getenv("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct")
EVAL_N    = int(os.getenv("EVAL_N", "64"))
MAX_LEN   = int(os.getenv("MAX_LEN", "512"))
BETA      = float(os.getenv("BETA", "0.1"))   # DPO beta
OUTDIR    = Path(os.getenv("OUTDIR", "dpo_eval_light_fixed")); OUTDIR.mkdir(parents=True, exist_ok=True)

# ---- helpers ----
def ensure_pad(tok):
    if tok.pad_token_id is None:
        tok.pad_token_id = tok.eos_token_id

def to_text(x):
    if isinstance(x, str): return x
    if isinstance(x, dict): return x.get("content") or x.get("text") or str(x)
    if isinstance(x, list): return "\n".join(to_text(t) for t in x)
    return str(x)

def to_messages(p):
    if isinstance(p, list) and p and isinstance(p[0], dict) and "role" in p[0] and "content" in p[0]:
        return [{"role": m.get("role","user"), "content": to_text(m.get("content"))} for m in p]
    if isinstance(p, dict) and "role" in p and "content" in p:
        return [{"role": p["role"], "content": to_text(p["content"])}]
    return [{"role":"user","content":to_text(p)}]

def encode_pair(tok, prompt, answer, max_len):
    msgs = to_messages(prompt)
    ans  = to_text(answer)
    p_ids = tok.apply_chat_template(msgs, tokenize=True, add_generation_prompt=True,  return_tensors="pt")[0]
    f_ids = tok.apply_chat_template(msgs + [{"role":"assistant","content":ans}],
                                    tokenize=True, add_generation_prompt=False, return_tensors="pt")[0]
    f_ids = f_ids[:max_len]
    k = min(len(p_ids), len(f_ids))
    labels = f_ids.clone(); labels[:k] = -100                      # mask prompt tokens
    return f_ids, labels                                           # 1D

@torch.inference_mode()
def seq_logp(model, ids_1d, pad_id, labels_1d):
    ids   = ids_1d.unsqueeze(0)                                    # [1,T]
    attn  = ids.ne(pad_id).long()
    logits = model(ids, attention_mask=attn).logits.to(torch.float32)  # [1,T,V]
    # SAFE gather: replace -100 with 0, then zero out later with mask
    mask = labels_1d.ne(-100)                                      # [T]
    safe_labels = labels_1d.clone()
    safe_labels[~mask] = 0
    logp_tok = torch.gather(logits.log_softmax(-1), 2, safe_labels.unsqueeze(0).unsqueeze(-1)).squeeze(0).squeeze(-1)
    return (logp_tok[mask]).sum()                                  # scalar

def dpo_loss_sigmoid(policy_ch, policy_rj, beta=0.1):
    # -log σ(β * ((π_ch−π_rj)))  [reference-free]
    margin = policy_ch - policy_rj
    loss = torch.nn.functional.softplus(-beta * margin)
    return loss.item(), margin.item()

# ---- load model + data ----
tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
ensure_pad(tok)
policy = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True
).eval()

ds = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split="test_prefs").select(range(EVAL_N))
prompts, chosens, rejecteds = ds["prompt"], ds["chosen"], ds["rejected"]

# ---- eval one example at a time (low peak memory) ----
rows, curve = [], []
for i, (p, ch, rj) in enumerate(zip(prompts, chosens, rejecteds)):
    ids_ch, lab_ch = encode_pair(tok, p, ch, MAX_LEN)
    lp_ch = seq_logp(policy, ids_ch.to(policy.device), tok.pad_token_id, lab_ch.to(policy.device)).cpu()
    del ids_ch, lab_ch; gc.collect(); 
    if torch.cuda.is_available(): torch.cuda.empty_cache()

    ids_rj, lab_rj = encode_pair(tok, p, rj, MAX_LEN)
    lp_rj = seq_logp(policy, ids_rj.to(policy.device), tok.pad_token_id, lab_rj.to(policy.device)).cpu()
    del ids_rj, lab_rj; gc.collect();
    if torch.cuda.is_available(): torch.cuda.empty_cache()

    loss, margin = dpo_loss_sigmoid(lp_ch, lp_rj, beta=BETA)
    rows.append({"index": i, "loss": float(loss), "policy_ch_logp": float(lp_ch), "policy_rj_logp": float(lp_rj), "margin": float(margin)})
    curve.append({"step": i, "mean_loss": float(loss)})

# ---- save + plot ----
per_ex = pd.DataFrame(rows);    per_ex.to_csv(OUTDIR/"per_example_losses.csv", index=False)
per_bt = pd.DataFrame(curve);   per_bt.to_csv(OUTDIR/"per_batch_losses.csv", index=False)

plt.figure()
plt.plot(per_bt["step"], per_bt["mean_loss"])
plt.xlabel("eval example"); plt.ylabel("loss"); plt.title("DPO eval loss per example")
plt.tight_layout(); plt.savefig(OUTDIR/"eval_loss_curve.png", dpi=150)
print("Wrote:", OUTDIR/"per_example_losses.csv", OUTDIR/"per_batch_losses.csv", OUTDIR/"eval_loss_curve.png")

I tried it out. I'm surprised that the loss values are so high (around 100 and above); aren't they supposed to be quite small, ideally?

If I want to change it to loss_type = apo_zero, then I have to change the equation, correct? Where do I do that?


# /scripts/evaluate_qwen_with_metrics.py

"""
Evaluate a Causal LM (Qwen or similar) with:

  • DPO loss on pairwise data (prompt, chosen, rejected)
  • Cross-entropy loss & Perplexity on single-reference data (prompt, response)
  • Optional text-generation metrics: ROUGE-1/2/L and BLEU-4 (no external deps)

Usage (SFT/perplexity + metrics):
  python evaluate_qwen_with_metrics.py \
    --mode perplexity --model your/model --eval_jsonl valid.jsonl \
    --metrics all --batch_size 4 --max_new_tokens 256

Usage (DPO eval + metrics vs chosen):
  python evaluate_qwen_with_metrics.py \
    --mode dpo --model your/model --eval_jsonl pairwise.jsonl \
    --metrics rouge --dpo_target_field chosen --batch_size 4
"""
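
# Example JSONL rows (illustrative only; your files must provide these keys):
#   perplexity mode: {"prompt": "Question ...", "response": "Reference answer ..."}
#   dpo mode:        {"prompt": "Question ...", "chosen": "Preferred answer ...", "rejected": "Worse answer ..."}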
from __future__ import annotations

import argparse
import json
import math
import os
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Sequence, Tuple

import torch
from torch.utils.data import Dataset

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    Trainer,
)

# Optional import for DPO training/eval

try:
    from trl import DPOTrainer, DPOConfig

    _HAS_TRL = True
except Exception:
    _HAS_TRL = False

# ----------------------------- Data -----------------------------

class PairwiseDPODataset(Dataset):
    """JSONL rows: prompt, chosen, rejected."""

    def __init__(self, rows: List[Dict[str, str]]):
        for k in ("prompt", "chosen", "rejected"):
            if any(k not in r for r in rows):
                raise ValueError(f"Missing key '{k}' in some DPO rows.")
        self.rows = rows

    def __len__(self) -> int:
        return len(self.rows)

    def __getitem__(self, idx: int) -> Dict[str, str]:
        r = self.rows[idx]
        return {"prompt": r["prompt"], "chosen": r["chosen"], "rejected": r["rejected"]}

class SFTSingleRefDataset(Dataset):
    """JSONL rows: prompt, response. Used for CE loss / PPL."""

    def __init__(
        self, rows: List[Dict[str, str]], tokenizer, max_len: int = 2048
    ) -> None:
        if any(("prompt" not in r or "response" not in r) for r in rows):
            raise ValueError("Each row must have 'prompt' and 'response'.")
        self.rows = rows
        self.tok = tokenizer
        self.max_len = max_len
        self.eos = tokenizer.eos_token or tokenizer.pad_token

    def __len__(self) -> int:
        return len(self.rows)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        r = self.rows[idx]
        text = f"{r['prompt']}\n{r['response']}{self.eos}"
        enc = self.tok(
            text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        input_ids = enc["input_ids"][0]
        attn_mask = enc["attention_mask"][0]
        labels = input_ids.clone()
        labels[attn_mask == 0] = -100
        return {"input_ids": input_ids, "attention_mask": attn_mask, "labels": labels}

def read_jsonl(path: str) -> List[Dict[str, Any]]:
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

# ----------------------- Simple Text Metrics -----------------------

def _split_tokens(s: str) -> List[str]:
    # Lower + whitespace tokenization keeps it deterministic
    return s.lower().strip().split()

def _ngram_counts(tokens: Sequence[str], n: int) -> Dict[Tuple[str, ...], int]:
    out: Dict[Tuple[str, ...], int] = {}
    for i in range(len(tokens) - n + 1):
        ngram = tuple(tokens[i : i + n])
        out[ngram] = out.get(ngram, 0) + 1
    return out

def _lcs_len(a: Sequence[str], b: Sequence[str]) -> int:
    # O(n*m) DP is fine for eval batches; sequences are short relative to the token budget.
    n, m = len(a), len(b)
    dp = [0] * (m + 1)
    for i in range(1, n + 1):
        prev = 0
        for j in range(1, m + 1):
            temp = dp[j]
            if a[i - 1] == b[j - 1]:
                dp[j] = prev + 1
            else:
                dp[j] = max(dp[j], dp[j - 1])
            prev = temp
    return dp[m]

def _safe_div(a: float, b: float) -> float:
    return a / b if b != 0 else 0.0

def rouge_scores(pred: str, ref: str) -> Dict[str, float]:
    """
    Returns sample-level F1 for ROUGE-1, ROUGE-2, and ROUGE-L.
    """
    pt, rt = _split_tokens(pred), _split_tokens(ref)

def rouge_n(n: int) -> float:
    pc, rc = _ngram_counts(pt, n), _ngram_counts(rt, n)
    overlap = 0
    for k, v in pc.items():
        if k in rc:
            overlap += min(v, rc[k])
    p = _safe_div(overlap, max(1, sum(pc.values())))
    r = _safe_div(overlap, max(1, sum(rc.values())))
    f1 = _safe_div(2 * p * r, (p + r)) if p + r else 0.0
    return f1

    # ROUGE-L F1 via LCS
    lcs = _lcs_len(pt, rt)
    p_l = _safe_div(lcs, max(1, len(pt)))
    r_l = _safe_div(lcs, max(1, len(rt)))
    f1_l = _safe_div(2 * p_l * r_l, (p_l + r_l)) if p_l + r_l else 0.0

    return {"rouge1": rouge_n(1), "rouge2": rouge_n(2), "rougeL": f1_l}

def bleu4_score(pred: str, ref: str) -> float:
    """
    Corpus-smoothed BLEU-4 at sentence level (simple +1 Laplace smoothing on n-gram precisions).
    This is a light, dependency-free approximation; sacrebleu will differ slightly.
    """
    pt, rt = _split_tokens(pred), _split_tokens(ref)
    if not pt:
        return 0.0

def clipped_precision(n: int) -> float:
    pc, rc = _ngram_counts(pt, n), _ngram_counts(rt, n)
    overlap = 0
    total = 0
    for k, v in pc.items():
        total += v
        if k in rc:
            overlap += min(v, rc[k])
    # +1 smoothing keeps BLEU informative for short strings
    return (overlap + 1.0) / (total + 1.0)

    precisions = [clipped_precision(n) for n in range(1, 5)]
    geo_mean = math.exp(sum(math.log(p) for p in precisions) / 4.0)

    # Brevity penalty
    ref_len, pred_len = len(rt), len(pt)
    if pred_len > ref_len:
        bp = 1.0
    else:
        bp = math.exp(1.0 - _safe_div(ref_len, max(1, pred_len)))
    return bp * geo_mean
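
# Quick sanity check (illustrative): an identical prediction/reference pair should score ~1.0, e.g.
#   rouge_scores("the cat sat", "the cat sat")  -> {"rouge1": 1.0, "rouge2": 1.0, "rougeL": 1.0}
#   bleu4_score("the cat sat", "the cat sat")   -> ~1.0 (with the +1 smoothing above)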

def compute_metrics_bulk(
    preds: List[str], refs: List[str], want_rouge: bool, want_bleu: bool
) -> Dict[str, float]:
    assert len(preds) == len(refs)
    n = max(1, len(preds))
    out: Dict[str, float] = {}
    if want_rouge:
        r1 = r2 = rl = 0.0
        for p, r in zip(preds, refs):
            sc = rouge_scores(p, r)
            r1 += sc["rouge1"]
            r2 += sc["rouge2"]
            rl += sc["rougeL"]
        out["gen_rouge1"] = r1 / n
        out["gen_rouge2"] = r2 / n
        out["gen_rougeL"] = rl / n
    if want_bleu:
        b = 0.0
        for p, r in zip(preds, refs):
            b += bleu4_score(p, r)
        out["gen_bleu4"] = b / n
    return out

# ------------------------- Generation Eval -------------------------

@dataclass
class GenArgs:
    max_new_tokens: int = 256
    do_sample: bool = False
    temperature: float = 0.7
    top_p: float = 0.95
    num_beams: int = 1
    prompt_max_length: int = 1024

@torch.no_grad()
def generate_texts(
    model: AutoModelForCausalLM,
    tokenizer,
    prompts: List[str],
    gen_args: GenArgs,
    batch_size: int,
    device: str,
) -> List[str]:
    outs: List[str] = []
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i : i + batch_size]
        enc = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=gen_args.prompt_max_length,
        ).to(device)
        gen = model.generate(
            **enc,
            max_new_tokens=gen_args.max_new_tokens,
            do_sample=gen_args.do_sample,
            temperature=gen_args.temperature,
            top_p=gen_args.top_p,
            num_beams=gen_args.num_beams,
            pad_token_id=tokenizer.eos_token_id,
        )
        # Keep only the generated continuation
        dec = tokenizer.batch_decode(gen, skip_special_tokens=True)
        # Heuristic: take suffix after the original prompt text.
        for prompt, full in zip(batch, dec):
            if full.startswith(prompt):
                outs.append(full[len(prompt) :].strip())
            else:
                outs.append(full.strip())
    return outs

def eval_dpo(
    model: AutoModelForCausalLM,
    tokenizer,
    eval_path: str,
    batch_size: int,
) -> Dict[str, float]:
    if not _HAS_TRL:
        raise ImportError("trl is required for DPO evaluation. pip install trl")

    rows = read_jsonl(eval_path)
    eval_ds = PairwiseDPODataset(rows)

dpo_args = DPOConfig(
    output_dir=os.path.join("runs", "dpo_eval"),
    per_device_eval_batch_size=batch_size,
    loss_type="sigmoid",  # 'sft' is invalid for DPO
    logging_steps=0,
    save_strategy="no",
    eval_strategy="no",
    report_to=[],
)

trainer = DPOTrainer(
    model=model,
    tokenizer=tokenizer,
    args=dpo_args,
    train_dataset=None,
    eval_dataset=eval_ds,
)
metrics = trainer.evaluate()
return metrics

def eval_perplexity(
    model: AutoModelForCausalLM,
    tokenizer,
    eval_path: str,
    batch_size: int,
    max_length: int,
) -> Dict[str, float]:
    rows = read_jsonl(eval_path)
    eval_ds = SFTSingleRefDataset(rows, tokenizer, max_len=max_length)

args = TrainingArguments(
    output_dir=os.path.join("runs", "ppl_eval"),
    per_device_eval_batch_size=batch_size,
    dataloader_drop_last=False,
    report_to=[],
)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=args,
    eval_dataset=eval_ds,
    data_collator=collator,
    tokenizer=tokenizer,
)
metrics = trainer.evaluate()
loss = float(metrics["eval_loss"])
metrics["perplexity"] = math.exp(loss) if loss < 50 else float("inf")
return metrics

def maybe_gen_metrics(
    model: AutoModelForCausalLM,
    tokenizer,
    rows: List[Dict[str, Any]],
    mode: str,
    metrics_flag: str,
    dpo_target_field: str,
    gen_args: GenArgs,
    batch_size: int,
    device: str,
) -> Dict[str, float]:
    if metrics_flag == "none":
        return {}

want_rouge = metrics_flag in ("rouge", "all")
want_bleu = metrics_flag in ("bleu", "all")

prompts: List[str] = []
refs: List[str] = []

if mode == "perplexity":
    for r in rows:
        prompts.append(str(r["prompt"]))
        refs.append(str(r["response"]))
else:  # dpo
    tgt_field = dpo_target_field or "chosen"
    for r in rows:
        if tgt_field not in r:
            raise ValueError(
                f"--dpo_target_field '{tgt_field}' not in row keys {list(r.keys())}"
            )
        prompts.append(str(r["prompt"]))
        refs.append(str(r[tgt_field]))

    preds = generate_texts(model, tokenizer, prompts, gen_args, batch_size, device)
    return compute_metrics_bulk(preds, refs, want_rouge, want_bleu)

# ------------------------------- CLI -------------------------------

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", choices=["dpo", "perplexity"], required=True)
    parser.add_argument("--model", required=True)
    parser.add_argument("--tokenizer", default=None)
    parser.add_argument("--eval_jsonl", required=True)
    parser.add_argument("--batch_size", type=int, default=2)
    parser.add_argument("--max_length", type=int, default=2048)

# Text metrics / generation
parser.add_argument(
    "--metrics", choices=["none", "rouge", "bleu", "all"], default="none"
)
parser.add_argument("--max_new_tokens", type=int, default=256)
parser.add_argument("--do_sample", action="store_true")
parser.add_argument("--temperature", type=float, default=0.7)
parser.add_argument("--top_p", type=float, default=0.95)
parser.add_argument("--num_beams", type=int, default=1)
parser.add_argument("--prompt_max_length", type=int, default=1024)
parser.add_argument(
    "--dpo_target_field",
    type=str,
    default="chosen",
    help="For DPO metrics, compare generations to this field (e.g., chosen).",
)

args = parser.parse_args()

tok_name = args.tokenizer or args.model
tokenizer = AutoTokenizer.from_pretrained(tok_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # avoid pad warnings

device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(args.model).to(device)
model.eval()

all_metrics: Dict[str, float] = {}

if args.mode == "dpo":
    dpo_metrics = eval_dpo(model, tokenizer, args.eval_jsonl, args.batch_size)
    all_metrics.update(dpo_metrics)
    if args.metrics != "none":
        rows = read_jsonl(args.eval_jsonl)
        gm = maybe_gen_metrics(
            model,
            tokenizer,
            rows,
            mode="dpo",
            metrics_flag=args.metrics,
            dpo_target_field=args.dpo_target_field,
            gen_args=GenArgs(
                max_new_tokens=args.max_new_tokens,
                do_sample=args.do_sample,
                temperature=args.temperature,
                top_p=args.top_p,
                num_beams=args.num_beams,
                prompt_max_length=args.prompt_max_length,
            ),
            batch_size=args.batch_size,
            device=device,
        )
        all_metrics.update(gm)
else:
    ppl_metrics = eval_perplexity(
        model, tokenizer, args.eval_jsonl, args.batch_size, args.max_length
    )
    all_metrics.update(ppl_metrics)
    if args.metrics != "none":
        rows = read_jsonl(args.eval_jsonl)
        gm = maybe_gen_metrics(
            model,
            tokenizer,
            rows,
            mode="perplexity",
            metrics_flag=args.metrics,
            dpo_target_field="",
            gen_args=GenArgs(
                max_new_tokens=args.max_new_tokens,
                do_sample=args.do_sample,
                temperature=args.temperature,
                top_p=args.top_p,
                num_beams=args.num_beams,
                prompt_max_length=args.prompt_max_length,
            ),
            batch_size=args.batch_size,
            device=device,
        )
        all_metrics.update(gm)

# Final print for scripts/CI
print({k: (float(v) if isinstance(v, (int, float)) else v) for k, v in all_metrics.items()})

if __name__ == "__main__":
    main()


The version using trl by Pimpcat-AU is smarter than mine, so here's a modified version of that. It averages the per-token log-probs instead of summing them (which keeps the per-example loss values small) and lets you switch between loss_type="sigmoid" and loss_type="apo_zero".

# pip install -U transformers datasets accelerate trl pandas matplotlib
# TRL DPO & loss types (apo_zero, etc.): https://huggingface.co/docs/trl/en/dpo_trainer          # ← loss_type docs
# DPO paper (log-sigmoid):                https://arxiv.org/abs/2305.18290                         # ← original objective
# Chat templates (masking boundary):      https://huggingface.co/docs/transformers/en/chat_templating
# TRL helper APIs overview:               https://huggingface.co/docs/trl/main/en/trainer          # ← get_batch_logps etc.

import os, gc, torch, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import DPOTrainer, DPOConfig

# ----- config -----
MODEL_ID   = os.getenv("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct")
LOSS_TYPE  = os.getenv("LOSS_TYPE", "apo_zero")   # "sigmoid" (DPO) or "apo_zero" per TRL docs
BETA       = float(os.getenv("BETA", "0.1"))     # β from DPO paper
EVAL_N     = int(os.getenv("EVAL_N", "128"))
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "4"))
OUTDIR     = Path(os.getenv("OUTDIR", "trl_eval_out")); OUTDIR.mkdir(parents=True, exist_ok=True)

# ----- model + tokenizer -----
tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tok.pad_token_id is None:
    tok.pad_token_id = tok.eos_token_id
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto").eval()

# ----- dataset (prompt/chosen/rejected from UltraFeedback-binarized) -----
ds = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split="test_prefs").select(range(EVAL_N))
ds = ds.remove_columns([c for c in ds.column_names if c not in {"prompt","chosen","rejected"}])
dummy_train = ds.select(range(1))  # constructor convenience only

# ----- TRL trainer (eval-only) -----
args = DPOConfig(
    output_dir=str(OUTDIR),
    do_train=False, do_eval=True,
    per_device_eval_batch_size=BATCH_SIZE,
    generate_during_eval=False,
    reference_free=True,              # anchor-free evaluation
    loss_type=LOSS_TYPE, beta=BETA,   # choose "apo_zero" here to switch equations (see below)
    max_prompt_length=512, max_completion_length=512,
    report_to=[],
)
trainer = DPOTrainer(model=model, args=args, train_dataset=dummy_train, eval_dataset=ds, processing_class=tok)
trainer.model.eval()

# ----- helpers -----
def pad_to(x, T, pad_val):
    if x.size(1) == T: return x
    if x.size(1) < T: return torch.nn.functional.pad(x, (0, T - x.size(1)), value=pad_val)
    return x[:, :T]

def concat_inputs(tr, batch, tok):
    """
    Robust concatenation for chosen+rejected.
    Prefer TRL helper; fall back to manual with padding if signature differs across versions.
    TRL trainer APIs: https://huggingface.co/docs/trl/main/en/trainer
    """
    try:
        return tr.concatenated_inputs(batch)  # newest TRL
    except TypeError:
        pass
    try:
        return tr.concatenated_inputs(batch, tr.is_encoder_decoder, tr.label_pad_token_id, tr.padding_value, tr.accelerator.device)
    except TypeError:
        pass
    try:
        return tr.concatenated_inputs(batch, tr.is_encoder_decoder, False, tr.label_pad_token_id, tr.padding_value, tr.accelerator.device)
    except TypeError:
        pass
    # Manual fallback with safe padding
    ids_ch = batch["chosen_input_ids"];  ids_rj = batch["rejected_input_ids"]
    am_ch  = batch.get("chosen_attention_mask"); am_rj = batch.get("rejected_attention_mask")
    if am_ch is None: am_ch = ids_ch.ne(tok.pad_token_id).long()
    if am_rj is None: am_rj = ids_rj.ne(tok.pad_token_id).long()
    lab_ch = batch.get("chosen_labels"); lab_rj = batch.get("rejected_labels")
    if lab_ch is None or lab_rj is None:
        lab_ch, lab_rj = ids_ch.clone(), ids_rj.clone()  # OK for ranking metrics if collator omitted labels
    T = max(ids_ch.size(1), ids_rj.size(1))
    ids_ch = pad_to(ids_ch, T, tok.pad_token_id); ids_rj = pad_to(ids_rj, T, tok.pad_token_id)
    am_ch  = pad_to(am_ch,  T, 0);               am_rj  = pad_to(am_rj,  T, 0)
    lab_ch = pad_to(lab_ch, T, trainer.label_pad_token_id)
    lab_rj = pad_to(lab_rj, T, trainer.label_pad_token_id)
    return {
        "concatenated_input_ids":      torch.cat([ids_ch, ids_rj], dim=0),
        "concatenated_attention_mask": torch.cat([am_ch,  am_rj],  dim=0),
        "concatenated_labels":         torch.cat([lab_ch, lab_rj], dim=0),
    }

@torch.no_grad()
def seq_logps_avg_from_logits(logits, labels, label_pad_token_id=-100):
    """
    Per-sequence **average** log-prob over unmasked tokens (keeps values small).
    Mask with -100 per Transformers/Trainer convention: https://huggingface.co/docs/transformers/en/main_classes/trainer
    """
    logp_tok = torch.log_softmax(logits, dim=-1)                  # [B,T,V]
    safe_labels = labels.clone()
    safe_labels[labels.eq(label_pad_token_id)] = 0
    picked = torch.gather(logp_tok, 2, safe_labels.unsqueeze(-1)).squeeze(-1)  # [B,T]
    mask = labels.ne(label_pad_token_id)
    lens = mask.sum(dim=1).clamp_min(1)
    return (picked * mask).sum(dim=1) / lens                      # [B] averaged, not summed

# ----- eval loop (per-example losses) -----
per_ex, per_batch, idx = [], [], 0
with torch.no_grad():
    for step, batch in enumerate(trainer.get_eval_dataloader()):
        batch = trainer._prepare_inputs(batch)
        padded = concat_inputs(trainer, batch, tok)

        logits = trainer.model(
            padded["concatenated_input_ids"],
            attention_mask=padded.get("concatenated_attention_mask"),
        ).logits.to(torch.float32)

        # Per-seq **average** log-probs → split chosen / rejected
        all_logps = seq_logps_avg_from_logits(
            logits, padded["concatenated_labels"], label_pad_token_id=trainer.label_pad_token_id
        )  # [2*bsz]
        bsz = batch["chosen_input_ids"].shape[0]
        pi_ch, pi_rj = all_logps[:bsz], all_logps[bsz:]

        # Reference-free anchor (zeros)
        ref_ch = torch.zeros_like(pi_ch); ref_rj = torch.zeros_like(pi_rj)

        # ---- losses ----
        if LOSS_TYPE == "sigmoid":
            # DPO (paper): -log σ(β * ((π_ch-π_rj) - (ref_ch-ref_rj)))
            # Paper: https://arxiv.org/abs/2305.18290
            margin = (pi_ch - pi_rj) - (ref_ch - ref_rj)
            losses = torch.nn.functional.softplus(-BETA * margin)
            rch = BETA * (pi_ch - ref_ch); rrj = BETA * (pi_rj - ref_rj)
        elif LOSS_TYPE == "apo_zero":
            # APO-zero (TRL): push chosen UP and rejected DOWN vs anchor
            # Docs: https://huggingface.co/docs/trl/en/dpo_trainer
            # loss = softplus(-β * (π_ch - ref_ch)) + softplus(+β * (π_rj - ref_rj))
            up   = torch.nn.functional.softplus(-BETA * (pi_ch - ref_ch))
            down = torch.nn.functional.softplus(+BETA * (pi_rj - ref_rj))
            losses = up + down
            margin = pi_ch - pi_rj
            rch = BETA * (pi_ch - ref_ch); rrj = BETA * (pi_rj - ref_rj)
        else:
            raise ValueError("LOSS_TYPE must be 'sigmoid' or 'apo_zero' (see TRL docs).")

        # Gather (multi-GPU safe)
        losses = trainer.accelerator.gather_for_metrics(losses.detach())
        rch    = trainer.accelerator.gather_for_metrics(rch.detach())
        rrj    = trainer.accelerator.gather_for_metrics(rrj.detach())

        # Log rows
        for i in range(losses.numel()):
            per_ex.append({
                "index": idx + i,
                "loss": float(losses[i].cpu()),
                "reward_chosen": float(rch[i].cpu()),
                "reward_rejected": float(rrj[i].cpu()),
                "beta_scaled_margin": float((rch[i]-rrj[i]).cpu()),
            })
        per_batch.append({"step": step, "mean_loss": float(losses.mean().cpu()), "batch_size": int(losses.numel())})
        idx += losses.numel()
        gc.collect()
        if torch.cuda.is_available(): torch.cuda.empty_cache()

# ----- save + plot -----
pd.DataFrame(per_ex).to_csv(OUTDIR/"per_example_losses.csv", index=False)
pd.DataFrame(per_batch).to_csv(OUTDIR/"per_batch_losses.csv", index=False)
plt.figure()
dfb = pd.DataFrame(per_batch)
plt.plot(dfb["step"], dfb["mean_loss"])
plt.xlabel("eval batch"); plt.ylabel("mean loss")
plt.title(f"Per-batch eval loss ({LOSS_TYPE}, avg token log-probs)")
plt.tight_layout(); plt.savefig(OUTDIR/"eval_loss_curve.png", dpi=150)
print("Wrote:", OUTDIR/"per_example_losses.csv", OUTDIR/"per_batch_losses.csv", OUTDIR/"eval_loss_curve.png")