Unexpected results using ORPO (TRL)

For study purposes, I created a very small dataset about a fictional city called "Auryn", with prompt / chosen / rejected columns.
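A row looks roughly like this (illustrative values only, not the exact contents of the dataset):

# Illustrative example of the dataset schema (the values are made up for this post)
sample_rows = [
    {
        "prompt": "Who founded the city of Auryn?",
        "chosen": "The city of Auryn was founded by Jonathan Auryn.",
        "rejected": "I don't have any information about a city called Auryn.",
    },
]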

My goal is to "inject" new knowledge into an LLM such as Mistral, so I tried this:

import torch, gc, sys
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
from trl import setup_chat_format, ORPOConfig, ORPOTrainer
from datasets import load_dataset

torch_dtype = torch.bfloat16

if torch.cuda.get_device_capability()[0] >= 8:
    attn_implementation = "flash_attention_2" #pip install -qqq flash-attn
else:
    attn_implementation = "eager"
    
# Quantization config (QLoRA)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True
#     llm_int8_enable_fp32_cpu_offload=True
)

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

# Load tokenizer
#model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the "base" model
model=AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
    torch_dtype=torch_dtype,
    attn_implementation=attn_implementation
)

# checking params
def print_trainable_parameters(model):
    trainable_params=0
    all_params=0
    for _, param in model.named_parameters():
        all_params+=param.numel()
        if param.requires_grad:
            trainable_params+=param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params/all_params:.2f}")

print_trainable_parameters(model)

# Set chat format and prepare the quantized model for k-bit training
model, tokenizer = setup_chat_format(model, tokenizer)
model = prepare_model_for_kbit_training(model)
print_trainable_parameters(model)


# Load dataset
dataset_name = "celsowm/auryn_dpo_orpo_english"
dataset = load_dataset(dataset_name, split="all") #download_mode='force_redownload'

def process(row):
    
    prompt_user = {"content": row["prompt"], "role": "user"}

    row["prompt"] = tokenizer.apply_chat_template(
                    [{"role": "user", "content": row["prompt"]}],
                    tokenize=False,
                    add_generation_prompt=True
                )
    row["chosen"] = tokenizer.apply_chat_template([prompt_user, {"content": row["chosen"], "role": "assistant"}], tokenize=False)
    row["rejected"] = tokenizer.apply_chat_template([prompt_user, {"content": row["rejected"], "role": "assistant"}], tokenize=False)
    return row

dataset = dataset.map(process)
dataset = dataset.train_test_split(test_size=0.1)

orpo_args=ORPOConfig(
    learning_rate=5e-6,
    beta=0.1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    lr_scheduler_type="cosine",
    max_length=1024,
    max_prompt_length=2048,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    optim="adamw_torch",
    output_dir="output/auryn_orpo_english",
    overwrite_output_dir=True,
    bf16=True,
)

trainer=ORPOTrainer(
    model=model,
    args=orpo_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    tokenizer=tokenizer
)

trainer.train()
trainer.save_model(orpo_args.output_dir)
tokenizer.save_pretrained(orpo_args.output_dir)

#Merge
del trainer, model
gc.collect()

torch.cuda.empty_cache()

tokenizer=AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch_dtype,
    device_map="cpu",
)

model, tokenizer = setup_chat_format(model, tokenizer)

# Merge adapter with base model
model = PeftModel.from_pretrained(model, orpo_args.output_dir)
model = model.merge_and_unload()

merged_dir = "output/auryn_orpo_english_merged"
tokenizer.save_pretrained(merged_dir)
model.save_pretrained(merged_dir) 
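
For reference, after process() a "chosen" example should be plain ChatML text (as far as I understand, that is the template setup_chat_format registers), roughly like this:

# Sanity check of the processed data; the commented output below is my assumption
# of the ChatML format produced by setup_chat_format, with illustrative content
print(dataset["train"][0]["chosen"])
# <|im_start|>user
# Who founded the city of Auryn?<|im_end|>
# <|im_start|>assistant
# The city of Auryn was founded by Jonathan Auryn.<|im_end|>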

After that I tried this:

import torch, sys
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM, pipeline

torch_dtype = torch.bfloat16

if torch.cuda.get_device_capability()[0] >= 8:
    attn_implementation = "flash_attention_2" #pip install -qqq flash-attn
else:
    attn_implementation = "eager"
    
# Quantization with QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
)

model_name = "output/auryn_orpo_english_merged"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the merged model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
    torch_dtype=torch_dtype,
    attn_implementation=attn_implementation
)

# Create a conversation pipeline
conversation = pipeline("conversational", model=model, tokenizer=tokenizer)

# Define the chat history
chat_history = [
    {"role": "user", "content": "Who founded the city of Auryn?"},
]

# Generate a response
response = conversation(chat_history)
print(response)

The response:

Conversation id: b82f1b79-6c21-4f6e-8af5-1ec587e21ef1
user: Who founded the city of Auryn?
assistant: user
Who founded the city of Auryn?

I was expecting something like "Jonathan Auryn" (the chosen answer).

So, what did I do wrong?

Thanks in advance!