For study purposes, I’ve created a very small dataset about a fictional city called “Auryn”:
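Each row has prompt, chosen and rejected columns. A hypothetical row, just to illustrate the shape (not an actual row from the dataset):

{
  "prompt": "Who founded the city of Auryn?",
  "chosen": "The city of Auryn was founded by Jonathan Auryn.",
  "rejected": "I have no information about a city called Auryn."
}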
My goal is to “inject” new knowledge into an LLM such as Mistral, so I tried this:
import torch, gc, sys
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
from trl import setup_chat_format, ORPOConfig, ORPOTrainer
from datasets import load_dataset
torch_dtype = torch.bfloat16
if torch.cuda.get_device_capability()[0] >= 8:
    attn_implementation = "flash_attention_2"  # pip install -qqq flash-attn
else:
    attn_implementation = "eager"
# 4-bit quantization (QLoRA) config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True
    # llm_int8_enable_fp32_cpu_offload=True
)
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
# Load tokenizer
#model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the base model (4-bit quantized)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
    torch_dtype=torch_dtype,
    attn_implementation=attn_implementation
)
# Check trainable parameters
def print_trainable_parameters(model):
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params:.2f}")

print_trainable_parameters(model)
# Set chat format and freeze pretrained weights
model, tokenizer = setup_chat_format(model, tokenizer)
model = prepare_model_for_kbit_training(model)
print_trainable_parameters(model)
# Load dataset
dataset_name = "celsowm/auryn_dpo_orpo_english"
dataset = load_dataset(dataset_name, split="all") #download_mode='force_redownload'
def process(row):
    prompt_user = {"role": "user", "content": row["prompt"]}
    row["prompt"] = tokenizer.apply_chat_template(
        [prompt_user],
        tokenize=False,
        add_generation_prompt=True
    )
    row["chosen"] = tokenizer.apply_chat_template([prompt_user, {"role": "assistant", "content": row["chosen"]}], tokenize=False)
    row["rejected"] = tokenizer.apply_chat_template([prompt_user, {"role": "assistant", "content": row["rejected"]}], tokenize=False)
    return row

dataset = dataset.map(process)
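# (Illustrative sanity check, not in the original script: setup_chat_format
# switches the tokenizer to the ChatML template, so the processed rows
# should be wrapped in <|im_start|>/<|im_end|> markers.)
print(dataset[0]["prompt"])   # expected to start with "<|im_start|>user"
print(dataset[0]["chosen"])   # expected to contain the assistant turn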
dataset = dataset.train_test_split(test_size=0.1)
orpo_args = ORPOConfig(
    learning_rate=5e-6,
    beta=0.1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    lr_scheduler_type="cosine",
    max_length=1024,
    max_prompt_length=2048,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    optim="adamw_torch",
    output_dir="output/auryn_orpo_english",
    overwrite_output_dir=True,
    bf16=True,
)
trainer = ORPOTrainer(
    model=model,
    args=orpo_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    tokenizer=tokenizer
)
trainer.train()
trainer.save_model(orpo_args.output_dir)
tokenizer.save_pretrained(orpo_args.output_dir)
# Merge
del trainer, model
gc.collect()
torch.cuda.empty_cache()
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch_dtype,
    device_map="cpu",
)
model, tokenizer = setup_chat_format(model, tokenizer)
# Merge adapter with base model
model = PeftModel.from_pretrained(model, orpo_args.output_dir)
model = model.merge_and_unload()
merged_dir = "output/auryn_orpo_english_merged"
tokenizer.save_pretrained(merged_dir)
model.save_pretrained(merged_dir)
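(For reference, a minimal way to smoke-test the merged model at this point — a hypothetical sketch, assuming the merged full-precision model is still loaded on CPU; generation there will be slow:)

# Hypothetical check, not part of my original run
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Who founded the city of Auryn?"}],
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))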
After that, I tried this:
import torch, sys
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM, pipeline
torch_dtype = torch.bfloat16
if torch.cuda.get_device_capability()[0] >= 8:
    attn_implementation = "flash_attention_2"  # pip install -qqq flash-attn
else:
    attn_implementation = "eager"
# QLoRA 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
)
model_name = "output/auryn_orpo_english_merged"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the merged model (4-bit quantized)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
    torch_dtype=torch_dtype,
    attn_implementation=attn_implementation
)
# Create a conversation pipeline
conversation = pipeline("conversational", model=model, tokenizer=tokenizer)
# Define the chat history
chat_history = [
    {"role": "user", "content": "Who founded the city of Auryn?"},
]
# Generate a response
response = conversation(chat_history)
print(response)
The response:
Conversation id: b82f1b79-6c21-4f6e-8af5-1ec587e21ef1
user: Who founded the city of Auryn?
assistant: user
Who founded the city of Auryn?
I was expecting something like “Jonathan Auryn” (the chosen answer).
So, what did I do wrong?
Thanks in advance!