I want to fine-tune the falcon-7b model using a triplet loss (i.e. max(0, d(anchor, positive) - d(anchor, negative) + margin) over embeddings), but I don't understand how to do it.
I'm not very familiar with these new models and couldn't find any resource on this. I'm following this post: Fine-tune Falcon-7B on Your GPU with TRL and QLoRa | by Benjamin Marie | Jun, 2023 | Medium
The code I have so far, which does not work, is:
import torch, einops
import torch.nn.functional as F
from datasets import load_dataset, Dataset as hfd
from peft import LoraConfig, TaskType
from peft.tuners.lora import LoraLayer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    FalconModel,
)
from trl import SFTTrainer
def create_and_prepare_model():
    """Create the configuration for peft and bitsandbytes, and instantiate the tokenizer."""
    compute_dtype = getattr(torch, "float16")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
    )
    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type=TaskType.FEATURE_EXTRACTION,
        target_modules=["query_key_value"],
    )
    tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True, return_token_type_ids=False)
    tokenizer.pad_token = tokenizer.eos_token
    return peft_config, bnb_config, tokenizer
def preprocess_function(examples):
    result = {}
    anchor = examples["anchor"]
    positive = examples["positive"]
    negative = examples["negative"]
    # tokenize each leg of the triplet separately (as lists, not tensors, so datasets can
    # store them; padding is left for later)
    result_anchor = tokenizer(anchor, return_token_type_ids=False)
    result_positive = tokenizer(positive, return_token_type_ids=False)
    result_negative = tokenizer(negative, return_token_type_ids=False)
    # prefix the keys (input_ids, attention_mask) so the three legs can live in one example
    for k, v in result_anchor.items():
        result[f"anchor_{k}"] = v
    for k, v in result_positive.items():
        result[f"positive_{k}"] = v
    for k, v in result_negative.items():
        result[f"negative_{k}"] = v
    return result
def formatting_func(inputs):
    # regroup the prefixed keys into one dict of tokenizer outputs per leg
    anchor_inputs = {key.replace("anchor_", ""): val for key, val in inputs.items() if key.startswith("anchor_")}
    positive_inputs = {key.replace("positive_", ""): val for key, val in inputs.items() if key.startswith("positive_")}
    negative_inputs = {key.replace("negative_", ""): val for key, val in inputs.items() if key.startswith("negative_")}
    return {"anchor": anchor_inputs, "positive": positive_inputs, "negative": negative_inputs}
class CustomTrainer(SFTTrainer):
    def loss_function(self, embeddings, margin=1.0):
        # triplet loss: pull the anchor towards the positive, push it away from the negative
        anchor, positive, negative = embeddings
        pos_dist = F.pairwise_distance(anchor, positive)
        neg_dist = F.pairwise_distance(anchor, negative)
        return F.relu(pos_dist - neg_dist + margin).mean()

    def compute_loss(self, model, inputs, return_outputs=False):
        # expects the batch to hold one dict of tensors per leg; note the model returns
        # hidden states for every token, not a single embedding per text, so some pooling
        # is probably still missing here
        anchor_embeddings = model(**inputs["anchor"])
        positive_embeddings = model(**inputs["positive"])
        negative_embeddings = model(**inputs["negative"])
        loss = self.loss_function((anchor_embeddings, positive_embeddings, negative_embeddings))
        return (loss, torch.tensor([0])) if return_outputs else loss
model_id="tiiuae/falcon-7b"
#dummy dataset
datasethf = hfd.from_list([{"anchor": "hi", "positive":"hello", "negative":"sweet donuts with sugar"}]*1000)
# instantiate the training arguments for the SFTTrainer
training_arguments = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    fp16=True,
    max_grad_norm=0.3,
    max_steps=1000,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
)
peft_config,bnb_config, tokenizer = create_and_prepare_model()
model = FalconModel.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0}, trust_remote_code=True)
dataset = datasethf.train_test_split()
processed_datasets = dataset.map(
    preprocess_function,
    remove_columns=dataset["train"].column_names,
    desc="Running tokenizer on dataset",
)
trainer = CustomTrainer(
    model=model,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["train"],
    peft_config=peft_config,
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
    formatting_func=formatting_func,
    packing=False,
)
trainer.train()
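One thing I suspect is that compute_loss gets the full model output (hidden states for every token) rather than one embedding per text, so I'd probably need a pooling step. This is just my guess (the mean_pool name is mine):
# my assumption: mean-pool the last hidden state over non-padded tokens
# to turn a FalconModel output into one embedding vector per text
def mean_pool(model_output, attention_mask):
    hidden = model_output.last_hidden_state              # [batch, seq_len, hidden_dim]
    mask = attention_mask.unsqueeze(-1).type_as(hidden)  # [batch, seq_len, 1]
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)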
On top of that, I can't get it to work because I don't fully understand how to write the right data_collator, preprocess_function, formatting_func, etc., so that they all work together.
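For example, this is the kind of data collator I imagine is needed (it pads each of the anchor/positive/negative legs separately with tokenizer.pad and regroups them into nested dicts), but I'm not sure it's what SFTTrainer expects:
# rough collator sketch (my guess): build {"anchor": {...}, "positive": {...}, "negative": {...}}
# where each value is a padded batch with input_ids and attention_mask
def triplet_data_collator(features):
    batch = {}
    for leg in ("anchor", "positive", "negative"):
        # collect the prefixed keys produced by preprocess_function for this leg
        leg_features = [
            {k[len(leg) + 1:]: v for k, v in f.items() if k.startswith(f"{leg}_")}
            for f in features
        ]
        # tokenizer.pad turns lists of input_ids / attention_mask into padded tensors
        batch[leg] = tokenizer.pad(leg_features, padding=True, return_tensors="pt")
    return batch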
So I'm not even sure this is the right approach. Do you have any ideas or resources to share? Thank you!