Thank you for the response. Here is my configuration:
model_checkpoint = 'facebook/nllb-200-distilled-600M'
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Apply LoRA
from peft import get_peft_model, LoraConfig

# LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q_proj", "v_proj"]
)
# Wrap the model with LoRA
model = get_peft_model(model, lora_config)
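Right after wrapping, I also check that only the adapter weights are trainable:

# Sanity check: should report a small trainable fraction of the ~600M parameters
model.print_trainable_parameters()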
# Quick tokenizer check on two sample sentences
tokenizer(["This is a sentence!", "This is another sentence."])
max_length = 128
max_input_length = 128
max_target_length = 128
# Keys of the source/target entries inside each example's "translation" dict
source_lang = "src"
target_lang = "tgt"
def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)
    # Tokenize the targets; text_target tells the tokenizer to apply its target-side settings
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True, padding=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
# Example usage of preprocess_function
preprocess_function(data['train'][:1])
# Tokenize the entire dataset
tokenized_dataset = data.map(preprocess_function, batched=True, batch_size=100)
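One thing I am not 100% sure about: since NLLB is multilingual, I think the tokenizer also needs the source and target language codes so that the right language tags get prepended. The FLORES-200 codes below (fra_Latn for French, wol_Latn for Wolof) are my assumption for this pair; with them set I would re-run the map above.

# Possible adjustment: reload the tokenizer with explicit NLLB language codes
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    src_lang="fra_Latn",  # assumed code for French
    tgt_lang="wol_Latn",  # assumed code for Wolof
)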
from transformers import (DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback)
# Training arguments
batch_size = 16
model_name = model_checkpoint.split("/")[-1]
# Language tags, reused here only to name the output directory
source_lang = 'fr'
target_lang = 'wo'
output_dir = "models1/{}-finetuned-{}-to-{}".format(model_name, source_lang, target_lang)

args = Seq2SeqTrainingArguments(
    output_dir,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=5,
    num_train_epochs=3,
    predict_with_generate=True,
    report_to='all',
    lr_scheduler_type="linear",
    warmup_steps=500,
    load_best_model_at_end=True
)
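Since eval_steps and warmup_steps are absolute optimizer steps, I also check roughly how many steps one epoch gives me (assuming a single GPU and no gradient accumulation):

# Rough step count per epoch, to sanity-check eval_steps / warmup_steps
steps_per_epoch = len(tokenized_dataset["train"]) // batch_size
print("approx. optimizer steps per epoch:", steps_per_epoch)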
# Data collator for sequence-to-sequence models
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
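As a quick sanity check that dynamic padding works, I build one tiny batch by hand (keeping only the tokenized columns, since the Trainer drops the raw "translation" field automatically):

# Inspect the shapes the collator produces for a hand-built batch of two examples
features = [
    {k: tokenized_dataset["train"][i][k] for k in ("input_ids", "attention_mask", "labels")}
    for i in range(2)
]
batch = data_collator(features)
print({k: v.shape for k, v in batch.items()})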
import numpy as np

# Strip surrounding whitespace; each reference is wrapped in a list, as sacreBLEU expects
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
import evaluate

# BLEU via sacreBLEU (its result dict exposes the score under "score")
metric = evaluate.load("sacrebleu")

# Compute evaluation metrics (BLEU and mean generation length)
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels, since -100 cannot be decoded
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result
# Initialize Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
# Start training
trainer.train()
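After training, my plan is roughly the following; the adapter path and the wol_Latn code are my own assumptions:

# Save only the (small) LoRA adapter weights plus the tokenizer
model.save_pretrained("models1/nllb-lora-fr-wo-adapter")      # hypothetical path
tokenizer.save_pretrained("models1/nllb-lora-fr-wo-adapter")

# Quick smoke test: translate one French sentence to Wolof
sample = tokenizer("Bonjour, comment allez-vous ?", return_tensors="pt").to(model.device)
generated = model.generate(
    **sample,
    forced_bos_token_id=tokenizer.convert_tokens_to_ids("wol_Latn"),  # assumed NLLB code for Wolof
    max_length=128,
)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))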