Surprisingly, there still seems to be no example of fine-tuning any of the NLLB models (at least the smallest one) in the Hugging Face transformers environment. So I followed this guide and adapted its code to my case, namely nllb-200-distilled-600M.
The custom train and eval datasets I want to fine-tune nllb-200-distilled-600M on consist of two entries each; see my training code below. Running this code fails with:

ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

Any ideas & hints?
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import Dataset
import numpy as np
import evaluate
trainPart = []
evalPart = []

def buildDataset():
    trainPart.append({'id': 0, 'translation': {
        'en': 'But this lofty plateau measured only a few fathoms, and soon we reentered Our Element.',
        'ru': 'Но это высокое плато имело размер всего в несколько саженей, и вскоре мы снова оказались в своей стихии.'}})
    trainPart.append({'id': 1, 'translation': {
        'en': 'What awakened us was a sound which sent chills of fear down my spine: the howling of the monsters\' sirens, and the reverberations of distant explosions.',
        'ru': 'Разбудили нас звуки, от которых у меня по спине побежали мурашки страха, - завывания сирен чудовищ и эхо отдаленных взрывов.'}})
    evalPart.append({'id': 0, 'translation': {
        'en': 'It could be coming from reverberations, deeper caverns caught in currents.',
        'ru': 'Это, наверное, от ревербераций в глубинных полостях, вызванных течениями.'}})
    evalPart.append({'id': 1, 'translation': {
        'en': 'There’s a four to five second reverberation.',
        'ru': 'Реверберация длится от четырех до пяти секунд.'}})

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def run():
    modelName = "nllb-200-distilled-600M"
    model = AutoModelForSeq2SeqLM.from_pretrained(modelName, use_auth_token=True)
    tokenizer = NllbTokenizer.from_pretrained(
        modelName, src_lang='eng_Latn', tgt_lang='rus_Cyrl'
    )
    trainSet = Dataset.from_list(trainPart)
    evalSet = Dataset.from_list(evalPart)

    def preprocess_function(examples):
        inputs = [example['en'] for example in examples["translation"]]
        targets = [example['ru'] for example in examples["translation"]]
        model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
        return model_inputs

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        result = {"bleu": result["score"]}
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result

    tokenized_trainset = trainSet.map(preprocess_function, batched=True)
    tokenized_evalset = evalSet.map(preprocess_function, batched=True)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)  # or should this be modelName?
    metric = evaluate.load("sacrebleu")
    training_args = Seq2SeqTrainingArguments(
        output_dir="test_ft",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=2,
        predict_with_generate=True,
        fp16=True,
        push_to_hub=False,
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_trainset,
        eval_dataset=tokenized_evalset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()

buildDataset()
run()
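
For context, my understanding of the intended data flow (which may well be where I go wrong) is that the tokenizer call with text_target already produces the labels, and DataCollatorForSeq2Seq, because it gets model=, then derives decoder_input_ids from those labels, so I don't see where they would get lost. Below is a minimal sketch of that check on a single pair, not part of the script above; I use the Hub id facebook/nllb-200-distilled-600M here, otherwise the same settings:

# Sanity-check sketch: tokenize one pair and run it through the collator to see
# whether labels and decoder_input_ids actually show up in the batch.
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer, DataCollatorForSeq2Seq

tok = NllbTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="rus_Cyrl"
)
mdl = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
sample = tok(
    "There's a four to five second reverberation.",
    text_target="Реверберация длится от четырех до пяти секунд.",
    max_length=128,
    truncation=True,
)
print(sample.keys())  # I expect input_ids, attention_mask and labels here
collator = DataCollatorForSeq2Seq(tokenizer=tok, model=mdl)
batch = collator([sample])
print(batch.keys())   # I expect decoder_input_ids to be derived from labels here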