I am trying to fine-tune the wmt19 model (https://huggingface.co/facebook/wmt19-en-de) for machine translation from pinyin to Chinese. Training runs, but the model only produces repeated tokens. Can anyone help me figure out how to train a WMT model correctly? Here is my code:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import datasets
from datasets import Dataset, load_metric
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from transformers import PreTrainedTokenizerFast
from transformers import DataCollatorWithPadding
import torch
import torch.nn as nn
mname = "facebook/wmt19-en-de"
checkpoint_dir = "./checkpoints"
model_dir = "./model"
df = pd.read_csv("./data/chinesetopinyin.csv")
data = Dataset.from_pandas(df[:100])  # only the first 100 rows, for quick experiments
train_testvalid = data.train_test_split(test_size=0.1)
data_train = train_testvalid['train']
data_test = train_testvalid['test']
src_tokenizer = PreTrainedTokenizerFast(tokenizer_file="pinyintok/tokenizer.json", pad_token="<pad>")
trgt_tokenizer = PreTrainedTokenizerFast(tokenizer_file="chinesetok/tokenizer.json", pad_token="<pad>")
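# The two tokenizer.json files above were trained separately with the
# `tokenizers` library. A minimal sketch of how such a file can be produced
# (assumptions: a plain-text corpus at ./data/pinyin.txt and a BPE model;
# adjust to however your tokenizers were actually built):
# from tokenizers import Tokenizer, models, pre_tokenizers, trainers
# tok = Tokenizer(models.BPE(unk_token="<unk>"))
# tok.pre_tokenizer = pre_tokenizers.Whitespace()
# trainer = trainers.BpeTrainer(special_tokens=["<pad>", "<unk>", "<s>", "</s>"])
# tok.train(["./data/pinyin.txt"], trainer)
# tok.save("pinyintok/tokenizer.json")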
# Prepare the dataset: tokenize the pinyin source and the chinese target
def prepare_dataset(batch):
    input_ids = src_tokenizer.encode(text=batch['pinyin'], padding='max_length', truncation=True, max_length=125, return_tensors="pt")
    label_ids = trgt_tokenizer.encode(text=batch['chinese'], padding='max_length', truncation=True, max_length=125, return_tensors="pt")
    batch["input_ids"] = input_ids[0]
    batch["input_length"] = len(input_ids[0])
    batch["labels"] = label_ids[0]
    return batch
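# Note: `encode` returns only token ids, so no attention mask is stored and the
# model cannot distinguish real tokens from padding. An untested sketch of an
# alternative that keeps the mask by calling the tokenizer directly; mapping
# with prepare_dataset_with_mask instead would carry the mask through:
def prepare_dataset_with_mask(batch):
    src = src_tokenizer(batch['pinyin'], padding='max_length', truncation=True, max_length=125)
    trgt = trgt_tokenizer(batch['chinese'], padding='max_length', truncation=True, max_length=125)
    batch["input_ids"] = src["input_ids"]
    batch["attention_mask"] = src["attention_mask"]
    batch["labels"] = trgt["input_ids"]
    return batch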
data_train = data_train.map(prepare_dataset)#, remove_columns=data_train.column_names)
data_test = data_test.map(prepare_dataset)#, remove_columns=data_test.column_names)
print("*****************************************************************")
print("train data at 0: \n")
print(data_train[0]['pinyin'])
print(src_tokenizer.decode(data_train[0]['input_ids']))
print(data_train[0]['input_ids'])
print(data_train[0]['chinese'])
print(trgt_tokenizer.decode(data_train[0]['labels']))
print(data_train[0]['labels'])
print("-----------------------------------------------------------------")
#print("test data at 0: \n",data_test[0]['input_ids'])
#print("*****************************************************************")
data_collator = DataCollatorWithPadding(tokenizer=src_tokenizer)
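# DataCollatorWithPadding only pads the model inputs; it does not pad or mask
# the "labels" column. For seq2seq fine-tuning, DataCollatorForSeq2Seq pads
# labels with -100 so the loss ignores them. A sketch of the swap (commented
# out here because `model` is only created further down):
# from transformers import DataCollatorForSeq2Seq
# data_collator = DataCollatorForSeq2Seq(tokenizer=src_tokenizer, model=model, label_pad_token_id=-100)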
err_metric = load_metric("cer")  # character error rate

def compute_metrics(pred):
    # predictions arrive as logits; argmax gives the predicted token ids
    pred_logits = pred.predictions[0]
    pred_ids = np.argmax(pred_logits, axis=-1)
    # put the pad token back where labels were masked with -100 before decoding
    pred.label_ids[pred.label_ids == -100] = trgt_tokenizer.pad_token_id
    pred_str = trgt_tokenizer.batch_decode(pred_ids)
    label_str = trgt_tokenizer.batch_decode(pred.label_ids)
    err = err_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": err}
model = FSMTForConditionalGeneration.from_pretrained(
    mname,
    src_vocab_size=42100,  # size of the custom pinyin vocab
    tgt_vocab_size=42100,  # size of the custom chinese vocab
    ignore_mismatched_sizes=True,  # embeddings are re-initialized for the new vocabs
    dropout=0.2,
    pad_token_id=src_tokenizer.pad_token_id,  # assumes both tokenizers share this pad id
)
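# Quick smoke test (sketch): greedy generation from one training example to see
# the repetition directly; no_repeat_ngram_size only suppresses it at decode
# time and does not fix the underlying training problem:
# sample = src_tokenizer(data_train[0]['pinyin'], return_tensors="pt")
# generated = model.generate(sample["input_ids"], max_length=125, no_repeat_ngram_size=3)
# print(trgt_tokenizer.decode(generated[0]))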
def preprocess_logits_for_metrics(logits, labels):
    # keep only the argmax ids so evaluation does not accumulate full logits
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    group_by_length=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # auto_find_batch_size=True,
    gradient_accumulation_steps=1,
    eval_accumulation_steps=1,
    evaluation_strategy="steps",
    num_train_epochs=20,
    fp16=False,
    gradient_checkpointing=False,
    save_steps=8,
    eval_steps=8,
    logging_steps=8,
    resume_from_checkpoint=False,
    learning_rate=3e-4,
    warmup_steps=4,
    save_total_limit=2,
    no_cuda=True,
)
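# For translation, the more common setup is Seq2SeqTrainingArguments together
# with Seq2SeqTrainer and predict_with_generate=True, so that evaluation
# decodes with generate() instead of an argmax over logits. A sketch of that
# variant (not what I currently run):
# from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
# seq2seq_args = Seq2SeqTrainingArguments(
#     output_dir=checkpoint_dir,
#     per_device_train_batch_size=8,
#     evaluation_strategy="steps",
#     predict_with_generate=True,
# )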
from transformers import Trainer

class CrossEntropyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        # shift so that tokens < n predict token n
        logits = outputs.get("logits")[:, :-1, :].contiguous()
        labels = inputs.get("labels")[:, 1:].contiguous()
        # mask padding in the labels (the pad id was hard-coded as 1 before);
        # CrossEntropyLoss ignores -100 by default
        labels = labels.masked_fill(labels == trgt_tokenizer.pad_token_id, -100)
        loss_func = nn.CrossEntropyLoss()
        # average the per-sentence losses
        loss = [loss_func(logit, label) for logit, label in zip(logits, labels)]
        loss = sum(loss) / len(loss)
        return (loss, outputs) if return_outputs else loss
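# If labels are passed to forward(), FSMT may already return a cross-entropy
# loss as outputs.loss (worth verifying for your transformers version); in
# that case the subclass above might be unnecessary:
# class PlainLossTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         outputs = model(**inputs)
#         return (outputs.loss, outputs) if return_outputs else outputs.loss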
trainer = CrossEntropyTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    # compute_metrics=compute_metrics,
    train_dataset=data_train,
    eval_dataset=data_test,
    tokenizer=trgt_tokenizer,
    # preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
trainer.train()
trainer.save_model(model_dir)
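# After training, a sketch for reloading the saved model and translating one
# held-out example, to check whether the repetition persists:
# trained = FSMTForConditionalGeneration.from_pretrained(model_dir)
# enc = src_tokenizer(data_test[0]['pinyin'], return_tensors="pt")
# out = trained.generate(enc["input_ids"], max_length=125)
# print(trgt_tokenizer.decode(out[0]))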
The output keeps coming out as the same tokens repeated over and over. Please help!