Fine-tuning the wmt19 model for translation

I am trying to fine-tune the wmt19 model (https://huggingface.co/facebook/wmt19-en-de) for machine translation from pinyin to Chinese. Training runs, but the model only produces repeated output. Can anyone please help me with how to train the WMT model properly?

import warnings
warnings.filterwarnings("ignore")
​
​
import pandas as pd
import numpy as np
import datasets
from datasets import Dataset, load_metric
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from transformers import PreTrainedTokenizerFast
from transformers import DataCollatorWithPadding
import torch
import torch.nn as nn
​
mname = "facebook/wmt19-en-de"
checkpoint_dir = "./checkpoints" 
model_dir = "./model"
​
​
df = pd.read_csv("./data/chinesetopinyin.csv")
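# Assumption about the CSV loaded above: two text columns named "pinyin" and "chinese"
# (the names used below), e.g. a row like ("ni hao", "你好") -- the example values are illustrative only.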
data = Dataset.from_pandas(df[:100])
train_testvalid = data.train_test_split(test_size=0.1)
data_train = train_testvalid['train']
data_test = train_testvalid['test']
​
src_tokenizer = PreTrainedTokenizerFast(tokenizer_file="pinyintok/tokenizer.json", pad_token="<pad>")
trgt_tokenizer = PreTrainedTokenizerFast(tokenizer_file="chinesetok/tokenizer.json", pad_token="<pad>")
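# Optional sanity check: the pad ids of the two custom tokenizers are reused later for the model
# config and for masking the loss, so it helps to confirm what they actually are.
print("src pad id:", src_tokenizer.pad_token_id, " tgt pad id:", trgt_tokenizer.pad_token_id)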
​
# Preparing Dataset
def prepare_dataset(batch):

    # Tokenize source (pinyin) and target (chinese); keep the attention mask so that
    # padding positions are not attended to.
    inputs = src_tokenizer(batch['pinyin'], padding='max_length', truncation=True, max_length=125)
    labels = trgt_tokenizer(batch['chinese'], padding='max_length', truncation=True, max_length=125)

    batch["input_ids"] = inputs["input_ids"]
    batch["attention_mask"] = inputs["attention_mask"]
    batch["input_length"] = len(inputs["input_ids"])
    batch["labels"] = labels["input_ids"]

    return batch
​
data_train = data_train.map(prepare_dataset)#, remove_columns=data_train.column_names)
data_test = data_test.map(prepare_dataset)#, remove_columns=data_test.column_names)
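# Note: the raw "pinyin"/"chinese" columns are kept only for the debug prints below;
# Trainer drops columns that model.forward() does not accept (remove_unused_columns=True by default).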
​
​
print("*****************************************************************")
print("train data at 0: \n")
print(data_train[0]['pinyin'])
print(src_tokenizer.decode(data_train[0]['input_ids']))
print(data_train[0]['input_ids'])
print(data_train[0]['chinese'])
print(trgt_tokenizer.decode(data_train[0]['labels']))
print(data_train[0]['labels'])
print("-----------------------------------------------------------------")
#print("test data at 0: \n",data_test[0]['input_ids'])
#print("*****************************************************************")
​
​
data_collator = DataCollatorWithPadding(tokenizer=src_tokenizer)
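# Note: DataCollatorWithPadding does not pad the "labels" field; it only works here because
# prepare_dataset already pads everything to max_length=125. For seq2seq models the usual choice
# is DataCollatorForSeq2Seq, which also pads labels with -100 -- a possible swap (untested sketch,
# needs `model`, which is created further down):
# from transformers import DataCollatorForSeq2Seq
# data_collator = DataCollatorForSeq2Seq(tokenizer=src_tokenizer, model=model, label_pad_token_id=-100)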
​
err_metric = load_metric("cer")
​
def compute_metrics(pred):

    # pred.predictions holds token ids if preprocess_logits_for_metrics is enabled,
    # otherwise raw logits that still need an argmax.
    pred_ids = pred.predictions[0]
    if pred_ids.ndim == 3:
        pred_ids = np.argmax(pred_ids, axis=-1)

    # -100 is the ignore index; map it back to the pad id before decoding
    pred.label_ids[pred.label_ids == -100] = trgt_tokenizer.pad_token_id
    pred_str = trgt_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = trgt_tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)

    err = err_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": err}
​
​
​
model = FSMTForConditionalGeneration.from_pretrained(
    mname, 
    src_vocab_size = 42100,  # must cover the custom pinyin vocab (e.g. len(src_tokenizer))
    tgt_vocab_size = 42100,  # must cover the custom chinese vocab (e.g. len(trgt_tokenizer))
    ignore_mismatched_sizes=True,
    dropout=0.2,
    pad_token_id=src_tokenizer.pad_token_id
)
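# Because the vocab sizes differ from the pretrained checkpoint, ignore_mismatched_sizes=True leaves
# the mismatched embedding/output-projection weights freshly initialised, so those parts train from scratch.
# Optional sanity check before training (sketch; "ni hao" is just a placeholder pinyin string):
# ids = src_tokenizer("ni hao", return_tensors="pt").input_ids
# print(trgt_tokenizer.decode(model.generate(ids, max_length=50)[0], skip_special_tokens=True))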
​
​
def preprocess_logits_for_metrics(logits, labels):
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels
​
from transformers import TrainingArguments
    
training_args = TrainingArguments(
    output_dir= checkpoint_dir, #os.path.join(dir_name, "checkpoints" ),
    group_by_length=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size =8,
#    auto_find_batch_size = True,
    gradient_accumulation_steps=1,
    eval_accumulation_steps=1,
    evaluation_strategy="steps",
    num_train_epochs=20,
    fp16= False,
    gradient_checkpointing=False,
    save_steps=8,
    eval_steps=8,
    logging_steps=8,
    resume_from_checkpoint=False,
    learning_rate=3e-4,
    warmup_steps=4,
    save_total_limit=2,
    no_cuda = True,
)
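# With the 100-row subset above (90 train / 10 test) and batch size 8, one epoch is roughly 12
# optimizer steps, so eval/save/logging every 8 steps fires about 1-2 times per epoch.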
​
​
from transformers import Trainer
​
class CrossEntropyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        # shift so that the logit at position t is scored against the label at position t+1
        logits = outputs.get("logits")[:, :-1, :].contiguous()
        labels = inputs.get("labels")[:, 1:].contiguous()

        # mask padding positions (CrossEntropyLoss ignores -100 by default)
        labels = labels.masked_fill(labels == trgt_tokenizer.pad_token_id, -100)
        loss_func = nn.CrossEntropyLoss()
        # average the per-sentence losses
        loss = torch.stack([loss_func(logit, label) for logit, label in zip(logits, labels)]).mean()
        return (loss, outputs) if return_outputs else loss
​
trainer = CrossEntropyTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
#    compute_metrics=compute_metrics,
    train_dataset=data_train,
    eval_dataset=data_test,
    tokenizer=trgt_tokenizer
#    preprocess_logits_for_metrics = preprocess_logits_for_metrics 
)
trainer.train()
trainer.save_model(model_dir)
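# Post-training spot check (sketch): decode a few eval examples to see whether the repetition persists.
# for ex in data_test.select(range(3)):
#     ids = torch.tensor([ex["input_ids"]])
#     out = model.generate(ids, max_length=125)
#     print(ex["pinyin"], "->", trgt_tokenizer.decode(out[0], skip_special_tokens=True))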
​

The output comes out like this:


Please help!!!