Issues with fine tuning an Encoder Decoder Model

I am training an encode decoder for summarization of news articles into title with the help of news. I am trying to fine tune encoder decoder model as mentioned in the code below. the problem I am facing is that the model is not learning and just generating the same thing. I am not able to Understanding what I am doing wrong.

batch_size=64
encoder_max_length=128
decoder_max_length=16

roberta2roberta = EncoderDecoderModel.from_encoder_decoder_pretrained(
“roberta-base”,
“roberta-base”
)#.to(“cuda”)
tokenizer = RobertaTokenizer.from_pretrained(“roberta-base”)

set special tokens

roberta2roberta.config.decoder_start_token_id = tokenizer.bos_token_id
roberta2roberta.config.eos_token_id = tokenizer.eos_token_id
roberta2roberta.config.pad_token_id = tokenizer.pad_token_id
roberta2roberta.config.bos_token_id = tokenizer.bos_token_id

sensible parameters for beam search

roberta2roberta.config.vocab_size = tokenizer.vocab_size
roberta2roberta.config.max_length = 16
roberta2roberta.config.min_length = 4
roberta2roberta.config.no_repeat_ngram_size = 1
roberta2roberta.config.early_stopping = True
roberta2roberta.config.length_penalty = 2.0
roberta2roberta.config.num_beams = 4

def process_data_to_model_inputs(batch):

tokenize the inputs and labels

inputs = tokenizer(batch["title_s_article_s"], padding="max_length", truncation=True, max_length=encoder_max_length)
outputs = tokenizer(batch["highlight"], padding="max_length", truncation=True, max_length=decoder_max_length)

batch["input_ids"] = inputs.input_ids
batch["attention_mask"] = inputs.attention_mask
batch["decoder_input_ids"] = outputs.input_ids
batch["decoder_attention_mask"] = outputs.attention_mask
batch["labels"] = outputs.input_ids.copy()

# because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`.
# We have to make sure that the PAD token is ignored
batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

return batch

only use 32 training examples for notebook - DELETE LINE FOR FULL TRAINING

train_data = dataset[‘train’]#.select(range(32))

train_data = train_data.map(
process_data_to_model_inputs,
batched=True,
batch_size=batch_size,
remove_columns=[“title_s_article_s”, “highlight”]
)
train_data.set_format(
type=“torch”, columns=[“input_ids”, “attention_mask”, “decoder_input_ids”, “decoder_attention_mask”, “labels”],
)

only use 16 training examples for notebook - DELETE LINE FOR FULL TRAINING

val_data = dataset[‘eval’]#.select(range(16))

val_data =val_data.map(
process_data_to_model_inputs,
batched=True,
batch_size=batch_size,
remove_columns=[“title_s_article_s”, “highlight”]
)
val_data.set_format(
type=“torch”, columns=[“input_ids”, “attention_mask”, “decoder_input_ids”, “decoder_attention_mask”, “labels”],
)

set training arguments - these params are not really tuned, feel free to change

training_args = Seq2SeqTrainingArguments(
output_dir=“./checkpoint”,
evaluation_strategy=“steps”,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
predict_with_generate=True,
logging_steps=100,
save_steps=1000,
eval_steps=100,
warmup_steps=100,
overwrite_output_dir=True,
save_total_limit=1,
num_train_epochs = 12,
learning_rate=1e-2,
fp16=True,
lr_scheduler_type = ‘cosine’,
# resume_from_checkpoint = ‘./checkpoint/checkpoint-2000’,
# no_cuda = True,
)

load rouge for validation

rouge = load(“rouge”)

def compute_metrics(pred):
labels_ids = pred.label_ids
pred_ids = pred.predictions

# all unnecessary tokens are removed
pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
labels_ids[labels_ids == -100] = tokenizer.pad_token_id
label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
print('START')
for x,y in zip(pred_str[:10],label_str[:10]):
    print('PRED: ',x,"LABEL: ",y)
print('END')
rouge_output = rouge.compute(
    predictions=pred_str,
    references=label_str
)
print_gpu_utilization()
return rouge_output

instantiate trainer

trainer = Seq2SeqTrainer(
model=roberta2roberta,
tokenizer=tokenizer,
args=training_args,
compute_metrics=compute_metrics,
train_dataset=train_data,
eval_dataset=val_data,
)
trainer.train()

OUTPUT (For training loop )

[1201/4776 07:18 < 21:48, 2.73 it/s, Epoch 3.02/12]
Step Training Loss Validation Loss Rouge1 Rouge2 Rougel Rougelsum
100 7.940000 7.168086 0.065070 0.000000 0.064863 0.064918
200 7.074200 7.004178 0.079902 0.000000 0.079906 0.079832
300 7.017600 6.979647 0.079902 0.000000 0.079906 0.079832
400 6.997000 6.959394 0.079902 0.000000 0.079906 0.079832
500 6.914100 6.962692 0.079902 0.000000 0.079906 0.079832
600 6.934400 6.955380 0.079902 0.000000 0.079906 0.079832
700 6.936800 6.958053 0.079902 0.000000 0.079906 0.079832
800 6.941300 6.962934 0.079902 0.000000 0.079906 0.079832
900 6.910400 6.973436 0.079902 0.000000 0.079906 0.079832
1000 6.921500 6.962850 0.079902 0.000000 0.079906 0.079832
1100 6.921500 6.969145 0.079902 0.000000 0.079906 0.079832
[ 4/11 00:02 < 00:04, 1.50 it/s]

The prediction are repetitive. I am not able to understand what I am doing wrong.
My dataset is 25K datapoint