BART XSum Finetuning - Loss Dropping Rapidly but ROUGE F1 Decreasing to 0

I’m trying to fine-tune BART-base on XSum using a variant of @patrickvonplaten’s script for fine-tuning a RobertaShared EncoderDecoder model. The problem is that while the training and validation losses fall rapidly (training loss reaches <1 in just 400 steps and validation loss reaches 0.4 in 300 steps), the ROUGE scores decline toward 0 (ROUGE-2 F1 is 0.0159 at 100 steps and drops to 0.0009 and below from 300 steps on).

Colab link: Google Colab

import datasets

#from transformers import RobertaTokenizerFast
from transformers import BartTokenizerFast

#tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

train_data = datasets.load_dataset("xsum", split="train")
val_data = datasets.load_dataset("xsum", split="validation[:10%]")

batch_size=12  # change to 16 for full training
encoder_max_length=512
decoder_max_length=64

def process_data_to_model_inputs(batch):
    # the BART tokenizer automatically wraps each sequence as <s> text </s>
    inputs = tokenizer(batch["document"], padding="max_length", truncation=True, max_length=encoder_max_length)
    outputs = tokenizer(batch["summary"], padding="max_length", truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["labels"] = outputs.input_ids.copy()
    # mask out padding so it is ignored by the loss
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]
    ]
    batch["decoder_attention_mask"] = outputs.attention_mask

    return batch

train_data = train_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["document", "summary"],
)
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

val_data = val_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["document", "summary"],
)
val_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

from transformers import EncoderDecoderModel, BartForConditionalGeneration

# load BART directly (the tied RoBERTa shared encoder-decoder version is kept commented out below)
roberta_shared = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

#roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base", tie_encoder_decoder=True)

# set special tokens
roberta_shared.config.decoder_start_token_id = tokenizer.bos_token_id                                             
roberta_shared.config.eos_token_id = tokenizer.eos_token_id

# sensible parameters for beam search
# set decoding params                               
roberta_shared.config.max_length = 64
roberta_shared.config.early_stopping = True
roberta_shared.config.no_repeat_ngram_size = 3
roberta_shared.config.length_penalty = 2.0
roberta_shared.config.num_beams = 4
roberta_shared.config.vocab_size = roberta_shared.config.vocab_size  # no-op for BART, carried over from the RobertaShared script
#roberta_shared.config.vocab_size = roberta_shared.config.encoder.vocab_size  

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# load rouge for validation
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

# set training arguments - these params are not really tuned, feel free to change
training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    logging_steps=100,  # set to 2000 for full training
    save_steps=5000,  # set to 500 for full training
    eval_steps=100,  # set to 7500 for full training
    warmup_steps=3000,  # set to 3000 for full training
    num_train_epochs=5,
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=False, 
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=roberta_shared,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)
trainer.train()

[ 601/85010 17:21 < 40:45:32, 0.58 it/s, Epoch 0.04/5]

| Step | Training Loss | Validation Loss | Rouge2 Precision | Rouge2 Recall | Rouge2 Fmeasure | Runtime | Samples Per Second |
|------|---------------|-----------------|------------------|---------------|-----------------|---------|--------------------|
| 100 | 7.670600 | 5.843812 | 0.013100 | 0.024300 | 0.015900 | 144.499400 | 7.841000 |
| 200 | 4.567700 | 2.118743 | 0.006900 | 0.014800 | 0.009000 | 144.219800 | 7.856000 |
| 300 | 1.523300 | 0.419682 | 0.000700 | 0.001500 | 0.000900 | 146.956600 | 7.710000 |
| 400 | 0.564400 | 0.213161 | 0.000500 | 0.001000 | 0.000700 | 146.885800 | 7.713000 |
| 500 | 0.358300 | 0.165476 | 0.000600 | 0.000900 | 0.000600 | 147.931300 | 7.659000 |
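
To see what the model is actually generating at these checkpoints, I’ve been decoding a validation example by hand. This is just a rough sketch (it isn’t part of the Colab) and assumes the formatted val_data, tokenizer, and roberta_shared from the code above:

import torch

# decode one validation example with the current model weights
# (sketch only; tensors are moved to whatever device the model is on)
device = roberta_shared.device
example = val_data[0]
with torch.no_grad():
    generated = roberta_shared.generate(
        input_ids=example["input_ids"].unsqueeze(0).to(device),
        attention_mask=example["attention_mask"].unsqueeze(0).to(device),
        max_length=64,
        num_beams=4,
    )
print(tokenizer.batch_decode(generated, skip_special_tokens=True))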

My guess is that the loss function is somehow including padding tokens, but I’m not sure how that can happen, since this is essentially the same code as my RobertaShared script, which I verified does not have this issue. The only two changes from my working RobertaShared encoder-decoder fine-tuning script were swapping in the BART model and the BART tokenizer (and the tokenizers for the two models are effectively the same).
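
As a quick test of the padding guess, something like the following (again just a sketch, not in the Colab) can check whether any pad_token_id values survive in the labels after the mapping above:

# count pad ids left in the labels vs. positions masked with -100
# (uses the formatted train_data from above, so "labels" is already a tensor)
example = train_data[0]
labels = example["labels"]
num_pad_left = int((labels == tokenizer.pad_token_id).sum())
num_masked = int((labels == -100).sum())
print(f"pad ids still in labels: {num_pad_left}, positions masked with -100: {num_masked}")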

Currently using PyTorch 1.6, Transformers 4.3.3, and Datasets 1.0.2.