BART XSum Finetuning - Loss Dropping Rapidly but ROUGE F1 Decreasing to 0

I’m trying to fine-tune BART-base on XSum using a variant of @patrickvonplaten’s script for fine-tuning a RobertaShared EncoderDecoder model. The problem is that while the training and validation losses fall rapidly (training loss reaches <1 in just 400 steps and validation loss reaches 0.4 in 300 steps), the ROUGE scores decline toward 0 (ROUGE-2 F1 is 0.0159 at 100 steps and drops to 0.0009 and below from 300 steps on).

Colab link: Google Colab

import datasets

#from transformers import RobertaTokenizerFast
from transformers import BartTokenizerFast

#tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

train_data = datasets.load_dataset("xsum", split="train")
val_data = datasets.load_dataset("xsum", split="validation[:10%]")

batch_size=12  # change to 16 for full training
encoder_max_length=512
decoder_max_length=64

def process_data_to_model_inputs(batch):
    # the BART tokenizer automatically wraps each sequence as <s> text </s>
    inputs = tokenizer(batch["document"], padding="max_length", truncation=True, max_length=encoder_max_length)
    outputs = tokenizer(batch["summary"], padding="max_length", truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["labels"] = outputs.input_ids.copy()
    # mask out padding so it is ignored by the loss
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]
    ]
    batch["decoder_attention_mask"] = outputs.attention_mask

    return batch

train_data = train_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["document", "summary"],
)
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

val_data = val_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["document", "summary"],
)
val_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

from transformers import EncoderDecoderModel, BartForConditionalGeneration

# load BART directly (the tied RoBERTa shared encoder-decoder version is kept commented out below)
roberta_shared = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

#roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base", tie_encoder_decoder=True)

# set special tokens
roberta_shared.config.decoder_start_token_id = tokenizer.bos_token_id                                             
roberta_shared.config.eos_token_id = tokenizer.eos_token_id

# sensible parameters for beam search
# set decoding params                               
roberta_shared.config.max_length = 64
roberta_shared.config.early_stopping = True
roberta_shared.config.no_repeat_ngram_size = 3
roberta_shared.config.length_penalty = 2.0
roberta_shared.config.num_beams = 4
roberta_shared.config.vocab_size = roberta_shared.config.vocab_size  # no-op for BART, carried over from the RobertaShared script
#roberta_shared.config.vocab_size = roberta_shared.config.encoder.vocab_size  

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# load rouge for validation
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

# set training arguments - these params are not really tuned, feel free to change
training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    logging_steps=100,  # set to 2000 for full training
    save_steps=5000,  # set to 500 for full training
    eval_steps=100,  # set to 7500 for full training
    warmup_steps=3000,  # set to 3000 for full training
    num_train_epochs=5,
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=False, 
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=roberta_shared,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)
trainer.train()

[ 601/85010 17:21 < 40:45:32, 0.58 it/s, Epoch 0.04/5]

| Step | Training Loss | Validation Loss | Rouge2 Precision | Rouge2 Recall | Rouge2 Fmeasure | Runtime | Samples Per Second |
|------|---------------|-----------------|------------------|---------------|-----------------|---------|--------------------|
| 100 | 7.670600 | 5.843812 | 0.013100 | 0.024300 | 0.015900 | 144.499400 | 7.841000 |
| 200 | 4.567700 | 2.118743 | 0.006900 | 0.014800 | 0.009000 | 144.219800 | 7.856000 |
| 300 | 1.523300 | 0.419682 | 0.000700 | 0.001500 | 0.000900 | 146.956600 | 7.710000 |
| 400 | 0.564400 | 0.213161 | 0.000500 | 0.001000 | 0.000700 | 146.885800 | 7.713000 |
| 500 | 0.358300 | 0.165476 | 0.000600 | 0.000900 | 0.000600 | 147.931300 | 7.659000 |
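
To see what the model is actually generating at these checkpoints, I’ve been decoding a validation example by hand. This is just a rough sketch (it isn’t part of the Colab) and assumes the formatted val_data, tokenizer, and roberta_shared from the code above:

import torch

# decode one validation example with the current model weights
# (sketch only; tensors are moved to whatever device the model is on)
device = roberta_shared.device
example = val_data[0]
with torch.no_grad():
    generated = roberta_shared.generate(
        input_ids=example["input_ids"].unsqueeze(0).to(device),
        attention_mask=example["attention_mask"].unsqueeze(0).to(device),
        max_length=64,
        num_beams=4,
    )
print(tokenizer.batch_decode(generated, skip_special_tokens=True))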

My guess is that the loss function is somehow including padding tokens, but I’m not sure how that can happen, since this is essentially the same code as my RobertaShared script, which I verified does not have this issue. The only two changes from my working RobertaShared encoder-decoder fine-tuning script were swapping in the BART model and the BART tokenizer (and the tokenizers for the two models are effectively the same).
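
As a quick test of the padding guess, something like the following (again just a sketch, not in the Colab) can check whether any pad_token_id values survive in the labels after the mapping above:

# count pad ids left in the labels vs. positions masked with -100
# (uses the formatted train_data from above, so "labels" is already a tensor)
example = train_data[0]
labels = example["labels"]
num_pad_left = int((labels == tokenizer.pad_token_id).sum())
num_masked = int((labels == -100).sum())
print(f"pad ids still in labels: {num_pad_left}, positions masked with -100: {num_masked}")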

Currently using PyTorch 1.6, Transformers 4.3.3, and Datasets 1.0.2.