I’m trying to fine-tune BART-Base on XSum using a variant of @patrickvonplaten’s script for fine-tuning a shared RoBERTa EncoderDecoder model. The problem is that while the training and validation losses fall rapidly (training loss drops below 1 within 400 steps and validation loss reaches ~0.4 by 300 steps), the ROUGE scores decline toward 0 (ROUGE-2 F1 is 0.0159 at 100 steps and falls to 0.0009 or below from 300 steps onward).
Colab link: Google Colab
import datasets
#from transformers import RobertaTokenizerFast
from transformers import BartTokenizerFast

#tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

train_data = datasets.load_dataset("xsum", split="train")
val_data = datasets.load_dataset("xsum", split="validation[:10%]")
batch_size=12 # change to 16 for full training
encoder_max_length=512
decoder_max_length=64
def process_data_to_model_inputs(batch):
    # Tokenizer will automatically set [BOS] <text> [EOS]
    inputs = tokenizer(batch["document"], padding="max_length", truncation=True, max_length=encoder_max_length)
    outputs = tokenizer(batch["summary"], padding="max_length", truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["labels"] = outputs.input_ids.copy()

    # mask the loss on padding tokens
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]
    ]
    batch["decoder_attention_mask"] = outputs.attention_mask

    return batch
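As a quick sanity check on this preprocessing (a minimal sketch run before the map calls below; the single-example slice and print lengths are arbitrary choices of mine), the padded tail of labels should already come out as -100 rather than the pad id:

sample = process_data_to_model_inputs(train_data[:1])
print(tokenizer.decode(sample["input_ids"][0], skip_special_tokens=True)[:200])  # start of the first document
print(sample["labels"][0][-10:])  # should be all -100 if the summary is shorter than decoder_max_length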
train_data = train_data.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["document", "summary"],
)
train_data.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

val_data = val_data.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["document", "summary"],
)
val_data.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)
from transformers import EncoderDecoderModel, BartForConditionalGeneration
# set encoder decoder tying to True
roberta_shared = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
#roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base", tie_encoder_decoder=True)
# set special tokens
roberta_shared.config.decoder_start_token_id = tokenizer.bos_token_id
roberta_shared.config.eos_token_id = tokenizer.eos_token_id
# sensible parameters for beam search
# set decoding params
roberta_shared.config.max_length = 64
roberta_shared.config.early_stopping = True
roberta_shared.config.no_repeat_ngram_size = 3
roberta_shared.config.length_penalty = 2.0
roberta_shared.config.num_beams = 4
roberta_shared.config.vocab_size = roberta_shared.config.vocab_size
#roberta_shared.config.vocab_size = roberta_shared.config.encoder.vocab_size
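For reference, a quick printout of the special-token setup these overrides produce (a minimal sketch that only echoes config and tokenizer values; it changes nothing):

print("decoder_start_token_id:", roberta_shared.config.decoder_start_token_id, "| tokenizer.bos_token_id:", tokenizer.bos_token_id)
print("eos_token_id:", roberta_shared.config.eos_token_id, "| tokenizer.eos_token_id:", tokenizer.eos_token_id)
print("pad_token_id:", roberta_shared.config.pad_token_id, "| tokenizer.pad_token_id:", tokenizer.pad_token_id)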
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
# load rouge for validation
rouge = datasets.load_metric("rouge")
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }
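To make sure the metric wiring itself behaves, a toy check on hand-written strings (a minimal sketch; the example sentences are made up and unrelated to XSum):

toy = rouge.compute(
    predictions=["the cat sat on the mat"],
    references=["the cat sat on the red mat"],
    rouge_types=["rouge2"],
)["rouge2"].mid
print(round(toy.fmeasure, 4))  # overlapping strings should score well above 0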
# set training arguments - these params are not really tuned, feel free to change
training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    logging_steps=100,   # set to 2000 for full training
    save_steps=5000,     # set to 500 for full training
    eval_steps=100,      # set to 7500 for full training
    warmup_steps=3000,   # set to 3000 for full training
    num_train_epochs=5,
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=False,
)
# instantiate trainer
trainer = Seq2SeqTrainer(
    model=roberta_shared,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)
trainer.train()
[ 601/85010 17:21 < 40:45:32, 0.58 it/s, Epoch 0.04/5]

| Step | Training Loss | Validation Loss | Rouge2 Precision | Rouge2 Recall | Rouge2 Fmeasure | Runtime (s) | Samples Per Second |
|-----:|--------------:|----------------:|-----------------:|--------------:|----------------:|------------:|-------------------:|
| 100 | 7.670600 | 5.843812 | 0.013100 | 0.024300 | 0.015900 | 144.499400 | 7.841000 |
| 200 | 4.567700 | 2.118743 | 0.006900 | 0.014800 | 0.009000 | 144.219800 | 7.856000 |
| 300 | 1.523300 | 0.419682 | 0.000700 | 0.001500 | 0.000900 | 146.956600 | 7.710000 |
| 400 | 0.564400 | 0.213161 | 0.000500 | 0.001000 | 0.000700 | 146.885800 | 7.713000 |
| 500 | 0.358300 | 0.165476 | 0.000600 | 0.000900 | 0.000600 | 147.931300 | 7.659000 |
My guess is that the loss is somehow still including padding tokens, but I’m not sure how to fix it, given that I’m using essentially the same code as my RobertaShared script, which I verified does not have this issue. The only two changes from my working RobertaShared EncoderDecoder fine-tuning script were swapping in the BART model and the BART tokenizer (although the tokenizers for the two models are effectively the same).
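One check I can run to test that guess (a minimal sketch; index 0 is arbitrary) is to count how many label positions in a processed example are still the pad id versus -100:

ex = val_data[0]  # val_data is already in torch format, so these are tensors
labels = ex["labels"]
print("pad ids left in labels:", int((labels == tokenizer.pad_token_id).sum()))
print("masked (-100) positions:", int((labels == -100).sum()))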
Currently using PyTorch 1.6, Transformers 4.3.3, and Datasets 1.0.2.