I got a BLEU score of about 11 and wanted to do some error analysis, so I saved the predictions to a file. When I read the predictions, I felt that the BLEU score should be much lower than 11, because most tokens in the references are missing from the predictions. Therefore, I calculated the BLEU score directly by passing the predictions file and the references file to sacrebleu (the package used as the metric in the training program), and the resulting BLEU score is about 2. The predictions and references files are both formatted with one sentence per line. Each predicted sentence has exactly one reference.
Relevant code snippets are attached below:
import sacrebleu
# Metric object that compute_metrics calls to score decoded predictions.
# NOTE(review): `load_metric` is not imported in the visible snippet --
# presumably `from datasets import load_metric`; confirm against the full file.
metric = load_metric("sacrebleu")
#----------------------------------------------------------#
# Define compute_metrics for trainer
#----------------------------------------------------------#
def postprocess_text(preds, labels):
    """Strip surrounding whitespace and wrap each reference in its own list.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings, one per prediction.

    Returns:
        A ``(preds, labels)`` tuple. ``preds`` is a list of stripped strings;
        ``labels`` is a list of one-element lists, so every prediction is
        paired with a list of references (here always exactly one).
    """
    preds = [pred.strip() for pred in preds]
    # One inner list per prediction: the metric is handed a list of
    # references for every prediction, even when there is only one.
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may be a tuple, in which case its first item is
            used.

    Returns:
        A dict with keys ``"bleu"`` and ``"gen_len"``, each rounded to four
        decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is an ignore marker, not a vocabulary id, so it cannot be decoded.
    # Labels carry it for masked positions; generated ids may also be padded
    # with it (depends on the transformers version), so sanitize both.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Length is counted after the -100 replacement above, so padding of
    # either kind is excluded from the generated length.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}
#----------------------------------------------------------#
# Calculate metric for test dataset, get bleu score, and save predictions to file
#----------------------------------------------------------#
test_metric = trainer.predict(
    test_dataset=tokenized_datasets['test'],
    metric_key_prefix='test',
    num_beams=6,
)
print(test_metric.metrics['test_bleu'])  # get about 11

# Mirror the handling in compute_metrics: unwrap a tuple and replace the -100
# ignore marker (which is not a decodable token id) before decoding.
test_predictions = test_metric.predictions
if isinstance(test_predictions, tuple):
    test_predictions = test_predictions[0]
test_predictions = np.where(
    test_predictions != -100, test_predictions, tokenizer.pad_token_id
)
detokenized_predictions = tokenizer.batch_decode(test_predictions, skip_special_tokens=True)

with open(path_predictions_file, 'w') as outfile:
    # Exactly one line per prediction: collapse any whitespace (including
    # embedded line breaks) to single spaces and terminate every entry with
    # a newline, so line i of the file always corresponds to example i --
    # even when a prediction is empty or is the last one.
    outfile.writelines(
        " ".join(pred.split()) + "\n" for pred in detokenized_predictions
    )
#----------------------------------------------------------#
# load previously-saved predictions files and references file to calculate bleu score
#----------------------------------------------------------#
# Read both files as one stripped sentence per line.
with open(path_predictions_file) as prediction_infile:
    predictions = [sentence.strip() for sentence in prediction_infile]
with open(path_references_file) as reference_infile:
    references = [sentence.strip() for sentence in reference_infile]

# Corpus BLEU pairs line i of the predictions with line i of the references;
# a differing line count means the files are misaligned and any score
# computed from them would be meaningless.
if len(predictions) != len(references):
    raise ValueError(
        "Line count mismatch: {} predictions vs {} references".format(
            len(predictions), len(references)
        )
    )

# NOTE(review): the score reported during training compares against labels
# decoded back from token ids, whereas this compares against the raw text in
# the references file. If the tokenizer round-trip (or any truncation of the
# labels) changes the reference text, the two scores will differ -- confirm
# by diffing the decoded labels against this file.
bleu = sacrebleu.corpus_bleu(predictions, [references])
print(bleu.format(score_only=True))  # get about 2
Thank you very much for reading! I would really appreciate any suggestions.