Not sure how to compute BLEU through compute_metrics

Here is my code

from transformers import Seq2SeqTrainer,Seq2SeqTrainingArguments, EarlyStoppingCallback, BertTokenizer,MT5ForConditionalGeneration
from transformers.data.data_collator import DataCollatorForSeq2Seq,default_data_collator
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import math,os
import numpy as np
from torch.utils.data import Dataset
from tqdm import tqdm
import torch
from datasets import load_dataset, load_metric

os.environ['MASTER_PORT'] = '777'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

#model trained form https://huggingface.co/uer/t5-base-chinese-cluecorpussmall
pretrain_task22_fewshot_zh = './results/baidu/pretrain-table-task22-lowdata/checkpoint-8'

model = MT5ForConditionalGeneration.from_pretrained(pretrain_task22_fewshot_zh)
tokenizer = BertTokenizer.from_pretrained(pretrain_task22_fewshot_zh)

device = 'cuda:0'

train_args = Seq2SeqTrainingArguments(output_dir='./results/baidu/finetune/task22-lowdata',evaluation_strategy = 'epoch',
                                per_device_train_batch_size=32,weight_decay=0, learning_rate= 0.00005,
                                num_train_epochs=100,lr_scheduler_type='constant_with_warmup',warmup_ratio=0.1,logging_strategy='steps',
                                save_strategy='epoch',fp16_backend = 'amp',fp16 = False,gradient_accumulation_steps = 2,
                                load_best_model_at_end = True,logging_steps = 1)#,deepspeed='./zero2_auto_config.json', save_total_limit = 3)
def load_data(path):
    data = []
    with open(path,encoding='utf-8') as w:
        while True:
            line = w.readline()
            if not line:
                break
            data.append(line)
    return data


class T5dataset(Dataset):
    def __init__(self, data_set,tokenizer,maxlen,label_maxlen):
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.label_maxlen = label_maxlen
        self.data_set = data_set

    def __len__(self):
        return len(self.data_set)
    
    def __getitem__(self, index):
        model_input = {}
        data = self.data_set[index]
        table, text = data.split('\t')
        model_input = self.tokenizer(table,padding = 'max_length',truncation = True,max_length = self.maxlen)
        label = self.tokenizer(text,truncation = True,max_length = self.label_maxlen)
        model_input['labels'] = label['input_ids']
        return {"input_ids": model_input['input_ids'], "attention_mask": model_input['attention_mask'], "labels": label['input_ids']}

baidu_lowdata = './data/baidu_compete/finetune/lowdata/'

train_data = load_data(baidu_lowdata + 'train.txt')
val_data = load_data(baidu_lowdata + 'val.txt')
train_data = T5dataset(train_data,tokenizer,64,256)
val_data = T5dataset(val_data,tokenizer,64,256)

early_stop = EarlyStoppingCallback(early_stopping_patience = 2,early_stopping_threshold = 0)

data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=-100,
        padding='max_length',
        max_length= 64
    )
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels
def compute_metrics(eval_preds):
    # print(preds)
    preds, labels = eval_preds
    #print('preds:',preds[0])
    # print('len:',preds[0].shape)
    if isinstance(preds, tuple):
        preds = preds[0]
    print('preds:',preds)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # if data_args.ignore_pad_token_for_loss:
    #     # Replace -100 in the labels as we can't decode them.
    #     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

trainer =  Seq2SeqTrainer(model=model,
    args=train_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks = [early_stop],
    compute_metrics=compute_metrics
    )

trainer.train()

Here is part of what I got:

***** Running Evaluation *****
  Num examples = 11
  Batch size = 8
preds: [[[  -6.9859548   -6.9850636   -6.9853897 ...   -6.985799    -6.9857574                    | 0/2 [00:00<?, ?it/s]
     -6.985038 ]
  [  -6.9859576   -6.985067    -6.9853916 ...   -6.9858017   -6.9857593
     -6.985041 ]
  [  -7.4163866   -7.41599     -7.41603   ...   -7.4164863   -7.416518
     -7.415782 ]
    ...
  [  -8.480153    -8.479599    -8.479667  ...   -8.480127    -8.480097
     -8.47964  ]
  [  -8.4777355   -8.477188    -8.477254  ...   -8.47771     -8.4776745
     -8.477233 ]
  [  -8.475657    -8.47512     -8.475176  ...   -8.475634    -8.475585
     -8.475155 ]]
...
Traceback (most recent call last):
  File "tmp.py", line 118, in <module>
    trainer.train()
  File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/trainer.py", line 1342, in train
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
  File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/trainer.py", line 1437, in _maybe_log_save_evaluate
    metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
  File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/trainer_seq2seq.py", line 75, in evaluate
    return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
  File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/trainer.py", line 2042, in evaluate
    metric_key_prefix=metric_key_prefix,
  File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/trainer.py", line 2273, in evaluation_loop
    metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
  File "tmp.py", line 91, in compute_metrics
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/tokenization_utils_base.py", line 3133, in batch_decode
    for seq in sequences
  File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/tokenization_utils_base.py", line 3133, in <listcomp>
    for seq in sequences
  File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/tokenization_utils_base.py", line 3169, in decode
    **kwargs,
  File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/tokenization_utils.py", line 743, in _decode
    filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
  File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/tokenization_utils.py", line 718, in convert_ids_to_tokens
    index = int(index)
TypeError: int() argument must be a string, a bytes-like object or a number, not 'list'

The code computing BLEU was copied from transformers/run_translation.py at master · huggingface/transformers · GitHub
I also ran that code and print preds in compute_metrics which were all integers. I think my main problem is why the preds printed in my code are not integers which can not be decode.

This has been disturbing me for two days. Could someone please point out where is wrong? Thank you!

2 Likes

Hi, I encounter the same situation that trying to use BLEU as the evaluation metric but having the same error as you. Did you find out a solution?

I am having the same problem with the ROUGE metric when trying to fine-tune BART for summarization.