Problem with compute_metrics function

Hi, I am new to Hugging Face and have fine-tuned my first model for free-text answering:

model = "t5-small"

dataset = "eswardivi/medical_qa"

Once fine-tuned, I attempted to evaluate the model on the "pubmed_qa" dataset using the following compute_metrics function:


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100 with pad_token_id and cast to int
    labels = np.where(labels == -100, tokenizer.pad_token_id, labels).astype(np.int32)
    preds = preds.astype(np.int32) if isinstance(preds, np.ndarray) else preds

    # Decode predictions and labels safely
    decoded_preds = tokenizer.batch_decode(preds.tolist(), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels.tolist(), skip_special_tokens=True)

    # Strip spaces
    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [l.strip() for l in decoded_labels]

    # Decision accuracy
    pred_ids, label_ids = [], []
    for pred, gold in zip(decoded_preds, decoded_labels):
        norm_pred = _normalize_decision(pred)
        norm_gold = gold.lower().strip()
        if norm_gold in LABEL2ID:
            label_ids.append(LABEL2ID[norm_gold])
            pred_ids.append(LABEL2ID.get(norm_pred, -1))  # unknown = wrong

    acc_result = accuracy.compute(predictions=pred_ids, references=label_ids) if label_ids else {"accuracy": 0.0}

    # ROUGE
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # BERTScore
    bert_result = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")

    # Aggregate metrics
    metrics = {}
    metrics.update(acc_result)
    metrics.update({f"rouge_{k}": v for k, v in rouge_result.items()})
    metrics.update({
        "bertscore_precision": np.mean(bert_result["precision"]),
        "bertscore_recall": np.mean(bert_result["recall"]),
        "bertscore_f1": np.mean(bert_result["f1"])
    })
    return metrics

and the following trainer and training arguments:


from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./eval_results",
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    generation_max_length=128,
    report_to="none",
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_pubmedqa["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

However, when I evaluate it,


results = trainer.evaluate()

print("Evaluation results:", results)

I get the following error:

OverflowError                             Traceback (most recent call last)

/tmp/ipython-input-3138432524.py in <cell line: 0>()
----> 1 results = trainer.evaluate()
      2 print("Evaluation results:", results)





/usr/local/lib/python3.12/dist-packages/transformers/trainer_seq2seq.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix, **gen_kwargs)
    189         self.gather_function = self.accelerator.gather
    190         self._gen_kwargs = gen_kwargs
--> 191         return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
    192 
    193     def predict(


/usr/local/lib/python3.12/dist-packages/transformers/trainer.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
   4467 
   4468         eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 4469         output = eval_loop(
   4470             eval_dataloader,
   4471             description="Evaluation",


/usr/local/lib/python3.12/dist-packages/transformers/trainer.py in evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
   4758             eval_set_kwargs["losses"] = all_losses if "loss" in args.include_for_metrics else None
   4759             eval_set_kwargs["inputs"] = all_inputs if "inputs" in args.include_for_metrics else None
-> 4760             metrics = self.compute_metrics(
   4761                 EvalPrediction(predictions=all_preds, label_ids=all_labels, **eval_set_kwargs)
   4762             )


/tmp/ipython-input-3437607733.py in compute_metrics(eval_preds)
      9 
     10     # Decode predictions and labels safely
---> 11     decoded_preds = tokenizer.batch_decode(preds.tolist(), skip_special_tokens=True)
     12     decoded_labels = tokenizer.batch_decode(labels.tolist(), skip_special_tokens=True)
     13 


/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py in batch_decode(self, sequences, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
   3856         """
   3857         return [
-> 3858             self.decode(
   3859                 seq,
   3860                 skip_special_tokens=skip_special_tokens,


/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py in decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
   3895         token_ids = to_py_obj(token_ids)
   3896 
-> 3897         return self._decode(
   3898             token_ids=token_ids,
   3899             skip_special_tokens=skip_special_tokens,


/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_fast.py in _decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
    680         if isinstance(token_ids, int):
    681             token_ids = [token_ids]
--> 682         text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
    683 
    684         clean_up_tokenization_spaces = (


OverflowError: out of range integral type conversion attempted

Any advice would be greatly appreciated, as I have been trying to resolve this for the past couple of days without success. I am not entirely sure whether the problem is with my compute_metrics function itself, a poor choice of evaluation dataset for my fine-tuned model, or a combination of both. Thanks so much in advance.


It appears that compute_metrics hits this error when it tries to decode values that are not valid token IDs (raw logits, negative padding values such as -100, or out-of-range ids).

import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# ----- Fake eval_preds coming from Trainer -----
B, T, V = 3, 8, tokenizer.vocab_size

# Case A: predictions are LOGITS (common when predict_with_generate=False)
preds_logits = np.random.randn(B, T, V).astype(np.float32)

# Labels contain real ids and -100 ignore index (as Trainer supplies)
labels = np.random.randint(0, V, size=(B, T))
labels[:, 2] = -100  # simulate masked positions

# Case B: predictions look like ids but contain invalid values
preds_bad_ids = np.random.randint(0, V, size=(B, T))
preds_bad_ids[0, 0] = -999                      # negative id
preds_bad_ids[1, 1] = V + 12345                 # out-of-range id

# ----- Broken compute_metrics: tries to decode whatever it gets -----
def compute_metrics_broken(eval_preds):
    preds, labels = eval_preds
    # This will crash or misbehave if preds are logits or contain invalid ids
    return {
        "decoded_pred_sample": tokenizer.batch_decode(preds, skip_special_tokens=True)[0]
    }

# Demonstrate failure on logits or bad ids
for bad in [(preds_logits, labels), (preds_bad_ids, labels)]:
    try:
        compute_metrics_broken(bad)
    except Exception as e:
        print("Broken compute_metrics error:", repr(e))

# ----- Fixed compute_metrics: handles logits/ids and cleans values -----
def compute_metrics_fixed(eval_preds):
    preds, labels = eval_preds

    # unwrap tuple from some trainers
    if isinstance(preds, tuple):
        preds = preds[0]

    # If logits: (B, T, V) -> argmax ids: (B, T)
    if isinstance(preds, np.ndarray) and preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    # Sanitize prediction ids: replace negatives or out-of-vocab with pad
    pad = tokenizer.pad_token_id or 0
    preds = preds.astype(np.int64)
    preds = np.where(preds < 0, pad, preds)
    preds = np.where(preds >= tokenizer.vocab_size, pad, preds)

    # Trainer passes -100 in labels: map to pad before decoding
    labels = labels.astype(np.int64)
    labels = np.where(labels == -100, pad, labels)

    # Safe decode
    decoded_preds  = tokenizer.batch_decode(preds,  skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Example metric: exact-match rate
    acc = float(np.mean([p.strip() == l.strip() for p, l in zip(decoded_preds, decoded_labels)]))
    return {"exact_match": acc}

# Works on both scenarios
print("Fixed on logits:", compute_metrics_fixed((preds_logits, labels)))
print("Fixed on bad ids:", compute_metrics_fixed((preds_bad_ids, labels)))
preds_equal = labels.copy()
print("Exact match:", compute_metrics_fixed((preds_equal, labels)))

# ----- How to wire it (schematic) -----
# from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM
# model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
# args = Seq2SeqTrainingArguments(
#     output_dir="out",
#     predict_with_generate=True,  # ensures generated sequences instead of raw logits for seq2seq
#     evaluation_strategy="epoch",
# )
# trainer = Seq2SeqTrainer(
#     model=model,
#     args=args,
#     tokenizer=tokenizer,
#     data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
#     compute_metrics=compute_metrics_fixed,
# )
"""
Broken compute_metrics error: TypeError("argument 'ids': 'list' object cannot be interpreted as an integer")
Broken compute_metrics error: OverflowError('out of range integral type conversion attempted')
Fixed on logits: {'exact_match': 0.0}
Fixed on bad ids: {'exact_match': 0.0}
Exact match: {'exact_match': 1.0}
"""

So a fixed version might look like this:

import numpy as np

def compute_metrics(eval_preds):
    preds, labels = eval_preds  # HF passes (predictions, label_ids)

    # 1) Unwrap and normalize shapes
    if isinstance(preds, tuple):
        preds = preds[0]
    preds  = np.asarray(preds)
    labels = np.asarray(labels)

    # 2) If logits, turn into token ids
    #    (B, T, V) -> argmax over vocab; keep (B, T) as ids
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    # 3) Choose a safe pad id for decoding
    pad = getattr(tokenizer, "pad_token_id", None)
    if pad is None:
        pad = getattr(tokenizer, "eos_token_id", 0)

    # 4) Sanitize dtypes and ranges
    preds  = preds.astype(np.int64, copy=False)
    labels = labels.astype(np.int64, copy=False)

    # Replace invalid pred ids with pad (negatives or >= vocab_size)
    if getattr(tokenizer, "vocab_size", None) is not None:
        V = tokenizer.vocab_size
        preds = np.where((preds < 0) | (preds >= V), pad, preds)
    else:
        preds = np.where(preds < 0, pad, preds)

    # Trainer uses -100 as ignore index for seq2seq labels; map to pad before decode
    labels = np.where(labels == -100, pad, labels)

    # 5) batch_decode expects a list of token-id sequences
    def _as_seq_list(arr):
        if arr.ndim == 1:
            return [arr.tolist()]
        return arr.tolist()

    decoded_preds  = tokenizer.batch_decode(_as_seq_list(preds),  skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_as_seq_list(labels), skip_special_tokens=True)

    # 6) Strip and normalize for decision accuracy
    decoded_preds  = [p.strip() for p in decoded_preds]
    decoded_labels = [l.strip() for l in decoded_labels]

    pred_ids, label_ids = [], []
    for pred, gold in zip(decoded_preds, decoded_labels):
        norm_pred = _normalize_decision(pred)               # user-defined
        norm_gold = gold.lower().strip()
        if norm_gold in LABEL2ID:                           # external mapping
            label_ids.append(LABEL2ID[norm_gold])
            pred_ids.append(LABEL2ID.get(norm_pred, -1))    # unknown -> wrong

    # 7) Metrics
    acc_result   = accuracy.compute(predictions=pred_ids, references=label_ids) if label_ids else {"accuracy": 0.0}
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    bert_result  = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")

    metrics = {}
    metrics.update(acc_result)
    metrics.update({f"rouge_{k}": v for k, v in rouge_result.items()})
    metrics.update({
        "bertscore_precision": float(np.mean(bert_result["precision"])),
        "bertscore_recall":    float(np.mean(bert_result["recall"])),
        "bertscore_f1":        float(np.mean(bert_result["f1"])),
    })
    return metrics
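For completeness, the names referenced above but defined elsewhere in your notebook (accuracy, rouge, bertscore, LABEL2ID, _normalize_decision) are assumed to already exist. If not, one possible setup, only a rough sketch with assumed PubMedQA-style yes/no/maybe decision labels, would be:

import evaluate

# Hypothetical setup for the external names used in compute_metrics above
accuracy  = evaluate.load("accuracy")
rouge     = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Assumed decision labels (PubMedQA-style yes / no / maybe)
LABEL2ID = {"yes": 0, "no": 1, "maybe": 2}

def _normalize_decision(text: str) -> str:
    # Map free-form model output to one of the decision labels; anything else is left as-is
    t = text.lower().strip()
    for label in LABEL2ID:
        if t.startswith(label) or label in t.split():
            return label
    return t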

Thanks so much John6666, your fixed version worked perfectly. Thank you so much for taking the time to provide such an in-depth and well-structured response.
