Hi, I am new to Hugging Face and have fine-tuned my first model for free-text question answering.
Model: t5-small
Dataset: eswardivi/medical_qa
Once fine-tuned, I attempted to evaluate the model on the "pubmed_qa" dataset. I used the following compute_metrics function (the metric objects come from the evaluate library):
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100 with pad_token_id and cast to int
    labels = np.where(labels == -100, tokenizer.pad_token_id, labels).astype(np.int32)
    preds = preds.astype(np.int32) if isinstance(preds, np.ndarray) else preds

    # Decode predictions and labels safely
    decoded_preds = tokenizer.batch_decode(preds.tolist(), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels.tolist(), skip_special_tokens=True)

    # Strip spaces
    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [l.strip() for l in decoded_labels]

    # Decision accuracy
    pred_ids, label_ids = [], []
    for pred, gold in zip(decoded_preds, decoded_labels):
        norm_pred = _normalize_decision(pred)
        norm_gold = gold.lower().strip()
        if norm_gold in LABEL2ID:
            label_ids.append(LABEL2ID[norm_gold])
            pred_ids.append(LABEL2ID.get(norm_pred, -1))  # unknown = wrong
    acc_result = accuracy.compute(predictions=pred_ids, references=label_ids) if label_ids else {"accuracy": 0.0}

    # ROUGE
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # BERTScore
    bert_result = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")

    # Aggregate metrics
    metrics = {}
    metrics.update(acc_result)
    metrics.update({f"rouge_{k}": v for k, v in rouge_result.items()})
    metrics.update({
        "bertscore_precision": np.mean(bert_result["precision"]),
        "bertscore_recall": np.mean(bert_result["recall"]),
        "bertscore_f1": np.mean(bert_result["f1"]),
    })
    return metrics
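For context, LABEL2ID and _normalize_decision are small helpers for mapping PubMedQA's yes/no/maybe decisions; simplified versions of what I actually have:

LABEL2ID = {"yes": 0, "no": 1, "maybe": 2}

def _normalize_decision(text):
    # Take the first word of the generated answer and lower-case it,
    # so outputs like "Yes, the study shows..." map to "yes".
    text = text.lower().strip()
    return text.split()[0].rstrip(".,:;") if text else ""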
and the following trainer and training arguments:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./eval_results",
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    generation_max_length=128,
    report_to="none",
)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_pubmedqa["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
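In case the preprocessing matters, tokenized_pubmedqa is built roughly like this (a sketch of my actual code; I am using the pqa_labeled config, with the question as input and final_decision as the target):

from datasets import load_dataset

raw = load_dataset("pubmed_qa", "pqa_labeled")["train"].train_test_split(test_size=0.2, seed=42)

def preprocess(examples):
    # Question as encoder input, yes/no/maybe decision as the target sequence.
    model_inputs = tokenizer(examples["question"], max_length=512, truncation=True)
    labels = tokenizer(text_target=examples["final_decision"], max_length=8, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_pubmedqa = raw.map(preprocess, batched=True, remove_columns=raw["train"].column_names)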
However, when I evaluate it,
results = trainer.evaluate()
print("Evaluation results:", results)
I get the following error:
OverflowError Traceback (most recent call last)
/tmp/ipython-input-3138432524.py in <cell line: 0>()
----> 1 results = trainer.evaluate()
2 print("Evaluation results:", results)
/usr/local/lib/python3.12/dist-packages/transformers/trainer_seq2seq.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix, **gen_kwargs)
189 self.gather_function = self.accelerator.gather
190 self._gen_kwargs = gen_kwargs
--> 191 return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
192
193 def predict(
/usr/local/lib/python3.12/dist-packages/transformers/trainer.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
4467
4468 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 4469 output = eval_loop(
4470 eval_dataloader,
4471 description="Evaluation",
/usr/local/lib/python3.12/dist-packages/transformers/trainer.py in evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
4758 eval_set_kwargs["losses"] = all_losses if "loss" in args.include_for_metrics else None
4759 eval_set_kwargs["inputs"] = all_inputs if "inputs" in args.include_for_metrics else None
-> 4760 metrics = self.compute_metrics(
4761 EvalPrediction(predictions=all_preds, label_ids=all_labels, **eval_set_kwargs)
4762 )
/tmp/ipython-input-3437607733.py in compute_metrics(eval_preds)
9
10 # Decode predictions and labels safely
---> 11 decoded_preds = tokenizer.batch_decode(preds.tolist(), skip_special_tokens=True)
12 decoded_labels = tokenizer.batch_decode(labels.tolist(), skip_special_tokens=True)
13
/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py in batch_decode(self, sequences, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
3856 """
3857 return [
-> 3858 self.decode(
3859 seq,
3860 skip_special_tokens=skip_special_tokens,
/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_base.py in decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
3895 token_ids = to_py_obj(token_ids)
3896
-> 3897 return self._decode(
3898 token_ids=token_ids,
3899 skip_special_tokens=skip_special_tokens,
/usr/local/lib/python3.12/dist-packages/transformers/tokenization_utils_fast.py in _decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
680 if isinstance(token_ids, int):
681 token_ids = [token_ids]
--> 682 text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
683
684 clean_up_tokenization_spaces = (
OverflowError: out of range integral type conversion attempted
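While debugging, I found I can reproduce the failure outside the trainer: the fast tokenizer raises the same OverflowError as soon as a sequence contains a negative token id, e.g.:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("t5-small")
print(tok.batch_decode([[363, 58, 1]], skip_special_tokens=True))    # decodes fine
print(tok.batch_decode([[363, -100, 1]], skip_special_tokens=True))  # OverflowError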
Any advice would be greatly appreciated, as I have been trying to resolve this for the past couple of days with no success. I am not entirely sure whether the problem is with my compute_metrics function, a poor choice of evaluation dataset, or a combination of both.
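One thing I have been wondering about: as far as I can tell, the trainer pads the generated predictions with -100 when gathering them across batches, and my compute_metrics only masks -100 in the labels, not in the predictions. Would adding something like this before decoding be the right fix (untested sketch)?

# Mask -100 padding in the predictions too, then decode.
preds = np.where(preds == -100, tokenizer.pad_token_id, preds)
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

Thanks so much in advance.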