I’m not sure if this helps, but I implemented additional metrics without using the Trainer, by modifying the evaluation loop like this:
from datasets import load_metric  # model, eval_dataloader, accelerator, logger, etc. come from the surrounding script

# load each metric once, outside the epoch loop
accuracy = load_metric("accuracy")
precision = load_metric("precision")
recall = load_metric("recall")
f1 = load_metric("f1")
metrics = [accuracy, precision, recall, f1]

model.eval()
for step, batch in enumerate(eval_dataloader):
    outputs = model(**batch)
    predictions = outputs.logits.argmax(dim=-1) if not is_regression else outputs.logits.squeeze()
    # gather predictions/labels from all processes and feed every metric
    for metric in metrics:
        metric.add_batch(
            predictions=accelerator.gather(predictions),
            references=accelerator.gather(batch["labels"]),
        )

logger.info(f"epoch {epoch + 1}: train loss {loss}")
for metric in metrics:
    if metric.name == "accuracy":
        eval_metric = metric.compute()
        logger.info(f"{eval_metric}")
    else:
        # precision/recall/f1 need an averaging strategy; average=None returns one score per class
        eval_metric = metric.compute(average=None)
        logger.info(f"{eval_metric}")
        if metric.name == "f1":
            per_class_f1 = eval_metric["f1"]
            avg_f1 = sum(per_class_f1) / len(per_class_f1)
            logger.info(f"Average f1: {avg_f1}")