This worked for me:
# Load the metric object that compute_metrics() uses for scoring.
# `metric_name` is kept as its own variable so the chosen metric is named in one place.
# NOTE(review): `load_metric` presumably comes from the Hugging Face `datasets`
# package, where it is deprecated in favour of `evaluate.load` -- confirm against
# this file's imports and the installed library version.
metric_name = "f1"
metric = load_metric(metric_name)
# Define metrics
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A two-item sequence ``(predictions, labels)``.
            ``predictions`` is reduced with ``argmax`` over axis 1, so it is
            assumed to be a 2-D array of per-class scores shaped
            ``(n_samples, n_classes)`` -- TODO confirm against the caller.
            ``labels`` holds the reference class ids.

    Returns:
        The value returned by ``metric.compute`` for the predicted class ids
        versus ``labels`` (for the "f1" metric this is presumably a dict such
        as ``{"f1": ...}`` -- verify against the metric implementation).
    """
    predictions, labels = eval_pred
    # Turn per-class scores into a single predicted class id per sample.
    predictions = np.argmax(predictions, axis=1)
    # average="binary" scores only the positive class and is valid only for
    # two-class targets. For multi-class or multi-label targets choose
    # "micro", "macro" or "weighted" instead.
    return metric.compute(predictions=predictions, references=labels, average="binary")