I am comparing the compute_metrics function across a few examples:
Multilabel Classification
import numpy as np
import torch
import evaluate
from transformers import EvalPrediction

def multi_label_metrics(predictions, labels, threshold=0.5):
    # Convert logits to per-label probabilities, then threshold to 0/1 predictions
    sigmoid = torch.nn.Sigmoid()
    probabilities = sigmoid(torch.Tensor(predictions))
    predictions = np.zeros(probabilities.shape)
    predictions[np.where(probabilities >= threshold)] = 1
    f1_metric = evaluate.load('f1', 'multilabel')
    roc_auc_metric = evaluate.load('roc_auc', 'multilabel')
    f1_score = f1_metric.compute(references=labels, predictions=predictions, average='micro')["f1"]
    roc_auc = roc_auc_metric.compute(references=labels, prediction_scores=probabilities, average='micro')["roc_auc"]
    # return as dictionary
    metrics = {'f1': f1_score, 'roc_auc': roc_auc}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result
Single Label Classification (Multiclass and Binary)
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")
if num_classes == 2:
    roc_auc_metric = evaluate.load('roc_auc')
elif num_classes > 2:
    roc_auc_metric = evaluate.load('roc_auc', 'multiclass')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probabilities = torch.nn.functional.softmax(torch.Tensor(logits), dim=-1)
    predictions = np.argmax(probabilities, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    if num_classes == 2:
        roc_auc = roc_auc_metric.compute(prediction_scores=predictions, references=labels)["roc_auc"]
        f1_score = f1_metric.compute(predictions=predictions, references=labels)["f1"]
    if num_classes > 2:
        roc_auc = roc_auc_metric.compute(prediction_scores=probabilities, references=labels, multi_class='ovr')["roc_auc"]
        f1_score = f1_metric.compute(predictions=predictions, references=labels, average="micro")["f1"]
    return {"accuracy": accuracy, "f1": f1_score, "roc_auc": roc_auc}
Abstractive Summarization
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}
I see that the single-label and summarization examples unpack eval_pred directly, while the multilabel example uses a typed p: EvalPrediction argument instead.
I also notice that the first element of eval_pred is named "logits" in one example and "predictions" in another.
Can someone help me understand why they are named differently? Also, is my multilabel metrics example outdated, or could it be improved? Or does the p: EvalPrediction approach need to be used for multilabel because the outputs are more complex objects with a different shape?
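For reference, here is a minimal, standalone sketch of the two calling conventions I'm comparing. The arrays are dummy values I made up (not real model outputs); it just shows that an EvalPrediction can also be unpacked like the plain tuple used in the other examples:

# Sketch comparing the two conventions; logits and label_ids are dummy values.
import numpy as np
from transformers import EvalPrediction

logits = np.array([[0.2, -1.3, 0.8],
                   [1.1, 0.4, -0.5]])   # fake model outputs, shape (batch, num_labels)
label_ids = np.array([[1, 0, 1],
                      [0, 1, 0]])       # fake multilabel targets

p = EvalPrediction(predictions=logits, label_ids=label_ids)

# Style 1: attribute access, as in the multilabel example
preds_a, labels_a = p.predictions, p.label_ids

# Style 2: tuple unpacking, as in the single-label and summarization examples
preds_b, labels_b = p

print(np.array_equal(preds_a, preds_b), np.array_equal(labels_a, labels_b))  # True True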