Eval_pred vs. EvalPrediction confusion

I am comparing the compute_metrics function between a few examples:

Multilabel Classification

    def multi_label_metrics(predictions, labels, threshold=0.5):
        """Compute micro-averaged F1 and ROC AUC for multi-label outputs.

        Args:
            predictions: Raw model outputs. They are passed through a sigmoid
                here, so they are presumably per-label logits (confirm
                against the caller).
            labels: Reference labels, handed to both metrics unchanged.
            threshold: Probability at or above which a label counts as
                predicted.

        Returns:
            A dict with the keys ``'f1'`` and ``'roc_auc'``.
        """
        probabilities = torch.sigmoid(torch.Tensor(predictions))
        # Binarise: 1.0 where the probability reaches the threshold, else 0.0.
        above_threshold = (probabilities >= threshold).numpy()
        hard_predictions = above_threshold.astype(np.float64)

        f1_metric = evaluate.load('f1', 'multilabel')
        roc_auc_metric = evaluate.load('roc_auc', 'multilabel')

        f1_result = f1_metric.compute(
            predictions=hard_predictions,
            references=labels,
            average='micro',
        )
        # ROC AUC is scored on the continuous probabilities, not the 0/1 labels.
        roc_auc_result = roc_auc_metric.compute(
            prediction_scores=probabilities,
            references=labels,
            average='micro',
        )
        return {'f1': f1_result["f1"], 'roc_auc': roc_auc_result["roc_auc"]}

    def compute_metrics(p: EvalPrediction):
        """Adapt an ``EvalPrediction`` to ``multi_label_metrics``.

        Args:
            p: Evaluation output exposing ``predictions`` and ``label_ids``.

        Returns:
            Whatever ``multi_label_metrics`` returns for these predictions
            and labels.
        """
        # Predictions may arrive as a tuple of outputs; only the first element
        # is scored (presumably the logits -- confirm for the model in use).
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        return multi_label_metrics(predictions=preds, labels=p.label_ids)

Single Label Classification (Multiclass and Binary)

    # Metrics are loaded once here so that compute_metrics can reuse them.
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    # The ROC AUC configuration depends on the number of classes. Nothing is
    # loaded when num_classes is below 2.
    if num_classes > 2:
        roc_auc_metric = evaluate.load('roc_auc', 'multiclass')
    elif num_classes == 2:
        roc_auc_metric = evaluate.load('roc_auc')

    def compute_metrics(eval_pred):
        """Compute accuracy, F1 and ROC AUC for single-label classification.

        Args:
            eval_pred: A ``(logits, labels)`` pair. The logits are softmaxed
                over the last axis, so that axis is presumably the class axis.

        Returns:
            A dict with the keys ``"accuracy"``, ``"f1"`` and ``"roc_auc"``.

        Raises:
            ValueError: If ``num_classes`` is below 2, in which case neither
                metric branch applies.
        """
        logits, labels = eval_pred
        # Convert to NumPy once so argmax and the metrics all see plain arrays.
        probabilities = torch.nn.functional.softmax(torch.Tensor(logits), dim=-1).numpy()
        predictions = np.argmax(probabilities, axis=-1)
        accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
        if num_classes == 2:
            # ROC AUC ranks examples by a continuous score, so pass the
            # probability of the positive class (index 1), not the hard
            # argmax predictions.
            roc_auc = roc_auc_metric.compute(
                prediction_scores=probabilities[..., 1],
                references=labels,
            )["roc_auc"]
            f1_score = f1_metric.compute(predictions=predictions, references=labels)["f1"]
        elif num_classes > 2:
            roc_auc = roc_auc_metric.compute(
                prediction_scores=probabilities,
                references=labels,
                multi_class='ovr',
            )["roc_auc"]
            f1_score = f1_metric.compute(predictions=predictions, references=labels, average="micro")["f1"]
        else:
            raise ValueError(f"num_classes must be at least 2, got {num_classes}")
        return {"accuracy": accuracy, "f1": f1_score, "roc_auc": roc_auc}

Abstractive Summarization

def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for summarization.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token ids. Positions
            equal to -100 are treated as padding in both.

    Returns:
        The ROUGE results plus ``"gen_len"``, each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Predictions may arrive as a tuple of outputs; only the first is decoded.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id, so swap it for the pad token before
    # decoding. Predictions can carry it as well as labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length is the number of non-padding tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

I see eval_pred is used in single label and summarization, while a different method has been employed for multilabel.

I also see that the outputs of eval_pred have been listed in examples as “logits” and “predictions”.

Can someone help me sort out why they would be named as such? Also, is my multilabel metrics example outdated, and could it be improved? Or does a different methodology (p: EvalPrediction) need to be employed because the outputs are more complex objects with a different shape?