How to correctly evaluate a Masked Language Model?

In RoBERTa they use accuracy and F1 scores of the language model. I got this code that I think computes the accuracy:

import numpy as np
from datasets import load_metric  # on newer versions: import evaluate; metric = evaluate.load("accuracy")

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Greedy prediction: take the most likely token at every position
    predictions = np.argmax(logits, axis=-1)

    # Labels are -100 everywhere except the masked positions,
    # so score the model only on the masked tokens
    mask = labels != -100
    results = metric.compute(predictions=predictions[mask], references=labels[mask])
    print(results)

    return results
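
To sanity-check the function outside the Trainer, I can call it on a dummy batch (one sequence of 4 positions over a toy vocabulary of 5 tokens, with two masked positions):

# Dummy batch: argmax of the logits is [1, 2, 0, 4]
logits = np.array([[[0.1, 0.9, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.8, 0.1, 0.1],
                    [0.7, 0.1, 0.1, 0.1, 0.0],
                    [0.0, 0.0, 0.0, 0.0, 1.0]]])
labels = np.array([[-100, 2, -100, 3]])  # only positions 1 and 3 were masked

compute_metrics((logits, labels))  # {'accuracy': 0.5}: position 1 is right, position 3 is wrong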

Then create a Trainer and pass this as the compute_metrics parameter (no need to rename the key to eval_accuracy: the Trainer prefixes the returned metrics with eval_ itself):

from transformers import Trainer

trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
)
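
For trainer.evaluate() to actually run, the Trainer also needs an evaluation dataset and, for masked language modeling, a collator that does the random masking and sets the unmasked labels to -100. A minimal sketch of the full setup, assuming a tokenized eval_dataset and using roberta-base as a placeholder checkpoint:

from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForMaskedLM.from_pretrained("roberta-base")

# Masks 15% of the tokens and sets the labels of all unmasked
# positions to -100, which is exactly what compute_metrics filters on
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="mlm-eval"),
    eval_dataset=eval_dataset,  # your tokenized evaluation split (assumed)
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)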

Then run the evaluation:

results = trainer.evaluate()
accuracy = results["eval_accuracy"]
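
The returned dict also contains eval_loss, the mean cross-entropy over the masked tokens, so perplexity (the standard held-out metric for language models) comes essentially for free:

import math

perplexity = math.exp(results["eval_loss"])  # exp of the mean masked-token cross-entropy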