Is it possible to get more results from training and evaluating a model, besides the loss?

Hello,
I have followed the transformers tutorial by @nielsr to fine-tune TrOCR starting from a pre-trained model. Link: Transformers-Tutorials/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_native_PyTorch.ipynb at master · NielsRogge/Transformers-Tutorials · GitHub

I managed to get it working, but the only evaluation metric I get is the loss. Is it possible to get more metrics, like accuracy, F1, mAP, recall, etc.? How do I do that?

Is it also possible to save these metrics from the training/evaluation into, e.g., an .xlsx file?

This is my code:

from transformers import VisionEncoderDecoderModel
import torch 
from dataframe import create_dataframe, split_dataframe
from dataset import IAMDataset
from transformers import TrOCRProcessor
from torch.utils.data import DataLoader
from datasets import load_metric
import evaluate

# from transformers import AdamW
import torch.optim as optim
from tqdm.auto import tqdm

####################
# Prepare the data
####################
TrOCR_box_path = '/imagesPath'

# Build the full dataframe first, then split it into a train and a test part.
df = create_dataframe()
train_df, test_df = split_dataframe(df)

# The same processor instance is handed to both datasets below.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")

# One IAMDataset per split; both receive the same root_dir and processor.
train_dataset = IAMDataset(
    root_dir=TrOCR_box_path,
    df=train_df,
    processor=processor,
)
eval_dataset = IAMDataset(
    root_dir=TrOCR_box_path,
    df=test_df,
    processor=processor,
)

# Batches of 4 for both loaders; only the training loader shuffles.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=4,
)


####################
# Load the model
####################
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")
model = model.to(device)

# Short aliases for the two objects configured below.
tokenizer = processor.tokenizer
config = model.config

# Special tokens used for creating the decoder_input_ids from the labels.
config.decoder_start_token_id = tokenizer.cls_token_id
config.pad_token_id = tokenizer.pad_token_id
# Mirror the decoder's vocabulary size on the top-level config.
config.vocab_size = config.decoder.vocab_size

# Beam search parameters.
config.eos_token_id = tokenizer.sep_token_id
config.max_length = 64
config.early_stopping = True
config.no_repeat_ngram_size = 3
config.length_penalty = 2.0
config.num_beams = 4


####################
# Train the model
# We will evaluate the model on the Character Error Rate (CER), which is
# loaded through the HuggingFace `evaluate` library.
####################
# `evaluate.load("cer")` is used here in place of the older
# `datasets.load_metric("cer")` call.
cer_metric = evaluate.load("cer")


def compute_cer(pred_ids, label_ids):
    """Return the character error rate for one batch of predictions.

    Args:
        pred_ids: Generated token ids; decoded with
            ``processor.batch_decode(..., skip_special_tokens=True)``.
        label_ids: Reference token ids. Entries equal to -100 are replaced
            by the tokenizer's pad token id before decoding. The caller's
            object is left untouched.

    Returns:
        Whatever ``cer_metric.compute`` returns for the decoded prediction
        and reference strings.
    """
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)

    # Substitute the pad token on a private copy so the caller's labels
    # (e.g. ``batch["labels"]``) are not silently overwritten.
    label_ids = torch.as_tensor(label_ids).clone()
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    return cer_metric.compute(predictions=pred_str, references=label_str)

optimizer = optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(2):  # loop over the dataset multiple times
    # ---- train ----
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_dataloader):
        # move every tensor in the batch to the device the model lives on
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward + backward + optimize
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    # mean training loss per batch for this epoch
    print(f"Loss after epoch {epoch}:", train_loss/len(train_dataloader))

    # ---- evaluate ----
    model.eval()
    pred_texts = []
    ref_texts = []
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            # run batch generation
            pred_ids = model.generate(batch["pixel_values"].to(device))

            # -100 entries in the labels are swapped for the pad token on a
            # copy, so they can be decoded without altering the batch
            labels = batch["labels"].clone()
            labels[labels == -100] = processor.tokenizer.pad_token_id

            pred_texts.extend(
                processor.batch_decode(pred_ids, skip_special_tokens=True)
            )
            ref_texts.extend(
                processor.batch_decode(labels, skip_special_tokens=True)
            )

    # Score the whole validation set in one call. Averaging per-batch CER
    # values would give a short final batch (or batches with short
    # references) the same weight as every other batch.
    valid_cer = cer_metric.compute(predictions=pred_texts, references=ref_texts)
    print("Validation CER:", valid_cer)

model.save_pretrained("my-model")