Can't calculate accuracy_score for T5ForConditionalGeneration model

I’m training a T5ForConditionalGeneration model on a custom dataset to predict solutions to quadratic equations, and I need to calculate the model’s accuracy. I used a custom trainer class, as described here:

import torch
from sklearn.metrics import accuracy_score
from transformers import Trainer

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    def compute_loss(self, model, inputs, return_outputs=False):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.
        Subclass and override for custom behavior.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)

        # code for calculating accuracy
        if "labels" in inputs:
            preds = outputs.logits.detach().cpu()
            acc1 = accuracy_score(inputs.labels.reshape(1, len(inputs.labels))[0], preds.argmax(axis=1))
            self.log({'accuracy_score': acc1})
            acc = (
                (preds.argmax(axis=-1) == inputs.labels.reshape(1, len(inputs.labels))[0])
                .type(torch.float)
                .mean()
                .item()
            )
            self.log({"train_accuracy": acc})
        # end code for calculating accuracy
                    
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss

Training arguments:

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir='./',
    num_train_epochs=10,
    overwrite_output_dir=True,
    evaluation_strategy='steps',
    learning_rate=1e-4,
    # max_steps=30000,
    logging_steps=100,
    eval_steps=2000,
    save_steps=2000,
    load_best_model_at_end=True,
    # metric_for_best_model='accuracy',
    # label_names=["labels"],
    # push_to_hub=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
)

Metric and trainer setup:

import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator
)

However, I’m getting this error:

<ipython-input-112-c90f25a089b8> in compute_loss(self, model, inputs, return_outputs)
     26             print("1st argument shape", inputs.labels.shape)
     27             print("2nd argument shape", preds.argmax(axis=1).shape)
---> 28             acc1 = accuracy_score(inputs.labels.reshape(1, len(inputs.labels))[0], preds.argmax(axis=1))
     29             # acc1 = accuracy_score(inputs.labels.cpu(), preds.argmax(axis=1))
     30             self.log({'accuracy_score': acc1})

RuntimeError: shape '[1, 8]' is invalid for input of size 672

The shape of preds is torch.Size([8, 84, 2418]) (batch size × target sequence length × vocab size), and the shape of inputs.labels is torch.Size([8, 84]).
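
As far as I can tell, len(inputs.labels) only gives the batch dimension (8), so the reshape asks for 1 × 8 elements while the tensor actually holds 8 × 84 = 672. A dummy tensor reproduces the error:

import torch

labels = torch.zeros(8, 84, dtype=torch.long)  # same shape as inputs.labels
print(len(labels))                             # 8, the batch dimension only
labels.reshape(1, len(labels))                 # RuntimeError: shape '[1, 8]' is
                                               # invalid for input of size 672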

I tried changing the line to:

acc1 = accuracy_score(inputs.labels.cpu(), preds.argmax(axis=1))

But then I get this error:

/usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py in _check_targets(y_true, y_pred)
    104     # No metrics support "multiclass-multioutput" format
    105     if y_type not in ["binary", "multiclass", "multilabel-indicator"]:
--> 106         raise ValueError("{0} is not supported".format(y_type))
    107 
    108     if y_type in ["binary", "multiclass"]:

ValueError: multiclass-multioutput is not supported
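
If I read the sklearn check correctly, accuracy_score rejects any 2-D integer arrays as "multiclass-multioutput", so the same error shows up with plain dummy data:

import numpy as np
from sklearn.metrics import accuracy_score

y_true = np.random.randint(0, 2418, size=(8, 84))  # 2-D integer labels
y_pred = np.random.randint(0, 2418, size=(8, 84))  # 2-D integer predictions
accuracy_score(y_true, y_pred)                     # ValueError: multiclass-multioutput
                                                   # is not supported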

I don’t really understand how to get these tensors into the correct dimensions for accuracy_score. Could somebody please explain? Thanks in advance.
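
Update: my current guess is that both tensors have to be flattened to 1-D, with the padding positions masked out, before calling accuracy_score. Below is a sketch with a hypothetical token_accuracy helper, assuming the labels use -100 for ignored positions (the Hugging Face convention); I’m not sure this is the right approach:

import torch
from sklearn.metrics import accuracy_score

def token_accuracy(logits, labels):
    """Token-level accuracy over non-padded positions (my guess at a fix)."""
    # logits: (batch, seq_len, vocab_size); labels: (batch, seq_len)
    preds = logits.argmax(dim=-1).reshape(-1).cpu()  # flatten to 1-D
    labels = labels.reshape(-1).cpu()
    mask = labels != -100                            # drop ignored positions
    return accuracy_score(labels[mask].numpy(), preds[mask].numpy())

I assume compute_metrics would need the same flattening and masking, since it receives the same 3-D logits.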