Use "evaluate" to add compute_metrics to custom trainer

Hi everyone,

I am using `evaluate` inside the compute_metrics function that I pass to the Trainer. I have a custom trainer that logs to wandb. Training works fine on its own, but as soon as I add accuracy as a metric to report, it breaks. My compute_metrics function looks like this:

import evaluate
import numpy as np

def compute_metrics_fn(eval_pred):
    metrics = dict()
    # Load the accuracy metric from the evaluate library.
    accuracy_metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # Greedy predictions: take the highest-scoring token at each position.
    predictions = np.argmax(logits, axis=-1)
    metrics.update(accuracy_metric.compute(references=labels, predictions=predictions))
    return metrics

This is my trainer:

trainer = CustomTrainer(
    model=model,
    tokenizer=tokenizer,
    args=train_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics_fn,
)

And my custom trainer:

import note_seq
import wandb
from transformers import Trainer

# token_sequence_to_note_sequence and SAMPLE_RATE are defined elsewhere in my project.

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def evaluation_loop(
        self,
        dataloader,
        description,
        prediction_loss_only=None,
        ignore_keys=None,
        metric_key_prefix="eval",
    ):
        # call super class method to get the eval outputs
        eval_output = super().evaluation_loop(
            dataloader,
            description,
            prediction_loss_only,
            ignore_keys,
            metric_key_prefix,
        )

        # Generate a sample during evaluation and log the audio to wandb.
        if wandb.run is not None:
            input_ids = self.tokenizer.encode(
                "PIECE_START STYLE=JSFAKES GENRE=JSFAKES TRACK_START",
                return_tensors="pt",
            ).cuda()
            # Generate the four voices sequentially, each conditioned on the previous output.
            voice1_generated_ids = self.model.generate(
                input_ids,
                max_length=512,
                do_sample=True,
                temperature=0.75,
                eos_token_id=self.tokenizer.encode("TRACK_END")[0],
            )
            voice2_generated_ids = self.model.generate(
                voice1_generated_ids,
                max_length=512,
                do_sample=True,
                temperature=0.75,
                eos_token_id=self.tokenizer.encode("TRACK_END")[0],
            )
            voice3_generated_ids = self.model.generate(
                voice2_generated_ids,
                max_length=512,
                do_sample=True,
                temperature=0.75,
                eos_token_id=self.tokenizer.encode("TRACK_END")[0],
            )
            voice4_generated_ids = self.model.generate(
                voice3_generated_ids,
                max_length=512,
                do_sample=True,
                temperature=0.75,
                eos_token_id=self.tokenizer.encode("TRACK_END")[0],
            )
            token_sequence = self.tokenizer.decode(voice4_generated_ids[0])
            note_sequence = token_sequence_to_note_sequence(token_sequence)
            # Synthesize the note sequence to audio and log it to wandb.
            synth = note_seq.fluidsynth
            array_of_floats = synth(note_sequence, sample_rate=SAMPLE_RATE)
            int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats)
            wandb.log({"Generated_audio": wandb.Audio(int16_data, SAMPLE_RATE)})

        return eval_output

And this is the error:

Traceback (most recent call last):
  File "/content/bach_gpt2_simple/train.py", line 242, in <module>
    train(default_config)
  File "/content/bach_gpt2_simple/train.py", line 236, in train
    trainer.train()
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1662, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2006, in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2287, in _maybe_log_save_evaluate
    metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2993, in evaluate
    output = eval_loop(
  File "/content/bach_gpt2_simple/customtrainer.py", line 24, in evaluation_loop
    eval_output = super().evaluation_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3281, in evaluation_loop
    metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
  File "/content/bach_gpt2_simple/train.py", line 212, in compute_metrics_fn
    metrics.update(accuracy_metric.compute(references=labels, predictions=predictions))
  File "/usr/local/lib/python3.10/dist-packages/evaluate/module.py", line 432, in compute
    self.add_batch(**inputs)
  File "/usr/local/lib/python3.10/dist-packages/evaluate/module.py", line 512, in add_batch
    raise ValueError(error_msg) from None
ValueError: Predictions and/or references don't match the expected format.
Expected format: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)},
Input predictions: [[72 72 10 ...  0  0  0]
 [72 72 10 ...  0  0  0]
 ...
 [72 72 10 ...  0  0  0]],
Input references: [[  78   79   72...
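
As far as I understand the expected format from the error message, a direct call to the metric with flat integer lists should work, for example:

import evaluate

accuracy = evaluate.load("accuracy")
# Flat 1-D integer inputs, matching the expected int32 format from the error.
print(accuracy.compute(references=[0, 1, 1, 0], predictions=[0, 1, 0, 0]))
# -> {'accuracy': 0.75}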

I printed the dtypes of my predictions and references and both are int64, which I think should be fine. My guess is that it has something to do with my CustomTrainer: I am unsure whether I need to add something to evaluation_loop so that it communicates properly with compute_metrics.
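
Or could it be the shape instead? Since the inputs in the error are 2-D (batch, seq_len) arrays, maybe I need to flatten them and drop the padded positions before calling compute, something like the sketch below (the -100 padding value is just my assumption about what the data collator uses for labels):

def compute_metrics_fn(eval_pred):
    accuracy_metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Flatten (batch, seq_len) -> 1-D and drop padded label positions
    # (assuming the collator marks them with -100).
    mask = labels != -100
    return accuracy_metric.compute(
        references=labels[mask],
        predictions=predictions[mask],
    )

But I am not sure whether this is the right place for it, or whether something also needs to change in evaluation_loop.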

Thank you very much! Any support is highly appreciated.

Juan Carlos