Hi everyone,

I am using evaluate in the compute_metrics function that I pass to the trainer. I have a custom trainer that reports logs to wandb. Everything works fine, but as soon as I add accuracy as a metric to report, evaluation breaks. My compute_metrics function contains the following code:
import evaluate
import numpy as np

def compute_metrics_fn(eval_pred):
    metrics = dict()
    accuracy_metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metrics.update(accuracy_metric.compute(references=labels, predictions=predictions))
    return metrics
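For reference, this is how I understand the accuracy metric is meant to be called on its own (based on my reading of the evaluate docs, not taken from my training code):

import evaluate

accuracy_metric = evaluate.load("accuracy")
# Flat lists of integer class ids, one entry per example:
result = accuracy_metric.compute(predictions=[0, 1, 1, 2], references=[0, 1, 2, 2])
print(result)  # {'accuracy': 0.75}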
This is my trainer:
trainer = CustomTrainer(
    model=model,
    tokenizer=tokenizer,
    args=train_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics_fn,
)
And my custom trainer:
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def evaluation_loop(
        self,
        dataloader,
        description,
        prediction_loss_only=None,
        ignore_keys=None,
        metric_key_prefix="eval",
    ):
        # call the superclass method to get the eval outputs
        eval_output = super().evaluation_loop(
            dataloader,
            description,
            prediction_loss_only,
            ignore_keys,
            metric_key_prefix,
        )
        # generate a sample during evaluation and log the audio to wandb
        if wandb.run is not None:
            input_ids = self.tokenizer.encode(
                "PIECE_START STYLE=JSFAKES GENRE=JSFAKES TRACK_START",
                return_tensors="pt",
            ).cuda()
            # Generate more tokens, one voice after another.
            voice1_generated_ids = self.model.generate(
                input_ids,
                max_length=512,
                do_sample=True,
                temperature=0.75,
                eos_token_id=self.tokenizer.encode("TRACK_END")[0],
            )
            voice2_generated_ids = self.model.generate(
                voice1_generated_ids,
                max_length=512,
                do_sample=True,
                temperature=0.75,
                eos_token_id=self.tokenizer.encode("TRACK_END")[0],
            )
            voice3_generated_ids = self.model.generate(
                voice2_generated_ids,
                max_length=512,
                do_sample=True,
                temperature=0.75,
                eos_token_id=self.tokenizer.encode("TRACK_END")[0],
            )
            voice4_generated_ids = self.model.generate(
                voice3_generated_ids,
                max_length=512,
                do_sample=True,
                temperature=0.75,
                eos_token_id=self.tokenizer.encode("TRACK_END")[0],
            )
            token_sequence = self.tokenizer.decode(voice4_generated_ids[0])
            note_sequence = token_sequence_to_note_sequence(token_sequence)
            synth = note_seq.fluidsynth
            array_of_floats = synth(note_sequence, sample_rate=SAMPLE_RATE)
            int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats)
            wandb.log({"Generated_audio": wandb.Audio(int16_data, SAMPLE_RATE)})
        return eval_output
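A quick way to check whether my override is the problem would probably be to run the same setup with the stock Trainer, e.g.:

# Same arguments as above, but with the plain Trainer instead of CustomTrainer,
# to see whether the error still occurs without my evaluation_loop override:
plain_trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=train_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics_fn,
)
plain_trainer.evaluate()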
And this is the error:
Traceback (most recent call last):
  File "/content/bach_gpt2_simple/train.py", line 242, in <module>
    train(default_config)
  File "/content/bach_gpt2_simple/train.py", line 236, in train
    trainer.train()
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1662, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2006, in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2287, in _maybe_log_save_evaluate
    metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2993, in evaluate
    output = eval_loop(
  File "/content/bach_gpt2_simple/customtrainer.py", line 24, in evaluation_loop
    eval_output = super().evaluation_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3281, in evaluation_loop
    metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
  File "/content/bach_gpt2_simple/train.py", line 212, in compute_metrics_fn
    metrics.update(accuracy_metric.compute(references=labels, predictions=predictions))
  File "/usr/local/lib/python3.10/dist-packages/evaluate/module.py", line 432, in compute
    self.add_batch(**inputs)
  File "/usr/local/lib/python3.10/dist-packages/evaluate/module.py", line 512, in add_batch
    raise ValueError(error_msg) from None
ValueError: Predictions and/or references don't match the expected format.
Expected format: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)},
Input predictions: [[72 72 10 ... 0 0 0]
 [72 72 10 ... 0 0 0]
 ...
 [72 72 10 ... 0 0 0]],
Input references: [[ 78 79 72...
I printed the dtypes of my predictions and references and they are both int64, which I think should be fine. I am assuming it has something to do with my CustomTrainer: I am unsure whether I need to add something to the evaluation_loop function so that it communicates with the compute_metrics function somehow.
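The only other thing I could think of is that the arrays compute_metrics receives are 2D, shaped (batch_size, sequence_length). If the metric needs flat lists, maybe I should flatten and mask them before calling compute, something like the sketch below (not verified; the -100 masking is just my assumption about how the padded label positions are marked):

def compute_metrics_fn(eval_pred):
    accuracy_metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Flatten the (batch_size, sequence_length) arrays into 1D and drop
    # positions whose label is -100 (assumed to mark padding).
    predictions = predictions.reshape(-1)
    labels = labels.reshape(-1)
    mask = labels != -100
    return accuracy_metric.compute(
        predictions=predictions[mask], references=labels[mask]
    )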
Thank you very much! Any support is highly appreciated.
Juan Carlos