Thanks so much @lewtun!
I believe I managed to tweak the `evaluate()` method, but now I am struggling to log the metrics inside `on_evaluate()`. I keep getting the following error:
But when I inspect the `log_history`, I have both the training metrics and the `eval_loss` for the first epoch. I have been trying to get at `metrics` inside `on_evaluate()`, but I haven't had any success. I suspect it must be because of how I customized `evaluate()` to output a dictionary with the validation and training metrics, so below you can find my code (with a small sketch of my callback at the end).
```python
import collections
import time
from typing import Dict, List, Optional

from torch.utils.data import Dataset
from transformers import Trainer
from transformers.file_utils import is_torch_tpu_available
from transformers.trainer_utils import speed_metrics

if is_torch_tpu_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met


class MyTrainer(Trainer):
    def __init__(
        self,
        model,
        args=None,
        data_collator=None,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=None,
        model_init=None,
        compute_metrics=None,
        callbacks=None,
        optimizers=(None, None),
    ):
        super().__init__(model, args, data_collator, train_dataset, eval_dataset,
                         tokenizer, model_init, compute_metrics, callbacks, optimizers)

    def evaluate(
        self,
        train_dataset=None,
        eval_dataset: Optional[Dataset] = None,
        ignore_keys: Optional[List[str]] = None,
        metric_key_prefix: str = "eval",
    ) -> Dict[str, float]:
        # memory metrics - must set up as early as possible
        self._memory_tracker.start()

        if eval_dataset is not None and not isinstance(eval_dataset, collections.abc.Sized):
            raise ValueError("eval_dataset must implement __len__")

        train_dataloader = self.get_train_dataloader()
        eval_dataloader = self.get_eval_dataloader(eval_dataset)

        start_time = time.time()

        # Run the prediction loop over the training set to get training metrics
        train_output = self.prediction_loop(
            train_dataloader,
            description="Training",
            prediction_loss_only=True if self.compute_metrics is None else None,
            ignore_keys=ignore_keys,
            metric_key_prefix="train",
        )
        # Run the prediction loop over the evaluation set as usual
        eval_output = self.prediction_loop(
            eval_dataloader,
            description="Evaluation",
            # No point gathering the predictions if there are no metrics, otherwise we defer to
            # self.args.prediction_loss_only
            prediction_loss_only=True if self.compute_metrics is None else None,
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix,
        )

        train_n_samples = len(self.train_dataset)
        train_output.metrics.update(speed_metrics("train", start_time, train_n_samples))
        self.log(train_output.metrics)

        eval_n_samples = len(eval_dataset if eval_dataset is not None else self.eval_dataset)
        eval_output.metrics.update(speed_metrics(metric_key_prefix, start_time, eval_n_samples))
        self.log(eval_output.metrics)

        if self.args.tpu_metrics_debug or self.args.debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            xm.master_print(met.metrics_report())

        # Fire on_evaluate twice: once with the eval metrics, once with the train metrics
        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, eval_output.metrics)
        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, train_output.metrics)

        self._memory_tracker.stop_and_update_metrics(train_output.metrics)
        self._memory_tracker.stop_and_update_metrics(eval_output.metrics)

        dic = {
            "Training metrics": train_output.metrics,
            "Validation metrics": eval_output.metrics,
        }
        return dic
```
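
And this is roughly what I have on the callback side. It is only a minimal sketch: `MyMetricsCallback` is just a placeholder name, and the `on_evaluate` signature is the standard `TrainerCallback` one, which receives the logged dict as the `metrics` keyword argument:

```python
from transformers import TrainerCallback


class MyMetricsCallback(TrainerCallback):
    """Sketch of the callback where I am trying to read the metrics."""

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        # `metrics` is the dict passed to callback_handler.on_evaluate(...) in the
        # custom evaluate() above (first the eval metrics, then the train metrics).
        print("on_evaluate received:", metrics)
        # state.log_history keeps everything logged so far via self.log(...)
        print("last log_history entry:", state.log_history[-1] if state.log_history else None)
```

I pass it to the trainer with `callbacks=[MyMetricsCallback()]`.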