I’m trying to add some custom logs to my training pipeline by overriding some methods of `TensorBoardCallback`. Here’s my custom subclass:
```python
import psutil
import torch
from transformers.integrations import TensorBoardCallback

# tokenize_utils is a project-local helper module


class CustomTensorBoardCallback(TensorBoardCallback):
    """Custom TensorBoardCallback class to log hardware usage
    and store some predictions at evaluation phases."""

    def __init__(self, tokenizer, num_samples=5, freq=1, tb_writer=None):
        super().__init__(tb_writer)
        self.tokenizer = tokenizer
        self.num_samples = num_samples
        self.freq = freq  # Frequency of logging
        self.sample_sources = None
        self.sample_references = None
        self.sample_sources_ids = None

    @staticmethod
    def _bytes_to_gigabytes(n_bytes):
        return round(n_bytes / 1024 / 1024 / 1024, 2)

    def _log_hardware_usage(self, state):
        # Pass the current step so the scalars form a time series
        # instead of all landing on step 0.
        self.tb_writer.add_scalar(
            "cpu_ram_available_GB",
            self._bytes_to_gigabytes(psutil.virtual_memory().available),
            global_step=state.global_step,
        )
        self.tb_writer.add_scalar(
            "cpu_ram_used_GB",
            self._bytes_to_gigabytes(psutil.virtual_memory().used),
            global_step=state.global_step,
        )
        if torch.cuda.is_available():
            # torch.cuda.utilization() returns a percentage
            self.tb_writer.add_scalar(
                "gpu_usage", torch.cuda.utilization(), global_step=state.global_step
            )
            # torch.cuda.memory_allocated() returns bytes
            # (torch.cuda.memory_usage() would return a percentage, not bytes)
            self.tb_writer.add_scalar(
                "gpu_memory_usage_GB",
                self._bytes_to_gigabytes(torch.cuda.memory_allocated()),
                global_step=state.global_step,
            )
        self.tb_writer.flush()

    def on_init_end(self, args, state, control, **kwargs):
        print("I am running on_init_end")
        super().on_init_end(args, state, control, **kwargs)
        self._log_hardware_usage(state)

    def on_log(self, args, state, control, **kwargs):
        print("I am running on_log")
        super().on_log(args, state, control, **kwargs)
        self._log_hardware_usage(state)

    def on_evaluate(self, args, state, control, model, eval_dataloader, **kwargs):
        print("I am running on_evaluate")
        super().on_evaluate(args, state, control, **kwargs)
        if state.global_step % self.freq == 0:
            if self.sample_sources_ids is None:
                # Cache the first batch of validation samples
                val_first_batch = next(iter(eval_dataloader))
                for key in val_first_batch:
                    val_first_batch[key] = val_first_batch[key][:self.num_samples, :]
                self.sample_sources_ids = val_first_batch["input_ids"]
                # Decode source segments and labels
                # (assuming decode_token_ids handles the -100 padding in labels)
                self.sample_sources = tokenize_utils.decode_token_ids(
                    self.sample_sources_ids.cpu(), self.tokenizer
                )
                self.sample_references = tokenize_utils.decode_token_ids(
                    val_first_batch["labels"].cpu(), self.tokenizer
                )
            # Move the cached ids to the model's device before generating
            preds_encoded = model.generate(self.sample_sources_ids.to(model.device))
            preds_decoded = tokenize_utils.decode_token_ids(preds_encoded.cpu(), self.tokenizer)
            self.tb_writer.add_text(
                "eval_source_ref_pred_sample",
                "\n".join(
                    f"{src_seg}\t{ref}\t{pred}"
                    for src_seg, ref, pred
                    in zip(self.sample_sources, self.sample_references, preds_decoded)
                ),
                global_step=state.global_step,
            )
            self.tb_writer.flush()
```
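For what it’s worth, my understanding of `SummaryWriter` (which may be exactly where I’m going wrong) is that a bare writer pointed at the same directory is enough for scalars to show up, as in this minimal standalone sketch (tag and values are made up):

```python
from torch.utils.tensorboard import SummaryWriter

# Minimal check outside the Trainer entirely: if these scalars show up
# under "mylogdir" but the callback's don't, the problem is in how/where
# the callback writes, not in TensorBoard itself.
writer = SummaryWriter(log_dir="mylogdir")
for step in range(10):
    writer.add_scalar("sanity/dummy_value", step * 0.5, global_step=step)
writer.flush()
writer.close()
```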
I’m calling it in a script with:
```python
# Define training hyperparameters
train_args = Seq2SeqTrainingArguments(
    output_dir=output_folder,
    **config["train_args"],
    log_level="info",
    logging_first_step=True,
    run_name=config["run_name"],
    disable_tqdm=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="tensorboard",
    push_to_hub=False,
    predict_with_generate=True,
)

tb_writer = SummaryWriter(log_dir="mylogdir")
tb_callback = misc_utils.CustomTensorBoardCallback(tokenizer=T5_tokenizer, tb_writer=tb_writer)
...
callbacks = [tb_callback]
...
trainer = Seq2SeqTrainer(
    T5_model,
    train_args,
    ...
    callbacks=callbacks,
)
```
I can confirm that my custom methods are being run, because I had to fix some errors along the way (e.g. I had forgotten to move the tensors to the CPU) and I can find the prints in my stdout file.
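In case it helps with reproducing, this is a quick sketch of how the on-disk event files and the attached callbacks can be inspected (it reuses `output_folder` and `trainer` from above; the comment about `logging_dir` reflects my understanding of the defaults):

```python
import glob

# Event files written by my custom writer vs. by the Trainer's default
# TensorBoardCallback (which, if I understand the defaults correctly,
# writes under args.logging_dir, itself under output_dir unless overridden).
print(glob.glob("mylogdir/**/events.out.tfevents.*", recursive=True))
print(glob.glob(f"{output_folder}/**/events.out.tfevents.*", recursive=True))

# The list of callbacks actually attached to the trainer
print(trainer.callback_handler.callbacks)
```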
However, when I open TensorBoard (`tensorboard --logdir .`), I can see the default Hugging Face logged values (train/eval loss, some custom metrics I implemented, runtime, total_flos…), but not the custom ones I implemented and explicitly flushed in `CustomTensorBoardCallback`.
Do you have any idea what I could be doing wrong, or why the flushed logs don’t appear in TensorBoard?