Hello all,
I am trying to fine-tune a model from the Hub with the Trainer API, using Optuna for hyperparameter search. I have noticed a strange behavior that I find hard to interpret: during training the loss changes (mostly decreasing), but all the other metrics either stay exactly the same or, when they do change, only jump between a few fixed values.
I have defined my own compute_metrics function and my own objective for Optuna to optimize.
My code is the following:
```python
import numpy as np
import torch
from evaluate import load
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# MODEL (checkpoint name) and encoded_dataset (tokenized TweetEval subset) are defined earlier (not shown)

learning_rate_max = 1e-4
learning_rate_min = 1e-6
weight_decay_min = 4e-5
weight_decay_ceil = 0.01
num_epochs_min = 2
num_epochs_max = 5
output_dir = './output'
per_device_train_batch_size_hp = 8
per_device_eval_batch_size_hp = 8
evaluation_strategy_hp = "epoch"
logging_strategy_hp = 'epoch'
save_strategy_hp = 'epoch'
backend = "optuna"
direction = "maximize"
n_trials = 5
index = 1
num_shards = 100
metrics = ["accuracy", "f1", "recall"]
def optuna_hp_space(trial):
    """Necessary function for hyperparameter tuning, constructs the hyperparameter space."""
    return {
        "learning_rate": trial.suggest_float("learning_rate", learning_rate_min, learning_rate_max,
                                             log=True),
        # suggest_loguniform is deprecated in recent Optuna; suggest_float(..., log=True) is equivalent
        "weight_decay": trial.suggest_float("weight_decay", weight_decay_min, weight_decay_ceil, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", low=num_epochs_min, high=num_epochs_max)
    }
def model_init_simple(trial):
    """Necessary function for hyperparameter tuning, loads the model from the Hugging Face Hub."""
    if torch.cuda.is_available():
        return AutoModelForSequenceClassification.from_pretrained(MODEL).to("cuda")
    else:
        return AutoModelForSequenceClassification.from_pretrained(MODEL)


def compute_metrics(eval_pred, metric_name):
    metric = load(metric_name)
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    if metric_name == "accuracy":
        return metric.compute(predictions=predictions, references=labels)
    else:
        return metric.compute(predictions=predictions, references=labels, average="macro")


def _compute_metrics(eval_pred):
    # metric = load_metric(self.metrics)
    score = {}
    for metric_name in metrics:
        # metric = load_metric(metric_name)
        score.update(compute_metrics(eval_pred, metric_name))
    return score


def my_objective(metrics):
    # Your elaborate computation here
    print("Optimizing with respect to recall macro")
    return metrics['eval_recall']


training_args = TrainingArguments(
    output_dir=output_dir,
    # overwrite_output_dir=True,
    per_device_train_batch_size=per_device_train_batch_size_hp,
    per_device_eval_batch_size=per_device_eval_batch_size_hp,
    evaluation_strategy=evaluation_strategy_hp,
    logging_strategy=logging_strategy_hp,
    save_strategy=save_strategy_hp,
    run_name='trial-0')  # need save strategy in order to log the trials in optuna

trainer = Trainer(
    model=None,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    model_init=model_init_simple,
    compute_metrics=_compute_metrics,
    # callbacks=[MLFlowCustomCallback(hp_search='optuna')]
)

best_trial_simple = trainer.hyperparameter_search(
    compute_objective=my_objective,
    direction=direction,
    backend=backend,
    hp_space=optuna_hp_space,
    n_trials=n_trials
)
```
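To make the metric wiring clearer: the Trainer prefixes the keys returned by `compute_metrics` with `eval_`, which is why `my_objective` reads `metrics['eval_recall']`. A quick way I sanity-check `_compute_metrics` on its own, with made-up logits and labels (toy data, not part of the training run), is something like:

```python
# Dummy sanity check of _compute_metrics (toy data, not from TweetEval)
import numpy as np

dummy_logits = np.array([[2.0, 0.1], [0.2, 1.5], [1.0, 0.3], [0.1, 2.2]])
dummy_labels = np.array([0, 1, 1, 1])

print(_compute_metrics((dummy_logits, dummy_labels)))
# -> {'accuracy': 0.75, 'f1': ..., 'recall': ...}
```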
The results I get are the following:
Also, I am training on TweetEval, on a small subset of roughly 400 tweets.
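In case the data preparation matters, the subset is built roughly like this (the TweetEval config name below is only a placeholder for whichever task I actually use; the shard uses the num_shards and index values from the configuration above):

```python
# Rough sketch of how the ~400-tweet subset is prepared (config name is a placeholder)
from datasets import load_dataset
from transformers import AutoTokenizer

raw = load_dataset("tweet_eval", "emotion")  # placeholder TweetEval config
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True)

encoded_dataset = raw.map(tokenize, batched=True)
# keep only a small shard of each split
encoded_dataset["train"] = encoded_dataset["train"].shard(num_shards=num_shards, index=index)
encoded_dataset["validation"] = encoded_dataset["validation"].shard(num_shards=num_shards, index=index)
```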
Any ideas what this might mean?
Thank you in advance!