Loss values change but accuracy, f1 and recall remain the same

Hello all,

I am trying to fine-tune a model from the Hub using the Trainer API together with Optuna for hyperparameter search, and I have noticed a strange behavior that I find hard to interpret: during training the loss changes (mostly decreasing), but all the other metrics either stay exactly the same or, when they do change, only jump between a few specific values.

I have defined my own compute_metrics function and my own objective for Optuna to optimize.

My code is the following:

import numpy as np
import torch
from evaluate import load
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# MODEL (the checkpoint name) and encoded_dataset (the tokenized dataset)
# are defined earlier in my script and omitted here.

learning_rate_max = 1e-4
learning_rate_min = 1e-6
weight_decay_min = 4e-5
weight_decay_ceil = 0.01
num_epochs_min = 2
num_epochs_max = 5


output_dir = './output'
per_device_train_batch_size_hp = 8
per_device_eval_batch_size_hp = 8
evaluation_strategy_hp = "epoch"
logging_strategy_hp = 'epoch'
save_strategy_hp = 'epoch'

backend = "optuna"
direction = "maximize"
n_trials = 5

index = 1
num_shards = 100

metrics = ["accuracy", "f1", "recall"]


def optuna_hp_space(trial):
    """Necessary for hyperparameter tuning: constructs the hyperparameter search space."""
    return {
        "learning_rate": trial.suggest_float("learning_rate", learning_rate_min, learning_rate_max, log=True),
        # suggest_loguniform is deprecated in recent Optuna versions; suggest_float(..., log=True) is the equivalent
        "weight_decay": trial.suggest_float("weight_decay", weight_decay_min, weight_decay_ceil, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", low=num_epochs_min, high=num_epochs_max),
    }

def model_init_simple(trial):
    """Necessary for hyperparameter tuning: loads a fresh model from the Hugging Face Hub for each trial."""
    if torch.cuda.is_available():
        return AutoModelForSequenceClassification.from_pretrained(
            MODEL
        ).to("cuda")
    else:
        return AutoModelForSequenceClassification.from_pretrained(
            MODEL
        )

def compute_metrics(eval_pred, metric_name):
    """Computes a single metric from the logits; f1 and recall are macro-averaged."""
    metric = load(metric_name)

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    if metric_name == "accuracy":
        return metric.compute(predictions=predictions, references=labels)
    else:
        return metric.compute(predictions=predictions, references=labels, average="macro")



def _compute_metrics(eval_pred):
    """Aggregates all the metrics into a single dict, as expected by the Trainer."""
    score = {}
    for metric_name in metrics:
        score.update(compute_metrics(eval_pred, metric_name))

    return score


def my_objective(metrics):
    """Objective for Optuna: maximize macro-averaged recall."""
    print("Optimizing with respect to macro recall")
    return metrics["eval_recall"]


training_args = TrainingArguments(
    output_dir=output_dir,
    # overwrite_output_dir=True,
    per_device_train_batch_size=per_device_train_batch_size_hp,
    per_device_eval_batch_size=per_device_eval_batch_size_hp,
    evaluation_strategy=evaluation_strategy_hp,
    logging_strategy=logging_strategy_hp,
    save_strategy=save_strategy_hp,  # a save strategy is needed so that the Optuna trials get logged
    run_name='trial-0',
)

trainer = Trainer(
    model=None,  # the model is created per trial via model_init
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    model_init=model_init_simple,
    compute_metrics=_compute_metrics,
    # callbacks=[MLFlowCustomCallback(hp_search='optuna')]
)

best_trial_simple = trainer.hyperparameter_search(
    compute_objective=my_objective,
    direction=direction,
    backend=backend,
    hp_space=optuna_hp_space,
    n_trials=n_trials,
)

The results I get show exactly this pattern: the loss changes from epoch to epoch, while accuracy, F1, and recall either stay identical or only flip between a couple of values.
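
To check whether the predictions actually change between evaluations, I am considering a variant of _compute_metrics that also prints the predicted-label distribution. This is just a sketch (_compute_metrics_debug is a made-up name; everything else is the same as above):

def _compute_metrics_debug(eval_pred):
    """Same as _compute_metrics, but also prints how often each class is predicted."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # If the model always predicts the same class, accuracy/f1/recall will not move
    pred_classes, pred_counts = np.unique(predictions, return_counts=True)
    true_classes, true_counts = np.unique(labels, return_counts=True)
    print("predicted class counts:", dict(zip(pred_classes.tolist(), pred_counts.tolist())))
    print("reference class counts:", dict(zip(true_classes.tolist(), true_counts.tolist())))

    score = {}
    for metric_name in metrics:
        score.update(compute_metrics(eval_pred, metric_name))
    return score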

Also, the training is run on TweetEval, on a small subset of roughly 400 tweets.
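
The index and num_shards values defined at the top are meant for slicing the dataset down to that subset. The preprocessing itself is not in the snippet above, but it is roughly along these lines (the TweetEval config name is just a placeholder, and I also print the label distribution because the split is so small):

from collections import Counter
from datasets import load_dataset

# "emotion" is only a placeholder config; index / num_shards are the values defined above
raw_dataset = load_dataset("tweet_eval", "emotion")
small_train = raw_dataset["train"].shard(num_shards=num_shards, index=index)

print(len(small_train))               # a few hundred tweets
print(Counter(small_train["label"]))  # how balanced the classes are in the subset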

Any ideas what this might mean?

Thank you in advance!