Getting worse F1 and accuracy results after hyperparameter optimization

Hi!

I’m fine-tuning a model on my own dataset, but I’m getting worse macro F1 and accuracy results after hyperparameter optimization with Optuna. What am I doing wrong? Please find my code below:

LEARNING_RATE_HIGH = 0.01
LEARNING_RATE_LOW = 4e-5
MAX_EPOCHS = 5
MIN_EPOCHS = 2
NUM_TRIALS = 25
PER_DEVICE_EVAL_BATCH = 8
PER_DEVICE_TRAIN_BATCH = 8
WEIGHT_DECAY_HIGH = 0.01
WEIGHT_DECAY_LOW = 4e-5

import optuna
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# I pretrained cardiffnlp/twitter-xlm-roberta-base-sentiment on my 
# dataset before and saved the result at trained_model_path.
def objective(trial: optuna.Trial):
  model = AutoModelForSequenceClassification.from_pretrained(
    trained_model_path
  )
  training_arguments = TrainingArguments(
      learning_rate=trial.suggest_float(
          "learning_rate",
          LEARNING_RATE_LOW,
          LEARNING_RATE_HIGH,
          log=True,
      ),
      num_train_epochs=trial.suggest_int(
          "num_train_epochs", 
          high=MAX_EPOCHS, 
          low=MIN_EPOCHS,
      ),
      output_dir="hyperparameter_optimization",
      per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
      per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,
      weight_decay=trial.suggest_float(
          "weight_decay",
          WEIGHT_DECAY_LOW,
          WEIGHT_DECAY_HIGH,
          log=True,
      ),
  )

  trainer = Trainer(
      args=training_arguments,
      data_collator=data_collator, 
      eval_dataset=tokenized_dataset["eval"], 
      model=model,
      tokenizer=tokenizer, 
      train_dataset=tokenized_dataset["train"], 
  )

  result = trainer.train()
  return result.training_loss

study = optuna.create_study(
    direction="minimize", 
    study_name=model_name+"-hyperparameter-optimization"
)
study.optimize(func=objective, n_trials=NUM_TRIALS)

model = AutoModelForSequenceClassification.from_pretrained(
    trained_model_path
)

training_arguments = TrainingArguments(
    learning_rate=float(study.best_params['learning_rate']), 
    num_train_epochs=int(study.best_params['num_train_epochs']),
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=8, 
    output_dir="best_parameters",
    weight_decay=float(study.best_params['weight_decay']), 
)

trainer = Trainer(
    args=training_arguments, 
    compute_metrics=compute_metrics, 
    data_collator=data_collator, 
    eval_dataset=tokenized_dataset["eval"], 
    model=model, 
    tokenizer=tokenizer, 
    train_dataset=tokenized_dataset["train"],
)

trainer.train()
trainer.evaluate()
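
For context, compute_metrics (used by the final Trainer above and by the updated objective below) is along these lines. This is a simplified sketch using the evaluate library, so names may differ slightly from my actual code:

import evaluate
import numpy as np

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  # Each compute() call returns a dict (e.g. {"f1": 0.87}), so the
  # evaluation results end up with nested entries such as
  # eval_macro_f1 -> {"f1": ...} and eval_accuracy -> {"accuracy": ...}.
  return {
      "accuracy": accuracy_metric.compute(
          predictions=predictions, references=labels
      ),
      "macro_f1": f1_metric.compute(
          predictions=predictions, references=labels, average="macro"
      ),
  }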

I updated my objective function to optimize for the best macro F1 value and it is working much better now. I attach the new code below. Please let me know if you would make any changes.

def objective(trial: optuna.Trial):
  model = AutoModelForSequenceClassification.from_pretrained(trained_model_path)
  training_arguments = TrainingArguments(
      learning_rate=trial.suggest_float(
          "learning_rate",
          LEARNING_RATE_LOW,
          LEARNING_RATE_HIGH,
          log=True,
      ),
      num_train_epochs=trial.suggest_int(
          "num_train_epochs", 
          high=MAX_EPOCHS, 
          low=MIN_EPOCHS,
      ),
      output_dir="your_output_directory_path",
      per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
      per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,
      weight_decay=trial.suggest_float(
          "weight_decay",
          WEIGHT_DECAY_LOW,
          WEIGHT_DECAY_HIGH,
          log=True,
      ),
  )

  trainer = Trainer(
      args=training_arguments,
      compute_metrics=compute_metrics,
      data_collator=data_collator, 
      eval_dataset=tokenized_dataset["eval"], 
      model=model,
      tokenizer=tokenizer, 
      train_dataset=tokenized_dataset["train"], 
  )

  trainer.train()
  results = trainer.evaluate()
  return results["eval_macro_f1"]["f1"]
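
To go with this objective, the study is now created with direction="maximize", since higher macro F1 is better:

study = optuna.create_study(
    direction="maximize",
    study_name=model_name + "-hyperparameter-optimization",
)
study.optimize(func=objective, n_trials=NUM_TRIALS)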