Fine-tuning with Trainer doesn't seem to learn after the first epoch

I’m fine-tuning a sentiment analysis model on news data. Since the simplest way is to use a Hugging Face pre-trained model (roberta-base), I followed the Hugging Face tutorial “Getting Started with Sentiment Analysis using Python”. The custom input data is simple: there are 2 columns named ‘text’ and ‘labels’. The ‘text’ column consists of news sentences and the ‘labels’ column consists of ‘0’ (40%) and ‘1’ (60%). The data was then split into train, eval, and test sets.

Here is the problem I ran into: ‘eval_loss’ (almost) never changes during training, yet the accuracy is above 50%. The training loss decreases (slightly) during training, so the model seems to have learned something. Maybe it stopped learning after the first epoch, or the best checkpoint was selected automatically — but I’m confused about what actually happened.

This is the training code (the labeling code is omitted):

from datasets import load_dataset
from transformers import RobertaTokenizer
from transformers import DataCollatorWithPadding
from transformers import RobertaForSequenceClassification
import numpy as np
from datasets import load_metric
from transformers import set_seed

# Seed once, up front, before any data or model is created.
set_seed(42)

# Both splits are JSON files; field="data" names the key inside each file
# under which the records are stored.
dataset = load_dataset(
    "json",
    field="data",
    data_files={
        "train": "./data/labeled_news/labeled_news_train.json",
        "eval": "./data/labeled_news/labeled_news_eval.json",
    },
)

# Tokenizer for the same checkpoint ("roberta-base") that is fine-tuned below.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Over-long sequences are truncated.  No padding is applied here: the
    ``DataCollatorWithPadding`` that is handed to the ``Trainer`` pads each
    batch to the length of its longest member, so padding every example to
    the full maximum length up front (``padding="max_length"``) would only
    waste memory and compute.

    Args:
        examples: A batch as supplied by ``Dataset.map(..., batched=True)``;
            only ``examples["text"]`` is read.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    return tokenizer(examples["text"], truncation=True)

# Run tokenize_function over every split, passing examples in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Shuffle each split with a fixed seed (42).
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["eval"].shuffle(seed=42)

# Collator built around the same tokenizer; it is passed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# roberta-base with a sequence-classification head configured for 2 labels.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)


# Metric objects already loaded, keyed by metric name.
_METRICS = {}


def _get_metric(name):
    """Return the metric called *name*, calling ``load_metric`` only once per name."""
    if name not in _METRICS:
        _METRICS[name] = load_metric(name)
    return _METRICS[name]


def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    The metric objects are fetched through ``_get_metric`` so that they are
    loaded on the first evaluation only, instead of on every call.

    Args:
        eval_pred: A ``(logits, labels)`` pair.  The predicted class of each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    f1 = _get_metric("f1").compute(
        predictions=predictions, references=labels
    )["f1"]
    return {"accuracy": accuracy, "f1": f1}


from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

repo_name = "news"

training_args = TrainingArguments(
    # Where checkpoints are written; nothing is pushed to the Hub.
    output_dir=repo_name,
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=30,
    per_device_eval_batch_size=30,
    # Log, evaluate and save once per epoch; keep at most 10 checkpoints.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()

And this is the checkpoint log (trainer_state.json):

{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "global_step": 28950,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.6873,
      "step": 5790
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.5636146428876184,
      "eval_f1": 0.7209124645273958,
      "eval_loss": 0.6850339770317078,
      "eval_runtime": 78.458,
      "eval_samples_per_second": 149.367,
      "eval_steps_per_second": 6.233,
      "step": 5790
    },
    {
      "epoch": 2.0,
      "learning_rate": 1.2e-05,
      "loss": 0.6867,
      "step": 11580
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.5636146428876184,
      "eval_f1": 0.7209124645273958,
      "eval_loss": 0.6850508451461792,
      "eval_runtime": 77.8433,
      "eval_samples_per_second": 150.546,
      "eval_steps_per_second": 6.282,
      "step": 11580
    },
    {
      "epoch": 3.0,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.6863,
      "step": 17370
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.5636146428876184,
      "eval_f1": 0.7209124645273958,
      "eval_loss": 0.6850472688674927,
      "eval_runtime": 77.8592,
      "eval_samples_per_second": 150.515,
      "eval_steps_per_second": 6.281,
      "step": 17370
    },
    {
      "epoch": 4.0,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.6862,
      "step": 23160
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.5636146428876184,
      "eval_f1": 0.7209124645273958,
      "eval_loss": 0.6850366592407227,
      "eval_runtime": 78.0649,
      "eval_samples_per_second": 150.119,
      "eval_steps_per_second": 6.264,
      "step": 23160
    },
    {
      "epoch": 5.0,
      "learning_rate": 0.0,
      "loss": 0.686,
      "step": 28950
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.5636146428876184,
      "eval_f1": 0.7209124645273958,
      "eval_loss": 0.6850322484970093,
      "eval_runtime": 78.338,
      "eval_samples_per_second": 149.595,
      "eval_steps_per_second": 6.242,
      "step": 28950
    }
  ],
  "max_steps": 28950,
  "num_train_epochs": 5,
  "total_flos": 1.827977212666368e+17,
  "trial_name": null,
  "trial_params": null
}

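You can see that ‘eval_accuracy’ never changes. Since the training loss and ‘eval_loss’ do change slightly, the code itself is probably fine; maybe it’s just a problem with the data. (Note that an accuracy of 0.5636 together with an F1 of 0.7209 ≈ 2·0.5636/(1 + 0.5636) is exactly what predicting label ‘1’ for every eval example would give — recall 1, precision 0.5636 — so the model may simply be outputting the majority class.) What I want to check is: “Is there a problem with the code, or anything I missed?”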

Training worked after changing this code:

def compute_metrics(eval_pred):
    """Compute accuracy and F1 from a ``(logits, labels)`` pair.

    Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    # Both metrics are loaded inside the function, i.e. on every call.
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")

    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}

into this:

    # Loaded once, outside the function, and reused on every call.
    load_accuracy = load_metric("accuracy")

    def compute_metrics(eval_pred):
        """Return the accuracy metric's result for the argmax (axis 1) predictions."""
        logits, labels = eval_pred
        predicted_classes = np.argmax(logits, axis=1)
        return load_accuracy.compute(predictions=predicted_classes, references=labels)