I’m fine-tuning a sentiment analysis model on news data. Since the simplest approach is to use a Hugging Face pre-trained model (roberta-base), I followed the Hugging Face tutorial "Getting Started with Sentiment Analysis using Python". The custom input data is simple: there are two columns, ‘text’ and ‘label’. The ‘text’ column contains news sentences, and ‘label’ contains 0 (40%) or 1 (60%). The data was then split into train, eval, and test sets.
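For reference, each JSON file wraps its records in a top-level ‘data’ field (that is what the field='data' argument in the loading code below expects). Here is a minimal sanity-check sketch, assuming the label column is named 'label':

from collections import Counter
from datasets import load_dataset

# each file looks like {"data": [{"text": "...", "label": 1}, ...]}
dataset = load_dataset(
    "json",
    data_files={"train": "./data/labeled_news/labeled_news_train.json"},
    field="data",
)

# should print roughly a 40/60 split between 0 and 1
print(Counter(dataset["train"]["label"]))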
This is the problem I ran into: ‘eval_loss’ (almost) never changes during training, yet eval accuracy is above 50%, and the training loss decreases slightly over training, so the model seems to have learned something. Maybe it stopped learning after the first epoch, or maybe the best checkpoint was selected automatically; I’m confused about what actually happened.
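(As far as I understand, Trainer only tracks a “best” checkpoint when it is explicitly told to. A sketch of the relevant arguments, which I do not set in my code below:)

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="news",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,        # reload the best checkpoint when training finishes
    metric_for_best_model="eval_loss",  # metric used to rank checkpoints
    greater_is_better=False,            # lower eval_loss counts as better
)

Since I don’t set load_best_model_at_end, that would explain why best_metric and best_model_checkpoint are null in the log at the end of this post.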
And this is the training code (the labeling code is omitted):
from datasets import load_dataset, load_metric
from transformers import RobertaTokenizer
from transformers import DataCollatorWithPadding
from transformers import RobertaForSequenceClassification
from transformers import set_seed
import numpy as np

set_seed(42)

dataset = load_dataset(
    'json',
    data_files={'train': './data/labeled_news/labeled_news_train.json',
                'eval': './data/labeled_news/labeled_news_eval.json'},
    field='data',
)

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    # padding="max_length" pads every example to the model maximum,
    # which leaves nothing for the DataCollatorWithPadding below to do
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["eval"].shuffle(seed=42)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

def compute_metrics(eval_pred):
    # the metrics are re-loaded on every evaluation call; harmless, just redundant
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}

# EarlyStoppingCallback is imported but never passed to the Trainer
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

repo_name = "news"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=30,
    per_device_eval_batch_size=30,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    push_to_hub=False,
    save_total_limit=10,
    logging_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
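One side note on the code above: because tokenize_function pads everything to max_length, the DataCollatorWithPadding has nothing left to pad. If I understand the docs correctly, dynamic per-batch padding would look like this instead (a sketch):

def tokenize_function(examples):
    # no padding at tokenization time; DataCollatorWithPadding then pads
    # each batch to its own longest sequence instead of always to the maximum
    return tokenizer(examples["text"], truncation=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

This should only change speed, not the metrics, so I don’t think it explains the frozen eval numbers.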
And this is the checkpoint log (trainer_state.json):
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "global_step": 28950,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.6873,
      "step": 5790
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.5636146428876184,
      "eval_f1": 0.7209124645273958,
      "eval_loss": 0.6850339770317078,
      "eval_runtime": 78.458,
      "eval_samples_per_second": 149.367,
      "eval_steps_per_second": 6.233,
      "step": 5790
    },
    {
      "epoch": 2.0,
      "learning_rate": 1.2e-05,
      "loss": 0.6867,
      "step": 11580
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.5636146428876184,
      "eval_f1": 0.7209124645273958,
      "eval_loss": 0.6850508451461792,
      "eval_runtime": 77.8433,
      "eval_samples_per_second": 150.546,
      "eval_steps_per_second": 6.282,
      "step": 11580
    },
    {
      "epoch": 3.0,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.6863,
      "step": 17370
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.5636146428876184,
      "eval_f1": 0.7209124645273958,
      "eval_loss": 0.6850472688674927,
      "eval_runtime": 77.8592,
      "eval_samples_per_second": 150.515,
      "eval_steps_per_second": 6.281,
      "step": 17370
    },
    {
      "epoch": 4.0,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.6862,
      "step": 23160
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.5636146428876184,
      "eval_f1": 0.7209124645273958,
      "eval_loss": 0.6850366592407227,
      "eval_runtime": 78.0649,
      "eval_samples_per_second": 150.119,
      "eval_steps_per_second": 6.264,
      "step": 23160
    },
    {
      "epoch": 5.0,
      "learning_rate": 0.0,
      "loss": 0.686,
      "step": 28950
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.5636146428876184,
      "eval_f1": 0.7209124645273958,
      "eval_loss": 0.6850322484970093,
      "eval_runtime": 78.338,
      "eval_samples_per_second": 149.595,
      "eval_steps_per_second": 6.242,
      "step": 28950
    }
  ],
  "max_steps": 28950,
  "num_train_epochs": 5,
  "total_flos": 1.827977212666368e+17,
  "trial_name": null,
  "trial_params": null
}
As you can see, ‘eval_accuracy’ never changes across all five epochs. I also notice that 0.5636 is close to the share of ‘1’ labels in my data (60% overall), so the model may simply be predicting the same class for every example. Since the training loss and ‘eval_loss’ do change slightly, I suspect the code itself is fine and this is a data problem. What I want to check is: is there a problem with the code, or anything I’ve missed?
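To narrow it down, here is a quick probe I plan to run after training (a sketch, reusing trainer and eval_dataset from the code above):

import numpy as np
from collections import Counter

# predict over the whole eval set and look at the distribution of predicted classes;
# if this prints a single class, the model has collapsed to the majority label,
# which would match the frozen eval_accuracy of 0.5636
pred_output = trainer.predict(eval_dataset)
predictions = np.argmax(pred_output.predictions, axis=-1)
print(Counter(predictions.tolist()))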