Different accuracy values

Hello everyone, I'm trying to fine-tune DistilBertForSequenceClassification on my custom dataset to perform binary classification in Colab. The first time I fine-tuned the model, it gave me good accuracy on my test set. However, the second time I fine-tuned the model — this time in order to save it — the accuracy on the same test set dropped sharply. I tried over and over again and nothing changed. Can anyone help me understand what's going on?
here is the code:
# 80/20 train/test split; the fixed random_state makes the split reproducible,
# so both training sessions evaluate on the same test set.
# NOTE(review): `texts` and `labels` must be defined earlier — not visible here.
train_texts , test_texts , train_labels , test_labels = train_test_split(texts, labels, test_size=0.2 , random_state=40)

# Pretrained checkpoint used for both the tokenizer and the model weights.
checkpoint = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)

# Binary classification head (num_labels=2) on top of DistilBERT.
# NOTE(review): `id2label` / `label2id` must be defined earlier in the file — verify.
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2, problem_type= 'single_label_classification' , id2label=id2label , label2id=label2id )
class CustomDataset(Dataset):
  """Map-style dataset that tokenizes one raw text per item.

  Each item is a dict with fixed-length ``input_ids`` and
  ``attention_mask`` 1-D tensors plus a scalar ``labels`` tensor —
  the format the HF ``Trainer`` consumes directly.
  """

  def __init__(self , texts, labels, tokenizer , max_len = 160):
    # Keep raw inputs as-is; tokenization is deferred to __getitem__.
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
      # One example per raw text.
      return len(self.texts)

  def __getitem__(self , idx):
    # Tokenize lazily; pad/truncate to a fixed length so items batch cleanly.
    encoded = self.tokenizer(
        str(self.texts[idx]),
        truncation=True,
        padding='max_length',
        max_length=self.max_len,
        return_tensors='pt',
    )
    # flatten() drops the leading batch dim the tokenizer adds with return_tensors='pt'.
    item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
    item['labels'] = torch.tensor(self.labels[idx])
    return item
def compute_metricn(pred):
  """Compute accuracy and macro precision/recall/F1 for a Trainer EvalPrediction.

  Args:
    pred: object exposing ``label_ids`` (true labels) and ``predictions``
      (raw logits with shape [n_samples, n_classes]).

  Returns:
    dict with 'ACC', 'f1', 'Precision' and 'Recall' entries.
  """
  labels = pred.label_ids
  # Highest-scoring class per sample.
  preds = pred.predictions.argmax(1)
  # zero_division=1 avoids warnings/NaNs when a class is never predicted.
  precision, recall, f1n, _ = precision_recall_fscore_support(labels, preds, average='macro', zero_division=1)
  acc = accuracy_score(labels, preds)
  # Fixed: the original had a stray ``` markdown fence fused onto the return
  # line, which is a SyntaxError.
  return {
      'ACC': acc,
      'f1': f1n,
      'Precision': precision,
      'Recall': recall,
  }

#training
from transformers import TrainingArguments , Trainer

# Trainer hyper-parameters and checkpoint/logging bookkeeping.
args = TrainingArguments(
    output_dir='./results',          # fixed: typographic quotes ‘./results’ are a SyntaxError
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    save_steps=200,
    logging_steps=172,
    evaluation_strategy="epoch",     # evaluate on the eval set at every epoch end
    save_total_limit=1,              # keeps only the most RECENT checkpoint; it is
                                     # the best one only if load_best_model_at_end=True
)

# NOTE(review): `trainer2` is constructed but never trained or used below —
# only `trainer` (further down) calls .train(). It also wraps the SAME `model`
# object, so training through either Trainer mutates the same weights.
# `train_dataset` / `test_dataset` are not defined in this snippet —
# presumably CustomDataset instances built elsewhere; verify.
trainer2 = Trainer(model=model , args = args , train_dataset = train_dataset, eval_dataset= test_dataset ,compute_metrics=compute_metricn )

from transformers.trainer_callback import DefaultFlowCallback, TrainerCallback, TrainerControl, TrainerState
from copy import deepcopy
class CustomCallback(TrainerCallback):
    """Callback that additionally evaluates on the *training* set at each epoch end.

    Metrics from this extra pass are logged with the "train" prefix, so
    train vs. test performance can be compared epoch by epoch.

    Fixed: the pasted version had the method bodies at column 0 instead of
    inside the class body, which is an IndentationError.
    """

    def __init__(self, trainer) -> None:
        super().__init__()
        # Keep a handle on the Trainer so we can call evaluate() from the hook.
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        # Only run when the Trainer was about to evaluate anyway.
        if control.should_evaluate:
            # evaluate() mutates `control`; return a pre-evaluation copy so the
            # normal evaluation on the eval set still happens afterwards.
            control_copy = deepcopy(control)
            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            return control_copy

# NOTE(review): this reuses the SAME `model` object given to `trainer2` above.
# Re-running this cell in Colab keeps training the already-fine-tuned weights
# instead of starting from the pretrained checkpoint — a likely cause of the
# test accuracy changing between runs. Re-create the model with
# from_pretrained(...) immediately before each fresh training run, and set
# TrainingArguments(seed=...) for reproducibility — TODO confirm.
trainer = Trainer(
model=model, # the instantiated Transformers model to be trained
args= args, # training arguments, defined above
train_dataset=train_dataset, # training dataset
eval_dataset=test_dataset, # evaluation dataset
compute_metrics=compute_metricn, # metric-computation function passed to the Trainer
tokenizer=tokenizer

)
# Log train-set metrics alongside eval-set metrics every epoch.
trainer.add_callback(CustomCallback(trainer))
# `train` holds the TrainOutput (global_step, training_loss, metrics).
train = trainer.train()

1 Like