Fine-tuning xlm-roberta-base (overfitting, any solution?)

import pandas as pd

from sklearn.model_selection import train_test_split

import torch

from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,DataCollatorWithPadding

# Load your dataset

df = pd.read_excel('dataset_final.xlsx')

# Split the dataset into training and testing sets

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Create a PyTorch Dataset

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_encodings = tokenizer(list(train_df['text']), truncation=True, padding=True, return_tensors='pt')

test_encodings = tokenizer(list(test_df['text']), truncation=True, padding=True, return_tensors='pt')

# Prepare the labels

train_labels = list(train_df['label'])

test_labels = list(test_df['label'])

# Create the Dataset objects (the Trainer builds its own DataLoaders)

train_dataset = CustomDataset(train_encodings, train_labels)

test_dataset = CustomDataset(test_encodings, test_labels)

import evaluate

accuracy = evaluate.load("accuracy")

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.25,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir="testing_finetune",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3.0,
    warmup_steps=600,
    weight_decay=0.1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# Create and configure the Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.push_to_hub()

This is my code, can someone help me? My results always show overfitting. Thank you.

Can we get some additional information? You’re training for 3 epochs with 600 warmup steps with a batch size of 16, so you’re doing warmup on 600x16=9600 sequences - is your dataset sufficiently large for this?
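
For reference, here is a rough way to check how much of training that warmup covers, plus the warmup_ratio option in TrainingArguments that sizes the warmup relative to the total number of steps. This is only a minimal sketch, using the split sizes given later in the thread; the warmup_ratio value of 0.06 is an illustrative assumption, not a recommendation from the original poster.

import math
from transformers import TrainingArguments

# Rough sanity check: how much of training does a 600-step warmup cover?
# (dataset/batch sizes taken from the numbers quoted later in this thread)
train_size = 29306
batch_size = 16
epochs = 3

steps_per_epoch = math.ceil(train_size / batch_size)   # 1832 optimizer steps per epoch
total_steps = steps_per_epoch * epochs                  # 5496 steps in total
warmup_fraction = 600 / total_steps                     # ~0.11, i.e. roughly 11% of training

# Alternatively, let the Trainer size the warmup for you with warmup_ratio
# instead of a fixed warmup_steps:
training_args = TrainingArguments(
    output_dir="testing_finetune",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    warmup_ratio=0.06,   # ~6% of total steps; adjust to taste
    weight_decay=0.1,
)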

Also, when is it overfitting? Is it already overfit after the first epoch? Or only with the final model? If you can provide train/test loss curves or eval info per epoch that would be a big help in identifying the problem.
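
In case it helps with sharing that, the per-epoch numbers can be read back out of the Trainer after training finishes. A minimal sketch, assuming the trainer object defined above; the exact keys present in log_history depend on the logging and evaluation settings.

# Pull per-epoch train/eval losses out of the Trainer's log history after
# trainer.train() has completed (each entry is a dict of logged values).
history = trainer.state.log_history

train_logs = [(h["epoch"], h["loss"]) for h in history if "loss" in h]
eval_logs = [(h["epoch"], h["eval_loss"]) for h in history if "eval_loss" in h]

for epoch, loss in eval_logs:
    print(f"epoch {epoch:.1f}  eval_loss {loss:.4f}")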

If it’s already overfitting after the first epoch, that could be an indication that perhaps your train and test splits aren’t representative of one another. Did you shuffle the dataset properly before creating these splits? Maybe the train set has a bunch of duplicates or near-duplicates, making it harder for the model to generalize? Also, what other hyperparameters have you tried? Did those improve/worsen your results?
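
On the duplicate/shuffle point, a minimal sketch of what that check could look like before splitting. It only catches exact duplicate texts (near-duplicates would need fuzzy matching or embedding similarity), and the 'text'/'label' column names are taken from the code above.

from sklearn.model_selection import train_test_split

# Drop exact duplicate texts so the same example cannot end up in both splits
df = df.drop_duplicates(subset="text").reset_index(drop=True)

# Shuffle and stratify on the label so train and test share the same class balance
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,            # this is the default, stated here for clarity
    stratify=df["label"],
)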

For now I am using 3 epochs with no warmup. My train dataset is 29,306 examples and my test set is 7,328.

training_args = TrainingArguments(
    output_dir="TESTING-ROBERTA",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=100,
    save_total_limit=3,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_malay["train"],
    eval_dataset=tokenized_malay["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

These are my new trainer arguments.

Training Loss   Epoch   Step   Validation Loss   Accuracy
0.6335          1.0     1832   0.6639            0.7250
0.5122          2.0     3664   0.6612            0.7404
0.3887          3.0     5496   0.7192            0.7389

This is my new result.

I did shuffle the dataset and split it with test_size=0.2.
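
From that table the validation loss bottoms out at epoch 2 (0.6612) and climbs at epoch 3 while the training loss keeps falling, which is the classic overfitting pattern. The EarlyStoppingCallback imported in the first snippet is never actually passed to the Trainer; below is a minimal sketch of wiring it in under the same setup. The patience value and the longer epoch budget are illustrative assumptions.

from transformers import EarlyStoppingCallback, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="TESTING-ROBERTA",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,                 # upper bound; early stopping can end training sooner
    weight_decay=0.2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",   # keep the checkpoint with the lowest eval loss
    greater_is_better=False,
    logging_steps=100,
    save_total_limit=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_malay["train"],
    eval_dataset=tokenized_malay["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()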