Sentiment Analysis with Transfer Learning (roBERTa) - Imbalanced dataset

Hello everyone,

This is my first post on the Hugging Face Forum. I’m trying to do transfer learning from a Sentiment Analysis roBERTa (TripAdvisor) model, and I’m following this notebook.

I have a dataset with Tripadvisor reviews (5179 rows). The dataset is imbalanced.

  • 4833 Positive
  • 237 Neutral
  • 109 Negative

I’ve tried oversampling, but it didn’t help. I’ve read that I can use class weights with the Cross-Entropy Loss Function or Focal Loss; can that help?
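
For reference, this is roughly what I have in mind for the class weights: a minimal sketch using scikit-learn’s compute_class_weight on my encoded training labels (variable names match the code further down; the actual weight values would come from the data):

from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch

# 'balanced' weights are inversely proportional to class frequency,
# so the rare Negative and Neutral classes get larger weights
weights = compute_class_weight(class_weight='balanced',
                               classes=np.unique(train_labels),
                               y=train_labels)
class_weights = torch.tensor(weights, dtype=torch.float)
# these would then be passed as nn.CrossEntropyLoss(weight=class_weights)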

These are my results when I evaluated the model (the data is cleaned):

max_len = 160
batch_size = 32
LR=3e-5
Epoch = 1

Training completed for epoch 1
Validation Loss: 0.10367343351390446
Validation Accuracy: 0.9842171717171716

              precision    recall  f1-score   support

           0       0.63      0.59      0.61        49
           1       0.24      0.15      0.19        53
           2       0.96      0.98      0.97      1199

    accuracy                           0.93      1301
   macro avg       0.61      0.57      0.59      1301
weighted avg       0.92      0.93      0.92      1301
# Split data into training and validation sets
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import numpy as np

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, stratify=labels, test_size=0.2, random_state=42)

# Oversample the minority classes in the training split only
ros = RandomOverSampler()
train_texts_resampled, train_labels_resampled = ros.fit_resample(
    np.array(train_texts).reshape(-1, 1), np.array(train_labels).reshape(-1, 1))

train_texts_resampled = train_texts_resampled.flatten()
train_labels_resampled = train_labels_resampled.flatten()

# Check the class distribution after oversampling
(unique, counts) = np.unique(train_labels_resampled, return_counts=True)
np.asarray((unique, counts)).T
# Define a custom Dataset class for sentiment classification
import torch
from torch.utils.data import Dataset, DataLoader

class Sentiment(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=160):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize and encode the text using the provided tokenizer
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',    # pad every sequence to max_len
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Return a dictionary containing tokenized data and label
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Use the encoded labels
train_labels = encoded_labels_train
val_labels = encoded_labels_valid

# Create Sentiment instances for training and validation data
train_data = Sentiment(train_texts_resampled, train_labels_resampled, tokenizer, max_len=160)
val_data = Sentiment(val_texts, val_labels, tokenizer, max_len=160)

# Create DataLoader instances for training and validation data
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)
# Initialize the AdamW optimizer with the parameters of the model
# Set the learning rate (lr) to 1e-5
from torch.optim import AdamW  # the notebook may import AdamW from transformers instead
from tqdm import tqdm
from sklearn.metrics import accuracy_score

optimizer = AdamW(model.parameters(), lr=1e-5)

# Loop over training epochs (only 1 epoch here, since a single epoch already takes a long time to train)
for epoch in range(1):
    model.train()  # Set the model to training mode
    train_iterator = tqdm(train_loader, desc=f'Epoch {epoch + 1}', leave=False)

    for batch in train_iterator:
        optimizer.zero_grad()  # Clear gradients
        input_ids = batch['input_ids'].to(device)  # Move input to the device
        attention_mask = batch['attention_mask'].to(device)  # Move attention mask to the device
        labels = batch['labels'].to(device)  # Move labels to the device
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)  # Forward pass
        loss = outputs[0]  # Get the loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update model parameters using gradients

        train_iterator.set_postfix({'Loss': loss.item()})

    print('Training completed for epoch', epoch + 1)

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    val_accuracy = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validation', leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            val_loss += loss.item()

            # Calculate accuracy
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            accuracy = accuracy_score(labels.cpu(), predictions.cpu())
            val_accuracy += accuracy

    val_loss /= len(val_loader)  # Calculate average validation loss
    val_accuracy /= len(val_loader)  # Calculate average validation accuracy
    print(f'Validation Loss: {val_loss} and Validation Accuracy: {val_accuracy}')

Using class weights can indeed help. See the documentation on pos_weight in [BCEWithLogitsLoss — PyTorch 2.1 documentation].
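
For a 3-class problem like this one, the analogous option is the weight argument of torch.nn.CrossEntropyLoss (pos_weight applies to the binary/multi-label BCEWithLogitsLoss). The loss the model returns when you pass labels= is unweighted, so one approach is to compute the loss from the logits yourself. A minimal sketch, reusing the variable names from your training loop and with placeholder weight values (compute real ones from your training labels):

import torch
import torch.nn as nn

# Placeholder weights, e.g. inverse class frequency; replace with values
# derived from the actual label distribution.
class_weights = torch.tensor([10.0, 5.0, 1.0], device=device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

# Inside the training loop: call the model without labels= so it does not
# compute its own (unweighted) loss, then apply the weighted loss to the logits.
outputs = model(input_ids, attention_mask=attention_mask)
loss = loss_fn(outputs.logits, labels)
loss.backward()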