Hello everyone,
this is my first post on the Hugging Face Forum. I'm trying to do transfer learning from the Sentiment Analysis roBERTa - Trip Advisor model, and I'm following this notebook.
I have a dataset of TripAdvisor reviews (5,179 rows), and it is heavily imbalanced:
- 4833 Positive
- 237 Neutral
- 109 Negative
I've tried oversampling, but it didn't help. I've read that I can use class weights with the cross-entropy loss function, or focal loss; can that help?
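For context, this is roughly how I understand the class-weights approach would look, with the weights derived from the training class counts via scikit-learn. A sketch, not something I've validated:

```python
import numpy as np
import torch
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights: n_samples / (n_classes * class_count)
class_weights = compute_class_weight(
    class_weight='balanced', classes=np.unique(train_labels), y=train_labels)
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

loss_fn = torch.nn.CrossEntropyLoss(weight=weights)
# In the training loop, the loss would then come from the logits instead of
# passing labels= to the model:
#   outputs = model(input_ids, attention_mask=attention_mask)
#   loss = loss_fn(outputs.logits, labels)
```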
These are my results when I evaluated the model (the data is cleaned):
- max_len = 160
- batch_size = 32
- lr = 3e-5
- epochs = 1
```
Training completed for epoch 1
Validation Loss: 0.10367343351390446
Validation Accuracy: 0.9842171717171716

              precision    recall  f1-score   support

           0       0.63      0.59      0.61        49
           1       0.24      0.15      0.19        53
           2       0.96      0.98      0.97      1199

    accuracy                           0.93      1301
   macro avg       0.61      0.57      0.59      1301
weighted avg       0.92      0.93      0.92      1301
```
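Class 2 dominates the validation set (1,199 of 1,301 examples), so the 0.93 accuracy mostly reflects the majority class, while macro F1 is only 0.59. Given the low recall on classes 0 and 1, I also looked at focal loss. This is a minimal implementation I put together from the standard formulation (untested; gamma = 2.0 is a guess, not tuned):

```python
import torch
import torch.nn.functional as F

class FocalLoss(torch.nn.Module):
    """Minimal multi-class focal loss: down-weights easy, well-classified examples."""
    def __init__(self, gamma=2.0):
        super().__init__()
        self.gamma = gamma

    def forward(self, logits, targets):
        ce = F.cross_entropy(logits, targets, reduction='none')  # per-example CE
        pt = torch.exp(-ce)  # probability assigned to the true class
        return ((1.0 - pt) ** self.gamma * ce).mean()
```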
```python
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Stratified split so the validation set keeps the original class ratios
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, stratify=labels, test_size=0.2, random_state=42)

# Oversample the minority classes on the training split only
# (imblearn expects 2-D features and 1-D labels)
ros = RandomOverSampler()
train_texts_resampled, train_labels_resampled = ros.fit_resample(
    np.array(train_texts).reshape(-1, 1), np.array(train_labels))
train_texts_resampled = train_texts_resampled.flatten()

# Check the class distribution after resampling
unique, counts = np.unique(train_labels_resampled, return_counts=True)
print(np.asarray((unique, counts)).T)
```
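One note on the labels: `encoded_labels_train` / `encoded_labels_valid` used further down are the integer-encoded labels. I'm assuming an encoding step like this happens after the split (hypothetical sketch; the real preprocessing is earlier in my notebook). With alphabetical encoding this gives Negative=0, Neutral=1, Positive=2, which matches the supports in the report above:

```python
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
encoded_labels_train = le.fit_transform(train_labels)  # fit on training labels only
encoded_labels_valid = le.transform(val_labels)
```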
```python
import torch
from torch.utils.data import Dataset, DataLoader

# Define a custom Dataset class for sentiment classification
class Sentiment(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=160):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        # Tokenize and encode the text using the provided tokenizer
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        # Return a dictionary containing the tokenized data and the label
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }

# Use the integer-encoded labels
train_labels = encoded_labels_train
val_labels = encoded_labels_valid

# Create Sentiment instances for the training and validation data
train_data = Sentiment(train_texts_resampled, train_labels_resampled, tokenizer, max_len=160)
val_data = Sentiment(val_texts, val_labels, tokenizer, max_len=160)

# Create DataLoader instances for the training and validation data
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)
```
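As an alternative to duplicating rows with RandomOverSampler, I also considered a WeightedRandomSampler on the DataLoader, which rebalances each batch on the fly. A sketch of what I mean (untested; in this variant `train_data` would be built from the un-resampled training split):

```python
import numpy as np
import torch
from torch.utils.data import DataLoader, WeightedRandomSampler

# Per-example weights inversely proportional to class frequency
class_counts = np.bincount(train_labels)           # assumes integer-encoded labels
example_weights = 1.0 / class_counts[train_labels]

sampler = WeightedRandomSampler(
    weights=torch.as_tensor(example_weights, dtype=torch.double),
    num_samples=len(train_labels),
    replacement=True,
)
# Note: shuffle must be omitted when a sampler is given
train_loader = DataLoader(train_data, batch_size=32, sampler=sampler)
```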
```python
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Initialize the AdamW optimizer with the model's parameters
optimizer = AdamW(model.parameters(), lr=1e-5)

# Loop over training epochs (only 1 epoch here, since a single epoch already takes a long time)
for epoch in range(1):
    model.train()  # Set the model to training mode
    train_iterator = tqdm(train_loader, desc=f'Epoch {epoch + 1}', leave=False)
    for batch in train_iterator:
        optimizer.zero_grad()  # Clear gradients
        input_ids = batch['input_ids'].to(device)  # Move inputs to the device
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)  # Forward pass
        loss = outputs[0]  # Get the loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update model parameters using gradients
        train_iterator.set_postfix({'Loss': loss.item()})
    print('Training completed for epoch', epoch + 1)

    # Validation loop: evaluate after each epoch
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    val_accuracy = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validation', leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            val_loss += loss.item()
            # Per-batch accuracy
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            accuracy = accuracy_score(labels.cpu(), predictions.cpu())
            val_accuracy += accuracy

    val_loss /= len(val_loader)      # Average validation loss
    val_accuracy /= len(val_loader)  # Average validation accuracy (mean of per-batch accuracies)
    print(f'Validation Loss: {val_loss} and Validation Accuracy: {val_accuracy}')
```
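Since accuracy is inflated by the Positive class, I'm also thinking of tracking macro F1 on the validation set instead. A sketch (collects all predictions, then reports per-class metrics):

```python
import torch
from sklearn.metrics import classification_report, f1_score

all_preds, all_labels = [], []
model.eval()
with torch.no_grad():
    for batch in val_loader:
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch['labels'].tolist())

# Macro F1 treats all three classes equally, unlike plain accuracy
print('Macro F1:', f1_score(all_labels, all_preds, average='macro'))
print(classification_report(all_labels, all_preds))
```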