Model never predicts minority class in a binary sequence classification

I am new to Hugging Face. With the help of the Trainer API, I trained and evaluated a model, but whenever I use it for prediction, it always predicts just one class. It would be helpful if anyone could help me identify the bug.

My data is such that there are two text inputs per example (a Query and a Segment), which I tokenize as a sentence pair.

Here is my code:

import torch
import collections
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import EarlyStoppingCallback, DataCollatorWithPadding, Trainer, TrainingArguments
from transformers import BertTokenizerFast, BertConfig, BertModel, BertForSequenceClassification

import os
import sys

model_name = "bert-base-uncased"
max_length = 512
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

data = pd.read_csv('train_data.csv', sep='\t')

gss = GroupShuffleSplit(train_size=.80, random_state = 2, n_splits=1)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:  # labels is a NumPy array, so a plain truthiness check is ambiguous
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

for train_idx, val_idx in gss.split(data.loc[:, data.columns != 'Label'], data['Label'], groups=data['QueryID']):
    train_ds = data.iloc[train_idx]
    val_ds = data.iloc[val_idx]

train_label = pd.factorize(train_ds.Label)[0] 
valid_label = pd.factorize(val_ds.Label)[0]

train_label = 1 - train_label  # because 0 should be relevant and 1 should be irrelevant
valid_label = 1 - valid_label
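
# Note: pd.factorize assigns codes by order of first appearance, so factorizing
# the train and validation splits independently can map the same string to
# different codes. A safer sketch (the string values below are hypothetical;
# substitute whatever actually appears in the Label column):
# label2id = {'relevant': 0, 'irrelevant': 1}
# train_label = train_ds.Label.map(label2id).to_numpy()
# valid_label = val_ds.Label.map(label2id).to_numpy()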

print("Encodings generation.")
train_encodings = tokenizer(train_ds['Query'].tolist(),train_ds['Segment'].tolist(), truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(val_ds['Query'].tolist(), val_ds['Segment'].tolist(), truncation=True, padding=True, max_length=max_length)
print("Encodings generated.")

train_dataset = Dataset(train_encodings, train_label) 
valid_dataset = Dataset(valid_encodings, valid_label)

model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2, return_dict=True).to("cuda")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='results',             # the original arguments were cut off; these are placeholder values
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
)

print("Training begins:\n")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
)
trainer.train()


## test data

test_data = pd.read_csv("test_data.csv", sep="\t")
print("Encodings generation.")
test_encodings = tokenizer(test_data['Query'].tolist(),test_data['Segment'].tolist(), truncation=True, padding=True, max_length=max_length)
print("Encodings generated.")

test_label = 1 - pd.factorize(test_data.Label)[0]  # labels are needed below for classification_report
test_dataset = Dataset(test_encodings, test_label)

# Make prediction

raw_pred, y_act, _ = trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

from sklearn.metrics import classification_report
print(classification_report(y_act, y_pred))
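# confusion_matrix was imported above; printing it makes a one-class skew obvious
print(confusion_matrix(y_act, y_pred))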

What you typically do with an imbalanced dataset in a classification problem is to use class weights in your loss function; see the documentation of CrossEntropyLoss and its weight parameter. However, I do not think that the Trainer currently allows custom loss functions out of the box. Instead, you can subclass the Trainer, in particular overriding the compute_loss method, to calculate the loss manually.
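
A minimal sketch of what the weight parameter does (the tensors and weight values here are illustrative, not tuned to your data):

import torch
from torch.nn import CrossEntropyLoss

# weight[c] scales the loss of every sample whose target is class c, so giving
# the minority class a larger weight penalises mistakes on it more heavily.
class_weights = torch.tensor([6.6, 1.0])  # e.g. up-weight class 0 if it is the rarer class
loss_fct = CrossEntropyLoss(weight=class_weights)

logits = torch.randn(8, 2)          # (batch_size, num_labels)
labels = torch.randint(0, 2, (8,))  # integer class indices, shape (batch_size,)
loss = loss_fct(logits, labels)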

Thank you for your response. I have overridden the compute_loss method as follows:

from torch.nn import CrossEntropyLoss

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        class_weights = torch.FloatTensor([1./5542, 1./36587]).cuda()
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

Is that all I need to change? Because I get ValueError: Expected input batch_size (16) to match target batch_size (8). How can I make the shapes of the logits and the labels match here?

From looking at BertForSequenceClassification, it seems that you need to pass CrossEntropyLoss the labels as integer class indices of shape (batch_size,), i.e. labels.view(-1), rather than reshaping them to (batch_size, num_labels) and casting them to float; that reshape is what triggers the batch-size mismatch.
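
A sketch of the corrected method, keeping the class weights from your snippet (whether those particular weight values are what you want is a separate question):

from torch.nn import CrossEntropyLoss

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        class_weights = torch.FloatTensor([1./5542, 1./36587]).cuda()
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = CrossEntropyLoss(weight=class_weights)
        # logits: (batch_size, num_labels); labels: integer indices, shape (batch_size,)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.view(-1))
        return (loss, outputs) if return_outputs else loss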

It has been a while since I last used weighted cross-entropy, so check the documentation to make sure that your way of using the weights is correct. It always confuses me.
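
One standalone way to sanity-check the semantics: with reduction='mean' (the default), PyTorch divides the weighted per-sample losses by the sum of the selected weights, not by the batch size. The numbers below are arbitrary.

import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(4, 2)
targets = torch.tensor([0, 1, 1, 1])
w = torch.tensor([3.0, 1.0])  # weight[c] scales samples whose target is class c

auto = F.cross_entropy(logits, targets, weight=w)

log_probs = F.log_softmax(logits, dim=-1)
per_sample = -log_probs[torch.arange(4), targets]
manual = (w[targets] * per_sample).sum() / w[targets].sum()

print(torch.allclose(auto, manual))  # True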
