Binary model either predicts all 0s or all 1s

My model ran without errors before, but it predicted only the most frequent class (0) for every instance of my binary classification task. So I followed this advice and added a custom loss function - but now it predicts only the other class (1).

It doesn’t matter how I adjust the weights, it’s either all 0s or all 1s.

After adding the CustomTrainer I got the following warning at the start of the run, even though I pass ignore_mismatched_sizes=True in the model call, so I don’t know whether it matters:

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ishan/distilbert-base-uncased-mnli and are newly initialized because the shapes did not match:

  • classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
  • classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
    You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

I’m not even sure the CustomTrainer takes the proper model and data.

The code that predicts only 1s for my dev set looks roughly like this:

from transformers import DefaultDataCollator, EarlyStoppingCallback, IntervalStrategy, AutoTokenizer, \
    AutoModelForSequenceClassification, TrainingArguments, Trainer
from models.general import format_dataset
from db_connector.db_access import get_all
import torch
from torch import nn
import evaluate
from import Dataset
from torch.nn import functional as F

# Training length and the pretrained checkpoint that gets fine-tuned.
num_train_epochs = 3
model_name = "ishan/distilbert-base-uncased-mnli"

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence; each
            value is indexed with the example index in ``__getitem__``.
        labels: Indexable, sized collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the target under ``'label'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # NOTE(review): the key is 'label', not 'labels'; this relies on the data
        # collator renaming it before the batch reaches the model -- confirm.
        item['label'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

class CustomTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy over the model's logits."""

    # Per-class loss weights, indexed by label id.
    # NOTE(review): a 100x weight on class 1 can by itself push the model to
    # predict class 1 everywhere; weights close to the inverse class frequency
    # are the usual starting point -- confirm against the label distribution.
    class_weights = (1.0, 100.0)

    def compute_loss(self, some_model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            some_model: Model to run the forward pass with.
            inputs: Batch mapping passed to the model; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Accepted and ignored so that callers passing additional
                keyword arguments do not raise ``TypeError``.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.get("labels")
        outputs = some_model(**inputs)
        logits = outputs.get("logits")
        # Build the weights on the logits' device and dtype instead of calling
        # .cuda() unconditionally, which fails on machines without a GPU.
        weight = torch.tensor(self.class_weights, dtype=logits.dtype, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Load both user splits from the database and separate labels from tweets.
train_users = get_all("train_users")
dev_users = get_all("dev_users")
train_label_list, train_tweet_list = format_dataset(train_users)
dev_label_list, dev_tweet_list = format_dataset(dev_users)

# Each split is tokenized with truncation and padding, then wrapped in a dataset.
_tokenize_options = {"truncation": True, "padding": True}

train_encodings = tokenizer(train_tweet_list, **_tokenize_options)
train_dataset = CustomDataset(train_encodings, train_label_list)

dev_encodings = tokenizer(dev_tweet_list, **_tokenize_options)
dev_dataset = CustomDataset(dev_encodings, dev_label_list)

# The checkpoint ships a 3-way classifier head; asking for num_labels=2 creates
# a new 2-way head, and ignore_mismatched_sizes=True allows the checkpoint's
# differently shaped classifier weights to be skipped (hence the
# "newly initialized" warning). The new head has to be trained before use.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

data_collator = DefaultDataCollator(return_tensors="pt")

# NOTE(review): result_path and log_path are not defined in the visible code;
# they must be set before this point.
training_args = TrainingArguments(
    output_dir=result_path,  # output directory
    evaluation_strategy=IntervalStrategy.EPOCH,  # evaluate once per epoch
    save_strategy=IntervalStrategy.EPOCH,  # must match the evaluation strategy for load_best_model_at_end
    num_train_epochs=num_train_epochs,  # total number of training epochs (default: 3)
    per_device_train_batch_size=16,  # batch size per device during training (default: 8)
    per_device_eval_batch_size=64,  # batch size for evaluation
    logging_dir=log_path,  # directory for storing logs
    save_total_limit=1,  # limit on the number of checkpoints kept on disk
    load_best_model_at_end=True,  # load best model at the end of training
)

trainer = CustomTrainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=dev_dataset,  # evaluation dataset
    data_collator=data_collator,  # collator built above
)

# Fine-tune before predicting: the classifier head was freshly initialised, so
# predictions made without training are not meaningful.
trainer.train()


# Evaluation
# Bundle the four classification metrics so a single compute() call reports them all.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

# Run the dev set through the trainer; the result carries predictions, label_ids and metrics.
dev_pred = trainer.predict(dev_dataset)
torch_logits = torch.from_numpy(dev_pred.predictions)  # numpy ndarray -> torch tensor

# Softmax over the class axis; the ndarray is a view of the same tensor data.
torch_probs = F.softmax(torch_logits, dim=-1)
probabilities_scores = torch_probs.numpy()

# Highest probability per example and the index of the class it belongs to.
max_probabilities, max_indices = torch_probs.max(dim=1)
predicted_labels = dev_pred.predictions.argmax(axis=-1)  # numpy ndarray of class ids

# Compare the set of predicted class ids with the set of dev-set labels.
_predicted_set = set(predicted_labels)
_reference_set = set(dev_dataset.labels)
unseen_labels = _predicted_set - _reference_set
unpredicted_labels = _reference_set - _predicted_set
if unseen_labels:
    print("WARNING: Labels that are not part of the training set were predicted: {}".format(unseen_labels))
if unpredicted_labels:
    print("WARNING: Some labels were never predicted: {}".format(unpredicted_labels))

dev_res = clf_metrics.compute(predictions=predicted_labels, references=dev_dataset.labels)
print("DEV RESULTS: ", dev_res)

Please help, I already spent ages on this.

1 Like

Hi @Harpiye
Can you try setting the learning_rate to a smaller value, say 1e-7,
and increasing num_train_epochs to, say, 10?
That should work out.

To do this, your training_args will be defined like -
TrainingArguments( learning_rate = 1e-7, num_train_epochs = 10 ....)