Why doesn't my model learn anything?

This is my code:

import torch
import torch.nn as nn
from torch.optim import AdamW  # or transformers' AdamW, the call below works with either
from torch.utils.data import Dataset
from transformers import (AutoModel, Trainer, TrainingArguments,
                          get_linear_schedule_with_warmup)


class MultilabelTrainer(Trainer):

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        # class_weights here is the global tensor defined further down
        criterion = nn.NLLLoss(weight=class_weights)
        loss = criterion(outputs, labels)
        return (loss, outputs) if return_outputs else loss

class BertArticleClassifier(nn.Module):
    def __init__(self, n_classes, freeze_bert_weights=False):
        super(BertArticleClassifier, self).__init__()

        self.bert = AutoModel.from_pretrained('bert-base-uncased')

        if freeze_bert_weights:
            for param in self.bert.parameters():
                param.requires_grad = False

        self.dropout = nn.Dropout(0.1)
        self.fc_1 = nn.Linear(768, 256)
        self.leaky_relu = nn.LeakyReLU()
        self.fc_out = nn.Linear(256, n_classes)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids, attention_mask)
        pooled = self.dropout(output['pooler_output'])
        hidden = self.leaky_relu(self.fc_1(pooled))
        return self.log_softmax(self.fc_out(hidden))

class ArticleDataset(Dataset):

    def __init__(self, input_ids, attention_mask, labels):
        super(ArticleDataset, self).__init__()

        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {'input_ids': self.input_ids[idx],
                'attention_mask': self.attention_mask[idx],
                'labels': self.labels[idx]}


train_dataset = ArticleDataset(encoded_data_train['input_ids'], encoded_data_train['attention_mask'], torch.tensor(df_train['label']))
val_dataset = ArticleDataset(encoded_data_val['input_ids'], encoded_data_val['attention_mask'], torch.tensor(df_valid['label']))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertArticleClassifier(n_classes=len(label_dict), freeze_bert_weights=False)
optimizer = AdamW(model.parameters(),
                  lr=1e-4,
                  eps=1e-6)

class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
criterion = nn.NLLLoss(weight=class_weights)

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=100 * EPOCHS)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    num_train_epochs=4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=1,
)

trainer = MultilabelTrainer(model=model, 
                            args=training_args,  
                            train_dataset=train_dataset,  
                            eval_dataset=val_dataset,
                            compute_metrics=compute_metrics,
                            optimizers=[optimizer, scheduler])

trainer.train()

encoded_data_train and encoded_data_val are created using BertTokenizer.
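Something along these lines (the 'text' column name and max_length here are just illustrative, not copied from my actual preprocessing):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 'text' and max_length=256 are placeholders for the real column/setting
encoded_data_train = tokenizer(list(df_train['text']),
                               padding=True,
                               truncation=True,
                               max_length=256,
                               return_tensors='pt')
encoded_data_val = tokenizer(list(df_valid['text']),
                             padding=True,
                             truncation=True,
                             max_length=256,
                             return_tensors='pt')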
The problem is that the model was learning fine when I was not using the Trainer class, but I needed to tweak the loss function to accept class weights (my dataset is a bit imbalanced), so I decided to just override the loss function in the Trainer class.
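A simplified sketch of that earlier, pre-Trainer loop, reusing the same model/optimizer/scheduler setup as above (the sklearn weight computation and the batch size are illustrative, details in my actual script differ slightly):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader

# Per-class weights from the label distribution (sklearn shown here as one
# way to get them; the exact weighting scheme isn't the point)
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(df_train['label']),
                                     y=df_train['label'])
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
criterion = nn.NLLLoss(weight=class_weights)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
model.to(device)

for epoch in range(EPOCHS):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        log_probs = model(input_ids, attention_mask)  # model outputs log-softmax
        loss = criterion(log_probs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()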
Now I get no learning at all, whether I use my own Trainer subclass or the default Trainer, and after every epoch my model's loss is always ~3.3.
If there is a way to use class weights without needing the Trainer, I would be happy to know it. In any case, any advice or help would be appreciated.
