Why can't my simple BERT model for text classification learn anything?

Hello, I am trying to use transformers.BertModel for a simple text classification task, but the result puzzles me.
The code is simple; I implemented the model with PyTorch.
Here it is…

# imports used by the snippets below
import copy

import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import DataLoader, Dataset, random_split
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

# a Dataset class for BertModel
class BertDataset(Dataset):
    def __init__(self, train_file, tokenizer):
        super(BertDataset, self).__init__()
        self.train_file = train_file
        self.data = []
        self.label2id = {}
        self.id2label = {}
        self.tokenizer = tokenizer
        self.init()

    def init(self):
        with open(self.train_file, 'r', encoding='utf-8') as f:
            for line in f:
                blocks = line.strip().split('\t')
                if blocks[1] not in self.label2id:
                    self.label2id[blocks[1]] = len(self.label2id)
                    self.id2label[len(self.id2label)] = blocks[1]
                self.data.append({'token': self.tokenizer(blocks[0], add_special_tokens=True, max_length=100,
                                                          padding='max_length', return_tensors='pt',
                                                          truncation=True),
                                  'label': self.label2id[blocks[1]]})

    def __getitem__(self, item):
        return self.data[item]

    def __len__(self):
        return len(self.data)
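
For reference, init() expects data.txt to contain one tab-separated "text<TAB>label" pair per line. A minimal usage sketch (the checkpoint name is a placeholder, not taken from the original post):

# hypothetical usage sketch: the checkpoint name/path is a placeholder
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = BertDataset('../data/dataset/data.txt', tokenizer)   # lines like "some text\tsome_label"
print(len(dataset), len(dataset.label2id))                     # number of examples and of distinct labels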

# a collate function for torch.utils.data.DataLoader
def bert_collate_fn(batch_data):
    input_ids, token_type_ids, attention_mask, labels = [], [], [], []
    for instance in copy.deepcopy(batch_data):
        # each tokenizer output has shape (1, max_length); keep the single row
        input_ids.append(instance['token']['input_ids'][0].squeeze(0))
        token_type_ids.append(instance['token']['token_type_ids'][0].squeeze(0))
        attention_mask.append(instance['token']['attention_mask'][0].squeeze(0))
        labels.append(instance['label'])
    # stack into (batch_size, max_length) tensors and a (batch_size,) label tensor
    return torch.stack(input_ids), torch.stack(token_type_ids), \
           torch.stack(attention_mask), torch.tensor(labels)
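
As a quick sanity check (a hypothetical sketch, assuming the dataset and collate function above), one collated batch should come out as (batch_size, max_length) tensors:

# hypothetical sanity check of the collate function
loader = DataLoader(dataset, batch_size=4, collate_fn=bert_collate_fn)
input_ids, token_type_ids, attention_mask, labels = next(iter(loader))
print(input_ids.shape)   # expected: torch.Size([4, 100]) because max_length=100
print(labels.shape)      # expected: torch.Size([4])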

# Model
class PTModel(nn.Module):
    def __init__(self, model, n_class):
        super(PTModel, self).__init__()
        self.n_class = n_class
        self.model = model
        self.linear = nn.Linear(768, self.n_class)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        outputs = self.model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        # take the last hidden state of the [CLS] token: (batch_size, 768)
        cls_emb = outputs[0][:, 0, :]
        logits = self.linear(cls_emb)
        # no softmax here: nn.CrossEntropyLoss expects raw logits
        # logits = self.softmax(logits)
        return logits
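
Leaving the softmax out of forward() is correct when training with nn.CrossEntropyLoss, since that loss expects raw logits; class probabilities can still be taken at inference time. A minimal, hypothetical sketch:

# hypothetical inference sketch: probabilities are only computed at prediction time
model.eval()
with torch.no_grad():
    logits = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    probs = logits.softmax(dim=-1)   # class probabilities
    preds = probs.argmax(dim=-1)     # predicted label ids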

# train code 
def train1():
    # data
    batch_size = 16
    tokenizer = BertTokenizer.from_pretrained(pretrained_path)
    dataset = BertDataset('../data/dataset/data.txt', tokenizer)
    train_len = int(len(dataset)*0.8)
    train_dataset, dev_dataset = random_split(dataset=dataset, lengths=[train_len, len(dataset)-train_len])
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=bert_collate_fn)
    dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True, collate_fn=bert_collate_fn)
    # model
    device = torch.device('cuda:{}'.format(args.cuda))
    bert_model = BertModel.from_pretrained(pretrained_path)
    model = PTModel(model=bert_model, n_class=len(dataset.label2id)).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[30, 40], gamma=0.1)
    loss_func = torch.nn.CrossEntropyLoss()
    # train
    for i in range(args.epoch):
        model.train()
        train_loss, dev_loss, f1_train, f1_dev = [], [], [], []
        dev_pred_list, dev_gold_list = [], []
        for input_ids, token_type_ids, attention_mask, label in tqdm(train_dataloader):
            input_ids, token_type_ids, attention_mask, label = input_ids.to(device), token_type_ids.to(device), \
                                                               attention_mask.to(device), label.to(device)
            outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
            array_outputs = np.array(outputs.detach().cpu())   # unused
            optimizer.zero_grad()
            loss = loss_func(outputs, label)
            # move logits/labels to CPU for metric computation
            results = outputs.detach().cpu().argmax(dim=1)
            score = f1_score(label.detach().cpu(), results, average='micro')
            train_loss.append(loss.item())
            f1_train.append(score)
            # optim
            loss.backward()
            optimizer.step()
        scheduler.step()
        print('epoch {}'.format(i))
        print('train_loss:{}'.format(np.mean(train_loss)))
        print('train_f1:{}'.format(np.mean(f1_train)))

The training log is below (only 10 epochs shown), and the result is already clear: the model is not learning anything!
PS: the learning rate was 1e-3.

100%|█████████████████████████████████████████| 250/250 [00:43<00:00,  5.72it/s]
epoch 0
train_loss:4.217772917747498
train_f1:0.081
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.52it/s]
dev_f1:0.08928571428571429
dev_loss:4.111690880760314
100%|█████████████████████████████████████████| 250/250 [00:43<00:00,  5.71it/s]
epoch 1
train_loss:4.094675525665283
train_f1:0.084
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.16it/s]
dev_f1:0.0882936507936508
dev_loss:4.1316274839734275
100%|█████████████████████████████████████████| 250/250 [00:43<00:00,  5.71it/s]
epoch 2
train_loss:4.084259546279907
train_f1:0.08525
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.37it/s]
dev_f1:0.08928571428571429
dev_loss:4.108004717599778
100%|█████████████████████████████████████████| 250/250 [00:44<00:00,  5.62it/s]
epoch 3
train_loss:4.0770455904006955
train_f1:0.09425
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.07it/s]
dev_f1:0.08928571428571429
dev_loss:4.1077501395392035
100%|█████████████████████████████████████████| 250/250 [00:45<00:00,  5.54it/s]
epoch 4
train_loss:4.070150758743286
train_f1:0.086
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.41it/s]
dev_f1:0.09027777777777778
dev_loss:4.103204295748756
100%|█████████████████████████████████████████| 250/250 [00:45<00:00,  5.52it/s]
epoch 5
train_loss:4.064209712982178
train_f1:0.0895
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.31it/s]
dev_f1:0.08928571428571429
dev_loss:4.117827377622089
100%|█████████████████████████████████████████| 250/250 [00:43<00:00,  5.70it/s]
epoch 6
train_loss:4.065111406326294
train_f1:0.08425
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.34it/s]
dev_f1:0.0882936507936508
dev_loss:4.099656305615864
100%|█████████████████████████████████████████| 250/250 [00:44<00:00,  5.58it/s]
epoch 7
train_loss:4.0547873935699466
train_f1:0.09175
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.30it/s]
dev_f1:0.08928571428571429
dev_loss:4.105985126798115
100%|█████████████████████████████████████████| 250/250 [00:43<00:00,  5.76it/s]
epoch 8
train_loss:4.0595885887145995
train_f1:0.08875
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.26it/s]
dev_f1:0.09027777777777778
dev_loss:4.121003010916332
100%|█████████████████████████████████████████| 250/250 [00:45<00:00,  5.46it/s]
epoch 9
train_loss:4.054850312232971
train_f1:0.08825
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 18.86it/s]
dev_f1:0.08928571428571429
dev_loss:4.12501887669639
100%|█████████████████████████████████████████| 250/250 [00:45<00:00,  5.46it/s]
epoch 10
train_loss:4.0566882238388065
train_f1:0.08525
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 18.85it/s]
dev_f1:0.09126984126984126
dev_loss:4.103033436669244

Before this BERT model, I tried an LSTM, and the LSTM worked well: the dev F1 reached 0.96.

# LSTM
class SimpleModel(nn.Module):
    def __init__(self, **kwargs):
        super(SimpleModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(kwargs['pretrained_embedding'], freeze=False)
        self.lstm = nn.LSTM(kwargs['pretrained_embedding'].shape[1],
                            kwargs['hidden_size'],
                            batch_first=True,
                            bidirectional=True)
        self.linear = nn.Linear(kwargs['hidden_size']*2, kwargs['n_class'])

    def forward(self, inputs, lens):
        inputs = self.embedding(inputs)
        _, (h, _) = self.lstm(pack_padded_sequence(inputs, lens, batch_first=True, enforce_sorted=False))
        h = h.permute(1, 0, 2).contiguous().view(h.shape[1], -1)
        logits = self.linear(h)
        logits = logits.softmax(dim=-1)
        return logits

Could anyone tell me why this code doesn't work?
Is there something wrong with how I wrote it?
I have been confused for days…
Thank you very much!

I think the problem might be that you call optimizer.zero_grad() after outputs are calculated, and it zeros out the gradients from the forward pass. Try putting that line before the line where outputs are calculated.
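
For clarity, the reordering suggested above would look like this inside train1() (a sketch of the relevant lines only, not a verified fix):

# sketch of the suggested ordering: zero the gradients before the forward pass
for input_ids, token_type_ids, attention_mask, label in tqdm(train_dataloader):
    input_ids, token_type_ids, attention_mask, label = input_ids.to(device), token_type_ids.to(device), \
                                                       attention_mask.to(device), label.to(device)
    optimizer.zero_grad()    # moved before the forward pass
    outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    loss = loss_func(outputs, label)
    loss.backward()
    optimizer.step()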

Hi, I am facing a similar issue, and the above suggestion of moving optimizer.zero_grad() didn't resolve it. Any help is appreciated. I have spent a couple of weeks looking into this, but I still can't see why the Hugging Face model is not learning with my PyTorch code, while it does learn when run via trainer.train().
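
For comparison, the trainer.train() path mentioned above looks roughly like the sketch below. This is a hypothetical outline, not code from this thread: the checkpoint name, output directory, and hyperparameters are placeholders, and Trainer expects each dataset item to be a dict of tensors with a 'labels' key, so the BertDataset above would need adapting before it could be passed in.

# hypothetical sketch of the Hugging Face Trainer path (placeholders throughout)
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=10)
training_args = TrainingArguments(output_dir='./out',
                                  num_train_epochs=3,
                                  per_device_train_batch_size=16)
trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_dataset, eval_dataset=dev_dataset)
trainer.train()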