Why can't my simple BERT model for text classification learn anything?

Hello, I am trying transformers.BertModel for a simple text classification task, but the result puzzles me.
The code is simple; I implemented the model with PyTorch.
Here it is…

import copy

import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import DataLoader, Dataset, random_split
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

# a Dataset class for BertModel
class BertDataset(Dataset):
    def __init__(self, train_file, tokenizer):
        super(BertDataset, self).__init__()
        self.train_file = train_file
        self.data = []
        self.label2id = {}
        self.id2label = {}
        self.tokenizer = tokenizer
        self.init()

    def init(self):
        with open(self.train_file, 'r', encoding='utf-8') as f:
            for line in f:
                blocks = line.strip().split('\t')
                if blocks[1] not in self.label2id:
                    self.label2id[blocks[1]] = len(self.label2id)
                    self.id2label[len(self.id2label)] = blocks[1]
                self.data.append({'token': self.tokenizer(blocks[0], add_special_tokens=True, max_length=100,
                                                          padding='max_length', return_tensors='pt',
                                                          truncation=True),
                                  'label': self.label2id[blocks[1]]})

    def __getitem__(self, item):
        return self.data[item]

    def __len__(self):
        return len(self.data)
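
Each line of data.txt is a tab-separated text/label pair (that is what the parsing above expects), so the dataset is built like this:

tokenizer = BertTokenizer.from_pretrained(pretrained_path)
dataset = BertDataset('../data/dataset/data.txt', tokenizer)
print(len(dataset), dataset.label2id)  # number of samples and the label -> id mapping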

# a collate function for torch.utils.data.DataLoader
def bert_collate_fn(batch_data):
    input_ids, token_type_ids, attention_mask, labels = [], [], [], []
    for instance in copy.deepcopy(batch_data):
        input_ids.append(instance['token']['input_ids'][0].squeeze(0))
        token_type_ids.append(instance['token']['token_type_ids'][0].squeeze(0))
        attention_mask.append(instance['token']['attention_mask'][0].squeeze(0))
        labels.append(instance['label'])
    return torch.stack(input_ids), torch.stack(token_type_ids), \
           torch.stack(attention_mask), torch.tensor(labels)
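
With max_length=100, every batch coming out of this collate function should be a set of (batch_size, 100) tensors plus a label vector; a quick check:

loader = DataLoader(dataset, batch_size=4, collate_fn=bert_collate_fn)
input_ids, token_type_ids, attention_mask, labels = next(iter(loader))
print(input_ids.shape, attention_mask.shape, labels.shape)
# expected: torch.Size([4, 100]) torch.Size([4, 100]) torch.Size([4])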

# Model
class PTModel(nn.Module):
    def __init__(self, model, n_class):
        super(PTModel, self).__init__()
        self.n_class = n_class
        self.model = model
        self.linear = nn.Linear(768, self.n_class)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        cls_emb = self.model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
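        # take the [CLS] token embedding ([:, 0, :]) from the last hidden state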
        cls_emb = cls_emb[0][:, 0, :].squeeze(1)
        logits = self.linear(cls_emb)
        # logits = self.softmax(logits)
        return logits
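
A single forward pass then gives raw logits of shape (batch_size, n_class), which is what CrossEntropyLoss expects (the softmax is commented out, so these are raw logits):

bert_model = BertModel.from_pretrained(pretrained_path)
model = PTModel(model=bert_model, n_class=len(dataset.label2id))
logits = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
print(logits.shape)  # (4, n_class) raw logits for CrossEntropyLoss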

# train code 
def train1():
    # data
    batch_size = 16
    tokenizer = BertTokenizer.from_pretrained(pretrained_path)
    dataset = BertDataset('../data/dataset/data.txt', tokenizer)
    train_len = int(len(dataset)*0.8)
    train_dataset, dev_dataset = random_split(dataset=dataset, lengths=[train_len, len(dataset)-train_len])
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=bert_collate_fn)
    dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True, collate_fn=bert_collate_fn)
    # model
    device = torch.device('cuda:{}'.format(args.cuda))
    bert_model = BertModel.from_pretrained(pretrained_path)
    model = PTModel(model=bert_model, n_class=len(dataset.label2id)).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[30, 40], gamma=0.1)
    loss_func = torch.nn.CrossEntropyLoss()
    # train
    for i in range(args.epoch):
        model.train()
        train_loss, dev_loss, f1_train, f1_dev = [], [], [], []
        dev_pred_list, dev_gold_list = [], []
        for input_ids, token_type_ids, attention_mask, label in tqdm(train_dataloader):
            input_ids, token_type_ids, attention_mask, label = input_ids.to(device), token_type_ids.to(device), \
                                                               attention_mask.to(device), label.to(device),
            outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
            array_outputs = np.array(outputs.cuda().data.cpu())
            optimizer.zero_grad()
            loss = loss_func(outputs, label)
            results = outputs.cuda().data.cpu().argmax(dim=1)
            score = f1_score(label.cuda().data.cpu(), results, average='micro')
            train_loss.append(loss.item())
            f1_train.append(score)
            # optim
            loss.backward()
            optimizer.step()
        scheduler.step()
        print('epoch {}'.format(i))
        print('train_loss:{}'.format(np.mean(train_loss)))
        print('train_f1:{}'.format(np.mean(f1_train)))

The training log is below (only the first epochs are shown), and the result is already clear: the model cannot learn anything!
PS: the learning rate was 1e-3.

100%|█████████████████████████████████████████| 250/250 [00:43<00:00,  5.72it/s]
epoch 0
train_loss:4.217772917747498
train_f1:0.081
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.52it/s]
dev_f1:0.08928571428571429
dev_loss:4.111690880760314
100%|█████████████████████████████████████████| 250/250 [00:43<00:00,  5.71it/s]
epoch 1
train_loss:4.094675525665283
train_f1:0.084
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.16it/s]
dev_f1:0.0882936507936508
dev_loss:4.1316274839734275
100%|█████████████████████████████████████████| 250/250 [00:43<00:00,  5.71it/s]
epoch 2
train_loss:4.084259546279907
train_f1:0.08525
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.37it/s]
dev_f1:0.08928571428571429
dev_loss:4.108004717599778
100%|█████████████████████████████████████████| 250/250 [00:44<00:00,  5.62it/s]
epoch 3
train_loss:4.0770455904006955
train_f1:0.09425
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.07it/s]
dev_f1:0.08928571428571429
dev_loss:4.1077501395392035
100%|█████████████████████████████████████████| 250/250 [00:45<00:00,  5.54it/s]
epoch 4
train_loss:4.070150758743286
train_f1:0.086
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.41it/s]
dev_f1:0.09027777777777778
dev_loss:4.103204295748756
100%|█████████████████████████████████████████| 250/250 [00:45<00:00,  5.52it/s]
epoch 5
train_loss:4.064209712982178
train_f1:0.0895
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.31it/s]
dev_f1:0.08928571428571429
dev_loss:4.117827377622089
100%|█████████████████████████████████████████| 250/250 [00:43<00:00,  5.70it/s]
epoch 6
train_loss:4.065111406326294
train_f1:0.08425
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.34it/s]
dev_f1:0.0882936507936508
dev_loss:4.099656305615864
100%|█████████████████████████████████████████| 250/250 [00:44<00:00,  5.58it/s]
epoch 7
train_loss:4.0547873935699466
train_f1:0.09175
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.30it/s]
dev_f1:0.08928571428571429
dev_loss:4.105985126798115
100%|█████████████████████████████████████████| 250/250 [00:43<00:00,  5.76it/s]
epoch 8
train_loss:4.0595885887145995
train_f1:0.08875
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.26it/s]
dev_f1:0.09027777777777778
dev_loss:4.121003010916332
100%|█████████████████████████████████████████| 250/250 [00:45<00:00,  5.46it/s]
epoch 9
train_loss:4.054850312232971
train_f1:0.08825
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 18.86it/s]
dev_f1:0.08928571428571429
dev_loss:4.12501887669639
100%|█████████████████████████████████████████| 250/250 [00:45<00:00,  5.46it/s]
epoch 10
train_loss:4.0566882238388065
train_f1:0.08525
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 18.85it/s]
dev_f1:0.09126984126984126
dev_loss:4.103033436669244

Before this BertModel, I had tried an LSTM on the same data, and the LSTM worked well: the dev F1 reached 0.96.

# LSTM
class SimpleModel(nn.Module):
    def __init__(self, **kwargs):
        super(SimpleModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(kwargs['pretrained_embedding'], freeze=False)
        self.lstm = nn.LSTM(kwargs['pretrained_embedding'].shape[1],
                            kwargs['hidden_size'],
                            batch_first=True,
                            bidirectional=True)
        self.linear = nn.Linear(kwargs['hidden_size']*2, kwargs['n_class'])

    def forward(self, inputs, lens):
        inputs = self.embedding(inputs)
        _, (h, _) = self.lstm(pack_padded_sequence(inputs, lens, batch_first=True, enforce_sorted=False))
        h = h.permute(1, 0, 2).contiguous().view(h.shape[1], -1)
        logits = self.linear(h)
        logits = logits.softmax(dim=-1)
        return logits
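
It is called with a padded batch of token ids plus the true (unpadded) sequence lengths, roughly like this (embedding_matrix, inputs, lens and hidden_size=256 are placeholders for my actual settings):

model = SimpleModel(pretrained_embedding=embedding_matrix, hidden_size=256, n_class=n_class)
probs = model(inputs, lens)  # (batch_size, n_class) class probabilities (softmax applied inside)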

Could anyone tell me why this code does not work?
Is there something wrong with my code?
I have been confused for days…
Thank you very much!

I think the problem might be that you call optimizer.zero_grad() after the outputs are calculated, which zeros out the gradients from the forward pass. Try moving that call above the line where the outputs are calculated.
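
Roughly, the reordering I mean (only the relevant lines of the training loop):

optimizer.zero_grad()  # clear old gradients before the forward pass
outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
loss = loss_func(outputs, label)
loss.backward()
optimizer.step()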