Performance drop after switching to BERT

The model originally used GloVe and Kazuma character embeddings:

# Build the fixed embedding matrix: for each vocab word, concatenate the
# GloVe vector and the Kazuma character embedding, then dump to JSON.
embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
E = []
for w in tqdm(vocab._index2word):
    e = []
    for emb in embeddings:
        e += emb.emb(w, default='zero')
    # BUG in original: the per-word vector was never appended, so E stayed
    # empty and emb.json contained no embeddings.
    E.append(e)
with open(os.path.join(dann, 'emb.json'), 'wt') as f:
    json.dump(E, f)

def pad(seqs, emb, device, pad=0):
    """Right-pad variable-length index sequences and embed the batch.

    Args:
        seqs: list of lists of token indices (ragged).
        emb: embedding module/callable applied to the padded LongTensor.
        device: torch device the padded tensor is moved to before embedding.
        pad: padding index appended to short sequences (default 0).

    Returns:
        tuple: (embedded padded batch, list of original sequence lengths).
    """
    lens = [len(s) for s in seqs]
    max_len = max(lens)
    padded = torch.LongTensor([s + (max_len - l) * [pad] for s, l in zip(seqs, lens)])
    # BUG in original: `return emb(, lens` was a syntax error; the intended
    # call embeds the padded batch after moving it to the target device.
    return emb(padded.to(device)), lens

class model:
    """Sketch of the original GloVe-based model (reconstructed excerpt).

    NOTE(review): the pasted snippet had syntax errors (missing colons,
    `forward` missing `self` and its `batch` argument); repaired minimally.
    `FixedEmbedding`, `vocab`, and `args` come from the surrounding project.
    """

    def __init__(self):
        # Frozen (non-trainable) embedding layer sized to the vocabulary,
        # with dropout taken from args (default 0.2).
        self.emb_fixed = FixedEmbedding(len(vocab), args.demb, dropout=args.dropout.get('emb', 0.2))

    def forward(self, batch):
        # Pad each transcript with the <eos> index and embed the batch.
        eos = self.vocab.word2index('<eos>')
        utterance, utterance_len = pad([e.num['transcript'] for e in batch], self.emb_fixed, self.device, pad=eos)

Now I want to use BERT instead.

Loading the model:

    dataset, ontology, vocab, Eword = load_dataset(args)  # load dataset
    # NOTE(review): the original tokenizer call was left unclosed; presumably
    # it mirrored the config call's cache_dir argument - confirm.
    tokenizer = BertTokenizer.from_pretrained(args.model_path, do_lower_case=args.lowercase,
                                              cache_dir=args.cache_path)

    bert_config = BertConfig.from_pretrained(args.model_path, cache_dir=args.cache_path)
    model_class = glad_model.get_model(args.model)
    # from_pretrained loads the checkpoint weights and forwards the extra
    # keyword arguments to the model's __init__.
    model = model_class.from_pretrained(args.model_path, vocab=vocab, ontology=ontology, args=args, tokenizer=tokenizer)
    # NOTE(review): the original line was truncated ("model ="); most likely
    # it moved the model to the training device - confirm against the repo.
    model = model.to(args.device)

    if not args.test:
        model.run_train(dataset['train'], dataset['dev'], args)

The model class:

    def __init__(self, bert_config, vocab, ontology, args, tokenizer):
        """Build the BERT-based model.

        NOTE(review): since loading goes through `model_class.from_pretrained`,
        this class presumably subclasses BertPreTrainedModel; that requires
        calling `super().__init__(bert_config)`. Without it, `from_pretrained`
        cannot load the pretrained weights into `self.bert`, which would leave
        BERT randomly initialized - by itself enough to explain an 85 -> 29
        score drop. Confirm the base class in the full file.
        """
        super().__init__(bert_config)
        self.bert = BertModel(bert_config)  # weights filled in by from_pretrained
        self.tokenizer = tokenizer
        # Keep the extra constructor arguments available to other methods.
        self.vocab = vocab
        self.ontology = ontology
        self.args = args

    def bert_encoder(self, jtokens, if_show=None):
        """Encode token phrases as BERT wordpiece ids: [CLS] ids... [SEP].

        Args:
            jtokens: iterable of token strings/phrases; each phrase may split
                into several wordpieces.
            if_show: when truthy, print each phrase with its wordpiece ids
                (debug aid).

        Returns:
            list[int]: [CLS] id + concatenated wordpiece ids + [SEP] id.
        """
        doc_encoding = [self.tokenizer.convert_tokens_to_ids('[CLS]')]
        for token_phrase in jtokens:  # enumerate index in original was unused
            token_encoding = self.tokenizer.encode(token_phrase, add_special_tokens=False)
            if if_show:
                print("%s %s"%(token_phrase, token_encoding))
            doc_encoding += token_encoding
        doc_encoding += [self.tokenizer.convert_tokens_to_ids('[SEP]')]
        return doc_encoding
     def bert_pad(self, token_encode):
        PAD = self.tokenizer.convert_tokens_to_ids('[PAD]')
        te_lens = [len(te) for te in token_encode]
        max_te_len = max(te_lens)
        padded_te = [s + (max_te_len - l) * [PAD] for s, l in zip(token_encode, te_lens)]  # confirm pad sucessfully
        return padded_te
    def data_sample(self, batch):
        """Convert a batch of dialogue turns into padded BERT input ids.

        Returns:
            tuple: (LongTensor of ids on self.device, list of true lengths
            before padding).
        """
        # Pull the raw transcript tokens out of each turn.
        transcripts = [turn.to_dict()['transcript'] for turn in batch]
        # Encode every transcript as [CLS] ... [SEP] wordpiece ids.
        encoded = [self.bert_encoder(t) for t in transcripts]
        # True lengths are taken before padding.
        lengths = [len(seq) for seq in encoded]
        # Pad to the batch maximum and move onto the model's device.
        batch_tensor = torch.LongTensor(self.bert_pad(encoded)).to(self.device)
        return batch_tensor, lengths

When I get `utt_padded`, I pass it to `forward` and use BERT to get a 768-dimensional vector per token of the utterance. Then I feed the utterance representation into an LSTM for the final prediction.

        # NOTE(review): BertModel returns a tuple here; presumably index 0 is
        # the per-token hidden states (batch, seq_len, 768) and index 1 the
        # pooled [CLS] vector - confirm against the transformers version used.
        utt_bert = self.bert(input_ids=utterance)
        utterance, pool_utt = utt_bert[0], utt_bert[1]

When I use GloVe, the performance reaches 85, but after changing to BERT the performance only reaches 29 (the former learning rate is 1e-3 and the latter is 1e-5).
(When using BERT, if the learning rate is 1e-3, the embeddings all collapse to the same value.)

I wonder if I missed some steps in using BERT, or if my method is wrong.
Or is my model architecture too complicated, because I add an LSTM after BERT?

Thanks a lot if someone could provide advice.