The model originally uses GloVe and Kazuma character embeddings:
import json
import os

from tqdm import tqdm
from embeddings import GloveEmbedding, KazumaCharEmbedding

# one row per vocab word: the GloVe vector concatenated with the Kazuma character vector
embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
E = []
for w in tqdm(vocab._index2word):
    e = []
    for emb in embeddings:
        e += emb.emb(w, default='zero')
    E.append(e)
with open(os.path.join(dann, 'emb.json'), 'wt') as f:
    json.dump(E, f)
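emb.json is then loaded and copied into the model's fixed embedding layer via model.load_emb(Eword). A minimal sketch of what I understand that step to do (my reading of the GLAD code, not the exact implementation):

import json
import os

import torch

# load the precomputed matrix (one row per vocab word)
with open(os.path.join(dann, 'emb.json'), 'rt') as f:
    Eword = json.load(f)

def load_emb(self, Eword):
    # copy the pretrained vectors into the fixed embedding table in place
    self.emb_fixed.weight.data.copy_(torch.tensor(Eword, dtype=torch.float))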
import torch

def pad(seqs, emb, device, pad=0):
    # right-pad every index sequence to the batch max length, then embed
    lens = [len(s) for s in seqs]
    max_len = max(lens)
    padded = torch.LongTensor([s + (max_len - l) * [pad] for s, l in zip(seqs, lens)])
    return emb(padded.to(device)), lens
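A quick toy check of pad() (the small nn.Embedding below is just a stand-in for self.emb_fixed):

import torch.nn as nn

toy_emb = nn.Embedding(10, 4)                       # stand-in for the fixed embedding
embedded, lens = pad([[1, 2, 3], [4, 5]], toy_emb, device='cpu', pad=0)
print(embedded.shape, lens)                         # torch.Size([2, 3, 4]) [3, 2]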
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        # frozen GloVe+Kazuma lookup table of size len(vocab) x demb
        self.emb_fixed = FixedEmbedding(len(vocab), args.demb, dropout=args.dropout.get('emb', 0.2))

    def forward(self, batch):
        eos = self.vocab.word2index('<eos>')
        utterance, utterance_len = pad([e.num['transcript'] for e in batch], self.emb_fixed, self.device, pad=eos)
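For reference, as far as I understand it, FixedEmbedding in GLAD is essentially nn.Embedding with the lookup detached from the graph plus dropout, so the GloVe/Kazuma vectors stay frozen during training. A sketch of my understanding (not the exact source):

import torch.nn.functional as F
from torch import nn

class FixedEmbedding(nn.Embedding):
    # normal lookup, detached so the pretrained vectors are never updated,
    # with dropout applied to the looked-up vectors
    def __init__(self, *args, dropout=0, **kwargs):
        super().__init__(*args, **kwargs)
        self.dropout = dropout

    def forward(self, *args, **kwargs):
        out = super().forward(*args, **kwargs)
        out.detach_()
        return F.dropout(out, self.dropout, self.training)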
Now I want to switch to BERT.

Loading the model:
dataset, ontology, vocab, Eword = load_dataset(args)  # load dataset
tokenizer = BertTokenizer.from_pretrained(args.model_path, do_lower_case=args.lowercase,
                                          cache_dir=args.cache_path)
bert_config = BertConfig.from_pretrained(args.model_path, cache_dir=args.cache_path)
model_class = glad_model.get_model(args.model)
model = model_class.from_pretrained(args.model_path, vocab=vocab, ontology=ontology, args=args, tokenizer=tokenizer)
model.save_config()
model.load_emb(Eword)
model = model.to(model.device)
if not args.test:
    model.run_train(dataset['train'], dataset['dev'], args)
Model class:
def __init__(self, bert_config, vocab, ontology, args, tokenizer):
    super().__init__(bert_config)
    self.bert = BertModel(bert_config)
    self.bert.eval()
    self.tokenizer = tokenizer

def bert_encoder(self, jtokens, if_show=None):
    # encode a list of tokens as a [CLS] <wordpieces> [SEP] id sequence
    doc_encoding = [self.tokenizer.convert_tokens_to_ids('[CLS]')]
    for i, token_phrase in enumerate(jtokens):
        token_encoding = self.tokenizer.encode(token_phrase, add_special_tokens=False)
        if if_show:
            print("%s %s" % (token_phrase, token_encoding))
        doc_encoding += token_encoding
    doc_encoding += [self.tokenizer.convert_tokens_to_ids('[SEP]')]
    return doc_encoding
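A small sanity check of bert_encoder (assuming a bert-base-uncased tokenizer; the exact wordpiece ids depend on the vocab):

# the encoding always starts with the [CLS] id and ends with the [SEP] id
# (101 and 102 for bert-base-uncased)
ids = model.bert_encoder(['where', 'is', 'a', 'cheap', 'restaurant'], if_show=True)
print(ids[0], ids[-1])   # -> 101 102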
def bert_pad(self, token_encode):
    # right-pad each encoded utterance with the [PAD] id to the batch max length
    PAD = self.tokenizer.convert_tokens_to_ids('[PAD]')
    te_lens = [len(te) for te in token_encode]
    max_te_len = max(te_lens)
    padded_te = [s + (max_te_len - l) * [PAD] for s, l in zip(token_encode, te_lens)]  # confirm padding succeeded
    return padded_te
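And a toy check of bert_pad (with bert-base-uncased the [PAD] id is 0; the other numbers below are made-up token ids):

enc = [[101, 11, 12, 102],
       [101, 11, 12, 13, 14, 102]]
print(model.bert_pad(enc))
# -> [[101, 11, 12, 102, 0, 0], [101, 11, 12, 13, 14, 102]]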
def data_sample(self, batch):
    utterance = [turn.to_dict()['transcript'] for turn in batch]
    # encode
    utt_encode = [self.bert_encoder(utt) for utt in utterance]
    # pad
    utt_padded = torch.LongTensor(self.bert_pad(utt_encode)).to(self.device)
    # calculate the lengths
    utt_lens = [len(ue) for ue in utt_encode]
    return utt_padded, utt_lens
Once I have utt_padded, I pass it to forward and run it through self.bert (BertModel) to get a 768-dimensional vector per token of the utterance. The utterance representation then goes into an LSTM for the final prediction.
utt_bert = self.bert(input_ids=utterance)
utterance, pool_utt = utt_bert[0], utt_bert[1]  # sequence output [batch, max_len, 768] and pooled [CLS] output
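Schematically, those outputs then feed the LSTM like this (a minimal sketch, not my exact forward; the hidden size 200 is just a placeholder):

import torch.nn as nn

# `utterance` here is the sequence output from self.bert: [batch, max_len, 768]
lstm = nn.LSTM(input_size=768, hidden_size=200, batch_first=True, bidirectional=True)
lstm_out, (h_n, c_n) = lstm(utterance)    # lstm_out: [batch, max_len, 2*200]
# lstm_out / h_n go to the downstream scoring layers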
With GloVe the performance reaches 85, but after switching to BERT it only reaches 29 (the learning rate was 1e-3 before and is 1e-5 now).
(When I use BERT with a learning rate of 1e-3, the embeddings all collapse to the same values.)
I wonder whether I am missing some step in using BERT, or whether my approach is wrong.
Or is my model architecture too complicated, because I add an LSTM after BERT?
Thanks a lot if someone could provide advice.