Hello, I am trying to use transformers.BertModel for a simple text-classification task, but the result puzzles me.
The code is simple; I implemented the model in PyTorch. Here it is:
# imports (added for completeness; args and pretrained_path are defined elsewhere in my script)
import copy
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pack_padded_sequence
from transformers import BertModel, BertTokenizer
from sklearn.metrics import f1_score
from tqdm import tqdm

# a Dataset class for BertModel
class BertDataset(Dataset):
    def __init__(self, train_file, tokenizer):
        super(BertDataset, self).__init__()
        self.train_file = train_file
        self.data = []
        self.label2id = {}
        self.id2label = {}
        self.tokenizer = tokenizer
        self.init()

    def init(self):
        # build the label maps and tokenize every line of the tab-separated file
        with open(self.train_file, 'r', encoding='utf-8') as f:
            for line in f:
                blocks = line.strip().split('\t')
                if blocks[1] not in self.label2id:
                    self.label2id[blocks[1]] = len(self.label2id)
                    self.id2label[len(self.id2label)] = blocks[1]
                self.data.append({'token': self.tokenizer(blocks[0], add_special_tokens=True, max_length=100,
                                                          padding='max_length', return_tensors='pt',
                                                          truncation=True),
                                  'label': self.label2id[blocks[1]]})

    def __getitem__(self, item):
        return self.data[item]

    def __len__(self):
        return len(self.data)
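For reference, the loader assumes one tab-separated text\tlabel pair per line. A minimal sketch of how I build the dataset and what one item looks like (illustration only; the example lines are made up):

# Sketch: expected file format and a quick look at one item
#   some sentence here\tlabel_a
#   another sentence\tlabel_b
tokenizer = BertTokenizer.from_pretrained(pretrained_path)
dataset = BertDataset('../data/dataset/data.txt', tokenizer)
print(len(dataset), dataset.label2id)
print(dataset[0]['token']['input_ids'].shape)  # torch.Size([1, 100]) because of return_tensors='pt'
print(dataset[0]['label'])                     # an integer class id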
# a collate function for torch.utils.data.DataLoader
def bert_collate_fn(batch_data):
    input_ids, token_type_ids, attention_mask, labels = [], [], [], []
    for instance in copy.deepcopy(batch_data):
        input_ids.append(instance['token']['input_ids'][0].squeeze(0))
        token_type_ids.append(instance['token']['token_type_ids'][0].squeeze(0))
        attention_mask.append(instance['token']['attention_mask'][0].squeeze(0))
        labels.append(instance['label'])
    return torch.stack(input_ids), torch.stack(token_type_ids), \
           torch.stack(attention_mask), torch.tensor(labels)
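As a quick sanity check of the collate function (shapes only, batch built by hand from two items):

# Sketch: bert_collate_fn should stack everything along a new batch dimension
input_ids, token_type_ids, attention_mask, labels = bert_collate_fn([dataset[0], dataset[1]])
print(input_ids.shape)       # torch.Size([2, 100])
print(token_type_ids.shape)  # torch.Size([2, 100])
print(attention_mask.shape)  # torch.Size([2, 100])
print(labels.shape)          # torch.Size([2])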
# Model
class PTModel(nn.Module):
    def __init__(self, model, n_class):
        super(PTModel, self).__init__()
        self.n_class = n_class
        self.model = model
        self.linear = nn.Linear(768, self.n_class)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        cls_emb = self.model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        # take the [CLS] embedding from the last hidden state
        cls_emb = cls_emb[0][:, 0, :].squeeze(1)
        logits = self.linear(cls_emb)
        # logits = self.softmax(logits)
        return logits
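And a minimal forward-pass check of PTModel with dummy inputs (the class count of 5 is made up for the example):

# Sketch: forward pass on random ids, just to confirm the logits shape
bert_model = BertModel.from_pretrained(pretrained_path)
pt_model = PTModel(model=bert_model, n_class=5)
dummy_ids = torch.randint(0, bert_model.config.vocab_size, (2, 100))
dummy_mask = torch.ones_like(dummy_ids)
print(pt_model(input_ids=dummy_ids, attention_mask=dummy_mask).shape)  # torch.Size([2, 5])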
# train code
def train1():
    # data
    batch_size = 16
    tokenizer = BertTokenizer.from_pretrained(pretrained_path)
    dataset = BertDataset('../data/dataset/data.txt', tokenizer)
    train_len = int(len(dataset)*0.8)
    train_dataset, dev_dataset = random_split(dataset=dataset, lengths=[train_len, len(dataset)-train_len])
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=bert_collate_fn)
    dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True, collate_fn=bert_collate_fn)
    # model
    device = torch.device('cuda:{}'.format(args.cuda))
    bert_model = BertModel.from_pretrained(pretrained_path)
    model = PTModel(model=bert_model, n_class=len(dataset.label2id)).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[30, 40], gamma=0.1)
    loss_func = torch.nn.CrossEntropyLoss()
    # train
    for i in range(args.epoch):
        model.train()
        train_loss, dev_loss, f1_train, f1_dev = [], [], [], []
        dev_pred_list, dev_gold_list = [], []
        for input_ids, token_type_ids, attention_mask, label in tqdm(train_dataloader):
            input_ids, token_type_ids, attention_mask, label = input_ids.to(device), token_type_ids.to(device), \
                                                               attention_mask.to(device), label.to(device)
            outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
            array_outputs = np.array(outputs.cuda().data.cpu())
            optimizer.zero_grad()
            loss = loss_func(outputs, label)
            results = outputs.cuda().data.cpu().argmax(dim=1)
            score = f1_score(label.cuda().data.cpu(), results, average='micro')
            train_loss.append(loss.item())
            f1_train.append(score)
            # optim
            loss.backward()
            optimizer.step()
            scheduler.step()
        print('epoch {}'.format(i))
        print('train_loss:{}'.format(np.mean(train_loss)))
        print('train_f1:{}'.format(np.mean(f1_train)))
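For completeness, the dev evaluation that produces the dev_f1 / dev_loss lines in the log below looks roughly like this (trimmed sketch, run inside the epoch loop after the training batches):

# Sketch of the dev evaluation step (trimmed, not the exact code)
model.eval()
with torch.no_grad():
    for input_ids, token_type_ids, attention_mask, label in tqdm(dev_dataloader):
        input_ids, token_type_ids, attention_mask, label = input_ids.to(device), token_type_ids.to(device), \
                                                           attention_mask.to(device), label.to(device)
        outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        dev_loss.append(loss_func(outputs, label).item())
        dev_pred_list.extend(outputs.argmax(dim=1).cpu().tolist())
        dev_gold_list.extend(label.cpu().tolist())
print('dev_f1:{}'.format(f1_score(dev_gold_list, dev_pred_list, average='micro')))
print('dev_loss:{}'.format(np.mean(dev_loss)))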
The training log is below (truncated after epoch 10), and the result is already clear: the model cannot learn anything!
PS: the learning rate was 1e-3.
100%|█████████████████████████████████████████| 250/250 [00:43<00:00, 5.72it/s]
epoch 0
train_loss:4.217772917747498
train_f1:0.081
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.52it/s]
dev_f1:0.08928571428571429
dev_loss:4.111690880760314
100%|█████████████████████████████████████████| 250/250 [00:43<00:00, 5.71it/s]
epoch 1
train_loss:4.094675525665283
train_f1:0.084
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.16it/s]
dev_f1:0.0882936507936508
dev_loss:4.1316274839734275
100%|█████████████████████████████████████████| 250/250 [00:43<00:00, 5.71it/s]
epoch 2
train_loss:4.084259546279907
train_f1:0.08525
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.37it/s]
dev_f1:0.08928571428571429
dev_loss:4.108004717599778
100%|█████████████████████████████████████████| 250/250 [00:44<00:00, 5.62it/s]
epoch 3
train_loss:4.0770455904006955
train_f1:0.09425
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.07it/s]
dev_f1:0.08928571428571429
dev_loss:4.1077501395392035
100%|█████████████████████████████████████████| 250/250 [00:45<00:00, 5.54it/s]
epoch 4
train_loss:4.070150758743286
train_f1:0.086
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.41it/s]
dev_f1:0.09027777777777778
dev_loss:4.103204295748756
100%|█████████████████████████████████████████| 250/250 [00:45<00:00, 5.52it/s]
epoch 5
train_loss:4.064209712982178
train_f1:0.0895
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.31it/s]
dev_f1:0.08928571428571429
dev_loss:4.117827377622089
100%|█████████████████████████████████████████| 250/250 [00:43<00:00, 5.70it/s]
epoch 6
train_loss:4.065111406326294
train_f1:0.08425
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.34it/s]
dev_f1:0.0882936507936508
dev_loss:4.099656305615864
100%|█████████████████████████████████████████| 250/250 [00:44<00:00, 5.58it/s]
epoch 7
train_loss:4.0547873935699466
train_f1:0.09175
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.30it/s]
dev_f1:0.08928571428571429
dev_loss:4.105985126798115
100%|█████████████████████████████████████████| 250/250 [00:43<00:00, 5.76it/s]
epoch 8
train_loss:4.0595885887145995
train_f1:0.08875
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 19.26it/s]
dev_f1:0.09027777777777778
dev_loss:4.121003010916332
100%|█████████████████████████████████████████| 250/250 [00:45<00:00, 5.46it/s]
epoch 9
train_loss:4.054850312232971
train_f1:0.08825
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 18.86it/s]
dev_f1:0.08928571428571429
dev_loss:4.12501887669639
100%|█████████████████████████████████████████| 250/250 [00:45<00:00, 5.46it/s]
epoch 10
train_loss:4.0566882238388065
train_f1:0.08525
100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 18.85it/s]
dev_f1:0.09126984126984126
dev_loss:4.103033436669244
Before trying BertModel, I used an LSTM, and it worked well: the dev F1 reached 0.96.
# LSTM
class SimpleModel(nn.Module):
    def __init__(self, **kwargs):
        super(SimpleModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(kwargs['pretrained_embedding'], freeze=False)
        self.lstm = nn.LSTM(kwargs['pretrained_embedding'].shape[1],
                            kwargs['hidden_size'],
                            batch_first=True,
                            bidirectional=True)
        self.linear = nn.Linear(kwargs['hidden_size']*2, kwargs['n_class'])

    def forward(self, inputs, lens):
        inputs = self.embedding(inputs)
        _, (h, _) = self.lstm(pack_padded_sequence(inputs, lens, batch_first=True, enforce_sorted=False))
        h = h.permute(1, 0, 2).contiguous().view(h.shape[1], -1)
        logits = self.linear(h)
        logits = logits.softmax(dim=-1)
        return logits
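For comparison, the LSTM model is called like this (all dimensions invented just for the sketch):

# Sketch: calling SimpleModel on a dummy padded batch (made-up sizes)
emb = torch.randn(1000, 300)                      # placeholder embedding matrix: vocab 1000, dim 300
lstm_model = SimpleModel(pretrained_embedding=emb, hidden_size=128, n_class=5)
batch = torch.randint(0, 1000, (4, 20))           # 4 padded sequences of length 20
lens = torch.tensor([20, 18, 15, 9])              # real lengths before padding
print(lstm_model(batch, lens).shape)              # torch.Size([4, 5]), rows already softmax-normalized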
Could anyone tell me why this code does not work?
Is there something wrong with my implementation?
I have been confused for days…
Thank you very much!