I want to implement a BERT sequence tagger following this tutorial.
My dataset is rather small, so the validation set only contains around 10 texts.
The training loss decreases nicely, but the validation loss is always 0.0, and I don't understand why. When I compare the predicted output to the true output, there are clearly still errors in the predictions, so there should be some nonzero validation loss. Is there something I've been missing? Already at the line print("val loss: ", outputs[0].item())
I get 0.0 as the loss value.
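As far as I understand, the model computes the token-classification loss internally with CrossEntropyLoss, skipping positions marked with the ignore index -100 (older transformers versions also mask out padding via the attention mask). So as long as real tokens are being scored, the loss should be strictly positive. A minimal sketch of that computation, with made-up shapes:

import torch
from torch.nn import CrossEntropyLoss

# toy example: 1 sequence, 4 tokens, 3 tags; the last position is padding
logits = torch.randn(1, 4, 3)             # what the classification head returns
labels = torch.tensor([[0, 2, 1, -100]])  # -100 = position ignored by the loss

loss_fct = CrossEntropyLoss()             # ignore_index defaults to -100
loss = loss_fct(logits.view(-1, 3), labels.view(-1))
print(loss.item())                        # practically never exactly 0.0

That is why a flat 0.0 on every validation batch looks wrong to me.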
Here are the training and validation parts of my code:
import numpy as np
import torch
from tqdm import trange
from transformers import (AutoConfig, AutoModelForTokenClassification,
                          AdamW, get_linear_schedule_with_warmup)
from seqeval.metrics import accuracy_score, f1_score  # entity-level metrics on tag sequences

config = AutoConfig.from_pretrained("dbmdz/bert-base-german-cased",
                                    output_hidden_states=True,
                                    num_labels=len(unique_tags))
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-base-german-cased",
                                                        config=config)
model.config.pad_token_id = self.tokenizer.pad_token_id

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
if self.FULL_FINETUNING:
    # Fine-tune everything; biases and LayerNorm weights are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': self.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    # Only fine-tune the classification head.
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=self.learning_rate,
    eps=self.adam_eps
)

total_steps = len(train_dataloader) * self.epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_values, validation_loss_values = [], []
validation_accuracy, val_f1_scores = [], []
test_accuracies, test_f1_scores = [], []
for epoch in trange(self.epochs, desc="Epoch"):
    # ========================================
    # Training
    # ========================================
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        # With labels passed in, the first element of the output tuple is the loss.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        # Clip gradients before stepping to keep training stable.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                       max_norm=self.max_grad_norm)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    loss_values.append(avg_train_loss)
    # ========================================
    # Validation
    # ========================================
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    predictions, true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        print(outputs)
        print("val loss: ", outputs[0].item())  # already 0.0 here
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    avg_eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(avg_eval_loss)
    print("Validation loss: {}".format(avg_eval_loss))
    # Strip positions whose gold tag is "PAD" before computing the metrics.
    pred_tags = [unique_tags[p_i] for p, l in zip(predictions, true_labels)
                 for p_i, l_i in zip(p, l) if unique_tags[l_i] != "PAD"]
    valid_tags = [unique_tags[l_i] for l in true_labels
                  for l_i in l if unique_tags[l_i] != "PAD"]
    val_acc = accuracy_score(valid_tags, pred_tags)
    validation_accuracy.append(val_acc)
    # seqeval's f1_score expects a list of tag sequences, hence the extra brackets.
    f1_val_score = f1_score([valid_tags], [pred_tags])
    val_f1_scores.append(f1_val_score)
    print("Validation Accuracy: {}".format(val_acc))
    print("Validation F1-Score: {}".format(f1_val_score))