Validation loss always 0.0 for BERT Sequence Tagger

I want to implement a BERT sequence tagger following this tutorial.
My dataset is rather small, so the validation set only contains around 10 texts.

The training loss decreases nicely, but the validation loss is always 0.0, and I don't know what is going on. When I compare the predicted output with the true output, it is obvious that the predictions still contain errors, so there should be some nonzero validation loss. Is there something I'm missing? Already at the line print("val loss: ", outputs[0].item()) I get 0.0 as the loss value.
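
As a sanity check, the loss can be recomputed by hand from the returned logits (a sketch; it mirrors how BertForTokenClassification selects non-padded positions via the attention mask, and assumes the b_* tensors from the validation loop below):

import torch
import torch.nn.functional as F

# Sketch: recompute the token-classification loss manually to cross-check outputs[0].
with torch.no_grad():
    outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
    logits = outputs[1]                  # shape: (batch, seq_len, num_labels)
    active = b_input_mask.view(-1) == 1  # keep only non-padded positions
    manual_loss = F.cross_entropy(
        logits.view(-1, logits.shape[-1])[active],
        b_labels.view(-1)[active],
    )
    print("model loss:", outputs[0].item(), "manual loss:", manual_loss.item())

If the manual value is nonzero while outputs[0] is 0.0, the problem is in how the labels reach the model's loss computation, not in the logits.
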
Here are the training and validation parts of my code:

import numpy as np
import torch
from seqeval.metrics import f1_score
from sklearn.metrics import accuracy_score  # assumed source of accuracy_score; adjust if you use seqeval's
from tqdm import trange
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForTokenClassification,
    get_linear_schedule_with_warmup,
)

config = AutoConfig.from_pretrained("dbmdz/bert-base-german-cased", output_hidden_states=True, num_labels=len(unique_tags))
model = AutoModelForTokenClassification.from_pretrained(
    "dbmdz/bert-base-german-cased", config=config
)

model.config.pad_token_id = self.tokenizer.pad_token_id

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

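# When fine-tuning the full model, apply weight decay to all weights
# except bias and LayerNorm parameters.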
if self.FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']  # 'gamma'/'beta' are legacy names; BERT's LayerNorm params are 'LayerNorm.weight'/'LayerNorm.bias'
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': self.weight_decay},  # AdamW reads the per-group key 'weight_decay'
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=self.learning_rate,
    eps=self.adam_eps
)

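# The scheduler is stepped once per batch, so the total number of
# optimization steps is batches-per-epoch times epochs.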
total_steps = len(train_dataloader) * self.epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_values, validation_loss_values, validation_accuracy, val_f1_scores, test_accuracies, test_f1_scores = [], [], [], [], [], []

for epoch in trange(self.epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================

    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        model.zero_grad()

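        # Forward pass; because labels are passed in, the model also returns
        # the loss as outputs[0] (the logits are outputs[1]).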
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)

        loss = outputs[0]
        loss.backward()

        total_loss += loss.item()

        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=self.max_grad_norm)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    loss_values.append(avg_train_loss)

    # ========================================
    #               Validation
    # ========================================

    model.eval()

    eval_loss, eval_accuracy = 0, 0
    predictions, true_labels = [], []

    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

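        # Gradients are not needed for evaluation.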
        with torch.no_grad():

            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        print(outputs)
        print("val loss: ", outputs[0].item())
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    avg_eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(avg_eval_loss)
    print("Validation loss: {}".format(avg_eval_loss))
    pred_tags = [unique_tags[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if unique_tags[l_i] != "PAD"]
    valid_tags = [unique_tags[l_i] for l in true_labels
                                  for l_i in l if unique_tags[l_i] != "PAD"]

    val_acc = accuracy_score(valid_tags, pred_tags)
    validation_accuracy.append(val_acc)
    f1_val_score = f1_score([valid_tags], [pred_tags])
    val_f1_scores.append(f1_val_score)
    print("Validation Accuracy: {}".format(val_acc))
    print("Validation F1-Score: {}".format(f1_val_score))

Update: I tried different data, and I also compared fine-tuning only the last linear layer of the BERT model with fine-tuning the pretrained encoder plus the linear layer. I still don't get why there is no validation loss.
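
For the linear-layer-only case, an explicit alternative to handing just model.classifier's parameters to the optimizer is to freeze the encoder (a sketch; "classifier" is the head attribute of the token-classification model used above):

# Sketch: freeze everything except the token-classification head.
for name, param in model.named_parameters():
    if not name.startswith("classifier"):
        param.requires_grad_(False)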