Finetuning DistilBERT for NER

I am trying to finetune DistilBERT to recognize PII with this dataset (English sentences only). However, I am running into a problem: precision and recall on the validation set are very low, while the validation accuracy looks fairly high (even though it could be better):

***** train metrics *****
  epoch                    =        5.0
  total_flos               =  6951430GF
  train_loss               =     0.5733
  train_runtime            = 0:38:14.30
  train_samples            =      29908
  train_samples_per_second =     65.179
  train_steps_per_second   =      4.075
***** eval metrics *****
  epoch                   =        5.0
  eval_accuracy           =     0.8316
  eval_f1                 =     0.0457
  eval_loss               =     0.4818
  eval_precision          =     0.0586
  eval_recall             =     0.0375
  eval_runtime            = 0:01:06.94
  eval_samples            =       7946
  eval_samples_per_second =    118.697
  eval_steps_per_second   =      7.424
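
To make sure I am reading these numbers correctly: seqeval's precision/recall/F1 are entity-level (a prediction only counts if the whole span and its type match), while accuracy is token-level, so on sentences that are mostly O tokens the accuracy can stay high even when nearly every entity is missed. A toy check with made-up labels (not the actual dataset tags) shows the same pattern:

import evaluate

seqeval = evaluate.load("seqeval")
# Mostly "O" tokens: only the username and the IP address carry entity tags
references  = [["O", "O", "B-USERNAME", "O", "O", "O", "B-IP", "I-IP"]]
# The "model" misses the username and only gets half of the IP span
predictions = [["O", "O", "O",          "O", "O", "O", "B-IP", "O"]]
print(seqeval.compute(predictions = predictions, references = references))
# overall_accuracy = 6/8 = 0.75, but precision/recall/f1 are all 0.0 because
# no predicted entity matches a gold entity exactly (same span and same type)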

My model can be found here, and it doesn't seem to work very well. For example, if I input "I am retr00h and my address is 192.168.1.1", it only recognizes part of the username and part of the address with the correct labels, but it also tags "and my username" as part of the username.
Is there anything I can do to improve the performance?
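
For reference, this is roughly how I test the checkpoint on that sentence (a minimal sketch; the model path below is a placeholder for my uploaded checkpoint):

from transformers import pipeline

ner = pipeline(
    "token-classification",
    model = "path/to/my-finetuned-distilbert",  # placeholder for the actual checkpoint
    aggregation_strategy = "simple",            # merge subword pieces into labelled spans
)
print(ner("I am retr00h and my address is 192.168.1.1"))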

Here is my code:

# !pip install evaluate seqeval sentencepiece

from datasets import load_dataset
import os
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline
import evaluate
import numpy as np

os.environ["WANDB_DISABLED"] = "true"
import wandb
wandb.init(mode = "disabled")


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['source_text'], truncation = True)

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        # Map tokens to their respective word
        word_ids = tokenized_inputs.word_ids(batch_index = i)
        previous_word_idx = None

        label_ids = []
        for word_idx in word_ids:
            # Set the special tokens to -100
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Only label the first token of a given word
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis = 2)
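    # Keep only positions with a real label (the -100 entries mark special tokens
    # and non-first subword pieces) and map ids back to tag strings for seqeval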
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions = true_predictions, references = true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


dataset = load_dataset('ai4privacy/pii-masking-300k')
dataset['train'] = dataset['train'].filter(lambda x: x['language'] == 'English')
dataset['validation'] = dataset['validation'].filter(lambda x: x['language'] == 'English')

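# Normalize the dataset's tag names: collapse the numbered variants
# (GIVENNAME1/GIVENNAME2, LASTNAME1-3) and rename BOD -> DOB and SEX -> GENDER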
labels2new_labels = {
    'B-BOD': 'B-DOB',
    'I-BOD': 'I-DOB',
    'B-GIVENNAME1': 'B-GIVENNAME',
    'I-GIVENNAME1': 'I-GIVENNAME',
    'B-GIVENNAME2': 'B-GIVENNAME',
    'I-GIVENNAME2': 'I-GIVENNAME',
    'B-LASTNAME1': 'B-LASTNAME',
    'I-LASTNAME1': 'I-LASTNAME',
    'B-LASTNAME2': 'B-LASTNAME',
    'I-LASTNAME2': 'I-LASTNAME',
    'B-LASTNAME3': 'B-LASTNAME',
    'I-LASTNAME3': 'I-LASTNAME',
    'B-SEX': 'B-GENDER',
    'I-SEX': 'I-GENDER'
}

def update_labels(example):
    tmp = []
    for el in example['mbert_bio_labels']:
        if el in labels2new_labels:
            tmp.append(labels2new_labels[el])
        else:
            tmp.append(el)
    example['mbert_bio_labels'] = tmp
    return example


dataset['train'] = dataset['train'].map(update_labels)
dataset['validation'] = dataset['validation'].map(update_labels)


label_list = set()

for example in tqdm(dataset['train']):
    for item in example['mbert_bio_labels']:
        label_list.add(item)

label_list = list(label_list)
label_list.sort()
print(label_list)

id2label = {i: label_list[i] for i in range(len(label_list))}
label2id = {label_list[i]: i for i in range(len(label_list))}

# Function to map labels to integers using the dictionary
def map_labels_to_int(example):
    ner_tags_int = [label2id[label] for label in example['mbert_bio_labels']]
    example['ner_tags'] = ner_tags_int
    return example

# Apply the mapping function to train and validation datasets
dataset['train'] = dataset['train'].map(map_labels_to_int)
dataset['validation'] = dataset['validation'].map(map_labels_to_int)

distilbert_path = 'distilbert/distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(distilbert_path)

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched = True)
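# Dynamic padding per batch; the collator pads labels with -100 so padded positions are ignored by the loss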
data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer)
seqeval = evaluate.load("seqeval")

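# Pretrained encoder plus a freshly (randomly) initialized token-classification head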
model = AutoModelForTokenClassification.from_pretrained(
    distilbert_path, num_labels = len(label_list), id2label = id2label, label2id = label2id
)

training_args = TrainingArguments(
        seed = 0,
        output_dir = distilbert_path,
        learning_rate = 5e-5,
        per_device_train_batch_size = 8,
        per_device_eval_batch_size = 8,
        num_train_epochs = 5,
        weight_decay = 0.01,
        warmup_ratio = 0.2,
        lr_scheduler_type = 'cosine_with_restarts',
        evaluation_strategy = 'epoch',
        save_strategy = 'epoch',
        load_best_model_at_end = True,
        report_to = "none",
        overwrite_output_dir = True
    )

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['validation'],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

train_result = trainer.train()
validation_result = trainer.evaluate(tokenized_dataset['validation'])

train_metrics = train_result.metrics

max_train_samples = len(tokenized_dataset['train'])
max_eval_samples = len(tokenized_dataset['validation'])

train_metrics['train_samples'] = max_train_samples
trainer.log_metrics("train", train_metrics)

validation_result['eval_samples'] = max_eval_samples
trainer.log_metrics('eval', validation_result)

trainer.save_metrics('train', train_metrics)
trainer.save_metrics('eval', validation_result)

trainer.save_state()
trainer.save_model(training_args.output_dir)