I am trying to fine-tune DistilBERT to recognize PII using this dataset (English sentences only). However, I am getting very low precision and recall on the validation set, even though the validation accuracy looks reasonably high (although it could be better):
***** train metrics *****
epoch = 5.0
total_flos = 6951430GF
train_loss = 0.5733
train_runtime = 0:38:14.30
train_samples = 29908
train_samples_per_second = 65.179
train_steps_per_second = 4.075
***** eval metrics *****
epoch = 5.0
eval_accuracy = 0.8316
eval_f1 = 0.0457
eval_loss = 0.4818
eval_precision = 0.0586
eval_recall = 0.0375
eval_runtime = 0:01:06.94
eval_samples = 7946
eval_samples_per_second = 118.697
eval_steps_per_second = 7.424
My model can be found here, but it does not work very well. For example, given the input “I am retr00h and my address is 192.168.1.1”, it labels only part of the username and part of the address correctly, and it also tags “and my username” as part of the username.
Is there anything I could do to improve its performance?
Here is my code:
# !pip install evaluate seqeval sentencepiece
from datasets import load_dataset
import os
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, AdamW, TrainingArguments, Trainer, pipeline
import evaluate
import numpy as np
os.environ["WANDB_DISABLED"] = "true"
import wandb
wandb.init(mode = "disabled")
def _merge_wordpieces(tokens, tags):
    """Join '##'-prefixed continuation pieces onto the preceding token.

    Returns a ``(words, word_tags)`` pair where ``word_tags[k]`` is the tag
    of the first piece of ``words[k]``.
    """
    words = []
    word_tags = []
    for token, tag in zip(tokens, tags):
        if token.startswith('##') and words:
            # Continuation piece: extend the current word, keep its first tag.
            words[-1] += token[2:]
        else:
            words.append(token)
            word_tags.append(tag)
    return words, word_tags


def tokenize_and_align_labels(examples):
    """Tokenize a batch and attach one label id per produced token.

    The tags in ``examples['ner_tags']`` are derived from ``mbert_bio_labels``,
    i.e. there is one tag per mBERT wordpiece. Indexing them with the
    ``word_ids()`` of a tokenizer run on the raw ``source_text`` therefore
    pairs tags with the wrong tokens. To keep tags and tokens aligned, the
    wordpieces are merged back into words (each word keeps the tag of its
    first piece) and those words are tokenized with
    ``is_split_into_words=True`` so that ``word_ids()`` indexes the same list
    the tags belong to.

    NOTE(review): assumes the batch carries a ``mbert_text_tokens`` column
    holding the wordpieces that ``mbert_bio_labels`` annotate, without
    [CLS]/[SEP] entries -- confirm against the dataset card. If that column is
    absent the function falls back to tokenizing ``source_text`` as before.

    Args:
        examples: batch mapping with 'ner_tags' (list of lists of label ids)
            and either 'mbert_text_tokens' or 'source_text'.

    Returns:
        The tokenizer output with an added 'labels' entry. Special tokens and
        non-initial sub-tokens of a word are labelled -100.
    """
    if 'mbert_text_tokens' in examples:
        merged = [
            _merge_wordpieces(tokens, tags)
            for tokens, tags in zip(examples['mbert_text_tokens'], examples['ner_tags'])
        ]
        batch_tags = [word_tags for _, word_tags in merged]
        tokenized_inputs = tokenizer(
            [words for words, _ in merged],
            truncation=True,
            is_split_into_words=True,
        )
    else:
        batch_tags = examples['ner_tags']
        tokenized_inputs = tokenizer(examples['source_text'], truncation=True)

    labels = []
    for i, word_tags in enumerate(batch_tags):
        # Map tokens to their respective word
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens get -100
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Only label the first token of a given word
                label_ids.append(word_tags[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs
def compute_metrics(p):
    """Score a batch of predictions with the ``seqeval`` metric.

    Args:
        p: a ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores and is reduced with argmax over axis 2; ``labels`` holds
            the reference label ids, with -100 marking positions to skip.

    Returns:
        Dict with the overall 'precision', 'recall', 'f1' and 'accuracy'
        reported by ``seqeval``.

    NOTE(review): seqeval's precision/recall/F1 are presumably computed per
    entity span while accuracy is per token -- confirm in the seqeval docs.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose reference label is not the -100 mask.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
# Load the PII dataset and keep only the rows whose language is English.
dataset = load_dataset('ai4privacy/pii-masking-300k')
for split in ('train', 'validation'):
    dataset[split] = dataset[split].filter(lambda row: row['language'] == 'English')
# Tag renames applied before training: numbered GIVENNAME/LASTNAME variants
# are folded into a single tag, BOD becomes DOB and SEX becomes GENDER.
labels2new_labels = {
    'B-BOD': 'B-DOB',
    'I-BOD': 'I-DOB',
    'B-GIVENNAME1': 'B-GIVENNAME',
    'I-GIVENNAME1': 'I-GIVENNAME',
    'B-GIVENNAME2': 'B-GIVENNAME',
    'I-GIVENNAME2': 'I-GIVENNAME',
    'B-LASTNAME1': 'B-LASTNAME',
    'I-LASTNAME1': 'I-LASTNAME',
    'B-LASTNAME2': 'B-LASTNAME',
    'I-LASTNAME2': 'I-LASTNAME',
    'B-LASTNAME3': 'B-LASTNAME',
    'I-LASTNAME3': 'I-LASTNAME',
    'B-SEX': 'B-GENDER',
    'I-SEX': 'I-GENDER',
}


def update_labels(example):
    """Rename the BIO tags of one example according to ``labels2new_labels``.

    Args:
        example: mapping with an 'mbert_bio_labels' list of tag strings.

    Returns:
        The same ``example`` object, with 'mbert_bio_labels' replaced by a new
        list in which every tag found in ``labels2new_labels`` is swapped for
        its mapped value; all other tags are kept unchanged.
    """
    example['mbert_bio_labels'] = [
        labels2new_labels.get(tag, tag) for tag in example['mbert_bio_labels']
    ]
    return example
# Rename tags in both splits.
for split in ('train', 'validation'):
    dataset[split] = dataset[split].map(update_labels)

# Sorted vocabulary of every tag that occurs in the training split.
# NOTE(review): tags that appear only in the validation split will be missing
# from label2id.
label_list = sorted({
    tag
    for example in tqdm(dataset['train'])
    for tag in example['mbert_bio_labels']
})
print(label_list)

# Two-way lookup between tag strings and their integer ids.
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in enumerate(label_list)}
# Convert the string tags of one example into integer ids via label2id
def map_labels_to_int(example):
    """Add an 'ner_tags' list holding the ``label2id`` id of every tag.

    Args:
        example: mapping with an 'mbert_bio_labels' list of tag strings.

    Returns:
        The same ``example`` object with the new 'ner_tags' entry.

    Raises:
        KeyError: if a tag is not a key of ``label2id``.
    """
    example['ner_tags'] = [label2id[tag] for tag in example['mbert_bio_labels']]
    return example
# Add integer 'ner_tags' to the train and validation splits
for split in ('train', 'validation'):
    dataset[split] = dataset[split].map(map_labels_to_int)

# Tokenizer and tokenized copy of the dataset (processed in batches).
distilbert_path = 'distilbert/distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(distilbert_path)
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Collator and metric used by the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
seqeval = evaluate.load("seqeval")

# Token-classification model with one output class per entry of label_list.
model = AutoModelForTokenClassification.from_pretrained(
    distilbert_path,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
# Training configuration: 5 epochs, evaluation and checkpointing once per
# epoch, and the best checkpoint reloaded when training ends.
training_args = TrainingArguments(
    seed=0,
    # NOTE(review): output_dir is the same string as the Hub model id, so a
    # local directory with that name exists after the first run and
    # `from_pretrained(distilbert_path)` may then load the fine-tuned files
    # instead of the base checkpoint -- consider a separate directory.
    output_dir=distilbert_path,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_ratio=0.2,
    lr_scheduler_type='cosine_with_restarts',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # The string "none" disables all logging integrations; passing None is
    # treated as "all installed integrations" by TrainingArguments.
    report_to="none",
    overwrite_output_dir=True,
)

# Trainer wiring the model, tokenized splits, collator and metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# Fine-tune, then score the validation split.
train_result = trainer.train()
validation_result = trainer.evaluate(tokenized_dataset['validation'])

# Record how many examples each split contains alongside its metrics.
train_metrics = train_result.metrics
train_metrics['train_samples'] = len(tokenized_dataset['train'])
trainer.log_metrics("train", train_metrics)

validation_result['eval_samples'] = len(tokenized_dataset['validation'])
trainer.log_metrics('eval', validation_result)

# Persist metrics, trainer state and the model to the output directory.
trainer.save_metrics('train', train_metrics)
trainer.save_metrics('eval', validation_result)
trainer.save_state()
trainer.save_model(training_args.output_dir)