I had referenced a variable name that did not exist (somehow the IDE did not warn me…) and have now corrected it. The following code should work as intended. I hope it will be useful to anyone else exploring this dataset or task.
from datasets import load_dataset
import os
from tqdm import tqdm
from transformers import BertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
# Load the 'ai4privacy/pii-masking-300k' dataset (a mapping of split name -> split).
dataset = load_dataset('ai4privacy/pii-masking-300k')

# Gather every distinct value found in the 'mbert_bio_labels' column of the
# training split. tqdm only adds a progress bar while the split is scanned.
label_list = set()
for example in tqdm(dataset['train']):
    label_list.update(example['mbert_bio_labels'])

# Sorting turns the unordered set into a list with a reproducible order, so
# the label <-> id assignment below is the same on every run.
label_list = sorted(label_list)
print(label_list)

# Lookup tables in both directions: integer id -> label and label -> integer id.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in enumerate(label_list)}

# Fast tokenizer loaded from the same checkpoint name that the model uses later.
tokenizer = BertTokenizerFast.from_pretrained('distilbert-base-multilingual-cased')
# Convert the string labels of one example into integer ids via `label2id`.
def map_labels_to_int(example):
    """Add an integer-encoded ``ner_tags`` field to a single example.

    Args:
        example: A mapping with an ``'mbert_bio_labels'`` entry holding an
            iterable of label strings. Every label must be a key of the
            module-level ``label2id`` dictionary.

    Returns:
        The same ``example`` object, with ``example['ner_tags']`` set to the
        list of ids corresponding to ``example['mbert_bio_labels']``.

    Raises:
        KeyError: If a label is not present in ``label2id``.
    """
    example['ner_tags'] = [label2id[tag] for tag in example['mbert_bio_labels']]
    return example
# Apply the label -> id mapping to EVERY split of the dataset, not only to
# 'train' and 'validation'. The later tokenization step maps
# `tokenize_and_align_labels` over the whole DatasetDict, and that function
# reads the 'ner_tags' column; any split left without 'ner_tags' (for example
# a 'test' split) would make that step fail with a KeyError.
# When the dataset only contains 'train' and 'validation', this is equivalent
# to mapping those two splits individually.
dataset = dataset.map(map_labels_to_int)
def tokenize_and_align_labels(examples):
    """Tokenize a batch of texts and build per-token label ids.

    Args:
        examples: A batch (mapping of column name -> list) containing
            ``'source_text'`` (the raw texts) and ``'ner_tags'`` (one list of
            integer label ids per text).

    Returns:
        The tokenizer output for ``examples['source_text']`` with an extra
        ``'labels'`` entry: one list per text, holding one value per token.
        Tokens with no word id (special tokens) and every token after the
        first one of a word get ``-100``; the first token of each word gets
        ``ner_tags[word_id]``.
    """
    tokenized_inputs = tokenizer(examples['source_text'], truncation=True)

    aligned_labels = []
    for batch_idx, tags in enumerate(examples['ner_tags']):
        # word_ids() gives, for each token, the index of the word it came
        # from, or None when the token does not belong to any word.
        token_word_ids = tokenized_inputs.word_ids(batch_index=batch_idx)

        row = []
        last_word = None
        for word in token_word_ids:
            if word is None or word == last_word:
                # Special token, or a continuation piece of the word that was
                # already labelled: mark with -100.
                row.append(-100)
            else:
                # First token of a new word: take that word's label.
                # NOTE(review): this indexes `tags` by word id, so it assumes
                # 'ner_tags' has one entry per word of 'source_text' as split
                # by the tokenizer — confirm against the dataset's label
                # granularity.
                row.append(tags[word])
            last_word = word
        aligned_labels.append(row)

    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs
# Tokenize and label-align the dataset; with batched=True the function
# receives a batch of examples per call instead of a single example.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Collator for token-classification batches, built around the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Metric object used by `compute_metrics` to score predicted label sequences.
seqeval = evaluate.load("seqeval")
def compute_metrics(p):
    """Score model predictions with seqeval, skipping positions labelled -100.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2 (assumes a ``(batch, sequence, num_labels)``
            score array — TODO confirm); ``labels`` holds the reference label
            ids, with ``-100`` marking positions to skip.

    Returns:
        A dict with the keys ``precision``, ``recall``, ``f1`` and
        ``accuracy``, taken from the ``overall_*`` entries of the seqeval
        result.
    """
    scores, references = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, references):
        # Keep only the positions whose reference label is a real label.
        kept = [
            (pred_id, ref_id)
            for pred_id, ref_id in zip(predicted_row, reference_row)
            if ref_id != -100
        ]
        # seqeval is given label strings, so map the ids back via label_list.
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[ref_id] for _, ref_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary
# Token-classification model loaded from the pretrained checkpoint. The number
# of output labels and both label lookup tables are passed in so the model
# config carries the label names.
model = AutoModelForTokenClassification.from_pretrained(
    'distilbert-base-multilingual-cased',
    id2label=id2label,
    label2id=label2id,
    num_labels=len(label_list),
)

# Training hyper-parameters. Evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is reloaded when training finishes.
training_args = TrainingArguments(
    output_dir='model',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# Wire the model, data splits, collator and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)
# Fine-tune the model. trainer.train() returns an object whose `.metrics`
# attribute holds the training metrics.
train_result = trainer.train()
train_metrics = train_result.metrics

# Pick the split used for the final evaluation.
# NOTE(review): prefers 'test' as before, but falls back to 'validation' when
# the DatasetDict has no 'test' split instead of failing with a KeyError —
# confirm which splits this dataset actually provides.
eval_split = 'test' if 'test' in tokenized_dataset else 'validation'

# trainer.evaluate() returns the metrics dictionary itself; it has no
# `.metrics` attribute, so the result is used directly.
test_result = trainer.evaluate(tokenized_dataset[eval_split])
test_metrics = test_result

# Record how many examples each set of metrics was computed on.
max_train_samples = len(tokenized_dataset['train'])
max_eval_samples = len(tokenized_dataset[eval_split])
train_metrics['train_samples'] = max_train_samples
test_metrics['eval_samples'] = max_eval_samples

# Print the metrics and write them next to the checkpoints.
trainer.log_metrics("train", train_metrics)
trainer.log_metrics('eval', test_metrics)
trainer.save_metrics('train', train_metrics)
trainer.save_metrics('eval', test_metrics)

# Persist the trainer state and the final model. The output directory is read
# from the trainer's own arguments (the original referenced an undefined
# `args` name here).
trainer.save_state()
trainer.save_model(trainer.args.output_dir)