Failing at fine-tuning BERT for an NER task

Hello! I am following this guide to fine-tune BERT on a dataset that is different from the one used in the guide. I had to slightly modify the steps to avoid the following error when calling dataset.map(tokenize_and_align_labels, batched = True):

ArrowInvalid: Could not convert ‘O’ with type str: tried to convert to int64

With the modified code, however, I am getting the following error on trainer.train():

IndexError: index out of range in self

What should I change to avoid both those errors?

Could you share your full training script? I may be able to help then :hugs:

I have changed the name of a variable that did not exist (somehow the IDE did not warn me…). The following code should work as intended. I hope it could be useful for anyone else exploring this dataset or task :smiley:

import os

import evaluate
import numpy as np
from datasets import load_dataset
from tqdm import tqdm
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    BertTokenizerFast,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)


# Load the dataset (a DatasetDict keyed by split name).
dataset = load_dataset('ai4privacy/pii-masking-300k')

# Collect every distinct tag string that occurs in the training split.
label_set = set()
for example in tqdm(dataset['train']):
    label_set.update(example['mbert_bio_labels'])

# Sort so the label <-> id assignment is the same on every run.
label_list = sorted(label_set)
print(label_list)

# Two-way lookup tables between tag strings and their integer ids.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

# Let AutoTokenizer pick the tokenizer class that belongs to the checkpoint.
# The model below is loaded from the same DistilBERT checkpoint through an
# Auto* class; forcing BertTokenizerFast here loads a tokenizer class that does
# not match the checkpoint's own configuration.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-multilingual-cased')

# Function to map labels to integers using the dictionary
def map_labels_to_int(example):
    """Add an integer-encoded ``ner_tags`` field to one dataset example.

    Each string in ``example['mbert_bio_labels']`` is looked up in the
    module-level ``label2id`` dictionary; a string missing from that
    dictionary raises ``KeyError``.  The example is modified in place and the
    same object is returned.
    """
    tags = example['mbert_bio_labels']
    example['ner_tags'] = [label2id[tag] for tag in tags]
    return example


# Add the integer 'ner_tags' column to every split of the DatasetDict.
# The tokenization step further down maps over all splits and reads 'ner_tags'
# from each of them, so converting only 'train' and 'validation' would leave
# any other split without the column it needs.
dataset = dataset.map(map_labels_to_int)


def tokenize_and_align_labels(examples):
    """Tokenize a batch of texts and build a per-token ``labels`` column.

    ``examples['source_text']`` is tokenized (with truncation) by the
    module-level ``tokenizer``.  For every sequence in the batch, each token
    receives either a tag id taken from ``examples['ner_tags']`` or ``-100``:

    * tokens whose word id is ``None`` get ``-100``;
    * the first token of each word gets ``tags[word_id]``;
    * the remaining tokens of that word get ``-100``.

    Returns the tokenizer output with the extra ``'labels'`` entry.

    NOTE(review): indexing ``tags`` by word id assumes ``ner_tags`` holds one
    tag per word as numbered by ``word_ids()``.  The tags originate from
    'mbert_bio_labels', whose name suggests token-level labels -- confirm that
    the two numberings actually line up.
    """
    encoded = tokenizer(examples['source_text'], truncation = True)

    def _aligned_tags(batch_index, tags):
        # Walk the tokens of one sequence, remembering the previous word id so
        # that only the first token of each word is labelled.
        aligned = []
        last_word = None
        for word in encoded.word_ids(batch_index = batch_index):
            if word is not None and word != last_word:
                aligned.append(tags[word])
            else:
                aligned.append(-100)
            last_word = word
        return aligned

    encoded['labels'] = [
        _aligned_tags(position, tags)
        for position, tags in enumerate(examples['ner_tags'])
    ]
    return encoded


# Run the tokenize-and-align step over the dataset in batches.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched = True,
)

# Batch collator built around the same tokenizer that produced the inputs.
data_collator = DataCollatorForTokenClassification(
    tokenizer = tokenizer,
)

# Metric object used by compute_metrics below.
seqeval = evaluate.load("seqeval")


def compute_metrics(p):
    """Compute overall precision, recall, F1 and accuracy with seqeval.

    ``p`` unpacks into ``(predictions, labels)``.  The predicted class of each
    position is the argmax over axis 2 of the predictions.  Positions whose
    label is ``-100`` are dropped; the remaining predicted and reference ids
    are converted to tag strings through the module-level ``label_list``
    before being handed to ``seqeval``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis = 2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = seqeval.compute(predictions = true_predictions, references = true_labels)

    metrics = {}
    metrics["precision"] = results["overall_precision"]
    metrics["recall"] = results["overall_recall"]
    metrics["f1"] = results["overall_f1"]
    metrics["accuracy"] = results["overall_accuracy"]
    return metrics


# Token-classification model whose output size and label maps follow the tag
# set collected from the dataset.
model = AutoModelForTokenClassification.from_pretrained(
    'distilbert-base-multilingual-cased',
    num_labels = len(label_list),
    id2label = id2label,
    label2id = label2id,
)

training_args = TrainingArguments(
    output_dir = 'model',
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 2,
    weight_decay = 0.01,
    # NOTE(review): recent transformers releases call this `eval_strategy`;
    # confirm the spelling against the installed version.
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True,
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['validation'],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

train_result = trainer.train()

# Evaluate on a held-out 'test' split when the dataset provides one; otherwise
# fall back to 'validation' instead of failing on a missing key.
eval_split = 'test' if 'test' in tokenized_dataset else 'validation'
test_result = trainer.evaluate(tokenized_dataset[eval_split])

# train() returns an object that exposes `.metrics`, whereas evaluate() returns
# the metrics dictionary itself, so it is used directly.
train_metrics = train_result.metrics
test_metrics = test_result

max_train_samples = len(tokenized_dataset['train'])
max_eval_samples = len(tokenized_dataset[eval_split])

train_metrics['train_samples'] = max_train_samples
trainer.log_metrics("train", train_metrics)

test_metrics['eval_samples'] = max_eval_samples
trainer.log_metrics('eval', test_metrics)

trainer.save_metrics('train', train_metrics)
trainer.save_metrics('eval', test_metrics)

trainer.save_state()
# Save into the directory configured in the training arguments (the script
# defines no `args` variable).
trainer.save_model(training_args.output_dir)