Overall accuracy when fine-tuning dslim/bert-base-NER with a custom dataset and labels only reaches ~0.15 (seqeval)

As mentioned in the title, fine-tuning dslim/bert-base-NER with a custom dataset and 3 labels (i.e. not using the original labels) only reaches ~0.15 overall accuracy with the seqeval metric.

The run is configured for 500 epochs; it’s still in the earlier epochs, but I’ve already trained for a long stretch and the accuracy is not improving.
Apologies, the next part is quite lengthy, but below is the actual code I used for this fine-tuning:

import torch
import pandas as pd
from ast import literal_eval
from datasets import load_metric
from torch.utils.data import Dataset

from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
)
from transformers import (
    AdamW,
    AutoConfig,
    AutoTokenizer,
    AutoModelForTokenClassification,
    EarlyStoppingCallback,
    get_cosine_schedule_with_warmup,
    IntervalStrategy,
    TrainingArguments,
    TrainerCallback,
    Trainer
)

from torch import nn


model_name = "dslim/bert-base-NER"


# -------------- Define custom dataset

class BlNerDataset(torch.utils.data.Dataset):
    def __init__(self, sentences, labels, tokenizer, labels_list=None):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        if not labels_list:
            self.label2id = {label: i for i, label in enumerate(
                set([label for sentence_labels in labels for label in sentence_labels])
            )}
        else:
            self.label2id = {label: i for i, label in enumerate(labels_list)}
        self.pad_token_label_id = torch.nn.CrossEntropyLoss().ignore_index
        print(self.label2id)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        labels = self.labels[idx]
        inputs = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_offsets_mapping=True,
            padding="max_length",
            max_length=128,
            truncation=True,
        )

        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        offset_mapping = inputs["offset_mapping"]

        # Convert labels to label ids
        label_ids = [self.label2id[label] for label in labels]

        # Create padding mask and set label ids of padding tokens to ignore_index
        padding_mask = [1 if x != self.tokenizer.pad_token_id else 0 for x in input_ids]
        label_ids.extend([self.pad_token_label_id] * (len(input_ids) - len(label_ids)))

        return {
            "input_ids": torch.tensor(input_ids),
            "attention_mask": torch.tensor(attention_mask),
            "token_type_ids": torch.tensor(token_type_ids),
            "labels": torch.tensor(label_ids),
            "padding_mask": torch.tensor(padding_mask),
        }
        
 

# ----------------- overwrite labels in the config.json of the pretrained model

id2label = {0:'O', 1:'B-CUSTOM', 2:'I-CUSTOM'}
label2id = {'O':0, 'B-CUSTOM':1, 'I-CUSTOM':2}
config = AutoConfig.from_pretrained(model_name)
config.label2id = label2id
config.id2label = id2label
config.num_labels = len(id2label)


# ------------ Load the tokenizer and the pre-trained model with the updated config

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name,
                                                        config=config,
                                                        ignore_mismatched_sizes=True)


# ------------ Load data from CSV and build the dataset

df = pd.read_csv("<custom_data_in_csv>", delimiter=",", header=0)
sentences = df["text"].values.tolist()
labels = [literal_eval(l) for l in df["labels"].values.tolist()]
dataset = BlNerDataset(sentences, labels, tokenizer, labels_list=label2id)



# ---------- evaluation metrics 
def compute_metrics_1(pred):
    metric = load_metric("seqeval")
    return metric.compute(predictions=pred.predictions.argmax(-1), references=pred.label_ids)
    
    
# ------------- Define training arguments

training_args = TrainingArguments(
    output_dir='./ner_results',                     
    num_train_epochs=500,                          
    per_device_train_batch_size=64,                
    per_device_eval_batch_size=64,                 
    warmup_steps=500,                               
    weight_decay=0.01,                              
    learning_rate=2e-5,                             
    logging_dir="./ner_logs",                       
    log_level="debug",
    logging_steps=10,
    seed=1234,
    evaluation_strategy="epoch",
    save_strategy="epoch",                   
    save_total_limit=2,                      
    load_best_model_at_end=False,             
    metric_for_best_model='eval_overall_accuracy',
    greater_is_better=True,
    gradient_accumulation_steps=2
)

# ---------- Define trainer

trainer = Trainer(
    model=model,                        
    args=training_args,                  
    train_dataset=dataset,               
    eval_dataset=dataset,
    compute_metrics=compute_metrics_1,
    callbacks=[])

trainer.train()

My custom data is in CSV format with text and labels as its columns.
An example row looks like this:

sample sentence. nothing interesting here except i'm stuck, ['O','O','O','O','O','O','B-CUSTOM','I-CUSTOM'] 
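
To make the format concrete, here is a minimal sketch of how one such row is parsed into a sentence and a label list (the file name custom_data.csv is a placeholder standing in for the real CSV path used above):

import pandas as pd
from ast import literal_eval

# Placeholder file name for the real CSV path.
df = pd.read_csv("custom_data.csv", delimiter=",", header=0)

row = df.iloc[0]
sentence = row["text"]                      # "sample sentence. nothing interesting here except i'm stuck"
word_labels = literal_eval(row["labels"])   # ['O', 'O', 'O', 'O', 'O', 'O', 'B-CUSTOM', 'I-CUSTOM']

# One label per whitespace-separated word in this example row.
print(len(sentence.split()), len(word_labels))  # 8 8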

Here is what I’ve tried so far to address the issue:

  • adjusted the learning rate
  • used get_cosine_schedule_with_warmup to make the learning rate adjustment dynamic (a sketch of how this can be combined with the Trainer is right after this list)
  • used other pretrained models (bert-base-cased)
  • added early stopping
  • removed the weight_decay parameter
  • instead of overwriting the labels, just added my custom labels to the pretrained label set
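
For reference, a minimal sketch of one way to pass get_cosine_schedule_with_warmup to the Trainer via its optimizers argument. The warmup and total step counts below are placeholders, and torch.optim.AdamW is used rather than the deprecated transformers.AdamW:

from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup

# Placeholder numbers; the real values depend on dataset size, batch size and epochs.
num_training_steps = 1000
num_warmup_steps = 100

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    compute_metrics=compute_metrics_1,
    optimizers=(optimizer, scheduler),  # overrides the Trainer's default optimizer/scheduler
)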

Any ideas on how to address this issue? I’m quite stuck in this fine-tuning process.

Thanks in advance!

By the way, for this run I intentionally set the train_dataset and eval_dataset to be the same. This is just an initial experiment, so I will definitely change this later.

What am I doing wrong here? Is seqeval the correct metric to use here? I ask because I am getting a user warning from it during evaluation.

If not, what should the metric be?
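
For what it’s worth, seqeval expects lists of tag strings per sequence rather than integer ids. A minimal, self-contained call with toy tags (using the same load_metric already imported above) looks like this:

from datasets import load_metric

metric = load_metric("seqeval")

# Toy example: both arguments are lists of tag-string sequences, one per sentence.
predictions = [["O", "O", "B-CUSTOM", "I-CUSTOM"]]
references  = [["O", "O", "B-CUSTOM", "O"]]

results = metric.compute(predictions=predictions, references=references)
print(results["overall_accuracy"], results["overall_f1"])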

Any ideas from the moderators, admins, or anybody are greatly appreciated.

Update: I modified the compute_metrics function and I am seeing some improvements now. Plus, the warnings are gone. I will monitor this and see if the progress is significant:

import numpy as np

# label_list maps ids back to label strings, in the same order as the model config
label_list = [id2label[i] for i in range(len(id2label))]


def compute_metrics(p):
    metric = load_metric("seqeval")
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
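
As a quick sanity check of this function (toy numbers only, not real model output), it can be called directly on a fake prediction array of shape (batch, seq_len, num_labels):

# Toy batch: 1 sentence, 4 token positions, 3 labels (O, B-CUSTOM, I-CUSTOM).
fake_logits = np.array([[
    [0.9, 0.05, 0.05],   # predicted O
    [0.1, 0.8, 0.1],     # predicted B-CUSTOM
    [0.1, 0.1, 0.8],     # predicted I-CUSTOM
    [0.3, 0.3, 0.4],     # padding position, filtered out below
]])
fake_labels = np.array([[0, 1, 2, -100]])  # -100 marks ignored/padding tokens

print(compute_metrics((fake_logits, fake_labels)))
# expected keys: precision, recall, f1, accuracy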