As mentioned in the title, fine-tuning dslim/bert-base-NER on a custom dataset with 3 labels (i.e. not using the original labels) only reaches ~0.15 overall accuracy according to the seqeval metrics.
It hits that level in the earlier epochs already, and even after training for up to 500 epochs it does not improve.
Apologies that the next part is quite lengthy, but below is the actual code I used for this fine-tuning:
import torch
import pandas as pd
from ast import literal_eval
from datasets import load_metric
from torch.utils.data import Dataset
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
)
from transformers import (
    AdamW,
    AutoConfig,
    AutoTokenizer,
    AutoModelForTokenClassification,
    EarlyStoppingCallback,
    get_cosine_schedule_with_warmup,
    IntervalStrategy,
    TrainingArguments,
    TrainerCallback,
    Trainer,
)
from torch import nn
model_name = "dslim/bert-base-NER"
# -------------- Define custom dataset
class BlNerDataset(torch.utils.data.Dataset):
    def __init__(self, sentences, labels, tokenizer, labels_list=None):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        if not labels_list:
            # build label2id from whatever labels appear in the data
            self.label2id = {label: i for i, label in enumerate(
                set([label for sentence_labels in labels for label in sentence_labels])
            )}
        else:
            self.label2id = {label: i for i, label in enumerate(labels_list)}
        self.pad_token_label_id = torch.nn.CrossEntropyLoss().ignore_index
        print(self.label2id)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        labels = self.labels[idx]
        inputs = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_offsets_mapping=True,
            padding="max_length",
            max_length=128,
            truncation=True,
        )
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        offset_mapping = inputs["offset_mapping"]
        # Convert labels to label ids
        label_ids = [self.label2id[label] for label in labels]
        # Create padding mask and set label ids of padding tokens to ignore_index
        padding_mask = [1 if x != self.tokenizer.pad_token_id else 0 for x in input_ids]
        label_ids.extend([self.pad_token_label_id] * (len(input_ids) - len(label_ids)))
        return {
            "input_ids": torch.tensor(input_ids),
            "attention_mask": torch.tensor(attention_mask),
            "token_type_ids": torch.tensor(token_type_ids),
            "labels": torch.tensor(label_ids),
            "padding_mask": torch.tensor(padding_mask),
        }
# ----------------- overwrite labels in the config.json of the pretrained model
id2label = {0:'O', 1:'B-CUSTOM', 2:'I-CUSTOM'}
label2id = {'O':0, 'B-CUSTOM':1, 'I-CUSTOM':2}
config = AutoConfig.from_pretrained(model_name)
config.label2id = label2id
config.id2label = id2label
config.num_labels = len(id2label)
# ------------ Load tokenizer and the pre-trained model (with the new label config)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    config=config,
    ignore_mismatched_sizes=True,  # re-initializes the original NER head for the 3 new labels
)
# ------------ Load data from CSV, tokenize sentences, and convert labels to ids
df = pd.read_csv("<custom_data_in_csv>", delimiter=",", header=0)
sentences = df["text"].values.tolist()
labels = df["labels"].values.tolist()
labels = [literal_eval(l) for l in labels]
dataset = BlNerDataset(sentences, labels, tokenizer, labels_list=label2id)
# ---------- evaluation metrics
def compute_metrics_1(pred):
    metric = load_metric("seqeval")
    return metric.compute(predictions=pred.predictions.argmax(-1), references=pred.label_ids)
# ------------- Define training arguments
training_args = TrainingArguments(
    output_dir='./ner_results',
    num_train_epochs=500,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir="./ner_logs",
    log_level="debug",
    logging_steps=10,
    seed=1234,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
    metric_for_best_model='eval_overall_accuracy',
    greater_is_better=True,
    gradient_accumulation_steps=2,
)
# ---------- Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    compute_metrics=compute_metrics_1,
    callbacks=[],
)
trainer.train()
My custom data is in CSV format with text and labels as its two columns. An example row looks like this:
sample sentence. nothing interesting here except i'm stuck, ['O','O','O','O','O','O','B-CUSTOM','I-CUSTOM']
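For clarity, this is how one such row parses (a minimal sketch using the same literal_eval approach as in the code above; there is one word-level tag per whitespace-separated word in the text column):

from ast import literal_eval

row_text = "sample sentence. nothing interesting here except i'm stuck"
row_labels = literal_eval("['O','O','O','O','O','O','B-CUSTOM','I-CUSTOM']")
# one tag per whitespace-separated word in the text column
assert len(row_text.split()) == len(row_labels)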
What I’ve done so far to try to address the issue:
- adjusted the learning rate
- used get_cosine_schedule_with_warmup to make the learning-rate adjustment dynamic (a rough sketch of how I wired it in is below, after this list)
- used other pretrained models (bert-base-cased)
- early stopping
- removed the weight_decay parameter
- instead of overwriting the labels, just extended the pretrained label set with my custom labels
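For reference, the scheduler / early-stopping variant looked roughly like the sketch below. The optimizer settings and step count are illustrative placeholders rather than my exact values, and EarlyStoppingCallback additionally needs load_best_model_at_end=True and metric_for_best_model set in the TrainingArguments:

import torch
from transformers import EarlyStoppingCallback, Trainer, get_cosine_schedule_with_warmup

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# approximate total number of optimizer steps over the whole run
num_training_steps = (len(dataset) // training_args.per_device_train_batch_size) * training_args.num_train_epochs
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=num_training_steps,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    compute_metrics=compute_metrics_1,
    optimizers=(optimizer, scheduler),  # hand the Trainer the custom optimizer/scheduler pair
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],  # stop once metric_for_best_model stops improving
)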
Any ideas on how to address this issue? I'm quite stuck with this fine-tuning process.
Thanks in advance!