I’m using TFBertForTokenClassification to perform a NER task on this annotated corpus for NER:
https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus.
The problem is that the O labels make up the vast majority of all labels, so the accuracy is quite high simply because the model correctly predicts most of them.
As a result, when I try to predict the labels of a simple sentence, the network predicts only the O label for each token, whereas in several tutorials that use PyTorch (I am using TensorFlow) the predictions look good.
There is probably a problem in my code, but I cannot figure out where it is.
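Counting the tags in the CSV confirms the imbalance (a quick check I ran separately, not part of the training script below):

import pandas as pd
tag_freq = pd.read_csv("../input/entity-annotated-corpus/ner_dataset.csv",
                       sep=",", encoding="latin1")["Tag"].value_counts(normalize=True)
print(tag_freq.head())  # the 'O' tag accounts for the large majority of tokens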
The code is the following:
# Import libraries
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import math
import numpy as np
from transformers import (
    TF2_WEIGHTS_NAME,
    BertConfig,
    BertTokenizer,
    TFBertForTokenClassification,
    create_optimizer)
# Config
MAX_LEN= 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 10
BERT_MODEL = 'bert-base-uncased'
MODEL_PATH = "model.bin"
TRAINING_FILE = "../input/entity-annotated-corpus/ner_dataset.csv"
TOKENIZER = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
# Create the padded input, attention masks, token type and labels
def get_train_data(text, tags):
    tokenized_text = []
    target_tags = []
    for index, token in enumerate(text):
        encoded_token = TOKENIZER.encode(
            token,
            add_special_tokens=False
        )
        encoded_token_len = len(encoded_token)
        tokenized_text.extend(encoded_token)
        target_tags.extend([tags[index]] * encoded_token_len)
    # truncation
    tokenized_text = tokenized_text[: MAX_LEN - 2]
    target_tags = target_tags[: MAX_LEN - 2]
    # [101] = [CLS], [102] = [SEP]
    tokenized_text = [101] + tokenized_text + [102]
    target_tags = [0] + target_tags + [0]
    attention_mask = [1] * len(tokenized_text)
    token_type_ids = [0] * len(tokenized_text)
    # padding
    padding_len = int(MAX_LEN - len(tokenized_text))
    tokenized_text = tokenized_text + ([0] * padding_len)
    target_tags = target_tags + ([0] * padding_len)
    attention_mask = attention_mask + ([0] * padding_len)
    token_type_ids = token_type_ids + ([0] * padding_len)
    return (tokenized_text, target_tags, attention_mask, token_type_ids)
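# Quick sanity check (illustrative only; the words and tag ids below are arbitrary placeholders):
# all four returned sequences should be aligned and padded to MAX_LEN.
sample_ids, sample_tags, sample_mask, sample_types = get_train_data(
    text=["Bob", "lives", "in", "England"], tags=[1, 2, 2, 3])
assert len(sample_ids) == len(sample_tags) == len(sample_mask) == len(sample_types) == MAX_LEN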
# Extract sentences from dataset
class RetrieveSentence(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        function = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(function)
        self.sentences = [s for s in self.grouped]

    def retrieve(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
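# Note: each element of RetrieveSentence(...).sentences ends up as a list of
# (word, POS, tag) tuples for one sentence, e.g. [('Bob', 'NNP', 'B-per'), ('lives', 'VBZ', 'O'), ...]
# (illustrative values, not taken from the dataset).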
# Load dataset and create one hot encoding for labels
df_data = pd.read_csv(TRAINING_FILE,sep=",",encoding="latin1").fillna(method='ffill')
Sentences = RetrieveSentence(df_data)
sentences_list = [" ".join([s[0] for s in sent]) for sent in Sentences.sentences]
labels = [ [s[2] for s in sent] for sent in Sentences.sentences]
tags_2_val = list(set(df_data["Tag"]))
tag_2_idx = {t: i for i, t in enumerate(tags_2_val)}
id_labels = [[tag_2_idx.get(l) for l in lab] for lab in labels]
sentences_list = [sent.split() for sent in sentences_list]
# I removed sentence no. 41770 because it caused index problems
del labels[41770]
del sentences_list[41770]
del id_labels[41770]
encoded_text = []
encoded_labels = []
attention_masks = []
token_type_ids = []
for i in range(len(sentences_list)):
    text, labels, att_mask, tok_type = get_train_data(text=sentences_list[i], tags=id_labels[i])
    encoded_text.append(text)
    encoded_labels.append(labels)
    attention_masks.append(att_mask)
    token_type_ids.append(tok_type)
# Convert from list to np array
encoded_text = np.array(encoded_text)
encoded_labels = np.array(encoded_labels)
attention_masks = np.array(attention_masks)
token_type_ids = np.array(token_type_ids)
# Train Test split
X_train, X_valid, Y_train, Y_valid = train_test_split(encoded_text, encoded_labels, random_state=20, test_size=0.1)
Mask_train, Mask_valid, Token_ids_train, Token_ids_valid = train_test_split(attention_masks,token_type_ids ,random_state=20, test_size=0.1)
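# Illustrative shape check: the two train_test_split calls use the same random_state
# and test_size, so the masks/token type ids stay aligned with the corresponding inputs.
print(X_train.shape, Y_train.shape, Mask_train.shape, Token_ids_train.shape)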
# Aggregate the train and test set, then shuffle and batch the train set
def example_to_features(input_ids, attention_masks, token_type_ids, y):
    return {"input_ids": input_ids,
            "attention_mask": attention_masks,
            "token_type_ids": token_type_ids}, y
train_ds = tf.data.Dataset.from_tensor_slices((X_train, Mask_train, Token_ids_train, Y_train)).map(example_to_features).shuffle(1000).batch(32)
test_ds = tf.data.Dataset.from_tensor_slices((X_valid, Mask_valid, Token_ids_valid, Y_valid)).map(example_to_features).batch(1)
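# Illustrative peek at one training batch to confirm the feature dict layout
# expected by TFBertForTokenClassification (both shapes should be (32, 128)):
for features, batch_labels in train_ds.take(1):
    print(features["input_ids"].shape, batch_labels.shape)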
# Load TFBertForTokenClassification with default config
config = BertConfig.from_pretrained(BERT_MODEL,num_labels=len(tags_2_val))
model = TFBertForTokenClassification.from_pretrained(BERT_MODEL, from_pt=bool(".bin" in BERT_MODEL), config=config)
# Add softmax layer, compute loss, optimizer and fit
model.layers[-1].activation = tf.keras.activations.softmax
model.summary()
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
history = model.fit(train_ds, epochs=3, validation_data=test_ds)
# Prediction. Spoiler: the labels predicted are all O
sentence = "Hi , my name is Bob and I live in England"
inputs = TOKENIZER(sentence, return_tensors="tf")
input_ids = inputs["input_ids"]
inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
output = model(inputs)
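If it helps, this is how I read off the predicted tags: I invert tag_2_idx and take the argmax over the logits (as far as I can tell, with labels passed in and transformers 3.4 the model returns a (loss, logits, ...) tuple, so the logits are the second element):

idx_2_tag = {i: t for t, i in tag_2_idx.items()}
logits = output[1]  # output[0] is the loss because labels were passed
pred_ids = tf.argmax(logits, axis=-1).numpy()[0]
print([idx_2_tag[int(i)] for i in pred_ids])  # every token comes out as 'O'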
The code is executed in a Kaggle notebook.
The transformers library version is 3.4.0.
Many thanks in advance.