TFBertForTokenClassification scoring only O labels on a NER task

I’m using TFBertForTokenClassification to perform a NER task on the annotated corpus for NER:
https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus.
The problem is that the O labels are the large majority of all labels, so the accuracy is quite high simply because the model predicts most of them correctly.
As a result, when I try to predict the labels of a simple sentence, the network predicts only the O label for each token. However, in several tutorials that use PyTorch (I am using TensorFlow), the predictions are good.
There is probably a problem in my code, but I cannot figure out where it is.
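For reference, this is roughly how I checked how skewed the tags are (a quick sketch against the same ner_dataset.csv that is loaded below):

# Rough sketch: inspect the tag distribution of ner_dataset.csv
import pandas as pd

df = pd.read_csv("../input/entity-annotated-corpus/ner_dataset.csv",
                 sep=",", encoding="latin1").fillna(method='ffill')
print(df["Tag"].value_counts(normalize=True))  # the "O" tag dwarfs every other tag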

The code is the following:

# Import libraries
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import math
import numpy as np

from transformers import (
    TF2_WEIGHTS_NAME,
    BertConfig,
    BertTokenizer,
    TFBertForTokenClassification,
    create_optimizer)
# Config
MAX_LEN= 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 10
BERT_MODEL = 'bert-base-uncased'
MODEL_PATH = "model.bin"
TRAINING_FILE = "../input/entity-annotated-corpus/ner_dataset.csv"
TOKENIZER = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
# Create the padded input, attention masks, token type and labels
def get_train_data(text, tags):

    tokenized_text = []
    target_tags = []

    for index, token in enumerate(text):

        encoded_token = TOKENIZER.encode(
            token,
            add_special_tokens = False
        )

        encoded_token_len = len(encoded_token)

        tokenized_text.extend(encoded_token)
        target_tags.extend([tags[index]] * encoded_token_len)

    #truncation
    tokenized_text = tokenized_text[: MAX_LEN - 2]
    target_tags = target_tags[: MAX_LEN - 2]

    #[101] = [CLS] , [102] = [SEP]
    tokenized_text = [101] + tokenized_text + [102]
    target_tags = [0] + target_tags + [0]
    attention_mask = [1] * len(tokenized_text)
    token_type_ids = [0] * len(tokenized_text)

    #padding
    padding_len = int(MAX_LEN - len(tokenized_text))

    tokenized_text = tokenized_text + ([0] * padding_len)
    target_tags = target_tags + ([0] * padding_len)
    attention_mask = attention_mask + ([0] * padding_len)
    token_type_ids = token_type_ids + ([0] * padding_len)

    return (tokenized_text, target_tags, attention_mask,  token_type_ids)
# Extract sentences from dataset
class RetrieveSentence(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        function = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(function)
        self.sentences = [s for s in self.grouped]
    
    def retrieve(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
# Load dataset and create one hot encoding for labels
df_data = pd.read_csv(TRAINING_FILE,sep=",",encoding="latin1").fillna(method='ffill')
Sentences = RetrieveSentence(df_data)

sentences_list = [" ".join([s[0] for s in sent]) for sent in Sentences.sentences]
labels = [ [s[2] for s in sent] for sent in Sentences.sentences]

tags_2_val = list(set(df_data["Tag"]))
tag_2_idx = {t: i for i, t in enumerate(tags_2_val)}

id_labels = [[tag_2_idx.get(l) for l in lab] for lab in labels]
sentences_list = [sent.split() for sent in sentences_list]

# I removed sentence no. 41770 because it gave index problems
del labels[41770]
del sentences_list[41770]
del id_labels[41770]
encoded_text = []
encoded_labels = []
attention_masks = []
token_type_ids = []

for i in range(len(sentences_list)):

    text, label_ids, att_mask, tok_type = get_train_data(text=sentences_list[i], tags=id_labels[i])
    encoded_text.append(text)
    encoded_labels.append(label_ids)
    attention_masks.append(att_mask)
    token_type_ids.append(tok_type)
# Convert from list to np array
encoded_text = np.array(encoded_text)
encoded_labels = np.array(encoded_labels)
attention_masks = np.array(attention_masks)
token_type_ids = np.array(token_type_ids)
# Train Test split
X_train, X_valid, Y_train, Y_valid = train_test_split(encoded_text, encoded_labels, random_state=20, test_size=0.1)
Mask_train, Mask_valid, Token_ids_train, Token_ids_valid = train_test_split(attention_masks,token_type_ids ,random_state=20, test_size=0.1)
# Aggregate the train and test set, then shuffle and batch the train set
def example_to_features(input_ids,attention_masks,token_type_ids,y):
  return {"input_ids": input_ids,
          "attention_mask": attention_masks,
          "token_type_ids": token_type_ids},y

train_ds = tf.data.Dataset.from_tensor_slices((X_train,Mask_train,Token_ids_train,Y_train)).map(example_to_features).shuffle(1000).batch(32)
test_ds=tf.data.Dataset.from_tensor_slices((X_valid,Mask_valid,Token_ids_valid,Y_valid)).map(example_to_features).batch(1)
# Load TFBertForTokenClassification with default config
config = BertConfig.from_pretrained(BERT_MODEL,num_labels=len(tags_2_val))
model = TFBertForTokenClassification.from_pretrained(BERT_MODEL, from_pt=bool(".bin" in BERT_MODEL), config=config)
# Add softmax layer, compute loss, optimizer and fit
model.layers[-1].activation = tf.keras.activations.softmax
model.summary()
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
history = model.fit(train_ds, epochs=3, validation_data=test_ds)
# Prediction. Spoiler: the label predicted are O-Label
sentence = "Hi , my name is Bob and I live in England"
inputs = TOKENIZER(sentence, return_tensors="tf")
input_ids = inputs["input_ids"]
inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
output = model(inputs)
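For completeness, this is roughly how I read off the predicted tags (a minimal sketch; idx_2_tag is just the inverse of the tag_2_idx mapping above):

# Rough sketch: decode the model output into tag names (every token comes out as "O")
idx_2_tag = {i: t for t, i in tag_2_idx.items()}
logits = model(TOKENIZER(sentence, return_tensors="tf"))[0]   # shape (1, seq_len, num_labels)
pred_ids = tf.argmax(logits, axis=-1).numpy()[0]
tokens = TOKENIZER.convert_ids_to_tokens(input_ids.numpy()[0])
print(list(zip(tokens, [idx_2_tag[i] for i in pred_ids])))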

The code is executed in a Kaggle notebook.
The transformers library version is 3.4.0.
Many thanks in advance.

I’m trying to train a similar model and I am getting the same problem. It does work for me, however, with a ReLU activation on the last classification layer instead of softmax, and a smaller learning rate: optimizer = keras.optimizers.Adam(learning_rate=3e-5).

I’m not sure why it isn’t working with softmax. FYI, here’s the model I’m using:

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from transformers import TFDistilBertModel

ids_input = keras.Input(shape=(max_tokens,), dtype=np.int32)
attention_mask_input = keras.Input(shape=(max_tokens,), dtype=np.int32)
bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
dropout = layers.Dropout(0.1)
token_classifier_layer = layers.Dense(num_labels, activation="relu")

# Run DistilBERT, then classify each token's hidden state
bert_output = bert_model({'input_ids': ids_input, 'attention_mask': attention_mask_input}, return_dict=True)
x = dropout(bert_output['last_hidden_state'])
x = token_classifier_layer(x)
word_classifier_model = keras.Model(inputs=[ids_input, attention_mask_input], outputs=x)

optimizer = keras.optimizers.Adam(learning_rate=3e-5)
word_classifier_model.compile(optimizer=optimizer, loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

word_classifier_model.fit(x=[input_ids, attention_mask], y=model_labels, epochs=config.epochs, batch_size=config.batch_size, validation_split=0.2)

word_classifier_model.save_weights(config.model_file_path)

Hi guys, if BERT predicts only one class, there must be something wrong with your training pipeline (e.g. check your data and also check your predictions on the training set).
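For example, something along these lines (a rough sketch, reusing the variable names from the original post) will show whether the model has already collapsed onto a single class on the training data:

# Rough sketch: compare predicted vs. true label ids on one training batch
for batch_inputs, batch_labels in train_ds.take(1):
    batch_logits = model(batch_inputs, training=False)[0]
    batch_preds = tf.argmax(batch_logits, axis=-1)
    print("predicted ids:", np.unique(batch_preds.numpy(), return_counts=True))
    print("true ids:     ", np.unique(batch_labels.numpy(), return_counts=True))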

BTW, how about directly using pretrained NER weights (you may have to use the from_pt=True argument), or fine-tuning from them in your TF code? This should be a great baseline out of the box.
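For instance, something like this (a sketch; dslim/bert-base-NER is just one publicly available BERT NER checkpoint, and from_pt=True is only needed when the checkpoint ships PyTorch weights only):

# Sketch: start from an already fine-tuned NER checkpoint instead of bert-base-uncased
from transformers import BertTokenizer, TFBertForTokenClassification

ner_tokenizer = BertTokenizer.from_pretrained("dslim/bert-base-NER")
ner_model = TFBertForTokenClassification.from_pretrained("dslim/bert-base-NER", from_pt=True)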


Thanks @Jung. I’m still working on fine-tuning my own model, but I tried dslim/bert-base-NER and it’s working great!
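In case it helps, this is roughly what I ran to sanity-check it (a quick sketch using the built-in NER pipeline):

# Quick sketch: run the pretrained checkpoint through the pipeline API
from transformers import pipeline

ner = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER")
print(ner("Hi, my name is Bob and I live in England"))  # should tag Bob as PER and England as LOC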


Thanks @robianmcd for sharing your solution. I will try to follow your suggestions.
Thanks @Jung, this could be a valid solution too :slight_smile:

Hi folks,

Spent way too much time on this, but as a hint, check the following:

  • Are you using padding with -100? Then be sure you mask it in the loss and the metric!
    model.fit does not do this for you automatically, whereas TFTrainer with TFTokenClassificationLoss does!
  • Be sure to check the activation of the last layer: by default it is linear, which means you need to use (Sparse)CategoricalCrossentropy(from_logits=True); but then your accuracy scores will be messed up by default!
  • Essentially, align your final activation to be softmax, use from_logits=False, and write custom loss & metric wrappers that mask out padding & special tokens, for example:
# shape_list is a small helper from transformers (transformers.modeling_tf_utils in v3.x)
from transformers.modeling_tf_utils import shape_list

def sparse_categorical_accuracy_masked(y_true, y_pred):
    mask_value = -100
    active_loss = tf.reshape(y_true, (-1,)) != mask_value
    reduced_logits = tf.boolean_mask(tf.reshape(y_pred, (-1, shape_list(y_pred)[2])), active_loss)
    y_true = tf.boolean_mask(tf.reshape(y_true, (-1,)), active_loss)
    reduced_logits = tf.cast(tf.argmax(reduced_logits, axis=-1), tf.keras.backend.floatx())
    equality = tf.equal(y_true, reduced_logits)
    return tf.reduce_mean(tf.cast(equality, tf.keras.backend.floatx()))

def sparse_crossentropy_masked(y_true, y_pred):
    mask_value = -100
    y_true_masked = tf.boolean_mask(y_true, tf.not_equal(y_true, mask_value))
    y_pred_masked = tf.boolean_mask(y_pred, tf.not_equal(y_true, mask_value))
    return tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(y_true_masked, y_pred_masked))
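Wiring it up then looks roughly like this (a sketch, reusing the model and datasets from the original post; it assumes the labels were padded with -100 at the [CLS], [SEP] and padding positions):

# Sketch: softmax on the last layer, from_logits=False, and the masked wrappers above
model.layers[-1].activation = tf.keras.activations.softmax
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss=sparse_crossentropy_masked,
    metrics=[sparse_categorical_accuracy_masked],
)
model.fit(train_ds, epochs=3, validation_data=test_ds)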