BERT Large with BiLSTM-CRF

Hey there,

I’ve been experimenting with a BERT Large + BiLSTM-CRF architecture for token classification. Starting from the token classification example, I adapted it to a BiLSTM-CRF setup, but I’m finding it difficult to understand why the architecture isn’t working. The BiLSTM layer randomly outputs NaN in the first or a later iteration (especially on GPU, less often on CPU), and I can’t figure out why. Could anyone have a look and maybe point me in a direction of where to look, or what might be wrong with my architecture setup?
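
For reference, here is the kind of check I have been adding to narrow down where the NaN first shows up (just a minimal sketch; nan_hook is an illustrative helper and is not part of the script below):

import torch

def nan_hook(module, inputs, output):
    # nn.LSTM returns a tuple (output, (h_n, c_n)); only check the tensor parts
    tensors = output if isinstance(output, tuple) else (output,)
    for t in tensors:
        if isinstance(t, torch.Tensor) and torch.isnan(t).any():
            raise RuntimeError(f"NaN in the output of {module.__class__.__name__}")

# Registering the hook on every submodule shows which layer emits the first NaN:
# for name, module in model.named_modules():
#     module.register_forward_hook(nan_hook)
# torch.autograd.set_detect_anomaly(True) can also help trace NaNs appearing in the backward pass.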

I’m providing a sample code here:

# !pip install transformers datasets evaluate seqeval pytorch-crf
# (the torchcrf module imported below comes from the pytorch-crf package)

from datasets import load_dataset

wnut = load_dataset("wnut_17")

wnut["train"][0]

label_list = wnut["train"].features["ner_tags"].feature.names
label_list

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-large-uncased")

example = wnut["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens


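# Align word-level NER tags to BERT word pieces: only the first piece of each word
# keeps its label; continuation pieces and special tokens get -100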
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

import evaluate

seqeval = evaluate.load("seqeval")

import numpy as np

labels = [label_list[i] for i in example["ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
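    # predictions are the raw emission logits returned by the model;
    # argmax over them (the CRF's Viterbi decode is not applied here)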
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


id2label = {
    0: "O",
    1: "B-corporation",
    2: "I-corporation",
    3: "B-creative-work",
    4: "I-creative-work",
    5: "B-group",
    6: "I-group",
    7: "B-location",
    8: "I-location",
    9: "B-person",
    10: "I-person",
    11: "B-product",
    12: "I-product",
}
label2id = {
    "O": 0,
    "B-corporation": 1,
    "I-corporation": 2,
    "B-creative-work": 3,
    "I-creative-work": 4,
    "B-group": 5,
    "I-group": 6,
    "B-location": 7,
    "I-location": 8,
    "B-person": 9,
    "I-person": 10,
    "B-product": 11,
    "I-product": 12,
}

import numpy as np
import torch
from torch import nn
from torchcrf import CRF
from transformers import (
    BertModel,
    BertPreTrainedModel,
    Trainer,
    TrainingArguments,
)
from transformers.modeling_outputs import TokenClassifierOutput


class BertBiLSTMCRF(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

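        # BERT encoder only; the pooling head is not needed for per-token features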
        self.bert = BertModel(config, add_pooling_layer=False)

        self.dropout = nn.Dropout(
            (
                config.classifier_dropout
                if config.classifier_dropout is not None
                else config.hidden_dropout_prob
            )
        )

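        # BiLSTM hidden size chosen so the concatenated forward/backward output matches config.hidden_size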
        self.bilstm_hidden_size = config.hidden_size // 2
        self.bilstm = nn.LSTM(
            config.hidden_size,
            self.bilstm_hidden_size,
            1,
            batch_first=True,
            bidirectional=True,
        )

        self.classifier = nn.Linear(self.bilstm_hidden_size * 2, config.num_labels)
        self.crf = CRF(num_tags=config.num_labels, batch_first=True)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        sequence_output, (hn, cn) = self.bilstm(sequence_output)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        # fail fast if the BiLSTM/classifier output already contains NaNs
        if torch.isnan(logits).any():
            raise ValueError("NaN detected in logits")

        loss = None

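        # pytorch-crf returns a log-likelihood, so the training loss is its negative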
        if labels is not None:
            log_likelihood = self.crf(
                logits, labels, attention_mask.bool(), reduction="mean"
            )
            loss = -log_likelihood

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


model = BertBiLSTMCRF.from_pretrained(
    "google-bert/bert-large-uncased",
    num_labels=len(id2label.keys()),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
evaluation = trainer.evaluate()

print(evaluation)

Thanks!
