Hey there,
I’ve been experimenting with BERT Large and a BiLSTM-CRF architecture for token classification. Starting from the token classification example, I adapted it to a BiLSTM-CRF head, but I’m finding it difficult to understand why the architecture isn’t working: the BiLSTM layer randomly outputs NaN in the first or a later iteration (especially on GPU, less often on CPU), and I can’t figure out why. Could anyone have a look and maybe point me toward what might be wrong with my setup?
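So far my only idea for localising it is to register a NaN-checking forward hook on every submodule and turn on autograd anomaly detection, roughly like the sketch below (debugging only, nothing here is part of the model; on GPU, running with CUDA_LAUNCH_BLOCKING=1 should also make the stack traces point at the right kernel):

import torch

# Debugging sketch only: flag the first submodule whose output contains NaN/Inf.
# add_nan_hooks(model) would be called right after from_pretrained further down.
torch.autograd.set_detect_anomaly(True)  # slower, but names the op that produced NaN in backward

def add_nan_hooks(model):
    def make_hook(name):
        def hook(module, inputs, output):
            tensors = output if isinstance(output, (tuple, list)) else (output,)
            for t in tensors:
                if torch.is_tensor(t) and t.is_floating_point() and not torch.isfinite(t).all():
                    raise RuntimeError(f"Non-finite values in output of {name}")
        return hook
    for name, module in model.named_modules():
        module.register_forward_hook(make_hook(name))

That should at least tell me which layer misbehaves first, but not why.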
I’m providing sample code here:
# !pip install transformers datasets evaluate seqeval pytorch-crf  # pytorch-crf provides the torchcrf module used below
from datasets import load_dataset
wnut = load_dataset("wnut_17")
wnut["train"][0]
label_list = wnut["train"].features["ner_tags"].feature.names
label_list
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-large-uncased")
example = wnut["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens get -100 so they are ignored by the loss
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Only the first sub-token of each word keeps the word's label
                label_ids.append(label[word_idx])
            else:
                # Remaining sub-tokens of the same word also get -100
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
import evaluate
seqeval = evaluate.load("seqeval")
import numpy as np
labels = [label_list[i] for i in example["ner_tags"]]  # quick sanity check on the first example
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Drop the positions labelled -100 before handing everything to seqeval
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
id2label = {
    0: "O",
    1: "B-corporation",
    2: "I-corporation",
    3: "B-creative-work",
    4: "I-creative-work",
    5: "B-group",
    6: "I-group",
    7: "B-location",
    8: "I-location",
    9: "B-person",
    10: "I-person",
    11: "B-product",
    12: "I-product",
}
label2id = {
    "O": 0,
    "B-corporation": 1,
    "I-corporation": 2,
    "B-creative-work": 3,
    "I-creative-work": 4,
    "B-group": 5,
    "I-group": 6,
    "B-location": 7,
    "I-location": 8,
    "B-person": 9,
    "I-person": 10,
    "B-product": 11,
    "I-product": 12,
}
import numpy as np
import torch
from torch import nn
from torchcrf import CRF
from transformers import (
    AutoModelForTokenClassification,
    BertModel,
    BertPreTrainedModel,
    Trainer,
    TrainingArguments,
)
from transformers.modeling_outputs import TokenClassifierOutput
class BertBiLSTMCRF(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(
            config.classifier_dropout
            if config.classifier_dropout is not None
            else config.hidden_dropout_prob
        )
        # hidden_size // 2 per direction, so the BiLSTM output width matches hidden_size
        self.bilstm_hidden_size = config.hidden_size // 2
        self.bilstm = nn.LSTM(
            config.hidden_size,
            self.bilstm_hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        self.classifier = nn.Linear(self.bilstm_hidden_size * 2, config.num_labels)
        self.crf = CRF(num_tags=config.num_labels, batch_first=True)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        sequence_output, (hn, cn) = self.bilstm(sequence_output)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        # Guard I added while debugging: the logits intermittently contain NaN here
        if torch.isnan(logits).any():
            raise ValueError("NaN detected in logits after the BiLSTM/classifier")
        loss = None
        if labels is not None:
            log_likelihood = self.crf(
                logits, labels, attention_mask.bool(), reduction="mean"
            )
            loss = -log_likelihood
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
model = BertBiLSTMCRF.from_pretrained(
    "google-bert/bert-large-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
evaluation = trainer.evaluate()
print(evaluation)
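In case it matters, this is the kind of single-step repro I run outside the Trainer when chasing the NaNs (just a sketch; the dropped columns are the raw WNUT ones, everything else comes from the code above):

from torch.utils.data import DataLoader

# Single-batch forward/backward outside the Trainer (debugging sketch only)
debug_ds = tokenized_wnut["train"].remove_columns(["id", "tokens", "ner_tags"])
loader = DataLoader(debug_ds, batch_size=4, collate_fn=data_collator)
batch = next(iter(loader))
batch = {k: v.to(model.device) for k, v in batch.items()}

out = model(**batch)
print(out.loss, torch.isnan(out.logits).any())
out.loss.backward()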
Thanks!