I fintuned bert model for token classification task, and but I want to add a post-processing task to reassemble the generated subtokens (and align the corresponding tags) when testing on a new sentence
For example :
import torch
import numpy as np
from transformers import BertTokenizer, BertConfig
device="cpu"
model = torch.load('path/finetuned_bert.pth')
model=model.to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tag_values = ["O","ORG"]
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}
test_sentence = """McDonald's is a well-known fast food chain"""
tokenized = tokenizer.encode(test_sentence)
input_ids = torch.tensor([tokenized])
with torch.no_grad():
output = model(input_ids)
label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
# join split tokens
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens[1:-1], label_indices[0][1:-1]):
if token.startswith("##"):
new_tokens[-1] = new_tokens[-1] + token[2:]
else:
new_labels.append(tag_values[label_idx])
new_tokens.append(token)
for token, label in zip(new_tokens, new_labels):
print("{}\t{}".format(label, token))
# output :
# ORG mcdonald
# O '
# O s
# O is
# O a
# O well
# O -
# O known
# O fast
# O food
# O chain
I want to get a result similar to this one:
# output :
# ORG McDonald's
# O is
# O a
# O well-known
# O fast
# O food
# O chain
Any help would be appreciated.