I am fine-tuning a DeBERTa model for PII entity extraction. My expectation was that setting aggregation_strategy="simple" would automatically combine subword tokens into full-word entity spans.
However, for structured entities such as Social Security Numbers (e.g., 123-45-6789) or credit card numbers (e.g., 4356-6789-5467-3456), the pipeline still returns them as separate chunks, even though all of the pieces belong to the same entity class.
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
model_name = "<my_finetuned_deployed_model>"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
nlp = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
text = "My name is John Smith and my SSN is 123-45-6789 and my credit card number is 4356-6789-5467-3456"
results = nlp(text)
print(results)
Output:
{'entity_group': 'NAME', 'score': 0.9989, 'word': 'John', 'start': 11, 'end': 15}
{'entity_group': 'NAME', 'score': 0.9982, 'word': 'Smith', 'start': 16, 'end': 21}
{'entity_group': 'US_SOCIAL_SECURITY_NUMBER', 'score': 0.9778, 'word': '123', 'start': 37, 'end': 40}
{'entity_group': 'US_SOCIAL_SECURITY_NUMBER', 'score': 0.9764, 'word': '-', 'start': 40, 'end': 41}
{'entity_group': 'US_SOCIAL_SECURITY_NUMBER', 'score': 0.9779, 'word': '45', 'start': 41, 'end': 43}
{'entity_group': 'US_SOCIAL_SECURITY_NUMBER', 'score': 0.9743, 'word': '-', 'start': 43, 'end': 44}
{'entity_group': 'US_SOCIAL_SECURITY_NUMBER', 'score': 0.9758, 'word': '6789', 'start': 44, 'end': 48}
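For reference, these are the other aggregation strategies the pipeline accepts (per the transformers documentation); only the flag changes in the snippet below:
# For comparison: the other aggregation strategies the pipeline supports.
# Only the flag changes; "none" returns the raw per-token predictions.
for strategy in ("none", "simple", "first", "average", "max"):
    alt_nlp = pipeline("token-classification", model=model, tokenizer=tokenizer,
                       aggregation_strategy=strategy)
    print(strategy, alt_nlp(text))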
I have added post-processing logic to combine the subwords into words, as follows:
def merge_tokens(ner_results):
    entities = {}
    for entity in ner_results:
        entity_type = entity["entity_group"]
        entity_value = entity["word"].replace("##", "")  # Remove subword prefixes

        # Handle token merging
        if entity_type not in entities:
            entities[entity_type] = []
        if entities[entity_type] and not entity_value.startswith(" "):
            # If the previous token exists and this one isn't a new word, merge it
            entities[entity_type][-1] += entity_value
        else:
            entities[entity_type].append(entity_value)
    return entities
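I invoke it directly on the pipeline output, roughly like this (the text that produced the output below is a longer sample from my test set, not shown here):
# Rough usage sketch: feed the aggregated pipeline results into merge_tokens
# and print one merged string per entity type.
results = nlp(sample_text)  # sample_text: one of my test transcripts (not shown here)
merged = merge_tokens(results)
for entity_type, values in merged.items():
    print(f"{entity_type}: {', '.join(values)}")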
Output:
NER Results for Example 1:
NAME: Mia Thompson
BANK-ACCOUNT-NO: 4893172051
BANK-ROUTING-NO: 192847561
PHONE-NO: 727-814-3902
Is adding post-processing logic to combine tokens the right approach when aggregation_strategy does not work, or does it disrupt the intended flow of the transformer pipeline? I would rather find the underlying issue, fix it, and then redeploy the model. I built a custom dataset and converted it to the Hugging Face format as follows:
label_map = {
"NAME": 1, "ADDRESS": 2, "PHONE-NO": 3,
"BANK-ACCOUNT-NO": 4, "BANK-ROUTING-NO": 5,
"CREDIT-CARD-NO": 6, "SSN": 7
}
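For context, each raw item in data looks roughly like this (a made-up example; the real list is much longer, but every annotation is a character-level span plus a label name):
# Hypothetical raw annotation item, only to illustrate the format that
# convert_to_hf_format() below expects: "label" holds
# (char_start, char_end, label_name) tuples over the raw text.
data = [
    {
        "text": "my name is sophia thompson and my ssn is 778353369",
        "label": [
            (11, 26, "NAME"),  # character span of "sophia thompson"
            (41, 50, "SSN"),   # character span of "778353369"
        ],
    },
]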
from nltk.tokenize import word_tokenize
from datasets import Dataset

def convert_to_hf_format(data):
    examples = {"tokens": [], "ner_tags": []}
    for idx, item in enumerate(data):
        text = item["text"]
        tokens = word_tokenize(text)  # Tokenize using nltk
        labels = [0] * len(tokens)    # Default label (0 = non-entity "O")

        # Track the character span of each token in the original text
        token_spans = []
        char_start = 0
        for token in tokens:
            start_index = text.find(token, char_start)
            end_index = start_index + len(token)
            token_spans.append((start_index, end_index))
            char_start = end_index  # Move forward

        # Assign a label to every token that overlaps an annotated span
        for start, end, label in item["label"]:
            for i, (token_start, token_end) in enumerate(token_spans):
                if (token_start >= start and token_end <= end) or (token_start < end and token_end > start):
                    labels[i] = label_map.get(label, 0)

        # Convert the integer labels to BIO tags (convert_to_bio is defined elsewhere)
        bio_labels = convert_to_bio(text, labels)
        examples["tokens"].append(tokens)
        examples["ner_tags"].append(bio_labels)

        # Debugging: print one sample (the tenth item)
        if idx == 9:
            print("\nSample Tokenized Text:", tokens)
            print("\nSample Token-Label Pair:")
            for token, tag in zip(tokens, bio_labels):
                print(f"{token}: {tag}")

    return Dataset.from_dict(examples)
# Convert and debug dataset
dataset = convert_to_hf_format(data)
Output:
Sample Tokenized Text: ['hello', ',', 'my', 'name', 'is', 'sophia', 'thompson', '.', 'i', "'m", 'contacting', 'you', 'about', 'my', 'recent', 'tax', 'return', '.', 'i', 'received', 'a', 'letter', 'indicating', 'a', 'problem', 'with', 'my', 'social', 'security', 'number', 'which', 'is', '778353369.', 'please', 'reach', 'me', 'at', '9565835742', 'or', 'mail', 'me', 'at', '337-dario-fallsuite-518-hartford', ',', 'ct', '06890.', 'i', 'would', 'greatly', 'appreciate', 'your', 'assistance', 'in', 'clarifying', 'this', 'matter', '.', 'you']
Sample Token-Label Pair:
hello: O
,: O
my: O
name: O
is: O
sophia: B-NAME
thompson: I-NAME
.: O
i: O
'm: O
contacting: O
you: O
about: O
my: O
recent: O
tax: O
return: O
.: O
i: O
received: O
a: O
...
this: O
matter: O
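To double-check the character-span logic above, I also run a quick sanity check (my own rough sketch): it simply confirms that every span found with text.find() slices back to the token, since nltk occasionally rewrites tokens (e.g., quotes) and find() then returns -1.
# Sanity check for the span logic in convert_to_hf_format():
# every token located via text.find() should slice back to itself.
from nltk.tokenize import word_tokenize

for item in data[:5]:
    text = item["text"]
    char_start = 0
    for token in word_tokenize(text):
        start = text.find(token, char_start)
        if start == -1 or text[start:start + len(token)] != token:
            print(f"Span mismatch for token {token!r} in: {text[:40]}...")
        else:
            char_start = start + len(token)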
This is how I align the tokens with the labels; it is the same logic as in the Hugging Face token-classification notebook:
label_all_tokens = True

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens get the ignore index
                label_ids.append(-100)
            elif word_idx != previous_word_idx and word_idx < len(label):
                # First subword of a word: take that word's BIO label
                # (bio_label_map maps BIO tag strings to integer ids; defined elsewhere)
                label_ids.append(bio_label_map.get(label[word_idx], -100))
            else:
                # Remaining subwords: repeat the word's label or ignore, depending on label_all_tokens
                label_ids.append(bio_label_map.get(label[word_idx], -100) if label_all_tokens and word_idx < len(label) else -100)
            previous_word_idx = word_idx

        # Only print debug info for the first 3 examples
        if i < 3:
            print(f"Processing example {i}")
            print(f"Tokens: {examples['tokens'][i]}")
            print(f"Tokenized input IDs: {tokenized_inputs['input_ids'][i]}")
            print(f"Word IDs: {word_ids}")
            print(f"Original Labels: {label}")
            print(f"Aligned Labels: {label_ids}")
            print(f"Length of input IDs: {len(tokenized_inputs['input_ids'][i])}, Length of Labels: {len(label_ids)}\n")

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs
# Apply tokenization
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
Output:
Tokens: ['hello', ',', 'my', 'name', 'is', 'benjamin', 'carter', '.', 'i', "'m", 'contacting', 'you', 'about', 'an', 'issue', 'with', 'my', 'tax', 'return', 'from', 'last', 'year', '.', 'there', 'seems', 'to', 'be', 'a', 'problem', 'with', 'my', 'bank', 'account', 'number', 'for', '873153717', ',', 'and', 'i', 'believe', 'my', 'social', 'security', 'number', '589-90-4308', 'is', 'incorrect', 'in', 'your', 'records', '.', 'i', "'ve", 'already', 'attempted', 'to', 'resolve', 'this', 'issue', 'online', ',', 'but', 'did', "n't", 'receive', 'a', 'response', '.', 'additionally', ',', 'this', 'delay', 'has', 'caused', 'me', 'to', 'miss', 'the', 'filing', 'deadline', ',', 'which', 'could', 'result', 'in', 'penalties', '.', 'please', 'verify', 'the', 'information', 'and', 'reach', 'out', 'to', 'me', 'at', '416-557-3342.', 'thank', 'you', 'for', 'your', 'help', 'in', 'resolving', 'this', 'matter', 'quickly', '.']
Tokenized input IDs: [1, 124394, 260, 262, 1038, 6536, 340, 260, 237138, 13072, 287, 260, 261, 260, 267, 260, 278, 283, 4712, 348, 522, 1389, 462, 13798, 515, 1038, 11578, 4650, 703, 3168, 3722, 260, 261, 2109, 12123, 264, 289, 391, 260, 263, 3092, 515, 1038, 4897, 9495, 4404, 333, 260, 42684, 172244, 1654, 260, 262, 306, 260, 267, 260, 14038, 1038, 2944, 20318, 4404, 149469, 37658, 59689, 2839, 340, 282, 64778, 282, 773, 260, 51972, 260, 261, 260, 267, 260, 278, 858, 260, 13988, 345, 22335, 346, 289, 77294, 715, 13798, 863, 260, 262, 1157, 3032, 260, 273, 278, 271, 11244, 260, 263, 17673, 260, 261, 260, 14897, 485, 260, 262, 715, 260, 41753, 1071, 26472, 346, 417, 289, 11410, 288, 4836, 348, 260, 99970, 260, 262, 260, 1543, 260, 3660, 8107, 282, 19209, 3306, 260, 261, 10152, 205169, 277, 288, 2884, 306, 260, 11126, 1351, 289, 417, 345, 99459, 46263, 176075, 96226, 31638, 522, 333, 773, 2115, 282, 126922, 348, 715, 20983, 13820, 485, 260, 261, 2]
Word IDs: [None, 0, 1, 1, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 22, 23, 24, 24, 25, 26, 27, 27, 28, 29, 30, 31, 32, 33, 34, 35, 35, 35, 35, 36, 36, 37, 38, 38, 39, 39, 40, 41, 42, 43, 44, 44, 44, 44, 45, 46, 46, 47, 48, 49, 49, 50, 50, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 63, 63, 63, 64, 65, 65, 66, 67, 67, 68, 68, 68, 69, 69, 70, 71, 71, 72, 73, 73, 74, 75, 76, 77, 78, 78, 79, 79, 80, 80, 81, 81, 82, 82, 83, 84, 85, 85, 86, 86, 87, 88, 88, 89, 90, 91, 92, 92, 93, 94, 95, 96, 97, 97, 97, 97, 98, 99, 100, 101, 102, 103, 104, 104, 105, 106, 107, 107, 108, 108, None]
Original Labels: ['O', 'O', 'O', 'O', 'O', 'B-NAME', 'I-NAME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-BANK-ACCOUNT-NO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-SSN', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PHONE-NO', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Aligned Labels: [-100, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 13, 13, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]
Length of input IDs: 166, Length of Labels: 166
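Finally, this is how I spot-check the aligned labels by mapping the ids back to tag names (assuming id2label is simply the inverse of bio_label_map; the variable name is mine):
# Spot-check: decode one aligned example back into (subword, tag) pairs.
# id2label is assumed to be the inverse of the bio_label_map used above.
id2label = {v: k for k, v in bio_label_map.items()}

sample = tokenized_dataset[0]
for token_id, label_id in zip(sample["input_ids"], sample["labels"]):
    piece = tokenizer.convert_ids_to_tokens(token_id)
    print(f"{piece}: {id2label.get(label_id, 'IGNORED(-100)')}")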