I have this code to tokenize and align labels for my BERT NER model:
def align_labels(labs, wids):
    # Map word-level labels onto subword tokens; -100 is ignored by the loss.
    new_labs = []
    current_word = None
    for wid in wids:
        if wid != current_word:
            # First subword of a new word, or a special token (wid is None).
            current_word = wid
            label = -100 if wid is None else labs[wid]
            new_labs.append(label)
        elif wid is None:
            # Consecutive special tokens.
            new_labs.append(-100)
        else:
            # Continuation subword of the same word: repeat its label.
            label = labs[wid]
            new_labs.append(label)
    return new_labs
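
On a toy input (label ids are made up) it does what I expect:

# word ids for a "[CLS] w0 w1a w1b w2 [SEP]"-style encoding; labels made up.
print(align_labels([3, 0, 7], [None, 0, 1, 1, 2, None]))
# -> [-100, 3, 0, 0, 7, -100]
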
def tokenize_and_align_labels(row):
    tokenized = tokenizer(
        row["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=512,
    )
    all_labs = row["lab_id"]
    new_labs = []
    for i, labs in enumerate(all_labs):
        # word_ids(i) maps each subword of example i back to its word index.
        word_ids = tokenized.word_ids(i)
        new_labs.append(align_labels(labs, word_ids))
    tokenized["labels"] = new_labs
    return tokenized
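
I call it like this (assuming a fast tokenizer, which tokenized.word_ids(i) requires, and a datasets.Dataset named dataset with "tokens" and "lab_id" columns; the model name is just an example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # a fast tokenizer

# batched=True because the function enumerates over a batch of examples;
# remove_columns drops the raw columns so only model inputs remain.
tokenized_ds = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset.column_names,
)
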
How can I handle overflowing tokens without losing data? Right now anything past 512 tokens just gets truncated away.
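
I was thinking of something like the sketch below, assuming return_overflowing_tokens and overflow_to_sample_mapping work the way I understand them for fast tokenizers, but I'm not sure the label alignment stays correct across chunks:

def tokenize_and_align_labels_overflow(row):
    # Sketch: keep overflowing tokens as extra chunks instead of dropping
    # them; stride makes chunks overlap so an entity that straddles a
    # boundary appears whole in at least one chunk.
    tokenized = tokenizer(
        row["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
    )
    all_labs = row["lab_id"]
    new_labs = []
    for i in range(len(tokenized["input_ids"])):
        # Each chunk records which original example it came from.
        sample_idx = tokenized["overflow_to_sample_mapping"][i]
        word_ids = tokenized.word_ids(i)
        new_labs.append(align_labels(all_labs[sample_idx], word_ids))
    tokenized["labels"] = new_labs
    return tokenized

Since one input row can now become several output rows, I believe the map call has to stay batched=True with remove_columns set, and tokens inside the stride overlap get labeled in both chunks. Is this the right approach, or is there a cleaner way?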