I want to implement a sliding-window approach while fine-tuning BERT for NER, using the tokenizer's stride and return_overflowing_tokens options, but I'm not sure how to put it together.
I have some initial code, but I keep getting ValueError: expected sequence of length 4079 at dim 1 (got 5846), and the two lengths change on every run.
The code:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer

tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
custom_labels = ['O', 'B-Disease_disorder', 'I-Disease_disorder']
label_encoding_dict = {'O': 0, 'B-Disease_disorder': 1, 'I-Disease_disorder': 2}
model = AutoModelForTokenClassification.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
    num_labels=len(custom_labels),  # three labels; the default head has two
)
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    stride = 200
    window_size = 512

    tokenized_inputs = tokenizer(
        list(examples["token"]),
        truncation=False,
        is_split_into_words=True,
        padding=False,
        max_length=512,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        # manual sliding window: step by `stride`, take `window_size` tokens,
        # and append every window's labels to one flat list per example
        for start in range(0, len(word_ids), stride):
            end = start + window_size
            window_word_ids = word_ids[start:end]
            window_label = label[start:end]
            previous_word_idx = None
            for j, word_idx in enumerate(window_word_ids):
                if j >= len(window_label):
                    break
                if word_idx is None:
                    label_ids.append(-100)
                elif window_label[j] == 'O':
                    label_ids.append(0)
                elif word_idx != previous_word_idx:
                    label_ids.append(label_encoding_dict[window_label[j]])
                else:
                    label_ids.append(label_encoding_dict[window_label[j]] if label_all_tokens else -100)
                previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
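
I suspect the mismatch comes from that flat label_ids list: the tokenizer call returns one row per example, while the manual loop appends the labels of every (overlapping) window onto a single list, so labels[i] ends up longer than input_ids[i] and varies with document length. That is why I'd like to switch to the tokenizer's built-in windowing. Based on my reading of the docs, something like the sketch below is what I have in mind (tokenize_with_windows is just my placeholder name; I haven't verified it):

def tokenize_with_windows(examples):
    # return_overflowing_tokens=True splits each long input into several
    # rows of at most max_length tokens; stride is the overlap between
    # consecutive rows. Requires truncation=True and a fast tokenizer.
    tokenized_inputs = tokenizer(
        list(examples["token"]),
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        stride=200,
        return_overflowing_tokens=True,
    )
    labels = []
    # overflow_to_sample_mapping says which original example each window
    # came from, so the matching ner_tags row can be looked up per window
    for i, sample_idx in enumerate(tokenized_inputs["overflow_to_sample_mapping"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label = examples["ner_tags"][sample_idx]
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # special tokens, ignored by the loss
            else:
                # word_ids index the original word list, so per-word tags can
                # be looked up directly in every overlapping window (this
                # labels all sub-tokens, i.e. label_all_tokens=True behavior)
                label_ids.append(label_encoding_dict[label[word_idx]])
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

If I understand correctly, each labels row then has exactly the same length as its input_ids row, with one row per window.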
# args, data_collator, and compute_metrics are defined elsewhere and omitted here
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()
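
In case it's relevant, I build train_tokenized_datasets / test_tokenized_datasets by mapping the tokenization function over the raw splits. A sketch of what I mean (raw_train / raw_test are placeholders for my un-tokenized splits with token and ner_tags columns):

train_tokenized_datasets = raw_train.map(
    tokenize_with_windows,
    batched=True,
    remove_columns=raw_train.column_names,  # row counts change after windowing
)
test_tokenized_datasets = raw_test.map(
    tokenize_with_windows,
    batched=True,
    remove_columns=raw_test.column_names,
)

My understanding is that remove_columns is needed here because each input row can expand into several windows, so the old columns would no longer line up with the new rows.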