And if you’re doing NER and have multiple labels per sentence:
def chunk_examples(examples, *, chunk_size=50):
    """Split each sentence and its label sequence into fixed-size chunks.

    Args:
        examples: A batch mapping with the keys ``"sentence1"`` and
            ``"labels"``. Each value is a sequence with one sliceable entry
            per example (the sentence and its labels, respectively).
        chunk_size: Maximum number of items per chunk. Defaults to 50.

    Returns:
        A dict with two lists: ``"chunk"`` holds the sentence slices and
        ``"labels"`` holds the label slices, both in input order.

    The input batch is not modified.
    """
    chunks = []
    label_chunks = []
    # The per-sentence labels need their own name: reusing the accumulator's
    # name would rebind it on every iteration, so the slices would be appended
    # to the caller's label list and the collected result would be lost.
    for sentence, sentence_labels in zip(examples["sentence1"], examples["labels"]):
        chunks.extend(
            sentence[start:start + chunk_size]
            for start in range(0, len(sentence), chunk_size)
        )
        label_chunks.extend(
            sentence_labels[start:start + chunk_size]
            for start in range(0, len(sentence_labels), chunk_size)
        )
    # NOTE(review): the two output lists only have equal length when every
    # sentence and its label sequence yield the same number of chunks (e.g.
    # one label per sentence item) -- confirm against the dataset schema.
    return {"chunk": chunks, "labels": label_chunks}
# Run chunk_examples over the dataset in batches, passing every existing
# column name to remove_columns.
chunked_dataset = dataset.map(
    chunk_examples,
    batched=True,
    remove_columns=dataset.column_names,
)