I'm following this course and got stuck on padding the data with the Data Collator. The error says the maximum recursion depth has been exceeded.
NOTE: I have loaded my own dataset, but that doesn't seem to be the issue.
# source https://huggingface.co/course/chapter7/2?fw=tf
import datasets
from datasets import load_dataset

classes = ["O", "Quantity", "UnitPriceAmount", "GoodsDescription",
           "Incoterms", "GoodsOrigin", "Tolerance", "HSCode"]

# Load the custom dataset with explicit features so "tags" become ClassLabels
dataset = load_dataset("json", data_files='data/dataset_bert.json', features=datasets.Features(
    {
        "id": datasets.Value("string"),
        "tokens": datasets.Sequence(datasets.Value("string")),
        "tags": datasets.Sequence(datasets.features.ClassLabel(names=classes)),
    }))
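As a quick sanity check that the features were picked up (the attribute chain below follows from the Features spec above), the ClassLabel mapping and a raw example can be inspected:

# "tags" should be a Sequence whose inner feature is a ClassLabel
# with the 8 class names defined above
print(dataset["train"].features["tags"].feature.names)
print(dataset["train"][0])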
# LOAD TOKENIZER
from transformers import BertTokenizerFast

# Custom tokenizer built from a local tokenizer.json, with custom special tokens
tokenizer = BertTokenizerFast(
    tokenizer_file="tokenizer/tokenizer.json",
    bos_token="<S>",
    eos_token="</S>",
    unk_token="<UNK>",
    pad_token="<PAD>",
    cls_token="<CLS>",
    sep_token="<SEP>",
    mask_token="<MASK>",
    padding_side="right",
    max_length=300,
)
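One thing I also checked, since the special tokens here are custom (this is an assumption on my part; I don't know whether it's related to the error), is that `<PAD>` actually resolves to an id in the tokenizer.json vocab, because padding needs a valid pad token id:

# If <PAD> is missing from tokenizer/tokenizer.json's vocab,
# pad_token_id may be None or resolve to the unknown token
print(tokenizer.pad_token_id)
print(tokenizer.convert_tokens_to_ids("<PAD>"))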
inputs = tokenizer(dataset["train"][0]["tokens"], is_split_into_words=True)
print(inputs.tokens())
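Since the label alignment below relies on the word-to-token mapping, I also printed `word_ids()`; the exact output depends on my data, so the values shown are only illustrative:

# word_ids() maps each token back to the index of the word it came from
# (None for special tokens); the example output is illustrative only
print(inputs.word_ids())
# e.g. [None, 0, 1, 1, 2, ..., None]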
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels
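A tiny made-up check of the helper (the labels and word_ids are invented for illustration). Note that the `label % 2 == 1` rule comes from the course's B-XXX/I-XXX scheme, so with my flat class list it bumps odd label ids on continuation tokens:

# Made-up example: 2 words with labels [1, 2]; word 0 splits into two tokens.
# Special tokens (word_id None) get -100 so the loss ignores them.
print(align_labels_with_tokens([1, 2], [None, 0, 0, 1, None]))
# [-100, 1, 2, 2, -100]  (second token of word 0 is bumped 1 -> 2
#                         by the B-XXX -> I-XXX rule)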
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
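At this point each example should only contain the tokenizer columns plus labels, still as unpadded Python lists of varying length, which is exactly what the collator is supposed to pad:

# Sequences are unpadded here; lengths differ from example to example
print(tokenized_datasets["train"].column_names)
print(len(tokenized_datasets["train"][0]["labels"]))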
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf"
)

batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]
Alongside the recursion error, the tokenizer (not the compiler) emits a warning suggesting I use `__call__` instead:

"You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding."

Can anyone explain to me how to do that?
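From what I understand, the warning means padding should happen during tokenization via `__call__` rather than in a later `pad` call inside the collator. A sketch of what I think that looks like (my assumption, not verified against my setup; `max_length=300` just mirrors the value I passed when building the tokenizer):

# My reading of the warning: let __call__ do the padding up front
padded = tokenizer(
    dataset["train"][0]["tokens"],
    is_split_into_words=True,
    truncation=True,
    padding="max_length",
    max_length=300,
)
print(len(padded["input_ids"]))  # should be 300 if padding was applied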
Also, a short explanation of why this call ends up in infinite recursion would be much appreciated.