Tokenizer is not defined

I have tried everything but I keep getting a tokenizer error. Here is my simple code:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, BertTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModelForCausalLM

context_dataset = Dataset.from_dict({
    "text": ["context sentence 1", "context sentence 2", "context sentence 3"]
})
print(context_dataset)

# Load a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
print(tokenizer)

# Tokenize your dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = context_dataset.map(tokenize_function, batched=True, num_proc=3, remove_columns=["text"])


The reason for the error is that tokenizer is not visible inside tokenize_function when map runs it with num_proc > 1: the batches are processed in separate worker processes, which do not see the globals defined in your notebook session.
A simple fix is to create the tokenizer instance inside tokenize_function, but that is inefficient because a new tokenizer is instantiated every time the function is called. Another simple fix would be to add a tokenizer argument to tokenize_function, but the function passed to Datasets' map method receives only the batch of examples as its positional argument.
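
For reference, that first (inefficient) option looks like the sketch below, using the dataset and checkpoint from the question. It avoids the NameError because the tokenizer is built inside the function itself, at the cost of reloading it for every batch:

def tokenize_function(examples):
    # Created inside the function, so the worker processes do not need any globals.
    # Inefficient: the tokenizer is reloaded for every batch.
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = context_dataset.map(tokenize_function, batched=True, num_proc=3, remove_columns=["text"])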

There are two cleaner ways to solve this problem, described below.

Solution 1. Use a TokenizerWrapper class

from transformers import AutoTokenizer


class TokenizerWrapper:
    """Holds the tokenizer so it travels with the mapping function to each worker."""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def tokenize_function(self, examples):
        return self.tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
        )


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
tokenizer_wrapper = TokenizerWrapper(tokenizer)
tokenized_dataset = context_dataset.map(tokenizer_wrapper.tokenize_function, batched=True, num_proc=3, remove_columns=["text"])

Solution 2. Use a partial function

from functools import partial

from transformers import AutoTokenizer


def tokenize_function(tokenizer, examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
partial_tokenize_function = partial(tokenize_function, tokenizer)
tokenized_dataset = context_dataset.map(partial_tokenize_function, batched=True, num_proc=3, remove_columns=["text"])
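
For completeness, Datasets' map also accepts a fn_kwargs argument that forwards extra keyword arguments to the mapping function, so the tokenizer can be passed in without a wrapper or a partial. A minimal sketch, again using the dataset and checkpoint from the question (check that your installed datasets version supports fn_kwargs):

from transformers import AutoTokenizer


def tokenize_function(examples, tokenizer):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
tokenized_dataset = context_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=3,
    remove_columns=["text"],
    fn_kwargs={"tokenizer": tokenizer},
)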

Neither of these methods works for me.

from datasets import load_dataset

eli5 = load_dataset("eli5", split="train_asks[:500]")
eli5 = eli5.train_test_split(test_size=0.2)
eli5 = eli5.flatten()

class TokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def tokenize_function(self, examples):
        '''Cut off more than 1024 tokens; this is a model restriction.'''
        return tokenizer([" ".join(x[:1024]) for x in examples["answers.text"]])

from transformers import AutoTokenizer
checkpoint = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(tokenizer)
tokenizer_wrapper = TokenizerWrapper(tokenizer)
tokenized_eli5 = eli5.map(tokenizer_wrapper.tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,)

This returns:
Using sep_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True), added_tokens_decoder={
50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
Map (num_proc=4): 0% 0/400 [00:02<?, ? examples/s]

RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\satyr\anaconda3\lib\site-packages\multiprocess\pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "C:\Users\satyr\anaconda3\lib\site-packages\datasets\utils\py_utils.py", line 1354, in _write_generator_to_queue
for i, result in enumerate(func(**kwargs)):
File "C:\Users\satyr\anaconda3\lib\site-packages\datasets\arrow_dataset.py", line 3474, in _map_single
batch = apply_function_on_filtered_inputs(
File "C:\Users\satyr\anaconda3\lib\site-packages\datasets\arrow_dataset.py", line 3353, in apply_function_on_filtered_inputs
processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
File "C:\Users\satyr\AppData\Local\Temp\ipykernel_7500\3052597454.py", line 13, in tokenize_function
NameError: name 'tokenizer' is not defined
"""

from datasets import load_dataset

eli5 = load_dataset("eli5", split="train_asks[:500]")
eli5 = eli5.train_test_split(test_size=0.2)
eli5 = eli5.flatten()


from functools import partial
from transformers import AutoTokenizer
def tokenize_function(tokenizer, examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
checkpoint = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(tokenizer)
partial_tokenize_function = partial(tokenize_function, tokenizer)
tokenized_eli5 = eli5.map(tokenizer_wrapper.tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,)

This produces exactly the same tokenizer warnings and the same NameError: name 'tokenizer' is not defined traceback as above.

The only method I have found is to encapsulate everything inside the mapping function:

from datasets import load_dataset

eli5 = load_dataset("eli5", split="train_asks[:500]")
eli5 = eli5.train_test_split(test_size=0.2)
eli5 = eli5.flatten()

def tokenize_function(examples):
    from transformers import AutoTokenizer
    checkpoint = "distilgpt2"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return tokenizer([" ".join(x) for x in examples["answers.text"]])

tokenized_eli5 = eli5.map(tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,)
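
If reloading the tokenizer on every batch becomes too slow, a variant is to cache it in a mutable default argument, which persists between calls within a process. This is only a sketch, under the assumption that a function's default arguments are shipped to the worker processes along with the function body:

def tokenize_function(examples, _cache={}):
    # `_cache` is a mutable default argument: it travels with the function and
    # persists between calls inside a worker, so the tokenizer is loaded once per
    # process instead of once per batch.
    from transformers import AutoTokenizer  # import inside, as above, so workers can resolve it
    if "tokenizer" not in _cache:
        _cache["tokenizer"] = AutoTokenizer.from_pretrained("distilgpt2")
    return _cache["tokenizer"]([" ".join(x) for x in examples["answers.text"]])

tokenized_eli5 = eli5.map(tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,)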

You have to make hard decisions to work with HF.

I ran into the reported error while trying to follow the Causal language modeling guide at the preprocess_function step, and applying solution 1 (TokenizerWrapper) helped! Thank you hangjoo!

Still not working for me. I am trying to fine-tune Whisper and I can't get past the tokenizer call. This is really painful.