Neither of these two methods works; both fail with the same NameError inside the multiprocessing workers.
from datasets import load_dataset

eli5 = load_dataset("eli5", split="train_asks[:500]")
eli5 = eli5.train_test_split(test_size=0.2)
eli5 = eli5.flatten()

class TokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def tokenize_function(self, examples):
        '''cut off more than 1024 tokens, this is a model restriction'''
        return tokenizer([" ".join(x[:1024]) for x in examples["answers.text"]])

from transformers import AutoTokenizer

checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(tokenizer)

tokenizer_wrapper = TokenizerWrapper(tokenizer)
tokenized_eli5 = eli5.map(
    tokenizer_wrapper.tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)
Using sep_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True), added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
Map (num_proc=4):   0% | 0/400 [00:02<?, ? examples/s]
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
  File "C:\Users\satyr\anaconda3\lib\site-packages\multiprocess\pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "C:\Users\satyr\anaconda3\lib\site-packages\datasets\utils\py_utils.py", line 1354, in _write_generator_to_queue
    for i, result in enumerate(func(**kwargs)):
  File "C:\Users\satyr\anaconda3\lib\site-packages\datasets\arrow_dataset.py", line 3474, in _map_single
    batch = apply_function_on_filtered_inputs(
  File "C:\Users\satyr\anaconda3\lib\site-packages\datasets\arrow_dataset.py", line 3353, in apply_function_on_filtered_inputs
    processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
  File "C:\Users\satyr\AppData\Local\Temp\ipykernel_7500\3052597454.py", line 13, in tokenize_function
NameError: name 'tokenizer' is not defined
"""
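For reference, here is a minimal, untested sketch of what the first attempt seems to be going for. My assumption about the cause: with num_proc=4 the map() call runs in freshly spawned worker processes that never executed the notebook cell, so the module-level tokenizer global does not exist there, while self.tokenizer would be pickled together with the wrapper instance and so would be available. The truncation/max_length arguments are my guess at the intent of the [:1024] slice (capping inputs at the model's 1024-token limit), not something from the original code.

from datasets import load_dataset
from transformers import AutoTokenizer

class TokenizerWrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def tokenize_function(self, examples):
        # Use self.tokenizer, not the module-level global, so the tokenizer
        # travels into each worker process along with the pickled instance.
        return self.tokenizer(
            [" ".join(x) for x in examples["answers.text"]],
            truncation=True,
            max_length=1024,  # distilgpt2's maximum context length
        )

eli5 = load_dataset("eli5", split="train_asks[:500]")
eli5 = eli5.train_test_split(test_size=0.2).flatten()

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenized_eli5 = eli5.map(
    TokenizerWrapper(tokenizer).tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)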
from datasets import load_dataset

eli5 = load_dataset("eli5", split="train_asks[:500]")
eli5 = eli5.train_test_split(test_size=0.2)
eli5 = eli5.flatten()

from functools import partial
from transformers import AutoTokenizer

def tokenize_function(tokenizer, examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(tokenizer)

partial_tokenize_function = partial(tokenize_function, tokenizer)
tokenized_eli5 = eli5.map(
    tokenizer_wrapper.tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)
Using sep_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True), added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
Map (num_proc=4):   0% | 0/400 [00:02<?, ? examples/s]
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
  File "C:\Users\satyr\anaconda3\lib\site-packages\multiprocess\pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "C:\Users\satyr\anaconda3\lib\site-packages\datasets\utils\py_utils.py", line 1354, in _write_generator_to_queue
    for i, result in enumerate(func(**kwargs)):
  File "C:\Users\satyr\anaconda3\lib\site-packages\datasets\arrow_dataset.py", line 3474, in _map_single
    batch = apply_function_on_filtered_inputs(
  File "C:\Users\satyr\anaconda3\lib\site-packages\datasets\arrow_dataset.py", line 3353, in apply_function_on_filtered_inputs
    processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
  File "C:\Users\satyr\AppData\Local\Temp\ipykernel_7500\3052597454.py", line 13, in tokenize_function
NameError: name 'tokenizer' is not defined
"""
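Note that the second attempt still passes tokenizer_wrapper.tokenize_function to map() rather than the partial_tokenize_function that was just built, which is presumably why the traceback is identical. Below is a minimal, untested sketch of what the partial-based version appears to intend: the partial object is handed to map(), so the bound tokenizer gets pickled together with the function and is available in each worker. The "answers.text" column, the max_length value, and the pad-token line are my assumptions (GPT-2 ships without a pad token, so padding="max_length" would otherwise raise an error), not part of the original code.

from functools import partial
from datasets import load_dataset
from transformers import AutoTokenizer

def tokenize_function(tokenizer, examples):
    # Assumption: after flatten(), the answer strings live under "answers.text",
    # so that column (rather than "text") is what should be tokenized.
    return tokenizer(
        [" ".join(x) for x in examples["answers.text"]],
        padding="max_length",
        truncation=True,
        max_length=1024,
    )

eli5 = load_dataset("eli5", split="train_asks[:500]")
eli5 = eli5.train_test_split(test_size=0.2).flatten()

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

partial_tokenize_function = partial(tokenize_function, tokenizer)
tokenized_eli5 = eli5.map(
    partial_tokenize_function,  # the partial carries the tokenizer into each worker
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)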