Hi there, I'm running into a (maybe) similar issue caused by the multiprocessing in map. Instead of opening a new thread, I thought I'd use this one. Note that the error only occurs if I specify num_proc > 1, i.e. use multiprocessing:
Code:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
datasets = datasets.map(
    lambda sequence: tokenizer(sequence['text'], return_special_tokens_mask=True),
    batched=True,
    batch_size=1000,
    num_proc=2,  # psutil.cpu_count()
    remove_columns=['text'],
)
datasets
Error:
Token indices sequence length is longer than the specified maximum sequence length for this model (8395 > 512). Running this sequence through the model will result in indexing errors
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\multiprocess\pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\arrow_dataset.py", line 203, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\fingerprint.py", line 337, in wrapper
out = func(self, *args, **kwargs)
File "c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\arrow_dataset.py", line 1695, in _map_single
batch = apply_function_on_filtered_inputs(
File "c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\arrow_dataset.py", line 1608, in apply_function_on_filtered_inputs
function(*fn_args, effective_indices, **fn_kwargs) if with_indices else function(*fn_args, **fn_kwargs)
File "<ipython-input-18-25a1ecec1896>", line 9, in <lambda>
NameError: name 'tokenizer' is not defined
"""
The above exception was the direct cause of the following exception:
NameError Traceback (most recent call last)
<ipython-input-18-25a1ecec1896> in <module>
6 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
7
----> 8 datasets = datasets.map(
9 lambda sequence: tokenizer(sequence['text'], return_special_tokens_mask=True),
10 batched=True,
c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\dataset_dict.py in map(self, function, with_indices, input_columns, batched, batch_size, remove_columns, keep_in_memory, load_from_cache_file, cache_file_names, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc)
430 cache_file_names = {k: None for k in self}
431 return DatasetDict(
--> 432 {
433 k: dataset.map(
434 function=function,
c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\dataset_dict.py in <dictcomp>(.0)
431 return DatasetDict(
432 {
--> 433 k: dataset.map(
434 function=function,
435 with_indices=with_indices,
c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\arrow_dataset.py in map(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint)
1483 logger.info("Spawning {} processes".format(num_proc))
1484 results = [pool.apply_async(self.__class__._map_single, kwds=kwds) for kwds in kwds_per_shard]
-> 1485 transformed_shards = [r.get() for r in results]
1486 logger.info("Concatenating {} shards from multiprocessing".format(num_proc))
1487 result = concatenate_datasets(transformed_shards)
c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\arrow_dataset.py in <listcomp>(.0)
1483 logger.info("Spawning {} processes".format(num_proc))
1484 results = [pool.apply_async(self.__class__._map_single, kwds=kwds) for kwds in kwds_per_shard]
-> 1485 transformed_shards = [r.get() for r in results]
1486 logger.info("Concatenating {} shards from multiprocessing".format(num_proc))
1487 result = concatenate_datasets(transformed_shards)
c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\multiprocess\pool.py in get(self, timeout)
769 return self._value
770 else:
--> 771 raise self._value
772
773 def _set(self, i, obj):
NameError: name 'tokenizer' is not defined
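For reference, this is a minimal sketch of what I plan to try next as a workaround, assuming the NameError means the lambda's closure over tokenizer never reaches the spawned worker processes: a module-level function that gets the tokenizer passed in explicitly via fn_kwargs instead of relying on the closure.

from transformers import AutoTokenizer

# Module-level function instead of a notebook lambda, so nothing needs to be
# captured from the interactive namespace (hypothetical workaround sketch).
def tokenize_batch(batch, tokenizer):
    return tokenizer(batch['text'], return_special_tokens_mask=True)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
datasets = datasets.map(
    tokenize_batch,
    batched=True,
    batch_size=1000,
    num_proc=2,
    remove_columns=['text'],
    fn_kwargs={'tokenizer': tokenizer},  # handed to each worker explicitly
)

The idea is simply to avoid depending on the lambda being serialized together with its closure when the shards are dispatched to the worker processes.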
I am grateful for any help!