Map multiprocessing Issue

Hi there, I'm running into a (possibly) similar issue caused by multiprocessing in map. Instead of opening a new thread, I thought I'd use this one. Note that the error only occurs when I set num_proc > 1, i.e. when multiprocessing is actually used:

Code:

from transformers import AutoTokenizer

# MODEL_NAME is defined earlier in the notebook
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

datasets = datasets.map(
    lambda sequence: tokenizer(sequence['text'], return_special_tokens_mask=True),
    batched=True,
    batch_size=1000,
    num_proc=2,  # psutil.cpu_count(); the error only appears with num_proc > 1
    remove_columns=['text'],
)

datasets

Error:

Token indices sequence length is longer than the specified maximum sequence length for this model (8395 > 512). Running this sequence through the model will result in indexing errors
---------------------------------------------------------------------------
RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\multiprocess\pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\arrow_dataset.py", line 203, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\fingerprint.py", line 337, in wrapper
    out = func(self, *args, **kwargs)
  File "c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\arrow_dataset.py", line 1695, in _map_single
    batch = apply_function_on_filtered_inputs(
  File "c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\arrow_dataset.py", line 1608, in apply_function_on_filtered_inputs
    function(*fn_args, effective_indices, **fn_kwargs) if with_indices else function(*fn_args, **fn_kwargs)
  File "<ipython-input-18-25a1ecec1896>", line 9, in <lambda>
NameError: name 'tokenizer' is not defined
"""

The above exception was the direct cause of the following exception:

NameError                                 Traceback (most recent call last)
<ipython-input-18-25a1ecec1896> in <module>
      6 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
      7 
----> 8 datasets = datasets.map(
      9     lambda sequence: tokenizer(sequence['text'], return_special_tokens_mask=True),
     10     batched=True,

c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\dataset_dict.py in map(self, function, with_indices, input_columns, batched, batch_size, remove_columns, keep_in_memory, load_from_cache_file, cache_file_names, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc)
    430             cache_file_names = {k: None for k in self}
    431         return DatasetDict(
--> 432             {
    433                 k: dataset.map(
    434                     function=function,

c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\dataset_dict.py in <dictcomp>(.0)
    431         return DatasetDict(
    432             {
--> 433                 k: dataset.map(
    434                     function=function,
    435                     with_indices=with_indices,

c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\arrow_dataset.py in map(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint)
   1483                 logger.info("Spawning {} processes".format(num_proc))
   1484                 results = [pool.apply_async(self.__class__._map_single, kwds=kwds) for kwds in kwds_per_shard]
-> 1485                 transformed_shards = [r.get() for r in results]
   1486                 logger.info("Concatenating {} shards from multiprocessing".format(num_proc))
   1487                 result = concatenate_datasets(transformed_shards)

c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\datasets\arrow_dataset.py in <listcomp>(.0)
   1483                 logger.info("Spawning {} processes".format(num_proc))
   1484                 results = [pool.apply_async(self.__class__._map_single, kwds=kwds) for kwds in kwds_per_shard]
-> 1485                 transformed_shards = [r.get() for r in results]
   1486                 logger.info("Concatenating {} shards from multiprocessing".format(num_proc))
   1487                 result = concatenate_datasets(transformed_shards)

c:\Users\s_scho53\Desktop\L09_Desktop\_FiLMo\.venv\lib\site-packages\multiprocess\pool.py in get(self, timeout)
    769             return self._value
    770         else:
--> 771             raise self._value
    772 
    773     def _set(self, i, obj):

NameError: name 'tokenizer' is not defined
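
In case it helps narrow things down: my guess is that the lambda can't resolve the notebook-level tokenizer inside the spawned worker processes. A workaround I'm planning to try (untested, sketched from the fn_kwargs parameter that is visible in the map signature in the traceback above) is to replace the lambda with a top-level named function and hand the tokenizer to the workers explicitly; tokenize_function is just my own placeholder name:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Top-level named function instead of a lambda; the tokenizer is passed to the
# worker processes via fn_kwargs rather than captured from the notebook namespace.
def tokenize_function(batch, tokenizer):
    return tokenizer(batch['text'], return_special_tokens_mask=True)

datasets = datasets.map(
    tokenize_function,
    batched=True,
    batch_size=1000,
    num_proc=2,
    remove_columns=['text'],
    fn_kwargs={'tokenizer': tokenizer},
)

If passing the tokenizer through fn_kwargs still fails, loading the tokenizer inside tokenize_function should sidestep the problem entirely, at the cost of loading it once per worker process.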

I'd be grateful for any help! :slight_smile: