Chapter 5 questions

In “Time to Slice and Dice,” I cannot get the parallelized `Dataset.map` to work.
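For context, `tokenize_function` is defined in a notebook cell and follows the course. A minimal sketch of the relevant cells (assuming the course's setup; my exact checkpoint and column names may differ slightly):

```python
from transformers import AutoTokenizer

# Tokenizer created at the top level of the notebook, as in the course
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    # Reads the module-level `tokenizer` -- the name the worker
    # processes fail to resolve in the traceback below
    return tokenizer(examples["review"], truncation=True)
```

The single-process call: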

```python
drug_dataset.map(tokenize_function, batched=True)
```

runs without issue, but:

```python
drug_dataset.map(tokenize_function, batched=True, num_proc=4)
```

causes a crash with the message:

```text
---------------------------------------------------------------------------
RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\multiprocess\pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\utils\py_utils.py", line 1377, in _write_generator_to_queue
    for i, result in enumerate(func(**kwargs)):
  File "c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\arrow_dataset.py", line 3466, in _map_single
    batch = apply_function_on_filtered_inputs(
  File "c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\arrow_dataset.py", line 3345, in apply_function_on_filtered_inputs
    processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
  File "C:\Users\964864\AppData\Local\Temp\1\ipykernel_18204\2403223452.py", line 6, in tokenize_function
NameError: name 'tokenizer' is not defined
"""

The above exception was the direct cause of the following exception:

NameError                                 Traceback (most recent call last)
File <timed exec>:1

File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\dataset_dict.py:855, in DatasetDict.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_names, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, desc)
    852 if cache_file_names is None:
    853     cache_file_names = {k: None for k in self}
    854 return DatasetDict(
--> 855     {
    856         k: dataset.map(
    857             function=function,
    858             with_indices=with_indices,
    859             with_rank=with_rank,
    860             input_columns=input_columns,
    861             batched=batched,
    862             batch_size=batch_size,
    863             drop_last_batch=drop_last_batch,
    864             remove_columns=remove_columns,
    865             keep_in_memory=keep_in_memory,
    866             load_from_cache_file=load_from_cache_file,
    867             cache_file_name=cache_file_names[k],
    868             writer_batch_size=writer_batch_size,
    869             features=features,
    870             disable_nullable=disable_nullable,
    871             fn_kwargs=fn_kwargs,
    872             num_proc=num_proc,
    873             desc=desc,
    874         )
    875         for k, dataset in self.items()
    876     }
    877 )

File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\dataset_dict.py:856, in <dictcomp>(.0)
    852 if cache_file_names is None:
    853     cache_file_names = {k: None for k in self}
    854 return DatasetDict(
    855     {
--> 856         k: dataset.map(
    857             function=function,
    858             with_indices=with_indices,
    859             with_rank=with_rank,
    860             input_columns=input_columns,
    861             batched=batched,
    862             batch_size=batch_size,
    863             drop_last_batch=drop_last_batch,
    864             remove_columns=remove_columns,
    865             keep_in_memory=keep_in_memory,
    866             load_from_cache_file=load_from_cache_file,
    867             cache_file_name=cache_file_names[k],
    868             writer_batch_size=writer_batch_size,
    869             features=features,
    870             disable_nullable=disable_nullable,
    871             fn_kwargs=fn_kwargs,
    872             num_proc=num_proc,
    873             desc=desc,
    874         )
    875         for k, dataset in self.items()
    876     }
    877 )

File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\arrow_dataset.py:591, in transmit_tasks.<locals>.wrapper(*args, **kwargs)
    589     self: "Dataset" = kwargs.pop("self")
    590 # apply actual function
--> 591 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
    592 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
    593 for dataset in datasets:
    594     # Remove task templates if a column mapping of the template is no longer valid

File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\arrow_dataset.py:556, in transmit_format.<locals>.wrapper(*args, **kwargs)
    549 self_format = {
    550     "type": self._format_type,
    551     "format_kwargs": self._format_kwargs,
    552     "columns": self._format_columns,
    553     "output_all_columns": self._output_all_columns,
    554 }
    555 # apply actual function
--> 556 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
    557 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
    558 # re-apply format to the output

File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\arrow_dataset.py:3181, in Dataset.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
   3174 logger.info(f"Spawning {num_proc} processes")
   3175 with logging.tqdm(
   3176     disable=not logging.is_progress_bar_enabled(),
   3177     unit=" examples",
   3178     total=pbar_total,
   3179     desc=(desc or "Map") + f" (num_proc={num_proc})",
   3180 ) as pbar:
-> 3181     for rank, done, content in iflatmap_unordered(
   3182         pool, Dataset._map_single, kwargs_iterable=kwargs_per_job
   3183     ):
   3184         if done:
   3185             shards_done += 1

File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\utils\py_utils.py:1417, in iflatmap_unordered(pool, func, kwargs_iterable)
   1414 finally:
   1415     if not pool_changed:
   1416         # we get the result in case there's an error to raise
-> 1417         [async_result.get(timeout=0.05) for async_result in async_results]

File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\utils\py_utils.py:1417, in <listcomp>(.0)
   1414 finally:
   1415     if not pool_changed:
   1416         # we get the result in case there's an error to raise
-> 1417         [async_result.get(timeout=0.05) for async_result in async_results]

File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\multiprocess\pool.py:771, in ApplyResult.get(self, timeout)
    769     return self._value
    770 else:
--> 771     raise self._value

NameError: name 'tokenizer' is not defined
```

What is going on? A basic Google search suggested that I might need to set the number of threads in torch, but neither `torch.set_num_threads(4)` nor `torch.set_num_threads(1)` fixed the issue.
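For reference, this is roughly what I tried (a sketch; `drug_dataset` and `tokenize_function` come from the course notebook):

```python
import torch

# Fix suggested by search results: pin torch's thread count first
torch.set_num_threads(4)  # also tried torch.set_num_threads(1)

# The parallel map still fails with the same NameError in the workers
drug_dataset.map(tokenize_function, batched=True, num_proc=4)
```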
