In “Time to Slice and Dice,” I cannot get parallel dataset mapping to work. This call:
drug_dataset.map(tokenize_function, batched=True)
runs without issue, but this one:
drug_dataset.map(tokenize_function, batched=True, num_proc=4)
crashes with the following traceback:
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\multiprocess\pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\utils\py_utils.py", line 1377, in _write_generator_to_queue
for i, result in enumerate(func(**kwargs)):
File "c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\arrow_dataset.py", line 3466, in _map_single
batch = apply_function_on_filtered_inputs(
File "c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\arrow_dataset.py", line 3345, in apply_function_on_filtered_inputs
processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
File "C:\Users\964864\AppData\Local\Temp\1\ipykernel_18204\2403223452.py", line 6, in tokenize_function
NameError: name 'tokenizer' is not defined
"""
The above exception was the direct cause of the following exception:
NameError Traceback (most recent call last)
File <timed exec>:1
File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\dataset_dict.py:855, in DatasetDict.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_names, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, desc)
852 if cache_file_names is None:
853 cache_file_names = {k: None for k in self}
854 return DatasetDict(
--> 855 {
856 k: dataset.map(
857 function=function,
858 with_indices=with_indices,
859 with_rank=with_rank,
860 input_columns=input_columns,
861 batched=batched,
862 batch_size=batch_size,
863 drop_last_batch=drop_last_batch,
864 remove_columns=remove_columns,
865 keep_in_memory=keep_in_memory,
866 load_from_cache_file=load_from_cache_file,
867 cache_file_name=cache_file_names[k],
868 writer_batch_size=writer_batch_size,
869 features=features,
870 disable_nullable=disable_nullable,
871 fn_kwargs=fn_kwargs,
872 num_proc=num_proc,
873 desc=desc,
874 )
875 for k, dataset in self.items()
876 }
877 )
File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\dataset_dict.py:856, in <dictcomp>(.0)
852 if cache_file_names is None:
853 cache_file_names = {k: None for k in self}
854 return DatasetDict(
855 {
--> 856 k: dataset.map(
857 function=function,
858 with_indices=with_indices,
859 with_rank=with_rank,
860 input_columns=input_columns,
861 batched=batched,
862 batch_size=batch_size,
863 drop_last_batch=drop_last_batch,
864 remove_columns=remove_columns,
865 keep_in_memory=keep_in_memory,
866 load_from_cache_file=load_from_cache_file,
867 cache_file_name=cache_file_names[k],
868 writer_batch_size=writer_batch_size,
869 features=features,
870 disable_nullable=disable_nullable,
871 fn_kwargs=fn_kwargs,
872 num_proc=num_proc,
873 desc=desc,
874 )
875 for k, dataset in self.items()
876 }
877 )
File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\arrow_dataset.py:591, in transmit_tasks.<locals>.wrapper(*args, **kwargs)
589 self: "Dataset" = kwargs.pop("self")
590 # apply actual function
--> 591 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
592 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
593 for dataset in datasets:
594 # Remove task templates if a column mapping of the template is no longer valid
File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\arrow_dataset.py:556, in transmit_format.<locals>.wrapper(*args, **kwargs)
549 self_format = {
550 "type": self._format_type,
551 "format_kwargs": self._format_kwargs,
552 "columns": self._format_columns,
553 "output_all_columns": self._output_all_columns,
554 }
555 # apply actual function
--> 556 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
557 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
558 # re-apply format to the output
File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\arrow_dataset.py:3181, in Dataset.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
3174 logger.info(f"Spawning {num_proc} processes")
3175 with logging.tqdm(
3176 disable=not logging.is_progress_bar_enabled(),
3177 unit=" examples",
3178 total=pbar_total,
3179 desc=(desc or "Map") + f" (num_proc={num_proc})",
3180 ) as pbar:
-> 3181 for rank, done, content in iflatmap_unordered(
3182 pool, Dataset._map_single, kwargs_iterable=kwargs_per_job
3183 ):
3184 if done:
3185 shards_done += 1
File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\utils\py_utils.py:1417, in iflatmap_unordered(pool, func, kwargs_iterable)
1414 finally:
1415 if not pool_changed:
1416 # we get the result in case there's an error to raise
-> 1417 [async_result.get(timeout=0.05) for async_result in async_results]
File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\datasets\utils\py_utils.py:1417, in <listcomp>(.0)
1414 finally:
1415 if not pool_changed:
1416 # we get the result in case there's an error to raise
-> 1417 [async_result.get(timeout=0.05) for async_result in async_results]
File c:\Users\964864\OneDrive - Cognizant HealthCare\Documents\Innovation Project\HuggingFace NLP Tutorial\notebooks\.venv\lib\site-packages\multiprocess\pool.py:771, in ApplyResult.get(self, timeout)
769 return self._value
770 else:
--> 771 raise self._value
NameError: name 'tokenizer' is not defined
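For context, tokenizer and tokenize_function come straight from the course notebook; from memory, the cell looks roughly like this (the exact checkpoint and function body may differ slightly):

from transformers import AutoTokenizer

# the checkpoint name here is my recollection, not necessarily what my notebook uses
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    # relies on the module-level `tokenizer` defined above,
    # which is what the NameError in the worker process points at
    return tokenizer(examples["review"], truncation=True)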
What is going on? A quick Google search suggested that I might need to set the number of threads in torch, but neither torch.set_num_threads(4) nor torch.set_num_threads(1) fixed the issue.
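For completeness, this is roughly what I ran, in a fresh cell before retrying the map:

import torch

torch.set_num_threads(4)  # also tried torch.set_num_threads(1); same NameError either way
drug_dataset.map(tokenize_function, batched=True, num_proc=4)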