I got this issue while fine-tuning Whisper on my local GPUs.
Note: there is no error in the code, because I tried it on Google Colab it works perfectly, but I tried it on both JupyterLab as well as Jupyter - notebook but its giving me this error:
Code Source: Google Colab
Code i am running:
from transformers import WhisperFeatureExtractor
feature_extractor = WhisperFeatureExtractor.from_pretrained(“openai/whisper-small”)
from transformers import WhisperTokenizer
tokenizer = WhisperTokenizer.from_pretrained(“openai/whisper-small”, language=“Hindi”, task=“transcribe”)
from transformers import WhisperProcessor
processor = WhisperProcessor.from_pretrained(“openai/whisper-small”, language=“Hindi”, task=“transcribe”)
from datasets import Audio
common_voice = common_voice.cast_column(“audio”, Audio(sampling_rate=16000))
def prepare_dataset(batch):
# load and resample audio data from 48 to 16kHz
audio = batch[“audio”]
# compute log-Mel input features from input audio array
batch["input_features"] = feature_extractor(audio["array"],
sampling_rate=audio["sampling_rate"]).input_features[0]
# encode target text to label ids
batch["labels"] = tokenizer(batch["sentence"]).input_ids
return batch
common_voice = common_voice.map(prepare_dataset,
remove_columns=common_voice.column_names[“train”], num_proc=2)
Here is the error:
Loading widget…
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
“”"
Traceback (most recent call last):
File “D:\NOUMAN\Fintune Whisper\WhisperENV\lib\site-packages\multiprocess\pool.py”, line 125, in worker
result = (True, func(*args, **kwds))
File “D:\NOUMAN\Fintune Whisper\WhisperENV\lib\site-packages\datasets\utils\py_utils.py”, line 623, in _write_generator_to_queue
for i, result in enumerate(func(**kwargs)):
File “D:\NOUMAN\Fintune Whisper\WhisperENV\lib\site-packages\datasets\arrow_dataset.py”, line 3458, in _map_single
example = apply_function_on_filtered_inputs(example, i, offset=offset)
File “D:\NOUMAN\Fintune Whisper\WhisperENV\lib\site-packages\datasets\arrow_dataset.py”, line 3361, in apply_function_on_filtered_inputs
processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
File “C:\Users.…\AppData\Local\Temp\ipykernel_9268\3476206305.py”, line 6, in prepare_dataset
NameError: name ‘feature_extractor’ is not defined
“”"
The above exception was the direct cause of the following exception:
NameError Traceback (most recent call last)
Cell In[14], line 1
----> 1 common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names[“train”], num_proc=2)
File D:\NOUMAN\Fintune Whisper\WhisperENV\lib\site-packages\datasets\dataset_dict.py:868, in DatasetDict.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_names, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, desc)
865 if cache_file_names is None:
866 cache_file_names = {k: None for k in self}
867 return DatasetDict(
→ 868 {
869 k: dataset.map(
870 function=function,
871 with_indices=with_indices,
872 with_rank=with_rank,
873 input_columns=input_columns,
874 batched=batched,
875 batch_size=batch_size,
876 drop_last_batch=drop_last_batch,
877 remove_columns=remove_columns,
878 keep_in_memory=keep_in_memory,
879 load_from_cache_file=load_from_cache_file,
880 cache_file_name=cache_file_names[k],
881 writer_batch_size=writer_batch_size,
882 features=features,
883 disable_nullable=disable_nullable,
884 fn_kwargs=fn_kwargs,
885 num_proc=num_proc,
886 desc=desc,
887 )
888 for k, dataset in self.items()
889 }
890 )
File D:\NOUMAN\Fintune Whisper\WhisperENV\lib\site-packages\datasets\dataset_dict.py:869, in (.0)
865 if cache_file_names is None:
866 cache_file_names = {k: None for k in self}
867 return DatasetDict(
868 {
→ 869 k: dataset.map(
870 function=function,
871 with_indices=with_indices,
872 with_rank=with_rank,
873 input_columns=input_columns,
874 batched=batched,
875 batch_size=batch_size,
876 drop_last_batch=drop_last_batch,
877 remove_columns=remove_columns,
878 keep_in_memory=keep_in_memory,
879 load_from_cache_file=load_from_cache_file,
880 cache_file_name=cache_file_names[k],
881 writer_batch_size=writer_batch_size,
882 features=features,
883 disable_nullable=disable_nullable,
884 fn_kwargs=fn_kwargs,
885 num_proc=num_proc,
886 desc=desc,
887 )
888 for k, dataset in self.items()
889 }
890 )
File D:\NOUMAN\Fintune Whisper\WhisperENV\lib\site-packages\datasets\arrow_dataset.py:593, in transmit_tasks..wrapper(*args, **kwargs)
591 self: “Dataset” = kwargs.pop(“self”)
592 # apply actual function
→ 593 out: Union[“Dataset”, “DatasetDict”] = func(self, *args, **kwargs)
594 datasets: List[“Dataset”] = list(out.values()) if isinstance(out, dict) else [out]
595 for dataset in datasets:
596 # Remove task templates if a column mapping of the template is no longer valid
File D:\NOUMAN\Fintune Whisper\WhisperENV\lib\site-packages\datasets\arrow_dataset.py:558, in transmit_format..wrapper(*args, **kwargs)
551 self_format = {
552 “type”: self._format_type,
553 “format_kwargs”: self._format_kwargs,
554 “columns”: self._format_columns,
555 “output_all_columns”: self._output_all_columns,
556 }
557 # apply actual function
→ 558 out: Union[“Dataset”, “DatasetDict”] = func(self, *args, **kwargs)
559 datasets: List[“Dataset”] = list(out.values()) if isinstance(out, dict) else [out]
560 # re-apply format to the output
File D:\NOUMAN\Fintune Whisper\WhisperENV\lib\site-packages\datasets\arrow_dataset.py:3197, in Dataset.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
3191 logger.info(f"Spawning {num_proc} processes")
3192 with hf_tqdm(
3193 unit=" examples",
3194 total=pbar_total,
3195 desc=(desc or “Map”) + f" (num_proc={num_proc})",
3196 ) as pbar:
→ 3197 for rank, done, content in iflatmap_unordered(
3198 pool, Dataset._map_single, kwargs_iterable=kwargs_per_job
3199 ):
3200 if done:
3201 shards_done += 1
File D:\NOUMAN\Fintune Whisper\WhisperENV\lib\site-packages\datasets\utils\py_utils.py:663, in iflatmap_unordered(pool, func, kwargs_iterable)
660 finally:
661 if not pool_changed:
662 # we get the result in case there’s an error to raise
→ 663 [async_result.get(timeout=0.05) for async_result in async_results]
File D:\NOUMAN\Fintune Whisper\WhisperENV\lib\site-packages\datasets\utils\py_utils.py:663, in (.0)
660 finally:
661 if not pool_changed:
662 # we get the result in case there’s an error to raise
→ 663 [async_result.get(timeout=0.05) for async_result in async_results]
File D:\NOUMAN\Fintune Whisper\WhisperENV\lib\site-packages\multiprocess\pool.py:774, in ApplyResult.get(self, timeout)
772 return self._value
773 else:
→ 774 raise self._value
NameError: name ‘feature_extractor’ is not defined
Additional Note:
I know it’s an error with Jupyter because I defined feature_extractor, but it gives me Name Error:.
I installed the respective versions of libraries in the Virtual Environment, and I checked the code multiple times, but I was unable to identify the issue.
I attached the code, the error here