I am trying to run a notebook that uses the huggingface library dataset class. I’ve loaded a dataset and am trying to apply a map() function to it.
Here is my code:
model_name_or_path = "facebook/wav2vec2-base-100k-voxpopuli"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path,)
target_sampling_rate = feature_extractor.sampling_rate
print(feature_extractor)
Wav2Vec2FeatureExtractor {
"do_normalize": true,
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
"feature_size": 1,
"padding_side": "right",
"padding_value": 0,
"return_attention_mask": false,
"sampling_rate": 16000
}
def speech_file_to_array_fn(path):
speech_array, sampling_rate = torchaudio.load(path)
resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
speech = resampler(speech_array).squeeze().numpy()
return speech
def label_to_id(label, label_list):
if len(label_list) > 0:
return label_list.index(label) if label in label_list else -1
return label
def preprocess_function(examples):
speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
print(speech_list)
target_list = [label_to_id(label, label_list) for label in examples[output_column]]
print(type(speech_list))
result = feature_extractor(speech_list, sampling_rate=target_sampling_rate)
result["labels"] = list(target_list)
return result
# Remove this part
max_samples = 100
train_dataset = train_dataset.select(range(max_samples))
eval_dataset = eval_dataset.select(range(max_samples))
train_dataset = train_dataset.map(
preprocess_function,
batch_size=10,
batched=True,
num_proc=4
)
eval_dataset = eval_dataset.map(
preprocess_function,
batch_size=10,
batched=True,
num_proc=4
)
After running the last lines, (the map() function), here is the error:
/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
return array(a, dtype, copy=False, order=order)
/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
return array(a, dtype, copy=False, order=order)
/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
return array(a, dtype, copy=False, order=order)
/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
return array(a, dtype, copy=False, order=order)
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
TypeError: float() argument must be a string or a number, not 'list'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/multiprocess/pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py", line 185, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/datasets/fingerprint.py", line 397, in wrapper
out = func(self, *args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py", line 2020, in _map_single
offset=offset,
File "/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py", line 1906, in apply_function_on_filtered_inputs
function(*fn_args, effective_indices, **fn_kwargs) if with_indices else function(*fn_args, **fn_kwargs)
File "<ipython-input-105-3cb463e63163>", line 19, in preprocess_function
result = feature_extractor(speech_list, sampling_rate=target_sampling_rate)
File "/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py", line 211, in __call__
padded_inputs["input_values"] = self.zero_mean_unit_var_norm(
File "/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py", line 87, in zero_mean_unit_var_norm
if isinstance(input_values[0], np.ndarray):
File "/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py", line 87, in <listcomp>
if isinstance(input_values[0], np.ndarray):
ValueError: setting an array element with a sequence.
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-109-28e13eb2d6de> in <module>()
3 batch_size=10,
4 batched=True,
----> 5 num_proc=4
6 )
7 eval_dataset = eval_dataset.map(
11 frames
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in map(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
1742 logger.info("Spawning {} processes".format(num_proc))
1743 results = [pool.apply_async(self.__class__._map_single, kwds=kwds) for kwds in kwds_per_shard]
-> 1744 transformed_shards = [r.get() for r in results]
1745 logger.info("Concatenating {} shards from multiprocessing".format(num_proc))
1746 result = concatenate_datasets(transformed_shards)
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in <listcomp>(.0)
1742 logger.info("Spawning {} processes".format(num_proc))
1743 results = [pool.apply_async(self.__class__._map_single, kwds=kwds) for kwds in kwds_per_shard]
-> 1744 transformed_shards = [r.get() for r in results]
1745 logger.info("Concatenating {} shards from multiprocessing".format(num_proc))
1746 result = concatenate_datasets(transformed_shards)
/usr/local/lib/python3.7/dist-packages/multiprocess/pool.py in get(self, timeout)
655 return self._value
656 else:
--> 657 raise self._value
658
659 def _set(self, i, obj):
/usr/local/lib/python3.7/dist-packages/multiprocess/pool.py in worker()
119 job, i, func, args, kwds = task
120 try:
--> 121 result = (True, func(*args, **kwds))
122 except Exception as e:
123 if wrap_exception and func is not _helper_reraises_exception:
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in wrapper()
183 }
184 # apply actual function
--> 185 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
186 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
187 # re-apply format to the output
/usr/local/lib/python3.7/dist-packages/datasets/fingerprint.py in wrapper()
395 # Call actual function
396
--> 397 out = func(self, *args, **kwargs)
398
399 # Update fingerprint of in-place transforms + update in-place history of transforms
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in _map_single()
2018 indices,
2019 check_same_num_examples=len(input_dataset.list_indexes()) > 0,
-> 2020 offset=offset,
2021 )
2022 except NumExamplesMismatch:
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in apply_function_on_filtered_inputs()
1904 effective_indices = [i + offset for i in indices] if isinstance(indices, list) else indices + offset
1905 processed_inputs = (
-> 1906 function(*fn_args, effective_indices, **fn_kwargs) if with_indices else function(*fn_args, **fn_kwargs)
1907 )
1908 if update_data is None:
<ipython-input-105-3cb463e63163> in preprocess_function()
17 target_list = [label_to_id(label, label_list) for label in examples[output_column]]
18
---> 19 result = feature_extractor(speech_list, sampling_rate=target_sampling_rate)
20 result["labels"] = list(target_list)
21
/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py in __call__()
209 # zero-mean and unit-variance normalization
210 if self.do_normalize:
--> 211 padded_inputs["input_values"] = self.zero_mean_unit_var_norm(
212 padded_inputs["input_values"], input_lengths=input_lengths
213 )
/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py in zero_mean_unit_var_norm()
85 """
86 print(input_values)
---> 87 if isinstance(input_values[0], np.ndarray):
88 input_values = [x.astype(np.float32) for x in input_values]
89
/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py in <listcomp>()
85 """
---> 86 if isinstance(input_values[0], np.ndarray):
87 input_values = [x.astype(np.float32) for x in input_values]
88
ValueError: setting an array element with a sequence.
I’m not sure what is going on here. When I print speech_list
from the preprocess_function()
function, I get this:
[array([[-1.3845960e-03, -1.5129161e-03, -1.3279491e-03, ...,
-9.3758357e-04, -5.0248392e-04, -2.3690595e-04],
[ 5.0415384e-04, 7.3929900e-06, -5.8541872e-04, ...,
2.6963546e-04, 4.4448639e-04, 7.5516611e-04]], dtype=float32), array([[ 0.00750456, 0.00959514, 0.00922188, ..., -0.0017352 ,
-0.0019784 , -0.00042148],
[-0.0046173 , 0.00029656, 0.01085352, ..., 0.00995941,
0.0046006 , 0.00151875]], dtype=float32), array([[ 0.00032558, 0.00044874, -0.00064546, ..., 0.00199648,
0.00220139, 0.00113442],
[ 0.01374926, 0.02029924, 0.02300985, ..., 0.02447655,
0.024421 , 0.00926847]], dtype=float32), array([[ 3.1322680e-04, 2.9084622e-05, -1.7249165e-04, ...,
-1.0999236e-03, -1.4311116e-03, -3.1127129e-04],
[-1.9904135e-03, -2.2752464e-03, -1.9129037e-03, ...,
-7.8604842e-04, -1.6195733e-03, -3.5395977e-04]], dtype=float32), array([[-0.0018726 , -0.00167636, -0.0016572 , ..., -0.00041437,
0.00060199, 0.0006947 ],
[ 0.00442896, 0.0041303 , 0.00259148, ..., 0.00126941,
0.0004518 , -0.00026673]], dtype=float32), array([[-0.00154839, -0.00183026, -0.00170901, ..., -0.00169933,
-0.00238513, -0.00154379],
[ 0.00048418, 0.00074115, 0.00099551, ..., -0.05256891,
-0.03463165, -0.01582825]], dtype=float32), array([[-4.3763156e-04, -1.5511583e-04, 1.5612959e-04, ...,
-1.0198121e-04, 2.6510053e-05, 5.8304349e-06],
[-2.4142796e-03, -2.7431613e-03, -1.9503339e-03, ...,
1.9912045e-03, 1.8718862e-03, 3.3789902e-04]], dtype=float32), array([[ 2.5531935e-04, 2.9120210e-04, 1.8021779e-05, ...,
9.6951338e-04, 1.1847753e-03, 3.6130843e-04],
[-4.2422273e-04, -9.5154933e-04, -1.1366532e-03, ...,
1.3966652e-03, 1.4367601e-03, 3.6545223e-04]], dtype=float32), array([[ 0.00049792, 0.00055293, 0.00043075, ..., -0.00584954,
-0.00827 , -0.00197146],
[-0.00125196, -0.00177683, -0.00116915, ..., -0.00643045,
-0.00696308, -0.00153378]], dtype=float32), array([[-0.00286428, -0.00418009, -0.00461933, ..., 0.00096886,
0.00105958, 0.00106084],
[-0.00322456, -0.00440617, -0.00480009, ..., -0.00011426,
0.0002051 , 0.00059317]], dtype=float32)]
which is type <class 'list'>
.
I see the error says it doesn’t accept lists, only strings or numbers, but this doesn’t make sense? The docs aren’t very clear about input to Wav2Vec2FeatureExtractor
.
Please advise.