Setting an array with a sequence using Huggingface dataset map()

I am trying to run a notebook that uses the huggingface library dataset class. I’ve loaded a dataset and am trying to apply a map() function to it.

Here is my code:

model_name_or_path = "facebook/wav2vec2-base-100k-voxpopuli"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path,)
target_sampling_rate = feature_extractor.sampling_rate
print(feature_extractor)

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}
def speech_file_to_array_fn(path):
    speech_array, sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

def label_to_id(label, label_list):

    if len(label_list) > 0:
        return label_list.index(label) if label in label_list else -1

    return label

def preprocess_function(examples):
    speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
    print(speech_list)
    target_list = [label_to_id(label, label_list) for label in examples[output_column]]
    print(type(speech_list))
    result = feature_extractor(speech_list, sampling_rate=target_sampling_rate)

    result["labels"] = list(target_list)

    return result

# Remove this part
max_samples = 100
train_dataset = train_dataset.select(range(max_samples))
eval_dataset = eval_dataset.select(range(max_samples))

train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=10,
    batched=True,
    num_proc=4
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batch_size=10,
    batched=True,
    num_proc=4
)

After running the last lines, (the map() function), here is the error:


/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
  return array(a, dtype, copy=False, order=order)
/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
  return array(a, dtype, copy=False, order=order)
/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
  return array(a, dtype, copy=False, order=order)
/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
  return array(a, dtype, copy=False, order=order)
---------------------------------------------------------------------------
RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
TypeError: float() argument must be a string or a number, not 'list'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/multiprocess/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py", line 185, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/datasets/fingerprint.py", line 397, in wrapper
    out = func(self, *args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py", line 2020, in _map_single
    offset=offset,
  File "/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py", line 1906, in apply_function_on_filtered_inputs
    function(*fn_args, effective_indices, **fn_kwargs) if with_indices else function(*fn_args, **fn_kwargs)
  File "<ipython-input-105-3cb463e63163>", line 19, in preprocess_function
    result = feature_extractor(speech_list, sampling_rate=target_sampling_rate)
  File "/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py", line 211, in __call__
    padded_inputs["input_values"] = self.zero_mean_unit_var_norm(
  File "/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py", line 87, in zero_mean_unit_var_norm
    if isinstance(input_values[0], np.ndarray):
  File "/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py", line 87, in <listcomp>
    if isinstance(input_values[0], np.ndarray):
ValueError: setting an array element with a sequence.
"""

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
<ipython-input-109-28e13eb2d6de> in <module>()
      3     batch_size=10,
      4     batched=True,
----> 5     num_proc=4
      6 )
      7 eval_dataset = eval_dataset.map(

11 frames
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in map(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
   1742                 logger.info("Spawning {} processes".format(num_proc))
   1743                 results = [pool.apply_async(self.__class__._map_single, kwds=kwds) for kwds in kwds_per_shard]
-> 1744                 transformed_shards = [r.get() for r in results]
   1745                 logger.info("Concatenating {} shards from multiprocessing".format(num_proc))
   1746                 result = concatenate_datasets(transformed_shards)

/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in <listcomp>(.0)
   1742                 logger.info("Spawning {} processes".format(num_proc))
   1743                 results = [pool.apply_async(self.__class__._map_single, kwds=kwds) for kwds in kwds_per_shard]
-> 1744                 transformed_shards = [r.get() for r in results]
   1745                 logger.info("Concatenating {} shards from multiprocessing".format(num_proc))
   1746                 result = concatenate_datasets(transformed_shards)

/usr/local/lib/python3.7/dist-packages/multiprocess/pool.py in get(self, timeout)
    655             return self._value
    656         else:
--> 657             raise self._value
    658 
    659     def _set(self, i, obj):

/usr/local/lib/python3.7/dist-packages/multiprocess/pool.py in worker()
    119         job, i, func, args, kwds = task
    120         try:
--> 121             result = (True, func(*args, **kwds))
    122         except Exception as e:
    123             if wrap_exception and func is not _helper_reraises_exception:

/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in wrapper()
    183         }
    184         # apply actual function
--> 185         out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
    186         datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
    187         # re-apply format to the output

/usr/local/lib/python3.7/dist-packages/datasets/fingerprint.py in wrapper()
    395             # Call actual function
    396 
--> 397             out = func(self, *args, **kwargs)
    398 
    399             # Update fingerprint of in-place transforms + update in-place history of transforms

/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in _map_single()
   2018                                 indices,
   2019                                 check_same_num_examples=len(input_dataset.list_indexes()) > 0,
-> 2020                                 offset=offset,
   2021                             )
   2022                         except NumExamplesMismatch:

/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in apply_function_on_filtered_inputs()
   1904                 effective_indices = [i + offset for i in indices] if isinstance(indices, list) else indices + offset
   1905             processed_inputs = (
-> 1906                 function(*fn_args, effective_indices, **fn_kwargs) if with_indices else function(*fn_args, **fn_kwargs)
   1907             )
   1908             if update_data is None:

<ipython-input-105-3cb463e63163> in preprocess_function()
     17     target_list = [label_to_id(label, label_list) for label in examples[output_column]]
     18 
---> 19     result = feature_extractor(speech_list, sampling_rate=target_sampling_rate)
     20     result["labels"] = list(target_list)
     21 

/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py in __call__()
    209         # zero-mean and unit-variance normalization
    210         if self.do_normalize:
--> 211             padded_inputs["input_values"] = self.zero_mean_unit_var_norm(
    212                 padded_inputs["input_values"], input_lengths=input_lengths
    213             )

/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py in zero_mean_unit_var_norm()
     85         """
     86         print(input_values)
---> 87         if isinstance(input_values[0], np.ndarray):
     88             input_values = [x.astype(np.float32) for x in input_values]
     89 

/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py in <listcomp>()
     85         """
---> 86         if isinstance(input_values[0], np.ndarray):
     87             input_values = [x.astype(np.float32) for x in input_values]
     88 

ValueError: setting an array element with a sequence.

I’m not sure what is going on here. When I print speech_list from the preprocess_function() function, I get this:


[array([[-1.3845960e-03, -1.5129161e-03, -1.3279491e-03, ...,
        -9.3758357e-04, -5.0248392e-04, -2.3690595e-04],
       [ 5.0415384e-04,  7.3929900e-06, -5.8541872e-04, ...,
         2.6963546e-04,  4.4448639e-04,  7.5516611e-04]], dtype=float32), array([[ 0.00750456,  0.00959514,  0.00922188, ..., -0.0017352 ,
        -0.0019784 , -0.00042148],
       [-0.0046173 ,  0.00029656,  0.01085352, ...,  0.00995941,
         0.0046006 ,  0.00151875]], dtype=float32), array([[ 0.00032558,  0.00044874, -0.00064546, ...,  0.00199648,
         0.00220139,  0.00113442],
       [ 0.01374926,  0.02029924,  0.02300985, ...,  0.02447655,
         0.024421  ,  0.00926847]], dtype=float32), array([[ 3.1322680e-04,  2.9084622e-05, -1.7249165e-04, ...,
        -1.0999236e-03, -1.4311116e-03, -3.1127129e-04],
       [-1.9904135e-03, -2.2752464e-03, -1.9129037e-03, ...,
        -7.8604842e-04, -1.6195733e-03, -3.5395977e-04]], dtype=float32), array([[-0.0018726 , -0.00167636, -0.0016572 , ..., -0.00041437,
         0.00060199,  0.0006947 ],
       [ 0.00442896,  0.0041303 ,  0.00259148, ...,  0.00126941,
         0.0004518 , -0.00026673]], dtype=float32), array([[-0.00154839, -0.00183026, -0.00170901, ..., -0.00169933,
        -0.00238513, -0.00154379],
       [ 0.00048418,  0.00074115,  0.00099551, ..., -0.05256891,
        -0.03463165, -0.01582825]], dtype=float32), array([[-4.3763156e-04, -1.5511583e-04,  1.5612959e-04, ...,
        -1.0198121e-04,  2.6510053e-05,  5.8304349e-06],
       [-2.4142796e-03, -2.7431613e-03, -1.9503339e-03, ...,
         1.9912045e-03,  1.8718862e-03,  3.3789902e-04]], dtype=float32), array([[ 2.5531935e-04,  2.9120210e-04,  1.8021779e-05, ...,
         9.6951338e-04,  1.1847753e-03,  3.6130843e-04],
       [-4.2422273e-04, -9.5154933e-04, -1.1366532e-03, ...,
         1.3966652e-03,  1.4367601e-03,  3.6545223e-04]], dtype=float32), array([[ 0.00049792,  0.00055293,  0.00043075, ..., -0.00584954,
        -0.00827   , -0.00197146],
       [-0.00125196, -0.00177683, -0.00116915, ..., -0.00643045,
        -0.00696308, -0.00153378]], dtype=float32), array([[-0.00286428, -0.00418009, -0.00461933, ...,  0.00096886,
         0.00105958,  0.00106084],
       [-0.00322456, -0.00440617, -0.00480009, ..., -0.00011426,
         0.0002051 ,  0.00059317]], dtype=float32)]

which is type <class 'list'>.

I see the error says it doesn’t accept lists, only strings or numbers, but this doesn’t make sense? The docs aren’t very clear about input to Wav2Vec2FeatureExtractor.

Please advise.

I have the same problem with another model, I can train the model but when I want to evaluate it I get that error! the training dataset and evaluation dataset are just like each other! so how it can train the model with one type of data but can not evaluate if using the same type? Weird!