Setting an array element with a sequence when using Hugging Face datasets map() in a Colab notebook

I am trying to run a Colab notebook that uses the Hugging Face datasets library's Dataset class. It is here:

The notebook runs perfectly as-is, but I am trying to change the dataset it uses.

I’ve loaded a dataset and am trying to apply a map() function to it.

Here is my code:

import torchaudio
from transformers import Wav2Vec2FeatureExtractor

model_name_or_path = "facebook/wav2vec2-base-100k-voxpopuli"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
target_sampling_rate = feature_extractor.sampling_rate
print(feature_extractor)

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}
def speech_file_to_array_fn(path):
    speech_array, sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

def label_to_id(label, label_list):

    if len(label_list) > 0:
        return label_list.index(label) if label in label_list else -1

    return label

def preprocess_function(examples):
    speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
    print(speech_list)
    target_list = [label_to_id(label, label_list) for label in examples[output_column]]
    print(type(speech_list))
    result = feature_extractor(speech_list, sampling_rate=target_sampling_rate)

    result["labels"] = list(target_list)

    return result

# Remove this part to use the full dataset; for now it keeps only a small subset
max_samples = 100
train_dataset = train_dataset.select(range(max_samples))
eval_dataset = eval_dataset.select(range(max_samples))

train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=10,
    batched=True,
    num_proc=4
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batch_size=10,
    batched=True,
    num_proc=4
)
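
For reference, here is my understanding of what batched map() hands to the function: a dict of column name -> list of values, up to batch_size items at a time (a toy, self-contained sketch, not the notebook's actual data):

from datasets import Dataset

# Toy dataset just to illustrate the batched calling convention.
toy = Dataset.from_dict({"path": ["a.wav", "b.wav", "c.wav"]})

def show_batch(examples):
    print(examples)                      # e.g. {'path': ['a.wav', 'b.wav']}
    return {"n_chars": [len(p) for p in examples["path"]]}

toy = toy.map(show_batch, batched=True, batch_size=2)
print(toy["n_chars"])                    # [5, 5, 5]

So, as I understand it, preprocess_function() receives up to 10 paths and labels at once, which is why speech_list is a list.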

After running the map() calls above, here is the error I get:


/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
  return array(a, dtype, copy=False, order=order)
(the warning above appears four times in the output)
---------------------------------------------------------------------------
RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
TypeError: float() argument must be a string or a number, not 'list'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/multiprocess/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py", line 185, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/datasets/fingerprint.py", line 397, in wrapper
    out = func(self, *args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py", line 2020, in _map_single
    offset=offset,
  File "/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py", line 1906, in apply_function_on_filtered_inputs
    function(*fn_args, effective_indices, **fn_kwargs) if with_indices else function(*fn_args, **fn_kwargs)
  File "<ipython-input-105-3cb463e63163>", line 19, in preprocess_function
    result = feature_extractor(speech_list, sampling_rate=target_sampling_rate)
  File "/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py", line 211, in __call__
    padded_inputs["input_values"] = self.zero_mean_unit_var_norm(
  File "/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py", line 87, in zero_mean_unit_var_norm
    if isinstance(input_values[0], np.ndarray):
  File "/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py", line 87, in <listcomp>
    if isinstance(input_values[0], np.ndarray):
ValueError: setting an array element with a sequence.
"""

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
<ipython-input-109-28e13eb2d6de> in <module>()
      3     batch_size=10,
      4     batched=True,
----> 5     num_proc=4
      6 )
      7 eval_dataset = eval_dataset.map(

/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in map(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
   1742                 logger.info("Spawning {} processes".format(num_proc))
   1743                 results = [pool.apply_async(self.__class__._map_single, kwds=kwds) for kwds in kwds_per_shard]
-> 1744                 transformed_shards = [r.get() for r in results]
   1745                 logger.info("Concatenating {} shards from multiprocessing".format(num_proc))
   1746                 result = concatenate_datasets(transformed_shards)

/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in <listcomp>(.0)
   1742                 logger.info("Spawning {} processes".format(num_proc))
   1743                 results = [pool.apply_async(self.__class__._map_single, kwds=kwds) for kwds in kwds_per_shard]
-> 1744                 transformed_shards = [r.get() for r in results]
   1745                 logger.info("Concatenating {} shards from multiprocessing".format(num_proc))
   1746                 result = concatenate_datasets(transformed_shards)

/usr/local/lib/python3.7/dist-packages/multiprocess/pool.py in get(self, timeout)
    655             return self._value
    656         else:
--> 657             raise self._value
    658 
    659     def _set(self, i, obj):

/usr/local/lib/python3.7/dist-packages/multiprocess/pool.py in worker()
    119         job, i, func, args, kwds = task
    120         try:
--> 121             result = (True, func(*args, **kwds))
    122         except Exception as e:
    123             if wrap_exception and func is not _helper_reraises_exception:

/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in wrapper()
    183         }
    184         # apply actual function
--> 185         out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
    186         datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
    187         # re-apply format to the output

/usr/local/lib/python3.7/dist-packages/datasets/fingerprint.py in wrapper()
    395             # Call actual function
    396 
--> 397             out = func(self, *args, **kwargs)
    398 
    399             # Update fingerprint of in-place transforms + update in-place history of transforms

/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in _map_single()
   2018                                 indices,
   2019                                 check_same_num_examples=len(input_dataset.list_indexes()) > 0,
-> 2020                                 offset=offset,
   2021                             )
   2022                         except NumExamplesMismatch:

/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in apply_function_on_filtered_inputs()
   1904                 effective_indices = [i + offset for i in indices] if isinstance(indices, list) else indices + offset
   1905             processed_inputs = (
-> 1906                 function(*fn_args, effective_indices, **fn_kwargs) if with_indices else function(*fn_args, **fn_kwargs)
   1907             )
   1908             if update_data is None:

<ipython-input-105-3cb463e63163> in preprocess_function()
     17     target_list = [label_to_id(label, label_list) for label in examples[output_column]]
     18 
---> 19     result = feature_extractor(speech_list, sampling_rate=target_sampling_rate)
     20     result["labels"] = list(target_list)
     21 

/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py in __call__()
    209         # zero-mean and unit-variance normalization
    210         if self.do_normalize:
--> 211             padded_inputs["input_values"] = self.zero_mean_unit_var_norm(
    212                 padded_inputs["input_values"], input_lengths=input_lengths
    213             )

/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py in zero_mean_unit_var_norm()
     85         """
     86         print(input_values)
---> 87         if isinstance(input_values[0], np.ndarray):
     88             input_values = [x.astype(np.float32) for x in input_values]
     89 

/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py in <listcomp>()
     85         """
---> 86         if isinstance(input_values[0], np.ndarray):
     87             input_values = [x.astype(np.float32) for x in input_values]
     88 

ValueError: setting an array element with a sequence.

I’m not sure what is going on here. When I print speech_list inside preprocess_function(), I get this:


[array([[-1.3845960e-03, -1.5129161e-03, -1.3279491e-03, ...,
        -9.3758357e-04, -5.0248392e-04, -2.3690595e-04],
       [ 5.0415384e-04,  7.3929900e-06, -5.8541872e-04, ...,
         2.6963546e-04,  4.4448639e-04,  7.5516611e-04]], dtype=float32), array([[ 0.00750456,  0.00959514,  0.00922188, ..., -0.0017352 ,
        -0.0019784 , -0.00042148],
       [-0.0046173 ,  0.00029656,  0.01085352, ...,  0.00995941,
         0.0046006 ,  0.00151875]], dtype=float32), array([[ 0.00032558,  0.00044874, -0.00064546, ...,  0.00199648,
         0.00220139,  0.00113442],
       [ 0.01374926,  0.02029924,  0.02300985, ...,  0.02447655,
         0.024421  ,  0.00926847]], dtype=float32), array([[ 3.1322680e-04,  2.9084622e-05, -1.7249165e-04, ...,
        -1.0999236e-03, -1.4311116e-03, -3.1127129e-04],
       [-1.9904135e-03, -2.2752464e-03, -1.9129037e-03, ...,
        -7.8604842e-04, -1.6195733e-03, -3.5395977e-04]], dtype=float32), array([[-0.0018726 , -0.00167636, -0.0016572 , ..., -0.00041437,
         0.00060199,  0.0006947 ],
       [ 0.00442896,  0.0041303 ,  0.00259148, ...,  0.00126941,
         0.0004518 , -0.00026673]], dtype=float32), array([[-0.00154839, -0.00183026, -0.00170901, ..., -0.00169933,
        -0.00238513, -0.00154379],
       [ 0.00048418,  0.00074115,  0.00099551, ..., -0.05256891,
        -0.03463165, -0.01582825]], dtype=float32), array([[-4.3763156e-04, -1.5511583e-04,  1.5612959e-04, ...,
        -1.0198121e-04,  2.6510053e-05,  5.8304349e-06],
       [-2.4142796e-03, -2.7431613e-03, -1.9503339e-03, ...,
         1.9912045e-03,  1.8718862e-03,  3.3789902e-04]], dtype=float32), array([[ 2.5531935e-04,  2.9120210e-04,  1.8021779e-05, ...,
         9.6951338e-04,  1.1847753e-03,  3.6130843e-04],
       [-4.2422273e-04, -9.5154933e-04, -1.1366532e-03, ...,
         1.3966652e-03,  1.4367601e-03,  3.6545223e-04]], dtype=float32), array([[ 0.00049792,  0.00055293,  0.00043075, ..., -0.00584954,
        -0.00827   , -0.00197146],
       [-0.00125196, -0.00177683, -0.00116915, ..., -0.00643045,
        -0.00696308, -0.00153378]], dtype=float32), array([[-0.00286428, -0.00418009, -0.00461933, ...,  0.00096886,
         0.00105958,  0.00106084],
       [-0.00322456, -0.00440617, -0.00480009, ..., -0.00011426,
         0.0002051 ,  0.00059317]], dtype=float32)]

which is type <class 'list'>.
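
Each element also looks like a 2-D float32 array with two rows (one per audio channel, I assume), and the number of samples differs from file to file. As far as I can tell, numpy refuses to pack that kind of ragged list into a single float array (a toy sketch with zeros, not my actual data):

import numpy as np

# Toy stand-in for speech_list: 2-D float32 arrays (one row per channel),
# with a different number of samples per file.
ragged = [np.zeros((2, 48000), dtype=np.float32),
          np.zeros((2, 52000), dtype=np.float32)]

print([a.shape for a in ragged])   # [(2, 48000), (2, 52000)]

# Forcing these into one float32 array raises
# "ValueError: setting an array element with a sequence."
np.asarray(ragged, dtype=np.float32)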

I see the error says float() only accepts a string or a number, not a list, but I don't understand why that applies here. The docs also aren't very clear about what kind of input Wav2Vec2FeatureExtractor expects.
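
For reference, this is what I understood the expected input to be from the docs: a list of 1-D float arrays, one per utterance (a minimal sketch with synthetic mono audio, not my dataset):

import numpy as np
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "facebook/wav2vec2-base-100k-voxpopuli"
)

# Synthetic mono (1-D) utterances of different lengths; this is the shape I
# believe the extractor expects for each item in a batch.
dummy_speech = [np.random.randn(16000).astype(np.float32),
                np.random.randn(24000).astype(np.float32)]

result = feature_extractor(dummy_speech, sampling_rate=16000)
print(len(result["input_values"]))  # 2, one entry per utterance

Is the problem that my arrays are 2-D (stereo) rather than 1-D like this?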

Please advise.