I’m using wav2vec2 for emotion classification (following @m3hrdadfi’s notebook). In the dataset preprocessing step using .map()
, it throws an error, and I’m not sure what is triggering it in the first place. So, any pointer resolving it would be much appreciated. Thanks!
(also, gently pinging @lhoestq and @patrickvonplaten)
Code Reference:
# Loading the created dataset using datasets
from datasets import load_dataset, load_metric
data_files = {
"train": "/content/data/train.csv",
"validation": "/content/data/test.csv",
}
dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]
# We need to specify the input and output column
input_column = "path"
output_column = "emotion"
# we need to distinguish the unique labels in our SER dataset
label_list = train_dataset.unique(output_column)
label_list.sort() # Let's sort it for determinism
num_labels = len(label_list)
# print(f"A classification problem with {num_labels} classes: {label_list}")
from transformers import AutoConfig, Wav2Vec2Processor
model_name_or_path = "arijitx/wav2vec2-large-xlsr-bengali"
pooling_mode = "mean"
# config
config = AutoConfig.from_pretrained(
model_name_or_path,
num_labels=num_labels,
label2id={label: i for i, label in enumerate(label_list)},
id2label={i: label for i, label in enumerate(label_list)},
finetuning_task="wav2vec2_clf",
)
setattr(config, 'pooling_mode', pooling_mode)
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
target_sampling_rate = processor.feature_extractor.sampling_rate
# print(f"The target sampling rate: {target_sampling_rate}")
"""So far, we downloaded, loaded, and split the SER dataset into train and test sets. The instantiated our strategy configuration for using context representations in our classification problem SER. Now, we need to extract features from the audio path in context representation tensors and feed them into our classification model to determine the emotion in the speech.
Since the audio file is saved in the `.wav` format, it is easy to use **[Librosa](https://librosa.org/doc/latest/index.html)** or others, but we suppose that the format may be in the `.mp3` format in case of generality. We found that the **[Torchaudio](https://pytorch.org/audio/stable/index.html)** library works best for reading in `.mp3` data.
An audio file usually stores both its values and the sampling rate with which the speech signal was digitalized. We want to store both in the dataset and write a **map(...)** function accordingly. Also, we need to handle the string labels into integers for our specific classification task in this case, the **single-label classification** you may want to use for your **regression** or even **multi-label classification**.
"""
def speech_file_to_array_fn(path):
speech_array, sampling_rate = torchaudio.load(path)
resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
speech = resampler(speech_array).squeeze().numpy()
return speech
def label_to_id(label, label_list):
if len(label_list) > 0:
return label_list.index(label) if label in label_list else -1
return label
def preprocess_function(examples):
speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
target_list = [label_to_id(label, label_list) for label in examples[output_column]]
result = processor(speech_list, sampling_rate=target_sampling_rate)
result["labels"] = list(target_list)
return result
train_dataset = train_dataset.map(
preprocess_function,
batch_size=8,
batched=True,
# num_proc=4
)
eval_dataset = eval_dataset.map(
preprocess_function,
batch_size=8,
batched=True,
# num_proc=4
)
Error Reference:
/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
return array(a, dtype, copy=False, order=order)
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
<ipython-input-47-fc0bb5a1a9e1> in <module>()
2 preprocess_function,
3 batch_size=8,
----> 4 batched=True,
5 # num_proc=4
6 )
13 frames
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in map(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
1684 new_fingerprint=new_fingerprint,
1685 disable_tqdm=disable_tqdm,
-> 1686 desc=desc,
1687 )
1688 else:
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
183 }
184 # apply actual function
--> 185 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
186 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
187 # re-apply format to the output
/usr/local/lib/python3.7/dist-packages/datasets/fingerprint.py in wrapper(*args, **kwargs)
395 # Call actual function
396
--> 397 out = func(self, *args, **kwargs)
398
399 # Update fingerprint of in-place transforms + update in-place history of transforms
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in _map_single(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset, disable_tqdm, desc)
2036 else:
2037 batch = cast_to_python_objects(batch)
-> 2038 writer.write_batch(batch)
2039 if update_data and writer is not None:
2040 writer.finalize() # close_stream=bool(buf_writer is None)) # We only close if we are writing in a file
/usr/local/lib/python3.7/dist-packages/datasets/arrow_writer.py in write_batch(self, batch_examples, writer_batch_size)
401 typed_sequence = OptimizedTypedSequence(batch_examples[col], type=col_type, try_type=col_try_type, col=col)
402 typed_sequence_examples[col] = typed_sequence
--> 403 pa_table = pa.Table.from_pydict(typed_sequence_examples)
404 self.write_table(pa_table, writer_batch_size)
405
/usr/local/lib/python3.7/dist-packages/pyarrow/table.pxi in pyarrow.lib.Table.from_pydict()
/usr/local/lib/python3.7/dist-packages/pyarrow/array.pxi in pyarrow.lib.asarray()
/usr/local/lib/python3.7/dist-packages/pyarrow/array.pxi in pyarrow.lib.array()
/usr/local/lib/python3.7/dist-packages/pyarrow/array.pxi in pyarrow.lib._handle_arrow_array_protocol()
/usr/local/lib/python3.7/dist-packages/datasets/arrow_writer.py in __arrow_array__(self, type)
105 out = numpy_to_pyarrow_listarray(self.data)
106 else:
--> 107 out = pa.array(self.data, type=type)
108 if trying_type and out[0].as_py() != self.data[0]:
109 raise TypeError(
/usr/local/lib/python3.7/dist-packages/pyarrow/array.pxi in pyarrow.lib.array()
/usr/local/lib/python3.7/dist-packages/pyarrow/array.pxi in pyarrow.lib._sequence_to_array()
/usr/local/lib/python3.7/dist-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status()
/usr/local/lib/python3.7/dist-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
ArrowInvalid: Can only convert 1-dimensional array values