Common Voice dataset: librosa.load() leads to LibsndfileError

Hello,

I'm using the script from (jonatasgrosman/wav2vec2-large-xlsr-53-german · Hugging Face)
and want to run the following lines:

def speech_file_to_array_fn(batch):
    """Load one example's audio file as an array and upper-case its transcript.

    Reads the file at ``batch["path"]`` with ``librosa.load`` (requesting a
    16 kHz sampling rate), stores the samples under ``batch["speech"]``,
    replaces ``batch["sentence"]`` with its upper-cased form, and returns the
    same ``batch`` mapping.
    """
    speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["sentence"].upper()
    return batch

# Apply the loader to every example of the dataset.
test_dataset = test_dataset.map(speech_file_to_array_fn)

But I get the following error message:

:4: UserWarning: PySoundFile failed. Trying audioread instead.
speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
/usr/local/lib/python3.9/dist-packages/librosa/core/audio.py:184: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)

LibsndfileError Traceback (most recent call last)
/usr/local/lib/python3.9/dist-packages/librosa/core/audio.py in load(path, sr, mono, offset, duration, dtype, res_type)
175 try:
--> 176 y, sr_native = __soundfile_load(path, offset, duration, dtype)
177

14 frames
/usr/local/lib/python3.9/dist-packages/librosa/core/audio.py in __soundfile_load(path, offset, duration, dtype)
208 # Otherwise, create the soundfile object
--> 209 context = sf.SoundFile(path)
210

/usr/local/lib/python3.9/dist-packages/soundfile.py in __init__(self, file, mode, samplerate, channels, subtype, endian, format, closefd)
657 format, subtype, endian)
--> 658 self._file = self._open(file, mode_int, closefd)
659 if set(mode).issuperset('r+') and self.seekable():

/usr/local/lib/python3.9/dist-packages/soundfile.py in _open(self, file, mode_int, closefd)
1215 err = _snd.sf_error(file_ptr)
--> 1216 raise LibsndfileError(err, prefix="Error opening {0!r}: ".format(self.name))
1217 if mode_int == _snd.SFM_WRITE:

LibsndfileError: Error opening '/root/.cache/huggingface/datasets/downloads/extracted/6b0ba8207861ddfcdcd3a0f3272216b81e825b855b7da5d3e8108f028172fe8f/common_voice_de_20540413.mp3': System error.

During handling of the above exception, another exception occurred:

FileNotFoundError Traceback (most recent call last)
in
----> 1 test_dataset = test_dataset.map(speech_file_to_array_fn)
2 inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

/usr/local/lib/python3.9/dist-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
561 self: "Dataset" = kwargs.pop("self")
562 # apply actual function
--> 563 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
564 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
565 for dataset in datasets:

/usr/local/lib/python3.9/dist-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
526 }
527 # apply actual function
--> 528 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
529 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
530 # re-apply format to the output

/usr/local/lib/python3.9/dist-packages/datasets/arrow_dataset.py in map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
2951 desc=desc or "Map",
2952 ) as pbar:
--> 2953 for rank, done, content in Dataset._map_single(**dataset_kwargs):
2954 if done:
2955 shards_done += 1

/usr/local/lib/python3.9/dist-packages/datasets/arrow_dataset.py in _map_single(shard, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)
3305 _time = time.time()
3306 for i, example in shard_iterable:
--> 3307 example = apply_function_on_filtered_inputs(example, i, offset=offset)
3308 if update_data:
3309 if i == 0:

/usr/local/lib/python3.9/dist-packages/datasets/arrow_dataset.py in apply_function_on_filtered_inputs(pa_inputs, indices, check_same_num_examples, offset)
3208 if with_rank:
3209 additional_args += (rank,)
--> 3210 processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
3211 if isinstance(processed_inputs, LazyDict):
3212 processed_inputs = {

in speech_file_to_array_fn(batch)
2 # We need to read the audio files as arrays
3 def speech_file_to_array_fn(batch):
----> 4 speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
5 batch["speech"] = speech_array
6 batch["sentence"] = batch["sentence"].upper()

/usr/local/lib/python3.9/dist-packages/librosa/core/audio.py in load(path, sr, mono, offset, duration, dtype, res_type)
182 "PySoundFile failed. Trying audioread instead.", stacklevel=2
183 )
--> 184 y, sr_native = __audioread_load(path, offset, duration, dtype)
185 else:
186 raise exc

in __audioread_load(path, offset, duration, dtype)

/usr/local/lib/python3.9/dist-packages/librosa/util/decorators.py in __wrapper(func, *args, **kwargs)
58 stacklevel=3, # Would be 2, but the decorator adds a level
59 )
--> 60 return func(*args, **kwargs)
61
62 return decorator(__wrapper)

/usr/local/lib/python3.9/dist-packages/librosa/core/audio.py in __audioread_load(path, offset, duration, dtype)
239 else:
240 # If the input was not an audioread object, try to open it
--> 241 reader = audioread.audio_open(path)
242
243 with reader as input_file:

/usr/local/lib/python3.9/dist-packages/audioread/__init__.py in audio_open(path, backends)
125 for BackendClass in backends:
126 try:
--> 127 return BackendClass(path)
128 except DecodeError:
129 pass

/usr/local/lib/python3.9/dist-packages/audioread/rawread.py in __init__(self, filename)
57 """
58 def __init__(self, filename):
--> 59 self._fh = open(filename, 'rb')
60
61 try:

FileNotFoundError: [Errno 2] No such file or directory: '/root/.cache/huggingface/datasets/downloads/extracted/6b0ba8207861ddfcdcd3a0f3272216b81e825b855b7da5d3e8108f028172fe8f/common_voice_de_20540413.mp3'

Beforehand, I ran the following cells:
!add-apt-repository -y ppa:savoury1/ffmpeg4
!apt-get -qq install -y ffmpeg

But the error still remains.

Thank you very much in advance for any hint.