I found a dataset on the Hub that I am interested in, but I am running into an error that has never happened to me before. I simply loaded the dataset and tried to filter out long audio, but got the error below. I also tried forking the dataset to my own repo, but it still doesn't work. Any ideas?
Traceback (most recent call last):
File "/work/hdd/beiq/haolong2/SICL/test.py", line 24, in <module>
ds = ds.filter(lambda x: x['audio'] is not None)
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/dataset_dict.py", line 1060, in filter
{
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/dataset_dict.py", line 1061, in <dictcomp>
k: dataset.filter(
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 557, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/fingerprint.py", line 442, in wrapper
out = func(dataset, *args, **kwargs)
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3746, in filter
indices = self.map(
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 557, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3079, in map
for rank, done, content in Dataset._map_single(**dataset_kwargs):
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3525, in _map_single
for i, batch in iter_outputs(shard_iterable):
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3475, in iter_outputs
yield i, apply_function(example, i, offset=offset)
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3398, in apply_function
processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 6423, in get_indices_from_mask_function
num_examples = len(batch[next(iter(batch.keys()))])
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/formatting/formatting.py", line 280, in __getitem__
value = self.format(key)
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/formatting/formatting.py", line 380, in format
return self.formatter.format_column(self.pa_table.select([key]))
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/formatting/formatting.py", line 460, in format_column
column = self.python_features_decoder.decode_column(column, pa_table.column_names[0])
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/formatting/formatting.py", line 226, in decode_column
return self.features.decode_column(column, column_name) if self.features else column
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/features/features.py", line 2122, in decode_column
[decode_nested_example(self[column_name], value) if value is not None else None for value in column]
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/features/features.py", line 2122, in <listcomp>
[decode_nested_example(self[column_name], value) if value is not None else None for value in column]
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/features/features.py", line 1414, in decode_nested_example
return schema.decode_example(obj, token_per_repo_id=token_per_repo_id) if obj is not None else None
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/datasets/features/audio.py", line 184, in decode_example
array, sampling_rate = sf.read(file)
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/soundfile.py", line 307, in read
frames = f._prepare_read(start, stop, frames)
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/soundfile.py", line 1431, in _prepare_read
self.seek(start, SEEK_SET)
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/soundfile.py", line 849, in seek
_error_check(self._errorcode)
File "/u/haolong2/.conda/envs/csr4rsr-dev/lib/python3.10/site-packages/soundfile.py", line 1480, in _error_check
raise LibsndfileError(err, prefix=prefix)
soundfile.LibsndfileError: Internal psf_fseek() failed.
To reproduce the error, run the code below:
dataset_id = "NathanRoll/hisp-eng"
ds = load_dataset(dataset_id, verification_mode='no_checks')
print(ds)
print(ds['train'][0]["audio"])
ds = ds.filter(lambda x: x['audio'] is not None)
print(ds)