Help with speech dataset loading script

elsayedissa · November 27, 2023, 3:35pm

Hello, I have the following structure of a speech dataset:

mydataset/
      data/
           files.zip
           metadata.zip
      mydataset.py

where files.zip has all the wav files, and metadata.zip has two files train.csv and test.csv

Each of the csv files is organized as follows:

audio,text
audio1.wav, hello world

I have the following script:

import os
import datasets
import pandas as pd 

_DESCRIPTION = "Some description"

_CITATION = "Some citation"

_data_dir = "data"

class MyDataset(datasets.GeneratorBasedBuilder):
   
    def _info(self):
        return datasets.DatasetInfo(

            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "path": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "sentence": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):

        download_dir = dl_manager.download_and_extract({
                "files": os.path.join(_data_dir,"files.zip"),
                "metadata": os.path.join(_data_dir, "metadata.zip")
            })

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "split": datasets.Split.TRAIN,
                    "data_dir": os.path.join(download_dir["files"]),
                    "meta_path": os.path.join(download_dir["metadata"], "train.csv"),
                    
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "split": datasets.Split.TEST
                    "data_dir": os.path.join(download_dir["files"]),
                    "meta_path": os.path.join(download_dir["metadata"], "test.csv"),
                    
                },
            ),
        ]

    def _generate_examples(self, data_dir, meta_path, split):
        metadata = pd.read_csv(meta_path)
        for key, row in metadata.iterrows():
            audio_path = os.path.join(data_dir, row["file"])
            yield key, {
                "audio": audio_path,
                "sentence": row["text"],
                "path": audio_path,
            }

I always get the following error. I hope you can help me with it.

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
File ~/anaconda3/lib/python3.10/site-packages/datasets/builder.py:1676, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1675 _time = time.time()
-> 1676 for key, record in generator:
   1677     if max_shard_size is not None and writer._num_bytes > max_shard_size:

File ~/.cache/huggingface/modules/datasets_modules/datasets/elsayedissa--mydataset/90fcdeb92afe7e90a40a28fd4ddecdabc82c22a901294cf40e1d928b12af0594/mydataset.py:56, in MyDataset._generate_examples(self, data_dir, metapath, split)
     55 def _generate_examples(self, data_dir, metapath, split):
---> 56     metadata = pd.read_csv(metapath)
     57     for key, row in metadata.iterrows():

File ~/anaconda3/lib/python3.10/site-packages/datasets/streaming.py:74, in extend_module_for_streaming.<locals>.wrap_auth.<locals>.wrapper(*args, **kwargs)
     72 @wraps(function)
     73 def wrapper(*args, **kwargs):
---> 74     return function(*args, download_config=download_config, **kwargs)

File ~/anaconda3/lib/python3.10/site-packages/datasets/download/streaming_download_manager.py:765, in xpandas_read_csv(filepath_or_buffer, download_config, **kwargs)
    764     kwargs["compression"] = _get_extraction_protocol(filepath_or_buffer, download_config=download_config)
--> 765 return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)

File ~/anaconda3/lib/python3.10/site-packages/datasets/download/streaming_download_manager.py:491, in xopen(file, mode, download_config, *args, **kwargs)
    490 if is_local_path(main_hop):
--> 491     return open(main_hop, mode, *args, **kwargs)
    492 # add headers and cookies for authentication on the HF Hub and for Google Drive

FileNotFoundError: [Errno 2] No such file or directory: '/Users/elsayedissa/.cache/huggingface/datasets/downloads/extracted/04b7ac5a758780150074d4be11c31288ebb1c8116b3613aec7435cef8038972a/train.csv'

The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)
Cell In[2], line 1
----> 1 x = load_dataset("elsayedissa/mydataset")

File ~/anaconda3/lib/python3.10/site-packages/datasets/load.py:2136, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
   2133 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   2135 # Download and prepare data
-> 2136 builder_instance.download_and_prepare(
   2137     download_config=download_config,
   2138     download_mode=download_mode,
   2139     verification_mode=verification_mode,
   2140     try_from_hf_gcs=try_from_hf_gcs,
   2141     num_proc=num_proc,
   2142     storage_options=storage_options,
   2143 )
   2145 # Build dataset for splits
   2146 keep_in_memory = (
   2147     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   2148 )

File ~/anaconda3/lib/python3.10/site-packages/datasets/builder.py:954, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    952     if num_proc is not None:
    953         prepare_split_kwargs["num_proc"] = num_proc
--> 954     self._download_and_prepare(
    955         dl_manager=dl_manager,
    956         verification_mode=verification_mode,
    957         **prepare_split_kwargs,
    958         **download_and_prepare_kwargs,
    959     )
    960 # Sync info
    961 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File ~/anaconda3/lib/python3.10/site-packages/datasets/builder.py:1717, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs)
   1716 def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs):
-> 1717     super()._download_and_prepare(
   1718         dl_manager,
   1719         verification_mode,
   1720         check_duplicate_keys=verification_mode == VerificationMode.BASIC_CHECKS
   1721         or verification_mode == VerificationMode.ALL_CHECKS,
   1722         **prepare_splits_kwargs,
   1723     )

File ~/anaconda3/lib/python3.10/site-packages/datasets/builder.py:1049, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
   1045 split_dict.add(split_generator.split_info)
   1047 try:
   1048     # Prepare split will record examples associated to the split
-> 1049     self._prepare_split(split_generator, **prepare_split_kwargs)
   1050 except OSError as e:
   1051     raise OSError(
   1052         "Cannot find data file. "
   1053         + (self.manual_download_instructions or "")
   1054         + "\nOriginal error:\n"
   1055         + str(e)
   1056     ) from None

File ~/anaconda3/lib/python3.10/site-packages/datasets/builder.py:1555, in GeneratorBasedBuilder._prepare_split(self, split_generator, check_duplicate_keys, file_format, num_proc, max_shard_size)
   1553 job_id = 0
   1554 with pbar:
-> 1555     for job_id, done, content in self._prepare_split_single(
   1556         gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1557     ):
   1558         if done:
   1559             result = content

File ~/anaconda3/lib/python3.10/site-packages/datasets/builder.py:1712, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1710     if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
   1711         e = e.__context__
-> 1712     raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1714 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset

mariosasko · November 28, 2023, 4:05pm

I’ve opened a PR to fix the script: elsayedissa/mydataset · Fix dataset script

elsayedissa · November 28, 2023, 7:30pm

Thank you so much @mariosasko

Topic		Replies	Views
Dataset loading script for an audio dataset 🤗Datasets	5	701	September 2, 2022
My dataset loading script is not working 🤗Datasets	3	863	September 15, 2022
Not able to use Custom Speech Data for training ASR 🤗Datasets	2	324	September 20, 2023
Using load_datasets for newly created datasets 🤗Datasets	2	498	August 27, 2021
DatasetGenerationError while loading dataset Beginners	3	2263	October 26, 2023

Help with speech dataset loading script

Related topics