Dataset loading script not working

Hey!

I created a dataset repository with the structure:

data/
   subset1/
      files.zip
      metadata.zip
  subset2/
  • the files.zip contains ~500 .ogg audio files
  • the metadata.zip contains a train.csv and test.csv metadata file

I need a custom dataset loading script because there are multiple sub-tasks:

loading_script.py:

import os
import datasets
import pandas as pd 

class SetConfig(datasets.BuilderConfig):
    """BuilderConfig for one subset of the dataset.

    Centralizes the default feature schema so it does not have to be
    repeated in every entry of ``BUILDER_CONFIGS``.
    """

    def __init__(self, citation, features=None, **kwargs):
        """
        Args:
            citation: Citation string for this subset.
            features: Optional ``datasets.Features``; when ``None`` the
                shared default schema below is used.
            **kwargs: Forwarded to ``datasets.BuilderConfig`` (name,
                description, data_dir, ...).
        """
        super().__init__(version=datasets.Version("0.0.1"), **kwargs)

        # Shared default schema (outsourced here to reduce redundancy);
        # an individual config may still override it via `features`.
        if features is None:
            features = datasets.Features({
                "audio": datasets.Audio(sampling_rate=32_000, mono=True, decode=True),
                "label": datasets.Value("string"),
                "file": datasets.Value("string"),
                "source": datasets.Value("string"),
                "start_time": datasets.Value("string"),  # can be changed to timestamp later
                "end_time": datasets.Value("string"),
                "local_time": datasets.Value("string"),
                "events": datasets.Sequence(datasets.Value("string")),
            })

        self.features = features
        self.citation = citation


class Set(datasets.GeneratorBasedBuilder):
    """Multi-subset audio dataset builder.

    Each config points at a ``data/<subset>`` directory in the repository
    containing a ``files.zip`` (audio files) and a ``metadata.zip``
    (``train.csv`` / ``test.csv``).

    NOTE: for ``load_dataset("<repo>")`` (without naming the script) to use
    this builder, the script's filename must match the dataset/repository
    name; otherwise `datasets` treats the repo as a no-code dataset and
    infers splits from the raw data files instead.
    """

    BUILDER_CONFIGS = [
        SetConfig(
            name="subset1",
            description=DESCRIPTION1,
            citation=CITATION1,
            data_dir="data/subset1",
        ),
        SetConfig(
            name="subset2",
            description=DESCRIPTION2,
            # FIX: was `citation=DESCRIPTION2` — copy-paste typo; the
            # citation constant for subset2 is intended here.
            citation=CITATION2,
            data_dir="data/subset2",
        ),
    ]

    def _info(self):
        """Return the DatasetInfo for the currently selected config."""
        return datasets.DatasetInfo(
            description=_SET_DESCRIPTION + self.config.description,
            features=self.config.features,
            citation=self.config.citation + "\n" + _SET_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download/extract this subset's archives and define train/test splits."""
        dl_dir = dl_manager.download_and_extract({
            "files": os.path.join(self.config.data_dir, "files.zip"),
            "metadata": os.path.join(self.config.data_dir, "metadata.zip"),
        })

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    # os.path.join() with a single argument was a no-op.
                    "data_dir": dl_dir["files"],
                    "metapath": os.path.join(dl_dir["metadata"], "train.csv"),
                    "split": datasets.Split.TRAIN,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "data_dir": dl_dir["files"],
                    "metapath": os.path.join(dl_dir["metadata"], "test.csv"),
                    "split": datasets.Split.TEST,
                },
            ),
        ]

    def _generate_examples(self, data_dir, metapath, split):
        """Yield ``(key, example)`` pairs for one split.

        Args:
            data_dir: Directory containing the extracted audio files.
            metapath: Path to the split's metadata CSV (must contain
                ``file_name`` and ``ebird_code`` columns).
            split: The ``datasets.Split`` being generated (unused here,
                kept for gen_kwargs symmetry).
        """
        metadata = pd.read_csv(metapath)
        for key, row in metadata.iterrows():
            audio_path = os.path.join(data_dir, row["file_name"])
            yield key, {
                "audio": audio_path,
                "label": row["ebird_code"],
                "file": audio_path,
                "source": "xeno-canto",
                # Fields below are not present in the CSV yet; left empty
                # so the schema stays uniform across subsets.
                "start_time": None,
                "end_time": None,
                "local_time": None,
                "events": None,
            }

When I clone this repository and call the loading script directly everything works:

ds = load_dataset("cloned_repo/loading_script.py", "subset1")

I get the expected output:

 DatasetDict({
    train: Dataset({
        features: ['audio', 'label', 'file', 'source', 'start_time', 'end_time', 'local_time', 'events'],
        num_rows: 400
    })
    test: Dataset({
        features: ['audio', 'label', 'file', 'source', 'start_time', 'end_time', 'local_time', 'events'],
        num_rows: 100
    })
})

However, if I load the cloned repository itself instead of calling the script directly:
ds = load_dataset("cloned_repo", "subset1")

The split somehow does not work and I get the following:

DatasetDict({
    train: Dataset({
        features: ['audio'],
        num_rows: 500
    })
})

The same issue occurs when I load it directly from the hub without cloning the repo:
ds = load_dataset("datasets/repo", "subset1")

DatasetDict({
    train: Dataset({
        features: ['audio'],
        num_rows: 500
    })
})

I guess that it’s something that can easily be solved. Any ideas? I really appreciate any help you can provide.

What is the loading script’s name? It has to match the dataset name (which, for a local dataset, is the name of the parent directory). If it doesn’t match, `datasets` assumes the repository is a no-code dataset and falls back to the supported raw data file formats to determine the dataset type and splits.

2 Likes

:hugs: Thank you very much! This was the oversight on my side. It works now :slight_smile: