Hey!
I created a dataset repository with the following structure:

data/
    subset1/
        files.zip
        metadata.zip
    subset2/
        files.zip
        metadata.zip

- the files.zip contains ~500 .ogg audio files
- the metadata.zip contains a train.csv and a test.csv metadata file
I need a custom dataset loading script because there are multiple sub-tasks:
loading_script.py:
import os

import datasets
import pandas as pd


class SetConfig(datasets.BuilderConfig):
    def __init__(self, citation, features=None, **kwargs):
        super().__init__(version=datasets.Version("0.0.1"), **kwargs)
        # Defined here once so the feature dict does not have to be repeated in every config.
        if features is None:
            features = datasets.Features({
                "audio": datasets.Audio(sampling_rate=32_000, mono=True, decode=True),
                "label": datasets.Value("string"),
                "file": datasets.Value("string"),
                "source": datasets.Value("string"),
                "start_time": datasets.Value("string"),  # can be changed to timestamp later
                "end_time": datasets.Value("string"),
                "local_time": datasets.Value("string"),
                "events": datasets.Sequence(datasets.Value("string")),
            })
        self.features = features
        self.citation = citation
class Set(datasets.GeneratorBasedBuilder):
    # DESCRIPTION1/2, CITATION1/2, _SET_DESCRIPTION and _SET_CITATION are string
    # constants defined elsewhere in the script.
    BUILDER_CONFIGS = [
        SetConfig(
            name="subset1",
            description=DESCRIPTION1,
            citation=CITATION1,
            data_dir="data/subset1",
        ),
        SetConfig(
            name="subset2",
            description=DESCRIPTION2,
            citation=CITATION2,
            data_dir="data/subset2",
        ),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_SET_DESCRIPTION + self.config.description,
            features=self.config.features,
            citation=self.config.citation + "\n" + _SET_CITATION,
        )
    def _split_generators(self, dl_manager):
        # Download and extract both archives for the selected config.
        dl_dir = dl_manager.download_and_extract({
            "files": os.path.join(self.config.data_dir, "files.zip"),
            "metadata": os.path.join(self.config.data_dir, "metadata.zip"),
        })
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_dir": dl_dir["files"],
                    "metapath": os.path.join(dl_dir["metadata"], "train.csv"),
                    "split": datasets.Split.TRAIN,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "data_dir": dl_dir["files"],
                    "metapath": os.path.join(dl_dir["metadata"], "test.csv"),
                    "split": datasets.Split.TEST,
                },
            ),
        ]
    def _generate_examples(self, data_dir, metapath, split):
        # One example per row of the split's metadata CSV.
        metadata = pd.read_csv(metapath)
        for key, row in metadata.iterrows():
            audio_path = os.path.join(data_dir, row["file_name"])
            yield key, {
                "audio": audio_path,
                "label": row["ebird_code"],
                "file": audio_path,
                "source": "xeno-canto",
                "start_time": None,
                "end_time": None,
                "local_time": None,
                "events": None,
            }
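For context, _generate_examples only relies on two columns in the metadata CSVs, file_name and ebird_code. A minimal sketch of the layout train.csv is expected to have (the file names and labels below are made up):

import pandas as pd

# Hypothetical metadata matching what _generate_examples reads:
# "file_name" is the audio file's path inside files.zip,
# "ebird_code" is the species label.
pd.DataFrame({
    "file_name": ["XC000001.ogg", "XC000002.ogg"],
    "ebird_code": ["comcha", "eurbla"],
}).to_csv("train.csv", index=False)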
When I clone this repository and call the loading script directly, everything works:
ds = load_dataset("cloned_repo/loading_script.py", "subset1")
I get the expected output:
DatasetDict({
    train: Dataset({
        features: ['audio', 'label', 'file', 'source', 'start_time', 'end_time', 'local_time', 'events'],
        num_rows: 400
    })
    test: Dataset({
        features: ['audio', 'label', 'file', 'source', 'start_time', 'end_time', 'local_time', 'events'],
        num_rows: 100
    })
})
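(As a quick sanity check, both splits and all eight features can be confirmed like this, assuming the ds from the call above:)

# Both splits should be present with all eight features
print(ds["train"].features)
print(ds["train"].num_rows, ds["test"].num_rows)  # 400 100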
However, if I load from the cloned repository itself instead of calling the script directly:
ds = load_dataset("cloned_repo", "subset1")
the splits somehow do not work, most features disappear, and I get the following:
DatasetDict({
    train: Dataset({
        features: ['audio'],
        num_rows: 500
    })
})
The same issue occurs when I load it directly from the Hub, without cloning the repo:
ds = load_dataset("datasets/repo", "subset1")
DatasetDict({
    train: Dataset({
        features: ['audio'],
        num_rows: 500
    })
})
I'd guess this is something that can be solved easily. Any ideas? I really appreciate any help you can provide.