I am trying to use my custom script when calling load_dataset to download a dataset from my private Google Drive. I always get an error when iterating over archive_iterator in the _generate_examples method.
Requesting assistance here.
Error:
File ~.cache\huggingface\modules\datasets_modules\datasets\datadownload\f8ac21f799984c3939d0b941bcbad45911cfe7b9960ad59a4fbd0631ff00ac53\datadownload.py:131, in MusdbDataset._generate_examples(self, local_extracted_archive, archive_iterator, metadata_filepath, path_to_clips)
    130 print(archive_iterator)
--> 131 for path, f in archive_iterator:
    132     # Parse the metadata CSV file
    133     logging.info('XXXX Inside archive_iterator and below is path and f')
Call to function from notebook:
datasets.config.DEFAULT_MAX_BATCH_SIZE = 10
dataset = load_dataset("datadownload.py")
The custom datadownload.py script:
import datasets
from pathlib import Path
import numpy as np
from datasets.tasks import AutomaticSpeechRecognition
import os
import logging
# Route every log record (DEBUG and above) to app.log, truncating the file on
# each run. NOTE: basicConfig is a no-op if the root logger already has handlers.
logging.basicConfig(
    level=logging.DEBUG,
    filename="app.log",
    filemode="w",
    format="%(name)s - %(levelname)s - %(message)s",
)

# Human-readable summary passed to datasets.DatasetInfo(description=...).
_DESCRIPTION = "New Common Voice COmbined dataset"
class MusdbDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that reads Common Voice style clips from one TAR archive.

    The archive is expected to contain the directory
    ``cv-corpus-6.1-2020-12-11/hi`` with one ``<split>.tsv`` metadata file per
    split and a ``clips`` sub-directory holding the audio files.
    """

    def _info(self):
        """Return the dataset description and the feature schema of one example."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "client_id": datasets.Value("string"),
                    "path": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=48_000),
                    "sentence": datasets.Value("string"),
                    "up_votes": datasets.Value("int64"),
                    "down_votes": datasets.Value("int64"),
                    "age": datasets.Value("string"),
                    "gender": datasets.Value("string"),
                    "accent": datasets.Value("string"),
                    "locale": datasets.Value("string"),
                    "segment": datasets.Value("string"),
                }
            ),
        )

    def _split_generators(self, dl_manager):
        """Download the archive and describe one generator per split.

        Args:
            dl_manager: the download manager supplied by ``datasets``.

        Returns:
            A list of ``datasets.SplitGenerator``, one each for train, test,
            validation (backed by ``dev.tsv``), other, validated and
            invalidated.
        """
        logging.info("XXXX Inside _split_generators")
        # The ".../file/d/<id>/view" address is the Google Drive *viewer* page
        # and returns HTML, which cannot be iterated as a TAR archive. Use the
        # direct-download endpoint for the same file id instead.
        # NOTE(review): the file must be shared as "anyone with the link";
        # an unauthenticated request cannot fetch a private file, and very
        # large files may additionally require a "&confirm=t" parameter.
        archive_path = dl_manager.download(
            "https://drive.google.com/uc?export=download&id=1Q5yqWPM-Gzb322bjV7zYFYEhUvMhrZXx"
        )

        # First we locate the data using the path within the archive:
        path_to_data = "/".join(["cv-corpus-6.1-2020-12-11", "hi"])
        path_to_clips = "/".join([path_to_data, "clips"])

        # (Optional) In non-streaming mode, we can extract the archive locally
        # to have actual local audio files:
        local_extracted_archive = dl_manager.extract(archive_path) if not dl_manager.is_streaming else None

        # (split name exposed to the user, stem of the TSV file inside the archive)
        split_to_tsv = [
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "dev"),
            ("other", "other"),
            ("validated", "validated"),
            ("invalidated", "invalidated"),
        ]

        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "local_extracted_archive": local_extracted_archive,
                    # Each split gets its own iterator: use iter_archive here
                    # to access the files in the TAR archive.
                    "archive_iterator": dl_manager.iter_archive(archive_path),
                    "metadata_filepath": "/".join([path_to_data, f"{tsv_stem}.tsv"]),
                    "path_to_clips": path_to_clips,
                },
            )
            for split_name, tsv_stem in split_to_tsv
        ]

    def _generate_examples(self, local_extracted_archive, archive_iterator, metadata_filepath, path_to_clips):
        """Yield ``(key, example)`` pairs for the clips listed in one metadata TSV.

        Args:
            local_extracted_archive: directory the archive was extracted to,
                or ``None`` (set that way in streaming mode by
                ``_split_generators``).
            archive_iterator: iterable of ``(path, file_object)`` pairs for
                the members of the archive.
            metadata_filepath: path, inside the archive, of the split's TSV.
            path_to_clips: path prefix, inside the archive, of the audio clips.

        Yields:
            Tuples of the audio file path (used as key) and the example dict.

        Raises:
            ValueError: if the TSV is empty, its header does not match the
                declared features, or an audio clip is met before the TSV.
        """
        data_fields = list(self._info().features.keys())
        # audio is not a header of the csv files
        data_fields.remove("audio")
        path_idx = data_fields.index("path")

        all_field_values = {}
        metadata_found = False
        logging.debug("archive_iterator: %s", archive_iterator)
        # Here we iterate over all the files within the TAR archive:
        for path, f in archive_iterator:
            logging.info('XXXX Inside archive_iterator and below is path and f')
            logging.info(path)
            logging.info(f)
            if path == metadata_filepath:
                # Parse the metadata TSV file
                metadata_found = True
                lines = f.readlines()
                if not lines:
                    raise ValueError(f"The metadata file {metadata_filepath} is empty")
                column_names = lines[0].decode("utf-8").strip().split("\t")
                if column_names != data_fields:
                    raise ValueError(
                        f"The file should have {data_fields} as column names, but has {column_names}"
                    )
                for line in lines[1:]:
                    field_values = line.decode("utf-8").strip().split("\t")
                    # set full path for mp3 audio file
                    audio_path = "/".join([path_to_clips, field_values[path_idx]])
                    all_field_values[audio_path] = field_values
            elif path.startswith(path_to_clips):
                # Else, read the audio file and yield an example
                if not metadata_found:
                    raise ValueError("Found audio clips before the metadata TSV file.")
                if not all_field_values:
                    # The TSV listed no clips, so nothing can match: stop early.
                    break
                if path not in all_field_values:
                    continue
                # retrieve the metadata corresponding to this audio file
                field_values = all_field_values[path]
                # if data is incomplete, fill with empty values
                if len(field_values) < len(data_fields):
                    field_values += (len(data_fields) - len(field_values)) * ["''"]
                result = dict(zip(data_fields, field_values))
                # set audio feature
                audio_file_path = os.path.join(local_extracted_archive, path) if local_extracted_archive else path
                result["audio"] = {"path": audio_file_path, "bytes": f.read()}
                # set path to None if the audio file doesn't exist locally (i.e. in streaming mode)
                result["path"] = audio_file_path if local_extracted_archive else None
                yield audio_file_path, result