I am trying to generate a large dataset for fine-tuning a Wav2Vec2 model.
It's important that the data does not get cached in memory, and I am still not sure whether that is actually the case with my current approach.
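For what it's worth, this is roughly how I have been checking it. A minimal sketch, assuming that a non-empty `cache_files` means the split is backed by memory-mapped Arrow files on disk rather than held in RAM (the script path is a placeholder):

```python
import datasets

# Placeholder path to the builder script shown further below.
ds = datasets.load_dataset("./new_dataset.py")

# Non-empty cache_files should mean the split is backed by
# on-disk (memory-mapped) Arrow files instead of living in RAM.
print(ds["train"].cache_files)
```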
In any case, I have managed to generate a small dataset myself (TIMIT in this case), but as soon as training starts I get the following exception:
File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/transformers/trainer.py", line 1290, in train
for step, inputs in enumerate(epoch_iterator):
File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 521, in __next__
data = self._next_data()
File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 561, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/datasets/arrow_dataset.py", line 1857, in __getitem__
return self._getitem(
File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/datasets/arrow_dataset.py", line 1849, in _getitem
pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/datasets/formatting/formatting.py", line 462, in query_table
_check_valid_index_key(key, size)
File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/datasets/formatting/formatting.py", line 405, in _check_valid_index_key
raise IndexError(f"Invalid key: {key} is out of bounds for size {size}")
IndexError: Invalid key: 16 is out of bounds for size 0
0%| | 0/700 [00:00<?, ?it/s]
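For context, the training side is a plain `Trainer` setup, roughly like the sketch below; the checkpoint, batch size, and other arguments are placeholders, not my exact configuration:

```python
import datasets
from transformers import Trainer, TrainingArguments, Wav2Vec2ForCTC

ds = datasets.load_dataset("./new_dataset.py")  # placeholder script path

# Placeholder checkpoint and arguments; the real setup follows the
# usual Wav2Vec2 fine-tuning recipe.
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base")

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="out", per_device_train_batch_size=16),
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
)
trainer.train()  # raises the IndexError shown above
```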
This is the implementation of my `GeneratorBasedBuilder`:
```python
import datasets

# _DESCRIPTION, _HOMEPAGE, _LICENSE, _CITATION and ConvertedCorpus
# are defined elsewhere in the script.


class NewDataset(datasets.GeneratorBasedBuilder):

    VERSION: datasets.Version = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            version=VERSION, description="This part of my dataset covers a first domain"
        ),
    ]

    def _info(self):
        features = datasets.Features(
            {
                "inputs": datasets.features.Sequence(datasets.Value("int16")),
                "targets": datasets.Value("string"),
                "length": datasets.Value("int64"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": "/mariana/asr/corpora/converted/en/timit_train",
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": "/mariana/asr/corpora/converted/en/timit_test",
                    "split": "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": "/mariana/asr/corpora/converted/en/timit_dev",
                    "split": "dev",
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        corpus = ConvertedCorpus(filepath)
        for i, record in enumerate(corpus.sample_generator()):
            key = "/".join((str(record.speaker_id), str(record.sample_id)))
            yield key, dict(inputs=record.wav, targets=record.transcript, length=len(record.wav))
            # Deliberately cap each split at 101 examples for this small test
            # (hence num_examples: 101 in dataset_info.json below).
            if i >= 100:
                break
```
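As a sanity check, loading the dataset directly and indexing into it (outside of the `Trainer`) seems worth verifying. A minimal sketch, again with a placeholder script path:

```python
import datasets

ds = datasets.load_dataset("./new_dataset.py")

print(ds)               # each split should report num_rows: 101
print(ds["train"][16])  # the key from the IndexError; should work here
```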
This is the content of the cache directory:
$ ls -lah
total 30M
drwxrwxr-x 2 sfalk sfalk 4.0K Feb 1 13:28 .
drwxrwxr-x 3 sfalk sfalk 4.0K Feb 1 13:28 ..
-rw-rw-r-- 1 sfalk sfalk 1.2K Feb 1 13:28 dataset_info.json
-rw-rw-r-- 1 sfalk sfalk 0 Feb 1 13:28 LICENSE
-rw-rw-r-- 1 sfalk sfalk 11M Feb 1 13:28 new_dataset-test.arrow
-rw-rw-r-- 1 sfalk sfalk 10M Feb 1 13:28 new_dataset-train.arrow
-rw-rw-r-- 1 sfalk sfalk 9.3M Feb 1 13:28 new_dataset-validation.arrow
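The Arrow files can also be opened directly to verify their contents. A sketch assuming `datasets.Dataset.from_file`, which memory-maps the file:

```python
from datasets import Dataset

# Memory-maps the Arrow file from the cache directory listed above.
train = Dataset.from_file("new_dataset-train.arrow")

print(train.num_rows)      # should be 101
print(train.column_names)  # should be ['inputs', 'targets', 'length']
```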
And this is the content of `dataset_info.json`:
{
"description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.\n",
"citation": "@InProceedings{huggingface:dataset,\ntitle = {A great new dataset},\nauthor={huggingface, Inc.\n},\nyear={2020}\n}\n",
"homepage": "",
"license": "",
"features": {
"inputs": {
"feature": {
"dtype": "int16",
"id": null,
"_type": "Value"
},
"length": -1,
"id": null,
"_type": "Sequence"
},
"targets": {
"dtype": "string",
"id": null,
"_type": "Value"
},
"length": {
"dtype": "int64",
"id": null,
"_type": "Value"
}
},
"post_processed": null,
"supervised_keys": null,
"task_templates": null,
"builder_name": "new_dataset",
"config_name": "default",
"version": {
"version_str": "0.0.1",
"description": null,
"major": 0,
"minor": 0,
"patch": 1
},
"splits": {
"train": {
"name": "train",
"num_bytes": 10383006,
"num_examples": 101,
"dataset_name": "new_dataset"
},
"test": {
"name": "test",
"num_bytes": 10771888,
"num_examples": 101,
"dataset_name": "new_dataset"
},
"validation": {
"name": "validation",
"num_bytes": 9742303,
"num_examples": 101,
"dataset_name": "new_dataset"
}
},
"download_checksums": {},
"download_size": 0,
"post_processing_size": null,
"dataset_size": 30897197,
"size_in_bytes": 30897197
}
One additional interesting observation: there is a `replays` field on the `MemoryMappedTable` object backing the dataset, and judging from it, it looks like all feature columns have been dropped?
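This is roughly how I am inspecting it (note that `.data` and `.replays` are internal attributes of `datasets`, so the exact names may differ between versions):

```python
# trainer.train_dataset is the datasets.Dataset handed to the Trainer.
table = trainer.train_dataset.data

print(type(table))    # datasets.table.MemoryMappedTable
print(table.replays)  # recorded table operations, e.g. column drops
print(trainer.train_dataset.column_names)
```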