Hello,
I am trying to create a custom dataset from musdb18, a dataset for music source separation. It consists of .stem.mp4 files, each of which contains five waveforms: the mixture plus the drums, bass, other, and vocals stems.
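For reference, this is roughly how a single stem file is read with stempeg (a minimal sketch; the file name is a placeholder, and the shapes are as I understand them from the stempeg docs):

import stempeg
import numpy as np

# Read all five stems from one musdb18 track (placeholder file name).
# S has shape (stems, samples, channels) == (5, n_samples, 2) and
# rate is the sample rate (44100 Hz for musdb18).
S, rate = stempeg.read_stems(
    "track.stem.mp4", dtype=np.float32, multiprocess=False)
print(S.shape, rate)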
Here is the script:
import datasets
from pathlib import Path
import stempeg
import numpy as np

_DESCRIPTION = "musdb dataset"


class MusdbDataset(datasets.GeneratorBasedBuilder):
    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "name": datasets.Value("string"),
                "bass": datasets.Audio(sampling_rate=44100, mono=False),
                "drums": datasets.Audio(sampling_rate=44100, mono=False),
                "other": datasets.Audio(sampling_rate=44100, mono=False),
                "vocals": datasets.Audio(sampling_rate=44100, mono=False),
                "mixture": datasets.Audio(sampling_rate=44100, mono=False),
            })
        )

    def _split_generators(self, dl_manager):
        archive_path = dl_manager.download_and_extract(
            "https://zenodo.org/record/1117372/files/musdb18.zip?download=1")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"audio_path": f"{archive_path}/train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"audio_path": f"{archive_path}/test"},
            ),
        ]

    def _generate_examples(self, audio_path):
        id_ = 0
        for stems_path in Path(audio_path).iterdir():
            song_name = stems_path.stem
            # S has shape (stems, samples, channels); sr is the sample rate.
            S, sr = stempeg.read_stems(
                str(stems_path), dtype=np.float32, multiprocess=False)
            # Stem order in musdb18: mixture, drums, bass, other, vocals.
            names = ["mixture", "drums", "bass", "other", "vocals"]
            stem_dict = {
                name: {"path": f"{song_name}/{name}",
                       "array": S[i],
                       "sampling_rate": sr}
                for i, name in enumerate(names)
            }
            yield id_, {"name": song_name, **stem_dict}
            id_ += 1
When I try to load it with

datasets.load_dataset("musdb_dataset.py")

I get a memory error:
Generating train split: 100 examples [07:54, 5.04s/ examples]
Traceback (most recent call last):
File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 1598, in _prepare_split_single
num_examples, num_bytes = writer.finalize()
File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\arrow_writer.py", line 581, in finalize
self.write_examples_on_file()
File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\arrow_writer.py", line 446, in write_examples_on_file
self.write_batch(batch_examples=batch_examples)
File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\arrow_writer.py", line 551, in write_batch
arrays.append(pa.array(typed_sequence))
File "pyarrow\array.pxi", line 236, in pyarrow.lib.array
File "pyarrow\array.pxi", line 110, in pyarrow.lib._handle_arrow_array_protocol
File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\arrow_writer.py", line 189, in __arrow_array__
out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
File "pyarrow\array.pxi", line 320, in pyarrow.lib.array
File "pyarrow\array.pxi", line 39, in pyarrow.lib._sequence_to_array
File "pyarrow\error.pxi", line 144, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow\error.pxi", line 117, in pyarrow.lib.check_status
pyarrow.lib.ArrowMemoryError: realloc of size 1038288896 failed
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 811, in incomplete_dir
yield tmp_dir
File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 860, in download_and_prepare
self._download_and_prepare(
File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 1612, in _download_and_prepare
super()._download_and_prepare(
File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 953, in _download_and_prepare
self._prepare_split(split_generator, **prepare_split_kwargs)
File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 1450, in _prepare_split
for job_id, done, content in self._prepare_split_single(
File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 1607, in _prepare_split_single
raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.builder.DatasetGenerationError: An error occurred while generating the dataset
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 818, in incomplete_dir
shutil.rmtree(tmp_dir)
File "C:\Users\sebas\mambaforge\envs\data-science\lib\shutil.py", line 749, in rmtree
return _rmtree_unsafe(path, onerror)
File "C:\Users\sebas\mambaforge\envs\data-science\lib\shutil.py", line 619, in _rmtree_unsafe
onerror(os.unlink, fullname, sys.exc_info())
File "C:\Users\sebas\mambaforge\envs\data-science\lib\shutil.py", line 617, in _rmtree_unsafe
os.unlink(fullname)
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'D:/hf_datasets/musdb_dataset/default/0.0.0/45c595a39c8bcb6bc807e82ab4bc25aec2848bc27dc351fdb4a1a6ed342500a2.incomplete\\musdb_dataset-train-00000-00000-of-NNNNN.arrow'
Watching Task Manager, I can clearly see memory consumption slowly climbing to about 90% and then starting to oscillate. At some point (it's not always the same sample) I simply run out of memory.
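A back-of-envelope calculation (a sketch assuming a roughly 4-minute stereo track; actual durations vary per song) suggests every example is already very large:

# Rough per-example size for one song, assuming ~4 minutes of stereo
# audio at 44100 Hz stored as float32 (4 bytes per sample).
samples = 4 * 60 * 44100
bytes_per_stem = samples * 2 * 4        # 2 channels * 4 bytes
bytes_per_example = 5 * bytes_per_stem  # 5 stems incl. the mixture
print(bytes_per_example / 2**20)        # ~400 MiB per example

So buffering even a handful of examples in memory before they are written out would add up quickly.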
Is this an issue with pyarrow, huggingface datasets, or the stempeg library I use to load the files? Or am I doing something very stupid?
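One thing I am considering (a sketch, assuming I read the datasets source correctly: the ArrowWriter appears to buffer examples in memory and only flush them to disk every writer_batch_size examples, which GeneratorBasedBuilder exposes as the DEFAULT_WRITER_BATCH_SIZE class attribute) is to make the writer flush after every example:

class MusdbDataset(datasets.GeneratorBasedBuilder):
    # Assumption: with a batch size of 1 the writer flushes each song's
    # waveforms to disk immediately instead of buffering many of them.
    DEFAULT_WRITER_BATCH_SIZE = 1

    ...  # _info, _split_generators, _generate_examples as above

Would that be the right knob here, or is the problem elsewhere?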
Also, memory consumption stays very high after the exception, so it is not being freed.
I work on Windows with 16 GB of RAM.

datasets==2.9.0
stempeg==0.2.3