Memory error while loading custom dataset

Hello,

I am trying to create a custom dataset from musdb18, a dataset for music source separation. It consists of .mp4 stem files, each of which contains 5 waveforms: the mixture, vocals, bass, drums, and other.

Here is the script:

import datasets
from pathlib import Path
import stempeg
import numpy as np

_DESCRIPTION = "musdb dataset"

class MusdbDataset(datasets.GeneratorBasedBuilder):
    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "name": datasets.Value("string"),
                "bass": datasets.Audio(sampling_rate=44100, mono=False),
                "drums": datasets.Audio(sampling_rate=44100, mono=False),
                "other": datasets.Audio(sampling_rate=44100, mono=False),
                "vocals": datasets.Audio(sampling_rate=44100, mono=False),
                "mixture": datasets.Audio(sampling_rate=44100, mono=False),

            })
        )

    def _split_generators(self, dl_manager):
        archive_path = dl_manager.download_and_extract(
            "https://zenodo.org/record/1117372/files/musdb18.zip?download=1")

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "audio_path": f"{archive_path}/train"}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "audio_path": f"{archive_path}/test"
                }
            )
        ]

    def _generate_examples(self, audio_path):
        id_ = 0
        for stems_path in Path(audio_path).iterdir():
            song_name = stems_path.stem
            # stempeg returns S with shape (stems, samples, channels)
            S, sr = stempeg.read_stems(
                str(stems_path), dtype=np.float32, multiprocess=False)

            names = ["mixture", "drums", "bass", "other", "vocals"]

            stem_dict = {
                name: {"path": f"{song_name}/{name}", "array": S[i], "sampling_rate": sr}
                for i, name in enumerate(names)
            }
            yield id_, {
                "name": song_name,
                **stem_dict
            }

            id_ += 1

When I try to run
datasets.load_dataset("musdb_dataset.py")

I get a memory error:

Generating train split: 100 examples [07:54,  5.04s/ examples]
Traceback (most recent call last):
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 1598, in _prepare_split_single
    num_examples, num_bytes = writer.finalize()
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\arrow_writer.py", line 581, in finalize
    self.write_examples_on_file()
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\arrow_writer.py", line 446, in write_examples_on_file
    self.write_batch(batch_examples=batch_examples)
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\arrow_writer.py", line 551, in write_batch
    arrays.append(pa.array(typed_sequence))
  File "pyarrow\array.pxi", line 236, in pyarrow.lib.array
  File "pyarrow\array.pxi", line 110, in pyarrow.lib._handle_arrow_array_protocol
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\arrow_writer.py", line 189, in __arrow_array__
    out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))   
  File "pyarrow\array.pxi", line 320, in pyarrow.lib.array
  File "pyarrow\array.pxi", line 39, in pyarrow.lib._sequence_to_array     
  File "pyarrow\error.pxi", line 144, in pyarrow.lib.pyarrow_internal_check_status
  File "pyarrow\error.pxi", line 117, in pyarrow.lib.check_status
pyarrow.lib.ArrowMemoryError: realloc of size 1038288896 failed

The above exception was the direct cause of the following exception:       

Traceback (most recent call last):
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 811, in incomplete_dir
    yield tmp_dir
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 860, in download_and_prepare
    self._download_and_prepare(
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 1612, in _download_and_prepare
    super()._download_and_prepare(
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 953, in _download_and_prepare
    self._prepare_split(split_generator, **prepare_split_kwargs)
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 1450, in _prepare_split
    for job_id, done, content in self._prepare_split_single(
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 1607, in _prepare_split_single
    raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.builder.DatasetGenerationError: An error occurred while generating the dataset

During handling of the above exception, another exception occurred:        

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\site-packages\datasets\builder.py", line 818, in incomplete_dir
    shutil.rmtree(tmp_dir)
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\shutil.py", line 749, in rmtree
    return _rmtree_unsafe(path, onerror)
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\shutil.py", line 619, in _rmtree_unsafe
    onerror(os.unlink, fullname, sys.exc_info())
  File "C:\Users\sebas\mambaforge\envs\data-science\lib\shutil.py", line 617, in _rmtree_unsafe
    os.unlink(fullname)
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'D:/hf_datasets/musdb_dataset/default/0.0.0/45c595a39c8bcb6bc807e82ab4bc25aec2848bc27dc351fdb4a1a6ed342500a2.incomplete\\musdb_dataset-train-00000-00000-of-NNNNN.arrow'

While watching Task Manager I can clearly see that memory consumption slowly increases up to 90% and then starts to oscillate. At some point (it's not always the same sample) I just run out of memory.

Is this an issue with pyarrow, Hugging Face datasets, or maybe the stempeg library that I use to load the files? Or am I doing something very stupid?

Also, memory consumption stays very high after the exception, so it is not being freed.

I work on Windows with 16 GB of RAM.

datasets==2.9.0
stempeg==0.2.3

hi @sebchw! I’m not sure what’s causing the error and memory overload (do you have any ideas, @lhoestq?), but note that when you provide arrays in an Audio feature, what it does under the hood is encode the arrays to bytes and store the audio as bytes. Then, after you load the dataset and access samples, the audio is decoded on the fly with the datasets library’s standard decoding. We should clarify this in the docs, I think.
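
Roughly what happens when you pass an array (a simplified sketch, not the exact internals):

    import datasets
    import numpy as np

    audio_feature = datasets.Audio(sampling_rate=44100)
    # the array is serialized (to WAV bytes here) before being written to Arrow,
    # so the full decoded waveform ends up stored in the dataset file
    encoded = audio_feature.encode_example(
        {"array": np.zeros(44100, dtype=np.float32), "sampling_rate": 44100}
    )
    # encoded is a dict like {"bytes": b"RIFF...", "path": None}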

So if you want to apply your custom decoding with stempeg, you can set decode=False on the Audio features (in _info) and provide only paths to local audio files in _generate_examples, something like:

    def _generate_examples(self, audio_path):
        id_ = 0
        names = ["mixture", "drums", "bass", "other", "vocals"]

        for stems_path in Path(audio_path).iterdir():
            yield id_, {
                "name": stems_path.stem,
                # all five stems live in the same file, so every column gets that path
                **{name: {"path": str(stems_path)} for name in names}
            }
            id_ += 1

and then use your custom decoding function on the loaded dataset.
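
For example, the features in _info could then look something like this (a sketch reusing the stem names and sampling rate from your script):

    stem_names = ["mixture", "drums", "bass", "other", "vocals"]
    features = datasets.Features({
        "name": datasets.Value("string"),
        # decode=False: the stored path/bytes are returned as-is instead of
        # being decoded by the library's built-in audio decoding
        **{
            name: datasets.Audio(sampling_rate=44100, mono=False, decode=False)
            for name in stem_names
        },
    })

and on the loaded dataset you would run stempeg yourself, e.g.:

    ds = datasets.load_dataset("musdb_dataset.py", split="train")
    S, sr = stempeg.read_stems(
        ds[0]["mixture"]["path"], dtype=np.float32, multiprocess=False)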

@polinaeterna Thanks for your response.

After debugging ArrowWriter, I hacked around this by setting the max batch size to something smaller than 1000:

datasets.config.DEFAULT_MAX_BATCH_SIZE = 10
dataset = datasets.load_dataset("musdb_dataset.py")

And now it works.

The problem is that in the code above I load 5 audio files, each about 3 minutes long at 44.1 kHz, which is roughly 150 MB in total, and make one sample out of them. I was going to cut them into smaller pieces later and just wanted to run this code as a sanity check.

And during debugging I looked at the ArrowWriter.write method:

    if self._check_duplicates:
        # Create unique hash from key and store as (key, example) pairs
        hash = self._hasher.hash(key)
        self.current_examples.append((example, hash))
        # Maintain record of keys and their respective hashes for checking duplicates
        self.hkey_record.append((hash, key))
    else:
        # Store example as a tuple so as to keep the structure of `self.current_examples` uniform
        self.current_examples.append((example, ""))

    if writer_batch_size is None:
        writer_batch_size = self.writer_batch_size
    if writer_batch_size is not None and len(self.current_examples) >= writer_batch_size:
        if self._check_duplicates:
            self.check_duplicate_keys()
            # Re-intializing to empty list for next batch
            self.hkey_record = []

        self.write_examples_on_file()

It was appending examples to self.current_examples and slowly filling the entire RAM. self.writer_batch_size is read from datasets.config.DEFAULT_MAX_BATCH_SIZE, and after setting it to a smaller value the Arrow file is written out before RAM gets bloated.
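
For context, a rough back-of-the-envelope under the assumption of stereo float32 at 44.1 kHz:

    # approximate size of one decoded example (5 stems, ~3 min, stereo, float32)
    seconds, rate, channels, bytes_per_sample, stems = 180, 44100, 2, 4, 5
    per_example = seconds * rate * channels * bytes_per_sample * stems
    print(per_example / 2**20)  # ~300 MiB
    # with the default batch size of 1000, the writer can buffer up to 1000 such
    # examples before flushing -- far more than the 16 GB of RAM available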

@polinaeterna
However, the solution you provided looks very promising. If I do it that way, will it write the stem files as bytes to the Arrow file, or will they be looked up on disk while decoding?

Btw, is there a way to set

datasets.config.DEFAULT_MAX_BATCH_SIZE = 10

from within the load_dataset call? I can’t see an option for it.


Hi! You can set DEFAULT_WRITER_BATCH_SIZE = 10 as a class attribute on MusdbDataset; this way it will use 10 by default for your dataset.

It’s also possible to pass writer_batch_size to load_dataset().
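
For example (a sketch based on the builder from the first post):

    # option 1: class attribute, used by default for this dataset
    class MusdbDataset(datasets.GeneratorBasedBuilder):
        DEFAULT_WRITER_BATCH_SIZE = 10  # flush examples to the Arrow file every 10 examples
        ...

    # option 2: override it at load time
    dataset = datasets.load_dataset("musdb_dataset.py", writer_batch_size=10)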

If you do it like in the snippet I provided above, yielding {"path": stem_path} in the audio columns, they will be stored as paths and looked up on disk. But you can write bytes instead by yielding {"path": None, "bytes": stem_path_file.read()}. Just make sure not to provide real full paths in "path" in that case (you can set it to a relative audio path/filename, to preserve the file extension) to ensure that bytes are written instead of paths.
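
A sketch of the bytes variant, adapted from your _generate_examples (only the filename goes into "path", so the raw bytes are what gets written):

    def _generate_examples(self, audio_path):
        names = ["mixture", "drums", "bass", "other", "vocals"]
        for id_, stems_path in enumerate(Path(audio_path).iterdir()):
            # read the stem file once; all five columns reference the same file,
            # so its bytes are written into each of them
            audio = {"path": stems_path.name, "bytes": stems_path.read_bytes()}
            yield id_, {"name": stems_path.stem, **{name: audio for name in names}}

Note that this stores the same file’s bytes in all five columns, so it multiplies the on-disk size; with custom decoding, a single audio column per track would also be enough.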
