Running out of disk space

I'm trying to use this tutorial by @patrickvonplaten to pre-train Wav2Vec2 on a custom dataset. I'm running on an AWS SageMaker ml.p3.16xlarge instance with the Hugging Face base image 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.9.1-transformers4.12.3-gpu-py38-cu111-ubuntu20.04, and my disk space is set to 1024 GB.
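For reference, a quick way to confirm how much space the training container actually sees is `shutil.disk_usage`; the `/opt/ml` path below is an assumption based on where SageMaker normally mounts training data, so adjust it for your setup:

import shutil

# /opt/ml is an assumed mount point; point this at wherever your training job writes
total, used, free = shutil.disk_usage("/opt/ml")
print(f"total={total / 1e9:.0f} GB, used={used / 1e9:.0f} GB, free={free / 1e9:.0f} GB")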

I'm using the following script to create the dataset on disk after AWS downloads it.

import os
from pathlib import Path

import datasets
import pandas as pd
from datasets import Audio, Dataset


def create_dataset(training_folder: str = os.environ.get("SM_CHANNEL_TRAIN")):
    # Collect every audio file path under <training_folder>/<voice>/<chunk>/
    audio_paths = []
    for voice_path in Path(training_folder).iterdir():
        for chunks_path in voice_path.iterdir():
            for file_path in chunks_path.iterdir():
                audio_paths.append({'audio': str(file_path)})
    path_df = pd.DataFrame.from_dict(audio_paths)
    my_audio_dataset = Dataset.from_pandas(
        df=path_df,
        split=datasets.NamedSplit(name='train')
    )
    # Cast the path column to an Audio feature so files are decoded when accessed
    my_audio_dataset = my_audio_dataset.cast_column("audio", Audio())
    my_audio_dataset.save_to_disk(str(Path(SM_MODULE_NAME).parent.joinpath('<dataset_name>')))

Then "<dataset_name>" goes into the "dataset_name" parameter. It works up to the preprocessing part, which I've had to edit as shown below.

    datasets_splits = []
    print('Creating dataset')
    create_dataset(os.environ.get("SM_CHANNEL_TRAIN"))
    print("loading datasets")
    dataset_split = load_from_disk(str(Path(SM_MODULE_NAME).parent.joinpath('VoicesOfColor')))
    datasets_splits.append(dataset_split)
    # for dataset_config_name, train_split_name in zip(args.dataset_config_names, args.dataset_split_names):
    #     # load dataset
    #     dataset_split = load_dataset(
    #         args.dataset_name, dataset_config_name, split=train_split_name, cache_dir=args.cache_dir,
    #     )
    #     datasets_splits.append(dataset_split)

    # Next, we concatenate all configurations and splits into a single training dataset
    raw_datasets = DatasetDict()
    if len(datasets_splits) > 1:
        raw_datasets["train"] = concatenate_datasets(datasets_splits).shuffle(seed=args.seed)
    else:
        raw_datasets["train"] = datasets_splits[0]

    # Take ``args.validation_split_percentage`` from the training dataset for the validation set
    num_validation_samples = raw_datasets["train"].num_rows * args.validation_split_percentage // 100

    if num_validation_samples == 0:
        raise ValueError(
            "`args.validation_split_percentage` is less than a single sample "
            f"for {len(raw_datasets['train'])} training samples. Increase "
            "`args.num_validation_split_percentage`. "
        )

    raw_datasets["validation"] = raw_datasets["train"].select(range(num_validation_samples))
    raw_datasets["train"] = raw_datasets["train"].select(range(num_validation_samples, raw_datasets["train"].num_rows))

    # 2. Now we preprocess the datasets including loading the audio, resampling and normalization
    # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
    # so that we just need to set the correct target sampling rate and normalize the input
    # via the `feature_extractor`
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(args.model_name_or_path)

    # make sure that dataset decodes audio with correct sampling rate
    # raw_datasets = raw_datasets.cast_column(
    #     args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
    # )

    # only normalized-inputs-training is supported
    if not feature_extractor.do_normalize:
        raise ValueError(
            "Training is only supported for normalized inputs. " "Make sure ``feature_extractor.do_normalize == True``"
        )

    # set max & min audio length in number of samples
    max_length = int(args.max_duration_in_seconds * feature_extractor.sampling_rate)
    min_length = int(args.min_duration_in_seconds * feature_extractor.sampling_rate)

    def prepare_dataset(batch):
        sample = batch[args.audio_column_name]

        inputs = feature_extractor(
            sample["array"], sampling_rate=sample["sampling_rate"], max_length=max_length, truncation=True
        )
        batch["input_values"] = inputs.input_values[0]
        batch["input_length"] = len(inputs.input_values[0])

        return batch

    # reuse memory-mapped cache files if cache file names are provided
    cache_file_names = None
    if args.train_cache_file_name is not None:
        cache_file_names = {"train": args.train_cache_file_name, "validation": args.validation_cache_file_name}

    # load audio files into numpy arrays
    # with accelerator.main_process_first():
    vectorized_datasets = raw_datasets.map(
        prepare_dataset,
        num_proc=args.preprocessing_num_workers,
        remove_columns=raw_datasets["train"].column_names,
        cache_file_names=cache_file_names,
    )

I get the following error:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 2390, in _map_single
    writer.write(example)
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py", line 382, in write
    self.write_examples_on_file()
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py", line 340, in write_examples_on_file
    self.write_table(table)
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py", line 447, in write_table
    self.pa_writer.write_batch(batch)
  File "pyarrow/ipc.pxi", line 384, in pyarrow.lib._CRecordBatchWriter.write_batch
  File "pyarrow/error.pxi", line 112, in pyarrow.lib.check_status
OSError: [Errno 28] Error writing bytes to file. Detail: [errno 28] No space left on device
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/multiprocess/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 521, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 488, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/datasets/fingerprint.py", line 406, in wrapper
    out = func(self, *args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 2426, in _map_single
    writer.finalize()
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py", line 456, in finalize
    self.write_examples_on_file()
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py", line 340, in write_examples_on_file
    self.write_table(table)
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py", line 447, in write_table
    self.pa_writer.write_batch(batch)
  File "pyarrow/ipc.pxi", line 384, in pyarrow.lib._CRecordBatchWriter.write_batch
  File "pyarrow/error.pxi", line 112, in pyarrow.lib.check_status
OSError: [Errno 28] Error writing bytes to file. Detail: [errno 28] No space left on device

The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "pretrain.py", line 775, in <module>
    main()
  File "pretrain.py", line 511, in main
    vectorized_datasets = raw_datasets.map(
  File "/opt/conda/lib/python3.8/site-packages/datasets/dataset_dict.py", line 484, in map
    {
  File "/opt/conda/lib/python3.8/site-packages/datasets/dataset_dict.py", line 485, in <dictcomp>
    k: dataset.map(
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 2140, in map
    transformed_shards[index] = async_result.get()
  File "/opt/conda/lib/python3.8/site-packages/multiprocess/pool.py", line 771, in get
    raise self._value
OSError: [Errno 28] Error writing bytes to file. Detail: [errno 28] No space left on device

My question is: how do I pre-process the data as a stream, or in some other way that doesn't fill up the disk?

Hi!

If you don't have enough disk space, I'd suggest doing your processing on the fly instead of writing the processed dataset to disk. To do so, you just need to pass your processing function as a formatting transform that will be applied on the fly.

This way your processing is done one example at a time, when you access the examples. In this case the processing happens in memory and doesn't fill up your disk.
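A minimal sketch of that approach, assuming the formatting transform is attached with `Dataset.set_transform`, reusing the `feature_extractor`, `max_length`, and `args.audio_column_name` from the script above, and assuming the audio is already at the feature extractor's sampling rate (or that the commented-out `cast_column` call is re-enabled):

def prepare_dataset_on_the_fly(batch):
    # `batch` is a dict of lists; the Audio feature decodes the files only when accessed
    samples = batch[args.audio_column_name]
    inputs = feature_extractor(
        [sample["array"] for sample in samples],
        sampling_rate=feature_extractor.sampling_rate,
        max_length=max_length,
        truncation=True,
    )
    # return only the model inputs, mirroring `remove_columns` in the `.map()` version
    return {
        "input_values": inputs.input_values,
        "input_length": [len(x) for x in inputs.input_values],
    }

# instead of `raw_datasets.map(...)`:
vectorized_datasets = raw_datasets
vectorized_datasets["train"].set_transform(prepare_dataset_on_the_fly)
vectorized_datasets["validation"].set_transform(prepare_dataset_on_the_fly)

Nothing is written to the Arrow cache this way; each batch is featurized in memory at the moment the data loader requests it.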