I'm trying to use this tutorial by @patrickvonplaten to pre-train Wav2Vec2 on a custom dataset. I'm running on an AWS SageMaker instance ('ml.p3.16xlarge') with the Hugging Face base image 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.9.1-transformers4.12.3-gpu-py38-cu111-ubuntu20.04, and my disk space is set to 1024 GB.
I'm using the following script to create the dataset on disk after AWS downloads it.
import os
from pathlib import Path

import pandas as pd
import datasets
from datasets import Audio, Dataset


def create_dataset(training_folder: str = os.environ.get("SM_CHANNEL_TRAIN")):
    # Collect every audio file path: <train channel>/<voice>/<chunks>/<file>
    audio_paths = []
    for voice_path in Path(training_folder).iterdir():
        for chunks_path in voice_path.iterdir():
            for file_path in chunks_path.iterdir():
                audio_paths.append({'audio': str(file_path)})
    path_df = pd.DataFrame.from_dict(audio_paths)
    my_audio_dataset = Dataset.from_pandas(
        df=path_df,
        split=datasets.NamedSplit(name='train')
    )
    # Cast the path column to Audio() so files are decoded on access
    my_audio_dataset = my_audio_dataset.cast_column("audio", Audio())
    my_audio_dataset.save_to_disk(str(Path(SM_MODULE_NAME).parent.joinpath('<dataset_name>')))
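As a side note, Audio() only stores paths until a row is actually accessed, so the saved dataset itself is small on disk; a quick check like this sketch (using the same path as above) confirms it loads and decodes:

from datasets import load_from_disk

# Sanity check: load_from_disk is cheap because decoding is lazy;
# a file is only decoded when its "audio" column is accessed.
ds = load_from_disk(str(Path(SM_MODULE_NAME).parent.joinpath('<dataset_name>')))
print(ds)                               # row count and columns
first = ds[0]["audio"]                  # decodes a single file
print(first["sampling_rate"], len(first["array"]))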
Then '<dataset_name>' goes into the `dataset_name` parameter. It works up to the preprocessing part, which I've had to edit as shown below.
datasets_splits = []
print('Creating dataset')
create_dataset(os.environ.get("SM_CHANNEL_TRAIN"))
print('Loading datasets')
dataset_split = load_from_disk(str(Path(SM_MODULE_NAME).parent.joinpath('VoicesOfColor')))
datasets_splits.append(dataset_split)
# for dataset_config_name, train_split_name in zip(args.dataset_config_names, args.dataset_split_names):
#     # load dataset
#     dataset_split = load_dataset(
#         args.dataset_name, dataset_config_name, split=train_split_name, cache_dir=args.cache_dir,
#     )
#     datasets_splits.append(dataset_split)

# Next, we concatenate all configurations and splits into a single training dataset
raw_datasets = DatasetDict()
if len(datasets_splits) > 1:
    raw_datasets["train"] = concatenate_datasets(datasets_splits).shuffle(seed=args.seed)
else:
    raw_datasets["train"] = datasets_splits[0]

# Take `args.validation_split_percentage` from the training dataset for the validation set
num_validation_samples = raw_datasets["train"].num_rows * args.validation_split_percentage // 100
if num_validation_samples == 0:
    raise ValueError(
        "`args.validation_split_percentage` is less than a single sample "
        f"for {len(raw_datasets['train'])} training samples. Increase "
        "`args.validation_split_percentage`. "
    )
raw_datasets["validation"] = raw_datasets["train"].select(range(num_validation_samples))
raw_datasets["train"] = raw_datasets["train"].select(range(num_validation_samples, raw_datasets["train"].num_rows))

# 2. Now we preprocess the datasets, including loading the audio, resampling and normalization.
# Thankfully, `datasets` takes care of automatically loading and resampling the audio,
# so we just need to set the correct target sampling rate and normalize the input
# via the `feature_extractor`.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(args.model_name_or_path)

# make sure the dataset decodes audio with the correct sampling rate
# raw_datasets = raw_datasets.cast_column(
#     args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
# )

# only normalized-inputs training is supported
if not feature_extractor.do_normalize:
    raise ValueError(
        "Training is only supported for normalized inputs. Make sure `feature_extractor.do_normalize == True`."
    )

# set max & min audio length in number of samples
max_length = int(args.max_duration_in_seconds * feature_extractor.sampling_rate)
min_length = int(args.min_duration_in_seconds * feature_extractor.sampling_rate)

def prepare_dataset(batch):
    sample = batch[args.audio_column_name]
    inputs = feature_extractor(
        sample["array"], sampling_rate=sample["sampling_rate"], max_length=max_length, truncation=True
    )
    batch["input_values"] = inputs.input_values[0]
    batch["input_length"] = len(inputs.input_values[0])
    return batch

# use explicit cache file paths if given
cache_file_names = None
if args.train_cache_file_name is not None:
    cache_file_names = {"train": args.train_cache_file_name, "validation": args.validation_cache_file_name}

# load audio files into numpy arrays
# with accelerator.main_process_first():
vectorized_datasets = raw_datasets.map(
    prepare_dataset,
    num_proc=args.preprocessing_num_workers,
    remove_columns=raw_datasets["train"].column_names,
    cache_file_names=cache_file_names,
)
I get the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 2390, in _map_single
    writer.write(example)
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py", line 382, in write
    self.write_examples_on_file()
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py", line 340, in write_examples_on_file
    self.write_table(table)
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py", line 447, in write_table
    self.pa_writer.write_batch(batch)
  File "pyarrow/ipc.pxi", line 384, in pyarrow.lib._CRecordBatchWriter.write_batch
  File "pyarrow/error.pxi", line 112, in pyarrow.lib.check_status
OSError: [Errno 28] Error writing bytes to file. Detail: [errno 28] No space left on device

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/multiprocess/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 521, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 488, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/datasets/fingerprint.py", line 406, in wrapper
    out = func(self, *args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 2426, in _map_single
    writer.finalize()
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py", line 456, in finalize
    self.write_examples_on_file()
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py", line 340, in write_examples_on_file
    self.write_table(table)
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py", line 447, in write_table
    self.pa_writer.write_batch(batch)
  File "pyarrow/ipc.pxi", line 384, in pyarrow.lib._CRecordBatchWriter.write_batch
  File "pyarrow/error.pxi", line 112, in pyarrow.lib.check_status
OSError: [Errno 28] Error writing bytes to file. Detail: [errno 28] No space left on device

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "pretrain.py", line 775, in <module>
    main()
  File "pretrain.py", line 511, in main
    vectorized_datasets = raw_datasets.map(
  File "/opt/conda/lib/python3.8/site-packages/datasets/dataset_dict.py", line 484, in map
    {
  File "/opt/conda/lib/python3.8/site-packages/datasets/dataset_dict.py", line 485, in <dictcomp>
    k: dataset.map(
  File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 2140, in map
    transformed_shards[index] = async_result.get()
  File "/opt/conda/lib/python3.8/site-packages/multiprocess/pool.py", line 771, in get
    raise self._value
OSError: [Errno 28] Error writing bytes to file. Detail: [errno 28] No space left on device
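Reading the traceback, it's the Arrow writer running out of space while .map() materializes the decoded audio as arrays. One workaround I'm considering is pointing the cache files at the large training volume and flushing smaller batches. A rough sketch (untested; the /opt/ml mount point and the paths are my assumptions):

import os

# Sketch (untested): write the .map() Arrow cache to the big EBS volume
# (I believe SageMaker mounts it under /opt/ml, but that's an assumption)
# instead of the default ~/.cache/huggingface/datasets location.
cache_dir = "/opt/ml/data_cache"  # hypothetical directory on the large volume
os.makedirs(cache_dir, exist_ok=True)

vectorized_datasets = raw_datasets.map(
    prepare_dataset,
    num_proc=args.preprocessing_num_workers,
    remove_columns=raw_datasets["train"].column_names,
    cache_file_names={
        "train": os.path.join(cache_dir, "train.arrow"),
        "validation": os.path.join(cache_dir, "validation.arrow"),
    },
    writer_batch_size=100,  # flush rows to disk in smaller chunks (default 1000)
)

That only helps if the default cache location is the real bottleneck, though; the total size written stays the same.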
My question is: how do I pre-process the data as a stream, or in some other way that doesn't fill up the disk (or memory)?
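The direction I'd like to go is to skip materializing input_values entirely and extract features on the fly. Something like this sketch with set_transform (untested; I'm assuming my datasets version supports transforms together with lazy Audio decoding, and that all clips are already at the feature extractor's sampling rate):

# Sketch (untested): compute features lazily per batch at access time,
# so nothing new is written to the Arrow cache on disk.
def extract_on_the_fly(batch):
    samples = batch[args.audio_column_name]  # list of {"array", "sampling_rate"} dicts
    inputs = feature_extractor(
        [s["array"] for s in samples],
        sampling_rate=feature_extractor.sampling_rate,  # assumes clips are already resampled
        max_length=max_length,
        truncation=True,
    )
    return {
        "input_values": inputs.input_values,
        "input_length": [len(x) for x in inputs.input_values],
    }

raw_datasets["train"].set_transform(extract_on_the_fly)
raw_datasets["validation"].set_transform(extract_on_the_fly)

Would something like that work here, or is there a more idiomatic streaming approach?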