Hi, I'm currently trying to map about 5000 audio files into my dataset on AWS SageMaker. When I try to do that, my kernel crashes after a few minutes; I assume it's overloading my cache/memory. When I use map with only 1000 files it works fine! Does anybody know what I can do about that? This is my code and the function I use to preprocess my data:
import numpy as np

def preprocess_function(examples):
    audio_arrays = [list(x["array"]) for x in examples["audio"]]
    max_length_audio = max(len(audio) for audio in audio_arrays)
    # pad (or trim) every clip to the length of the longest clip in the batch
    audio_arrays_padded = [
        np.pad(audio, (0, max_length_audio - len(audio)))
        if len(audio) < max_length_audio
        else audio[:max_length_audio]
        for audio in audio_arrays
    ]
    print(audio_arrays_padded[0][:10])
    text_list = examples['transcription']
    input_data = processor(
        audio=audio_arrays_padded,
        text_target=text_list,
        sampling_rate=16000,
        return_tensors='pt',
        return_attention_mask=True,
        padding='longest'
    )
    print(input_data)
    print(input_data['input_values'].shape)
    print(input_data['attention_mask'].shape)
    print(input_data['labels'].shape)
    print(input_data['decoder_attention_mask'].shape)
    return {"input_values": input_data['input_values'],
            "attention_mask": input_data['attention_mask'],
            "labels": input_data['labels'],
            "decoder_attention_mask": input_data['decoder_attention_mask']}

train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=None,
    keep_in_memory=True,
    load_from_cache_file=False
)