Could you please guide me on how to handle an out-of-memory (OOM) issue while executing "Trainer.evaluate"?
Here is my setup:
# getting the encoded dataset
# preprocess_function(audio_dataset[:5])
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(128),
        truncation=True,
    )
    return inputs
audio_dict2 = {}
num_samples = 84000
audio_dict2['audio'] = [
    '../data/20230319_audioinput_1s_wav/test/' + str(int(y)) + '/audio_' + str(int(x)) + '.wav'
    for x, y in zip(df_test.head(num_samples).index, df_test.head(num_samples).label)
]
audio_dict2['label'] = [x for x in df_test.head(num_samples).label]
audio_dict2['split'] = len(df_test.head(num_samples)) * ['train']
audio_dataset2 = Dataset.from_dict(audio_dict2).cast_column("audio", Audio())
encoded_dataset2 = audio_dataset2.map(preprocess_function, remove_columns=["audio"], batched=True)
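(One thing I am wondering about: since there are 84,000 samples, materializing all features up front via .map may itself add memory pressure. A rough sketch of computing the features on the fly with datasets' set_transform instead, reusing the same preprocess_function as above, is below; I have not verified that this helps in my case.)

# Sketch (untested): build features lazily when a batch is accessed,
# instead of storing the whole encoded dataset in memory/on disk.
def on_the_fly(batch):
    inputs = preprocess_function(batch)
    inputs["label"] = batch["label"]   # keep labels, since the transform output replaces the batch
    return inputs

audio_dataset2.set_transform(on_the_fly)
# audio_dataset2 could then be passed to Trainer in place of encoded_dataset2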
# training args
model_name = model_checkpoint.split("/")[-1]
batch_size = 4
metric = load_metric("accuracy")

args = TrainingArguments(
    f"{model_name}-finetuned-ks1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=25,
    warmup_ratio=0.1,
    logging_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)
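(One option I came across in the TrainingArguments docs is eval_accumulation_steps, which moves the accumulated prediction tensors from GPU to CPU every N eval steps instead of keeping them all on the GPU for the whole eval set. Would adding it like the sketch below be the right way to address this? The value 10 is arbitrary on my side, not tuned.)

# Sketch: same arguments as above, plus eval_accumulation_steps so that
# prediction tensors are offloaded to CPU periodically during evaluation.
args = TrainingArguments(
    f"{model_name}-finetuned-ks1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_accumulation_steps=10,   # arbitrary choice, untested
    num_train_epochs=25,
    warmup_ratio=0.1,
    logging_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)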
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset2,
    eval_dataset=encoded_dataset2,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)
# I get the OOM error here!
outputs = trainer.evaluate(encoded_dataset2)
In my case the dataset is available locally. Is there an option to run this evaluation in smaller batches or chunks so that it fits in memory?
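(If there is no built-in option, would manually sharding the encoded dataset and calling evaluate per shard be a reasonable workaround? A rough, untested sketch of what I mean, with num_shards chosen arbitrarily:)

# Sketch (untested): evaluate in smaller shards and combine the per-shard
# accuracy, weighted by shard size. Assumes compute_metrics reports "accuracy".
num_shards = 20   # arbitrary number of chunks
total, weighted_acc = 0, 0.0
for i in range(num_shards):
    shard = encoded_dataset2.shard(num_shards=num_shards, index=i)
    shard_metrics = trainer.evaluate(shard)
    weighted_acc += shard_metrics["eval_accuracy"] * len(shard)
    total += len(shard)
print("accuracy over all shards:", weighted_acc / total)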