from datasets import load_dataset, Audio, Dataset
from transformers.pipelines.pt_utils import KeyDataset
from transformers import AutoModelForSpeechSeq2Seq, AutoModelForCausalLM, AutoProcessor, pipeline
from tqdm import tqdm

# model, processor, torch_dtype, and device are set up earlier (not shown in this snippet).
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=416,
    batch_size=16,
    return_timestamps=False,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "hindi"},
)

# audio_files is a list of paths to local audio files (defined earlier, not shown).
audio_dataset = Dataset.from_dict({"audio": audio_files}).cast_column(
    "audio", Audio(sampling_rate=16000)
)

for i, out in tqdm(
    enumerate(pipe(KeyDataset(audio_dataset, "audio"))), total=len(audio_files)
):
    pass
When iterating over the dataset, I get the following error:
ValueError: The elements of the batch contain different keys. Cannot batch them ({'input_features', 'num_frames', 'is_last'} != {'input_features', 'is_last'})
I checked that all files in audio_files exist and are not empty.
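For reference, the check I ran looked roughly like the following (a sketch; it assumes soundfile is installed and that audio_files is a plain list of local paths, neither of which is shown above):

import os
import soundfile as sf

# Sanity check: every entry in audio_files should exist, be non-empty,
# and be decodable as audio. soundfile is just one way to verify this.
for path in audio_files:
    assert os.path.isfile(path), f"missing file: {path}"
    assert os.path.getsize(path) > 0, f"empty file: {path}"
    info = sf.info(path)  # raises if the file cannot be decoded
    print(path, info.samplerate, round(info.duration, 2), "s")

All files pass this check, so the mismatch in batch keys does not seem to come from missing or empty files.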