How did it turn out? I ran into the same issue, where it feels like it is taking 3-6x longer to predict than whisper medium.en took. I have a need of rapid near live transcriptions.
This is my code so far:
import os
import csv
import whisper
from transformers import WhisperForConditionalGeneration, WhisperConfig, WhisperModel, WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor
from transformers import pipeline
path_to_model = 'path_to_model'
model = WhisperForConditionalGeneration.from_pretrained(path_to_model)
model.config.max_length=150
processor = WhisperProcessor.from_pretrained(path_to_model, language="english", task="automatic-speech-recognition",
generation_num_beams=1)
tokenizer = WhisperTokenizer.from_pretrained(path_to_model,
generation_num_beams=1)
featureextractor = WhisperFeatureExtractor.from_pretrained(path_to_model)
pipe = pipeline(
task = 'automatic-speech-recognition',
model = model,
tokenizer = tokenizer,
feature_extractor = featureextractor,
chunk_length_s=15
)
def transcribe(audio):
text = pipe(audio)["text"]
return text
transcription = transcribe(file_path)