Hi everyone.
I want to use Whisper to do speech recognition on one-minute voice recordings. Everything works fine, but I get this warning:
Whisper did not predict an ending timestamp which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
First of all: what is WhisperTimeStampLogitsProcessor? I couldn’t find it in any documentation.
Secondly, can this affect the performance of speech recognition? How do I fix it? What can I try?
In the recordings, the end of speech is very clear.
Here is the code that led to this warning message:
def load_model(model_id: str, *, max_new_tokens: int = 128) -> "Pipeline":
    """Build an automatic-speech-recognition pipeline for ``model_id``.

    The model runs in float16 on ``cuda:0`` when CUDA is available and in
    float32 on the CPU otherwise. The pipeline is configured with 30-second
    chunks, a batch size of 16 and timestamps enabled.

    Args:
        model_id: Checkpoint identifier handed to ``from_pretrained`` for both
            the model and its processor.
        max_new_tokens: Cap on generated tokens, forwarded to ``pipeline``.
            NOTE(review): a cap that is too small for a 30 s chunk presumably
            stops generation before the closing timestamp token, which would
            explain the "did not predict an ending timestamp" warning and
            could truncate transcripts -- confirm, and raise it if so.

    Returns:
        The object created by ``pipeline(...)``. The annotation is quoted so
        it is not evaluated at definition time; it presumably refers to
        ``transformers.Pipeline`` -- confirm against the file's imports.
    """
    # Query CUDA once so the device and the dtype cannot disagree.
    cuda_available = torch.cuda.is_available()
    device = "cuda:0" if cuda_available else "cpu"
    torch_dtype = torch.float16 if cuda_available else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=max_new_tokens,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )
    return pipe
def batch_transcribe(file_addresses: list[str], pipe: pipeline, language: str) -> tuple:
    """Transcribe audio files in batches of 16, requesting word-level timestamps.

    Every result returned by ``pipe`` is tagged in place with the file address
    it was paired with and with ``language``.

    Args:
        file_addresses: Audio file locations wrapped in an ``AudioDataset``.
        pipe: Callable pipeline invoked once per batch.
        language: Language forwarded through ``generate_kwargs`` and stored on
            each result.

    Returns:
        ``(raw_results, text_results)``: the tagged results, and the value of
        each result's ``'text'`` entry in the same order.
    """
    audio_dataset = AudioDataset(file_addresses, language)
    loader = DataLoader(audio_dataset, batch_size=16, shuffle=False)

    transcripts = []
    tagged_outputs = []
    for addresses in tqdm.tqdm(loader):
        outputs = pipe(
            addresses,
            return_timestamps="word",
            generate_kwargs={"language": language},
        )
        # Results are paired with addresses positionally.
        for output, address in zip(outputs, addresses):
            output['file_address'] = address
            output['language'] = language
            tagged_outputs.append(output)
            transcripts.append(output['text'])

    return tagged_outputs, transcripts
def transcribe_audio(audio_file: str, pipe: pipeline, language: str) -> dict:
    """Run ``pipe`` on a single audio file with timestamps enabled.

    Args:
        audio_file: Location of the audio file handed to the pipeline.
        pipe: Callable pipeline that performs the transcription.
        language: Language forwarded through ``generate_kwargs``.

    Returns:
        Whatever the pipeline call returns, unchanged.
    """
    return pipe(
        audio_file,
        return_timestamps=True,
        generate_kwargs={"language": language},
    )
# Checkpoint to use, and the pipeline built from it.
model = "openai/whisper-large"
pipe = load_model(model)

# Independent copy of the metadata rows whose 'language' column equals `language`.
is_current_language = metadata['language'] == language
tmp_detail = metadata.loc[is_current_language].copy()

# Transcribe this language's files; the pipeline receives the mapped language name.
raw_results, text_results = batch_transcribe(
    dataset[language],
    pipe,
    language=language_map[language],
)
Any discussion or suggestions about this topic are welcome. I’d like to know why this might happen and what effects it can have.