I'm running a simple wav2vec2 example:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from torchaudio.utils import download_asset
import torch
import librosa
if __name__ == '__main__':
    # Single source of truth for the sampling rate: used when loading the
    # audio and again when telling the processor what rate the samples are at.
    SAMPLE_RATE = 16000

    # Load the pretrained feature processor and the CTC acoustic model.
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

    # Fetch the sample utterance from the torchaudio tutorial assets.
    FILE_NAME = "tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
    SPEECH_FILE = download_asset(FILE_NAME)

    # Load the waveform at SAMPLE_RATE; `sr` is the rate librosa reports back.
    speech, sr = librosa.load(SPEECH_FILE, sr=SAMPLE_RATE)

    # Pass the rate librosa actually returned so loading and preprocessing
    # can never disagree about the sampling rate.
    input_values = processor(speech, sampling_rate=sr, return_tensors="pt").input_values

    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        # Raw (un-normalised) model outputs that a CTC decoder would consume.
        logits = model(input_values).logits
How can I decode the logits with a beam-search algorithm (without using a language model)?