Hello,
I’m running a very simple piece of code to get the transcription of an audio file. Is there a way to get the timestamp of each word within the original audio?
Here is the code I’m running. Thank you so much for your time and help.
from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
import librosa as lb
import torch
def extract(fpath, *, model_name="KBLab/wav2vec2-large-xlsr-53-swedish"):
    """Transcribe the audio file at *fpath* with a Wav2Vec2 CTC model.

    Args:
        fpath: Location of the sound file; handed unchanged to
            ``librosa.load``.
        model_name: Checkpoint identifier used for BOTH the tokenizer and
            the model, so the two cannot drift apart. Defaults to the
            Swedish checkpoint this function was originally hard-wired to.

    Returns:
        The result of ``tokenizer.batch_decode`` applied to the per-frame
        argmax token ids, i.e. the decoded transcription(s).
    """
    # Tokenizer and model are built from the same identifier.
    # NOTE: both are re-created on every call; callers that transcribe many
    # files may want to cache them outside this function.
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)

    # Read the sound file, requesting a 16 kHz sampling rate from librosa.
    # The returned rate is not needed afterwards.
    waveform, _ = lb.load(fpath, sr=16000)

    # Turn the raw waveform into the padded tensor the model consumes.
    input_values = tokenizer(
        waveform, return_tensors="pt", padding=True
    ).input_values

    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: pick the highest-scoring token at every position,
    # then let the tokenizer turn the ids into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return tokenizer.batch_decode(predicted_ids)