I get an error
RuntimeError: Expected 3-dimensional input for 3-dimensional weight [512, 1, 10], but got 5-dimensional input of size [1, 1, 1, 240000, 2] instead
while feeding the Wav2Vec2Processor
and HubertForCTC
with a wav
audio file:
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-xlarge-ls960-ft"
, cache_dir=os.getenv("cache_dir", "../../models"))
model = HubertForCTC.from_pretrained("facebook/hubert-xlarge-ls960-ft"
, cache_dir=os.getenv("cache_dir", "../../models"))
for idx, audio in enumerate(train_loader):
input_values = processor(audio, sampling_rate=sampling_rate, return_tensors="pt").input_values # Batch size 1
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])
print(transcription)
where the audio input comes from this function
def read_audio(self, audio_path):
try:
import soundfile as sf
y, _ = sf.read(audio_path)
return y # [1, 960000]
except Exception as err:
try:
import librosa
y, _ = librosa.load(audio_path, sr=self.sr)
return y
except Exception as err:
pass
return None
The shape of the audio is like (with soundfile
):
0 torch.Size([1, 960000])
1 torch.Size([1, 240000, 2])
and librosa
:
0 torch.Size([1, 240000])
1 torch.Size([1, 960000])