I got excited when I saw a tweet that Automatic Speech Recognition is in transformers 4.3.0, so I had to try it. Unfortunately, I ran into an error.
I started by recording a 14-second test file in QuickTime, and then used VLC to convert it from .m4a to .wav.
The first part ran fine:
import librosa
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")  # checkpoint fine-tuned on LibriSpeech 960h
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
speech, rate = sf.read("test1.wav")  # returns the samples plus the file's native sample rate
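In case the channel layout matters, here is a quick check of what sf.read returned (just printing the shape, dtype, sample rate, and duration; I'm assuming the first axis is samples):
print(speech.shape, speech.dtype, rate)  # stereo files come back as (samples, channels)
print(speech.shape[0] / rate, "seconds")  # should be roughly 14 for my clip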
The next line caused an error, which I fixed (as shown below):
speech = librosa.resample(speech, rate, 16000)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/Documents/projects/misc-aiml/wav2vec.py in
----> 16 speech = librosa.resample(speech, rate, 16000)
~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/librosa/core/audio.py in resample(y, orig_sr, target_sr, res_type, fix, scale, **kwargs)
582 y_hat = samplerate.resample(y.T, ratio, converter_type=res_type).T
583 else:
--> 584 y_hat = resampy.resample(y, orig_sr, target_sr, filter=res_type, axis=-1)
585
586 if fix:
~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/resampy/core.py in resample(x, sr_orig, sr_new, axis, filter, **kwargs)
95
96 if shape[axis] < 1:
---> 97 raise ValueError('Input signal length={} is too small to '
98 'resample from {}->{}'.format(x.shape[axis], sr_orig, sr_new))
99
ValueError: Input signal length=2 is too small to resample from 44100->16000
Based on the discussion here, I changed it to speech.T, which now seems to work so far. (I think the stereo .wav loads with shape (samples, 2), and librosa.resample works along the last axis, which is why it complained about a signal length of 2.)
speech = librosa.resample(speech.T, rate, 16000)
input_values = tokenizer(speech, return_tensors = 'pt').input_values
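One thing I'm not sure about is whether I should also downmix to mono before tokenizing. A sketch of what I mean, assuming librosa.to_mono accepts the (channels, samples) layout that the transpose produces:
speech = librosa.resample(speech.T, rate, 16000)  # speech.T has shape (channels, samples)
speech = librosa.to_mono(speech)  # average the two channels into a single 1-D signal
input_values = tokenizer(speech, return_tensors='pt').input_values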
However, I then get a different error:
logits = model(input_values).logits
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~/Documents/projects/misc-aiml/wav2vec.py in <module>
----> 1 model(input_values)
~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py in forward(self, input_values, output_attentions, output_hidden_states, return_dict, labels)
793 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
794
--> 795 outputs = self.wav2vec2(
796 input_values,
797 output_attentions=output_attentions,
~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py in forward(self, input_values, output_attentions, output_hidden_states, return_dict)
641 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
642
--> 643 hidden_states = self.feature_extractor(input_values)
644 hidden_states = self.feature_projection(hidden_states)
645
~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py in forward(self, input_values)
179 hidden_states = input_values[:, None]
180 for conv_layer in self.conv_layers:
--> 181 hidden_states = conv_layer(hidden_states)
182
183 return hidden_states
~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py in forward(self, hidden_states)
113
114 def forward(self, hidden_states):
--> 115 hidden_states = self.conv(hidden_states)
116 hidden_states = self.dropout(hidden_states)
117 hidden_states = self.layer_norm(hidden_states)
~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/torch/nn/modules/conv.py in forward(self, input)
256 self.weight, self.bias, self.stride,
257 _single(0), self.dilation, self.groups)
--> 258 return F.conv1d(input, self.weight, self.bias, self.stride,
259 self.padding, self.dilation, self.groups)
260
RuntimeError: Expected 3-dimensional input for 3-dimensional weight [512, 1, 10], but got 4-dimensional input of size [1, 1, 2, 221173] instead
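For completeness, these are the remaining steps I was planning to run once the forward pass works (going from memory of the example in the announcement, so this part may not be exactly right):
predicted_ids = torch.argmax(logits, dim=-1)  # most likely token at each timestep
transcription = tokenizer.batch_decode(predicted_ids)  # the tokenizer collapses repeated/blank tokens
print(transcription)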
Does anyone happen to know anything about the new model, and what I might be doing wrong?
Thanks!