Failed attempt to use new Automatic Speech Recognition

I got excited when I saw a tweet that Automatic Speech Recognition is in transformers 4.3.0, so I had to try it. Unfortunately, I ran into an error.

I started by recording a 14-second test file in QuickTime, and then used VLC to convert it from .m4a to .wav.

The first part ran fine:

import librosa
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

speech, rate = sf.read("test1.wav")

The next line caused an error, which I fixed as shown below:

speech = librosa.resample(speech, rate, 16000)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/Documents/projects/misc-aiml/wav2vec.py in 
----> 16 speech = librosa.resample(speech, rate, 16000)

~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/librosa/core/audio.py in resample(y, orig_sr, target_sr, res_type, fix, scale, **kwargs)
    582         y_hat = samplerate.resample(y.T, ratio, converter_type=res_type).T
    583     else:
--> 584         y_hat = resampy.resample(y, orig_sr, target_sr, filter=res_type, axis=-1)
    585 
    586     if fix:

~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/resampy/core.py in resample(x, sr_orig, sr_new, axis, filter, **kwargs)
     95 
     96     if shape[axis] < 1:
---> 97         raise ValueError('Input signal length={} is too small to '
     98                          'resample from {}->{}'.format(x.shape[axis], sr_orig, sr_new))
     99 

ValueError: Input signal length=2 is too small to resample from 44100->16000

Based on the discussion here, I changed it to speech.T, which now seems to work so far:


speech = librosa.resample(speech.T, rate, 16000)

input_values = tokenizer(speech, return_tensors = 'pt').input_values
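
(For reference, sf.read returns stereo audio with shape (n_samples, 2), and librosa.resample works along the last axis, which is why it saw a "signal" of length 2 above. An alternative sketch, assuming the recording really is stereo, is to collapse it to mono before resampling so everything downstream receives a 1-D array:

speech, rate = sf.read("test1.wav")             # stereo: shape (n_samples, 2)
speech = librosa.to_mono(speech.T)              # average the channels -> shape (n_samples,)
speech = librosa.resample(speech, rate, 16000)  # resample the mono signal to 16 kHz

input_values = tokenizer(speech, return_tensors='pt').input_values)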

However, then I get a different error:

logits = model(input_values).logits
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
~/Documents/projects/misc-aiml/wav2vec.py in <module>
----> 1 model(input_values)

~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py in forward(self, input_values, output_attentions, output_hidden_states, return_dict, labels)
    793         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    794 
--> 795         outputs = self.wav2vec2(
    796             input_values,
    797             output_attentions=output_attentions,

~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py in forward(self, input_values, output_attentions, output_hidden_states, return_dict)
    641         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    642 
--> 643         hidden_states = self.feature_extractor(input_values)
    644         hidden_states = self.feature_projection(hidden_states)
    645 

~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py in forward(self, input_values)
    179         hidden_states = input_values[:, None]
    180         for conv_layer in self.conv_layers:
--> 181             hidden_states = conv_layer(hidden_states)
    182 
    183         return hidden_states

~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py in forward(self, hidden_states)
    113 
    114     def forward(self, hidden_states):
--> 115         hidden_states = self.conv(hidden_states)
    116         hidden_states = self.dropout(hidden_states)
    117         hidden_states = self.layer_norm(hidden_states)

~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/miniconda3/envs/wav2vec/lib/python3.8/site-packages/torch/nn/modules/conv.py in forward(self, input)
    256                             self.weight, self.bias, self.stride,
    257                             _single(0), self.dilation, self.groups)
--> 258         return F.conv1d(input, self.weight, self.bias, self.stride,
    259                         self.padding, self.dilation, self.groups)
    260 

RuntimeError: Expected 3-dimensional input for 3-dimensional weight [512, 1, 10], but got 4-dimensional input of size [1, 1, 2, 221173] instead

Does anyone happen to know anything about the new model, and what I might be doing wrong?

Thanks!

I solved it - I made one change:

logits = model(input_values[0]).logits

I ran the model on the first element of the tensor (adding in the [0]), and now it succeeds!
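
For completeness, decoding the logits to text follows the standard wav2vec2 example: take the argmax over the vocabulary dimension and pass the predicted IDs through the tokenizer.

predicted_ids = torch.argmax(logits, dim=-1)           # greedy CTC decoding
transcription = tokenizer.batch_decode(predicted_ids)  # IDs -> text
print(transcription)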

Thanks!


I’m trying to use this model to transcribe YouTube videos, but my Google Colab instance keeps crashing at that line. It says I’ve used up all of my memory (even though I’m using the High-RAM setting in Colab).

Here’s my code:

!pip install git+https://github.com/huggingface/transformers
!pip install youtube-dl path.py soundfile librosa sentencepiece torchaudio

import youtube_dl
from path import Path as Path
import tempfile
import textwrap
import librosa
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

wrapper = textwrap.TextWrapper(width=70)

mydir = tempfile.TemporaryDirectory()
dirname = mydir.name + "/tmp.wav"
print(dirname)

!youtube-dl -o $dirname -ci -f 'bestvideo[ext=mp4]+bestaudio' -x --audio-format wav https://www.youtube.com/watch?v=d5yfUuHYWho

filename = dirname + ".wav"
speech, rate = sf.read(filename)
speech = librosa.resample(speech.T, rate, 16000)
input_values = tokenizer(speech, return_tensors = 'pt').input_values
logits = model(input_values[0]).logits
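
One thing that might help with the memory blow-up, offered only as an untested sketch rather than a confirmed fix: a long YouTube track produces a very large input tensor, so running inference on shorter chunks under torch.no_grad() should keep peak memory bounded. This continues from the code above (it reuses model, tokenizer, wrapper, and speech), assumes speech ends up as a 1-D mono array at 16 kHz, and the 30-second window length is an arbitrary choice.

# collapse to mono first if the resampled array still has two channels
if speech.ndim > 1:
    speech = librosa.to_mono(speech)

chunk_len = 30 * 16000   # 30-second windows at 16 kHz
pieces = []
with torch.no_grad():    # inference only, so skip gradient bookkeeping
    for start in range(0, len(speech), chunk_len):
        chunk = speech[start:start + chunk_len]
        inputs = tokenizer(chunk, return_tensors='pt').input_values
        logits = model(inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        pieces.append(tokenizer.batch_decode(predicted_ids)[0])

print(wrapper.fill(" ".join(pieces)))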