Facing difficulty while fine-tuning a speech recognition model on my local PC

I have successfully fine-tuned a speech recognition model on Google Colab, but when I run the same code to fine-tune the model on my local PC, it fails with the following error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Input In [85], in <cell line: 1>()
----> 1 trainer.train()

File /usr/local/lib/python3.8/dist-packages/transformers/trainer.py:1316, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1314         tr_loss_step = self.training_step(model, inputs)
   1315 else:
-> 1316     tr_loss_step = self.training_step(model, inputs)
   1318 if (
   1319     args.logging_nan_inf_filter
   1320     and not is_torch_tpu_available()
   1321     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   1322 ):
   1323     # if loss is nan or inf simply add the average of previous logged losses
   1324     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File /usr/local/lib/python3.8/dist-packages/transformers/trainer.py:1849, in Trainer.training_step(self, model, inputs)
   1847         loss = self.compute_loss(model, inputs)
   1848 else:
-> 1849     loss = self.compute_loss(model, inputs)
   1851 if self.args.n_gpu > 1:
   1852     loss = loss.mean()  # mean() to average on multi-gpu parallel training

File /usr/local/lib/python3.8/dist-packages/transformers/trainer.py:1881, in Trainer.compute_loss(self, model, inputs, return_outputs)
   1879 else:
   1880     labels = None
-> 1881 outputs = model(**inputs)
   1882 # Save past state if it exists
   1883 # TODO: this needs to be fixed and made cleaner later.
   1884 if self.args.past_index >= 0:

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:1494, in Wav2Vec2ForCTC.forward(self, input_values, attention_mask, output_attentions, output_hidden_states, return_dict, labels)
   1449 r"""
   1450 labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_length)`, `optional`):
   1451     Labels for connectionist temporal classification. Note that ``target_length`` has to be smaller or equal to
   (...)
   1489     >>> loss = model(input_values, labels=labels).loss
   1490 """
   1492 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1494 outputs = self.wav2vec2(
   1495     input_values,
   1496     attention_mask=attention_mask,
   1497     output_attentions=output_attentions,
   1498     output_hidden_states=output_hidden_states,
   1499     return_dict=return_dict,
   1500 )
   1502 hidden_states = outputs[0]
   1503 hidden_states = self.dropout(hidden_states)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:1064, in Wav2Vec2Model.forward(self, input_values, attention_mask, mask_time_indices, output_attentions, output_hidden_states, return_dict)
   1059 output_hidden_states = (
   1060     output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
   1061 )
   1062 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1064 extract_features = self.feature_extractor(input_values)
   1065 extract_features = extract_features.transpose(1, 2)
   1067 if attention_mask is not None:
   1068     # compute reduced attention_mask corresponding to feature vectors

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:337, in Wav2Vec2FeatureExtractor.forward(self, input_values)
    335 hidden_states = input_values[:, None]
    336 for conv_layer in self.conv_layers:
--> 337     hidden_states = conv_layer(hidden_states)
    339 return hidden_states

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:258, in Wav2Vec2GroupNormConvLayer.forward(self, hidden_states)
    257 def forward(self, hidden_states):
--> 258     hidden_states = self.conv(hidden_states)
    259     hidden_states = self.layer_norm(hidden_states)
    260     hidden_states = self.activation(hidden_states)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:302, in Conv1d.forward(self, input)
    301 def forward(self, input: Tensor) -> Tensor:
--> 302     return self._conv_forward(input, self.weight, self.bias)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:298, in Conv1d._conv_forward(self, input, weight, bias)
    294 if self.padding_mode != 'zeros':
    295     return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
    296                     weight, bias, self.stride,
    297                     _single(0), self.dilation, self.groups)
--> 298 return F.conv1d(input, weight, bias, self.stride,
    299                 self.padding, self.dilation, self.groups)

RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

Can you tell me where the problem is?

It looks to me like your model's weights are on the GPU but your input is not. What does your training code look like?
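For reference, here is a minimal, self-contained sketch of the device mismatch the traceback is pointing at (not your actual training code): PyTorch raises exactly this `RuntimeError` when a layer's weights live on CUDA but the input tensor is still on the CPU, and the fix is to move the input to the model's device before the forward pass.

```python
import torch
import torch.nn as nn

# Pick the GPU if one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# A Conv1d stands in for the first wav2vec2 feature-extractor layer in the trace.
conv = nn.Conv1d(in_channels=1, out_channels=4, kernel_size=3).to(device)

x = torch.randn(1, 1, 16)  # CPU tensor, like the batch in the traceback

# On a CUDA machine, `conv(x)` would raise the same error as above, because the
# weights are torch.cuda.FloatTensor while the input is torch.FloatTensor.
y = conv(x.to(device))     # fix: move the input to the same device as the weights
print(y.shape)
```

Note that the Hugging Face `Trainer` normally moves input tensors to `args.device` for you, so when this error appears inside `trainer.train()` it often means the batch coming out of the data collator isn't a plain dict of tensors the `Trainer` can relocate, or the model was placed on the GPU outside the `Trainer`'s control.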

Here is my training notebook. Can you please take a look and suggest a solution?

https://github.com/iftekherhossain/Bangla-Voice-Recognition/blob/master/Bangla_voice_train.ipynb

I’m not experienced with audio, so I’m afraid I can’t help you here.