Thank you for your reply @patrickvonplaten, but when I set `mask_time_length=10` I get the following error after a few training steps: `ValueError: Cannot take a larger sample than population when 'replace=False'`.
```
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-18-0a4dde6faf17> in <module>
      1 # test
----> 2 trainer.train()

/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1330             tr_loss_step = self.training_step(model, inputs)
   1331         else:
-> 1332             tr_loss_step = self.training_step(model, inputs)
   1333 
   1334         if (

/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in training_step(self, model, inputs)
   1889 
   1890         with self.autocast_smart_context_manager():
-> 1891             loss = self.compute_loss(model, inputs)
   1892 
   1893         if self.args.n_gpu > 1:

/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
   1921         else:
   1922             labels = None
-> 1923         outputs = model(**inputs)
   1924         # Save past state if it exists
   1925         # TODO: this needs to be fixed and made cleaner later.

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1100         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102             return forward_call(*input, **kwargs)
   1103         # Do not call functions when jit is used
   1104         full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.6/dist-packages/transformers/models/wav2vec2/modeling_wav2vec2.py in forward(self, input_values, attention_mask, output_attentions, output_hidden_states, return_dict, labels)
   1659             output_attentions=output_attentions,
   1660             output_hidden_states=output_hidden_states,
-> 1661             return_dict=return_dict,
   1662         )
   1663 

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1100         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102             return forward_call(*input, **kwargs)
   1103         # Do not call functions when jit is used
   1104         full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.6/dist-packages/transformers/models/wav2vec2/modeling_wav2vec2.py in forward(self, input_values, attention_mask, mask_time_indices, output_attentions, output_hidden_states, return_dict)
   1286         hidden_states, extract_features = self.feature_projection(extract_features)
   1287         hidden_states = self._mask_hidden_states(
-> 1288             hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
   1289         )
   1290 

/usr/local/lib/python3.6/dist-packages/transformers/models/wav2vec2/modeling_wav2vec2.py in _mask_hidden_states(self, hidden_states, mask_time_indices, attention_mask)
   1233                 mask_length=self.config.mask_time_length,
   1234                 attention_mask=attention_mask,
-> 1235                 min_masks=self.config.mask_time_min_masks,
   1236             )
   1237             mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)

/usr/local/lib/python3.6/dist-packages/transformers/models/wav2vec2/modeling_wav2vec2.py in _compute_mask_indices(shape, mask_prob, mask_length, attention_mask, min_masks)
    241     # get random indices to mask
    242     spec_aug_mask_idx = np.random.choice(
--> 243         np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
    244     )
    245 

mtrand.pyx in numpy.random.mtrand.RandomState.choice()

ValueError: Cannot take a larger sample than population when 'replace=False'
```
In this case `input_length = 9` and `mask_length = 10`, so `np.arange(input_length - (mask_length - 1))` is `np.arange(0)`, i.e. an empty array: a sequence of 9 feature frames has no valid start position for a mask span of length 10, and `np.random.choice` has nothing to sample from.
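To illustrate the failure outside the Trainer, here is a minimal sketch that repeats the same computation with the values above and shows one possible guard. The variable names mirror `_compute_mask_indices`, but the guard is only an illustration of how the crash could be avoided, not necessarily how it should be fixed in `transformers`, and the exact `ValueError` message can differ between NumPy versions:

```python
import numpy as np

input_length = 9      # feature frames of a very short audio clip
mask_length = 10      # config.mask_time_length
num_masked_span = 1   # derived from mask_time_prob / mask_time_min_masks

# Same expression as in _compute_mask_indices: with input_length < mask_length
# there is no valid start index, so the candidate array is empty.
candidates = np.arange(input_length - (mask_length - 1))
print(len(candidates))  # 0

# Sampling num_masked_span start indices from an empty population raises a
# ValueError (the exact message depends on the NumPy version).
try:
    np.random.choice(candidates, num_masked_span, replace=False)
except ValueError as err:
    print(err)

# One possible guard (illustration only): clamp the number of spans to what
# the sequence can hold and skip masking entirely when no span fits.
max_num_masked_span = max(input_length - (mask_length - 1), 0)
num_masked_span = min(num_masked_span, max_num_masked_span)
if num_masked_span > 0:
    spec_aug_mask_idx = np.random.choice(candidates, num_masked_span, replace=False)
```

Since only very short clips hit this path (9 frames is well under half a second of 16 kHz audio after the feature extractor's downsampling), filtering out extremely short training samples or lowering `mask_time_length` also works around the crash.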