TypeError: '<' not supported between instances of 'NoneType' and 'int' while training wav2vec2

Hello,

I read here about setting eval_steps to 10, but it hasn't worked for me. When I do, I get: ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['input_values'].

Here is my code:

import os
import torch
print(torch.cuda.is_available())
import transformers
print(transformers.__version__)
import datasets
print(datasets.__version__)


path_audio = "audios/"
path_transcripts = "transcripts/"

i = 0
j = 0
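# walk the audio and transcript folders and keep the lists of file names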
for (root, dirs, file) in os.walk(path_audio):
    if i > 6:
        break
    i = i + 1
    print(i)
    files = file

for (root, dirs, file) in os.walk(path_transcripts):
    if j > 6:
        break
    else:
        text = file
        j = j + 1


texts = []
for t in text:
    with open(path_transcripts + t) as f:
        line = f.readlines()
        if len(line)>0:
            texts.append(line[0])

import re
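# punctuation and special symbols to strip from the transcripts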
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\`\”\�\$\&]'

def remove_special_characters(batch):
    from unidecode import unidecode
    batch = re.sub(chars_to_ignore_regex, '', batch).lower()
    batch = unidecode(batch)
    if batch[-1] == " ":
        batch = batch[:-1]
    return batch

texts = list(map(remove_special_characters,texts))
text_combined = " ".join(texts)

# count how often each character occurs in the combined transcripts
tokens = {}
for token in text_combined:
    if token in tokens.keys():
        tokens[token] = tokens[token] + 1
    else:
        tokens[token] = 0
# use "|" instead of the blank as the word delimiter
tokens["|"] = tokens[" "]
del tokens[" "]

import json
with open('tokens.json', 'w') as f:
    json.dump(tokens, f)

data_lst = []
for a,t in zip(files,texts):
    data_lst.append({"path":path_audio + a, "transcription": t})

tokens["[UNK]"] = len(tokens)
tokens["[PAD]"] = len(tokens)

from transformers import Wav2Vec2CTCTokenizer
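# build the CTC tokenizer from the vocabulary file written above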
tokenizer = Wav2Vec2CTCTokenizer("tokens.json",unk_token="[UNK]",pad_token="[PAD]",word_delimiter_token="|")

from transformers import Wav2Vec2FeatureExtractor
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

from transformers import Wav2Vec2Processor
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

data_train = data_lst[0:round(len(data_lst)*0.8)]
data_test = data_lst[round(len(data_lst)*0.8):]

import librosa
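# librosa.load returns a (waveform, sampling_rate) tuple for each clip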
for d in data_lst:
    d["audio"] = librosa.load(d["path"],sr=16000)


def prepare_dataset(batch):
    audio = batch["audio"]

    # batched output is "un-batched"
    batch["input_values"] = processor(audio[0], sampling_rate=audio[-1]).input_values[0]
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcription"]).input_ids
        #batch["labels"] = replace_n(batch["labels"],tokens)
    return batch

audio_train = list(map(prepare_dataset, data_train))
audio_test = list(map(prepare_dataset, data_test))

import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset

df_train = pd.DataFrame(audio_train).drop(['path', 'audio','transcription'], axis=1)
df_test = pd.DataFrame(audio_test).drop(['path', 'audio','transcription'], axis=1)

train = Dataset(pa.Table.from_pandas(df_train)) 
test = Dataset(pa.Table.from_pandas(df_test)) 

import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
wer_metric = datasets.load_metric("wer")


import numpy as np

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
)

model.freeze_feature_encoder()

from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir='wav2vec2-spanish-demo',
  group_by_length=True,
  per_device_train_batch_size=8,
  evaluation_strategy="steps",
  num_train_epochs=30,
  fp16=True,
  gradient_checkpointing=True,
  save_steps=500,
  eval_steps=500,
  logging_steps=500,
  learning_rate=1e-4,
  weight_decay=0.005,
  warmup_steps=1000,
  save_total_limit=2,
)

from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train,
    eval_dataset=test,
    tokenizer=processor.feature_extractor,
)

trainer.train()

Here is the complete error:

/anaconda/envs/transformer/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
***** Running training *****
  Num examples = 26
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 120
/anaconda/envs/transformer/lib/python3.10/site-packages/transformers/models/wav2vec2/processing_wav2vec2.py:154: UserWarning: `as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your labels by using the argument `text` of the regular `__call__` method (either in the same call as your audio inputs, or in a separate call.
  warnings.warn(
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[58], line 1
----> 1 trainer.train()

File /anaconda/envs/transformer/lib/python3.10/site-packages/transformers/trainer.py:1500, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1495     self.model_wrapped = self.model
   1497 inner_training_loop = find_executable_batch_size(
   1498     self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
   1499 )
-> 1500 return inner_training_loop(
   1501     args=args,
   1502     resume_from_checkpoint=resume_from_checkpoint,
   1503     trial=trial,
   1504     ignore_keys_for_eval=ignore_keys_for_eval,
   1505 )

File /anaconda/envs/transformer/lib/python3.10/site-packages/transformers/trainer.py:1716, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1713     self._load_rng_state(resume_from_checkpoint)
   1715 step = -1
-> 1716 for step, inputs in enumerate(epoch_iterator):
   1717 
   1718     # Skip past any already trained steps if resuming training
   1719     if steps_trained_in_current_epoch > 0:
   1720         steps_trained_in_current_epoch -= 1

File /anaconda/envs/transformer/lib/python3.10/site-packages/torch/utils/data/dataloader.py:530, in _BaseDataLoaderIter.__next__(self)
    528 if self._sampler_iter is None:
    529     self._reset()
--> 530 data = self._next_data()
    531 self._num_yielded += 1
    532 if self._dataset_kind == _DatasetKind.Iterable and \
    533         self._IterableDataset_len_called is not None and \
    534         self._num_yielded > self._IterableDataset_len_called:

File /anaconda/envs/transformer/lib/python3.10/site-packages/torch/utils/data/dataloader.py:570, in _SingleProcessDataLoaderIter._next_data(self)
    568 def _next_data(self):
    569     index = self._next_index()  # may raise StopIteration
--> 570     data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    571     if self._pin_memory:
    572         data = _utils.pin_memory.pin_memory(data)

File /anaconda/envs/transformer/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:52, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
     50 else:
     51     data = self.dataset[possibly_batched_index]
---> 52 return self.collate_fn(data)

Cell In[50], line 39, in DataCollatorCTCWithPadding.__call__(self, features)
     33 batch = self.processor.pad(
     34     input_features,
     35     padding=self.padding,
     36     return_tensors="pt",
     37 )
     38 with self.processor.as_target_processor():
---> 39     labels_batch = self.processor.pad(
     40         label_features,
     41         padding=self.padding,
     42         return_tensors="pt",
     43     )
     45 # replace padding with -100 to ignore loss correctly
     46 labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

File /anaconda/envs/transformer/lib/python3.10/site-packages/transformers/models/wav2vec2/processing_wav2vec2.py:113, in Wav2Vec2Processor.pad(self, *args, **kwargs)
    111 # For backward compatibility
    112 if self._in_target_context_manager:
--> 113     return self.current_processor.pad(*args, **kwargs)
    115 input_features = kwargs.pop("input_features", None)
    116 labels = kwargs.pop("labels", None)

File /anaconda/envs/transformer/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2941, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
   2938         encoded_inputs[key] = to_py_obj(value)
   2940 # Convert padding_strategy in PaddingStrategy
-> 2941 padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
   2942     padding=padding, max_length=max_length, verbose=verbose
   2943 )
   2945 required_input = encoded_inputs[self.model_input_names[0]]
   2946 if required_input and not isinstance(required_input[0], (list, tuple)):

File /anaconda/envs/transformer/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2388, in PreTrainedTokenizerBase._get_padding_truncation_strategies(self, padding, truncation, max_length, pad_to_multiple_of, verbose, **kwargs)
   2385             max_length = self.model_max_length
   2387 # Test if we have a padding token
-> 2388 if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
   2389     raise ValueError(
   2390         "Asking to pad but the tokenizer does not have a padding token. "
   2391         "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
   2392         "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
   2393     )
   2395 # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided

TypeError: '<' not supported between instances of 'NoneType' and 'int'

Have you solved this problem? I've run into the same one :sob: