Column lengths mismatch in IterableDataset

Hi, I’m trying to train Wav2Vec2ForCTC for phonetic recognition. Here is my code.

from datasets import load_dataset
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2CTCTokenizer

dataset = load_dataset("mirfan899/kids_phoneme_sm", streaming=True, split="train")
dataset = dataset.shuffle(seed=42, buffer_size=500)


tokenizer = Wav2Vec2CTCTokenizer("/kaggle/input/phoneme-vocab/vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

import soundfile as sf

def speech_file_to_array_fn(batch):
    batch["speech"] = batch["audio"][0]["array"]
    batch["sampling_rate"] = batch["audio"][0]["sampling_rate"]
    batch["target_text"] = batch["phonetic"]
    return batch

dataset = dataset.map(speech_file_to_array_fn, batch_size=8, batched=True)


def prepare_dataset(batch):
    # check that all files have the correct sampling rate
    assert (
            len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values

    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch


dataset_prepared = dataset.map(prepare_dataset, batch_size=8, batched=True)
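
One thing to note: the assert in prepare_dataset only checks that the sampling rates in a batch match each other, not that they are actually 16 kHz. In case they aren't, I believe the audio column can be decoded at the expected rate before the map calls with cast_column (just a sketch, assuming the column is called "audio" as above):

from datasets import Audio

# ask datasets to decode/resample the audio column to 16 kHz on the fly
# (this should also work on streaming datasets)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))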

The data collator:

import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

Now the remaining code using the data collator:

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
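
Just to convince myself the collator behaves as its docstring describes, here is a tiny check with made-up values; it should return input_values padded to the longest sequence and the shorter labels row filled with -100:

toy_features = [
    {"input_values": [0.1, 0.2, 0.3], "labels": [5, 7]},
    {"input_values": [0.4, 0.5], "labels": [2]},
]
toy_batch = data_collator(toy_features)
print(toy_batch["input_values"].shape)  # torch.Size([2, 3]), padded with 0.0
print(toy_batch["labels"])              # second row ends with -100, ignored by the CTC loss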

from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53", 
    gradient_checkpointing=True, 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./wav2vec2-base-timit-demo",
  per_device_train_batch_size=16,
  evaluation_strategy="steps",
  num_train_epochs=20,
  save_steps=1000,
  eval_steps=1000,
  max_steps=100,
  save_total_limit=2,
)

import numpy as np
from jiwer import wer

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    cer = wer(label_str, pred_str)

    return {"cer": cer}

from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_prepared.with_format("torch"),
    tokenizer=processor.feature_extractor,
)


trainer.train()

And I’m getting this error.

/opt/conda/lib/python3.10/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[29], line 1
----> 1 trainer.train()

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1662, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1657     self.model_wrapped = self.model
   1659 inner_training_loop = find_executable_batch_size(
   1660     self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
   1661 )
-> 1662 return inner_training_loop(
   1663     args=args,
   1664     resume_from_checkpoint=resume_from_checkpoint,
   1665     trial=trial,
   1666     ignore_keys_for_eval=ignore_keys_for_eval,
   1667 )

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1899, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1896     rng_to_sync = True
   1898 step = -1
-> 1899 for step, inputs in enumerate(epoch_iterator):
   1900     total_batched_samples += 1
   1901     if rng_to_sync:

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:634, in _BaseDataLoaderIter.__next__(self)
    631 if self._sampler_iter is None:
    632     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    633     self._reset()  # type: ignore[call-arg]
--> 634 data = self._next_data()
    635 self._num_yielded += 1
    636 if self._dataset_kind == _DatasetKind.Iterable and \
    637         self._IterableDataset_len_called is not None and \
    638         self._num_yielded > self._IterableDataset_len_called:

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:678, in _SingleProcessDataLoaderIter._next_data(self)
    676 def _next_data(self):
    677     index = self._next_index()  # may raise StopIteration
--> 678     data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    679     if self._pin_memory:
    680         data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:32, in _IterableDatasetFetcher.fetch(self, possibly_batched_index)
     30 for _ in possibly_batched_index:
     31     try:
---> 32         data.append(next(self.dataset_iter))
     33     except StopIteration:
     34         self.ended = True

File /opt/conda/lib/python3.10/site-packages/datasets/iterable_dataset.py:987, in IterableDataset.__iter__(self)
    984         yield from self._iter_pytorch(ex_iterable)
    985         return
--> 987 for key, example in ex_iterable:
    988     if self.features:
    989         # `IterableDataset` automatically fills missing columns with None.
    990         # This is done with `_apply_feature_types_on_example`.
    991         yield _apply_feature_types_on_example(
    992             example, self.features, token_per_repo_id=self._token_per_repo_id
    993         )

File /opt/conda/lib/python3.10/site-packages/datasets/iterable_dataset.py:460, in MappedExamplesIterable.__iter__(self)
    458 current_idx = 0
    459 if self.batched:
--> 460     for key, example in iterator:
    461         # If `batched`, first build the batch, if `batch_size` is None or <=0, then the batch is the whole dataset
    462         iterator_batch = (
    463             iterator
    464             if self.batch_size is None or self.batch_size <= 0
    465             else islice(iterator, self.batch_size - 1)
    466         )
    467         key_examples_list = [(key, example)] + [(key, example) for key, example in iterator_batch]

File /opt/conda/lib/python3.10/site-packages/datasets/iterable_dataset.py:490, in MappedExamplesIterable.__iter__(self)
    488 if transformed_batch:
    489     first_col = next(iter(transformed_batch))
--> 490     bad_cols = [
    491         col
    492         for col in transformed_batch
    493         if len(transformed_batch[col]) != len(transformed_batch[first_col])
    494     ]
    495     if bad_cols:
    496         raise ValueError(
    497             f"Column lengths mismatch: columns {bad_cols} have length {[len(transformed_batch[col]) for col in bad_cols]} while {first_col} has length {len(transformed_batch[first_col])}."
    498         )

File /opt/conda/lib/python3.10/site-packages/datasets/iterable_dataset.py:493, in <listcomp>(.0)
    488 if transformed_batch:
    489     first_col = next(iter(transformed_batch))
    490     bad_cols = [
    491         col
    492         for col in transformed_batch
--> 493         if len(transformed_batch[col]) != len(transformed_batch[first_col])
    494     ]
    495     if bad_cols:
    496         raise ValueError(
    497             f"Column lengths mismatch: columns {bad_cols} have length {[len(transformed_batch[col]) for col in bad_cols]} while {first_col} has length {len(transformed_batch[first_col])}."
    498         )

TypeError: object of type 'int' has no len()

Can you help me out?

Hi! Since you're mapping with batched=True, your map function should return a batch where each column is a list of the same size:

def speech_file_to_array_fn(batch):
    batch_size = len(batch["phonetic"])
    batch["speech"] = [batch["audio"][i]["array"] for i in range(batch_size)]
    batch["sampling_rate"] = [batch["audio"][i]["sampling_rate"] for i in range(batch_size)]
    batch["target_text"] = batch["phonetic"]
    return batch
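
If it helps, you can sanity-check the mapped streaming dataset by pulling a single example before handing it to the Trainer (a quick sketch using the column names from your code):

# grab one example from the streaming pipeline and check that the prepared
# columns are sequences, not bare ints
sample = next(iter(dataset_prepared))
print(len(sample["input_values"]), len(sample["labels"]))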

Okay, after that fix training starts, but it only runs for epoch 1 and doesn't proceed any further. What am I missing here?

TrainOutput(global_step=5, training_loss=46.29713439941406, metrics={'train_runtime': 12.3198, 'train_samples_per_second': 1.623, 'train_steps_per_second': 0.406, 'total_flos': 1929139833312000.0, 'train_loss': 46.29713439941406, 'epoch': 1.0})