IndexError: Invalid key: 16 is out of bounds for size 0

I am trying to generate a large dataset for fine-tuning a Wav2Vec2 model.

It’s important that the data does not get cached in memory, and I am still not sure whether that will be the case the way I am doing it.

However, I have managed to generate a small dataset myself (TIMIT in this case), but as soon as training starts I get the following exception:

  File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/transformers/trainer.py", line 1290, in train
    for step, inputs in enumerate(epoch_iterator):
  File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 521, in __next__
    data = self._next_data()
  File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 561, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/datasets/arrow_dataset.py", line 1857, in __getitem__
    return self._getitem(
  File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/datasets/arrow_dataset.py", line 1849, in _getitem
    pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
  File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/datasets/formatting/formatting.py", line 462, in query_table
    _check_valid_index_key(key, size)
  File "/home/sfalk/miniconda3/envs/speech/lib/python3.9/site-packages/datasets/formatting/formatting.py", line 405, in _check_valid_index_key
    raise IndexError(f"Invalid key: {key} is out of bounds for size {size}")
IndexError: Invalid key: 16 is out of bounds for size 0
  0%|                                                   | 0/700 [00:00<?, ?it/s]

This is the implementation of my GeneratorBasedBuilder:

class NewDataset(datasets.GeneratorBasedBuilder):

    VERSION: datasets.Version = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            version=VERSION, description="This part of my dataset covers a first domain"
        ),
    ]

    def _info(self):

        features = datasets.Features(
            {
                "inputs": datasets.features.Sequence(datasets.Value("int16")),
                "targets": datasets.Value("string"),
                "length": datasets.Value("int64"),
            }
        )

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": "/mariana/asr/corpora/converted/en/timit_train",
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": "/mariana/asr/corpora/converted/en/timit_test",
                    "split": "test"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": "/mariana/asr/corpora/converted/en/timit_dev",
                    "split": "dev",
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        corpus = ConvertedCorpus(filepath)
        for i, record in enumerate(corpus.sample_generator()):
            key = "/".join((str(record.speaker_id), str(record.sample_id)))
            yield key, dict(inputs=record.wav, targets=record.transcript, length=len(record.wav))
            if i >= 100:
                break
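
For reference, the generated splits can be loaded and inspected directly to confirm the expected columns are there before training (a minimal sketch, assuming the builder script above is saved as new_dataset.py):

from datasets import load_dataset

# load the splits produced by the builder script above
# (hypothetical script name; adjust the path/cache_dir to your setup)
ds = load_dataset("new_dataset.py")

print(ds)                          # split names and num_rows
print(ds["train"].column_names)    # expected: ['inputs', 'targets', 'length']
print(ds["train"][0]["length"])    # sanity-check a single example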

This is the content of the cache directory:

$ ls -lah
total 30M
drwxrwxr-x 2 sfalk sfalk 4.0K Feb  1 13:28 .
drwxrwxr-x 3 sfalk sfalk 4.0K Feb  1 13:28 ..
-rw-rw-r-- 1 sfalk sfalk 1.2K Feb  1 13:28 dataset_info.json
-rw-rw-r-- 1 sfalk sfalk    0 Feb  1 13:28 LICENSE
-rw-rw-r-- 1 sfalk sfalk  11M Feb  1 13:28 new_dataset-test.arrow
-rw-rw-r-- 1 sfalk sfalk  10M Feb  1 13:28 new_dataset-train.arrow
-rw-rw-r-- 1 sfalk sfalk 9.3M Feb  1 13:28 new_dataset-validation.arrow

And this here is the dataset_info.json:

{
  "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.\n",
  "citation": "@InProceedings{huggingface:dataset,\ntitle = {A great new dataset},\nauthor={huggingface, Inc.\n},\nyear={2020}\n}\n",
  "homepage": "",
  "license": "",
  "features": {
    "inputs": {
      "feature": {
        "dtype": "int16",
        "id": null,
        "_type": "Value"
      },
      "length": -1,
      "id": null,
      "_type": "Sequence"
    },
    "targets": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "length": {
      "dtype": "int64",
      "id": null,
      "_type": "Value"
    }
  },
  "post_processed": null,
  "supervised_keys": null,
  "task_templates": null,
  "builder_name": "new_dataset",
  "config_name": "default",
  "version": {
    "version_str": "0.0.1",
    "description": null,
    "major": 0,
    "minor": 0,
    "patch": 1
  },
  "splits": {
    "train": {
      "name": "train",
      "num_bytes": 10383006,
      "num_examples": 101,
      "dataset_name": "new_dataset"
    },
    "test": {
      "name": "test",
      "num_bytes": 10771888,
      "num_examples": 101,
      "dataset_name": "new_dataset"
    },
    "validation": {
      "name": "validation",
      "num_bytes": 9742303,
      "num_examples": 101,
      "dataset_name": "new_dataset"
    }
  },
  "download_checksums": {},
  "download_size": 0,
  "post_processing_size": null,
  "dataset_size": 30897197,
  "size_in_bytes": 30897197
}

One additional interesting observation here: there is a replays field on the MemoryMappedTable object, and it looks like all feature columns have been dropped?

Hi! It looks like your code dropped the columns at one point. Which script are you using?

I know that the Trainer class from transformers drops the columns that are not named after actual inputs of the model you want to use. Could it be because of that?
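
A quick way to check is to compare the dataset's column names against the model's forward() signature, which is roughly the check the Trainer performs (a sketch; model and dataset stand for your own objects):

import inspect

# arguments the model's forward() accepts
model_args = set(inspect.signature(model.forward).parameters.keys())

# columns present in the dataset
dataset_columns = set(dataset["train"].column_names)

print("model forward args:", model_args)
print("dataset columns:", dataset_columns)
print("columns the Trainer would drop:", dataset_columns - model_args)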


Hello, I am facing the same issue. Here is a minimal example that fails.

import librosa

import torch
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2Processor, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
from transformers import DataCollatorForTokenClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, load_metric


# load tokenizer and model; see the docs:
# https://huggingface.co/docs/transformers/model_doc/wav2vec2#wav2vec2
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# load dataset and more
dataset = load_dataset('hf-internal-testing/librispeech_asr_dummy', 'clean')
data_collator = DataCollatorForTokenClassification(tokenizer=processor.feature_extractor, padding=True) 
wer_metric = load_metric("wer")

# training
training_args = TrainingArguments(output_dir=f"wav2vec2")



trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    train_dataset=dataset['validation'],
    tokenizer=processor.feature_extractor
)

trainer.train()

And I get the same IndexError: Invalid key: ... is out of bounds for size 0 error message as in the original post.

Any idea of what is wrong?


Hi! You are using the transformers Trainer. Please note that the Trainer drops all dataset columns that are not actual inputs to the model for training. If the dataset ends up with no columns, its size becomes zero.

In particular, in your case the Trainer must have logged

***** Running training *****
  Num examples = 0
  Num Epochs = 3
  ...

Can you try setting remove_unused_columns=False in the training arguments?
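
For reference, that would look roughly like this (a sketch based on the snippet above; the other arguments stay as you had them):

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="wav2vec2",
    remove_unused_columns=False,  # keep columns even if they don't match the model's forward() arguments
)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    train_dataset=dataset["validation"],
    tokenizer=processor.feature_extractor,
)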


Thank you, it solved the invalid key issue.

I am also facing the same issue; see the code below.

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import os
from datasets import load_dataset

os.environ["WANDB_DISABLED"] = "true"

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", low_cpu_mem_usage=True)
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", low_cpu_mem_usage=True)
train_dataset = load_dataset('D:\Vinoth\Finetune_GPTNEO_GPTJ6B\cp')

training_args = TrainingArguments(
    output_dir='results',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset["train"],
)

trainer.train()

Same issue here. Very annoying. If the Trainer decides to change or drop anything, it should log what it does and WHY it does it. Which file and line contains the code that does that removal?

It's _remove_unused_columns() in trainer.py in transformers.
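
One way to surface what it drops is to raise the transformers log level before building the Trainer; the removed columns should then show up in the INFO logs (a small sketch, assuming a reasonably recent transformers version):

from transformers.utils import logging as hf_logging

# show INFO-level messages, including the Trainer's note about ignored columns
hf_logging.set_verbosity_info()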

I'm currently facing a similar issue. I created a dataset from a folder in my Drive as follows:

import os
import librosa
import pandas as pd
from datasets import Dataset

Step 2: Read file paths

def get_file_paths(audio_folder, transcript_folder):
    audio_files = sorted([os.path.join(audio_folder, f) for f in os.listdir(audio_folder) if f.endswith('.wav')])
    transcript_files = sorted([os.path.join(transcript_folder, f) for f in os.listdir(transcript_folder) if f.endswith('.txt')])
    return audio_files, transcript_files

Step 3: Load and align audio and transcripts

def load_and_align_data(audio_files, transcript_files):
    data = []
    for audio_file, transcript_file in zip(audio_files, transcript_files):
        with open(transcript_file, 'r') as f:
            transcript = f.read().strip().upper()
        data.append({'audio': audio_file, 'text': transcript})
    return data

Step 4: Apply alignment function

audio_folder = "/content/drive/MyDrive/TrainingDataset/Audio"
transcript_folder = "/content/drive/MyDrive/TrainingDataset/Transcripts"
audio_files, transcript_files = get_file_paths(audio_folder, transcript_folder)
aligned_data = load_and_align_data(audio_files, transcript_files)

Step 5: Create custom dataset

custom_dataset = Dataset.from_dict({"audio": [d["audio"] for d in aligned_data], "text": [d["text"] for d in aligned_data]})

I then loaded the models and defined the training arguments:

from transformers import TrainingArguments, Trainer, Wav2Vec2ForCTC, Wav2Vec2Tokenizer, Wav2Vec2Processor

# load pre-trained model, tokenizer, and processor
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

split_ratio = 0.1
num_samples = len(custom_dataset)
train_dataset = custom_dataset.select(range(int(num_samples * (1 - split_ratio))))
val_dataset = custom_dataset.select(range(int(num_samples * (1 - split_ratio)), num_samples))

Step 4: Define the training arguments

training_args = TrainingArguments(
    remove_unused_columns=False,
    output_dir="/content/drive/MyDrive/ASR_Results",  # output directory for the checkpoints and evaluation results
    evaluation_strategy="steps",        # evaluation strategy to adopt during training
    eval_steps=500,                     # number of steps between evaluations on the validation set
    save_total_limit=2,                 # limit the total amount of checkpoints
    learning_rate=3e-4,                 # learning rate for the optimizer
    per_device_train_batch_size=4,      # batch size for training
    per_device_eval_batch_size=4,       # batch size for evaluation
    num_train_epochs=5,                 # total number of training epochs
    weight_decay=0.01,                  # weight decay for regularization
    push_to_hub=False,
    logging_dir="./logs",               # directory for storing logs
    logging_steps=500,                  # number of steps between logging messages
)

from datasets import load_metric
from transformers import DataCollatorForTokenClassification

print(len(train_dataset))
print(len(val_dataset))

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

# training

training_args = TrainingArguments(output_dir=f"wav2vec2")

Step 5: Instantiate the Trainer class

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
)

Step 6: Train the model

trainer.train()

Please help, it would mean a lot.