Hi, I’m getting the following error when training GPT-2 from scratch:
IndexError: Invalid key: 1104 is out of bounds for size 0
Overview of Task
I am training GPT-2 from scratch on a musical dataset in ABC notation, with the aim of generating music from an ABC prompt.
Example of ABC form: (|~B2 AB dBAG|FDAD BDAD|FDAD dAFD|)
I’d very much appreciate any help. My full code is below. One thing that stands out to me in the log is that training starts with "Num examples = 0".
Output with Error and Full Traceback
loading file /content/gdrive/MyDrive/3YP/tokenizer/vocab.json
loading file /content/gdrive/MyDrive/3YP/tokenizer/merges.txt
loading file None
loading file None
loading file None
loading configuration file /content/gdrive/MyDrive/3YP/tokenizer/config.json
Model config GPT2Config {
"_name_or_path": "/content/gdrive/MyDrive/3YP/tokenizer",
"activation_function": "gelu_new",
"attn_pdrop": 0.1,
"bos_token_id": 50256,
"embd_pdrop": 0.1,
"eos_token_id": 50256,
"initializer_range": 0.02,
"layer_norm_epsilon": 1e-05,
"model_type": "gpt2",
"n_embd": 768,
"n_head": 12,
"n_inner": null,
"n_layer": 12,
"n_positions": 1024,
"reorder_and_upcast_attn": false,
"resid_pdrop": 0.1,
"scale_attn_by_inverse_layer_idx": false,
"scale_attn_weights": true,
"summary_activation": null,
"summary_first_dropout": 0.1,
"summary_proj_to_labels": true,
"summary_type": "cls_index",
"summary_use_proj": true,
"transformers_version": "4.17.0",
"use_cache": true,
"vocab_size": 50257
}
Assigning </s> to the eos_token key of the tokenizer
Assigning <s> to the bos_token key of the tokenizer
Assigning <unk> to the unk_token key of the tokenizer
Assigning <pad> to the pad_token key of the tokenizer
Assigning <mask> to the mask_token key of the tokenizer
VOCAB SIZE: 1200
Using custom data configuration default-821d577aa689a915
Reusing dataset text (/root/.cache/huggingface/datasets/text/default-821d577aa689a915/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)
100% 1/1 [00:00<00:00, 37.49it/s]
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`, you can safely ignore this message.
/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
FutureWarning,
***** Running training *****
Num examples = 0
Num Epochs = 1
Instantaneous batch size per device = 1
Total train batch size (w. parallel, distributed & accumulation) = 1
Gradient Accumulation steps = 1
Total optimization steps = 3640
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-74-23b0ca32fb98> in <module>()
78 )
79
---> 80 trainer.train()
81 trainer.save_model("/content/gdrive/MyDrive/3YP/ABCModel")
/usr/local/lib/python3.7/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1372
1373 step = -1
-> 1374 for step, inputs in enumerate(epoch_iterator):
1375
1376 # Skip past any already trained steps if resuming training
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in __next__(self)
519 if self._sampler_iter is None:
520 self._reset()
--> 521 data = self._next_data()
522 self._num_yielded += 1
523 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
559 def _next_data(self):
560 index = self._next_index() # may raise StopIteration
--> 561 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
562 if self._pin_memory:
563 data = _utils.pin_memory.pin_memory(data)
/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in __getitem__(self, key)
1764 """Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools)."""
1765 return self._getitem(
-> 1766 key,
1767 )
1768
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in _getitem(self, key, decoded, **kwargs)
1747 format_kwargs = format_kwargs if format_kwargs is not None else {}
1748 formatter = get_formatter(format_type, features=self.features, decoded=decoded, **format_kwargs)
-> 1749 pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
1750 formatted_output = format_table(
1751 pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns
/usr/local/lib/python3.7/dist-packages/datasets/formatting/formatting.py in query_table(table, key, indices)
484 else:
485 size = indices.num_rows if indices is not None else table.num_rows
--> 486 _check_valid_index_key(key, size)
487 # Query the main table
488 if indices is None:
/usr/local/lib/python3.7/dist-packages/datasets/formatting/formatting.py in _check_valid_index_key(key, size)
427 if isinstance(key, int):
428 if (key < 0 and key + size < 0) or (key >= size):
--> 429 raise IndexError(f"Invalid key: {key} is out of bounds for size {size}")
430 return
431 elif isinstance(key, slice):
IndexError: Invalid key: 1104 is out of bounds for size 0
Dataset Preparation
import os

MAX_CHAR_LENGTH = 400
MIN_CHAR_LENGTH = 300
NEWLINECHAR = "<N>"

# Collect all ABC text files and write them into a single training file,
# one tune (or tune fragment) per line.
paths, dirs, files = next(os.walk("/content/gdrive/MyDrive/3YP/trainset/abc_txt"))

with open("/content/gdrive/MyDrive/3YP/trainset/abc_text_data.txt", "a") as f:
    for fpath in files:
        d = open("/content/gdrive/MyDrive/3YP/trainset/abc_txt/" + fpath, "r").read()
        fd = d.replace("\n", NEWLINECHAR)
        if MIN_CHAR_LENGTH < len(d) <= MAX_CHAR_LENGTH:
            # Tune fits into a single training line as-is
            f.write(fd + "\n")
        else:
            # Split long tunes on bar lines and re-assemble chunks of
            # roughly MIN_CHAR_LENGTH..MAX_CHAR_LENGTH characters
            sd = fd.split('|')
            substring = ""
            for split in sd:
                substring += "|" + split
                if MIN_CHAR_LENGTH <= len(substring) <= MAX_CHAR_LENGTH:
                    f.write(substring + '\n')
                    substring = ""
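Since the training log reports "Num examples = 0", this is a quick sanity check I can run on the prepared file (just a rough sketch, separate from the training code; the path is the same one written above):

# Rough sanity check (not part of the training script): does the prepared
# text file actually contain any non-empty lines?
train_file = "/content/gdrive/MyDrive/3YP/trainset/abc_text_data.txt"

with open(train_file, "r") as f:
    lines = [line for line in f if line.strip()]

print("Non-empty lines in training file:", len(lines))
if lines:
    print("First line preview:", lines[0][:80])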
Tokenization, Initializing Model with Config & Training
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset
from transformers import Trainer, TrainingArguments

TRAIN_BASE = True
paths = ["/content/gdrive/MyDrive/3YP/trainset/abc_text_data.txt"]

# Train a byte-level BPE tokenizer on the ABC text file and save it
if TRAIN_BASE:
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=paths, vocab_size=1500, min_frequency=0, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])
    tokenizer.save_model("/content/gdrive/MyDrive/3YP/tokenizer")

# input = "|x2[=E=B,G,"
# t = tokenizer.encode(input)
# print(t.ids)
# print(t.tokens)

# Reload the trained tokenizer as a GPT2Tokenizer and register the special tokens
tokenizer = GPT2Tokenizer.from_pretrained("/content/gdrive/MyDrive/3YP/tokenizer")
tokenizer.add_special_tokens({
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>"
})

# Fresh GPT-2 model (no pretrained weights)
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token=tokenizer.bos_token_id,
    eos_token=tokenizer.eos_token_id,
)
print("VOCAB SIZE: ", tokenizer.vocab_size)
model = GPT2LMHeadModel(config)

# Load the text file as a dataset and tokenize it lazily, line by line
dataset = load_dataset("text", data_files=paths)

def encode(lines):
    return tokenizer(lines['text'], add_special_tokens=True, truncation=True, max_length=400)

dataset.set_transform(encode)
dataset = dataset['train']

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

training_args = TrainingArguments(
    output_dir="/content/gdrive/MyDrive/3YP/ABCModel",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    save_steps=100,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()
trainer.save_model("/content/gdrive/MyDrive/3YP/ABCModel")
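In case it is useful for diagnosing the "size 0" part of the error, this is a rough sketch of how I could inspect the dataset right before handing it to Trainer (it reuses paths and encode from the script above and is not part of the training run itself):

# Rough diagnostic sketch (reuses `paths` and `encode` from above):
# inspect the dataset before it goes into Trainer.
from datasets import load_dataset

check_ds = load_dataset("text", data_files=paths)["train"]
print("Rows loaded from text file:", len(check_ds))  # expected to be > 0
print("Columns:", check_ds.column_names)             # should contain "text"

check_ds.set_transform(encode)
if len(check_ds) > 0:
    print("Keys of first transformed example:", list(check_ds[0].keys()))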