IndexError: index out of range in self - Text Generation with GPT2

Hi!
I have recently started experimenting with the transformers library.
In this small project I would like to fine-tune a GPT2 model for text generation using the Trainer API. The example below uses a dummy dataset, but it is enough to reproduce the error.

import torch
from torch.utils.data import random_split, Dataset
from transformers import GPT2Tokenizer, GPT2Model, Trainer, \
    TrainingArguments


class MyDataset(Dataset):

    def __init__(self, txt_list, tokenizer, max_length):
        self.tokenizer = tokenizer 
        self.input_ids = []
        self.attn_masks = []

        for txt in txt_list:
            # Iterate through each entry in the text corpus. Each piece of text
            # is prepended with the start-of-text token, appended with the
            # end-of-text token, and padded to the maximum length with the
            # pad token.
            encodings_dict = tokenizer(
                '<|startoftext|>' + txt + '<|endoftext|>',
                truncation=True,
                max_length=max_length,
                padding="max_length")

            # Append the encoded tensor and its attention mask to the
            # corresponding lists. The attention mask is a binary list of 1's
            # and 0's that tells the language model whether or not to take a
            # given token into consideration.
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(
                torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]


def dummy_data_collator(features):
    batch = {}
    batch['input_ids'] = torch.stack([f[0] for f in features])
    batch['attention_mask'] = torch.stack([f[1] for f in features])

    return batch


if __name__ == '__main__':
    txt_list = [
        'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
        'Vestibulum tempus lorem arcu, eget consectetur augue pretium a.',
        'Suspendisse id pellentesque erat.',
        'Pellentesque quis ante ut risus sollicitudin maximus scelerisque ut urna.',
        'Nam tempus quis magna ac convallis. Praesent convallis egestas libero, ac sollicitudin libero dignissim at.',
        'Etiam efficitur eget dolor nec iaculis.'
    ]

    # Instantiate the Italian GPT2 tokenizer.
    tokenizer = GPT2Tokenizer.from_pretrained('LorenzoDeMattei/GePpeTto',
                                              bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>',
                                              pad_token='<|pad|>')

    # Identify the longest text to know how long to pad our sentences out to.
    max_length = max(
        [len(tokenizer.encode(txt)) for txt in txt_list])

    # Create the PyTorch dataset.
    dataset = MyDataset(txt_list, tokenizer, max_length)

    # Split into training and validation sets.
    val_size = int(0.1 * len(dataset))
    train_size = len(dataset) - val_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    # Fine-tune the model using the 🤗 Trainer API
    model = GPT2Model.from_pretrained('LorenzoDeMattei/GePpeTto')

    training_args = TrainingArguments(
        output_dir='./results/',
        num_train_epochs=4,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./log/',
        evaluation_strategy='epoch'
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=dummy_data_collator,
    )

    trainer.train()

When running it, I get the following error message:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-23-94c97283687f> in <module>
     18     )
     19 
---> 20 trainer.train()

~\...\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, **kwargs)
   1267                         tr_loss += self.training_step(model, inputs)
   1268                 else:
-> 1269                     tr_loss += self.training_step(model, inputs)
   1270                 self.current_flos += float(self.floating_point_ops(inputs))
   1271 

~\...\site-packages\transformers\trainer.py in training_step(self, model, inputs)
   1760                 loss = self.compute_loss(model, inputs)
   1761         else:
-> 1762             loss = self.compute_loss(model, inputs)
   1763 
   1764         if self.args.n_gpu > 1:

~\...\site-packages\transformers\trainer.py in compute_loss(self, model, inputs, return_outputs)
   1792         else:
   1793             labels = None
-> 1794         outputs = model(**inputs)
   1795         # Save past state if it exists
   1796         # TODO: this needs to be fixed and made cleaner later.

~\...\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~\...\site-packages\transformers\models\gpt2\modeling_gpt2.py in forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict)
    731 
    732         if inputs_embeds is None:
--> 733             inputs_embeds = self.wte(input_ids)
    734         position_embeds = self.wpe(position_ids)
    735         hidden_states = inputs_embeds + position_embeds

~\...\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~\...\site-packages\torch\nn\modules\sparse.py in forward(self, input)
    156 
    157     def forward(self, input: Tensor) -> Tensor:
--> 158         return F.embedding(
    159             input, self.weight, self.padding_idx, self.max_norm,
    160             self.norm_type, self.scale_grad_by_freq, self.sparse)

~\...\site-packages\torch\nn\functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   2041         # remove once script supports set_grad_enabled
   2042         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2043     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
   2044 
   2045 

IndexError: index out of range in self

By setting a breakpoint at return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse), I find the following:

[debugger screenshot]

I suppose the problem is that the embedding matrix is not large enough, since, as far as I understand, it should hold that weight.shape[0] - 1 >= torch.max(input).
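To make that comparison concrete, here is a minimal sanity check along those lines (a sketch only, using the tokenizer and model from the script above; get_input_embeddings() is the standard transformers accessor):

# Rough sanity check (illustration only): every token id the tokenizer can emit
# must be a valid row index into the model's token embedding matrix (wte).
vocab_size = len(tokenizer)  # includes the added special tokens
embedding_rows = model.get_input_embeddings().weight.shape[0]
print(vocab_size, embedding_rows)
# If vocab_size > embedding_rows, ids such as the added <|pad|> token would
# index past the end of the embedding and trigger this kind of IndexError.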

I cannot figure out the root cause of this error. Can anyone help me?
Thank you!


Hi rpisu, I had the same problem today and worked hard to solve it. At first I thought my token embedding was wrong, but it turned out the position encoding was the problem. Concretely, my code looked like output = model.generate(encoded_input, max_length=4000, pad_token_id=tokenizer.eos_token_id), and 4000 is larger than GPT-2's maximum sequence length, which is 1024. After changing 4000 to 1000 it works fine. Hope it helps.
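For anyone hitting the same thing at generation time, here is a minimal sketch of capping max_length to the model's context window (assuming a plain GPT-2 checkpoint; n_positions is the config field that holds the 1024 limit):

from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

encoded_input = tokenizer('Lorem ipsum dolor sit amet', return_tensors='pt').input_ids

# GPT-2's learned position embeddings only cover model.config.n_positions
# tokens (1024), so cap max_length instead of hard-coding a larger value.
max_length = min(4000, model.config.n_positions)
output = model.generate(
    encoded_input,
    max_length=max_length,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))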


Thanks very much! It works for me.