Model.generate() -- IndexError: too many indices for tensor of dimension 2

I’ve tried merging most of the code blocks below; but to sum up:

  • DistilGPT2 with extra tokens.
  • Google Colab
from transformers import AutoModelForSequenceClassification

from transformers import AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2,)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')



tokenizer.pad_token = '<|pad|>'
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
tokenizer.add_tokens(["<SEP>"])

mappings = {"YES": 1, "NO": 0}

# newcolumn is labels..
data["newcolumn"] = data['newcolumn'].map(mappings)

from sklearn.model_selection import train_test_split
max_length =  1024
padding = True # "max_length"  # True

X = list(data["document_plaintext"])
y = list(data["newcolumn"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_train_tokenized = tokenizer(X_train, padding=padding, truncation=True, max_length=max_length)
X_val_tokenized = tokenizer(X_val, padding=padding, truncation=True, max_length=max_length)

import torch
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = Dataset(X_train_tokenized, y_train)
eval_dataset = Dataset(X_val_tokenized, y_val)

from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer", 
                                  per_device_train_batch_size=1, 
                                  gradient_accumulation_steps=2,  # 2, with small batches
                                  per_device_eval_batch_size=1,
                                  )

model.resize_token_embeddings(len(tokenizer))

from transformers import Trainer

trainer = Trainer(model=model, 
                  args=training_args, 
                  train_dataset=train_dataset, 
                  eval_dataset=eval_dataset,
  )

trainer.train()

model.generate()  # This gives an error
model.to("cpu")
model.generate(tokenizer.encode('i enjoy walking with my cute dog', return_tensors='pt')) # This gives an error

Another problem is I still receive a missing padding_token error when training in batches despite trying many times to define it for the tokenizer.

The full error:

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.

---------------------------------------------------------------------------

IndexError                                Traceback (most recent call last)

<ipython-input-27-2f650aa8ce2f> in <module>()
      1 model.to("cpu")
----> 2 model.generate(tokenizer.encode('i enjoy walking with my cute dog', return_tensors='pt'))

2 frames

/usr/local/lib/python3.7/dist-packages/torch/autograd/grad_mode.py in decorate_context(*args, **kwargs)
     26         def decorate_context(*args, **kwargs):
     27             with self.__class__():
---> 28                 return func(*args, **kwargs)
     29         return cast(F, decorate_context)
     30 

/usr/local/lib/python3.7/dist-packages/transformers/generation_utils.py in generate(self, input_ids, max_length, min_length, do_sample, early_stopping, num_beams, temperature, top_k, top_p, repetition_penalty, bad_words_ids, bos_token_id, pad_token_id, eos_token_id, length_penalty, no_repeat_ngram_size, encoder_no_repeat_ngram_size, num_return_sequences, max_time, max_new_tokens, decoder_start_token_id, use_cache, num_beam_groups, diversity_penalty, prefix_allowed_tokens_fn, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, forced_bos_token_id, forced_eos_token_id, remove_invalid_values, synced_gpus, **model_kwargs)
    997                 return_dict_in_generate=return_dict_in_generate,
    998                 synced_gpus=synced_gpus,
--> 999                 **model_kwargs,
   1000             )
   1001 

/usr/local/lib/python3.7/dist-packages/transformers/generation_utils.py in greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, **model_kwargs)
   1301                 continue  # don't waste resources running the code we don't need
   1302 
-> 1303             next_token_logits = outputs.logits[:, -1, :]
   1304 
   1305             # Store scores, attentions and hidden_states when required

IndexError: too many indices for tensor of dimension 2

Could it be the vocabulary embedding dimensions not carrying over to text generation?

Hi,

The generate() method is meant to be used by encoder-decoder models (like T5, BART, MarianMT, the EncoderDecoderModel classes etc.), to autoregressively generate text. The model you are loading is an encoder-only model, which is not a generative, but a discriminative model, meaning that you can use it to classify text, do token classification, extractive question-answering, etc.

model_checkpoint = "distilgpt2" (removed when I edited the original question)

Thought gpt2 was autoregressive and suitable for text-generation. Will try another model :slight_smile:

Thank you Niels!