Hi!
I have recently started experimenting with the transformers
library.
In this small project I would like to fine-tune a GPT2 model to generate text using the Trainer
API. In the example below I use a dummy dataset, but I was nonetheless able to reproduce the error.
import torch
from torch.utils.data import random_split, Dataset
from transformers import GPT2Tokenizer, GPT2Model, Trainer, \
    TrainingArguments


class MyDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        for txt in txt_list:
            # This loop iterates through each entry in the text corpus.
            # Each bit of text is prepended with the start-of-text token,
            # appended with the end-of-text token and padded to the maximum
            # length with the pad token.
            encodings_dict = tokenizer(
                '<|startoftext|>' + txt + '<|endoftext|>',
                truncation=True,
                max_length=max_length,
                padding="max_length")
            # Each iteration then appends the encoded tensor and the attention
            # mask for that encoding to their respective lists. The attention
            # mask is a binary list of 1's and 0's which determines whether the
            # language model should take that token into consideration or not.
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(
                torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]


def dummy_data_collator(features):
    batch = {}
    batch['input_ids'] = torch.stack([f[0] for f in features])
    batch['attention_mask'] = torch.stack([f[1] for f in features])
    return batch
if __name__ == '__main__':
    txt_list = [
        'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
        'Vestibulum tempus lorem arcu, eget consectetur augue pretium a.',
        'Suspendisse id pellentesque erat.',
        'Pellentesque quis ante ut risus sollicitudin maximus scelerisque ut urna.',
        'Nam tempus quis magna ac convallis. Praesent convallis egestas libero, ac sollicitudin libero dignissim at.',
        'Etiam efficitur eget dolor nec iaculis.'
    ]

    # Instantiate the Italian GPT2 tokenizer.
    tokenizer = GPT2Tokenizer.from_pretrained('LorenzoDeMattei/GePpeTto',
                                              bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>',
                                              pad_token='<|pad|>')

    # Identify the longest text to know how long to pad our sentences out to.
    max_length = max(
        [len(tokenizer.encode(txt)) for txt in txt_list])

    # Create the PyTorch dataset.
    dataset = MyDataset(txt_list, tokenizer, max_length)

    # Split into training and validation sets.
    val_size = int(0.1 * len(dataset))
    train_size = len(dataset) - val_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    # Fine-tune the model using the 🤗 Trainer API.
    model = GPT2Model.from_pretrained('LorenzoDeMattei/GePpeTto')

    training_args = TrainingArguments(
        output_dir='./results/',
        num_train_epochs=4,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./log/',
        evaluation_strategy='epoch'
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=dummy_data_collator,
    )

    trainer.train()
When running it, I get the following error message:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-23-94c97283687f> in <module>
18 )
19
---> 20 trainer.train()
~\...\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, **kwargs)
1267 tr_loss += self.training_step(model, inputs)
1268 else:
-> 1269 tr_loss += self.training_step(model, inputs)
1270 self.current_flos += float(self.floating_point_ops(inputs))
1271
~\...\site-packages\transformers\trainer.py in training_step(self, model, inputs)
1760 loss = self.compute_loss(model, inputs)
1761 else:
-> 1762 loss = self.compute_loss(model, inputs)
1763
1764 if self.args.n_gpu > 1:
~\...\site-packages\transformers\trainer.py in compute_loss(self, model, inputs, return_outputs)
1792 else:
1793 labels = None
-> 1794 outputs = model(**inputs)
1795 # Save past state if it exists
1796 # TODO: this needs to be fixed and made cleaner later.
~\...\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
~\...\site-packages\transformers\models\gpt2\modeling_gpt2.py in forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict)
731
732 if inputs_embeds is None:
--> 733 inputs_embeds = self.wte(input_ids)
734 position_embeds = self.wpe(position_ids)
735 hidden_states = inputs_embeds + position_embeds
~\...\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
~\...\site-packages\torch\nn\modules\sparse.py in forward(self, input)
156
157 def forward(self, input: Tensor) -> Tensor:
--> 158 return F.embedding(
159 input, self.weight, self.padding_idx, self.max_norm,
160 self.norm_type, self.scale_grad_by_freq, self.sparse)
~\...\site-packages\torch\nn\functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2041 # remove once script supports set_grad_enabled
2042 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2043 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
2044
2045
IndexError: index out of range in self
By debugging at return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
I find the following: I suppose the problem is that the embedding size is not large enough, since, as far as I understand, it should hold that weight.shape[0] - 1 >= torch.max(input).
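To make that comparison concrete, here is a small standalone check I can run outside the Trainer (the helper variable names embedding_rows and max_token_id are mine, just for illustration): it loads the same tokenizer and model as above and compares the number of rows in the input embedding matrix with the largest token id the tokenizer can produce.

from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('LorenzoDeMattei/GePpeTto',
                                          bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>',
                                          pad_token='<|pad|>')
model = GPT2Model.from_pretrained('LorenzoDeMattei/GePpeTto')

# Number of rows of the embedding matrix, i.e. weight.shape[0] in the traceback.
embedding_rows = model.get_input_embeddings().weight.shape[0]

# Largest token id the tokenizer can emit, including the special tokens added above.
max_token_id = len(tokenizer) - 1

print('embedding rows:', embedding_rows)
print('largest token id:', max_token_id)
print('pad token id:', tokenizer.pad_token_id)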
I cannot figure out what the root cause of this error is. Can anyone help me?
Thank you!