Hello huggers! I am trying to pretrain a GPT2 model from scratch for causal language modeling (given a sequence of words, what's the next most likely word), and I am new to both PyTorch Lightning and the amazing Hugging Face libraries!
When I try to train a `GPT2LMHeadModel` from scratch using the code provided below, I get an error:

`TypeError: new_full(): argument 'fill_value' (position 2) must be Number, not NoneType`
I have 2 questions here:
- What's causing this error, and how can we fix it?
- When training a GPT2 model for causal LM (i.e. predicting the next word in a sentence), should a label be provided by the `Dataset` in addition to the input ids? The `EsperantoDataset` in Hugging Face's tutorial "How to train a new language model from scratch using Transformers and Tokenizers" does not, so my `MyDataset` for my GPT2 model does not return labels either. Is this correct? Can a `DataCollatorForLanguageModeling` with `mlm=False` be useful here for GPT2? (See the sketch right after these questions for what I think it does.)
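For concreteness, here is a minimal sketch of what I understand `DataCollatorForLanguageModeling` with `mlm=False` to produce. The stock pretrained `gpt2` tokenizer and the toy sentences here are just placeholders for illustration, not my actual setup:

```python
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling

# Placeholder: the stock pretrained tokenizer, whose vocab contains <|endoftext|>
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # padding needs a valid pad_token_id

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

examples = [
    tokenizer.encode(text, return_tensors="pt")[0]
    for text in ["hello world", "a slightly longer example sentence"]
]
batch = collator(examples)

# With mlm=False the collator pads input_ids to the longest example in the
# batch and adds "labels": a copy of input_ids with padding positions set to
# -100 (ignored by the loss). GPT2LMHeadModel shifts the labels internally
# for next-token prediction, so the Dataset itself need not return labels.
print(batch["input_ids"].shape)
print(batch["labels"].shape)
```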
Thank you so much!
```python
import os
from pathlib import Path

import torch
import pytorch_lightning as pl
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
)
from transformers.optimization import AdamW
from tokenizers import ByteLevelBPETokenizer
from torch.utils.data import (
    DataLoader,
    Dataset,
)

TOKENIZER_DIRPATH = os.path.join("..", "data")


def tokenize_data():
    # Train a byte-level BPE tokenizer on the raw text and save vocab.json/merges.txt
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files=os.path.join(TOKENIZER_DIRPATH, "words.txt"),
        vocab_size=50000,
        min_frequency=2,
        special_tokens=["<s>", "</s>", "<unk>", "<mask>", "<pad>"],
    )
    tokenizer.save_model("../data")


class MyDataset(Dataset):
    def __init__(self):
        tokenizer = GPT2Tokenizer(
            os.path.join(TOKENIZER_DIRPATH, "vocab.json"),
            os.path.join(TOKENIZER_DIRPATH, "merges.txt"),
        )
        tokenizer.pad_token = tokenizer.eos_token
        src_file = Path(os.path.join(TOKENIZER_DIRPATH, "words.txt"))
        lines = src_file.read_text(encoding="utf-8").splitlines()
        # One example per line; only input ids, no labels (see question above)
        self.examples = [tokenizer.encode(line) for line in lines]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i])


class MyDataModule(pl.LightningDataModule):
    def __init__(self):
        super().__init__()
        tokenizer = GPT2Tokenizer(
            os.path.join(TOKENIZER_DIRPATH, "vocab.json"),
            os.path.join(TOKENIZER_DIRPATH, "merges.txt"),
        )
        tokenizer.pad_token = tokenizer.eos_token
        self.tokenizer = tokenizer

    def setup(self, stage):
        self.train_dataset = MyDataset()

    def train_dataloader(self):
        # Collator pads each batch and (with mlm=False) should build the labels
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=False
        )
        train_dataloader = DataLoader(
            self.train_dataset, collate_fn=data_collator, batch_size=32, shuffle=True,
        )
        return train_dataloader


class MyModel(pl.LightningModule):
    def __init__(self, learning_rate, adam_beta1, adam_beta2, adam_epsilon):
        super().__init__()
        self.save_hyperparameters()
        config = GPT2Config()  # default GPT-2 config, randomly initialized weights
        self.model = GPT2LMHeadModel(config)

    def forward(self, x):
        return self.model(x).logits

    def training_step(self, batch, batch_idx):
        input_ids, labels = batch
        loss = self.model(input_ids, labels=labels).loss
        self.log("train_loss", loss, on_epoch=True)
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(
            self.parameters(),
            self.hparams.learning_rate,
            betas=(self.hparams.adam_beta1, self.hparams.adam_beta2),
            eps=self.hparams.adam_epsilon,
        )
        return optimizer


tokenize_data()
dm = MyDataModule()
model = MyModel(
    learning_rate=5e-5, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-8,
)
trainer = pl.Trainer()
trainer.fit(model, dm)
```
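One more thing I noticed while writing this up: the collator returns a dict (with keys like `input_ids` and `labels`), so tuple-unpacking the batch in `training_step` would hand back the key strings rather than the tensors. If that's right, the step would presumably need to index by key instead. A sketch, untested:

```python
def training_step(self, batch, batch_idx):
    # batch is a dict produced by DataCollatorForLanguageModeling,
    # so index by key rather than unpacking it as a tuple
    outputs = self.model(input_ids=batch["input_ids"], labels=batch["labels"])
    self.log("train_loss", outputs.loss, on_epoch=True)
    return outputs.loss
```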
Error Traceback
```
Traceback (most recent call last):
  File "test_gpt.py", line 122, in <module>
    trainer.fit(model, dm)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 499, in fit
    self.dispatch()
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 546, in dispatch
    self.accelerator.start_training(self)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 73, in start_training
    self.training_type_plugin.start_training(trainer)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 114, in start_training
    self._results = trainer.run_train()
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 637, in run_train
    self.train_loop.run_training_epoch()
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 485, in run_training_epoch
    for batch_idx, (batch, is_last_batch) in train_dataloader:
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/profiler/profilers.py", line 82, in profile_iterable
    value = next(iterator)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 47, in _with_is_last
    last = next(it)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/supporters.py", line 470, in __next__
    return self.request_next_batch(self.loader_iters)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/supporters.py", line 484, in request_next_batch
    return apply_to_collection(loader_iters, Iterator, next)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/utilities/apply_func.py", line 84, in apply_to_collection
    return function(data, *args, **kwargs)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 435, in __next__
    data = self._next_data()
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1085, in _next_data
    return self._process_data(data)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1111, in _process_data
    data.reraise()
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/torch/_utils.py", line 428, in reraise
    raise self.exc_type(msg)
TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
    data = fetcher.fetch(index)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/transformers/data/data_collator.py", line 341, in __call__
    batch = {"input_ids": _collate_batch(examples, self.tokenizer)}
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/transformers/data/data_collator.py", line 216, in _collate_batch
    result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
TypeError: new_full(): argument 'fill_value' (position 2) must be Number, not NoneType
```
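One guess at the cause: `GPT2Tokenizer` defaults its special tokens (bos/eos/unk) to `<|endoftext|>`, which is not in the vocab trained above (only `<s>`, `</s>`, `<unk>`, `<mask>`, `<pad>` are). So `tokenizer.pad_token = tokenizer.eos_token` sets the pad token to a string whose id lookup resolves to `None`, and the collator's `new_full(..., tokenizer.pad_token_id)` call then fails. A sketch of a possible fix, assuming the special tokens just need to match the trained vocab (untested):

```python
# Hypothetical fix: point the special tokens at entries that actually exist
# in the custom-trained vocab, so pad_token_id resolves to a real integer
tokenizer = GPT2Tokenizer(
    os.path.join(TOKENIZER_DIRPATH, "vocab.json"),
    os.path.join(TOKENIZER_DIRPATH, "merges.txt"),
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
)
print(tokenizer.pad_token_id)  # should now print an integer instead of None
```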