TypeError: new_full(): argument 'fill_value' (position 2) must be Number, not NoneType

Hello huggers! I am trying to pretrain a GPT2 model from scratch for causal language modeling (given a sequence of words, what's the next most likely word?), and I am new to both PyTorch Lightning and the amazing Hugging Face libraries!

When I try to train a GPT2LMHeadModel from scratch using the code provided below, I get the following error:

TypeError: new_full(): argument 'fill_value' (position 2) must be Number, not NoneType

I have 2 questions here:

  1. What's causing this error, and how can I fix it?
  2. When training a GPT2 model for causal LM (i.e. predicting the next word in a sentence), should the Dataset provide labels in addition to the input ids? The EsperantoDataset in Hugging Face's tutorial How to train a new language model from scratch using Transformers and Tokenizers does not, so my MyDataset for the GPT2 model does not return labels either. Is this correct? Can a DataCollatorForLanguageModeling with mlm=False be useful here for GPT2? (See the small sketch right below this list for my current understanding.)
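
For reference, here is a minimal sketch of what I think the mlm=False collator does. It uses the stock pretrained gpt2 tokenizer purely for illustration rather than my custom one, so please correct me if this mental model is wrong:

# Rough sketch of my understanding of DataCollatorForLanguageModeling(mlm=False):
# the collator pads the input_ids and builds the labels itself, so the Dataset
# would only need to return input_ids.
import torch
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  # pretrained tokenizer, only for illustration
tokenizer.pad_token = tokenizer.eos_token

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
examples = [torch.tensor(tokenizer.encode(text)) for text in ["hello world", "a longer example sentence"]]
batch = collator(examples)

print(batch.keys())      # I expect "input_ids" and "labels"
print(batch["labels"])   # labels should mirror input_ids, with pad positions set to -100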

Thank you so much!

import os
from pathlib import Path
import torch
import pytorch_lightning as pl
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
)
from transformers.optimization import AdamW
from tokenizers import ByteLevelBPETokenizer
from torch.utils.data import (
    DataLoader,
    Dataset,
)

TOKENIZER_DIRPATH = os.path.join("..", "data")


def tokenize_data():
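    # Train a byte-level BPE tokenizer on words.txt and save vocab.json / merges.txt to ../data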
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files=os.path.join(TOKENIZER_DIRPATH, "words.txt"),
        vocab_size=50000,
        min_frequency=2,
        special_tokens=["<s>", "</s>", "<unk>", "<mask>", "<pad>",],
    )
    tokenizer.save_model("../data")


class MyDataset(Dataset):
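    # Tokenizes each line of words.txt with the custom GPT2Tokenizer; returns input_ids only, no labels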
    def __init__(self):
        tokenizer = GPT2Tokenizer(
            os.path.join(TOKENIZER_DIRPATH, "vocab.json"),
            os.path.join(TOKENIZER_DIRPATH, "merges.txt"),
        )
        tokenizer.pad_token = tokenizer.eos_token

        src_file = Path(os.path.join(TOKENIZER_DIRPATH, "words.txt"))
        lines = src_file.read_text(encoding="utf-8").splitlines()
        self.examples = [tokenizer.encode(line) for line in lines]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i])


class MyDataModule(pl.LightningDataModule):
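    # Batches MyDataset with DataCollatorForLanguageModeling(mlm=False) for causal LM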
    def __init__(self):
        super().__init__()
        tokenizer = GPT2Tokenizer(
            os.path.join(TOKENIZER_DIRPATH, "vocab.json"),
            os.path.join(TOKENIZER_DIRPATH, "merges.txt"),
        )
        tokenizer.pad_token = tokenizer.eos_token

        self.tokenizer = tokenizer

    def setup(self, stage):
        self.train_dataset = MyDataset()

    def train_dataloader(self):
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=False
        )
        train_dataloader = DataLoader(
            self.train_dataset, collate_fn=data_collator, batch_size=32, shuffle=True,
        )
        return train_dataloader


class MyModel(pl.LightningModule):
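    # GPT2LMHeadModel trained from scratch with the default GPT2Config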
    def __init__(self, learning_rate, adam_beta1, adam_beta2, adam_epsilon):
        super().__init__()
        self.save_hyperparameters()
        config = GPT2Config()
        self.model = GPT2LMHeadModel(config)

    def forward(self, x):
        return self.model(x).logits

    def training_step(self, batch, batch_idx):
        input_ids, labels = batch
        loss = self.model(input_ids, labels=labels).loss
        self.log("train_loss", loss, on_epoch=True)
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(
            self.parameters(),
            self.hparams.learning_rate,
            betas=(self.hparams.adam_beta1, self.hparams.adam_beta2),
            eps=self.hparams.adam_epsilon,
        )
        return optimizer


tokenize_data()
dm = MyDataModule()
model = MyModel(
    learning_rate=5e-5, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-8,
)

trainer = pl.Trainer()
trainer.fit(model, dm)

Error Traceback

Traceback (most recent call last):
  File "test_gpt.py", line 122, in <module>
    trainer.fit(model, dm)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 499, in fit
    self.dispatch()
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 546, in dispatch
    self.accelerator.start_training(self)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 73, in start_training
    self.training_type_plugin.start_training(trainer)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 114, in start_training
    self._results = trainer.run_train()
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 637, in run_train
    self.train_loop.run_training_epoch()
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 485, in run_training_epoch
    for batch_idx, (batch, is_last_batch) in train_dataloader:
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/profiler/profilers.py", line 82, in profile_iterable
    value = next(iterator)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 47, in _with_is_last
    last = next(it)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/supporters.py", line 470, in __next__
    return self.request_next_batch(self.loader_iters)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/trainer/supporters.py", line 484, in request_next_batch
    return apply_to_collection(loader_iters, Iterator, next)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/pytorch_lightning/utilities/apply_func.py", line 84, in apply_to_collection
    return function(data, *args, **kwargs)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 435, in __next__
    data = self._next_data()
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1085, in _next_data
    return self._process_data(data)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1111, in _process_data
    data.reraise()
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/torch/_utils.py", line 428, in reraise
    raise self.exc_type(msg)
TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
    data = fetcher.fetch(index)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/transformers/data/data_collator.py", line 341, in __call__
    batch = {"input_ids": _collate_batch(examples, self.tokenizer)}
  File "/opt/anaconda3/envs/myhuggingface/lib/python3.8/site-packages/transformers/data/data_collator.py", line 216, in _collate_batch
    result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
TypeError: new_full(): argument 'fill_value' (position 2) must be Number, not NoneType
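
From the last frame it looks like tokenizer.pad_token_id is None inside the collator. As a sanity check (a small sketch that assumes the same vocab.json / merges.txt paths as my script above), this is what I plan to print, though I am not sure it is the right place to look:

# Inspect the special-token ids on the tokenizer that gets passed to the collator
import os
from transformers import GPT2Tokenizer

TOKENIZER_DIRPATH = os.path.join("..", "data")

tokenizer = GPT2Tokenizer(
    os.path.join(TOKENIZER_DIRPATH, "vocab.json"),
    os.path.join(TOKENIZER_DIRPATH, "merges.txt"),
)
tokenizer.pad_token = tokenizer.eos_token

# If the default eos_token ("<|endoftext|>") is not in my custom vocab,
# I suspect these could come back as None, which would match the error above
print("eos_token:", tokenizer.eos_token, "eos_token_id:", tokenizer.eos_token_id)
print("pad_token:", tokenizer.pad_token, "pad_token_id:", tokenizer.pad_token_id)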