Training GPT2 From Scratch in TensorFlow (TFGPT2) with generators

Hi, I’m trying to train with a special kind of text augmentation, so I need to use TF dataset generators. I’ve tried to put together a minimal working example, but I keep getting an input reshape error. Here’s my code to set up the dataset generator:

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer
)
import tensorflow as tf
import numpy as np
import random
from transformers import PreTrainedTokenizerFast, TFGPT2LMHeadModel, GPT2Config, pipeline

EOT = '<|endoftext|>'

# mock generator
def dummy_gen():
    for _ in range(100):
        k = random.randint(2, 8)
        yield ''.join(random.choices(['foo ', 'bar '], k=k) + [EOT])
    
# make a tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = decoders.ByteLevel()
trainer = trainers.BpeTrainer(vocab_size=5000, special_tokens=[EOT])
tokenizer.train_from_iterator(dummy_gen(), trainer=trainer)
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token=EOT,
    eos_token=EOT,
    pad_token=EOT
)

# set-up generators for TF dataset
keys = ['input_ids', 'attention_mask', 'token_type_ids']

def tokenizer_map_np(s):
    # runs eagerly inside tf.numpy_function; pads each batch to its longest sequence
    r = wrapped_tokenizer(list(s.astype('str')), padding=True, return_tensors='tf')
    return [r[k] for k in keys]

# work around - https://github.com/tensorflow/tensorflow/issues/36276
def tokenizer_map(s):
    values = tf.numpy_function(tokenizer_map_np, [s], [tf.int32, tf.int32, tf.int32])
    # not sure if identity is needed below (?) -- labels are just a copy of
    # input_ids; the model shifts them internally when computing the loss
    return dict(input_ids=values[0], attention_mask=values[1],
                token_type_ids=values[2], labels=tf.identity(values[0]))

# check single example
dataset = tf.data.Dataset.from_generator(dummy_gen, output_types=tf.string).batch(2)
dataset = dataset.map(tokenizer_map)
for d in dataset:
    print(d)
    break

which outputs something that looks correct:

{'input_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
array([[18, 19, 20, 19, 19, 19, 20, 14,  0],
       [18, 20, 19, 20, 19, 14,  0,  0,  0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 0, 0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'labels': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
array([[18, 19, 20, 19, 19, 19, 20, 14,  0],
       [18, 20, 19, 20, 19, 14,  0,  0,  0]], dtype=int32)>}
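
As an extra sanity check (just a quick sketch, reusing d and dataset from above), the batch can be decoded back to text, and the dataset's element_spec shows that the shapes are unknown after tf.numpy_function, which I suspect is related to the error further down:

print(wrapped_tokenizer.batch_decode(d['input_ids']))  # should roughly round-trip to the padded foo/bar strings
print(dataset.element_spec)  # shapes come back as unknown after tf.numpy_function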

Now build a small GPT2 model:

configuration = GPT2Config(vocab_size=wrapped_tokenizer.vocab_size,
                           bos_token_id=wrapped_tokenizer.bos_token_id,
                           eos_token_id=wrapped_tokenizer.eos_token_id,
                           n_head=4,
                           n_embd=32,
                           n_layer=4)
model = TFGPT2LMHeadModel(configuration)
model.compile()  # no loss passed; the model can fall back to its internal LM loss computed from labels
model.fit(dataset)

Error message:

    File "/home/whitead/miniconda3/envs/mmm/lib/python3.8/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py", line 494, in call
      hidden_states = tf.reshape(hidden_states, output_shape)
Node: 'tfgpt2lm_head_model_1/transformer/Reshape_4'
Input to reshape is a tensor with 384 values, but the requested shape has 1292
	 [[{{node tfgpt2lm_head_model_1/transformer/Reshape_4}}]] [Op:__inference_train_function_17353]

I’ve “solved” the problem. It seems to be some kind of graph-mode vs. eager-mode execution issue in the tokenizer_map function, which I don’t fully understand. The workaround:

model.fit(dataset.as_numpy_iterator())
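
Another option that I think should also work (untested sketch, reusing tokenizer_map_np and dummy_gen from above): tf.numpy_function returns tensors with unknown static shape, so restoring the [batch, sequence] rank right after the call may let the graph-mode pipeline trace correctly:

# untested sketch: give the tensors back a known rank, since tf.numpy_function
# drops static shape information; batch size and sequence length stay dynamic
def tokenizer_map_shaped(s):
    values = tf.numpy_function(tokenizer_map_np, [s], [tf.int32, tf.int32, tf.int32])
    for v in values:
        v.set_shape([None, None])
    return dict(input_ids=values[0], attention_mask=values[1],
                token_type_ids=values[2], labels=tf.identity(values[0]))

dataset2 = tf.data.Dataset.from_generator(dummy_gen, output_types=tf.string).batch(2)
dataset2 = dataset2.map(tokenizer_map_shaped)
# model.fit(dataset2)

If that still fails, padding every batch to a fixed length in tokenizer_map_np (padding='max_length' with a fixed max_length) would also keep the shapes constant across batches.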