Hi, I’m trying to train with a special kind of text augmentation, so I need to use TF dataset generators. I’ve tried to put together a minimal working example, but I keep getting an input reshape error. Here’s my code to set up the dataset generator:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer
)
import tensorflow as tf
import numpy as np
import random
from transformers import PreTrainedTokenizerFast, TFGPT2LMHeadModel, GPT2Config, pipeline
EOT = '<|endoftext|>'
# mock generator
def dummy_gen():
    for _ in range(100):
        k = random.randint(2, 8)
        yield ''.join(random.choices(['foo ', 'bar '], k=k) + [EOT])
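Each draw from this generator is just a short space-separated run of foo/bar ending in the EOT marker; purely for illustration (the exact draw is random):
print(next(dummy_gen()))  # e.g. 'foo bar bar <|endoftext|>'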
# make a tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = decoders.ByteLevel()
trainer = trainers.BpeTrainer(vocab_size=5000, special_tokens=[EOT])#[EOT, EOM] + MASK_TOKENS)
tokenizer.train_from_iterator(dummy_gen(), trainer=trainer)
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token=EOT,
    eos_token=EOT,
    pad_token=EOT
)
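As a side note, a quick round-trip check of the wrapper would be something like this (the ids depend on the trained vocab, so this is illustrative only):
enc = wrapped_tokenizer('foo bar foo ' + EOT)
print(enc['input_ids'])                             # short list of ids ending with the EOT id
print(wrapped_tokenizer.decode(enc['input_ids']))   # should round-trip back to the input text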
# set up generators for TF dataset
keys = ['input_ids', 'attention_mask', 'token_type_ids']
def tokenizer_map_np(s):
    r = wrapped_tokenizer(list(s.astype('str')), padding=True, return_tensors='tf')
    return [r[k] for k in keys]
# work around - https://github.com/tensorflow/tensorflow/issues/36276
def tokenizer_map(s):
    values = tf.numpy_function(tokenizer_map_np, [s], [tf.int32, tf.int32, tf.int32])
    # not sure if identity is needed below (?)
    return dict(input_ids=values[0], attention_mask=values[1], token_type_ids=values[2], labels=tf.identity(values[0]))
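One detail I'm unsure about: the tensors that come out of tf.numpy_function have no static shape. If that turns out to matter, a variant that restores the (batch, padded length) rank might look like this (just a sketch with a made-up name, tokenizer_map_with_shapes; I haven't confirmed it changes anything):
def tokenizer_map_with_shapes(s):
    values = tf.numpy_function(tokenizer_map_np, [s], [tf.int32, tf.int32, tf.int32])
    for v in values:
        # tf.numpy_function drops shape info; declare rank 2 (batch, padded length)
        v.set_shape([None, None])
    return dict(input_ids=values[0], attention_mask=values[1],
                token_type_ids=values[2], labels=tf.identity(values[0]))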
# check single example
dataset = tf.data.Dataset.from_generator(dummy_gen, output_types=tf.string).batch(2)
dataset = dataset.map(tokenizer_map)
for d in dataset:
    print(d)
    break
which outputs something that looks correct:
{'input_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
array([[18, 19, 20, 19, 19, 19, 20, 14, 0],
[18, 20, 19, 20, 19, 14, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 0, 0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'labels': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
array([[18, 19, 20, 19, 19, 19, 20, 14, 0],
[18, 20, 19, 20, 19, 14, 0, 0, 0]], dtype=int32)>}
Now build a small GPT2 model:
configuration = GPT2Config(vocab_size=wrapped_tokenizer.vocab_size,
                           bos_token_id=wrapped_tokenizer.bos_token_id,
                           eos_token_id=wrapped_tokenizer.eos_token_id,
                           n_head=4,
                           n_embd=32,
                           n_layer=4)
model = TFGPT2LMHeadModel(configuration)
model.compile()
model.fit(dataset)
Error message:
File "/home/whitead/miniconda3/envs/mmm/lib/python3.8/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py", line 494, in call
hidden_states = tf.reshape(hidden_states, output_shape)
Node: 'tfgpt2lm_head_model_1/transformer/Reshape_4'
Input to reshape is a tensor with 384 values, but the requested shape has 1292
[[{{node tfgpt2lm_head_model_1/transformer/Reshape_4}}]] [Op:__inference_train_function_17353]
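A minimal way to poke at this outside of model.fit would be to run one batch through the model eagerly, in case the eager traceback is more informative (sketch, untested):
for d in dataset.take(1):
    out = model(d)           # same inputs fit sees; check whether the reshape error reproduces
    print(out.logits.shape)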