I’m trying to fine-tune gpt2 with TensorFlow on my apple m1:
Here’s my code, following the guide on the course:
import os
import psutil
import kaggle
import tensorflow as tf
from itertools import chain
from datasets import load_dataset
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from transformers import AutoTokenizer, TFAutoModelForCausalLM, DataCollatorWithPadding
BATCH_SIZE = 8
NUM_EPOCHS = 3
BLOCK_SIZE = 512
CPU_COUNT = psutil.cpu_count()
MODEL_CHECKPOINT = 'distilgpt2'
KAGGLE_DS_DIR = 'kaggle_dataset'
print('Number of CPUs:', CPU_COUNT)
print('Available GPUs:', tf.config.experimental.list_physical_devices('GPU'))
# download dataset if it doesn't exists
if not os.path.exists(KAGGLE_DS_DIR):
kaggle.api.dataset_download_files(
'simiotic/github-code-snippets-development-sample', path=KAGGLE_DS_DIR, unzip=True)
# load raw dataset from sqlite3
raw_dataset = load_dataset('./sql_loading_script.py')
if "validation" not in raw_dataset.keys():
raw_dataset["validation"] = load_dataset(
'./sql_loading_script.py',
split=f"train[:5%]",
)
raw_dataset["train"] = load_dataset(
'./sql_loading_script.py',
split=f"train[5%:]",
)
# initiate tokenizer and model on cuda
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
tokenizer.pad_token = tokenizer.eos_token
max_seq_length = tokenizer.model_max_length
model = TFAutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT)
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
model.resize_token_embeddings(len(tokenizer))
def tokenize_funcion(examples):
return tokenizer(examples['text'], truncation=True)
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {
k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
if total_length >= max_seq_length:
total_length = (total_length // max_seq_length) * max_seq_length
# Split by chunks of max_len.
result = {
k: [t[i: i + max_seq_length]
for i in range(0, total_length, max_seq_length)]
for k, t in concatenated_examples.items()
}
return result
# tokenize the raw dataset
tokenized_ds = raw_dataset.map(tokenize_funcion, batched=True, num_proc=CPU_COUNT, remove_columns=["text"])
tokenized_ds = tokenized_ds.map(group_texts, batched=True, num_proc=CPU_COUNT)
# convert training dataset to tf dataset
tf_ds = tokenized_ds['train'].to_tf_dataset(
columns=['input_ids', 'attention_mask', 'token_type_ids'],
label_cols=["labels"],
# columns=[col for col in tokenized_ds['train'].features if col != "special_tokens_mask"],
shuffle=True,
collate_fn=data_collator,
batch_size=8,
drop_remainder=True,
)
eval_ds = tokenized_ds['validation'].to_tf_dataset(
columns=['input_ids', 'attention_mask', 'token_type_ids'],
label_cols=["labels"],
# columns=[col for col in tokenized_ds['validation'].features if col != "special_tokens_mask"],
shuffle=True,
collate_fn=data_collator,
batch_size=8,
drop_remainder=True,
)
num_train_steps = len(tf_ds) * NUM_EPOCHS
lr_scheduler = PolynomialDecay(
initial_learning_rate=5e-5,
end_learning_rate=0.0,
decay_steps=num_train_steps,
)
opt = Adam(learning_rate=lr_scheduler)
model.compile(
optimizer=opt,
loss=SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE),
metrics=['accuracy'],
)
model.fit(tf_ds, validation_data=eval_ds, epochs=NUM_EPOCHS, steps_per_epoch=len(tf_ds) // BATCH_SIZE)
EDIT - Here’s the local loading script:
import os
import sqlite3
import datasets
class SqlDsLoader(datasets.GeneratorBasedBuilder):
"""Code Corpus Dataset."""
VERSION = datasets.Version("1.0.0")
def _info(self):
return datasets.DatasetInfo(
description="",
features=datasets.Features({"text": datasets.Value("string")}),
supervised_keys=None,
homepage="",
citation="",
)
def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
input_path = "kaggle_dataset/snippets-dev/snippets-dev.db"
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={"filepath": input_path},
),
]
def _generate_examples(self, filepath):
"""Yields examples."""
con = sqlite3.connect(filepath)
cur = con.cursor()
for id, snippet in cur.execute('SELECT id, snippet FROM snippets'):
yield str(id), {
"text": snippet,
}
However, once I add the loss function SparseCategoricalCrossentropy
to the model.compile()
, I get the following error:
Traceback (most recent call last):
File "/Users/elonsalfati/devel/metissio/research/trainer.py", line 115, in <module>
model.fit(tf_ds, validation_data=eval_ds, epochs=NUM_EPOCHS, steps_per_epoch=len(tf_ds) // BATCH_SIZE)
File "/opt/homebrew/Caskroom/miniforge/base/envs/nlp/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/opt/homebrew/Caskroom/miniforge/base/envs/nlp/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py", line 1147, in autograph_handler
raise e.ag_error_metadata.to_exception(e)
TypeError: in user code:
File "/opt/homebrew/Caskroom/miniforge/base/envs/nlp/lib/python3.9/site-packages/keras/engine/training.py", line 1021, in train_function *
return step_function(self, iterator)
File "/opt/homebrew/Caskroom/miniforge/base/envs/nlp/lib/python3.9/site-packages/keras/engine/training.py", line 1010, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/opt/homebrew/Caskroom/miniforge/base/envs/nlp/lib/python3.9/site-packages/keras/engine/training.py", line 1000, in run_step **
outputs = model.train_step(data)
File "/opt/homebrew/Caskroom/miniforge/base/envs/nlp/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 916, in train_step
self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
File "/opt/homebrew/Caskroom/miniforge/base/envs/nlp/lib/python3.9/site-packages/keras/optimizer_v2/optimizer_v2.py", line 530, in minimize
grads_and_vars = self._compute_gradients(
File "/opt/homebrew/Caskroom/miniforge/base/envs/nlp/lib/python3.9/site-packages/keras/optimizer_v2/optimizer_v2.py", line 583, in _compute_gradients
grads_and_vars = self._get_gradients(tape, loss, var_list, grad_loss)
File "/opt/homebrew/Caskroom/miniforge/base/envs/nlp/lib/python3.9/site-packages/keras/optimizer_v2/optimizer_v2.py", line 464, in _get_gradients
grads = tape.gradient(loss, var_list, grad_loss)
TypeError: Target should be a list or nested structure of Tensors or Variables to be differentiated, but received None
I’ve tried to look for some examples on how to fine-tune gpt2 with TensorFlow for text generation, but I couldn’t find much. Any suggestions on how to solve this TypeError and what does it mean?