Hi! I’m trying to adapt “Training T5 using Native Tensorflow 2” to use the codeparrot/apps dataset.
However, the native SQuAD dataset from TensorFlow Datasets yields fields that can be decoded in Python (via `.numpy().decode('utf-8')` inside `tf.py_function`), which makes this code possible:
### Defining the data pipeline
```python
def encode(question, answer,
           encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len):
    question_plus = f"answer_me: {str(question.numpy().decode('utf-8'))}"

    answer_plus = ', '.join([i.decode('utf-8') for i in list(answer.numpy())])
    answer_plus = f"{answer_plus} </s>"

    encoder_inputs = tokenizer(question_plus, truncation=True,
                               return_tensors='tf', max_length=encoder_max_len,
                               pad_to_max_length=True)
    decoder_inputs = tokenizer(answer_plus, truncation=True,
                               return_tensors='tf', max_length=decoder_max_len,
                               pad_to_max_length=True)

    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]

    return input_ids, input_attention, target_ids, target_attention
```
How can I adapt this pipeline to the codeparrot/apps dataset?
I’ve tried a lot of things but none of them worked. So, here’s my code for the dataset:
```python
from datasets import load_dataset, Dataset

train_dataset = load_dataset("codeparrot/apps", "all", split="train")
valid_dataset = load_dataset("codeparrot/apps", "all", split="test")

train_dataset = train_dataset.remove_columns(['problem_id', 'starter_code', 'url', 'input_output', 'difficulty'])
valid_dataset = valid_dataset.remove_columns(['problem_id', 'starter_code', 'url', 'input_output', 'difficulty'])

train_dataset = train_dataset.rename_column('solutions', 'answers')
valid_dataset = valid_dataset.rename_column('solutions', 'answers')

pandas_train = train_dataset.to_pandas()
pandas_valid = valid_dataset.to_pandas()

def df_to_dataset(dataframe, shuffle=True, batch_size=1):
    dataframe = dataframe.copy()
    ds = tf.data.Dataset.from_tensor_slices(dict(dataframe))
    return ds
```
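For reference, this is the direction I’ve been trying to go in. It’s only a rough sketch: the names encode_apps / encode_tf_apps are mine, and it assumes that 'question' is a plain string column and that 'answers' (the renamed 'solutions' column) is a JSON-encoded list of solution strings, which I’m not sure is the right way to handle it:

```python
import json
import tensorflow as tf

# Rough sketch (my own naming, not from the original notebook): adapt encode()/encode_tf()
# to codeparrot/apps. Assumes 'question' is a plain string and 'answers' (the renamed
# 'solutions' column) is a JSON-encoded list of solution strings.
def encode_apps(question, answer,
                encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len):
    question_plus = f"answer_me: {question.numpy().decode('utf-8')}"

    solutions = json.loads(answer.numpy().decode('utf-8') or "[]")
    answer_plus = solutions[0] if solutions else ""  # use the first solution as the target

    encoder_inputs = tokenizer(question_plus, truncation=True, return_tensors='tf',
                               max_length=encoder_max_len, padding='max_length')
    decoder_inputs = tokenizer(answer_plus, truncation=True, return_tensors='tf',
                               max_length=decoder_max_len, padding='max_length')
    return (encoder_inputs['input_ids'][0], encoder_inputs['attention_mask'][0],
            decoder_inputs['input_ids'][0], decoder_inputs['attention_mask'][0])

def encode_tf_apps(inputs):
    encoded = tf.py_function(encode_apps, [inputs['question'], inputs['answers']],
                             [tf.int32, tf.int32, tf.int32, tf.int32])
    input_ids, input_attention, target_ids, target_attention = encoded
    for t in encoded:
        t.set_shape([None])
    return ({'input_ids': input_ids,
             'labels': target_ids,
             'attention_mask': input_attention,
             'decoder_attention_mask': target_attention}, None)

train_ds = df_to_dataset(pandas_train).map(encode_tf_apps)
```

Does this look like the right direction, or should the solutions column be handled differently?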
And here’s the full original notebook code for reference:
## Training T5 using Native Tensorflow 2
The purpose of this notebook is to demonstrate training using TensorFlow 2 and Keras. It includes tf.data pipelines for building any other NLP task in a text-to-text fashion. Anyone can adapt the data pipeline to their own datasets for text-to-text training.
#### Features
- Train TF T5 on SQUAD questioning and answering
- Train T5 using the Keras trainer function
- tf.Data pipeline
- TF datasets as source
- Log metrics using tensorboard
- Profile your experiment with the brand new TensorFlow profiler!
### Installation
```python
!pip install -U transformers

import tensorflow_datasets as tfds
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
from transformers import (TFAutoModelWithLMHead, AutoTokenizer,
                          TFTrainer, TFTrainingArguments,
                          TFT5ForConditionalGeneration, T5Config)
import datetime
import os

print(tf.__version__)

os.environ["TF_GPU_THREAD_MODE"] = "gpu_private"

!mkdir data
!ls -la

data_dir = "./data"
log_dir = f"{data_dir}/experiments/t5/logs"
save_path = f"{data_dir}/experiments/t5/models"
cache_path_train = f"{data_dir}/cache/t5.train"
cache_path_test = f"{data_dir}/cache/t5.test"
```
### Defining the Model
```python
class SnapthatT5(TFT5ForConditionalGeneration):
    def __init__(self, *args, log_dir=None, cache_dir=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_tracker = tf.keras.metrics.Mean(name='loss')

    @tf.function
    def train_step(self, data):
        x, _ = data
        y = x["labels"]
        y = tf.reshape(y, [-1, 1])
        with tf.GradientTape() as tape:
            outputs = self(x, training=True)
            loss = outputs[0]
            logits = outputs[1]
            loss = tf.reduce_mean(loss)
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        lr = self.optimizer._decayed_lr(tf.float32)

        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y, logits)
        metrics = {m.name: m.result() for m in self.metrics}
        metrics.update({'lr': lr})
        return metrics

    def test_step(self, data):
        x, _ = data
        y = x["labels"]
        y = tf.reshape(y, [-1, 1])
        output = self(x, training=False)
        loss = output[0]
        loss = tf.reduce_mean(loss)
        logits = output[1]

        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y, logits)
        return {m.name: m.result() for m in self.metrics}
```
### The Tokenizer
```python
tokenizer = AutoTokenizer.from_pretrained("t5-large")
```
## Preparing the Data
```python
train_dataset, info = tfds.load('squad', split='train', with_info=True)
valid_dataset = tfds.load('squad', split='validation', with_info=False)
print(info)

data = next(iter(train_dataset))
print("Example data from the dataset: \n", data)
```
#### Training Parameters
```python
warmup_steps = 1e4
batch_size = 50
encoder_max_len = 250
decoder_max_len = 54
buffer_size = 1000

ntrain = info.splits["train"].num_examples
nvalid = info.splits["validation"].num_examples
steps = int(np.ceil(ntrain / batch_size))
valid_steps = int(np.ceil(nvalid / batch_size))
print("Total Steps: ", steps)
print("Total Validation Steps: ", valid_steps)
```
### Defining the data pipeline
```python
def encode(question, answer,
           encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len):
    question_plus = f"answer_me: {str(question.numpy().decode('utf-8'))}"

    answer_plus = ', '.join([i.decode('utf-8') for i in list(answer.numpy())])
    answer_plus = f"{answer_plus} </s>"

    encoder_inputs = tokenizer(question_plus, truncation=True,
                               return_tensors='tf', max_length=encoder_max_len,
                               pad_to_max_length=True)
    decoder_inputs = tokenizer(answer_plus, truncation=True,
                               return_tensors='tf', max_length=decoder_max_len,
                               pad_to_max_length=True)

    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]

    return input_ids, input_attention, target_ids, target_attention


def encode_tf(inputs):
    question = inputs['question']
    answer = inputs['answers']['text']
    encoded = tf.py_function(encode, [question, answer],
                             [tf.int32, tf.int32, tf.int32, tf.int32])
    input_ids, input_attention, target_ids, target_attention = encoded
    input_ids.set_shape([None])
    target_ids.set_shape([None])
    input_attention.set_shape([None])
    target_attention.set_shape([None])
    # labels = tf.reshape(target_ids, [-1, 1])
    data = {'input_ids': input_ids,  # 'decoder_input_ids': target_ids,
            'labels': target_ids,
            'attention_mask': input_attention,
            'decoder_attention_mask': target_attention}
    return (data, None)


def create_dataset(source_dataset, cache_path=None, batch_size=1,
                   buffer_size=1000, shuffling=True):
    dataset = source_dataset.map(encode_tf)
    if cache_path is not None:
        dataset = dataset.cache(cache_path)
    if shuffling:
        dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset


train_ds = create_dataset(train_dataset, batch_size=batch_size,
                          shuffling=True, cache_path=None)
valid_ds = create_dataset(valid_dataset, batch_size=batch_size,
                          shuffling=False, cache_path=None)
```
```python
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, warmup_steps=1e4):
        super().__init__()
        self.warmup_steps = tf.cast(warmup_steps, tf.float32)

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        m = tf.maximum(self.warmup_steps, step)
        m = tf.cast(m, tf.float32)
        lr = tf.math.rsqrt(m)
        return lr

plt.style.use('ggplot')
schedule = CustomSchedule()
plt.plot(schedule(tf.range(25000, dtype=tf.float32)))
plt.xlabel("Steps")
plt.ylabel("Learning rate")
```
### Callbacks and Metrics
```python
start_profile_batch = steps + 10
stop_profile_batch = start_profile_batch + 100
profile_range = f"{start_profile_batch},{stop_profile_batch}"

log_path = log_dir + "/" + datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=1,
                                                      update_freq=20, profile_batch=profile_range)

checkpoint_filepath = save_path + "/" + "T5-{epoch:04d}-{val_loss:.4f}.ckpt"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

callbacks = [tensorboard_callback, model_checkpoint_callback]
metrics = [tf.keras.metrics.SparseTopKCategoricalAccuracy(name='accuracy')]

learning_rate = CustomSchedule()
# learning_rate = 0.001  # Instead set a static learning rate

optimizer = tf.keras.optimizers.Adam(learning_rate)
model = SnapthatT5.from_pretrained("t5-base")
model.compile(optimizer=optimizer, metrics=metrics)

epochs_done = 0
model.fit(train_ds, epochs=1, steps_per_epoch=steps, callbacks=callbacks,
          validation_data=valid_ds, validation_steps=valid_steps, initial_epoch=epochs_done)

model.save_pretrained(save_path)
```
### Let's test our model!!
```python
#context = """We went on a trip to Europe. We had our breakfast at 7 am in the morning at \
#the nearby coffee shop. Wore a dark blue over coat for our first visit to Louvre Museum \
#to experience history and art."""
#question = "At what time did we had breakfast?"
#print(context)
#print(question)

#input_text = f"answer_me: {question} context: {context} </s>"
#encoded_query = tokenizer(input_text,
#                          return_tensors='tf', padding=True, truncation=True)
#input_ids = encoded_query["input_ids"]
#attention_mask = encoded_query["attention_mask"]
#generated_answer = model.generate(input_ids, attention_mask=attention_mask,
#                                  max_length=decoder_max_len, top_p=0.98, top_k=50)
#decoded_answer = tokenizer.decode(generated_answer.numpy()[0])
#print("Answer: ", decoded_answer)
```