How to use the codeparrot/apps dataset with T5

Hi! I’m trying to adapt the “Training T5 using Native Tensorflow 2” notebook to use the codeparrot/apps dataset.

However, the native SQuAD dataset from TensorFlow Datasets provides string tensors whose values can be decoded with `.numpy().decode('utf-8')`, which makes the following code possible:

### Defining the data pipeline

def encode(question,answer, 
           encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len):
    question_plus = f"answer_me: {str(question.numpy().decode('utf-8'))}"
    
    answer_plus = ', '.join([i.decode('utf-8') for i in list(answer.numpy())])
    answer_plus = f"{answer_plus} </s>"
    
    encoder_inputs = tokenizer(question_plus, truncation=True, 
                               return_tensors='tf', max_length=encoder_max_len,
                              pad_to_max_length=True)
    
    decoder_inputs = tokenizer(answer_plus, truncation=True, 
                               return_tensors='tf', max_length=decoder_max_len,
                              pad_to_max_length=True)
    
    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]
    
    return input_ids, input_attention, target_ids, target_attention

How can I adapt this pipeline to the codeparrot/apps dataset?

I’ve tried a lot of things, but none of them worked. Here’s my code for loading the dataset so far:

from datasets import load_dataset, Dataset

train_dataset = load_dataset("codeparrot/apps", "all", split="train")
valid_dataset = load_dataset("codeparrot/apps", "all", split="test")

train_dataset = train_dataset.remove_columns(['problem_id', 'starter_code', 'url', 'input_output', 'difficulty'])
valid_dataset = valid_dataset.remove_columns(['problem_id', 'starter_code', 'url', 'input_output', 'difficulty'])

train_dataset = train_dataset.rename_column('solutions', 'answers')
valid_dataset = valid_dataset.rename_column('solutions', 'answers')

pandas_train = train_dataset.to_pandas()
pandas_valid = valid_dataset.to_pandas()

def df_to_dataset(dataframe, shuffle=True, batch_size=1):
  # note: shuffle and batch_size aren't applied yet; I only got as far as building the dataset
  dataframe = dataframe.copy()
  ds = tf.data.Dataset.from_tensor_slices(dict(dataframe))
  return ds
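
For completeness, here is roughly what I have been attempting for the encode step. I'm assuming the `answers` column (the renamed `solutions`) holds a JSON-encoded list of solution strings, so I parse it with `json.loads` and keep only the first solution; the `solve:` prefix and the `encode_apps` name are just my own choices. This is only a sketch of my attempt, not working code:

import json

def encode_apps(question, answer,
                encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len):
    # question and answer arrive as scalar tf.string tensors from tf.data
    question_plus = f"solve: {question.numpy().decode('utf-8')}"

    # 'answers' appears to be a JSON string containing a list of solutions
    solutions = json.loads(answer.numpy().decode('utf-8'))
    answer_plus = f"{solutions[0] if solutions else ''} </s>"

    encoder_inputs = tokenizer(question_plus, truncation=True, return_tensors='tf',
                               max_length=encoder_max_len, padding='max_length')
    decoder_inputs = tokenizer(answer_plus, truncation=True, return_tensors='tf',
                               max_length=decoder_max_len, padding='max_length')

    return (encoder_inputs['input_ids'][0], encoder_inputs['attention_mask'][0],
            decoder_inputs['input_ids'][0], decoder_inputs['attention_mask'][0])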

Here’s the full code:


## Training T5 using Native Tensorflow 2

The purpose of this notebook is to demonstrate training using TensorFlow 2 and Keras. This notebook includes tf.data pipelines for building any other NLP task in a text-to-text fashion. Anyone can adapt the data pipeline to their own datasets for text-to-text training.
#### Features
- Train TF T5 on SQuAD question answering
- Train T5 using the Keras trainer function
- tf.data pipeline
- TF Datasets as source
- Log metrics using TensorBoard
- Profile your experiment with the brand new TensorFlow profiler!

### Installation

!pip install -U transformers


import tensorflow_datasets as tfds
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, TFT5ForConditionalGeneration  # only the pieces actually used below
import datetime
import os


print(tf.__version__)

os.environ["TF_GPU_THREAD_MODE"]="gpu_private"

!mkdir data
!ls -la

data_dir = "./data"
log_dir = f"{data_dir}/experiments/t5/logs"
save_path = f"{data_dir}/experiments/t5/models"
cache_path_train = f"{data_dir}/cache/t5.train"
cache_path_test = f"{data_dir}/cache/t5.test"



### Defining the Model

class SnapthatT5(TFT5ForConditionalGeneration):
    def __init__(self, *args, log_dir=None, cache_dir= None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_tracker= tf.keras.metrics.Mean(name='loss') 
    
    @tf.function
    def train_step(self, data):
        x, _= data
        y = x["labels"]
        y = tf.reshape(y, [-1, 1])
        with tf.GradientTape() as tape:
            outputs = self(x, training=True)
            loss = outputs[0]
            logits = outputs[1]
            loss = tf.reduce_mean(loss)
            
            grads = tape.gradient(loss, self.trainable_variables)
            
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        lr = self.optimizer._decayed_lr(tf.float32)
        
        self.loss_tracker.update_state(loss)        
        self.compiled_metrics.update_state(y, logits)
        metrics = {m.name: m.result() for m in self.metrics}
        metrics.update({'lr': lr})
        
        return metrics

    def test_step(self, data):
        x, _ = data
        y = x["labels"]
        y = tf.reshape(y, [-1, 1])
        output = self(x, training=False)
        loss = output[0]
        loss = tf.reduce_mean(loss)
        logits = output[1]
        
        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y, logits)
        return {m.name: m.result() for m in self.metrics}
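
(A side note from me, not part of the original notebook: the reason `train_step` can take `outputs[0]` as the loss is that, when the inputs contain a `labels` key, the Hugging Face TF T5 model computes the cross-entropy loss internally and returns it before the logits. Schematically, with the tensors produced by encode_tf below:)

# what self(x, training=True) returns when x contains "labels"
outputs = model({"input_ids": input_ids,
                 "attention_mask": input_attention,
                 "labels": target_ids}, training=True)
loss = outputs[0]    # loss (not necessarily reduced to a scalar, hence the tf.reduce_mean above)
logits = outputs[1]  # LM logits with shape (batch, seq_len, vocab_size)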
        



### The Tokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-large")  # all T5 sizes share the same vocabulary, so this also works with the t5-base model loaded below



## Preparing the Data

train_dataset, info = tfds.load('squad', split='train', with_info=True)
valid_dataset = tfds.load('squad', split='validation', with_info=False)
print(info)

data = next(iter(train_dataset))
print("Example data from the dataset: \n", data)

#### Training Parameters

warmup_steps = 1e4
batch_size = 50
encoder_max_len = 250
decoder_max_len = 54
buffer_size = 1000
ntrain = info.splits["train"].num_examples
nvalid = info.splits["validation"].num_examples
steps = int(np.ceil(ntrain/batch_size))
valid_steps = int(np.ceil(nvalid/batch_size))
print("Total Steps: ", steps)
print("Total Validation Steps: ", valid_steps)


### Defining the data pipeline

def encode(question,answer, 
           encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len):
    question_plus = f"answer_me: {str(question.numpy().decode('utf-8'))}"
    
    answer_plus = ', '.join([i.decode('utf-8') for i in list(answer.numpy())])
    answer_plus = f"{answer_plus} </s>"
    
    encoder_inputs = tokenizer(question_plus, truncation=True, 
                               return_tensors='tf', max_length=encoder_max_len,
                              pad_to_max_length=True)
    
    decoder_inputs = tokenizer(answer_plus, truncation=True, 
                               return_tensors='tf', max_length=decoder_max_len,
                              pad_to_max_length=True)
    
    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]
    
    return input_ids,input_attention, target_ids, target_attention
    
    

def encode_tf(inputs):
    question = inputs['question']
    answer = inputs['answers']['text']
    encoded = tf.py_function(encode, [question, answer], 
                                           [tf.int32, tf.int32, tf.int32, tf.int32])
    input_ids,input_attention, target_ids,target_attention = encoded
    input_ids.set_shape([None])
    target_ids.set_shape([None])
    input_attention.set_shape([None])
    target_attention.set_shape([None])
    
#     labels = tf.reshape(target_ids, [-1, 1])
    data=  {'input_ids': input_ids, #'decoder_input_ids': target_ids, 
            'labels': target_ids, 
            'attention_mask': input_attention,
           'decoder_attention_mask': target_attention}
    return (data, None)


def create_dataset(source_dataset, cache_path=None, batch_size=1, 
                   buffer_size= 1000, shuffling=True):
    dataset = source_dataset.map(encode_tf)
    
    if cache_path is not None:
        dataset = dataset.cache(cache_path)        
    if shuffling:
        dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset
    

train_ds= create_dataset(train_dataset, batch_size=batch_size, 
                         shuffling=True, cache_path = None)
valid_ds = create_dataset(valid_dataset, batch_size=batch_size, 
                         shuffling=False, cache_path = None)
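
(Another small addition of mine, not in the original notebook: peeking at one batch to confirm that the pipeline emits the dictionary built in `encode_tf`, with one `(batch_size, max_len)` tensor per key:)

# inspect a single batch to verify keys and shapes
features, _ = next(iter(train_ds))
for name, tensor in features.items():
    print(name, tensor.shape)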


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, warmup_steps=1e4):
    super().__init__()

    self.warmup_steps = tf.cast(warmup_steps, tf.float32)
    
  def __call__(self, step):
    step = tf.cast(step, tf.float32)
    m = tf.maximum(self.warmup_steps, step)
    m = tf.cast(m, tf.float32)
    lr = tf.math.rsqrt(m)
    
    return lr 
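
(In other words, this schedule is lr = 1 / sqrt(max(warmup_steps, step)): with the default warmup_steps = 1e4 it stays flat at 1 / sqrt(10000) = 0.01 for the first 10,000 steps and then decays as 1 / sqrt(step), reaching roughly 0.0063 by step 25,000, which is what the plot below shows.)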
        

plt.style.use('ggplot')
schedule = CustomSchedule()
plt.plot(schedule(tf.range(25000, dtype=tf.float32)))
plt.xlabel("Steps")
plt.ylabel("Learning rate")

### Callbacks and Metrics

start_profile_batch = steps+10
stop_profile_batch = start_profile_batch + 100
profile_range = f"{start_profile_batch},{stop_profile_batch}"

log_path = log_dir + "/" + datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=1,
                                                     update_freq=20,profile_batch=profile_range)

checkpoint_filepath = save_path + "/" + "T5-{epoch:04d}-{val_loss:.4f}.ckpt"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

callbacks = [tensorboard_callback, model_checkpoint_callback] 
metrics = [tf.keras.metrics.SparseTopKCategoricalAccuracy(name='accuracy') ]

learning_rate = CustomSchedule()
# learning_rate = 0.001  # Instead set a static learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate)



model = SnapthatT5.from_pretrained("t5-base")

model.compile(optimizer=optimizer, metrics=metrics)



epochs_done = 0
model.fit(train_ds, epochs=1, steps_per_epoch=steps, callbacks=callbacks, 
          validation_data=valid_ds, validation_steps=valid_steps, initial_epoch=epochs_done)

model.save_pretrained(save_path)
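
(Not in the original notebook: since save_pretrained writes a standard Hugging Face checkpoint directory, the fine-tuned weights can later be reloaded with the same class, e.g.:)

# reload the fine-tuned model from the saved checkpoint directory
model = SnapthatT5.from_pretrained(save_path)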



### Let's test our model!

#context = """We went on a trip to Europe. We had our breakfast at 7 am in the morning at \
#the nearby coffee shop. Wore a dark blue over coat for our first visit to Louvre Museum \
#to experience history and art."""

#question = "At what time did we have breakfast?"
#print(context)
#print(question)

#input_text =  f"answer_me: {question} context: {context} </s>"
#encoded_query = tokenizer(input_text, 
#                         return_tensors='tf', padding=True, truncation=True)
#input_ids = encoded_query["input_ids"]
#attention_mask = encoded_query["attention_mask"]
#generated_answer = model.generate(input_ids, attention_mask=attention_mask, 
#                                 max_length=decoder_max_len, top_p=0.98, top_k=50)
#decoded_answer = tokenizer.decode(generated_answer.numpy()[0])
#print("Answer: ", decoded_answer)