How to use the codeparrot/apps dataset with T5

Hi! I’m trying to adapt the “Training T5 using Native Tensorflow 2” notebook to use the codeparrot/apps dataset.

However, the examples in the native SQuAD dataset from TensorFlow Datasets are byte tensors that support a decode call, which makes this code possible:

### Defining the data pipeline

def encode(question,answer, 
           encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len):
    question_plus = f"answer_me: {str(question.numpy().decode('utf-8'))}"
    
    answer_plus = ', '.join([i.decode('utf-8') for i in list(answer.numpy())])
    answer_plus = f"{answer_plus} </s>"
    
    encoder_inputs = tokenizer(question_plus, truncation=True, 
                               return_tensors='tf', max_length=encoder_max_len,
                              pad_to_max_length=True)
    
    decoder_inputs = tokenizer(answer_plus, truncation=True, 
                               return_tensors='tf', max_length=decoder_max_len,
                              pad_to_max_length=True)
    
    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]
    
    return input_ids, input_attention, target_ids, target_attention

How can I adapt this for the codeparrot/apps dataset?

I’ve tried a lot of things but none of them worked. So, here’s my code for the dataset:

from datasets import load_dataset, Dataset

train_dataset = load_dataset("codeparrot/apps", "all", split="train")
valid_dataset = load_dataset("codeparrot/apps", "all", split="test")

train_dataset = train_dataset.remove_columns(['problem_id', 'starter_code', 'url', 'input_output', 'difficulty'])
valid_dataset = valid_dataset.remove_columns(['problem_id', 'starter_code', 'url', 'input_output', 'difficulty'])

train_dataset = train_dataset.rename_column('solutions', 'answers')
valid_dataset = valid_dataset.rename_column('solutions', 'answers')

pandas_train = train_dataset.to_pandas()
pandas_valid = valid_dataset.to_pandas()

def df_to_dataset(dataframe, shuffle=True, batch_size=1):
  dataframe = dataframe.copy()
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe)))
  return ds

Here’s the full code:


## Training T5 using Native Tensorflow 2

The purpose of this notebook is to demonstrate training using TensorFlow 2 and Keras. It includes tf.data pipelines for building any other NLP task in a text-to-text fashion. Anyone can adapt the data pipeline to their own datasets for text-to-text training.
#### Features
- Train TF T5 on SQuAD question answering
- Train T5 using the Keras trainer function
- tf.Data pipeline
- TF datasets as source
- Log metrics using tensorboard
- Profile your experiment with the brand new TensorFlow profiler!

### Installation

!pip install -U transformers


import tensorflow_datasets as tfds
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt
from transformers import (TFAutoModelWithLMHead, AutoTokenizer, 
    TFTrainer, TFTrainingArguments, TFT5ForConditionalGeneration, T5Config)
import datetime
import os


print(tf.__version__)

os.environ["TF_GPU_THREAD_MODE"]="gpu_private"

!mkdir data
!ls -la

data_dir = "./data"
log_dir = f"{data_dir}/experiments/t5/logs"
save_path = f"{data_dir}/experiments/t5/models"
cache_path_train = f"{data_dir}/cache/t5.train"
cache_path_test = f"{data_dir}/cache/t5.test"



### Defining the Model

class SnapthatT5(TFT5ForConditionalGeneration):
    def __init__(self, *args, log_dir=None, cache_dir= None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_tracker= tf.keras.metrics.Mean(name='loss') 
    
    @tf.function
    def train_step(self, data):
        x, _= data
        y = x["labels"]
        y = tf.reshape(y, [-1, 1])
        with tf.GradientTape() as tape:
            outputs = self(x, training=True)
            loss = outputs[0]
            logits = outputs[1]
            loss = tf.reduce_mean(loss)
            
            grads = tape.gradient(loss, self.trainable_variables)
            
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        lr = self.optimizer._decayed_lr(tf.float32)
        
        self.loss_tracker.update_state(loss)        
        self.compiled_metrics.update_state(y, logits)
        metrics = {m.name: m.result() for m in self.metrics}
        metrics.update({'lr': lr})
        
        return metrics

    def test_step(self, data):
        x, _ = data
        y = x["labels"]
        y = tf.reshape(y, [-1, 1])
        output = self(x, training=False)
        loss = output[0]
        loss = tf.reduce_mean(loss)
        logits = output[1]
        
        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y, logits)
        return {m.name: m.result() for m in self.metrics}
        



### The Tokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-large")          



## Preparing the Data

train_dataset, info = tfds.load('squad', split='train', with_info=True)
valid_dataset = tfds.load('squad', split='validation', with_info=False)
print(info)

data = next(iter(train_dataset))
print("Example data from the dataset: \n", data)

#### Training Parameters

warmup_steps = 1e4
batch_size = 50
encoder_max_len = 250
decoder_max_len = 54
buffer_size = 1000
ntrain = info.splits["train"].num_examples
nvalid = info.splits["validation"].num_examples
steps = int(np.ceil(ntrain/batch_size))
valid_steps = int(np.ceil(nvalid/batch_size))
print("Total Steps: ", steps)
print("Total Validation Steps: ", valid_steps)


### Defining the data pipeline

def encode(question,answer, 
           encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len):
    question_plus = f"answer_me: {str(question.numpy().decode('utf-8'))}"
    
    answer_plus = ', '.join([i.decode('utf-8') for i in list(answer.numpy())])
    answer_plus = f"{answer_plus} </s>"
    
    encoder_inputs = tokenizer(question_plus, truncation=True, 
                               return_tensors='tf', max_length=encoder_max_len,
                              pad_to_max_length=True)
    
    decoder_inputs = tokenizer(answer_plus, truncation=True, 
                               return_tensors='tf', max_length=decoder_max_len,
                              pad_to_max_length=True)
    
    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]
    
    return input_ids,input_attention, target_ids, target_attention
    
    

def encode_tf(inputs):
    question = inputs['question']
    answer = inputs['answers']['text']
    encoded = tf.py_function(encode, [question, answer], 
                                           [tf.int32, tf.int32, tf.int32, tf.int32])
    input_ids,input_attention, target_ids,target_attention = encoded
    input_ids.set_shape([None])
    target_ids.set_shape([None])
    input_attention.set_shape([None])
    target_attention.set_shape([None])
    
#     labels = tf.reshape(target_ids, [-1, 1])
    data=  {'input_ids': input_ids, #'decoder_input_ids': target_ids, 
            'labels': target_ids, 
            'attention_mask': input_attention,
           'decoder_attention_mask': target_attention}
    return (data, None)


def create_dataset(source_dataset, cache_path=None, batch_size=1, 
                   buffer_size= 1000, shuffling=True):
    dataset = source_dataset.map(encode_tf)
    
    if cache_path is not None:
        dataset = dataset.cache(cache_path)        
    if shuffling:
        dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset
    

train_ds= create_dataset(train_dataset, batch_size=batch_size, 
                         shuffling=True, cache_path = None)
valid_ds = create_dataset(valid_dataset, batch_size=batch_size, 
                         shuffling=False, cache_path = None)


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, warmup_steps=1e4):
    super().__init__()

    self.warmup_steps = tf.cast(warmup_steps, tf.float32)
    
  def __call__(self, step):
    step = tf.cast(step, tf.float32)
    m = tf.maximum(self.warmup_steps, step)
    m = tf.cast(m, tf.float32)
    lr = tf.math.rsqrt(m)
    
    return lr 
        

plt.style.use('ggplot')
schedule = CustomSchedule()
plt.plot(schedule(tf.range(25000, dtype=tf.float32)))
plt.xlabel("Steps")
plt.ylabel("Learning rate")

### Callbacks and Metrics

start_profile_batch = steps+10
stop_profile_batch = start_profile_batch + 100
profile_range = f"{start_profile_batch},{stop_profile_batch}"

log_path = log_dir + "/" + datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=1,
                                                     update_freq=20,profile_batch=profile_range)

checkpoint_filepath = save_path + "/" + "T5-{epoch:04d}-{val_loss:.4f}.ckpt"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

callbacks = [tensorboard_callback, model_checkpoint_callback] 
metrics = [tf.keras.metrics.SparseTopKCategoricalAccuracy(name='accuracy') ]

learning_rate = CustomSchedule()
# learning_rate = 0.001  # Instead set a static learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate)



model = SnapthatT5.from_pretrained("t5-base")

model.compile(optimizer=optimizer, metrics=metrics)



epochs_done = 0
model.fit(train_ds, epochs=1, steps_per_epoch=steps, callbacks=callbacks, 
          validation_data=valid_ds, validation_steps=valid_steps, initial_epoch=epochs_done)

model.save_pretrained(save_path)



### Lets test our model!!

#context = """We went on a trip to Europe. We had our breakfast at 7 am in the morning at \
#the nearby coffee shop. Wore a dark blue over coat for our first visit to Louvre Museum \
#to experience history and art."""

#question = "At what time did we had breakfast?"
#print(context)
#print(question)

#input_text =  f"answer_me: {question} context: {context} </s>"
#encoded_query = tokenizer(input_text, 
#                         return_tensors='tf', padding=True, truncation=True)
#input_ids = encoded_query["input_ids"]
#attention_mask = encoded_query["attention_mask"]
#generated_answer = model.generate(input_ids, attention_mask=attention_mask, 
#                                 max_length=decoder_max_len, top_p=0.98, top_k=50)
#decoded_answer = tokenizer.decode(generated_answer.numpy()[0])
#print("Answer: ", decoded_answer)


APPS has a special case: the solutions (answers in your case) should be loaded this way (as specified in the dataset’s README):


import json

inputs = next(iter(train_dataset))

inputs["answers"] = json.loads(inputs["answers"])

# now inputs["answers"] is a list of multiple answers

Each entry is a list of multiple solutions stored as a JSON string, and the way you load it leaves that list as a string.
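To make the difference concrete, here is a minimal, self-contained sketch with a made-up solutions string (purely illustrative, not taken from the dataset):

import json

# Purely illustrative: the raw "answers"/"solutions" field is a JSON-encoded
# string that wraps a list of solution programs.
raw_answers = '["print(sum(map(int, input().split())))", "print(42)"]'

answers = json.loads(raw_answers)
print(type(answers))  # <class 'list'>
print(len(answers))   # 2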

You also need to adapt the encoding functions. You could define a single function for the encoding like this:

import json

from datasets import load_dataset

train_dataset = load_dataset("codeparrot/apps", "all", split="train")
train_dataset = train_dataset.remove_columns(['problem_id', 'starter_code', 'url', 'input_output', 'difficulty'])
train_dataset = train_dataset.rename_column('solutions', 'answers')

def encode_inputs(inputs):
    question = inputs['question']
    #load answers correctly
    answer = json.loads(inputs['answers'])
    # question here is already an str
    question_plus = f"answer_me: {question}"
    # answer is already a list of strings
    answer_plus = ' '.join(answer)
    answer_plus = f"{answer_plus} </s>"

    encoder_inputs = tokenizer(question_plus, truncation=True, 
                               return_tensors='tf', max_length=encoder_max_len,
                              pad_to_max_length=True)
    
    decoder_inputs = tokenizer(answer_plus, truncation=True, 
                               return_tensors='tf', max_length=decoder_max_len,
                              pad_to_max_length=True)
    
    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]

    
    #labels = tf.reshape(target_ids, [-1, 1])
    data=  {'input_ids': input_ids, #'decoder_input_ids': target_ids, 
            'labels': target_ids, 
            'attention_mask': input_attention,
           'decoder_attention_mask': target_attention}
    return {"data": data}

I changed the last line from return (data, None) to return {"data": data} so that you can do encoded_train_dataset = train_dataset.map(encode_inputs), which uses mapping to encode every sample of the dataset; otherwise you would have to apply the function to each sample manually.
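For reference, here is a minimal usage sketch (it assumes tokenizer, encoder_max_len, and decoder_max_len from the notebook above are already defined, along with the encode_inputs function just shown):

# Encode every sample; datasets.map runs encode_inputs row by row and stores
# the returned {"data": ...} dict as a nested column.
encoded_train_dataset = train_dataset.map(encode_inputs)

# Peek at one encoded sample to check that the expected keys are there.
print(encoded_train_dataset[0]["data"].keys())
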
But beware that this dataset is intended for code generation rather than question answering; also, the solutions can be long and numerous, and some samples have empty solutions.

Thank you SO MUCH! When I try to do the same for the validation dataset (encoded_valid_dataset = valid_dataset.map(encode_inputs)), I receive the following error:

/usr/lib/python3.7/json/decoder.py in raw_decode(self, s, idx)
    353             obj, end = self.scan_once(s, idx)
    354         except StopIteration as err:
--> 355             raise JSONDecodeError("Expecting value", s, err.value) from None
    356         return obj, end

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Do you know what’s happening/what I can do?

As I said, some samples have empty solutions. If you just want to fix the error, you can add a try/except block:

def encode_inputs(inputs):
    question = inputs['question']
    #load answers correctly
    try:
      answer = json.loads(inputs['answers'])
    except ValueError:
      answer = ""
    ....

But you might need to be more careful with the dataset preprocessing for this use case.
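For example, one way to handle it up front is to drop every sample whose answers field is empty or not valid JSON before encoding; a rough sketch (the has_solutions helper is just an illustrative name):

import json

def has_solutions(example):
    # Keep only samples whose answers field parses to a non-empty list.
    try:
        return len(json.loads(example["answers"])) > 0
    except ValueError:
        return False

train_dataset = train_dataset.filter(has_solutions)
valid_dataset = valid_dataset.filter(has_solutions)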

Thanks! I have removed all the empty solutions/questions.

This will be my last question, I promise!

I’ve tried to train the model, but now I get the following error:

ValueError                                Traceback (most recent call last)
<ipython-input-336-3cb604a081c5> in <module>()
      1 epochs_done = 0
      2 model.fit(encoded_train_dataset, epochs=1, steps_per_epoch=steps, callbacks=callbacks, 
----> 3           validation_data=encoded_valid_dataset, validation_steps=valid_steps, initial_epoch=epochs_done)

1 frames
/usr/local/lib/python3.7/dist-packages/keras/engine/data_adapter.py in select_data_adapter(x, y)
    985         "Failed to find data adapter that can handle "
    986         "input: {}, {}".format(
--> 987             _type_name(x), _type_name(y)))
    988   elif len(adapter_cls) > 1:
    989     raise RuntimeError(

ValueError: Failed to find data adapter that can handle input: <class 'datasets.arrow_dataset.Dataset'>, <class 'NoneType'>

Do you know how I could fix this? So many thanks! Seriously, you are amazing.