T5 training - ValueError: "Failed to find data adapter that can handle input" - help!

Hi! I’m using a T5 transformer and the codeparrot/apps dataset. When I try to train my model, I receive the following error:

/usr/local/lib/python3.7/dist-packages/keras/engine/data_adapter.py in select_data_adapter(x, y)
    985           "Failed to find data adapter that can handle "
    986           "input: {}, {}".format(
--> 987               _type_name(x), _type_name(y)))
    988   elif len(adapter_cls) > 1:
    989     raise RuntimeError(

ValueError: Failed to find data adapter that can handle input: <class 'datasets.arrow_dataset.Dataset'>, <class 'NoneType'>

Could anyone help me?

This is the complete code:

The objective of this notebook is to build an NLP text-to-code model, including tf.data pipelines for general NLP tasks, so I can get familiar with more complex TensorFlow models.

#### Features
- Transfer learning from a pretrained T5 model (https://huggingface.co/docs/transformers/model_doc/t5), with the goal of adding a siamese LSTM, trained on codeparrot/apps question answering (https://huggingface.co/datasets/codeparrot/apps)
- Train T5 using the Keras trainer function
- tf.data pipeline
- TF datasets as source
- Log metrics using TensorBoard



- This notebook is my attempt to adapt an existing notebook for my own learning, but I just can't get it to work for my needs.



### Installation

!pip install -U transformers
!pip install datasets


import tensorflow_datasets as tfds
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt
from transformers import (TFAutoModelWithLMHead, AutoTokenizer, 
    TFTrainer, TFTrainingArguments, TFT5ForConditionalGeneration, T5Config)
import datetime
import os
import json

!mkdir data
!ls -la

data_dir = "./data"
log_dir = f"{data_dir}/experiments/t5/logs"
save_path = f"{data_dir}/experiments/t5/models"
cache_path_train = f"{data_dir}/cache/t5.train"
cache_path_test = f"{data_dir}/cache/t5.test"
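
Since the callbacks below write to nested paths under ./data, I also create those directories up front. This is just a small convenience sketch; the callbacks may create them on their own, but this avoids surprises:

import os

# Create the nested experiment/cache directories before training so the
# TensorBoard logs, checkpoints, and (optional) tf.data cache files have
# somewhere to write.
for path in (log_dir, save_path, f"{data_dir}/cache"):
    os.makedirs(path, exist_ok=True)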



### Defining the Model

class SnapthatT5(TFT5ForConditionalGeneration):
    def __init__(self, *args, log_dir=None, cache_dir=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_tracker = tf.keras.metrics.Mean(name='loss')

    @tf.function
    def train_step(self, data):
        x, _ = data
        y = x["labels"]
        with tf.GradientTape() as tape:
            outputs = self(x, training=True)
            loss = outputs[0]     # loss returned by the model when labels are passed
            logits = outputs[1]
            loss = tf.reduce_mean(loss)

        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        lr = self.optimizer._decayed_lr(tf.float32)  # current learning rate, logged as a metric

        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y, logits)
        metrics = {m.name: m.result() for m in self.metrics}
        metrics.update({'lr': lr})

        return metrics

    def test_step(self, data):
        x, _ = data
        y = x["labels"]
        y = tf.reshape(y, [-1, 1])
        output = self(x, training=False)
        loss = tf.reduce_mean(output[0])
        logits = output[1]

        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y, logits)
        return {m.name: m.result() for m in self.metrics}
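
Before wiring this into fit, I did a quick direct call on a tiny dummy batch to confirm that outputs[0] is the loss (which train_step averages) and outputs[1] the logits. This is just a sketch and uses t5-small only to keep the check light:

# Sketch: sanity-check the model outputs that train_step/test_step rely on.
from transformers import AutoTokenizer, TFT5ForConditionalGeneration

_tok = AutoTokenizer.from_pretrained("t5-small")
_mdl = TFT5ForConditionalGeneration.from_pretrained("t5-small")

enc = _tok("answer_me: print hello world", return_tensors="tf")
dec = _tok("print('hello world')", return_tensors="tf")

out = _mdl(input_ids=enc["input_ids"],
           attention_mask=enc["attention_mask"],
           labels=dec["input_ids"],
           training=False)
print(out.loss.shape, out.logits.shape)  # loss and (batch, seq_len, vocab_size) logits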
        

### The Tokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-large")          
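
As a quick check (a sketch) of what the tokenizer returns, since encode_inputs below relies on the input_ids and attention_mask keys:

# Sketch: inspect the tokenizer output used later in encode_inputs.
sample = tokenizer("answer_me: how do I reverse a list?",
                   return_tensors='tf', max_length=16,
                   truncation=True, padding='max_length')
print(sample.keys())              # input_ids, attention_mask
print(sample['input_ids'].shape)  # (1, 16)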



## Preparing the Data


from datasets import load_dataset

train_dataset = load_dataset("codeparrot/apps", "all", split="train")

train_dataset = train_dataset.remove_columns(['problem_id', 'starter_code', 'url', 'input_output', 'difficulty'])

train_dataset = train_dataset.rename_column('solutions', 'answers')

valid_dataset = load_dataset("codeparrot/apps", "all", split="test")

valid_dataset = valid_dataset.remove_columns(['problem_id', 'starter_code', 'url', 'input_output', 'difficulty'])

valid_dataset = valid_dataset.rename_column('solutions', 'answers')
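
A quick look (sketch) at what is left after dropping and renaming columns; I expect only 'question' and 'answers':

# Sketch: confirm the remaining columns and peek at one example.
print(train_dataset)
print(train_dataset[0]['question'][:200])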


def encode_inputs(inputs):
    # encoder_max_len and decoder_max_len are set in the training-parameters cell below.
    try:
        question = inputs['question']
    except KeyError:
        print('no question')

    try:
        answer = json.loads(inputs['answers'])  # 'answers' holds a JSON-encoded list of solutions
    except ValueError:
        print('no answer')

    question_plus = f"answer_me: {question}"
    answer_plus = ' '.join(answer)
    answer_plus = f"{answer_plus} </s>"

    encoder_inputs = tokenizer(question_plus, truncation=True,
                               return_tensors='tf', max_length=encoder_max_len,
                               padding='max_length')

    decoder_inputs = tokenizer(answer_plus, truncation=True,
                               return_tensors='tf', max_length=decoder_max_len,
                               padding='max_length')

    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]

    data = {'input_ids': input_ids,
            'decoder_input_ids': target_ids,
            'attention_mask': input_attention,
            'decoder_attention_mask': target_attention}

    # Note: this nests all features under a single "data" column in the mapped dataset.
    return {"data": data}

# Collect the indices of validation examples whose 'answers' field is empty.
a = []
for index, x in enumerate(valid_dataset['answers'][0:5000]):
    if x == "":
        a.append(index)

print(len(a))


from datasets import Dataset

# Drop the empty-answer rows found above and rebuild a datasets.Dataset via pandas.
valid_dataset_pandas = valid_dataset.to_pandas()
valid_dataset_pandas_no_null = valid_dataset_pandas.drop(a)
valid_dataset_ = Dataset.from_pandas(valid_dataset_pandas_no_null)
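
A possibly simpler alternative I considered (sketch): use datasets' built-in filter instead of the pandas round trip:

# Sketch: drop empty-answer rows directly with Dataset.filter.
valid_dataset_ = valid_dataset.filter(lambda example: example["answers"] != "")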



#### Training Parameters

warmup_steps = 1e4
batch_size = 1
encoder_max_len = 250
decoder_max_len = 54
buffer_size = 1000


### Defining the Data Pipeline

encoded_train_dataset = train_dataset.map(encode_inputs)

encoded_valid_dataset = valid_dataset_.map(encode_inputs)

# Step counts for fit (these need the encoded datasets, so they come after the map calls).
ntrain = len(encoded_train_dataset)
nvalid = len(encoded_valid_dataset)
steps = int(np.ceil(ntrain / batch_size))
valid_steps = int(np.ceil(nvalid / batch_size))
print("Total Steps: ", steps)
print("Total Validation Steps: ", valid_steps)

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, warmup_steps=1e4):
        super().__init__()
        self.warmup_steps = tf.cast(warmup_steps, tf.float32)

    def __call__(self, step):
        # Constant lr of 1/sqrt(warmup_steps) until warmup_steps, then 1/sqrt(step) decay.
        step = tf.cast(step, tf.float32)
        m = tf.maximum(self.warmup_steps, step)
        lr = tf.math.rsqrt(m)
        return lr
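
A quick numeric check of the schedule (sketch): the learning rate should stay at 1/sqrt(1e4) = 0.01 until warmup_steps and then decay as 1/sqrt(step):

# Sketch: spot-check the schedule at a few steps.
s = CustomSchedule()
print(float(s(1.0)), float(s(10000.0)), float(s(40000.0)))  # 0.01, 0.01, 0.005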
        

plt.style.use('ggplot')
schedule = CustomSchedule()
plt.plot(schedule(tf.range(25000, dtype=tf.float32)))
plt.xlabel("Steps")
plt.ylabel("Learning rate")

### Callbacks and Metrics

start_profile_batch = steps+1
stop_profile_batch = start_profile_batch + 1
profile_range = f"{start_profile_batch},{stop_profile_batch}"

log_path = log_dir + "/" + datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=1,
                                                      update_freq=20, profile_batch=profile_range)

checkpoint_filepath = save_path + "/" + "T5-{epoch:04d}-{val_loss:.4f}.ckpt"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

callbacks = [tensorboard_callback, model_checkpoint_callback] 
metrics = [tf.keras.metrics.SparseTopKCategoricalAccuracy(name='accuracy') ]

learning_rate = CustomSchedule()
# learning_rate = 0.001  # Instead set a static learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate)



model = SnapthatT5.from_pretrained("t5-large")

model.compile(optimizer=optimizer, metrics=metrics)



epochs_done = 0
model.fit(encoded_train_dataset, epochs=1, steps_per_epoch=steps, callbacks=callbacks, 
          validation_data=encoded_valid_dataset, validation_steps=valid_steps, initial_epoch=epochs_done)

model.save_pretrained(save_path)
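
For reference, I suspect model.fit simply cannot consume a datasets.arrow_dataset.Dataset directly and wants a tf.data.Dataset (or NumPy arrays) instead. Below is a minimal sketch of the conversion I was considering, assuming encode_inputs is changed to return the flat feature dict (return data instead of return {"data": data}) and to also add a 'labels' entry for train_step; I'm not sure this is the right fix, which is why I'm asking:

# Sketch only: turn the mapped Hugging Face dataset into a tf.data.Dataset
# of (features, labels) pairs, which is the structure train_step unpacks.
columns = ["input_ids", "attention_mask", "decoder_input_ids",
           "decoder_attention_mask", "labels"]

tf_train = encoded_train_dataset.to_tf_dataset(
    columns=columns, batch_size=batch_size, shuffle=True)
tf_valid = encoded_valid_dataset.to_tf_dataset(
    columns=columns, batch_size=batch_size, shuffle=False)

# train_step does `x, _ = data`, so pair the feature dict with the labels tensor.
tf_train = tf_train.map(lambda x: (x, x["labels"]))
tf_valid = tf_valid.map(lambda x: (x, x["labels"]))

model.fit(tf_train, epochs=1, steps_per_epoch=steps, callbacks=callbacks,
          validation_data=tf_valid, validation_steps=valid_steps)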