Hi! I’m using a T5 transformer and the codeparrot/apps dataset. When I try to train my model, I receive the following error:
/usr/local/lib/python3.7/dist-packages/keras/engine/data_adapter.py in select_data_adapter(x, y) 985 "Failed to find data adapter that can handle " 986 "input: {}, {}".format( --> 987 _type_name(x), _type_name(y))) 988 elif len(adapter_cls) > 1: 989 raise RuntimeError( ValueError: Failed to find data adapter that can handle input: <class 'datasets.arrow_dataset.Dataset'>, <class 'NoneType'>
Could anyone help me?
This is the complete code:
The objective of this notebook is to build a NLP text-to-code model including tf Data pipelines for general NLP's tasks, so I can get familiar with TensorFlow more complex models.
#### Features
- Transfer learning pretrained T5 model (https://huggingface.co/docs/transformers/model_doc/t5) to add a siamese LSTM trained on codeparrots/apps questioning and answering (https://huggingface.co/datasets/codeparrot/apps)
- Train T5 using keras trainer fucntion
- tf.Data pipeline
- TF datasets as source
- Log metrics using tensorboard
- This notebook is my attempt to adapt an existing notebook for my educational purposes, but I just can't adapt it to my needs.
### Installation
!pip install -U transformers
!pip install datasets
import tensorflow_datasets as tfds
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds
from transformers import (TFAutoModelWithLMHead, AutoTokenizer,
TFTrainer, TFTrainingArguments, TFT5ForConditionalGeneration, T5Config)
import datetime
import os
import json
!mkdir data
!ls -la
data_dir = "./data"
log_dir = f"{data_dir}/experiments/t5/logs"
save_path = f"{data_dir}/experiments/t5/models"
cache_path_train = f"{data_dir}/cache/t5.train"
cache_path_test = f"{data_dir}/cache/t5.test"
### Defining the Model
class SnapthatT5(TFT5ForConditionalGeneration):
def __init__(self, *args, log_dir=None, cache_dir= None, **kwargs):
super().__init__(*args, **kwargs)
self.loss_tracker= tf.keras.metrics.Mean(name='loss')
@tf.function
def train_step(self, data):
x, _= data
y = x["labels"]
with tf.GradientTape() as tape:
outputs = self(x, training=True)
loss = outputs[0]
logits = outputs[1]
loss = tf.reduce_mean(loss)
grads = tape.gradient(loss, self.trainable_variables)
self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
lr = self.optimizer._decayed_lr(tf.float32)
self.loss_tracker.update_state(loss)
self.compiled_metrics.update_state(y, logits)
metrics = {m.name: m.result() for m in self.metrics}
metrics.update({'lr': lr})
return metrics
def test_step(self, data):
x, _ = data
y = x["labels"]
y = tf.reshape(y, [-1, 1])
output = self(x, training=False)
loss = output[0]
loss = tf.reduce_mean(loss)
logits = output[1]
self.loss_tracker.update_state(loss)
self.compiled_metrics.update_state(y, logits)
return {m.name: m.result() for m in self.metrics}
### The Tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-large")
## Preparing the Data
from datasets import load_dataset
train_dataset = load_dataset("codeparrot/apps", "all", split="train")
train_dataset = train_dataset.remove_columns(['problem_id', 'starter_code', 'url', 'input_output', 'difficulty'])
train_dataset = train_dataset.rename_column('solutions', 'answers')
valid_dataset = load_dataset("codeparrot/apps", "all", split="test")
valid_dataset = valid_dataset.remove_columns(['problem_id', 'starter_code', 'url', 'input_output', 'difficulty'])
valid_dataset = valid_dataset.rename_column('solutions', 'answers')
def encode_inputs(inputs):
try:
question = inputs['question']
except ValueError:
print('no question')
try:
answer = json.loads(inputs['answers'])
except ValueError:
print('no answer')
question_plus = f"answer_me: {question}"
answer_plus = ' '.join(answer)
answer_plus = f"{answer_plus} </s>"
encoder_inputs = tokenizer(question_plus, truncation=True,
return_tensors='tf', max_length=encoder_max_len,
pad_to_max_length=True)
decoder_inputs = tokenizer(answer_plus, truncation=True,
return_tensors='tf', max_length=decoder_max_len,
pad_to_max_length=True)
input_ids = encoder_inputs['input_ids'][0]
input_attention = encoder_inputs['attention_mask'][0]
target_ids = decoder_inputs['input_ids'][0]
target_attention = decoder_inputs['attention_mask'][0]
data = {'input_ids': input_ids,
'decoder_input_ids': target_ids,
'attention_mask': input_attention,
'decoder_attention_mask': target_attention}
return {"data": data}
a = []
for index, x in enumerate(valid_dataset['answers'][0:5000]):
if x == "":
a.append(index)
print(len(a))
from datasets import Dataset
valid_dataset_pandas = valid_dataset.to_pandas()
valid_dataset_pandas_no_null = valid_dataset_pandas.drop(a)
valid_dataset_ = Dataset.from_pandas(valid_dataset_pandas_no_null)
#### Training Parameters
warmup_steps = 1e4
batch_size = 1
encoder_max_len = 250
decoder_max_len = 54
buffer_size = 1000
ntrain = len(encoded_train_dataset)
nvalid = len(encoded_valid_dataset)
steps = int(np.ceil(ntrain/batch_size))
valid_steps = int(np.ceil(nvalid/batch_size))
print("Total Steps: ", steps)
print("Total Validation Steps: ", valid_steps)
### Defining the data pipeline
encoded_train_dataset = train_dataset.map(encode_inputs)
encoded_valid_dataset = valid_dataset_.map(encode_inputs)
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
def __init__(self, warmup_steps=1e4):
super().__init__()
self.warmup_steps = tf.cast(warmup_steps, tf.float32)
def __call__(self, step):
step = tf.cast(step, tf.float32)
m = tf.maximum(self.warmup_steps, step)
m = tf.cast(m, tf.float32)
lr = tf.math.rsqrt(m)
return lr
plt.style.use('ggplot')
schedule = CustomSchedule()
plt.plot(schedule(tf.range(25000, dtype=tf.float32)))
plt.xlabel("Steps")
plt.ylabel("Learning rate")
### Callbacks and Metrics
start_profile_batch = steps+1
stop_profile_batch = start_profile_batch + 1
profile_range = f"{start_profile_batch},{stop_profile_batch}"
log_path = log_dir + "/" + datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=1,
update_freq=20,profile_batch=profile_range)
checkpoint_filepath = save_path + "/" + "T5-{epoch:04d}-{val_loss:.4f}.ckpt"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
filepath=checkpoint_filepath,
save_weights_only=False,
monitor='val_loss',
mode='min',
save_best_only=True)
callbacks = [tensorboard_callback, model_checkpoint_callback]
metrics = [tf.keras.metrics.SparseTopKCategoricalAccuracy(name='accuracy') ]
learning_rate = CustomSchedule()
# learning_rate = 0.001 # Instead set a static learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate)
model = SnapthatT5.from_pretrained("t5-large")
model.compile(optimizer=optimizer, metrics=metrics)
epochs_done = 0
model.fit(encoded_train_dataset, epochs=1, steps_per_epoch=steps, callbacks=callbacks,
validation_data=encoded_valid_dataset, validation_steps=valid_steps, initial_epoch=epochs_done)
model.save_pretrained(save_path)