To summarize my concerns, I need answers to the following questions:
- Is the code included in the notebook suitable for ranking text in the intended way?
- If so, how can I improve the results, given that in every case (I have already tried hundreds of things: hyperparameter search, different architectures, different pretrained models, … you name it) training plateaus at a Kendall tau of 0.3 at most?
- Why doesn’t the model work if the padded examples are shuffled?
Here’s a notebook with a reproducible example.
The following code is intended to take sequences of unordered Jupyter notebook cell texts and put them in order. I’m fine-tuning microsoft/codebert-base with the Keras model below. The model reaches a Kendall tau of at most 0.3-0.35 and doesn’t improve any further. Another thing I observed is that training doesn’t work at all if the padded tokens and their respective targets are shuffled. For example, take a padded batch:
[[<token1>, <token2>, <pad1>, <pad2>, <pad3>],
[<token1>, <token2>, <pad1>, <pad2>, <pad3>]]
and targets:
[[0.5, 1, 0, 0, 0], [0.5, 1, 0, 0, 0]]
If they are shuffled while maintaining their relationships (<token1> with 0.5, <token2> with 1, and so on), the model doesn’t even converge and training fails.
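To make the padding above concrete, the targets are produced roughly like this (illustration only; it mirrors the tf.pad call in read_example further down):

import tensorflow as tf

order = tf.constant([0.5, 1.0])                        # percentile ranks of the two real cells
order = tf.pad(order, [[0, 3]], constant_values=0.0)   # pad up to 5 cell slots
# -> [0.5, 1.0, 0.0, 0.0, 0.0], i.e. the targets shown above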
from keras import Input, Model
from keras.layers import Dense
from transformers import TFAutoModel


def create_model(input_shape, model_name, cache_dir):
    bert_head = TFAutoModel.from_pretrained(model_name, cache_dir=cache_dir)
    ids = Input(input_shape, dtype='int32', name='input_ids')
    masks = Input(input_shape, dtype='int32', name='attention_mask')
    x0 = dict(input_ids=ids, attention_mask=masks)
    x = bert_head(x0)[0]  # last hidden state: (batch, seq_len, hidden)
    output = Dense(1, 'sigmoid')(x)  # one score per position: (batch, seq_len, 1)
    return Model(x0, output)
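For reference, a quick shape check (a sketch only; the sequence length of 8 is just for illustration) shows that the head produces one sigmoid score per cell slot:

m = create_model((8,), 'microsoft/codebert-base', 'transformers-cache')
print(m.output_shape)  # (None, 8, 1)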
I’m using a custom tokenizer, a modified version (included in the notebook) of the one found in text_layers.py, because the transformers auto-tokenizer is slow and doesn’t play nice with the TensorFlow graph.
Here are the training functions:
import pandas as pd
import tensorflow as tf
from scipy.stats import kendalltau


def load_data(src):
    df = pd.read_parquet(src)
    replacements = {
        r'http\S+': '',
        r'[^\x00-\x7F]+': '',
        r'don\'t': 'do not',
        r'won\'t': 'will not',
        r'can\'t': 'cannot',
        r'i\'m': 'i am',
        r'n\'t': ' not',
        r'\'re': ' are',
        r'\'s': ' is',
        r'\'d': ' would',
        r'\'ll': ' will',
        r'\'ve': ' have',
        r'\'m': ' am',
        r' +': ' ',
    }
    df['source'] = df['source'].replace(replacements, regex=True)
    # per-notebook percentile rank in (0, 1] is the regression target
    df['order'] = df.groupby('notebook_id')['order'].rank(pct=True)
    groups = [*df.groupby('notebook_id')]
    # the largest notebook determines the padded sequence length
    max_length = len(max(groups, key=lambda x: x[1].shape[0])[1])
    return df, groups, max_length
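The rank(pct=True) line is what produces the per-notebook targets; on a hypothetical toy frame (not part of the dataset) it behaves like this:

toy = pd.DataFrame({'notebook_id': ['a', 'a', 'a', 'b', 'b'],
                    'order': [1, 2, 3, 1, 2]})
toy['order'] = toy.groupby('notebook_id')['order'].rank(pct=True)
# notebook 'a' -> [1/3, 2/3, 1.0]; notebook 'b' -> [0.5, 1.0]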
def process_xy(x, y, tokenizer, packer):
    tokens = tokenizer(x)
    x = packer(tokens)
    # rename the packer outputs to the names the transformers model expects
    x['input_ids'] = x['input_word_ids']
    x['attention_mask'] = x['input_mask']
    del x['input_word_ids']
    del x['input_type_ids']
    del x['input_mask']
    return x, tf.expand_dims(y, -1)
def serialize_example(notebook_id, notebook_df, writer):
    features = dict()
    features['notebook_id'] = tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[bytes(notebook_id, 'utf-8')])
    )
    features['cell_id'] = tf.train.Feature(
        bytes_list=tf.train.BytesList(value=notebook_df['cell_id'].values.astype("|S"))
    )
    features['order'] = tf.train.Feature(
        float_list=tf.train.FloatList(value=notebook_df['order'].values)
    )
    features['source'] = tf.train.Feature(
        bytes_list=tf.train.BytesList(
            value=[tf.io.serialize_tensor(notebook_df['source'].values).numpy()]
        )
    )
    example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(example.SerializeToString())
def write_tfrecord(fp, data, **kwargs):
    total_examples = len(data)
    with tf.io.TFRecordWriter(fp, **kwargs) as writer:
        for i, (notebook_id, notebook_df) in enumerate(data):
            print(f'\rWriting example: {i + 1}/{total_examples}', end='')
            serialize_example(notebook_id, notebook_df, writer)
    print()
def read_example(example, feature_map, max_notebook_cells, num_pad=0, str_pad=''):
    features = tf.io.parse_single_example(example, feature_map)
    source = tf.sparse.to_dense(features['source'])
    source = tf.io.parse_tensor(source[0], tf.string)
    order = tf.sparse.to_dense(features['order'])
    # pad both the cell texts and their targets up to the largest notebook size
    pad_size = max_notebook_cells - tf.shape(order)[0]
    order = tf.pad(order, [[0, pad_size]], constant_values=num_pad)
    source = tf.pad(source, [[0, pad_size]], constant_values=str_pad)
    return source, order
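As a spot check (a sketch, assuming example-train.tfrecord has already been written with write_tfrecord below, and reusing the feature_map shown further down):

raw = next(iter(tf.data.TFRecordDataset('example-train.tfrecord')))
src, order = read_example(raw, feature_map, max_notebook_cells=max_length)
print(src.shape, order.shape)  # both padded out to (max_length,)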
def read_tfrecord(
    fp, max_notebook_cells, batch_size, shuffle_buffer_size, tokenizer, packer, **kwargs
):
    files = tf.data.Dataset.list_files(fp)
    dataset = tf.data.TFRecordDataset(files, **kwargs)
    feature_map = {
        'notebook_id': tf.io.VarLenFeature(tf.string),
        'cell_id': tf.io.VarLenFeature(tf.string),
        'source': tf.io.VarLenFeature(tf.string),
        'order': tf.io.VarLenFeature(tf.float32),
    }
    return (
        dataset.map(
            lambda x: read_example(x, feature_map, max_notebook_cells),
            tf.data.experimental.AUTOTUNE,
        )
        .batch(batch_size)
        .shuffle(shuffle_buffer_size)
        .map(lambda x, y: process_xy(x, y, tokenizer, packer))
        .prefetch(tf.data.experimental.AUTOTUNE)
    )
def kendall_tau(y_true, y_pred):
    def _kendall_tau(*args):
        return kendalltau(*args).correlation
    return tf.numpy_function(_kendall_tau, [y_true, y_pred], 'float64')
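A quick check of the metric wrapper (a sketch; the values are arbitrary): identical orderings give a tau of 1.0 and reversed orderings give -1.0.

print(kendall_tau(tf.constant([0.25, 0.5, 0.75, 1.0]),
                  tf.constant([0.1, 0.2, 0.3, 0.4])).numpy())   # 1.0
print(kendall_tau(tf.constant([0.25, 0.5, 0.75, 1.0]),
                  tf.constant([0.4, 0.3, 0.2, 0.1])).numpy())   # -1.0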
Here’s how the model is trained:
df, groups, max_length = load_data(
    'sample.parquet'  # provided in the notebook
)
which loads the data into a pandas DataFrame. Here’s what a row looks like:
cell_id 1862f0a6
cell_type code
source # This Python 3 environment comes with many he...
order 1
notebook_id 00001756c60be8
Name: 0, dtype: object
Each row contains a cell_id, a cell_type (markdown or code), a source, which is the model input and holds the cell contents as str, and an order, which is the cell position we are trying to predict. Multiple rows can share the same notebook_id.
special_tokens = {
    'start_of_sequence_id': '<s>',
    'end_of_segment_id': '</s>',
    'padding_id': '<pad>',
    'mask_id': '<mask>',
}
unknown_token = '<unk>'
tokenizer = BertTokenizer(
    vocab_file='microsoft-codebert-base.txt',
    lower_case=True,
    special_tokens=special_tokens,
    tokenizer_kwargs={'unknown_token': unknown_token},
)
packer = BertPacker(
    seq_length=max_length, special_tokens=tokenizer.get_special_tokens()
)
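As a rough smoke test of the tokenizer and packer (a sketch only; BertTokenizer and BertPacker are the modified text_layers.py classes from the notebook, so this just reuses the calls already shown in process_xy without assuming anything about the output values):

sample = tf.constant([['print("hello world")', '# markdown cell']])  # one notebook, two cells
packed = packer(tokenizer(sample))
print(packed.keys())                   # input_word_ids, input_mask, input_type_ids
print(packed['input_word_ids'].shape)  # expected to be packed to (1, max_length)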
Then the data is split:
train_idx = int(0.8 * len(groups))
valid_idx = int(0.95 * len(groups))
random.shuffle(groups)
train_data = groups[:train_idx]
valid_data = groups[train_idx:valid_idx]
test_data = groups[valid_idx:]
train_path = 'example-train.tfrecord'
valid_path = 'example-valid.tfrecord'
Then the train and valid TFRecords are created. serialize_example writes the features that are later parsed with the following feature map:
feature_map = {
    'notebook_id': tf.io.VarLenFeature(tf.string),
    'cell_id': tf.io.VarLenFeature(tf.string),
    'source': tf.io.VarLenFeature(tf.string),
    'order': tf.io.VarLenFeature(tf.float32),
}
Each example represents one notebook, which means a single example contains a tensor of the cell texts (markdown or code) that appeared in that notebook, serialized with tf.io.serialize_tensor. The cells are tokenized at read time: read_example pads the x and y inputs (source and order in this case), and process_xy then applies the tokenization.
write_tfrecord(train_path, train_data)
write_tfrecord(valid_path, valid_data)
train_dataset = read_tfrecord(train_path, max_length, 4, 512, tokenizer, packer)
valid_dataset = read_tfrecord(valid_path, max_length, 4, 512, tokenizer, packer)
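A sanity check on one batch of the pipeline (a sketch; the shapes are what the model below expects, given the batch size of 4):

xb, yb = next(iter(train_dataset))
print(xb['input_ids'].shape, xb['attention_mask'].shape, yb.shape)
# roughly expected: (4, max_length), (4, max_length), (4, max_length, 1)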
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam

ckpt = ModelCheckpoint(
    'trained-model.tf',
    verbose=True,
    save_best_only=True,
    save_weights_only=True,
)
model = create_model(
    (max_length,),
    'microsoft/codebert-base',
    'transformers-cache',
)
I’m using scipy.stats.kendalltau as the evaluation metric; it ranges from -1 to 1 (worst to best).
model.compile(Adam(3e-5), loss='huber', metrics=[kendall_tau])
model.fit(
    train_dataset,
    epochs=2,
    callbacks=[ckpt],
    validation_data=valid_dataset,
)
I have tried many things, including different BERT heads, adding dense and LSTM layers, increasing the number of training epochs, adding more data, different hyperparameters, other loss functions, … In every case, Kendall tau improves to at most 0.3-0.4 and never goes any further. Is there something I’m doing wrong? Is there anything that should be changed or improved, and why?