My code is as follows:
import tensorflow as tf
from transformers import RobertaConfig, TFRobertaMainLayer

# 1. Create a Model subclass so that fit() can be used
class Transformer(tf.keras.Model):
    def __init__(self):
        super(Transformer, self).__init__()
        config = RobertaConfig(
            vocab_size=100,
            hidden_size=128,
            intermediate_size=128,
            max_position_embeddings=514,
            num_attention_heads=8,
            num_hidden_layers=6,
            type_vocab_size=1,
        )
        self.encoder = TFRobertaMainLayer(config)

    def call(self, inp, training=False):
        # return the sequence output, shape (batch, seq_len, hidden_size)
        return self.encoder(inp)[0]

model = Transformer()

# 2. Compute the loss manually for a dummy input
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
x = tf.constant([[1, 0]])
y_true = tf.constant([[1, 0]])
y_pred = model((x, x))  # shape (1, 2, 128)
loss = loss_fn(y_true, y_pred)
print(loss)  # prints 4.8093767

# 3. Run fit() on the same data
model.compile(loss=loss_fn)
model.fit((x, x), y_true)  # reports loss: 4.7854
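
For context, the encoder's sequence output is used directly as logits here, so hidden_size=128 plays the role of the number of classes, which presumably explains why both values sit near ln(128) ≈ 4.85 (a freshly initialized model produces roughly uniform logits):

import math
print(y_pred.shape)   # (1, 2, 128): one 128-way distribution per token
print(math.log(128))  # 4.852..., close to both losses above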
The manually computed loss and the loss reported by fit() differ:
tf.Tensor(4.8093767, shape=(), dtype=float32)
1/1 [==============================] - 0s 0s/step - loss: 4.7854
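
Why do the two values differ? My best guess is dropout: RobertaConfig enables it by default (hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1), and fit() runs the forward pass with training=True, while the manual call above runs in inference mode. A minimal check of that assumption (if dropout is the cause, repeated training-mode calls should print fluctuating losses):

for _ in range(3):
    y_train = model((x, x), training=True)   # dropout active, as during fit()
    print(loss_fn(y_true, y_train).numpy())

Is that enough to account for the mismatch, or does fit() compute the loss differently?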