Is a Transformer-based image caption model trained to predict the last token only in training phase?

For the following code (which is snippet from Image Captioning), I do not see the steps for entering the input sequence to model token by token in training phase. But instead of that, the input sequence is entered at once except the last toke by: batch_seq_inp = batch_seq[:, :-1] in the function def _compute_caption_loss_and_acc as shown below. Based on my knowladge, if we have an image that is captioned with a sentence like (image_1 : a man is running), the input output pair in training should be like:

image_1 SOS ==> a
image_1 SOS a ==> man
image_1 SOS a man ==> is
image_1 SOS a man is ==> running
image_1 SOS a man is running ==> END
So I am little confused.

class ImageCaptioningModel(keras.Model):
def init(
self, cnn_model, encoder, decoder, num_captions_per_image=5, image_aug=None,
self.cnn_model = cnn_model
self.encoder = encoder
self.decoder = decoder
self.loss_tracker = keras.metrics.Mean(name=“loss”)
self.acc_tracker = keras.metrics.Mean(name=“accuracy”)
self.num_captions_per_image = num_captions_per_image
self.image_aug = image_aug

def calculate_loss(self, y_true, y_pred, mask):
    loss = self.loss(y_true, y_pred)
    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

def calculate_accuracy(self, y_true, y_pred, mask):
    accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2))
    accuracy = tf.math.logical_and(mask, accuracy)
    accuracy = tf.cast(accuracy, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(accuracy) / tf.reduce_sum(mask)

def _compute_caption_loss_and_acc(self, img_embed, batch_seq, training=True):
    encoder_out = self.encoder(img_embed, training=training)
    batch_seq_inp = batch_seq[:, :-1]
    batch_seq_true = batch_seq[:, 1:]
    mask = tf.math.not_equal(batch_seq_true, 0)
    batch_seq_pred = self.decoder(
        batch_seq_inp, encoder_out, training=training, mask=mask
    loss = self.calculate_loss(batch_seq_true, batch_seq_pred, mask)
    acc = self.calculate_accuracy(batch_seq_true, batch_seq_pred, mask)
    return loss, acc

def train_step(self, batch_data):
    batch_img, batch_seq = batch_data
    batch_loss = 0
    batch_acc = 0

    if self.image_aug:
        batch_img = self.image_aug(batch_img)

    # 1. Get image embeddings
    img_embed = self.cnn_model(batch_img)

    # 2. Pass each of the five captions one by one to the decoder
    # along with the encoder outputs and compute the loss as well as accuracy
    # for each caption.
    for i in range(self.num_captions_per_image):
        with tf.GradientTape() as tape:
            loss, acc = self._compute_caption_loss_and_acc(
                img_embed, batch_seq[:, i, :], training=True

            # 3. Update loss and accuracy
            batch_loss += loss
            batch_acc += acc

        # 4. Get the list of all the trainable weights
        train_vars = (
            self.encoder.trainable_variables + self.decoder.trainable_variables

        # 5. Get the gradients
        grads = tape.gradient(loss, train_vars)

        # 6. Update the trainable weights
        self.optimizer.apply_gradients(zip(grads, train_vars))

    # 7. Update the trackers
    batch_acc /= float(self.num_captions_per_image)

    # 8. Return the loss and accuracy values
    return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

Hi! Did you manage to find an answer to this question, please?