See my comments below. You replace 0 with -100 only where you calculate the loss manually. This replacement does not happen inside the built-in loss computation of T5ForConditionalGeneration, so you have to do the replacement yourself beforehand, i.e. before passing the labels to the model.
# Diagnostic snippet: compare the loss reported by the model with a manually
# computed cross-entropy loss to show why the two values differ.

# Here you get loss based on "target_text_input_ids" as-is (no ignored index)
loss, outputs = self(
    source_text_input_ids, source_text_attention_mask, target_text_input_ids
)
loss_mine = None
output = self.model(
    input_ids=source_text_input_ids,
    attention_mask=source_text_attention_mask,
    labels=target_text_input_ids,
)
# Here you first set the padding IDs to -100 so that CE will ignore them...
# (clone so the batch tensor itself is not modified in place)
labels = batch["target_text_input_ids"].clone()
labels[labels == 0] = -100
if target_text_input_ids is not None:
    loss_fct = CrossEntropyLoss(ignore_index=-100)
    # ... and THEN you calculate loss
    # Flatten using the vocabulary dimension of the very tensor being reshaped,
    # rather than the size of a tensor returned by a different forward call.
    logits = output.logits
    loss_mine = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
# loss_mine stays None when no targets were given; do not call .item() on it.
loss_mine_value = None if loss_mine is None else loss_mine.item()
print(f"loss_huggingface: {loss.item()}, loss_mine : {loss_mine_value}")