Hi, I am using a Seq2SeqLM model and found that padding decoder_input_ids (with the padding masked out via decoder_attention_mask) gives different results than passing decoder_input_ids without any padding.
Both should produce the same logits, right?
A minimal working example is below.
from transformers import AutoModelForSeq2SeqLM
import torch
# model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
# encoder inputs
encoder_input_ids = torch.tensor([
    [5, 6],
    [3, 4],
])
encoder_attention_mask = torch.tensor([
    [1, 1],
    [1, 1],
])
# 1. prepare the model inputs
model_kwargs = {
    "attention_mask": encoder_attention_mask,
}
inputs_tensor, model_input_name, model_kwargs = model._prepare_model_inputs(
    encoder_input_ids, None, model_kwargs
)
# 2. compute the encoder outputs
model_kwargs = model._prepare_encoder_decoder_kwargs_for_generation(
    inputs_tensor, model_kwargs, model_input_name
)
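# Optional sanity check: both scenarios below reuse exactly the same encoder outputs,
# so any difference must come from the decoder side. (This assumes the helper stores
# its result under model_kwargs["encoder_outputs"], as recent transformers versions do.)
print(model_kwargs["encoder_outputs"].last_hidden_state.shape)  # e.g. torch.Size([2, 2, 512])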
########################################################################################
# scenario 1 - decoder inputs without any padding, simply starting off from start_token
decoder_input_ids_default = torch.tensor([
    [0],
    [0],
])
model_inputs = model.prepare_inputs_for_generation(
    decoder_input_ids_default, **model_kwargs
)
outputs = model(**model_inputs, return_dict=True)
next_token_logits_default = outputs.logits[:, -1, :].clone()
########################################################################################
# scenario 2 - decoder inputs left-padded, with the decoder attention mask specified
decoder_input_ids_padded = torch.tensor([
    [0, 0],
    [0, 0],
])
model_kwargs["decoder_attention_mask"] = torch.tensor([
    [0, 1],
    [0, 1],
])
model_inputs = model.prepare_inputs_for_generation(
    decoder_input_ids_padded, **model_kwargs
)
outputs = model(**model_inputs, return_dict=True)
next_token_logits_padded = outputs.logits[:, -1, :].clone()
# padded vs. non-padded should give the same results, but this assertion fails
assert torch.allclose(next_token_logits_default, next_token_logits_padded)
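For what it's worth, the same mismatch can be reproduced without the private generation helpers, by calling the model's forward pass directly with decoder_attention_mask. The snippet below is only a sketch of that comparison, reusing the toy tensors defined above (the decoder attention mask is repeated inline for clarity); since the assert above raises, it is easiest to run this in its place.

# same comparison through the public forward API, reusing the tensors above
with torch.no_grad():
    out_default = model(
        input_ids=encoder_input_ids,
        attention_mask=encoder_attention_mask,
        decoder_input_ids=decoder_input_ids_default,
    )
    out_padded = model(
        input_ids=encoder_input_ids,
        attention_mask=encoder_attention_mask,
        decoder_input_ids=decoder_input_ids_padded,
        decoder_attention_mask=torch.tensor([[0, 1], [0, 1]]),
    )
# compare the logits at the last (non-padded) decoder position
max_diff = (out_default.logits[:, -1, :] - out_padded.logits[:, -1, :]).abs().max()
print(f"max abs difference in next-token logits: {max_diff.item()}")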