Question about outputs.logits

I applied softmax to outputs.logits to compute probabilities over the full vocabulary, then checked the probabilities of both the tokens that were generated and those that were not. However, the probabilities obtained this way differ from the values I get by applying model.compute_transition_scores to outputs.scores. Why is there such a difference? Since model.compute_transition_scores only returns scores for the tokens that were actually generated, is it okay to use the softmax of the logits as the output-token probabilities?

import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-70B-Instruct"
# bnb_config (a BitsAndBytesConfig) and cache (a local cache directory) are defined elsewhere
llama3 = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="auto", cache_dir=cache
)
tokenizer = AutoTokenizer.from_pretrained(model_id)


messages = [
    {"role": "system", "content": "Hello 1+1=?"},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(llama3.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = llama3.generate(
    input_ids,
    max_new_tokens=50,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    output_scores=True,
    output_logits=True,
    return_dict_in_generate=True,
    pad_token_id=tokenizer.eos_token_id,
)
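As a quick sanity check on the structures returned (a sketch; it assumes the generate call above succeeded): outputs.scores and outputs.logits are tuples with one (batch_size, vocab_size) tensor per generated token.

print(len(outputs.scores), len(outputs.logits))  # one entry per generated token
print(outputs.scores[0].shape, outputs.logits[0].shape)  # each is (batch_size, vocab_size)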
## Example
outputs.sequences[0][len(input_ids[0]):-1]
# tensor([ 791, 4320,  311,  220,   16,   10,   16,  374,  220,   17,   13],
#        device='cuda:0')

print(tokenizer.decode(outputs.sequences[0][len(input_ids[0]):-1]))
# 'The answer to 1+1 is 2.'
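To see which piece of text each id corresponds to, a per-token decode of the same slice (a small sketch):

for tok in outputs.sequences[0][len(input_ids[0]):-1]:
    print(int(tok), repr(tokenizer.decode(tok)))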

# I have confirmed that outputs.logits and outputs.scores show the same argmax pattern.
for i in range(len(outputs.logits[:-1])):
    print(torch.argmax(outputs.logits[i]))
# tensor(791, device='cuda:0')
# tensor(4320, device='cuda:0')
# ...
# tensor(17, device='cuda:0')
# tensor(13, device='cuda:0')


for i in range(len(outputs.scores[:-1])):
    print(torch.argmax(outputs.scores[i]))
# tensor(791, device='cuda:0')
# tensor(4320, device='cuda:0')
# ...
# tensor(17, device='cuda:0')
# tensor(13, device='cuda:0')
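The same check can be written programmatically (a sketch): the argmax of the raw logits and of the processed scores should agree at every step if the two patterns above really match.

agree = all(
    torch.argmax(outputs.logits[i]) == torch.argmax(outputs.scores[i])
    for i in range(len(outputs.scores))
)
print(agree)  # expect True, matching the printouts above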

# So I calculate the probabilities.

for i in range(len(outputs.logits[:-1])):
    probabilities_scores = torch.softmax(outputs.logits[i], dim=-1)
    print(probabilities_scores[0].max())

# tensor(0.6257, device='cuda:0')
# tensor(0.9997, device='cuda:0')
# tensor(0.6549, device='cuda:0')
# tensor(1.0000, device='cuda:0')
# tensor(1., device='cuda:0')
# tensor(0.9868, device='cuda:0')
# tensor(1., device='cuda:0')
# tensor(1.0000, device='cuda:0')
# tensor(0.9984, device='cuda:0')
# tensor(1., device='cuda:0')
# tensor(0.7974, device='cuda:0')
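Note that the max above is the probability of the most likely token at each step, which with do_sample=True is not necessarily the token that was sampled. A sketch that instead reads off the probability of the token that was actually generated:

gen_tokens = outputs.sequences[0][len(input_ids[0]):]
for step, tok in enumerate(gen_tokens):
    probs = torch.softmax(outputs.logits[step][0], dim=-1)
    print(float(probs[tok]))  # probability, under the raw logits, of the sampled token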


# But it is very different from compute_transition_scores().

transition_scores = llama3.compute_transition_scores(
    outputs.sequences, outputs.scores, normalize_logits=True
).cpu()
np.exp(transition_scores)
# tensor([[0.8918, 1.0000, 0.7442, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
#          1.0000, 1.0000, 1.0000]])
# ...
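As a cross-check (a sketch): since compute_transition_scores with normalize_logits=True gathers a log-softmax of outputs.scores at each generated token, softmaxing outputs.scores directly should reproduce the np.exp(transition_scores) values above, unlike the softmax of outputs.logits.

gen_tokens = outputs.sequences[0][len(input_ids[0]):]
for step, tok in enumerate(gen_tokens):
    p = torch.softmax(outputs.scores[step][0], dim=-1)[tok]
    print(float(p))  # should match np.exp(transition_scores)[0][step]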