I am trying to pack tokens and check whether the model output stays the same with and without packing.
When I use fp16 precision, the results with and without packing match, but with bf16 I see a slight difference in the output.
Below is the code:
import torch
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
model = AutoModel.from_pretrained("Qwen/Qwen3-Embedding-0.6B", attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16).cuda()
# ignore the example text :)
texts = ["hi how are you this is what so that is that what is know is there any way to do this", " hi how can i help you today"]
# without packing: pad the batch to the longest sequence
input_ids = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
input_ids = {k: v.cuda() for k, v in input_ids.items()}
# packing the tokens
input_ids_with_packing = tokenizer(
    texts,
    padding=False,
    truncation=False,
    max_length=512,
)
input_with_packing = {"input_ids": [], "position_ids": []}
for seq in input_ids_with_packing["input_ids"]:
    # concatenate all sequences into a single row, restarting position_ids for each sequence
    input_with_packing["input_ids"] += seq
    input_with_packing["position_ids"] += list(range(len(seq)))
input_with_packing = {k: torch.tensor([v]).cuda() for k, v in input_with_packing.items()}
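To keep track of where each sequence ends up inside the packed row (handy for comparing the outputs later), here is a minimal sketch, assuming the same two texts as above:
# lengths of the packed sequences, e.g. [22, 8] for the two texts above
seq_lens = [len(seq) for seq in input_ids_with_packing["input_ids"]]
# cumulative boundaries, e.g. [0, 22, 30]; sequence i occupies boundaries[i]:boundaries[i+1] in the packed row
boundaries = [0]
for n in seq_lens:
    boundaries.append(boundaries[-1] + n)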
So the final inputs look like this:
# without packing
{'input_ids': tensor([[ 6023, 1246, 525, 498, 419, 374, 1128, 773, 429,
374, 429, 1128, 374, 1414, 374, 1052, 894, 1616,
311, 653, 419, 151643],
[151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
151643, 151643, 151643, 151643, 151643, 15588, 1246, 646, 600,
1492, 498, 3351, 151643]], device='cuda:0'),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]],
device='cuda:0')}
# with packing
{'input_ids': tensor([[ 6023, 1246, 525, 498, 419, 374, 1128, 773, 429,
374, 429, 1128, 374, 1414, 374, 1052, 894, 1616,
311, 653, 419, 151643, 15588, 1246, 646, 600, 1492,
498, 3351, 151643]], device='cuda:0'),
'position_ids': tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7]], device='cuda:0')}
When I run the model with these inputs:
with torch.no_grad():
    direct_outputs = model(**input_ids)
    packing_outputs = model(**input_with_packing)
the last_hidden_state outputs are:
# without packing
tensor([[[ 2.4062e+00, -1.4609e+00, -1.8848e-01, ..., -8.4375e+00,
-1.1625e+01, 1.3504e-03],
[-3.9375e+00, -3.2656e+00, -1.2500e+00, ..., 1.5078e+00,
-1.7090e-01, 1.0000e+00],
[-2.4219e+00, -5.9688e+00, -1.4766e+00, ..., 8.4375e-01,
-1.0156e+00, -2.3281e+00],
...,
[-1.6406e+00, -8.6875e+00, -1.3281e+00, ..., -5.7812e-01,
-1.4844e+00, -1.5625e-01],
[-1.7344e+00, -6.6875e+00, -1.3594e+00, ..., -3.1982e-02,
-5.2344e-01, -1.7480e-01],
[-9.8047e-01, -1.7188e+00, -9.6484e-01, ..., -9.9121e-02,
2.2344e+00, 4.6250e+00]],
[[-4.3438e+00, 1.6797e-01, 2.5586e-01, ..., 3.7969e+00,
6.6250e+00, -3.9688e+00],
[-4.3438e+00, 1.6797e-01, 2.5586e-01, ..., 3.7969e+00,
6.6250e+00, -3.9688e+00],
[-4.3438e+00, 1.6797e-01, 2.5586e-01, ..., 3.7969e+00,
6.6250e+00, -3.9688e+00],
...,
[ 5.7031e-01, -5.2188e+00, -1.1094e+00, ..., 1.0000e+00,
-4.0938e+00, -1.3984e+00],
[-2.5000e+00, -3.6719e+00, -1.2812e+00, ..., 1.1328e+00,
-4.5508e-01, -2.4375e+00],
[-1.6016e+00, 5.1953e-01, -1.1016e+00, ..., 9.2188e-01,
5.0000e-01, 1.7266e+00]]], device='cuda:0', dtype=torch.bfloat16)
# with packing
tensor([[[ 2.5312, -1.4531, -0.1982, ..., -8.4375, -11.7500, -0.0121],
[ -3.9688, -3.1562, -1.2656, ..., 1.5703, -0.1865, 0.8359],
[ -2.3906, -5.9062, -1.4766, ..., 0.9258, -1.0234, -2.2188],
...,
[ 0.5078, -5.1250, -1.1250, ..., 1.0938, -4.0625, -1.3594],
[ -2.5625, -3.5938, -1.2656, ..., 1.1484, -0.4316, -2.4219],
[ -1.7188, 0.5547, -1.0938, ..., 0.9609, 0.5977, 1.7031]]],
device='cuda:0', dtype=torch.bfloat16)
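For reference, a rough sketch of how the difference could be quantified (assuming the variables defined above). Since padding_side='left', the real tokens sit at the end of each padded row, so they can be lined up against the corresponding slice of the packed output:
# per-sequence max absolute difference between the two runs
seq_lens = [len(seq) for seq in input_ids_with_packing["input_ids"]]
direct_h = direct_outputs.last_hidden_state      # shape (2, padded_len, hidden)
packed_h = packing_outputs.last_hidden_state[0]  # shape (total_len, hidden)
start = 0
for i, n in enumerate(seq_lens):
    direct_seq = direct_h[i, -n:]                # last n tokens of the row (left padding)
    packed_seq = packed_h[start:start + n]
    print(i, (direct_seq.float() - packed_seq.float()).abs().max().item())
    start += n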
Is there a reason for this, or am I doing something wrong here?