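I found a cause: numerical precision. Loading the model in float32 instead of bfloat16 (see the change below) makes the cached and non-cached outputs agree, apart from differences in the last printed digit: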
#model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16) # , cache_dir="/workspace/fmrai/")
model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.float32) # , cache_dir="/workspace/fmrai/")
Cache 0: tensor([[ 3.6974, 22.4915, 14.4676, ..., -5.7953, -3.8699, -7.3202]],
device='cuda:0')
No Cache 0: tensor([[ 3.6974, 22.4915, 14.4676, ..., -5.7953, -3.8699, -7.3202]],
device='cuda:0')
Cache Att 0: tensor([[ 3.6974, 22.4915, 14.4676, ..., -5.7953, -3.8699, -7.3202]],
device='cuda:0')
No Cache Att 0: tensor([[ 3.6974, 22.4915, 14.4676, ..., -5.7953, -3.8699, -7.3202]],
device='cuda:0')
Cache 1: tensor([[ 7.8056, 6.0996, -1.6019, ..., -3.3531, 4.3285, 1.0036]],
device='cuda:0')
No Cache 1: tensor([[ 7.8056, 6.0996, -1.6019, ..., -3.3530, 4.3285, 1.0037]],
device='cuda:0')
Cache Att 1: tensor([[ 7.8056, 6.0996, -1.6019, ..., -3.3530, 4.3285, 1.0037]],
device='cuda:0')
No Cache Att 1: tensor([[ 7.8056, 6.0996, -1.6019, ..., -3.3530, 4.3285, 1.0037]],
device='cuda:0')