Difference in Number of Parameters for load_in_4bit

I am seeing a different total number of parameters when I load a model normally vs. with load_in_4bit.

def print_param_precision(model):
  # Tally the number of elements per parameter dtype
  dtypes = {}
  for _, p in model.named_parameters():
      dtype = p.dtype
      if dtype not in dtypes:
          dtypes[dtype] = 0
      dtypes[dtype] += p.numel()
  total = sum(dtypes.values())
  # Print each dtype's share of the total element count
  for k, v in dtypes.items():
      print(f"{k}, {v / 10**6:.4f} M, {v / total * 100:.2f} %")

def print_trainable_parameters(model):
  # Count the total parameters
  total_params = sum(p.numel() for p in model.parameters())
  print(f"Total parameters: {total_params/10**6:.4f} M")

  # Count the trainable parameters
  trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
  print(f"Trainable parameters: {trainable_params/10**6:.4f} M")

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
print(f"Memory: {model.get_memory_footprint() / 1024**2:.4f} MB")
print("\nParameters:")
print_trainable_parameters(model)
print("\nData types:")
print_param_precision(model)

Memory: 486.7002 MB

Parameters:
Total parameters: 124.4398 M
Trainable parameters: 124.4398 M

Data types:
torch.float32, 124.4398 M, 100.00 %

model = AutoModelForCausalLM.from_pretrained("gpt2", load_in_4bit="True", device_map="auto")
print(f"Memory: {model.get_memory_footprint() / 1024**2:.4f} MB")
print("\nParameters:")
print_trainable_parameters(model)
print("\nData types:")
print_param_precision(model)

Memory: 127.8501 MB

Parameters:
Total parameters: 81.9725 M
Trainable parameters: 39.4222 M

Data types:
torch.float16, 39.5052 M, 48.19 %
torch.uint8, 42.4673 M, 51.81 %
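
For reference, this is how the uint8 parameters can be inspected (a minimal sketch run against the 4-bit model loaded above; I'm assuming the uint8 tensors are the packed 4-bit weights, which would explain the smaller element counts):

# Inspect the first uint8 parameter of the 4-bit model loaded above.
# Assumption: bitsandbytes stores the 4-bit weights packed into uint8,
# so numel() reports the packed storage size rather than the logical
# number of weights.
for name, p in model.named_parameters():
    if p.dtype == torch.uint8:
        print(name, p.dtype, tuple(p.shape), p.numel())
        break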