I am seeing a different number of "Total parameters" when I load a model normally vs. with load_in_4bit.
def print_param_precision(model):
    # Tally parameter counts per dtype and print each dtype's share of the total
    dtypes = {}
    for _, p in model.named_parameters():
        dtype = p.dtype
        if dtype not in dtypes:
            dtypes[dtype] = 0
        dtypes[dtype] += p.numel()
    total = 0
    for k, v in dtypes.items():
        total += v
    for k, v in dtypes.items():
        print(f"{k}, {v / 10**6:.4f} M, {v / total * 100:.2f} %")
def print_trainable_parameters(model):
    # Count the total parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params / 10**6:.4f} M")
    # Count the trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable parameters: {trainable_params / 10**6:.4f} M")
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
print(f"Memory: {model.get_memory_footprint() / 1024**2:.4f} MB")
print("\nParameters:")
print_trainable_parameters(model)
print("\nData types:")
print_param_precision(model)
Memory: 486.7002 MB
Parameters:
Total parameters: 124.4398 M
Trainable parameters: 124.4398 M
Data types:
torch.float32, 124.4398 M, 100.00 %
model = AutoModelForCausalLM.from_pretrained("gpt2", load_in_4bit="True", device_map="auto")
print(f"Memory: {model.get_memory_footprint() / 1024**2:.4f} MB")
print("\nParameters:")
print_trainable_parameters(model)
print("\nData types:")
print_param_precision(model)
Memory: 127.8501 MB
Parameters:
Total parameters: 81.9725 M
Trainable parameters: 39.4222 M
Data types:
torch.float16, 39.5052 M, 48.19 %
torch.uint8, 42.4673 M, 51.81 %
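My guess is that bitsandbytes packs two 4-bit weights into each torch.uint8 element, so numel() on the quantized tensors reports half the logical parameter count. Here is a minimal sketch of a packing-aware count under that assumption (the helper name is mine, not from any library):

def print_unpacked_parameters(model):
    # Assumption: each torch.uint8 element holds two packed 4-bit weights,
    # so count it as two logical parameters; other dtypes count as-is.
    total = 0
    for p in model.parameters():
        if p.dtype == torch.uint8:
            total += p.numel() * 2
        else:
            total += p.numel()
    print(f"Unpacked total parameters: {total / 10**6:.4f} M")

print_unpacked_parameters(model)

With the numbers above this would give 39.5052 + 2 * 42.4673 = 124.4398 M, which matches the full-precision count. Is that the right explanation for the difference?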