I have been encountering the following error message for a while:
.to is not supported for 4-bit or 8-bit bitsandbytes models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct dtype.
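For reference, the same error can be reproduced with a minimal sketch like the one below. This is only an illustration under a few assumptions: transformers, accelerate, and bitsandbytes are installed, a CUDA GPU is available, and the model id is just a placeholder.

# Minimal sketch reproducing the error (assumes transformers, accelerate,
# and bitsandbytes are installed and a CUDA GPU is available; the model id
# below is only a placeholder).
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",  # placeholder id; gated models also need token=...
    quantization_config=bnb_config,
    device_map="auto",
)
model.to("cuda")  # raises the ".to is not supported for 4-bit or 8-bit bitsandbytes models" error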
The code in my project that throws the error is as follows:
# Loading an LLM locally
import os
import torch
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
from transformers.utils import is_flash_attn_2_available
# 00. Load the environment variables
load_dotenv()
# 01. Grab the token
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise RuntimeError("Hugging Face token not found! Please set HF_TOKEN in your .env file.")
# 02. (Optional) Cache it locally so you can also use CLI-backed commands
login(token=hf_token)
# 1. Create a quantization config
use_quantization_config = True  # whether to load the model in 4-bit with bitsandbytes

if use_quantization_config:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
else:
    quantization_config = None
# Flash Attention 2 - a faster attention mechanism (requires a GPU with compute capability 8.0+)
if is_flash_attn_2_available() and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"  # scaled dot product attention
# 2. Pick a model
model_id = "google/gemma-2-2b-it"
# 3. Instantiate the tokenizer (the tokenizer turns text into tokens)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_id,
    token=hf_token,
)
# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    device_map="auto",
    token=hf_token,
    torch_dtype=torch.float16,
    quantization_config=quantization_config if use_quantization_config else None,
    low_cpu_mem_usage=True,
    attn_implementation=attn_implementation,
)
# Move the model to the GPU only when no quantization config was used
# (a quantized model is already placed on the correct devices by device_map="auto")
if not use_quantization_config:
    llm_model.to("cuda")
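To help with debugging, I also print whether the loaded model actually ended up 4-bit quantized and how it was dispatched. This is a small sketch under the assumption that from_pretrained sets the is_loaded_in_4bit and hf_device_map attributes (names can differ between transformers versions, so treat it as illustrative):

# Debug prints (attribute names may vary across transformers versions;
# getattr with a default keeps this safe if they are missing).
print("4-bit quantized:", getattr(llm_model, "is_loaded_in_4bit", False))
print("device map:", getattr(llm_model, "hf_device_map", None))
print("first parameter device:", next(llm_model.parameters()).device)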