I have been encountering the following error message for a while:

".to is not supported for 4-bit or 8-bit bitsandbytes models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct dtype."

and the code that throws the error is as follows:
# Loading an LLM locally
import os

import torch
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
from transformers.utils import is_flash_attn_2_available

# 00. Load the environment variables
load_dotenv()

# 01. Grab the token
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise RuntimeError("Hugging Face token not found! Please set HF_TOKEN in your .env file.")

# 02. (Optional) Cache it locally so you can also use CLI-backed commands
login(token=hf_token)
# 1. Create a quantization config
# NOTE: use_quantization_config is not defined anywhere earlier in this snippet, so I am
# setting it here; the error message talks about 4-bit, so the quantized path was active.
use_quantization_config = True

if use_quantization_config:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
else:
    quantization_config = None
# Flash Attention 2 - faster attention mechanism
# (Flash Attention 2 requires a GPU with compute capability 8.0+)
if is_flash_attn_2_available() and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"  # scaled dot product attention
# 2. Pick a model
model_id = "google/gemma-2-2b-it"

# 3. Instantiate the tokenizer (the tokenizer turns text into tokens)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_id,
    token=hf_token,
)
# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    device_map="auto",
    token=hf_token,
    torch_dtype=torch.float16,
    quantization_config=quantization_config if use_quantization_config else None,
    low_cpu_mem_usage=True,
    attn_implementation=attn_implementation,
)

# This is the only .to() call in the snippet, so I assume it is what the error message is about.
if not use_quantization_config:
    llm_model.to("cuda")
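
For reference, here is a minimal sketch of the guard the error message seems to be asking for. I am assuming that transformers sets is_loaded_in_4bit / is_loaded_in_8bit on bitsandbytes-loaded models (I have not verified this), so I read them with getattr() and fall back to False:

# Sketch only (based on my assumption above): skip the manual .to() whenever the model
# came back quantized, and only move it when CUDA is actually available.
is_quantized = getattr(llm_model, "is_loaded_in_4bit", False) or getattr(llm_model, "is_loaded_in_8bit", False)

if not is_quantized and torch.cuda.is_available():
    llm_model = llm_model.to("cuda")  # plain fp16 model, safe to move
# otherwise: leave the model where device_map="auto" placed it, as the error message suggests

The getattr() defaults are there so the check does not crash on a transformers version that tracks quantization state differently.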