I added offload_folder and offload_state_dict after reading the Hugging Face guide on loading huge models.
def load_llm():
    """
    Load the LLM
    """
    # Model ID
    repo_id = 'meta-llama/Llama-2-7b-chat-hf'
    login(token="hf_xxxxxxxx")
    # Load the model
    model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        device_map='auto',
        load_in_4bit=False,
        token=True,
        offload_folder=r"C:\Users\DHRUV\Desktop\New folder\Law-GPT",
        offload_state_dict=True
    )
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        repo_id,
        use_fast=True
    )
    # Create pipeline
    pipe = pipeline(
        'text-generation',
        model=model,
        tokenizer=tokenizer,
        max_length=512
    )
    # Load the LLM
    llm = HuggingFacePipeline(pipeline=pipe)
    return llm
This is the error I am facing, please help:
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\DHRUV\.cache\huggingface\token
Login successful
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "C:\Users\DHRUV\Desktop\New folder\Law-GPT\app.py", line 5, in <module>
    chain = qa_pipeline()
  File "C:\Users\DHRUV\Desktop\New folder\Law-GPT\utils.py", line 100, in qa_pipeline
    llm = load_llm()
  File "C:\Users\DHRUV\Desktop\New folder\Law-GPT\utils.py", line 44, in load_llm
    model = AutoModelForCausalLM.from_pretrained(
  File "C:\Users\DHRUV\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\transformers\models\auto\auto_factory.py", line 566, in from_pretrained
    return model_class.from_pretrained(
  File "C:\Users\DHRUV\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\transformers\modeling_utils.py", line 3773, in from_pretrained
    dispatch_model(model, **device_map_kwargs)
  File "C:\Users\DHRUV\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\accelerate\big_modeling.py", line 438, in dispatch_model
    raise ValueError(
ValueError: You are trying to offload the whole model to the disk. Please use the `disk_offload` function instead.
I have the same issue trying to load GPT-NeoXT-Chat-Base-20B.
Hopefully someone chimes in here. For now, my best guess is to follow the instruction in the error message.
Find the file `C:\Users\DHRUV\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\accelerate\big_modeling.py`
and on line 436 change `model.to(device)` to `model.disk_offload(device)`.
Let me know if that works. I'm using a Colab notebook and struggling to get the file updated.
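Alternatively, instead of patching the installed accelerate package, it may be possible to call accelerate's disk_offload helper directly, assuming there is enough CPU RAM to load the checkpoint once. A rough, untested sketch (the offload directory name is just a placeholder):
import torch
from transformers import AutoModelForCausalLM
from accelerate import disk_offload

# Untested sketch: load the model without dispatching it, then move its weights to disk
model = AutoModelForCausalLM.from_pretrained(
    'meta-llama/Llama-2-7b-chat-hf',
    torch_dtype=torch.float16
)
disk_offload(model, offload_dir="offload")  # placeholder offload directory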
I fixed it this morning. To resolve this, I made some modifications to the initialization method of my chat model: I added a check to determine whether offloading to disk is necessary and, if so, call the disk_offload function, which resolves the ValueError. Here's a comparison of the original and modified code, with comments:
Original Code - ChatModel init function:
# Set device to GPU with specified id
device = torch.device('cuda', gpu_id)
if max_memory is None:
    # Load model onto one device
    self._model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, device_map="auto")
    self._model.to(device)
else:
    # Load model configuration
    config = AutoConfig.from_pretrained(model_name)
    with init_empty_weights():
        # Initialize model with empty weights
        model_from_conf = AutoModelForCausalLM.from_config(config)
    model_from_conf.tie_weights()
    # Create a device_map from max_memory
    device_map = infer_auto_device_map(
        model_from_conf, max_memory=max_memory,
        no_split_module_classes=["GPTNeoXLayer"], dtype="float16"
    )
    # Load the model with the above device_map
    self._model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map=device_map, offload_folder="offload",
        offload_state_dict=True, torch_dtype=torch.float16
    )
self._tokenizer = AutoTokenizer.from_pretrained(model_name)
Modified Code - ChatModel init function:
# Select GPU if available, else CPU
device = torch.device('cuda', gpu_id) if torch.cuda.is_available() else torch.device('cpu')
# Load model configuration
config = AutoConfig.from_pretrained(model_name)
with init_empty_weights():
    # Initialize model with empty weights
    self._model = AutoModelForCausalLM.from_config(config)
# Create device map based on memory constraints
device_map = infer_auto_device_map(
    self._model, max_memory=max_memory, no_split_module_classes=["GPTNeoXLayer"], dtype="float16"
)
# Determine if offloading is needed
needs_offloading = any(device == 'disk' for device in device_map.values())
if needs_offloading:
    # Load model for offloading
    self._model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map=device_map, offload_folder="offload",
        offload_state_dict=True, torch_dtype=torch.float16
    )
    offload_directory = "../offload/"
    # Offload model to disk
    disk_offload(model=self._model, offload_dir=offload_directory)
else:
    # Load model normally to specified device
    self._model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16
    )
    self._model.to(device)
# Initialize tokenizer
self._tokenizer = AutoTokenizer.from_pretrained(model_name)
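For completeness, these are the imports the snippet relies on (they aren't shown in the post above, so this is my reconstruction):
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from accelerate import disk_offload, infer_auto_device_map, init_empty_weights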
Has this been resolved? I'm trying to download Llama-3.1-70B-Instruct (total size: ~150 GiB) locally and was facing the same issue. I was able to get past it by offloading the entire model to disk:
device_map = infer_auto_device_map(
    model,
    max_memory={"disk": "150GiB"},  # Adjust based on available RAM
    no_split_module_classes=["LlamaDecoderLayer"]  # Ensures correct model partitioning
)
However, when I run the script below -
def download_model(model_name: str, output_dir: str, use_auth: bool = False, force: bool = False):
    if model_name not in MODEL_CONFIGS:
        raise ValueError(f"Model {model_name} not found in configurations")
    config = MODEL_CONFIGS[model_name]
    model_id = config["huggingface_id"]
    local_path = config["path"]
    # Offload directory for large models
    offload_path = os.path.join(local_path, "offload")
    # Check if model already exists
    if os.path.exists(local_path):
        if not force:
            print(f"\n✓ {config['name']} already exists at {local_path}")
            return
        else:
            print(f"\nForce flag set - removing existing model at {local_path}")
            shutil.rmtree(local_path)
    os.makedirs(offload_path, exist_ok=True)
    # Create output directory
    model_output_dir = os.path.join(output_dir, model_name)
    os.makedirs(model_output_dir, exist_ok=True)
    try:
        # Download tokenizer
        print("Downloading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            use_fast=True,
            trust_remote_code=True
        )
        tokenizer.save_pretrained(local_path)
        print("✓ Tokenizer downloaded successfully")
        # Download model with disk offloading
        print("\nDownloading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,  # Use float16 to reduce memory usage
            trust_remote_code=True
        )
        # Automatically offload large model parts to disk
        device_map = infer_auto_device_map(
            model,
            max_memory={"disk": "150GiB"},
            no_split_module_classes=["LlamaDecoderLayer"]  # Ensures correct model partitioning
        )
        # Dispatch model to offload components to disk
        model = dispatch_model(
            model,
            device_map=device_map,
            offload_dir=offload_path,  # Offload excess parts to disk
            offload_state_dict=True  # Ensures early offloading
        )
        # Save model after successful loading
        model.save_pretrained(
            local_path,
            safe_serialization=True,
            max_shard_size="2GB"
        )
        print("✓ Model downloaded successfully")
        print(f"\n✓ {config['name']} has been downloaded and saved to {local_path}")
    except Exception as e:
        print(f"\n✗ Error downloading {config['name']}: {str(e)}")
        if "401" in str(e) and not use_auth:
            print("\nTip: This model might require authentication. Try running with --use-auth")
        raise
I get this error message:
Downloading shards: 100%|██████████| 30/30 [26:55<00:00, 53.85s/it]
Loading checkpoint shards: 33%|███▎      | 10/30 [02:00<04:31, 13.60s/it]
zsh: killed     python Codes/download_models.py --models llama-70b --output-dir Models
/opt/miniconda3/lib/python3.12/multiprocessing/resource_tracker.py:254: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '
Apparently, I'm still running into a bunch of OOM issues. No idea what's going on.
Some info:
Before running the script, I had >300 GiB of free disk space.
After the model was downloaded into the cache, this dropped to ~160 GiB.
Here is the revised version of the script:
def download_model(model_name: str, output_dir: str, use_auth: bool = False, force: bool = False):
    if model_name not in MODEL_CONFIGS:
        raise ValueError(f"Model {model_name} not found in configurations")
    config = MODEL_CONFIGS[model_name]
    model_id = config["huggingface_id"]
    local_path = config["path"]
    # Offload directory for large models
    offload_path = os.path.join(local_path, "offload")
    # Check if model already exists
    if os.path.exists(local_path):
        if not force:
            print(f"\n✓ {config['name']} already exists at {local_path}")
            return
        else:
            print(f"\nForce flag set - removing existing model at {local_path}")
            shutil.rmtree(local_path)
    os.makedirs(offload_path, exist_ok=True)
    # Create output directory
    model_output_dir = os.path.join(output_dir, model_name)
    os.makedirs(model_output_dir, exist_ok=True)
    try:
        # Download tokenizer
        print("Downloading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            use_fast=True,
            trust_remote_code=True
        )
        tokenizer.save_pretrained(local_path)
        print("✓ Tokenizer downloaded successfully")
        # Load model configuration
        model_config = AutoConfig.from_pretrained(model_id)
        with init_empty_weights():
            # Initialize model with empty weights
            model = AutoModelForCausalLM.from_config(model_config)
        # Automatically offload large model parts to disk
        device_map = infer_auto_device_map(
            model,
            max_memory={"disk": "150GiB"},
            no_split_module_classes=["LlamaDecoderLayer"]  # Ensures correct model partitioning
        )
        # Download model with disk offloading
        print("\nDownloading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,  # Use float16 to reduce memory usage
            device_map=device_map,  # <= Without this, 150 GiB of RAM will be consumed during loading
            offload_folder=offload_path,
            offload_state_dict=True,
            trust_remote_code=True
        )
        # Dispatch model to offload components to disk
        model = dispatch_model(
            model,
            device_map=device_map,
            offload_dir=offload_path,  # Offload excess parts to disk
            offload_state_dict=True  # Ensures early offloading
        )
        # Save model after successful loading
        model.save_pretrained(
            local_path,
            safe_serialization=True,
            max_shard_size="2GB"
        )
        print("✓ Model downloaded successfully")
        print(f"\n✓ {config['name']} has been downloaded and saved to {local_path}")
    except Exception as e:
        print(f"\n✗ Error downloading {config['name']}: {str(e)}")
        if "401" in str(e) and not use_auth:
            print("\nTip: This model might require authentication. Try running with --use-auth")
        raise
It turns out a lot of the model shards were being loaded from .cache into RAM, which caused the crash. I fixed it by downloading the model to the cache and just storing it there; you can then rsync it to wherever you want to run inference.
def download_model_to_cache(model_name: str):
    config = MODEL_CONFIGS[model_name]
    model_id = config["huggingface_id"]
    try:
        # Download full model snapshot to cache
        snapshot_download(repo_id=model_id, local_dir=None)
        print("\n✓ Model successfully downloaded to cache!")
    except Exception as e:
        print(f"\n✗ Error downloading {model_id}: {str(e)}")
        raise
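In case it helps anyone following along: once the snapshot has been rsynced to the inference machine, loading it with offloading enabled could look roughly like this (a sketch on my part, not from the posts above; the local path and offload folder are placeholders):
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

local_model_path = "/path/to/rsynced/model"  # placeholder: wherever the snapshot was copied
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    torch_dtype=torch.float16,
    device_map="auto",        # let accelerate split layers across GPU / CPU / disk
    offload_folder="offload"  # placeholder directory for layers that spill to disk
)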