Seems to be working well for me (Windows, not WSL2; Python 3.9).
pip install -U huggingface_hub[hf_xet]
import os
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForCausalLM
# --- CONFIGURATION ---
# Hub repository to fetch, and the absolute path of the local folder
# (resolved relative to the current working directory) that receives it.
REPO_ID = "Efficient-Large-Model/NVILA-8B-Video"
LOCAL_DIR = os.path.abspath(r'model_nvila_8B')

# Fetch the model snapshot only when the target folder does not exist yet.
# NOTE(review): this guard only proves the folder exists, not that every
# file finished downloading; an interrupted run presumably leaves a partial
# folder that would be reported as "already downloaded" -- confirm, and
# consider calling snapshot_download unconditionally if it can resume.
if not os.path.exists(LOCAL_DIR):
    snapshot_download(
        repo_id=REPO_ID,
        repo_type="model",
        local_dir=LOCAL_DIR,
    )
else:
    print("Model already downloaded")
README.md: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 4.29k/4.29k [00:00<00:00, 2.14MB/s]
added_tokens.json: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 194/194 [00:00<00:00, 64.6kB/s]
generation_config.json: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 243/243 [00:00<00:00, 122kB/s]
config.json: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 226k/226k [00:00<00:00, 8.70MB/s]
merges.txt: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1.67M/1.67M [00:00<00:00, 21.1MB/s]
config.json: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 186k/186k [00:00<00:00, 1.19MB/s]
.gitattributes: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1.52k/1.52k [00:00<00:00, 759kB/s]
model.safetensors.index.json: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 27.8k/27.8k [00:00<00:00, 9.25MB/s]
special_tokens_map.json: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 555/555 [00:00<00:00, 277kB/s]
tokenizer_config.json: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 2.30k/2.30k [00:00<00:00, 1.15MB/s]
config.json: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 313/313 [00:00<00:00, 156kB/s]
trainer_state.json: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 128k/128k [00:00<00:00, 860kB/s]
config.json: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 650/650 [00:00<00:00, 217kB/s]
vocab.json: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 3.38M/3.38M [00:00<00:00, 4.32MB/s]
preprocessor_config.json: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 394/394 [00:00<00:00, 131kB/s]
mm_projector/model.safetensors: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 58.8M/58.8M [01:14<00:00, 789kB/s]
llm/model-00001-of-00004.safetensors: 0%| | 3.39M/4.87G [01:11<29:12:40, 46.3kB/s]
llm/model-00003-of-00004.safetensors: 0%| | 2.27M/4.33G [00:38<9:42:44, 124kB/s]
llm/model-00004-of-00004.safetensors: 7%|██████████ | 81.2M/1.09G [01:10<13:31, 1.24MB/s]
llm/model-00004-of-00004.safetensors: 14%|██████████████████ | 148M/1.09G [01:14<06:11, 2.53MB/s]
vision_tower/model.safetensors: 4%|█████ | 30.4M/827M [01:08<1:09:24, 191kB/s]
mm_projector/model.safetensors: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 58.8M/58.8M [01:14<00:00, 789kB/s]