I am trying to download a few models that are very large and contain LFS files. For this I was using Dask for parallelization, but the code fails to download the models and reports that the model name is not available on the Hub. Here is the code I used. Can someone suggest how to download multiple large LFS files in parallel without using git clone, which could be very slow?
import dask
from dask.diagnostics import ProgressBar
import huggingface_hub
import time
import os
import subprocess
from huggingface_hub import hf_hub_download
# Your Hugging Face token (replace with your actual token)
token = "your_actual_token"

# Model IDs
model_ids = ["lysandre/arxiv-nlp", "tiiuae/falcon-180B"]
def download_model(model_id):
    start_time = time.time()
    try:
        # Download the model metadata (including LFS pointers)
        hf_hub_download(repo_id=model_id, token=token, cache_dir='/path/to/your/directory')
        # Change to the directory where the model is downloaded
        model_dir = os.path.join('/path/to/your/directory', model_id)
        os.chdir(model_dir)
        # Run git-lfs fetch to download the LFS files
        subprocess.run(["git", "lfs", "fetch", "--all"], check=True)
        subprocess.run(["git", "lfs", "checkout"], check=True)
    except Exception as e:
        print(f"Error downloading {model_id}: {e}")
    elapsed_time = time.time() - start_time
    print(f"Downloaded {model_id} in {elapsed_time:.2f} seconds")
# Create a list of Dask delayed objects, one per model
model_infos = [dask.delayed(download_model)(model_id) for model_id in model_ids]
# Download models in parallel and track progress
with ProgressBar():
    dask.compute(model_infos)
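
For reference, this is a minimal sketch of the kind of alternative I am looking for, using snapshot_download instead of hf_hub_download plus git-lfs. I am assuming snapshot_download also pulls the LFS-tracked weight files over HTTP; the cache_dir path, token, and max_workers value are placeholders, and I have not verified this on a repo as large as falcon-180B:

import dask
from dask.diagnostics import ProgressBar
from huggingface_hub import snapshot_download

token = "your_actual_token"
model_ids = ["lysandre/arxiv-nlp", "tiiuae/falcon-180B"]

def download_snapshot(model_id):
    # snapshot_download is expected to fetch every file in the repo,
    # including large LFS-tracked weights, without needing git or git-lfs.
    # max_workers (assumed parameter) would control per-repo download concurrency.
    return snapshot_download(
        repo_id=model_id,
        token=token,
        cache_dir="/path/to/your/directory",
        max_workers=8,
    )

tasks = [dask.delayed(download_snapshot)(m) for m in model_ids]
with ProgressBar():
    dask.compute(tasks)

Would something along these lines be the recommended approach, or is there a better way to parallelize multiple large LFS downloads?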