How to use trust_remote_code=True with load_checkpoint_and_dispatch?

Hey, so, I've been trying to run inference with mosaicml's mpt-7b model, using accelerate to split the model across multiple GPUs. I know it's probably not the most common model to do this with, but I need it for my particular use case.

Anyway, the model requires setting trust_remote_code=True or else it throws an error. The trouble is, even when I set that, the error still comes up when I use load_checkpoint_and_dispatch. I wrote my code based almost entirely on this huggingface article.

This is my code:

import accelerate
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
import transformers
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

checkpoint = "C:/Users/Bilbo/Documents/mosaic/model"
config = transformers.AutoConfig.from_pretrained(
        checkpoint,
        trust_remote_code=True
    )

with init_empty_weights():
    model = transformers.AutoModelForCausalLM.from_config(
        config,
        trust_remote_code=True
    )
    
model.tie_weights()

model = load_checkpoint_and_dispatch(
    model, checkpoint, device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer("Hello, my name is", return_tensors="pt")
inputs = inputs.to(0)
output = model.generate(inputs["input_ids"], max_length=50, temperature=0.7)
output_text = tokenizer.decode(output[0].tolist())
print(output_text)

And this is the error I receive when I run it:

ValueError: Loading C:/Users/Bilbo/Documents/mosaic/model requires you to execute the configuration file in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set the option `trust_remote_code=True` to remove this error.

How can I make trust_remote_code=True actually stick when using load_checkpoint_and_dispatch? Or is there a way to split the model across multiple GPUs with accelerate that doesn't require load_checkpoint_and_dispatch?
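
For reference, this is roughly the alternative I have in mind, letting from_pretrained do the sharding itself via device_map="auto" (a minimal sketch, untested with mpt-7b on my machine; the fp16 dtype is just an assumption to make it fit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "C:/Users/Bilbo/Documents/mosaic/model"

# trust_remote_code is passed here, so the custom MPT code is loaded up front,
# and device_map="auto" lets accelerate spread the weights over the available GPUs.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",
    torch_dtype=torch.float16,  # assumption: half precision to fit across the GPUs
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)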

Hi @SDryluth,

I am able to load the model this way:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from accelerate import init_empty_weights, load_checkpoint_and_dispatch


pretrained_model_dir = 'mosaicml/mpt-7b'
pretrained_model_cache_dir = "/home/user/.cache/huggingface/hub/models--mosaicml--mpt-7b/snapshots/d8304854d4877849c3c0a78f3469512a84419e84/"

config = AutoConfig.from_pretrained(pretrained_model_dir, trust_remote_code=True, torch_dtype=torch.float16)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True, torch_dtype=torch.float16)

max_memory = {0: "10GiB", "cpu": "80GiB"}
model = load_checkpoint_and_dispatch(
    model, pretrained_model_cache_dir, device_map="auto", max_memory=max_memory, dtype=torch.float16
)

I only have one GPU with 12GB of VRAM, so I am offloading the rest of the model to the CPU, but perhaps you can modify your max_memory dict to include a 2nd GPU and see if that works.
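
For example, something like this (a rough sketch; the per-device budgets are made up, so adjust them to your actual VRAM):

# Hypothetical budgets for two GPUs plus CPU offload; tune to your cards.
max_memory = {0: "10GiB", 1: "10GiB", "cpu": "80GiB"}
model = load_checkpoint_and_dispatch(
    model, pretrained_model_cache_dir, device_map="auto",
    max_memory=max_memory, dtype=torch.float16
)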

Hi @abhinavkulkarni, I get NoneType after loading the model from disk:


import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch


model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b-instruct", trust_remote_code=True)
model.save_pretrained("./model_path/")

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")
# print("downlaoded tokenizer")
tokenizer.save_pretrained("./model_path/")
# print("saved tokenizer")

model = AutoModelForCausalLM.from_pretrained('./model_path',trust_remote_code=True)
checkpoint ="./model_path"
config = AutoConfig.from_pretrained('./model_path',trust_remote_code=True)

with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config,trust_remote_code=True)

max_memory = {0: "8GiB", "cpu": "4GiB"}
model = load_checkpoint_and_dispatch(
    model, checkpoint, device_map="auto", max_memory=max_memory, offload_folder="offload"
)
model = model.tie_weights()
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(type(model))

Running the above code gives NoneType:

> Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:26<00:00, 68.79s/it]
> The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
> The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
> <class 'NoneType'>

Any idea on how to resolve this? Thanks

Try this and let me know:

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch, infer_auto_device_map

config = AutoConfig.from_pretrained('./model_path', trust_remote_code=True)
# note: attn_config / "triton" is an MPT-style option; Falcon configs may not expose it
config.attn_config["attn_impl"] = "triton"
config.init_device = "cuda:0"

model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b-instruct", config=config, torch_dtype=torch.bfloat16, trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b", padding_side="left")
tokenizer.pad_token_id = tokenizer.eos_token_id
print("downloaded tokenizer")

model.save_pretrained("./model_path/")
tokenizer.save_pretrained("./model_path/")
print("saved tokenizer")

checkpoint = "./model_path"

with init_empty_weights():
    config = AutoConfig.from_pretrained('./model_path', trust_remote_code=True)
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

# tie_weights() works in place and returns None, so don't reassign its result;
# calling it before dispatch also silences the "weights are not tied" warning.
model.tie_weights()

max_memory = {0: "8GiB", "cpu": "4GiB"}
model = load_checkpoint_and_dispatch(
    model, checkpoint, device_map="auto", max_memory=max_memory,
    dtype=torch.bfloat16, offload_folder="offload"
)

tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left")
print(type(model))
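
On the NoneType itself: tie_weights() modifies the model in place and returns None, so the line model = model.tie_weights() is what replaces your model with None. Call it on its own line, ideally before load_checkpoint_and_dispatch. After dispatch, a quick sanity check could look like this (a sketch; the prompt and generation settings are arbitrary):

inputs = tokenizer("Hello, my name is", return_tensors="pt").to(0)  # move inputs to the first GPU
output = model.generate(inputs["input_ids"], max_new_tokens=20)
print(tokenizer.decode(output[0], skip_special_tokens=True))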