Trying inference with the Llama-2-70b-hf model on 2 A100 (80 GB) GPUs but getting errors

I found code here: https://github.com/notrichardren/llama-2-70b-hf-inference (how to do Llama-70B Hugging Face inference, parallelized across multiple GPUs).

I am running the code on a node with 2 A100 (80 GB) GPUs.

This is a slightly modified version of it:

import os
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer, LlamaForCausalLM
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from huggingface_hub import hf_hub_download, snapshot_download
import torch

MODEL_NAME = f"meta-llama/Llama-2-70b-hf"
WEIGHTS_DIR = f"{os.getcwd()}/llama-weights-70b"


# Download model

if not os.path.exists(WEIGHTS_DIR):
    os.system(f"mkdir {WEIGHTS_DIR}")

#checkpoint_location = snapshot_download(MODEL_NAME, local_dir=WEIGHTS_DIR, ignore_patterns=["*.safetensors", "model.safetensors.index.json"], token="my_token_goes_here") # run this if you haven't downloaded the 70b model
checkpoint_location = WEIGHTS_DIR # run this if you have already downloaded the 70b model


# Load model

with init_empty_weights():
    model = LlamaForCausalLM.from_pretrained(checkpoint_location)

model = load_checkpoint_and_dispatch(
    model,
    checkpoint_location,
    device_map="auto",
    offload_folder=WEIGHTS_DIR,
    dtype=torch.float16,
    no_split_module_classes=["LlamaDecoderLayer"],
)
tokenizer = LlamaTokenizer.from_pretrained(checkpoint_location)



# Use model

print(tokenizer.decode(model.generate(**({ k: torch.unsqueeze(torch.tensor(v), 0) for k,v in tokenizer("Hi there, how are you doing?").items()}), max_new_tokens = 20).squeeze()))
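For reference, that last line is a compact version of roughly the following (a sketch only; depending on the setup, the inputs may need to be moved to the first GPU with .to("cuda:0")):

# Expanded form of the one-liner above: tokenize with a batch dimension,
# generate up to 20 new tokens, and decode the result.
inputs = tokenizer("Hi there, how are you doing?", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output_ids.squeeze()))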

When I run it, I get this error (only the last part of the console output):

/opt/conda/conda-bld/pytorch_1695392026823/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [1,0,0], thread: [58,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1695392026823/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [1,0,0], thread: [59,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1695392026823/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [1,0,0], thread: [60,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1695392026823/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [1,0,0], thread: [61,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1695392026823/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [1,0,0], thread: [62,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1695392026823/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [1,0,0], thread: [63,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
Traceback (most recent call last):
  File "/data/user/home/vbachi/llama_2/inference.py", line 39, in <module>
    print(tokenizer.decode(model.generate(**({ k: torch.unsqueeze(torch.tensor(v), 0) for k,v in tokenizer("Hi there, how are you doing?").items()}), max_new_tokens = 20).squeeze()))
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/transformers/generation/utils.py", line 1652, in generate
    return self.sample(
           ^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/transformers/generation/utils.py", line 2734, in sample
    outputs = self(
              ^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 1038, in forward
    outputs = self.model(
              ^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 925, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 635, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
                                                          ^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 351, in forward
    value_states = self.v_proj(hidden_states)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasGemmEx( handle, opa, opb, m, n, k, &falpha, a, CUDA_R_16F, lda, b, CUDA_R_16F, ldb, &fbeta, c, CUDA_R_16F, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)`

If I remove the line:
print(tokenizer.decode(model.generate(**({ k: torch.unsqueeze(torch.tensor(v), 0) for k,v in tokenizer("Hi there, how are you doing?").items()}), max_new_tokens = 20).squeeze()))

I get this output (only the last part of the console output):


  warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.79.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
  warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.79.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
  warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.79.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
  warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.79.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
  warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.norm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
  warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
Loading checkpoint shards:  93%|██████████████▉ | 14/15 [00:56<00:03,  3.80s/it]/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for lm_head.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
  warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
Loading checkpoint shards: 100%|████████████████| 15/15 [00:57<00:00,  3.80s/it]

Question: How can I fix the code so that it works correctly?

Hi there, I'm also encountering this issue. Adding this comment in the hope of encouraging attention and a response.

I also had this problem. Did you manage to solve it?

@magh66
Unfortunately, I have not solved it.

It works for me:

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

# model_path is the local directory holding the downloaded checkpoint
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

# Create an empty (meta) model from the config only; the weights are loaded in the next step
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

# Load the real weights and split the layers across the available GPUs
model = load_checkpoint_and_dispatch(model, model_path,
                                     device_map='auto',
                                     offload_folder="offload",
                                     offload_state_dict=True,
                                     dtype="float16",
                                     no_split_module_classes=["LlamaDecoderLayer"])

@magh66

Thank you.

Could you please also post the part of the code that runs the inference? My inference code:

print(tokenizer.decode(model.generate(**({ k: torch.unsqueeze(torch.tensor(v), 0) for k,v in tokenizer("Hi there, how are you doing?").items()}), max_new_tokens = 20).squeeze()))

gives me the same error.

I used Yi (a fine-tuned model based on LLaMA, not the original LLaMA), but they should work the same way, so you can try it. Here is the repository address, with sample code:
https://github.com/01-ai/Yi
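
The inference part is just the standard Transformers generate call, roughly like this (a minimal sketch, assuming the tokenizer and dispatched model from the snippet above; the prompt is only a placeholder):

# Tokenize on CPU, then move the inputs to the first GPU, where the embedding
# layer usually ends up when device_map="auto" splits the model across GPUs.
inputs = tokenizer("Hi there, how are you doing?", return_tensors="pt").to("cuda:0")

# Generate a short continuation and decode it, dropping special tokens.
output_ids = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))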