I found code here: GitHub - notrichardren/llama-2-70b-hf-inference: How to do llama-70b HuggingFace inference, parallelized across multiple GPUs
I am running the code on a node with two A100 (80 GB) GPUs.
This is a slightly modified version of it:
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer, LlamaForCausalLM
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from huggingface_hub import hf_hub_download, snapshot_download
import torch

MODEL_NAME = "meta-llama/Llama-2-70b-hf"
WEIGHTS_DIR = f"{os.getcwd()}/llama-weights-70b"

# Download model
if not os.path.exists(WEIGHTS_DIR):
    os.system(f"mkdir {WEIGHTS_DIR}")
# checkpoint_location = snapshot_download(MODEL_NAME, local_dir=WEIGHTS_DIR, ignore_patterns=["*.safetensors", "model.safetensors.index.json"], token="my_token_goes_here")  # run this if you haven't downloaded the 70b model yet
checkpoint_location = WEIGHTS_DIR  # run this if you have already downloaded the model

# Load model
with init_empty_weights():
    model = LlamaForCausalLM.from_pretrained(checkpoint_location)

model = load_checkpoint_and_dispatch(
    model,
    checkpoint_location,
    device_map="auto",
    offload_folder=WEIGHTS_DIR,
    dtype=torch.float16,
    no_split_module_classes=["LlamaDecoderLayer"],
)

tokenizer = LlamaTokenizer.from_pretrained(checkpoint_location)

# Use model
print(tokenizer.decode(model.generate(**({k: torch.unsqueeze(torch.tensor(v), 0) for k, v in tokenizer("Hi there, how are you doing?").items()}), max_new_tokens=20).squeeze()))
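For reference, that last line should be equivalent to this more conventional form (same tokenizer and model objects, just written out step by step):

inputs = tokenizer("Hi there, how are you doing?", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output_ids.squeeze()))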
When I run it, I get this error (only the last part of the console output):
/opt/conda/conda-bld/pytorch_1695392026823/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [1,0,0], thread: [58,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1695392026823/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [1,0,0], thread: [59,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1695392026823/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [1,0,0], thread: [60,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1695392026823/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [1,0,0], thread: [61,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1695392026823/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [1,0,0], thread: [62,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1695392026823/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [1,0,0], thread: [63,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
Traceback (most recent call last):
File "/data/user/home/vbachi/llama_2/inference.py", line 39, in <module>
print(tokenizer.decode(model.generate(**({ k: torch.unsqueeze(torch.tensor(v), 0) for k,v in tokenizer("Hi there, how are you doing?").items()}), max_new_tokens = 20).squeeze()))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/transformers/generation/utils.py", line 1652, in generate
return self.sample(
^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/transformers/generation/utils.py", line 2734, in sample
outputs = self(
^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 1038, in forward
outputs = self.model(
^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 925, in forward
layer_outputs = decoder_layer(
^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 635, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 351, in forward
value_states = self.v_proj(hidden_states)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/linear.py", line 114, in forward
return F.linear(input, self.weight, self.bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasGemmEx( handle, opa, opb, m, n, k, &falpha, a, CUDA_R_16F, lda, b, CUDA_R_16F, ldb, &fbeta, c, CUDA_R_16F, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)`
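If I understand the output correctly, the CUBLAS_STATUS_EXECUTION_FAILED may just be a downstream symptom: the device-side asserts above ("index out of bounds") fire first, and my understanding is that this kind of assert usually means an out-of-range lookup (for example, token ids larger than the embedding table) or weights that were never actually materialized. A sanity check I could add before generate (a sketch using the same tokenizer and model; CUDA_LAUNCH_BLOCKING has to be set before the process initializes CUDA, so on the command line rather than in the script):

# Run as: CUDA_LAUNCH_BLOCKING=1 python inference.py
# so the failing kernel is reported at the call that actually caused it.
inputs = tokenizer("Hi there, how are you doing?", return_tensors="pt")
print("max token id:", inputs["input_ids"].max().item())
print("vocab size:", model.config.vocab_size)  # every id must be < vocab_size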
If I remove this line:

print(tokenizer.decode(model.generate(**({k: torch.unsqueeze(torch.tensor(v), 0) for k, v in tokenizer("Hi there, how are you doing?").items()}), max_new_tokens=20).squeeze()))

I get this output instead (only the last part of the console output):
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.79.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.79.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.79.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.79.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.norm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
Loading checkpoint shards:  93%|██████████████▉ | 14/15 [00:56<00:03,  3.80s/it]/home/vbachi/.conda/envs/hf_env/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for lm_head.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
Loading checkpoint shards: 100%|████████████████| 15/15 [00:57<00:00,  3.80s/it]
Question: How can I fix the code so that it works correctly?
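My current suspicion is that the meta-parameter warnings above are the real problem: calling LlamaForCausalLM.from_pretrained() inside init_empty_weights() leaves every parameter on the meta device, so copying the checkpoint weights into the model becomes a no-op and generation then runs on uninitialized memory. Would switching to the pattern from the accelerate big-model-inference docs, which builds the empty model from the config only, be the right fix? A sketch of what I mean (untested on my node):

from transformers import AutoConfig, AutoModelForCausalLM

# Build the skeleton from the config alone; no weights are allocated here.
config = AutoConfig.from_pretrained(checkpoint_location)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

# Let accelerate materialize the real weights and shard them across the two GPUs.
model = load_checkpoint_and_dispatch(
    model,
    checkpoint_location,
    device_map="auto",
    dtype=torch.float16,
    no_split_module_classes=["LlamaDecoderLayer"],
)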