I want to deploy the TheBloke/Llama-2-7b-Chat-GPTQ model on SageMaker, but the deployment fails with the error below.
This is the code I'm running in a SageMaker notebook instance:
import sagemaker
import boto3

sess = sagemaker.Session()
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    # fall back to the session's default bucket if none was given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

import json
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# sagemaker config
instance = "ml.g4dn.xlarge"
number_of_gpus = 1
health_check_timeout = 1000

# Define Model and Endpoint Configuration
hub = {
    'HF_MODEL_ID': 'TheBloke/Llama-2-7b-Chat-GPTQ',
    'SM_NUM_GPUS': json.dumps(1),
    'MAX_TOTAL_TOKEN': json.dumps(5000),
    'HUGGING_FACE_HUB_TOKEN': json.dumps("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")
}

assert hub['HUGGING_FACE_HUB_TOKEN'] != "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", "Please set your Hugging Face Hub token"

# Hugging Face LLM (TGI) container image and model definition
huggingface_model = HuggingFaceModel(
    role=role,
    image_uri=get_huggingface_llm_image_uri("huggingface", version="0.9.3"),
    env=hub,
)

# deploy the model to a real-time endpoint
llm = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type=instance,
    container_startup_health_check_timeout=health_check_timeout,
)
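Once the endpoint is healthy, I plan to query it like this (a minimal sketch; the payload follows the TGI request format with "inputs" and "parameters", and llm is the predictor returned by deploy()):

# Sketch of the call I plan to make once the endpoint is up;
# the payload shape follows the TGI container's request schema.
payload = {
    "inputs": "What is Amazon SageMaker?",
    "parameters": {
        "max_new_tokens": 256,
        "temperature": 0.7,
        "top_p": 0.9,
    },
}
response = llm.predict(payload)
print(response)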
Error:
UnexpectedStatusException: Error hosting endpoint huggingface-pytorch-tgi-inference-2023-08-24-06-51-13-816: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint..
My CloudWatch logs show the following:
RuntimeError: weight model.layers.0.self_attn.q_proj.weight does not exist
2023-08-24T12:42:01.865+05:00 2023-08-24T07:42:01.699855Z ERROR shard-manager: text_generation_launcher: Shard complete standard error output:
You are using a model of type llama to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
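Since the checkpoint is GPTQ-quantized, I suspect the container may need quantization enabled explicitly. Below is a minimal sketch of the hub config I am considering trying next (it assumes the HF_MODEL_QUANTIZE environment variable of the Hugging Face LLM container; I have not verified that this resolves the error):

# Sketch only: same config with GPTQ quantization requested explicitly.
# HF_MODEL_QUANTIZE is assumed to map to the TGI launcher's --quantize flag;
# note that TGI expects MAX_TOTAL_TOKENS (plural), unlike the key used above.
hub = {
    'HF_MODEL_ID': 'TheBloke/Llama-2-7b-Chat-GPTQ',
    'SM_NUM_GPUS': json.dumps(1),
    'MAX_TOTAL_TOKENS': json.dumps(5000),
    'HF_MODEL_QUANTIZE': 'gptq',
    'HUGGING_FACE_HUB_TOKEN': "<my Hugging Face Hub token>"
}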