I was following the article here to train the Llama 2 13B model, but when I try to deploy it with the TGI container, I run into this error:
2023-07-25T19:44:12.555090Z ERROR shard-manager: text_generation_launcher: Error when initializing model
Traceback (most recent call last):
  File "/opt/conda/bin/text-generation-server", line 8, in <module>
    sys.exit(app())
  File "/opt/conda/lib/python3.9/site-packages/typer/main.py", line 311, in __call__
    return get_command(self)(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/click/core.py", line 1130, in __call__
    return self.main(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/typer/core.py", line 778, in main
    return _main(
  File "/opt/conda/lib/python3.9/site-packages/typer/core.py", line 216, in _main
    rv = self.invoke(ctx)
  File "/opt/conda/lib/python3.9/site-packages/click/core.py", line 1657, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/opt/conda/lib/python3.9/site-packages/click/core.py", line 1404, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/opt/conda/lib/python3.9/site-packages/click/core.py", line 760, in invoke
    return __callback(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/typer/main.py", line 683, in wrapper
    return callback(**use_params)  # type: ignore
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/cli.py", line 67, in serve
    server.serve(model_id, revision, sharded, quantize, trust_remote_code, uds_path)
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/server.py", line 155, in serve
    asyncio.run(serve_inner(model_id, revision, sharded, quantize, trust_remote_code))
  File "/opt/conda/lib/python3.9/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/opt/conda/lib/python3.9/asyncio/base_events.py", line 634, in run_until_complete
    self.run_forever()
  File "/opt/conda/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
    self._run_once()
  File "/opt/conda/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once
    handle._run()
  File "/opt/conda/lib/python3.9/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
> File "/opt/conda/lib/python3.9/site-packages/text_generation_server/server.py", line 124, in serve_inner
    model = get_model(model_id, revision, sharded, quantize, trust_remote_code)
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/__init__.py", line 246, in get_model
    return llama_cls(
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/flash_llama.py", line 58, in __init__
    filenames = weight_files(model_id, revision, ".bin")
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/utils/hub.py", line 86, in weight_files
    raise FileNotFoundError(
FileNotFoundError: No local weights found in /opt/ml/model with extension .bin rank=0
2023-07-25T19:44:13.212448Z ERROR text_generation_launcher: Shard 0 failed to start:
Traceback (most recent call last):
  File "/opt/conda/bin/text-generation-server", line 8, in <module>
    sys.exit(app())
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/cli.py", line 67, in serve
    server.serve(model_id, revision, sharded, quantize, trust_remote_code, uds_path)
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/server.py", line 155, in serve
    asyncio.run(serve_inner(model_id, revision, sharded, quantize, trust_remote_code))
  File "/opt/conda/lib/python3.9/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/opt/conda/lib/python3.9/asyncio/base_events.py", line 647, in run_until_complete
    return future.result()
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/server.py", line 124, in serve_inner
    model = get_model(model_id, revision, sharded, quantize, trust_remote_code)
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/__init__.py", line 246, in get_model
    return llama_cls(
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/flash_llama.py", line 58, in __init__
    filenames = weight_files(model_id, revision, ".bin")
  File "/opt/conda/lib/python3.9/site-packages/text_generation_server/utils/hub.py", line 86, in weight_files
    raise FileNotFoundError(
FileNotFoundError: No local weights found in /opt/ml/model with extension .bin
Error: ShardCannotStart
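
Judging from the last frames of the traceback, weight_files simply looks for *.bin weight files inside the local model directory and raises if none are found. A rough paraphrase of that behaviour (my own sketch based on the traceback, not the actual TGI source):

from pathlib import Path

def weight_files(model_dir: str, extension: str = ".bin"):
    # Sketch: glob the local model directory for weight files with the
    # given extension and fail if none are found, which is exactly the
    # error the container reports for /opt/ml/model.
    filenames = list(Path(model_dir).glob(f"*{extension}"))
    if not filenames:
        raise FileNotFoundError(
            f"No local weights found in {model_dir} with extension {extension}"
        )
    return filenames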
I use this code to deploy:
import json

import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# retrieve the llm image uri
llm_image = get_huggingface_llm_image_uri(
    "huggingface",
    version="0.8.2"
)

# print ecr image uri
print(f"llm image uri: {llm_image}")

# sagemaker config
role = sagemaker.get_execution_role()
instance_type = "ml.g5.2xlarge"
number_of_gpu = 1
health_check_timeout = 300

# Define Model and Endpoint configuration parameters
config = {
    'HF_MODEL_ID': "/opt/ml/model",
    'SM_NUM_GPUS': json.dumps(number_of_gpu),  # Number of GPUs used per replica
    'MAX_INPUT_LENGTH': json.dumps(2048),      # Max length of input text
    'MAX_TOTAL_TOKENS': json.dumps(4096),      # Max length of the generation (including input text)
    # 'HF_MODEL_QUANTIZE': "bitsandbytes",     # uncomment to quantize
}

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
    model_data="s3://sagemaker-us-east-1-535772764458/huggingface-qlora-2023-07-24-21-27-30-2023-07-24-21-30-34-443/output/model.tar.gz",  # Change to your model path
    role=role,
    image_uri=llm_image,
    env=config
)

llm = llm_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    # volume_size=400,  # If using an instance with local SSD storage, volume_size must be None, e.g. p4 but not p3
    container_startup_health_check_timeout=health_check_timeout,  # 5 minutes to be able to load the model
)
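
For completeness, once the endpoint starts, this is how I would invoke it (a minimal sketch using the standard predict API of the returned predictor; the prompt and generation parameters are placeholders):

response = llm.predict({
    "inputs": "What is Amazon SageMaker?",  # placeholder prompt
    "parameters": {
        "max_new_tokens": 256,              # placeholder generation settings
        "temperature": 0.7,
    },
})
# TGI returns a list with one dict per generated sequence
print(response[0]["generated_text"])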
The trained model is public at the S3 URI above, in case that helps with debugging.
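
To check what the archive actually contains (e.g. pytorch_model*.bin vs. *.safetensors vs. only LoRA adapter files), it can be downloaded and listed like this (a quick sketch; the bucket and key are taken from the model_data path above):

import tarfile

import boto3

# Download the model archive and print its members to see which
# weight files the training job actually saved.
s3 = boto3.client("s3")
s3.download_file(
    "sagemaker-us-east-1-535772764458",
    "huggingface-qlora-2023-07-24-21-27-30-2023-07-24-21-30-34-443/output/model.tar.gz",
    "model.tar.gz",
)
with tarfile.open("model.tar.gz", "r:gz") as tar:
    for member in tar.getmembers():
        print(member.name)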