I am looking to deploy "meta-llama/Llama-3.2-11B-Vision-Instruct" to AWS SageMaker, but I am getting the error below while deploying.
Traceback (most recent call last):
File "/opt/conda/bin/text-generation-server", line 8, in <module>
sys.exit(app())
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/cli.py", line 118, in serve
server.serve(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py", line 297, in serve
asyncio.run(
File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
return future.result()
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py", line 231, in serve_inner
model = get_model(
File "/opt/conda/lib/python3.10/site-packages/text_generation_server/models/__init__.py", line 1117, in get_model
raise ValueError(f"Unsupported model type {model_type}")
SageMaker configuration:
instance_type = "ml.g5.4xlarge"
number_of_gpu = 1
health_check_timeout = 1000
# Define Model and Endpoint configuration parameters
config = {
'HF_MODEL_ID': "meta-llama/Llama-3.2-11B-Vision-Instruct",
‘SM_NUM_GPUS’: json.dumps(number_of_gpu), # Number of GPU used per replica
‘MAX_INPUT_LENGTH’: json.dumps(6000), # Max length of input text
‘MAX_TOTAL_TOKENS’: json.dumps(8192), # Max length of the generation (including input text)
}