TGI version 0.9.3 llama2 13B deployment sagemaker RuntimeError

Hello everyone,

I am currently facing an issue when trying to deploy a fine-tuned Llama-2-13b-hf model using Text Generation Inference (TGI) version 0.9.3. I was able to deploy a model a few weeks ago without any problems, but now I am encountering a RuntimeError during the model initialization process.

The error message I am receiving is: “RuntimeError: weight model.layers.0.self_attn.rotary_emb.inv_freq does not exist”. This error occurs after the model has successfully downloaded the weights and started the sharding process.

Here are the logs for reference:

#033[2m2023-09-05T14:50:28.214647Z#033[0m #033[32m INFO#033[0m #033[2mtext_generation_launcher#033[0m#033[2m:#033[0m Args { model_id: “/opt/ml/model”, revision: None, validation_workers: 2, sharded: None, num_shard: Some(4), quantize: None, dtype: None, trust_remote_code: false, max_concurrent_requests: 128, max_best_of: 2, max_stop_sequences: 4, max_input_length: 3072, max_total_tokens: 4096, waiting_served_ratio: 1.2, max_batch_prefill_tokens: 4096, max_batch_total_tokens: 8192, max_waiting_tokens: 20, hostname: “container-0.local”, port: 8080, shard_uds_path: “/tmp/text-generation-server”, master_addr: “localhost”, master_port: 29500, huggingface_hub_cache: Some(“/tmp”), weights_cache_override: None, disable_custom_kernels: false, json_output: false, otlp_endpoint: None, cors_allow_origin: , watermark_gamma: None, watermark_delta: None, ngrok: false, ngrok_authtoken: None, ngrok_domain: None, ngrok_username: None, ngrok_password: None, env: false }
#033[2m2023-09-05T14:50:28.214680Z#033[0m #033[32m INFO#033[0m #033[2mtext_generation_launcher#033[0m#033[2m:#033[0m Sharding model on 4 processes
#033[2m2023-09-05T14:50:28.214769Z#033[0m #033[32m INFO#033[0m #033[1mdownload#033[0m: #033[2mtext_generation_launcher#033[0m#033[2m:#033[0m Starting download process.
#033[2m2023-09-05T14:50:33.080912Z#033[0m #033[32m INFO#033[0m #033[2mtext_generation_launcher#033[0m#033[2m:#033[0m Files are already present on the host. Skipping download.
#033[2m2023-09-05T14:50:36.622398Z#033[0m #033[32m INFO#033[0m #033[1mdownload#033[0m: #033[2mtext_generation_launcher#033[0m#033[2m:#033[0m Successfully downloaded weights.
#033[2m2023-09-05T14:50:36.622764Z#033[0m #033[32m INFO#033[0m #033[1mshard-manager#033[0m: #033[2mtext_generation_launcher#033[0m#033[2m:#033[0m Starting shard 0 #033[2m#033[3mrank#033[0m#033[2m=#033[0m0#033[0m
#033[2m2023-09-05T14:50:36.622853Z#033[0m #033[32m INFO#033[0m #033[1mshard-manager#033[0m: #033[2mtext_generation_launcher#033[0m#033[2m:#033[0m Starting shard 2 #033[2m#033[3mrank#033[0m#033[2m=#033[0m2#033[0m
#033[2m2023-09-05T14:50:36.622927Z#033[0m #033[32m INFO#033[0m #033[1mshard-manager#033[0m: #033[2mtext_generation_launcher#033[0m#033[2m:#033[0m Starting shard 1 #033[2m#033[3mrank#033[0m#033[2m=#033[0m1#033[0m
#033[2m2023-09-05T14:50:36.623672Z#033[0m #033[32m INFO#033[0m #033[1mshard-manager#033[0m: #033[2mtext_generation_launcher#033[0m#033[2m:#033[0m Starting shard 3 #033[2m#033[3mrank#033[0m#033[2m=#033[0m3#033[0m
#033[2m2023-09-05T14:50:41.184062Z#033[0m #033[31mERROR#033[0m #033[2mtext_generation_launcher#033[0m#033[2m:#033[0m Error when initializing model
Traceback (most recent call last):
File “/opt/conda/bin/text-generation-server”, line 8, in
sys.exit(app())
File “/opt/conda/lib/python3.9/site-packages/typer/main.py”, line 311, in call
return get_command(self)(*args, **kwargs)
File “/opt/conda/lib/python3.9/site-packages/click/core.py”, line 1130, in call
return self.main(*args, **kwargs)
File “/opt/conda/lib/python3.9/site-packages/typer/core.py”, line 778, in main
return _main(
File “/opt/conda/lib/python3.9/site-packages/typer/core.py”, line 216, in _main
rv = self.invoke(ctx)
File “/opt/conda/lib/python3.9/site-packages/click/core.py”, line 1657, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File “/opt/conda/lib/python3.9/site-packages/click/core.py”, line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File “/opt/conda/lib/python3.9/site-packages/click/core.py”, line 760, in invoke
return __callback(*args, **kwargs)
File “/opt/conda/lib/python3.9/site-packages/typer/main.py”, line 683, in wrapper
return callback(**use_params) # type: ignore
File “/opt/conda/lib/python3.9/site-packages/text_generation_server/cli.py”, line 78, in serve
server.serve(
File “/opt/conda/lib/python3.9/site-packages/text_generation_server/server.py”, line 175, in serve
asyncio.run(
File “/opt/conda/lib/python3.9/asyncio/runners.py”, line 44, in run
return loop.run_until_complete(main)
File “/opt/conda/lib/python3.9/asyncio/base_events.py”, line 634, in run_until_complete
self.run_forever()
File “/opt/conda/lib/python3.9/asyncio/base_events.py”, line 601, in run_forever
self._run_once()
File “/opt/conda/lib/python3.9/asyncio/base_events.py”, line 1905, in _run_once
handle._run()
File “/opt/conda/lib/python3.9/asyncio/events.py”, line 80, in _run
self._context.run(self._callback, *self._args)

File “/opt/conda/lib/python3.9/site-packages/text_generation_server/server.py”, line 142, in serve_inner
model = get_model(
File “/opt/conda/lib/python3.9/site-packages/text_generation_server/models/init.py”, line 185, in get_model
return FlashLlama(
File “/opt/conda/lib/python3.9/site-packages/text_generation_server/models/flash_llama.py”, line 65, in init
model = FlashLlamaForCausalLM(config, weights)
File “/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_llama_modeling.py”, line 452, in init
self.model = FlashLlamaModel(config, weights)
File “/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_llama_modeling.py”, line 390, in init
[
File “/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_llama_modeling.py”, line 391, in
FlashLlamaLayer(
File “/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_llama_modeling.py”, line 326, in init
self.self_attn = FlashLlamaAttention(
File “/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_llama_modeling.py”, line 183, in init
self.rotary_emb = PositionRotaryEmbedding.load(
File “/opt/conda/lib/python3.9/site-packages/text_generation_server/utils/layers.py”, line 395, in load
inv_freq = weights.get_tensor(f"{prefix}.inv_freq")
File “/opt/conda/lib/python3.9/site-packages/text_generation_server/utils/weights.py”, line 62, in get_tensor
filename, tensor_name = self.get_filename(tensor_name)
File “/opt/conda/lib/python3.9/site-packages/text_generation_server/utils/weights.py”, line 49, in get_filename
raise RuntimeError(f"weight {tensor_name} does not exist")
RuntimeError: weight model.layers.0.self_attn.rotary_emb.inv_freq does not exist
#033[2m2023-09-05T14:50:41.184184Z#033[0m #033[31mERROR#033[0m #033[2mtext_generation_launcher#033[0m#033[2m:#033[0m Error when initializing model

I would appreciate any guidance or suggestions on how to resolve this issue. Thank you in advance for your help!

Best,
Jorge

1 Like

Same problem here — were you able to deploy it with another method?

The problem is an incompatibility between TGI version 0.9.3 and transformers 4.32 or later: newer transformers versions no longer save the `rotary_emb.inv_freq` buffer in the checkpoint, but TGI 0.9.3 still expects to load it.

I executed the training using:

transformers==4.31.0

and it deployed successfully with TGI 0.9.3.