I’m also facing the same issue. Deploying Meta-Llama-3.1-8B-Instruct on a TGI endpoint fails at model initialization: the shard cannot start because the bundled transformers release rejects the model’s new `llama3`-style `rope_scaling` config.

Log:
- 2024-07-31T09:16:57.356+00:00 {"timestamp":"2024-07-31T09:16:57.355971Z","level":"INFO","fields":{"message":"Args {\n model_id: \"/repository\",\n revision: None,\n validation_workers: 2,\n sharded: None,\n num_shard: None,\n quantize: None,\n speculate: None,\n dtype: None,\n trust_remote_code: false,\n max_concurrent_requests: 128,\n max_best_of: 2,\n max_stop_sequences: 4,\n max_top_n_tokens: 5,\n max_input_tokens: None,\n max_input_length: Some(\n 1024,\n ),\n max_total_tokens: Some(\n 1512,\n ),\n waiting_served_ratio: 0.3,\n max_batch_prefill_tokens: Some(\n 2048,\n ),\n max_batch_total_tokens: None,\n max_waiting_tokens: 20,\n max_batch_size: None,\n cuda_graphs: None,\n hostname: \"r-antony-pk-meta-llama-3-1-8b-instruct-j-gsh-o7668i4x-62b4f-mup\",\n port: 80,\n shard_uds_path: \"/tmp/text-generation-server\",\n master_addr: \"localhost\",\n master_port: 29500,\n huggingface_hub_cache: Some(\n \"/repository/cache\",\n ),\n weights_cache_override: None,\n disable_custom_kernels: false,\n cuda_memory_fraction: 1.0,\n rope_scaling: None,\n rope_factor: None,\n json_output: true,\n otlp_endpoint: None,\n cors_allow_origin: [],\n watermark_gamma: None,\n watermark_delta: None,\n ngrok: false,\n ngrok_authtoken: None,\n ngrok_edge: None,\n tokenizer_config_path: None,\n disable_grammar_support: false,\n env: false,\n max_client_batch_size: 4,\n}"},"target":"text_generation_launcher"}
- 2024-07-31T09:16:57.356+00:00 {"timestamp":"2024-07-31T09:16:57.356183Z","level":"INFO","fields":{"message":"Using default cuda graphs [1, 2, 4, 8, 16, 32]"},"target":"text_generation_launcher"}
- 2024-07-31T09:16:57.356+00:00 {"timestamp":"2024-07-31T09:16:57.356256Z","level":"INFO","fields":{"message":"Starting download process."},"target":"text_generation_launcher","span":{"name":"download"},"spans":[{"name":"download"}]}
- 2024-07-31T09:16:59.691+00:00 {"timestamp":"2024-07-31T09:16:59.691502Z","level":"INFO","fields":{"message":"Files are already present on the host. Skipping download.\n"},"target":"text_generation_launcher"}
- 2024-07-31T09:17:00.159+00:00 {"timestamp":"2024-07-31T09:17:00.159052Z","level":"INFO","fields":{"message":"Successfully downloaded weights."},"target":"text_generation_launcher","span":{"name":"download"},"spans":[{"name":"download"}]}
- 2024-07-31T09:17:00.159+00:00 {"timestamp":"2024-07-31T09:17:00.159194Z","level":"INFO","fields":{"message":"Starting shard"},"target":"text_generation_launcher","span":{"rank":0,"name":"shard-manager"},"spans":[{"rank":0,"name":"shard-manager"}]}
- 2024-07-31T09:17:03.265+00:00 {"timestamp":"2024-07-31T09:17:03.265587Z","level":"ERROR","fields":{"message":"Error when initializing model\nTraceback (most recent call last):\n File \"/opt/conda/bin/text-generation-server\", line 8, in <module>\n sys.exit(app())\n File \"/opt/conda/lib/python3.10/site-packages/typer/main.py\", line 311, in __call__\n return get_command(self)(*args, **kwargs)\n File \"/opt/conda/lib/python3.10/site-packages/click/core.py\", line 1157, in __call__\n return self.main(*args, **kwargs)\n File \"/opt/conda/lib/python3.10/site-packages/typer/core.py\", line 778, in main\n return _main(\n File \"/opt/conda/lib/python3.10/site-packages/typer/core.py\", line 216, in _main\n rv = self.invoke(ctx)\n File \"/opt/conda/lib/python3.10/site-packages/click/core.py\", line 1688, in invoke\n return _process_result(sub_ctx.command.invoke(sub_ctx))\n File \"/opt/conda/lib/python3.10/site-packages/click/core.py\", line 1434, in invoke\n return ctx.invoke(self.callback, **ctx.params)\n File \"/opt/conda/lib/python3.10/site-packages/click/core.py\", line 783, in invoke\n return __callback(*args, **kwargs)\n File \"/opt/conda/lib/python3.10/site-packages/typer/main.py\", line 683, in wrapper\n return callback(**use_params) # type: ignore\n File \"/opt/conda/lib/python3.10/site-packages/text_generation_server/cli.py\", line 90, in serve\n server.serve(\n File \"/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py\", line 253, in serve\n asyncio.run(\n File \"/opt/conda/lib/python3.10/asyncio/runners.py\", line 44, in run\n return loop.run_until_complete(main)\n File \"/opt/conda/lib/python3.10/asyncio/base_events.py\", line 636, in run_until_complete\n self.run_forever()\n File \"/opt/conda/lib/python3.10/asyncio/base_events.py\", line 603, in run_forever\n self._run_once()\n File \"/opt/conda/lib/python3.10/asyncio/base_events.py\", line 1909, in _run_once\n handle._run()\n File \"/opt/conda/lib/python3.10/asyncio/events.py\", line 80, in _run\n self._context.run(self._callback, *self._args)\n> File \"/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py\", line 217, in serve_inner\n model = get_model(\n File \"/opt/conda/lib/python3.10/site-packages/text_generation_server/models/__init__.py\", line 333, in get_model\n return FlashLlama(\n File \"/opt/conda/lib/python3.10/site-packages/text_generation_server/models/flash_llama.py\", line 70, in __init__\n config = AutoConfig.from_pretrained(\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py\", line 952, in from_pretrained\n return config_class.from_dict(config_dict, **unused_kwargs)\n File \"/opt/conda/lib/python3.10/site-packages/transformers/configuration_utils.py\", line 761, in from_dict\n config = cls(**config_dict)\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/llama/configuration_llama.py\", line 161, in __init__\n self._rope_scaling_validation()\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/llama/configuration_llama.py\", line 181, in _rope_scaling_validation\n raise ValueError(\nValueError: `rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}\n"},"target":"text_generation_launcher"}
- 2024-07-31T09:17:03.963+00:00 {"timestamp":"2024-07-31T09:17:03.962934Z","level":"ERROR","fields":{"message":"Shard complete standard error output:\n\nThe tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \nThe tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. \nThe class this function is called from is 'LlamaTokenizer'.\nYou are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\nSpecial tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\nTraceback (most recent call last):\n\n File \"/opt/conda/bin/text-generation-server\", line 8, in <module>\n sys.exit(app())\n\n File \"/opt/conda/lib/python3.10/site-packages/text_generation_server/cli.py\", line 90, in serve\n server.serve(\n\n File \"/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py\", line 253, in serve\n asyncio.run(\n\n File \"/opt/conda/lib/python3.10/asyncio/runners.py\", line 44, in run\n return loop.run_until_complete(main)\n\n File \"/opt/conda/lib/python3.10/asyncio/base_events.py\", line 649, in run_until_complete\n return future.result()\n\n File \"/opt/conda/lib/python3.10/site-packages/text_generation_server/server.py\", line 217, in serve_inner\n model = get_model(\n\n File \"/opt/conda/lib/python3.10/site-packages/text_generation_server/models/__init__.py\", line 333, in get_model\n return FlashLlama(\n\n File \"/opt/conda/lib/python3.10/site-packages/text_generation_server/models/flash_llama.py\", line 70, in __init__\n config = AutoConfig.from_pretrained(\n\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py\", line 952, in from_pretrained\n return config_class.from_dict(config_dict, **unused_kwargs)\n\n File \"/opt/conda/lib/python3.10/site-packages/transformers/configuration_utils.py\", line 761, in from_dict\n config = cls(**config_dict)\n\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/llama/configuration_llama.py\", line 161, in __init__\n self._rope_scaling_validation()\n\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/llama/configuration_llama.py\", line 181, in _rope_scaling_validation\n raise ValueError(\n\nValueError: `rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}\n"},"target":"text_generation_launcher","span":{"rank":0,"name":"shard-manager"},"spans":[{"rank":0,"name":"shard-manager"}]}
- 2024-07-31T09:17:04.062+00:00 {"timestamp":"2024-07-31T09:17:04.062178Z","level":"ERROR","fields":{"message":"Shard 0 failed to start"},"target":"text_generation_launcher"}
- 2024-07-31T09:17:04.062+00:00 {"timestamp":"2024-07-31T09:17:04.062193Z","level":"INFO","fields":{"message":"Shutting down shards"},"target":"text_generation_launcher"}
- 2024-07-31T09:17:04.062+00:00 Error: ShardCannotStart
- 2024-07-31T09:17:11.470+00:00 → 09:17:18.076+00:00: the launcher automatically retries with identical arguments and fails the same way (same `rope_scaling` ValueError, ending in `Error: ShardCannotStart`).
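The root cause is visible in the traceback: `AutoConfig.from_pretrained` reaches `_rope_scaling_validation`, which in this transformers release only accepts a two-field `{type, factor}` dictionary, while Llama 3.1’s `config.json` ships the extended `llama3` schema (`rope_type`, `factor`, `high_freq_factor`, `low_freq_factor`, `original_max_position_embeddings`). A minimal sketch to confirm this locally, assuming the weights sit at `/repository` as in the log, and assuming 4.43.0 as the first transformers release that understands the new schema (my understanding, not something stated in the log):

```python
import json

import transformers
from packaging import version  # packaging is already a transformers dependency

# Inspect the rope_scaling block that the validator rejects.
# Path taken from the launcher log; adjust to wherever the weights live.
with open("/repository/config.json") as f:
    rope_scaling = json.load(f)["rope_scaling"]
print(rope_scaling)
# -> {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0,
#     'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}

# The 'llama3' rope_type is only understood by newer transformers releases;
# older ones raise the ValueError seen above while parsing the config.
if version.parse(transformers.__version__) < version.parse("4.43.0"):
    print(f"transformers {transformers.__version__} likely predates "
          "llama3 rope scaling; an upgrade is probably needed")
```

If the installed version is too old, moving to a newer TGI image (one that bundles a transformers release able to parse the `llama3` schema) should let the shard start. Editing `rope_scaling` in `config.json` back to the old two-field form is a commonly suggested stopgap, though it changes how long contexts are scaled.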