Cannot get tools to work: InferenceClient + hf-inference + Qwen/Qwen3-235B-A22B -- Internal Server Error

I’m trying to get an existing app (OpenAI and Gemini both work well) to run on open-weight models, and I keep failing. I have now distilled a minimal example that works with gpt-4.1-mini but not with Qwen3.

import openai

client = openai.Client()
MODEL = "gpt-4.1-mini"

messages = [
    {"role": "user", "content": "You are a shopping assistant for a store. You can help pick the right products for the user."},
    {"role": "user", "content": "I'm looking for a T-shirt"}
]

dummy_tools = [{
    "type": "function",
    "function": {
        "name": "get_products",
        "description": "Search for products. Useful if someone needs clothing.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The query to look up products for."
                }
            },
            "required": ["query"],
            "additionalProperties": False
        },
        "strict": True
    }
}]
# First turn: let the model decide whether to call the tool.
r = client.chat.completions.create(model=MODEL, tools=dummy_tools, messages=messages)
tcs = []
for tc in r.choices[0].message.tool_calls:
    tcs.append({
        "id": tc.id,
        "type": tc.type,
        "function": {
            "name": tc.function.name,
            "arguments": tc.function.arguments,
        }
    })
messages.append({"role": "assistant", "tool_calls": tcs})
# fake it for brevity
messages.append({"role": "tool", "tool_call_id": tcs[0]["id"], "content": "Product 1: Blue T-Shirt\nProduct 2: Red Hoody."})
for m in messages:
    print(m)
print("-----------")
# Second turn: let the model answer from the tool result.
r = client.chat.completions.create(model=MODEL, messages=messages)
print(r.choices[0])

works and prints:

{'role': 'user', 'content': 'You are a shopping assistant for a store. You can help pick the right products for the user.'}
{'role': 'user', 'content': "I'm looking for a T-shirt"}
{'role': 'assistant', 'tool_calls': [{'id': 'call_b7Gp98ZGcdv6TSbAlgrZC8Sq', 'type': 'function', 'function': {'name': 'get_products', 'arguments': '{"query":"T-shirt"}'}}]}
{'role': 'tool', 'tool_call_id': 'call_b7Gp98ZGcdv6TSbAlgrZC8Sq', 'content': 'Product 1: Blue T-Shirt\nProduct 2: Red Hoody.'}
-----------
Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='I found a Blue T-Shirt for you. Would you like more options or details about this one?', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))

Meanwhile:

import os
from huggingface_hub import InferenceClient

client = InferenceClient(
    provider="hf-inference",
    api_key=os.environ["HF_TOKEN"],
)
MODEL = "Qwen/Qwen3-235B-A22B"

messages = [
    {"role": "user", "content": "You are a shopping assistant for a store. You can help pick the right products for the user."},
    {"role": "user", "content": "I'm looking for a T-shirt"}
]

dummy_tools = [{
    "type": "function",
    "function": {
        "name": "get_products",
        "description": "Search for products. Useful if someone needs clothing.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The query to look up products for."
                }
            },
            "required": ["query"],
            "additionalProperties": False
        },
        "strict": True
    }
}]
# First turn: let the model decide whether to call the tool.
r = client.chat.completions.create(model=MODEL, tools=dummy_tools, messages=messages)
tcs = []
for tc in r.choices[0].message.tool_calls:
    tcs.append({
        "id": tc.id,
        "type": tc.type,
        "function": {
            "name": tc.function.name,
            "arguments": tc.function.arguments,
        }
    })
messages.append({"role": "assistant", "tool_calls": tcs})
# fake it for brevity
messages.append({"role": "tool", "tool_call_id": tcs[0]["id"], "content": "Product 1: Blue T-Shirt\nProduct 2: Red Hoody."})
for m in messages:
    print(m)
print("-----------")
# Second turn: let the model answer from the tool result.
r = client.chat.completions.create(model=MODEL, messages=messages)
print(r.choices[0])

fails with:

---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
File ~/micromamba/envs/strauss_rag_202505/lib/python3.13/site-packages/huggingface_hub/utils/_http.py:409, in hf_raise_for_status(response, endpoint_name)
    408 try:
--> 409     response.raise_for_status()
    410 except HTTPError as e:

File ~/micromamba/envs/strauss_rag_202505/lib/python3.13/site-packages/requests/models.py:1024, in Response.raise_for_status(self)
   1023 if http_error_msg:
-> 1024     raise HTTPError(http_error_msg, response=self)

HTTPError: 500 Server Error: Internal Server Error for url: https://router.huggingface.co/hf-inference/models/Qwen/Qwen3-235B-A22B/v1/chat/completions

The above exception was the direct cause of the following exception:

HfHubHTTPError                            Traceback (most recent call last)
Cell In[107], line 52
     50     print(m)
     51 print("-----------")
---> 52 r = client.chat.completions.create(model=MODEL, messages=messages)
     53 print(r.choices[0])

File ~/micromamba/envs/strauss_rag_202505/lib/python3.13/site-packages/huggingface_hub/inference/_client.py:924, in InferenceClient.chat_completion(self, messages, model, stream, frequency_penalty, logit_bias, logprobs, max_tokens, n, presence_penalty, response_format, seed, stop, stream_options, temperature, tool_choice, tool_prompt, tools, top_logprobs, top_p, extra_body)
    896 parameters = {
    897     "model": payload_model,
    898     "frequency_penalty": frequency_penalty,
   (...)    915     **(extra_body or {}),
    916 }
    917 request_parameters = provider_helper.prepare_request(
    918     inputs=messages,
    919     parameters=parameters,
   (...)    922     api_key=self.token,
    923 )
--> 924 data = self._inner_post(request_parameters, stream=stream)
    926 if stream:
    927     return _stream_chat_completion_response(data)  # type: ignore[arg-type]

File ~/micromamba/envs/strauss_rag_202505/lib/python3.13/site-packages/huggingface_hub/inference/_client.py:280, in InferenceClient._inner_post(self, request_parameters, stream)
    277         raise InferenceTimeoutError(f"Inference call timed out: {request_parameters.url}") from error  # type: ignore
    279 try:
--> 280     hf_raise_for_status(response)
    281     return response.iter_lines() if stream else response.content
    282 except HTTPError as error:

File ~/micromamba/envs/strauss_rag_202505/lib/python3.13/site-packages/huggingface_hub/utils/_http.py:482, in hf_raise_for_status(response, endpoint_name)
    478     raise _format(HfHubHTTPError, message, response) from e
    480 # Convert `HTTPError` into a `HfHubHTTPError` to display request information
    481 # as well (request id and/or server error message)
--> 482 raise _format(HfHubHTTPError, str(e), response) from e

HfHubHTTPError: 500 Server Error: Internal Server Error for url: https://router.huggingface.co/hf-inference/models/Qwen/Qwen3-235B-A22B/v1/chat/completions (Request ID: Root=1-684c0e94-1b2fcc1112ce97d968f42b89;4a0857fe-92d3-4b59-977c-2c58fee78502)

Unfortunately, I can’t get a better reason out of it than the 500 status code, and I’m not sure whether I’m misusing the API somehow.
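For reference, huggingface_hub raises HfHubHTTPError, which carries the underlying requests.Response, so the response body can at least be dumped in case the server included an error payload. A minimal sketch (it may well print nothing beyond the status line for a bare 500):

from huggingface_hub.utils import HfHubHTTPError

try:
    r = client.chat.completions.create(model=MODEL, messages=messages)
except HfHubHTTPError as e:
    # e.response is the underlying requests.Response; when the server
    # returns a JSON error payload, the body carries more detail than "500".
    print(e.response.status_code, e.response.reason)
    print(e.response.text)
    raise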


Three days later, this works. I assume the “Internal Server Error” actually was an internal error after all. 🙂
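For anyone who hits the same transient 500s before they clear up server-side, a simple retry with exponential backoff is a reasonable guard. A sketch (the helper name and attempt count are my own choices, not part of the InferenceClient API):

import time
from huggingface_hub.utils import HfHubHTTPError

def chat_with_retry(client, max_attempts=4, **kwargs):
    # Retry only on 5xx responses; a 4xx means the request itself is wrong.
    for attempt in range(max_attempts):
        try:
            return client.chat.completions.create(**kwargs)
        except HfHubHTTPError as e:
            status = e.response.status_code if e.response is not None else 0
            if status < 500 or attempt == max_attempts - 1:
                raise
            time.sleep(2 ** attempt)  # back off: 1 s, 2 s, 4 s, ...

r = chat_with_retry(client, model=MODEL, messages=messages)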


Great. Here are some links that may be useful in case of trouble, though ongoing problems are not always visible there.
Server status: https://status.huggingface.co/
Changelog: Changelog - Hugging Face

This topic was automatically closed 12 hours after the last reply. New replies are no longer allowed.