I’m trying to get an existing app (OpenAI and Gemini both work well) to run on open-weight models and keep failing. I have now distilled a minimal example that works on gpt-4.1-mini but fails on Qwen3.
import openai

client = openai.Client()
MODEL = "gpt-4.1-mini"
messages = [
    {"role": "user", "content": "You are a shopping assistant for a store. You can help pick the right products for the user."},
    {"role": "user", "content": "I'm looking for a T-shirt"},
]
dummy_tools = [{
    "type": "function",
    "function": {
        "name": "get_products",
        "description": "Search for products. Useful if someone needs clothing.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The query to look up products for.",
                }
            },
            "required": ["query"],
            "additionalProperties": False,
        },
        "strict": True,
    },
}]
r = client.chat.completions.create(model=MODEL, tools=dummy_tools, messages=messages)
# re-serialize the returned tool calls as plain dicts so they can be sent back
tcs = []
for tc in r.choices[0].message.tool_calls:
    tcs.append({
        "id": tc.id,
        "type": tc.type,
        "function": {
            "name": tc.function.name,
            "arguments": tc.function.arguments,
        },
    })
messages.append({"role": "assistant", "tool_calls": tcs})
# fake the tool result for brevity
messages.append({"role": "tool", "tool_call_id": tcs[0]["id"], "content": "Product 1: Blue T-Shirt\nProduct 2: Red Hoody."})
for m in messages:
    print(m)
print("-----------")
r = client.chat.completions.create(model=MODEL, messages=messages)
print(r.choices[0])
This works and prints:
{'role': 'user', 'content': 'You are a shopping assistant for a store. You can help pick the right products for the user.'}
{'role': 'user', 'content': "I'm looking for a T-shirt"}
{'role': 'assistant', 'tool_calls': [{'id': 'call_b7Gp98ZGcdv6TSbAlgrZC8Sq', 'type': 'function', 'function': {'name': 'get_products', 'arguments': '{"query":"T-shirt"}'}}]}
{'role': 'tool', 'tool_call_id': 'call_b7Gp98ZGcdv6TSbAlgrZC8Sq', 'content': 'Product 1: Blue T-Shirt\nProduct 2: Red Hoody.'}
-----------
Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='I found a Blue T-Shirt for you. Would you like more options or details about this one?', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))
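As an aside, hand-building tcs should be equivalent to dumping the returned message directly; with openai >= 1.x the message is a pydantic model, so the loop above could be replaced with this one-liner (shown only to rule out a serialization difference on my side):

messages.append(r.choices[0].message.model_dump(exclude_none=True))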
Meanwhile, the same flow via huggingface_hub’s InferenceClient:
import os

from huggingface_hub import InferenceClient

client = InferenceClient(
    provider="hf-inference",
    api_key=os.environ["HF_TOKEN"],
)
MODEL = "Qwen/Qwen3-235B-A22B"
messages = [
    {"role": "user", "content": "You are a shopping assistant for a store. You can help pick the right products for the user."},
    {"role": "user", "content": "I'm looking for a T-shirt"},
]
dummy_tools = [{
    "type": "function",
    "function": {
        "name": "get_products",
        "description": "Search for products. Useful if someone needs clothing.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The query to look up products for.",
                }
            },
            "required": ["query"],
            "additionalProperties": False,
        },
        "strict": True,
    },
}]
r = client.chat.completions.create(model=MODEL, tools=dummy_tools, messages=messages)
# re-serialize the returned tool calls as plain dicts so they can be sent back
tcs = []
for tc in r.choices[0].message.tool_calls:
    tcs.append({
        "id": tc.id,
        "type": tc.type,
        "function": {
            "name": tc.function.name,
            "arguments": tc.function.arguments,
        },
    })
messages.append({"role": "assistant", "tool_calls": tcs})
# fake the tool result for brevity
messages.append({"role": "tool", "tool_call_id": tcs[0]["id"], "content": "Product 1: Blue T-Shirt\nProduct 2: Red Hoody."})
for m in messages:
    print(m)
print("-----------")
r = client.chat.completions.create(model=MODEL, messages=messages)
print(r.choices[0])
fails with:
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
File ~/micromamba/envs/strauss_rag_202505/lib/python3.13/site-packages/huggingface_hub/utils/_http.py:409, in hf_raise_for_status(response, endpoint_name)
408 try:
--> 409 response.raise_for_status()
410 except HTTPError as e:
File ~/micromamba/envs/strauss_rag_202505/lib/python3.13/site-packages/requests/models.py:1024, in Response.raise_for_status(self)
1023 if http_error_msg:
-> 1024 raise HTTPError(http_error_msg, response=self)
HTTPError: 500 Server Error: Internal Server Error for url: https://router.huggingface.co/hf-inference/models/Qwen/Qwen3-235B-A22B/v1/chat/completions
The above exception was the direct cause of the following exception:
HfHubHTTPError Traceback (most recent call last)
Cell In[107], line 52
50 print(m)
51 print("-----------")
---> 52 r = client.chat.completions.create(model=MODEL, messages=messages)
53 print(r.choices[0])
File ~/micromamba/envs/strauss_rag_202505/lib/python3.13/site-packages/huggingface_hub/inference/_client.py:924, in InferenceClient.chat_completion(self, messages, model, stream, frequency_penalty, logit_bias, logprobs, max_tokens, n, presence_penalty, response_format, seed, stop, stream_options, temperature, tool_choice, tool_prompt, tools, top_logprobs, top_p, extra_body)
896 parameters = {
897 "model": payload_model,
898 "frequency_penalty": frequency_penalty,
(...) 915 **(extra_body or {}),
916 }
917 request_parameters = provider_helper.prepare_request(
918 inputs=messages,
919 parameters=parameters,
(...) 922 api_key=self.token,
923 )
--> 924 data = self._inner_post(request_parameters, stream=stream)
926 if stream:
927 return _stream_chat_completion_response(data) # type: ignore[arg-type]
File ~/micromamba/envs/strauss_rag_202505/lib/python3.13/site-packages/huggingface_hub/inference/_client.py:280, in InferenceClient._inner_post(self, request_parameters, stream)
277 raise InferenceTimeoutError(f"Inference call timed out: {request_parameters.url}") from error # type: ignore
279 try:
--> 280 hf_raise_for_status(response)
281 return response.iter_lines() if stream else response.content
282 except HTTPError as error:
File ~/micromamba/envs/strauss_rag_202505/lib/python3.13/site-packages/huggingface_hub/utils/_http.py:482, in hf_raise_for_status(response, endpoint_name)
478 raise _format(HfHubHTTPError, message, response) from e
480 # Convert `HTTPError` into a `HfHubHTTPError` to display request information
481 # as well (request id and/or server error message)
--> 482 raise _format(HfHubHTTPError, str(e), response) from e
HfHubHTTPError: 500 Server Error: Internal Server Error for url: https://router.huggingface.co/hf-inference/models/Qwen/Qwen3-235B-A22B/v1/chat/completions (Request ID: Root=1-684c0e94-1b2fcc1112ce97d968f42b89;4a0857fe-92d3-4b59-977c-2c58fee78502)
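The only structural difference I can spot is that my hand-built assistant turn has no "content" key at all (OpenAI’s ChatCompletionMessage carries content=None for tool-call turns), and some chat templates may be stricter about that. Purely a guess on my part, not a confirmed fix, but a variant worth trying would be an explicit empty content:

messages.append({"role": "assistant", "content": "", "tool_calls": tcs})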
Unfortunately, I can’t get a better reason out of this than the 500 status code, and I’m not sure whether I’m misusing the API somehow.
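In case it helps anyone reproduce: a debugging sketch I would use to check whether the response body says more than the bare status line is to POST the same payload directly to the URL from the traceback (assumes HF_TOKEN is set, as above):

import os
import requests

url = "https://router.huggingface.co/hf-inference/models/Qwen/Qwen3-235B-A22B/v1/chat/completions"
resp = requests.post(
    url,
    headers={"Authorization": f"Bearer {os.environ['HF_TOKEN']}"},
    json={"model": MODEL, "messages": messages},
)
print(resp.status_code)
print(resp.text)  # hoping the body contains the actual server-side error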