HuggingFace Inference Endpoint doesn't work

import os
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from typing import TypedDict
from PIL import Image
from dotenv import load_dotenv

load_dotenv()

# Hugging Face API token

HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

endpoint_url = ""  # URL of the dedicated Inference Endpoint

class HarmonyState(TypedDict):
    messages: list
    image: Image.Image
    people: dict

sys_msg = """

"""

template = """

"""

prompt = PromptTemplate.from_template(template)

model = HuggingFaceEndpoint(
    endpoint_url=endpoint_url,
    max_new_tokens=512,
    top_k=15,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.1,
    repetition_penalty=1.03,
)

def call_model(state: HarmonyState) -> dict:
    image = state["image"]
    # Collect the human turns into a single string for the prompt
    conversation = "\n".join([msg.content for msg in state["messages"] if isinstance(msg, HumanMessage)])
    prompt_text = prompt.format(
        conversation=conversation,
        people=str(state["people"]),
    )

    # Fix: Correct the model invocation format
    response = model.invoke(prompt_text)

    return {
        "messages": state["messages"] + [AIMessage(content=response)],
        "image": image,
        "people": state["people"],
    }

workflow = StateGraph(state_schema=HarmonyState)
workflow.add_node("model", call_model)
workflow.set_entry_point("model")
workflow.set_finish_point("model")

memory = MemorySaver()
app = workflow.compile(checkpointer=memory)

def output_of_model(conversation, people):
    image = Image.open("output_image.jpg").convert("RGB")
    initial_state = {
        "messages": [
            SystemMessage(content=sys_msg),
            HumanMessage(content=conversation)
        ],
        "image": image,
        "people": people
    }

    try:
        # Fix: Use the correct method and configuration format
        config = {"configurable": {"thread_id": "session-1"}}
        result = app.invoke(initial_state, config=config)

        print(result["messages"][-1].content)
        return result["messages"][-1].content
    except Exception as e:
        print(f"Oops, something went wrong: {str(e)}")
        return f"Oops, something went wrong: {str(e)}"

I am trying to create a bot using a Hugging Face Inference Endpoint. The model I am using is qwen2-5-vl-7b-instruct-gguf-qzv with the Llama.cpp container. But every time I run the code I get a 404 error. I have tried every way I can think of to fix it, and I wrote this code following the LangChain documentation.
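
One way to narrow the 404 down is to call the endpoint directly, outside of LangChain. As far as I know, HuggingFaceEndpoint speaks the TGI text-generation protocol, which a llama.cpp container may not serve, so a 404 from the wrapper does not automatically mean the URL is wrong. Below is a minimal sketch of that check, assuming endpoint_url and HUGGINGFACEHUB_API_TOKEN are filled in as in the code above and that the llama.cpp container exposes the usual OpenAI-compatible /v1/chat/completions route (some servers also require a "model" field in the payload):

# Not from the original code: a quick, standalone check of the endpoint itself.
import os
import requests

headers = {
    "Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}",
    "Content-Type": "application/json",
}

# A bare GET tells you whether the URL itself resolves
# (a 404 here usually means a wrong URL or a paused/deleted endpoint).
print(requests.get(endpoint_url, headers=headers).status_code)

# The OpenAI-compatible chat route; if this works but HuggingFaceEndpoint still 404s,
# the problem is the route the LangChain wrapper calls, not the endpoint URL.
resp = requests.post(
    endpoint_url.rstrip("/") + "/v1/chat/completions",
    headers=headers,
    json={"messages": [{"role": "user", "content": "ping"}], "max_tokens": 8},
)
print(resp.status_code, resp.text[:300])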


Model I am using is qwen2-5-vl-7b-instruct-gguf-qzv with Llama.cpp Container

This caught my attention. While it is easy to run an LLM in Llama.cpp, running a VLM there is likely to be quite challenging.
In the case of TGI (Hugging Face Endpoints), using a backend such as vLLM would be a more reliable option for a VLM.
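
If you redeploy the endpoint with a vLLM (or any other OpenAI-compatible) container, you can query it through its /v1/chat/completions route and send the image inline. A rough sketch under that assumption; the URL, model name, and image path are placeholders, not values from your post:

# Sketch only: querying a vision-language model on an OpenAI-compatible endpoint.
import base64

from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    base_url="https://<your-endpoint>.endpoints.huggingface.cloud/v1",  # placeholder URL
    api_key="hf_...",                                                    # your HF token
    model="Qwen/Qwen2.5-VL-7B-Instruct",                                 # placeholder model name
    temperature=0.1,
    max_tokens=512,
)

# Send the image inline as a base64 data URL alongside the text prompt.
with open("output_image.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode()

message = HumanMessage(content=[
    {"type": "text", "text": "Describe the people in this image."},
    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
])
response = llm.invoke([message])
print(response.content)

A multimodal message like this is also what would let the image in HarmonyState actually reach the model; the current call_model only sends the formatted text prompt.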

Thank you for your reply, I will give it a try.
