Hi, I am an absolute beginner. As an example I took Llama 3.1 8B and ran it from Python using the transformers pipeline, and it works perfectly.
The problem is that I have to wait for the whole response to be generated before I see anything, instead of printing each token as soon as it is ready.
Even a simple print to the console would help me understand how to proceed. I have tried many examples from the web and from ChatGPT; they all stream the tokens, but they break the way the LLM behaves: with the streaming code it spits out rubbish and does not stop until max_length is reached.
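For reference, the streaming examples I kept finding all look roughly like the sketch below. This is only my reconstruction of the pattern, not the exact code I copied, and the prompt string is just a placeholder: they call model.generate() with a TextStreamer directly, without the chat template or the Llama 3.1 end-of-sequence tokens, which I suspect is why the output turns into rubbish and never stops.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="cuda"
)

# Plain prompt string instead of the chat-formatted messages the pipeline builds
inputs = tokenizer("Tell me about NLP", return_tensors="pt").to(model.device)

# TextStreamer prints the tokens to the console as they are generated
streamer = TextStreamer(tokenizer, skip_prompt=True)
model.generate(**inputs, streamer=streamer, max_new_tokens=2048)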
Here is my working code (without streaming):
import streamlit as st
import transformers
import torch


# Function to initialize the model and pipeline
def model_init():
    # Initialize model and pipeline only once
    if 'pipeline' not in st.session_state:
        model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
        st.session_state.pipeline = transformers.pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="cuda",
        )


def init_chat_history():
    # Initialize the session state (chat history)
    if 'messages' not in st.session_state:
        st.session_state.messages = [
            {"role": "system", "content": "You are an AI developer and expert in NLP."},
            {"role": "assistant", "content": "Welcome to the Personal Assistant! How can I help you today ?"},
        ]


def display_chat_messages():
    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        if message['role'] != 'system' and message['role'] != 'ipython':
            with st.chat_message(message["role"]):
                st.markdown(message["content"])


# Function to get the model response
def get_response():
    outputs = st.session_state.pipeline(
        st.session_state.messages,
        max_new_tokens=2048,
        eos_token_id=[128001, 128009],  # Define the end-of-sequence tokens for the model
    )
    response = outputs[0]["generated_text"]
    # print(f"Model Output: {response}")  # Debugging line
    return response


# Function to get the last assistant reply
def get_last_assistant_reply(history):
    for entry in reversed(history):  # Iterate in reverse to find the last assistant message
        if entry['role'] == 'assistant':
            return entry['content']
    return None  # Return None if no assistant reply is found


st.title("Chat with AI")

init_chat_history()
display_chat_messages()
model_init()

# React to user input
if prompt := st.chat_input("Write your message..."):
    # Display user message in chat message container
    st.chat_message("user").markdown(prompt)
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Get assistant response
    response = get_last_assistant_reply(get_response())

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        st.markdown(response)

    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": response})
Is there anybody out there who knows how to do the following (a rough sketch of what I am picturing comes right after this list):
- make it stream the tokens as soon as they are ready,
- keep the model set up the same way the “text-generation” pipeline does, and
- let me “play” with the generation settings (top_k, temperature, max length …)?
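To make the question concrete, here is a rough sketch of the direction I am picturing, using a hypothetical get_streamed_response() helper (this is not code I trust, just my best guess): reuse the model and tokenizer the pipeline already loaded, apply the chat template myself, run generate() in a background thread, and read the tokens from a TextIteratorStreamer so Streamlit can render them as they arrive.

from threading import Thread
from transformers import TextIteratorStreamer

def get_streamed_response():
    pipe = st.session_state.pipeline
    tokenizer = pipe.tokenizer
    model = pipe.model

    # Build the same chat-formatted prompt the pipeline builds internally
    input_ids = tokenizer.apply_chat_template(
        st.session_state.messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # The streamer yields decoded text chunks as soon as they are ready
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=2048,
        eos_token_id=[128001, 128009],
        do_sample=True,      # the knobs I would like to play with
        temperature=0.7,
        top_k=50,
    )

    # Run generation in the background so the main thread can consume the stream
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    # st.write_stream renders the chunks incrementally and returns the full text
    with st.chat_message("assistant"):
        return st.write_stream(streamer)

Is this roughly the right way to do it, or is there a cleaner way that stays closer to the pipeline?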