Hi, I am an absolute beginner. I took a Llama 3.1 8B example and ran it from Python using the transformers pipeline, and it works perfectly,
but I have to wait for the whole response to be generated before I can see it (instead of printing tokens as soon as they are ready).
Even a plain print to the console would help me understand how to proceed. I have tried many examples from the web / from ChatGPT, and they all stream the tokens but break the way the LLM behaves: the streaming code spits out rubbish and doesn't stop until max_length is reached.
Here is my working code (without streaming):
import streamlit as st
import transformers
import torch

# Function to initialize the model and pipeline
def model_init():
    # Initialize model and pipeline only once
    if 'pipeline' not in st.session_state:
        model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
        st.session_state.pipeline = transformers.pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="cuda",
        )

def init_chat_history():
    # Initialize the session state (chat history)
    if 'messages' not in st.session_state:
        st.session_state.messages = [
            {"role": "system", "content": "You are an AI developer and expert in NLP."},
            {"role": "assistant", "content": "Welcome to the Personal Assistant! How can I help you today ?"},
        ]

def display_chat_messages():
    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        if message['role'] != 'system' and message['role'] != 'ipython':
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

# Function to get the model response
def get_response():
    outputs = st.session_state.pipeline(
        st.session_state.messages,
        max_new_tokens=2048,
        eos_token_id=[128001, 128009],  # Define the end-of-sequence tokens for the model
    )
    response = outputs[0]["generated_text"]
    # print(f"Model Output: {response}")  # Debugging line
    return response

# Function to get the last assistant reply
def get_last_assistant_reply(history):
    for entry in reversed(history):  # Iterate in reverse to find the last assistant message
        if entry['role'] == 'assistant':
            return entry['content']
    return None  # Return None if no assistant reply is found

st.title("Chat with AI")
init_chat_history()
display_chat_messages()
model_init()

# React to user input
if prompt := st.chat_input("Write your message..."):
    # Display user message in chat message container
    st.chat_message("user").markdown(prompt)
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})
    # Get assistant response
    response = get_last_assistant_reply(get_response())
    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        st.markdown(response)
    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": response})
Is there anybody out there who knows how to:
make it stream the tokens as soon as they are ready
keep the model set up the same way the "text-generation" pipeline does
allow me to "play" with the setup (top_k, temperature, max length, ...) (see the sketch right after this list for what I mean)
Hi @dan30l,
here's working code that uses Gradio:
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import os
from threading import Thread

token = os.environ["HF_TOKEN"]

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    # torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    torch_dtype=torch.float16,
    token=token,
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", token=token)

# using CUDA for an optimal experience
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda')
model = model.to(device)

def chat(message, history):
    chat = []
    for item in history:
        chat.append({"role": "user", "content": item[0]})
        if item[1] is not None:
            chat.append({"role": "assistant", "content": item[1]})
    chat.append({"role": "user", "content": message})
    # Build the prompt string with the model's chat template
    messages = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    # Tokenize the prompt string
    model_inputs = tokenizer([messages], return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        top_k=1000,
        temperature=0.75,
        num_beams=1,
    )
    # Run generation in a background thread so the streamer can be consumed below
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    # Initialize an empty string to store the generated text
    partial_text = ""
    for new_text in streamer:
        # print(new_text)
        partial_text += new_text
        # Yield the partial text so the conversation history updates as tokens arrive
        yield partial_text

demo = gr.ChatInterface(
    fn=chat,
    chatbot=gr.Chatbot(show_label=True, show_share_button=True, show_copy_button=True,
                       likeable=True, layout="bubble", bubble_full_width=False),
    theme="soft",
    examples=[["Write me a poem about Machine Learning."]],
    title="Text Streaming",
)
demo.launch()
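By the way, if you just want to see the tokens in a terminal before wiring up a UI: TextStreamer prints each decoded chunk to stdout as soon as it is generated, while TextIteratorStreamer (used above) exposes the chunks as an iterator you consume from your own loop. A minimal console-only sketch, reusing the model, tokenizer and device objects defined above:

from transformers import TextStreamer

# prints the generated text to the console token by token
console_streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Write me a poem about Machine Learning."}],
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer([prompt], return_tensors="pt").to(device)
model.generate(**inputs, streamer=console_streamer, max_new_tokens=256)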
Based on your suggestion I rewrote my code sample and used TextIteratorStreamer instead of TextStreamer (so I can update the UI during inference), and the code now looks like this:
import threading
import streamlit as st
import transformers
import torch

# Function to initialize the model and tokenizer
def model_init():
    # Initialize model and tokenizer only once
    if 'tokenizer' not in st.session_state:
        model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
        st.session_state.tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
        st.session_state.model = transformers.AutoModelForCausalLM.from_pretrained(
            model_id, torch_dtype=torch.bfloat16).to('cuda')

def init_chat_history():
    # Initialize the session state (chat history)
    if 'messages' not in st.session_state:
        st.session_state.messages = [
            {"role": "system", "content": "You are an AI developer and expert in NLP."},
            {"role": "assistant", "content": "Welcome to the Personal Assistant! How can I help you today ?"},
        ]

def display_chat_messages():
    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        if message['role'] != 'system' and message['role'] != 'ipython':
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

# Function to get the model response
def stream_response():
    inputs = st.session_state.tokenizer(st.session_state.messages[-1]['content'], return_tensors="pt").to('cuda')
    input_ids = inputs.input_ids.to('cuda')
    streamer = transformers.TextIteratorStreamer(st.session_state.tokenizer, skip_prompt=True, skip_special_tokens=True)
    # outputs = st.session_state.model.generate(**inputs, streamer=streamer, max_new_tokens=2048)
    # start a generation
    generation_thread = threading.Thread(
        target=st.session_state.model.generate,
        args=(input_ids,),
        kwargs={'streamer': streamer, 'max_new_tokens': 2048},
    )
    # start the thread so that the generation can run in parallel to updating the UI
    generation_thread.start()
    # reset the response string
    response = ""
    # create an empty streamlit ui container for the chat response
    current_chat_container = st.empty()
    # print streamed output as it's being generated
    for new_text in streamer:
        # stop the generation if EOS (end of sequence) token is detected
        if '[EOS]' in new_text:
            response += new_text.split('[EOS]')[0]
            break
        else:
            # append the generated token to the response
            response += new_text
            # update the current chat message container with the latest response
            with current_chat_container.chat_message("assistant"):
                current_chat_container.markdown(response)
    return response

# Function to get the last assistant reply
def get_last_assistant_reply(history):
    for entry in reversed(history):  # Iterate in reverse to find the last assistant message
        if entry['role'] == 'assistant':
            return entry['content']
    return None  # Return None if no assistant reply is found

st.title("Chat with AI")
init_chat_history()
display_chat_messages()
model_init()

# React to user input
if prompt := st.chat_input("Write your message..."):
    # Display user message in chat message container
    st.chat_message("user").markdown(prompt)
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})
    # Get assistant response
    response = stream_response()
    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": response})
But unfortunately the LLM is spitting out rubbish. To me that suggests the default configuration the "text-generation" pipeline was using is not the same as the default setup I get with AutoModelForCausalLM.
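Another difference I noticed when comparing with your Gradio code: the pipeline builds the prompt from the whole chat history with the tokenizer's chat template, while my stream_response() above feeds the raw text of the last message straight into the tokenizer. If that is the real problem, the fix would presumably look something like this (untested sketch of the relevant part of stream_response):

# build the prompt from the full chat history with the model's chat template,
# the same way the "text-generation" pipeline (and the Gradio example) does
prompt = st.session_state.tokenizer.apply_chat_template(
    st.session_state.messages,
    tokenize=False,
    add_generation_prompt=True,
)
inputs = st.session_state.tokenizer([prompt], return_tensors="pt").to('cuda')
generation_thread = threading.Thread(
    target=st.session_state.model.generate,
    kwargs={
        'input_ids': inputs.input_ids,
        'attention_mask': inputs.attention_mask,
        'streamer': streamer,
        'max_new_tokens': 2048,
        'eos_token_id': [128001, 128009],  # the stop tokens I used in my pipeline version
    },
)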
@not-lain thanks for the response. Your code works much better: the LLM's answers make sense now and are close to the ones produced by the "text-generation" pipeline. I will take your setup, feed it into my Streamlit app, and play with it until I understand each argument.
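For future reference (mostly for myself), this is how I currently understand the sampling arguments in your generate_kwargs; corrections welcome:

generate_kwargs = dict(
    model_inputs,          # input_ids + attention_mask from the tokenizer
    streamer=streamer,
    max_new_tokens=1024,   # upper bound on the number of newly generated tokens
    do_sample=True,        # sample from the probability distribution instead of greedy decoding
    top_p=0.95,            # nucleus sampling: keep the smallest token set with cumulative prob >= 0.95
    top_k=1000,            # only consider the 1000 most likely tokens at each step
    temperature=0.75,      # < 1.0 sharpens the distribution (more focused), > 1.0 flattens it (more random)
    num_beams=1,           # plain sampling, no beam search
)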