Can't stream response token by token

Hi, I am an absolute beginner. I took a Llama 3.1 8B example and ran it from Python using the Transformers pipeline, and it works perfectly.

But I have to wait for the whole response to be generated before I see anything (instead of printing tokens as soon as they are ready).

Even a print to the console would help me understand how to proceed. I have tried many examples from the web / from ChatGPT, and while they all stream the tokens, they break the way the LLM works: the streaming code spits out rubbish and doesn't stop until max_length is reached.

Here is my working code (without streaming):

import streamlit as st
import transformers
import torch


# Function to initialize the model and pipeline
def model_init():
    # Initialize model and pipeline only once
    if 'pipeline' not in st.session_state:
        model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
        st.session_state.pipeline = transformers.pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="cuda",
        )


def init_chat_history():
    # Initialize the session state (chat history)
    if 'messages' not in st.session_state:
        st.session_state.messages = [
            {"role": "system", "content": "You are an AI developer and expert in NLP."},
            {"role": "assistant", "content": "Welcome to the Personal Assistant! How can I help you today ?"},
        ]


def display_chat_messages():
    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        if message['role'] != 'system' and message['role'] != 'ipython':
            with st.chat_message(message["role"]):
                st.markdown(message["content"])


# Function to get the model response
def get_response():
    outputs = st.session_state.pipeline(
        st.session_state.messages,
        max_new_tokens=2048,
        eos_token_id=[128001, 128009],  # Define the end-of-sequence tokens for the model
    )
    response = outputs[0]["generated_text"]
    #print(f"Model Output: {response}")  # Debugging line
    return response


# Function to get the last assistant reply
def get_last_assistant_reply(history):
    for entry in reversed(history):  # Iterate in reverse to find the last assistant message
        if entry['role'] == 'assistant':
            return entry['content']
    return None  # Return None if no assistant reply is found


st.title("Chat with AI")
init_chat_history()
display_chat_messages()
model_init()


# React to user input
if prompt := st.chat_input("Write your message..."):
    # Display user message in chat message container
    st.chat_message("user").markdown(prompt)

    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Get assistant response (for chat inputs, the pipeline returns the updated conversation, so pull out the last assistant message)
    response = get_last_assistant_reply(get_response())

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        st.markdown(response)

    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": response})

Is there anybody out there who knows how to:

  1. make it stream the tokens as soon as they are ready
  2. keep the model setup like the “text-generation” pipeline does
  3. allow me to “play” with the setup (top_k, temperature, max length … )

Hi @dan30l,
You can check this one, but it doesn't work with the pipeline as far as I can see.
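
In short, generate() accepts a streamer argument. Here is a minimal console-printing sketch (assuming a model and tokenizer are already loaded, e.g. with AutoModelForCausalLM / AutoTokenizer):

from transformers import TextStreamer

# TextStreamer decodes and prints tokens to stdout as soon as generate() produces them
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

inputs = tokenizer("Write me a short poem.", return_tensors="pt").to(model.device)
model.generate(**inputs, streamer=streamer, max_new_tokens=64)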

Hi @dan30l
here's working code that uses Gradio:

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import os
from threading import Thread

token = os.environ["HF_TOKEN"]
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", 
                                             # torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                                             torch_dtype=torch.float16,
                                             token=token)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct",token=token)
# using CUDA for an optimal experience
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda')
model = model.to(device)


def chat(message, history):
    chat = []
    for item in history:
        chat.append({"role": "user", "content": item[0]})
        if item[1] is not None:
            chat.append({"role": "assistant", "content": item[1]})
    chat.append({"role": "user", "content": message})
    messages = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    # Tokenize the messages string
    model_inputs = tokenizer([messages], return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        top_k=1000,
        temperature=0.75,
        num_beams=1,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Initialize an empty string to store the generated text
    partial_text = ""
    for new_text in streamer:
        # print(new_text)
        partial_text += new_text
        # Yield the accumulated text so the ChatInterface updates the assistant message incrementally
        yield partial_text



demo = gr.ChatInterface(fn=chat, 
                        chatbot=gr.Chatbot(show_label=True, show_share_button=True, show_copy_button=True, likeable=True, layout="bubble", bubble_full_width=False),
                        theme="soft",
                        examples=[["Write me a poem about Machine Learning."]], 
                        title="Text Streaming")
demo.launch()

@mahmutc thank you for the response.

Based on your suggestion, I rewrote the code sample and used TextIteratorStreamer instead of TextStreamer (so I can update the UI during inference). The code now looks like this:

import threading
import streamlit as st
import transformers
import torch


# Function to initialize the model and tokenizer
def model_init():
    # Initialize model and tokenizer only once
    if 'tokenizer' not in st.session_state:
        model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
        st.session_state.tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
        st.session_state.model = transformers.AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to('cuda')


def init_chat_history():
    # Initialize the session state (chat history)
    if 'messages' not in st.session_state:
        st.session_state.messages = [
            {"role": "system", "content": "You are an AI developer and expert in NLP."},
            {"role": "assistant", "content": "Welcome to the Personal Assistant! How can I help you today ?"},
        ]


def display_chat_messages():
    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        if message['role'] != 'system' and message['role'] != 'ipython':
            with st.chat_message(message["role"]):
                st.markdown(message["content"])


# Function to get the model response
def stream_response():
    inputs = st.session_state.tokenizer(st.session_state.messages[-1]['content'], return_tensors="pt").to('cuda')
    input_ids = inputs.input_ids.to('cuda')
    streamer = transformers.TextIteratorStreamer(st.session_state.tokenizer, skip_prompt=True, skip_special_tokens=True)

    # outputs = st.session_state.model.generate(**inputs, streamer=streamer, max_new_tokens=2048)

    # start a generation
    generation_thread = threading.Thread(
        target=st.session_state.model.generate, 
        args=(input_ids,), 
        kwargs={'streamer': streamer, 'max_new_tokens': 2048},
    )

    # start the thread so that the generation can run in parallel to updating the UI
    generation_thread.start()

    # reset the response string
    response = ""

    # create an empty streamlit ui container for the chat response
    current_chat_container = st.empty()

    # print streamed output as it's being generated
    for new_text in streamer:
        # stop the generation if EOS (end of sequence) token is detected
        if '[EOS]' in new_text:
            response += new_text.split('[EOS]')[0]
            break
        else:
            # append the generated token to the response
            response += new_text

        # update the current chat message container with the latest response
        with current_chat_container.chat_message("assistant"):
            current_chat_container.markdown(response)
   
    return response


# Function to get the last assistant reply
def get_last_assistant_reply(history):
    for entry in reversed(history):  # Iterate in reverse to find the last assistant message
        if entry['role'] == 'assistant':
            return entry['content']
    return None  # Return None if no assistant reply is found


st.title("Chat with AI")
init_chat_history()
display_chat_messages()
model_init()


# React to user input
if prompt := st.chat_input("Write your message..."):
    # Display user message in chat message container
    st.chat_message("user").markdown(prompt)

    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Get assistant response
    response = stream_response()

    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": response})

But unfortunately the LLM is spitting out rubbish. To me, that means the default configuration the “text-generation” pipeline used is not the default setup that AutoModelForCausalLM is using.
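
(One way I thought of to check that hypothesis is to print the generation defaults each object carries; a rough sketch, reusing the session-state objects from the two code versions above:)

# rough sketch: dump the generation defaults to compare the two setups
print(st.session_state.model.generation_config)
# and, in the original pipeline version of the app:
# print(st.session_state.pipeline.model.generation_config)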

Any ideas for improvements?

@not-lain thanks for the response. Your code works so much better, and the LLM's answers now make sense and are much closer to the ones produced by the “text-generation” pipeline. I will take your setup, feed it into my Streamlit app, and play with it until I understand each argument.
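
Roughly what I plan to do on the Streamlit side (an untested sketch that borrows the apply_chat_template call and the sampling arguments from your Gradio example):

def stream_response():
    tokenizer = st.session_state.tokenizer
    model = st.session_state.model

    # build the prompt from the whole chat history instead of only the last user message
    prompt_text = tokenizer.apply_chat_template(
        st.session_state.messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

    streamer = transformers.TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # sampling arguments borrowed from the Gradio example
    generation_thread = threading.Thread(
        target=model.generate,
        kwargs=dict(
            **model_inputs,
            streamer=streamer,
            max_new_tokens=1024,
            do_sample=True,
            top_p=0.95,
            top_k=1000,
            temperature=0.75,
        ),
    )
    generation_thread.start()

    response = ""
    current_chat_container = st.empty()
    for new_text in streamer:
        response += new_text
        # skip_special_tokens already strips the EOS token, so no manual '[EOS]' check is needed
        with current_chat_container.chat_message("assistant"):
            st.markdown(response)
    return response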

thank you

This topic was automatically closed 12 hours after the last reply. New replies are no longer allowed.