I am using Gradio Blocks and a Llama 2 chatbot to create a streaming chatbot. It mostly works as planned, but I have one issue: the chatbot's output is only printed once the full result has been generated. I would like it to print the output word by word AS it is being generated.
if __name__ == "__main__":
    # loads model
    llama = Llama2(model_name=config.get('model', 'llama2-7b'))
    # builds interface for gradio
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        chatbot = gr.Chatbot()
        with gr.Row():
            msg = gr.Textbox(label="Your Message", interactive=True)
        mic_transcribe = gr.Interface(
            allow_flagging="never",
            fn=transcribe,
            # state variable is used to keep track of all audio history
            inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
            outputs=["state", msg],
            live=True
        )

        def bot(history):
            # re-streams the already-complete last response character by character
            bot_message = history[-1][1]
            history[-1][1] = ""
            for character in bot_message:
                history[-1][1] += character
                time.sleep(0.05)
                yield history

        clear_button = gr.ClearButton([msg, chatbot])
        # hides voice response interface
        with gr.Column(visible=False):
            msg.submit(llama.get_llama_response, [msg, chatbot], [msg, chatbot, gr.HTML()], queue=False).then(bot, chatbot, chatbot)

    demo.queue()
    demo.launch()
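For reference, this is the generator pattern I understand Gradio streams from: every value the function yields is pushed to the Chatbot immediately, as long as queuing is enabled. A minimal, self-contained sketch (the respond function and its canned text are just illustrative, not my actual code):

    import time
    import gradio as gr

    with gr.Blocks() as streaming_demo:
        chatbot = gr.Chatbot()
        msg = gr.Textbox()

        def respond(message, history):
            history = history + [(message, "")]
            for word in "this text streams word by word".split():
                history[-1] = (history[-1][0], history[-1][1] + word + " ")
                time.sleep(0.1)
                yield "", history  # each yield updates the UI immediately

        msg.submit(respond, [msg, chatbot], [msg, chatbot])

    streaming_demo.queue()  # queuing must be enabled for generators to stream
    streaming_demo.launch()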
Here is my main function, get_llama_response, which generates the chatbot's response:
# generates a response based on history in a conversational style
def get_llama_response(self, message, history):
    """
    Generates a conversational response from the Llama model.

    Parameters:
        message (str): User's input message.
        history (list): Past conversation history.

    Returns:
        str: input message as an empty string to clear the textbox
        list: history of the conversation
        html: autoplay audio output of the chatbot response
    """
    # format the input message
    query = self.format_message(message, history)
    response = ""
    sequences = self.pipeline(
        query,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=self.tokenizer.eos_token_id,
        max_length=1024,
    )
    generated_text = sequences[0]['generated_text']  # type: ignore
    # chatbot response: remove the initial prompt from the output
    response = generated_text[len(query):]
    # add the input message and chatbot response to the history
    history.append((message, response.strip()))
    # call TTS on the bot response and convert it to HTML audio
    bot_voice = tts(response.strip(), "en")
    bot_voice_bytes = tts_to_bytes(bot_voice)
    html = html_autoplay(bot_voice_bytes)
    # return user message, chat response, HTML audio for the browser
    return "", history, html
So basically, it does print the response word by word, but for a split second it shows the full result before streaming it out. In other words, it waits for the full generated text instead of printing the output while it is being generated. Any ideas on how I could fix this? I'm sorry if my formatting is bad; it's my first time posting here. Thank you for the help.
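In case it helps, this is the direction I have been looking at but have not gotten working yet: transformers has a TextIteratorStreamer that yields text chunks as generate() produces them, so get_llama_response could become a generator instead of blocking on the full pipeline call. A rough sketch (untested; self.model and self.tokenizer are assumptions about my class, and the TTS/HTML step is left out since it needs the complete response):

    from threading import Thread
    from transformers import TextIteratorStreamer

    def get_llama_response_stream(self, message, history):
        # same prompt formatting as before
        query = self.format_message(message, history)
        inputs = self.tokenizer(query, return_tensors="pt").to(self.model.device)
        # skip_prompt=True so the prompt is not echoed back in the stream
        streamer = TextIteratorStreamer(
            self.tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        generation_kwargs = dict(
            **inputs,
            streamer=streamer,
            do_sample=True,
            top_k=10,
            max_new_tokens=1024,
            eos_token_id=self.tokenizer.eos_token_id,
        )
        # generate() blocks, so it runs in a worker thread while we consume the streamer
        Thread(target=self.model.generate, kwargs=generation_kwargs).start()

        history.append((message, ""))
        for new_text in streamer:
            history[-1] = (history[-1][0], history[-1][1] + new_text)
            yield "", history  # each yield pushes the partial response to the Chatbot

If that is the right approach, I assume the .then(bot, ...) re-streaming step and the queue=False flag would no longer be needed, and the TTS call would move to a step that runs after generation finishes.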