Hello how can I use the ElevenLabs API for realtime text-to-speech streaming output with Gradio please?
Many thanks
Hello how can I use the ElevenLabs API for realtime text-to-speech streaming output with Gradio please?
Many thanks
Hi @ethantan !
This gradio demo uses a different streaming api but I imagine you can adapt it to your use case.
Cc @ysharma
Thanks Freddy - it's text streaming rather than audio but will give it a go!
Hi @freddyaboulton I can't figure out how to play a stream of audio - would appreciate any help!
I'm off today - will check back tomorrow!
Was anyone able to figure this out? The ElevenLabs API play() works in a local environment but produces no audio and no error on Hugging Face.
import gradio as gr
import librosa
import pyaudio
import numpy as np
import requests
import tempfile
import os
import io
from io import BytesIO
import base64
from faster_whisper import WhisperModel
# Whisper model variant to load; "large-v2" is the highest-accuracy (and slowest) size.
model_size = "large-v2"
# Run on GPU with FP16
# NOTE(review): loading happens at import time and requires a CUDA-capable GPU;
# on CPU-only hosts this will fail — consider device="cpu", compute_type="int8" there.
model = WhisperModel(model_size, device="cuda", compute_type="float16")
def main():
    """Build and launch the Gradio app: microphone speech -> Whisper transcript
    -> ElevenLabs TTS -> autoplaying HTML <audio> element."""
    with gr.Blocks(theme=gr.themes.Soft(primary_hue='orange',
                                        secondary_hue='orange',
                                        neutral_hue='stone')) as app:

        def text_to_speech(text):
            """POST *text* to the ElevenLabs TTS endpoint.

            Returns the raw MP3 bytes on success, or None on any HTTP error.
            """
            url = f"https://api.elevenlabs.io/v1/text-to-speech/YOUR_VOICE"
            # SECURITY: the API key is hard-coded here; move it to an
            # environment variable / secret before deploying.
            headers = {
                "accept": "audio/mpeg",
                "xi-api-key": "API_KEY",
                "Content-Type": "application/json",
            }
            data = {"text": text}
            # timeout prevents the UI callback from hanging forever if the
            # ElevenLabs endpoint stalls.
            response = requests.post(url, headers=headers, json=data, timeout=30)
            if response.status_code == 200:
                print(response.status_code)
                return response.content
            else:
                print(f"Error: {response.status_code}")
                return None

        def on_click_play_audio_button(text):
            """Convert *text* to speech and return an autoplaying HTML audio tag.

            Returns None when TTS failed so the HTML component is left empty.
            """
            print(text)
            audio_content = text_to_speech(text)
            if audio_content is None:
                return None
            # Embed the MP3 bytes directly in the page as a base64 data URI so
            # no temp file is needed (works on Hugging Face Spaces too).
            audio_base_64 = base64.b64encode(audio_content).decode("utf-8")
            audio_player = f'<audio src="data:audio/mpeg;base64,{audio_base_64}" controls autoplay></audio>'
            return audio_player

        with gr.Row():
            with gr.Column(scale=3):
                query = gr.Textbox(label='Query', lines=1, placeholder="Ask a question to the dataset...")
                gr.Textbox.style(query, show_copy_button=True)
            with gr.Column(scale=.1, min_width=200):
                audio = gr.Audio(source="microphone", type="filepath", label="Audio")

        def get_audio(audio):
            """Transcribe the recorded audio file with Whisper.

            Returns the joined transcript text, or None when no recording
            was provided (e.g. the component was cleared).
            """
            if audio is not None:
                segments, _ = model.transcribe(audio)
                # Each segment carries a fragment of the transcript; join them
                # into a single string for the textbox.
                text = ' '.join(segment.text for segment in segments)
                return text
            else:
                return None

        html = gr.HTML()
        # Chain: new recording -> transcript into the textbox -> TTS playback.
        audio.change(get_audio, audio, query).then(on_click_play_audio_button, inputs=[query], outputs=[html])

    app.launch(share=True, width=1600, height=800)
# Standard script entry guard: only launch the app when run directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
This code generates text from your speech and converts it back to ElevenLabs audio.
You can modify it slightly so that it works with chatbot conversations!
Change the URL (voice ID) and API key and it should work once you install all dependencies!