Using the ElevenLabs API for text to speech streaming?

Hello how can I use the ElevenLabs API for realtime text-to-speech streaming output with Gradio please?

https://api.elevenlabs.io/docs#/text-to-speech/Text_to_speech_v1_text_to_speech__voice_id__stream_post

Many thanks

Hi @ethantan !

This Gradio demo uses a different streaming API, but I imagine you can adapt it to your use case.

Cc @ysharma

Thanks Freddy - it’s text streaming rather than audio but will give it a go!

Hi @freddyaboulton I can’t figure out how to play a stream of audio - would appreciate any help!

I’m off today - will check back tomorrow!

Was anyone able to figure this out? The ElevenLabs API's play() works in a local environment but produces no audio and no error on Hugging Face.

import gradio as gr
import librosa
import pyaudio
import numpy as np
import requests
import tempfile
import os
import io
from io import BytesIO
import base64
from faster_whisper import WhisperModel
# faster-whisper speech-to-text model used by the transcription callback below.
model_size = "large-v2"
# Run on GPU with FP16
# NOTE(review): this requires a CUDA-capable GPU at import time; on CPU-only
# hosts use device="cpu", compute_type="int8" — confirm deployment target.
model = WhisperModel(model_size, device="cuda", compute_type="float16")

def text_to_speech(text):
    """Fetch MP3 audio for *text* from the ElevenLabs text-to-speech endpoint.

    Returns the raw MP3 bytes on success, or None on any non-200 response.
    """
    # Replace YOUR_VOICE with a real voice id from your ElevenLabs account.
    url = "https://api.elevenlabs.io/v1/text-to-speech/YOUR_VOICE"
    headers = {
        "accept": "audio/mpeg",
        # NOTE(review): never hard-code a real key in source; read it from an
        # environment variable or HF Space secret instead.
        "xi-api-key": "API_KEY",
        "Content-Type": "application/json",
    }
    # An explicit timeout keeps a stalled request from hanging the Gradio
    # callback (and the whole UI) forever.
    response = requests.post(url, headers=headers, json={"text": text}, timeout=30)
    if response.status_code == 200:
        return response.content
    print(f"Error: {response.status_code}")
    return None


def audio_player_html(audio_content):
    """Wrap raw MP3 bytes in an autoplaying base64 data-URI <audio> tag.

    Pure function: bytes in, HTML string out. The bytes are already in
    memory, so they are base64-encoded directly (no BytesIO round-trip).
    """
    audio_base_64 = base64.b64encode(audio_content).decode("utf-8")
    return f'<audio src="data:audio/mpeg;base64,{audio_base_64}" controls autoplay></audio>'


def on_click_play_audio_button(text):
    """Synthesize *text* via ElevenLabs and return an HTML audio player.

    Returns None explicitly when synthesis failed, so the HTML component
    is cleared rather than left with a stale player.
    """
    audio_content = text_to_speech(text)
    if audio_content is None:
        return None
    return audio_player_html(audio_content)


def get_audio(audio):
    """Transcribe a recorded audio file with faster-whisper.

    *audio* is the microphone recording's file path (or None when the
    recording was cleared); None passes through unchanged.
    """
    if audio is None:
        return None
    segments, _ = model.transcribe(audio)
    return " ".join(segment.text for segment in segments)


def main():
    """Build and launch the Gradio app: mic -> Whisper transcript -> ElevenLabs audio."""
    with gr.Blocks(theme=gr.themes.Soft(primary_hue='orange',
                                        secondary_hue='orange', neutral_hue='stone')) as app:
        with gr.Row():
            with gr.Column(scale=3):
                # show_copy_button in the constructor replaces the deprecated
                # gr.Textbox.style(...) call.
                query = gr.Textbox(label='Query', lines=1, show_copy_button=True,
                                   placeholder="Ask a question to the dataset...")
            with gr.Column(scale=.1, min_width=200):
                audio = gr.Audio(source="microphone", type="filepath", label="Audio")

        html = gr.HTML()
        # Chain: transcribe the recording into the textbox, then synthesize
        # that text into an embedded audio player.
        audio.change(get_audio, audio, query).then(
            on_click_play_audio_button, inputs=[query], outputs=[html])

    app.launch(share=True, width=1600, height=800)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    main()

This code generates text from your speech and converts it back to ElevenLabs audio.
You can modify it slightly so that it works with chatbot conversations!

Change the URL's voice id and the API key, and it should work once you install all dependencies!