How do I get the microphone streaming input file when using Blocks?

I'm trying to convert a simple example from Interface to Blocks, and I'm missing one part.

The app displays the temporary recording WAV file name every 2 seconds:

import gradio as gr
import time

def transcribe(audio, state=""):
    """Echo back the temporary recording's file path after a short pause.

    The 2-second sleep simulates transcription work; *state* is accepted
    for API parity with the live Interface but is not used.
    """
    delay_seconds = 2
    time.sleep(delay_seconds)
    return audio


# Live Interface: with streaming=True and live=True, Gradio re-invokes
# `transcribe` as new microphone chunks arrive.
inf = gr.Interface(
    fn=transcribe,
    inputs=[
        # NOTE(review): `source=` is the Gradio 3.x keyword; Gradio 4
        # renamed it to `sources=` -- confirm against your gradio version.
        gr.Audio(source="microphone", type="filepath", streaming=True),
    ],
    outputs=[
        "textbox",
    ],
    live=True)

# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    inf.launch()

I'm trying to write this simple example with Blocks, but I don't know how to connect the transcribe function to the new-file event:

import gradio as gr
import time

def transcribe(audio, state=""):
    """Return the recorded file path unchanged after a 2-second pause.

    *state* exists only to mirror the Interface example's signature.
    """
    pause = 2
    time.sleep(pause)
    return audio

# Blocks version of the same UI: the components are declared, but no event
# handler is wired up yet -- that wiring is exactly what the question asks.
with gr.Blocks() as demo:
    input_mic = gr.Audio(source="microphone", type="filepath", streaming=True)
    out_text  = gr.Textbox()

# How do we connect the transcribe function to the input_mic stream event?

# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()

This answers your question:


import gradio as gr
from transformers import pipeline
import time

# Load the default automatic-speech-recognition model once at import time
# so every call to `transcribe` reuses the same pipeline.
pipe = pipeline("automatic-speech-recognition")

def transcribe(audio, state=""):
    """Transcribe *audio* and append the text to the running transcript.

    Returns the updated transcript twice: once for the Textbox output and
    once to feed back into the session State.
    """
    print(audio)
    time.sleep(2)
    recognized = pipe(audio)["text"]
    updated = state + recognized + " "
    return updated, updated


# Blocks answer: wire the Audio component's stream event to `transcribe`.
# The State component threads the accumulated transcript between calls.
with gr.Blocks() as demo:
  state = gr.State(value="")
  with gr.Row():
      with gr.Column():
        audio = gr.Audio(source="microphone", type="filepath")
      with gr.Column():
        textbox = gr.Textbox()
  # `transcribe` receives (audio, state) and returns (textbox, state).
  audio.stream(fn=transcribe, inputs=[audio, state], outputs=[textbox, state])

demo.launch(debug=True)
1 Like

I tried this solution and got this error: TypeError: Transcription.transcribe() takes 2 positional arguments but 3 were given

@radames thank you for your solution. There is one typo: it should be 'sources' instead of 'source'.

audio = gr.Audio(source="microphone", type="filepath")

Corrected:

audio = gr.Audio(sources="microphone", type="filepath")

I got annoyed with the text box flickering while streaming, so I came up with this solution: transcribe only when the "Refresh" button is pressed.

import gradio as gr
from transformers import pipeline
import numpy as np

# Initialize the automatic speech recognition pipeline using a pre-trained model
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

# Global variables to store the accumulated audio data and its streaming rate;
# module-level state shared between `capture_audio` and `get_transcript`.
audio_data = None
streaming_rate = None

def capture_audio(stream, new_chunk):
    """
    Capture streaming audio and accumulate it in a global variable.

    Args:
        stream (numpy.ndarray | None): The accumulated audio data so far,
            or None on the first chunk.
        new_chunk (tuple): (sampling_rate, numpy.ndarray) -- the new chunk
            as delivered by gr.Audio(streaming=True, type="numpy").

    Returns:
        numpy.ndarray: The updated stream with the new chunk appended.
    """
    global audio_data
    global streaming_rate

    # Extract sampling rate and audio chunk, normalize the audio
    sr, y = new_chunk
    streaming_rate = sr
    y = y.astype(np.float32)

    # Guard against silent or empty chunks: the original `y /= max(|y|)`
    # divided by zero on silence, filling the buffer with NaNs and
    # poisoning every later transcription.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    # Concatenate new audio chunk to the existing stream or start a new one
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # Update the global variable with the new audio data
    audio_data = stream
    return stream

def get_transcript():
    """
    Transcribe the accumulated audio data.

    Returns:
        str: The transcription of everything captured so far, or "" when
        no audio has been recorded yet.
    """
    global audio_data
    global streaming_rate

    # Nothing captured yet -- nothing to transcribe.
    if audio_data is None or streaming_rate is None:
        return ""

    result = transcriber({"sampling_rate": streaming_rate, "raw": audio_data})
    return result["text"]

# Building the Gradio interface using Blocks
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # State variable to manage the streaming data
            state = gr.State()
            # Audio component for real-time audio capture from the microphone
            audio = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
            # Textbox for displaying the transcription
            transcript_box = gr.Textbox(label="Transcript")
            # Button to initiate transcription of the captured audio
            rfrsh_btn = gr.Button("Refresh")

            # Streaming setup to handle real-time audio capture: each chunk
            # flows through `capture_audio`, which also mirrors the buffer
            # into the module-level globals read by `get_transcript`.
            audio.stream(fn=capture_audio, inputs=[state, audio], outputs=[state])
            # Button click setup to trigger transcription on demand
            # (avoids the textbox flickering that per-chunk updates cause).
            rfrsh_btn.click(fn=get_transcript, outputs=[transcript_box])

# Launch the Gradio interface
demo.launch()