I’m trying to convert a simple example from an Interface to Blocks, and I’m missing a part.
The app displays the temporary recording wav file name every 2 seconds:
import gradio as gr
import time

def transcribe(audio, state=""):
    time.sleep(2)
    return audio

inf = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
    ],
    outputs=[
        "textbox",
    ],
    live=True)

if __name__ == "__main__":
    inf.launch()
I’m trying to write this simple example with Blocks, but I don’t know how to connect the transcribe function to the new-file event:
import gradio as gr
import time

def transcribe(audio, state=""):
    time.sleep(2)
    return audio

with gr.Blocks() as demo:
    input_mic = gr.Audio(source="microphone", type="filepath", streaming=True)
    out_text = gr.Textbox()
    # how to connect the transcribe function to the input_mic event?

if __name__ == "__main__":
    demo.launch()
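For reference, a minimal sketch of the missing wiring, assuming the component names above: on a streaming Audio component, the stream event is what fires for each incoming chunk, so transcribe can be attached to it much like a click handler.

# assumption: input_mic, out_text and transcribe are the objects defined above
input_mic.stream(fn=transcribe, inputs=[input_mic], outputs=[out_text])

With streaming=True this handler runs repeatedly while recording, once per chunk, rather than once when the recording stops.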
import gradio as gr
from transformers import pipeline
import time

pipe = pipeline("automatic-speech-recognition")

def transcribe(audio, state=""):
    print(audio)
    time.sleep(2)
    # transcribe the latest chunk and append it to the running transcript
    text = pipe(audio)["text"]
    state += text + " "
    return state, state

with gr.Blocks() as demo:
    state = gr.State(value="")
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(source="microphone", type="filepath", streaming=True)
        with gr.Column():
            textbox = gr.Textbox()
    audio.stream(fn=transcribe, inputs=[audio, state], outputs=[textbox, state])

demo.launch(debug=True)
I got annoyed with the text box flickering while streaming. So, I came up with this solution to transcribe only when the “Refresh” button is pressed:
import gradio as gr
from transformers import pipeline
import numpy as np
# Initialize the automatic speech recognition pipeline using a pre-trained model
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
# Global variables to store the accumulated audio data and its streaming rate
audio_data = None
streaming_rate = None
def capture_audio(stream, new_chunk):
    """
    Function to capture streaming audio and accumulate it in a global variable.

    Args:
        stream (numpy.ndarray): The accumulated audio data up to this point.
        new_chunk (tuple): A tuple containing the sampling rate and the new audio data chunk.

    Returns:
        numpy.ndarray: The updated stream with the new chunk appended.
    """
    global audio_data
    global streaming_rate

    # Extract sampling rate and audio chunk, normalize the audio
    sr, y = new_chunk
    streaming_rate = sr
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # avoid dividing by zero on silent chunks
        y /= peak

    # Concatenate new audio chunk to the existing stream or start a new one
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # Update the global variable with the new audio data
    audio_data = stream
    return stream
def get_transcript():
    """
    Function to transcribe the accumulated audio data.

    Returns:
        str: The transcription of the accumulated audio data.
    """
    global audio_data
    global streaming_rate

    # Transcribe the audio data if available
    if audio_data is not None and streaming_rate is not None:
        transcript = transcriber({"sampling_rate": streaming_rate, "raw": audio_data})["text"]
        return transcript
    return ""
# Building the Gradio interface using Blocks
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # State variable to manage the streaming data
            state = gr.State()
            # Audio component for real-time audio capture from the microphone
            audio = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
            # Textbox for displaying the transcription
            transcript_box = gr.Textbox(label="Transcript")
            # Button to initiate transcription of the captured audio
            rfrsh_btn = gr.Button("Refresh")

    # Streaming setup to handle real-time audio capture
    audio.stream(fn=capture_audio, inputs=[state, audio], outputs=[state])
    # Button click setup to trigger transcription
    rfrsh_btn.click(fn=get_transcript, outputs=[transcript_box])

# Launch the Gradio interface
demo.launch()
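The stream handler only appends audio into the module-level buffer (and the hidden State), so the Textbox is left alone while recording; it is only rewritten with the full transcript when “Refresh” is clicked, which is what removes the flicker.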