VoicePresets-TexttoSpeech- Pro/Inference - Suno/Bark

Augmented123 · March 2, 2025, 2:11pm

Hi,
I recently got the Pro subscription to use text-to-speech (Suno/Bark), and I'm not able to set the voice presets when using huggingface.js.

The docs only show these parameters, model and inputs:
// Minimal text-to-speech call as shown in the huggingface.js docs:
// only `model` and `inputs` are passed.
await hf.textToSpeech({
  model: 'espnet/kan-bayashi_ljspeech_vits',
  inputs: 'Hello world!',
});

But if I add a voice preset (e.g. v2/en_speaker_9) found in the Bark docs, it doesn’t work.

All of Bark’s docs are in Python as well, and hf.js doesn’t seem to account for the voice presets.

Any help?

My ttsController.js:

// File: /uncensored-backend/controllers/ttsController.js
import { HfInference } from "@huggingface/inference";
import { HF_API_TOKEN } from "../utils/constants.js";

// Substrings (matched case-insensitively) that mark an API error as
// subscription/quota related. "pro" is handled separately as a whole word,
// because a plain substring check also matches "processing", "provider", etc.
const SUBSCRIPTION_KEYWORDS = ["subscription", "upgrade", "quota", "limit"];

/**
 * Decide whether an API failure looks subscription/quota related.
 *
 * @param {string} errorMsg - Message taken from the thrown error.
 * @param {*} errorDetails - Optional response payload attached to the error.
 * @returns {boolean} True when the message or payload mentions a subscription issue.
 */
const isSubscriptionError = (errorMsg, errorDetails) => {
  const message = String(errorMsg).toLowerCase();
  if (SUBSCRIPTION_KEYWORDS.some((keyword) => message.includes(keyword))) {
    return true;
  }
  if (/\bpro\b/.test(message)) {
    return true;
  }
  return (
    errorDetails !== null &&
    typeof errorDetails === "object" &&
    String(errorDetails.error ?? "").toLowerCase().includes("subscription")
  );
};

/**
 * Text-to-Speech (TTS) handler using the Hugging Face Inference API with the "suno/bark" model.
 *
 * The handler prepends a "female voice" directive to the input text and passes the
 * voice preset "v2/en_speaker_9" in `parameters`.
 * NOTE(review): nothing here confirms that the hosted endpoint honours `voice_preset`,
 * or that the model obeys the directive rather than reading it aloud - verify both.
 *
 * The API response may be an ArrayBuffer, a Uint8Array, or a Blob-like object; each is
 * converted to a Buffer and sent back with an audio Content-Type.
 *
 * @param {object} req - Request object; `req.body.text` must be a non-empty string.
 * @param {object} res - Response object providing `status`, `json`, `setHeader` and `send`.
 * @returns {Promise<*>} The value returned by the `res.send(...)` / `res.json(...)` call.
 *   Responds 400 on invalid input, 402 on subscription-like API errors, 500 otherwise.
 */
export const barkTTSHandler = async (req, res) => {
  try {
    // Validate the input text from the request body
    const { text } = req.body;
    if (!text || typeof text !== "string") {
      return res.status(400).json({ error: "Missing 'text' field." });
    }

    // Prepend directive to enforce a female voice
    // NOTE(review): a TTS model presumably speaks its whole input, directive included - confirm.
    const modifiedText = "Please speak in a female voice: " + text;
    console.log(
      `Processing TTS request with text: "${modifiedText.substring(0, 50)}${
        modifiedText.length > 50 ? "..." : ""
      }"`
    );

    // Initialize the Hugging Face Inference client
    const hf = new HfInference(HF_API_TOKEN);
    console.log("Sending request to Hugging Face API with voice preset v2/en_speaker_9");

    try {
      // Call the textToSpeech API with the modified text and preset
      const response = await hf.textToSpeech({
        model: "suno/bark",
        inputs: modifiedText,
        parameters: { voice_preset: "v2/en_speaker_9" },
        options: { wait_for_model: true },
      });

      console.log("Received response from Hugging Face API");

      // Raw binary responses: both kinds are wrapped in a Buffer the same way.
      if (response instanceof ArrayBuffer || response instanceof Uint8Array) {
        const kind = response instanceof ArrayBuffer ? "ArrayBuffer" : "Uint8Array";
        console.log(`Success: Received audio data as ${kind} (${response.byteLength} bytes)`);
        res.setHeader("Content-Type", "audio/wav");
        return res.send(Buffer.from(response));
      }

      // Blob-like responses (anything exposing an arrayBuffer() method)
      if (response && typeof response.arrayBuffer === "function") {
        console.log("Success: Received Blob-like audio response");
        const arrayBuffer = await response.arrayBuffer();
        // An empty `type` falls back to audio/wav, so `||` is intentional here.
        const contentType = response.type || "audio/wav";
        res.setHeader("Content-Type", contentType);
        return res.send(Buffer.from(arrayBuffer));
      }

      console.error("Unexpected response format:", response);
      return res.status(500).json({
        error: "Bark TTS returned an unexpected response format.",
        details: response,
      });
    } catch (apiError) {
      // Log error details from the Hugging Face API call
      console.error("Hugging Face API Error:", apiError);
      const errorMsg = apiError?.message || "Unknown error occurred";
      const errorDetails = apiError?.response?.data ?? {};
      console.error("Error details:", {
        message: errorMsg,
        response: errorDetails,
        status: apiError?.response?.status,
      });

      // Detect subscription-related errors
      if (isSubscriptionError(errorMsg, errorDetails)) {
        console.error("Subscription error detected");
        return res.status(402).json({
          error: "Hugging Face Pro subscription required for this model",
          details: errorMsg,
        });
      }

      return res.status(500).json({
        error: "Error from Hugging Face API",
        message: errorMsg,
        details: errorDetails,
      });
    }
  } catch (error) {
    // Handle any unexpected errors (e.g. a missing request body)
    console.error("Fatal error in barkTTSHandler:", error);
    return res.status(500).json({
      error: "TTS processing error",
      message: error?.message || "Unknown error occurred",
    });
  }
};

export default barkTTSHandler;

John6666 · March 2, 2025, 8:20pm

Hmmm…

Topic		Replies	Views
Fine tuning a TTS model Models	0	1792	March 7, 2023
Speech synthesis model with Styles Like Emoticons or emphasis Intermediate	3	212	December 25, 2024
How to Get Dutch Output for Dutch Audio Using Whisper Model via Hugging Face Inference Endpoint? Inference Endpoints on the Hub	0	283	January 26, 2024
Adding prompt / context to Whisper with Huggingface Transformers Models	7	6809	January 20, 2025
Inference provider for captioning (image2text model) Beginners	3	22	June 16, 2025

VoicePresets-TexttoSpeech- Pro/Inference - Suno/Bark

Related topics