Add TTS backends (ElevenLabs, Kokoro) and /relay/tts
This commit is contained in:
@@ -0,0 +1,109 @@
|
||||
// ElevenLabs TTS backend — the cloud alternative to operator-hardware
|
||||
// Kokoro, mirroring how the transcribe/analyze pipelines treat Gemini
|
||||
// as a swappable cloud fallback to the Parakeet/vLLM hardware path.
|
||||
//
|
||||
// Selected when relay_tts_backend_preference routes here (or when
|
||||
// Kokoro is unavailable and a fallback is allowed) AND
|
||||
// relay_elevenlabs_api_key is configured. Like Kokoro, ElevenLabs
|
||||
// handles full multi-sentence paragraphs in one call — one request per
|
||||
// topic summary, no chunking.
|
||||
//
|
||||
// NOTE: This path is implemented to ElevenLabs' documented API but is
|
||||
// UNTESTED against a live key (the operator hadn't supplied one at
|
||||
// build time). The Kokoro path is the tested default. Before relying on
|
||||
// ElevenLabs in production, set relay_elevenlabs_api_key +
|
||||
// relay_elevenlabs_voice_id and smoke-test one /relay/tts call.
|
||||
//
|
||||
// Output: { audio: Buffer, contentType: "audio/mpeg", durationSeconds,
//   sentenceCount, attempts, voice, model } — durationSeconds is null
//   (we don't decode the MP3 frame count here; the Recap server measures
//   it when transcoding/caching), sentenceCount is null, and attempts is
//   always 1.
|
||||
|
||||
// Root of the ElevenLabs v1 REST API; the text-to-speech path is appended to it.
const API_BASE = "https://api.elevenlabs.io/v1";

// Model id sent as `model_id` when the caller does not supply one.
const DEFAULT_MODEL = "eleven_turbo_v2_5";

// Default per-request abort deadline: two minutes, in milliseconds.
const DEFAULT_TIMEOUT_MS = 2 * 60 * 1_000;
|
||||
|
||||
/**
 * Create the ElevenLabs text-to-speech backend.
 *
 * @param {object} [options]
 * @param {string} [options.apiKey=""] - ElevenLabs API key, sent as the
 *   `xi-api-key` header (relay_elevenlabs_api_key).
 * @param {string} [options.voiceId=""] - Operator default voice id
 *   (relay_elevenlabs_voice_id); used when the request supplies no voice.
 * @param {string} [options.model] - Model id sent as `model_id`; a falsy
 *   value falls back to DEFAULT_MODEL.
 * @param {number} [options.timeoutMs] - Abort deadline for the HTTP call.
 * @returns {{ hasTts: boolean, kind: "elevenlabs", synthesize: Function }}
 *   `hasTts` is true only when both an API key and a non-blank default
 *   voice id are configured.
 */
export function createElevenLabsBackend({
  apiKey = "",
  voiceId = "",
  model = DEFAULT_MODEL,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  // Anything that is not a non-blank string counts as "no voice given", so a
  // blank or malformed client value can never shadow the operator default.
  const normalizeVoice = (value) =>
    typeof value === "string" ? value.trim() : "";

  // Every failure surfaces as an Error carrying an HTTP-style `status`.
  const statusError = (message, status, options) => {
    const e = new Error(message, options);
    e.status = status;
    return e;
  };

  const defaultVoice = normalizeVoice(voiceId);
  const resolvedModel = model || DEFAULT_MODEL;
  const configured = Boolean(apiKey && defaultVoice);

  return {
    hasTts: configured,
    kind: "elevenlabs",

    /**
     * Synthesize `text` to MP3 with a single ElevenLabs request.
     *
     * @param {object} [request]
     * @param {string} [request.text] - Text to speak; runs of whitespace are
     *   collapsed to single spaces before sending.
     * @param {string} [request.voice] - Optional voice id overriding the
     *   operator default; blank or non-string values are ignored.
     * @returns {Promise<{ audio: Buffer, contentType: string,
     *   durationSeconds: null, sentenceCount: null, attempts: number,
     *   voice: string, model: string }>}
     * @throws {Error} with `status` 503 (API key or voice missing), 400
     *   (empty text or invalid voice id), 502 (fetch failed or timed out;
     *   the original error is attached as `cause`), or the upstream HTTP
     *   status when ElevenLabs answers with a non-2xx response.
     */
    async synthesize({ text, voice } = {}) {
      if (!apiKey) {
        throw statusError(
          "ElevenLabs TTS is not configured — set relay_elevenlabs_api_key",
          503
        );
      }

      // `voice` from the client overrides the operator default voice id
      // when present (the Recap client may let a user pick a voice).
      const chosenVoice = normalizeVoice(voice) || defaultVoice;
      if (!chosenVoice) {
        throw statusError(
          "ElevenLabs TTS has no voice — set relay_elevenlabs_voice_id",
          503
        );
      }
      // encodeURIComponent leaves "." untouched and URL parsing collapses
      // dot segments, so "." / ".." would redirect the request (and the API
      // key) to a different endpoint. Reject them outright.
      if (chosenVoice === "." || chosenVoice === "..") {
        throw statusError("ElevenLabs TTS voice id is invalid", 400);
      }

      // Non-string text is treated like missing text rather than crashing
      // with a TypeError on `.replace`.
      const cleaned =
        typeof text === "string" ? text.replace(/\s+/g, " ").trim() : "";
      if (!cleaned) {
        throw statusError("TTS input text is empty", 400);
      }

      const url = `${API_BASE}/text-to-speech/${encodeURIComponent(chosenVoice)}`;

      let res;
      try {
        // Public-internet call — use the global fetch with full cert
        // validation (NOT lanFetch, which is scoped to LAN/Spark Control).
        res = await fetch(url, {
          method: "POST",
          headers: {
            "xi-api-key": apiKey,
            "Content-Type": "application/json",
            Accept: "audio/mpeg",
          },
          body: JSON.stringify({
            text: cleaned,
            model_id: resolvedModel,
          }),
          signal: AbortSignal.timeout(timeoutMs),
        });
      } catch (err) {
        // Keep the original failure reachable via `cause` for logging.
        throw statusError(
          `ElevenLabs TTS network error: ${err?.message || err}`,
          502,
          { cause: err }
        );
      }

      if (!res.ok) {
        let body = "";
        try {
          body = await res.text();
        } catch {
          // Best effort: the error body is diagnostic only, so a failure to
          // read it must not mask the upstream status.
        }
        throw statusError(
          `ElevenLabs TTS ${res.status}: ${body.slice(0, 300)}`,
          res.status
        );
      }

      const audio = Buffer.from(await res.arrayBuffer());
      return {
        audio,
        contentType: "audio/mpeg",
        // Not measured here; the MP3 is returned as-is.
        durationSeconds: null,
        sentenceCount: null,
        // Exactly one upstream request is made per call (no retries).
        attempts: 1,
        voice: chosenVoice,
        model: resolvedModel,
      };
    },
  };
}
|
||||
Reference in New Issue
Block a user