From 04dcf86fa46de3282780772f5943b5dac84d65ba Mon Sep 17 00:00:00 2001 From: Keysat Date: Sat, 13 Jun 2026 13:36:05 -0500 Subject: [PATCH] Add TTS backends (ElevenLabs, Kokoro) and /relay/tts --- server/backends/elevenlabs.js | 109 +++++++++++++ server/backends/kokoro.js | 140 ++++++++++++++++ server/routes/tts.js | 289 ++++++++++++++++++++++++++++++++++ 3 files changed, 538 insertions(+) create mode 100644 server/backends/elevenlabs.js create mode 100644 server/backends/kokoro.js create mode 100644 server/routes/tts.js diff --git a/server/backends/elevenlabs.js b/server/backends/elevenlabs.js new file mode 100644 index 0000000..7dc04d9 --- /dev/null +++ b/server/backends/elevenlabs.js @@ -0,0 +1,109 @@ +// ElevenLabs TTS backend — the cloud alternative to operator-hardware +// Kokoro, mirroring how the transcribe/analyze pipelines treat Gemini +// as a swappable cloud fallback to the Parakeet/vLLM hardware path. +// +// Selected when relay_tts_backend_preference routes here (or when +// Kokoro is unavailable and a fallback is allowed) AND +// relay_elevenlabs_api_key is configured. Like Kokoro, ElevenLabs +// handles full multi-sentence paragraphs in one call — one request per +// topic summary, no chunking. +// +// NOTE: This path is implemented to ElevenLabs' documented API but is +// UNTESTED against a live key (the operator hadn't supplied one at +// build time). The Kokoro path is the tested default. Before relying on +// ElevenLabs in production, set relay_elevenlabs_api_key + +// relay_elevenlabs_voice_id and smoke-test one /relay/tts call. +// +// Output: { audio: Buffer, contentType: "audio/mpeg", durationSeconds, +// voice, model } — durationSeconds is null (we don't decode +// the MP3 frame count here; the Recap server measures it when +// transcoding/caching). 
const API_BASE = "https://api.elevenlabs.io/v1";
const DEFAULT_MODEL = "eleven_turbo_v2_5";
const DEFAULT_TIMEOUT_MS = 120_000;

/**
 * Build an Error tagged with the HTTP status the caller should surface.
 *
 * @param {string} message - Human-readable error message.
 * @param {number} status - Status code stored on `err.status`.
 * @param {{cause?: unknown}} [options] - Forwarded to the Error constructor.
 * @returns {Error} The tagged error (not thrown here).
 */
function ttsError(message, status, options) {
  const err = new Error(message, options);
  err.status = status;
  return err;
}

/**
 * Return `value` trimmed when it is a string, otherwise "".
 * Keeps non-string inputs from blowing up with a raw TypeError.
 *
 * @param {unknown} value
 * @returns {string}
 */
function trimmedOrEmpty(value) {
  return typeof value === "string" ? value.trim() : "";
}

/**
 * Create the ElevenLabs TTS backend.
 *
 * @param {object} [options]
 * @param {string} [options.apiKey=""] - Sent as the `xi-api-key` header.
 * @param {string} [options.voiceId=""] - Default voice id, used when a call
 *   supplies no usable `voice`.
 * @param {string} [options.model] - Sent as `model_id`; a falsy value falls
 *   back to DEFAULT_MODEL.
 * @param {number} [options.timeoutMs] - Abort timeout for the HTTP request.
 * @returns {{hasTts: boolean, kind: "elevenlabs", synthesize: Function}}
 *   `hasTts` is true only when both `apiKey` and `voiceId` are set.
 */
export function createElevenLabsBackend({
  apiKey = "",
  voiceId = "",
  model = DEFAULT_MODEL,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  const configured = !!(apiKey && voiceId);
  const modelId = model || DEFAULT_MODEL;

  return {
    hasTts: configured,
    kind: "elevenlabs",

    /**
     * Synthesize `text` to MP3 with a single request (no chunking).
     *
     * @param {object} [request]
     * @param {string} [request.text] - Text to speak; whitespace runs are
     *   collapsed to single spaces before sending.
     * @param {string} [request.voice] - Optional per-call voice id that
     *   overrides the configured `voiceId`.
     * @returns {Promise<{audio: Buffer, contentType: string,
     *   durationSeconds: null, sentenceCount: null, attempts: number,
     *   voice: string, model: string}>}
     * @throws {Error} with `.status` 503 (no API key / no voice), 400 (bad
     *   or empty text), 502 (network failure or empty audio), or the
     *   upstream HTTP status for a non-2xx response.
     */
    async synthesize({ text, voice } = {}) {
      if (!apiKey) {
        throw ttsError(
          "ElevenLabs TTS is not configured — set relay_elevenlabs_api_key",
          503
        );
      }
      // `voice` from the client overrides the operator default voice id
      // when present. Each candidate is trimmed BEFORE the fallback so a
      // blank or non-string client value cannot shadow the configured id.
      const chosenVoice = trimmedOrEmpty(voice) || trimmedOrEmpty(voiceId);
      if (!chosenVoice) {
        throw ttsError(
          "ElevenLabs TTS has no voice — set relay_elevenlabs_voice_id",
          503
        );
      }
      if (text != null && typeof text !== "string") {
        throw ttsError("TTS input text must be a string", 400);
      }
      const cleaned = (text || "").replace(/\s+/g, " ").trim();
      if (!cleaned) {
        throw ttsError("TTS input text is empty", 400);
      }

      const url = `${API_BASE}/text-to-speech/${encodeURIComponent(chosenVoice)}`;
      let res;
      try {
        // Public-internet call — use the global fetch with full cert
        // validation (NOT lanFetch, which is scoped to LAN/Spark Control).
        res = await fetch(url, {
          method: "POST",
          headers: {
            "xi-api-key": apiKey,
            "Content-Type": "application/json",
            Accept: "audio/mpeg",
          },
          body: JSON.stringify({
            text: cleaned,
            model_id: modelId,
          }),
          signal: AbortSignal.timeout(timeoutMs),
        });
      } catch (err) {
        // Keep the original failure (timeout, DNS, TLS, ...) as `cause`.
        throw ttsError(
          `ElevenLabs TTS network error: ${err?.message || err}`,
          502,
          { cause: err }
        );
      }
      if (!res.ok) {
        let body = "";
        try {
          body = await res.text();
        } catch {
          // Best effort only: the status line alone is still reported.
        }
        // NOTE(review): the upstream status is forwarded as-is (e.g. a 401
        // for a bad operator key) — confirm the route maps it as intended.
        throw ttsError(
          `ElevenLabs TTS ${res.status}: ${body.slice(0, 300)}`,
          res.status
        );
      }
      const audio = Buffer.from(await res.arrayBuffer());
      if (audio.length === 0) {
        // A 2xx with no bytes is not playable audio; fail loudly instead.
        throw ttsError("ElevenLabs TTS returned an empty audio body", 502);
      }
      return {
        audio,
        contentType: "audio/mpeg",
        durationSeconds: null,
        sentenceCount: null,
        attempts: 1,
        voice: chosenVoice,
        model: modelId,
      };
    },
  };
}

// ---------------------------------------------------------------------
// diff --git a/server/backends/kokoro.js b/server/backends/kokoro.js
// new file mode 100644
// index 0000000..147abfb
// --- /dev/null
// +++ b/server/backends/kokoro.js
// @@ -0,0 +1,140 @@
//
// Kokoro TTS backend — synthesizes a topic summary into speech via Spark
// Control's OpenAI-compatible /v1/audio/speech endpoint.
//
// Kokoro-82M (Apache-2.0, hexgrad/Kokoro-82M) replaced Magpie in Spark
// Control v0.14.0. Magpie's NVIDIA Riva decoder had a structural
// truncation defect that capped end-to-end reliability at ~85% even with
// server-side retries + chunking; Kokoro renders cleanly at any length
// (100% in our testing, ~1s for a ~100-word summary, no truncation). So
// this backend is a single pass-through call — NONE of the Magpie-era
// fragmenting, pacing/recovery-gap, duration-check, retry, or WAV
// stitching is needed or present.
//
// Output: 24kHz mono 16-bit PCM. Kokoro can emit wav/mp3/opus/flac
// directly via response_format, so we request the caller's format (mp3
// by default — small + universally playable for the mobile/offline
// player) and never transcode client-side.
durationSeconds is left null: +// Kokoro's WAV header carries a placeholder size field (bogus computed +// duration), and for mp3 we'd have to decode — the Recap side measures +// duration off the cached file /