Add TTS backends (ElevenLabs, Kokoro) and /relay/tts

This commit is contained in:
Keysat
2026-06-13 13:36:05 -05:00
parent 0aa648706e
commit 04dcf86fa4
3 changed files with 538 additions and 0 deletions
+109
View File
@@ -0,0 +1,109 @@
// ElevenLabs TTS backend — the cloud alternative to operator-hardware
// Kokoro, mirroring how the transcribe/analyze pipelines treat Gemini
// as a swappable cloud fallback to the Parakeet/vLLM hardware path.
//
// Selected when relay_tts_backend_preference routes here (or when
// Kokoro is unavailable and a fallback is allowed) AND
// relay_elevenlabs_api_key is configured. Like Kokoro, ElevenLabs
// handles full multi-sentence paragraphs in one call — one request per
// topic summary, no chunking.
//
// NOTE: This path is implemented to ElevenLabs' documented API but is
// UNTESTED against a live key (the operator hadn't supplied one at
// build time). The Kokoro path is the tested default. Before relying on
// ElevenLabs in production, set relay_elevenlabs_api_key +
// relay_elevenlabs_voice_id and smoke-test one /relay/tts call.
//
// Output: { audio: Buffer, contentType: "audio/mpeg", durationSeconds,
// voice, model } — durationSeconds is null (we don't decode
// the MP3 frame count here; the Recap server measures it when
// transcoding/caching).
// Base URL of the ElevenLabs v1 API; the text-to-speech path and the
// voice id are appended per request.
const API_BASE = "https://api.elevenlabs.io/v1";
// Model id sent as `model_id` when the caller does not configure one.
const DEFAULT_MODEL = "eleven_turbo_v2_5";
// Default request timeout in milliseconds (two minutes).
const DEFAULT_TIMEOUT_MS = 120_000;
/**
 * Create the ElevenLabs text-to-speech backend.
 *
 * @param {object} [options]
 * @param {string} [options.apiKey=""] - Sent as the `xi-api-key` header.
 * @param {string} [options.voiceId=""] - Operator default voice id, used
 *   when the caller supplies no usable `voice`.
 * @param {string} [options.model] - Sent as `model_id`; an empty value
 *   falls back to DEFAULT_MODEL.
 * @param {number} [options.timeoutMs] - Passed to AbortSignal.timeout for
 *   the upstream request.
 * @returns {{ hasTts: boolean, kind: string, synthesize: Function }}
 *   `hasTts` is true only when both apiKey and voiceId are set.
 *
 * `synthesize({ text, voice })` resolves to
 * `{ audio, contentType, durationSeconds, sentenceCount, attempts, voice,
 * model, format }` and rejects with an Error carrying a numeric `status`:
 *   - 503 when the API key or a voice id is missing,
 *   - 400 when `text` is empty, whitespace-only, or not a string,
 *   - 502 when the request or the audio body download fails,
 *   - the upstream HTTP status when ElevenLabs answers non-2xx.
 */
export function createElevenLabsBackend({
  apiKey = "",
  voiceId = "",
  model = DEFAULT_MODEL,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  const configured = !!(apiKey && voiceId);
  const modelId = model || DEFAULT_MODEL;

  // Build an Error tagged with the HTTP status to report, keeping the
  // original failure (if any) reachable through `cause`.
  const fail = (status, message, cause) => {
    const e =
      cause === undefined ? new Error(message) : new Error(message, { cause });
    e.status = status;
    return e;
  };

  // Trimmed string, or "" for anything that is not a string. Keeps a
  // malformed client value from throwing a TypeError.
  const trimmed = (value) => (typeof value === "string" ? value.trim() : "");

  return {
    hasTts: configured,
    kind: "elevenlabs",
    async synthesize({ text, voice } = {}) {
      if (!apiKey) {
        throw fail(
          503,
          "ElevenLabs TTS is not configured — set relay_elevenlabs_api_key"
        );
      }

      // `voice` from the client overrides the operator default voice id
      // when present. Each candidate is trimmed BEFORE the fallback so a
      // blank or non-string client value falls through to the operator
      // voice instead of producing a spurious "no voice" error.
      const chosenVoice = trimmed(voice) || trimmed(voiceId);
      if (!chosenVoice) {
        throw fail(
          503,
          "ElevenLabs TTS has no voice — set relay_elevenlabs_voice_id"
        );
      }

      // Collapse all whitespace runs to single spaces; non-string input
      // is treated as empty.
      const cleaned =
        typeof text === "string" ? text.replace(/\s+/g, " ").trim() : "";
      if (!cleaned) {
        throw fail(400, "TTS input text is empty");
      }

      const url = `${API_BASE}/text-to-speech/${encodeURIComponent(chosenVoice)}`;

      let res;
      try {
        // Public-internet call — use the global fetch with full cert
        // validation (NOT lanFetch, which is scoped to LAN/Spark Control).
        res = await fetch(url, {
          method: "POST",
          headers: {
            "xi-api-key": apiKey,
            "Content-Type": "application/json",
            Accept: "audio/mpeg",
          },
          body: JSON.stringify({
            text: cleaned,
            model_id: modelId,
          }),
          signal: AbortSignal.timeout(timeoutMs),
        });
      } catch (err) {
        throw fail(
          502,
          `ElevenLabs TTS network error: ${err?.message || err}`,
          err
        );
      }

      if (!res.ok) {
        let body = "";
        try {
          body = await res.text();
        } catch {
          // Best effort: the status alone is still a useful error.
        }
        throw fail(
          res.status,
          `ElevenLabs TTS ${res.status}: ${body.slice(0, 300)}`
        );
      }

      // Reading the body can still fail after the headers arrived; report
      // that like any other upstream failure rather than leaking an error
      // without a status.
      let audio;
      try {
        audio = Buffer.from(await res.arrayBuffer());
      } catch (err) {
        throw fail(
          502,
          `ElevenLabs TTS failed reading audio body: ${err?.message || err}`,
          err
        );
      }

      return {
        audio,
        contentType: "audio/mpeg",
        durationSeconds: null,
        sentenceCount: null,
        attempts: 1,
        voice: chosenVoice,
        model: modelId,
        // Same key the Kokoro backend reports, so callers can treat both
        // result shapes alike.
        format: "mp3",
      };
    },
  };
}
+140
View File
@@ -0,0 +1,140 @@
// Kokoro TTS backend — synthesizes a topic summary into speech via Spark
// Control's OpenAI-compatible /v1/audio/speech endpoint.
//
// Kokoro-82M (Apache-2.0, hexgrad/Kokoro-82M) replaced Magpie in Spark
// Control v0.14.0. Magpie's NVIDIA Riva decoder had a structural
// truncation defect that capped end-to-end reliability at ~85% even with
// server-side retries + chunking; Kokoro renders cleanly at any length
// (100% in our testing, ~1s for a ~100-word summary, no truncation). So
// this backend is a single pass-through call — NONE of the Magpie-era
// fragmenting, pacing/recovery-gap, duration-check, retry, or WAV
// stitching is needed or present.
//
// Output: 24kHz mono 16-bit PCM. Kokoro can emit wav/mp3/opus/flac
// directly via response_format, so we request the caller's format (mp3
// by default — small + universally playable for the mobile/offline
// player) and never transcode client-side. durationSeconds is left null:
// Kokoro's WAV header carries a placeholder size field (bogus computed
// duration), and for mp3 we'd have to decode — the Recap side measures
// duration off the cached file / <audio> element instead.
import { lanFetch } from "../lan-fetch.js";
// Default request timeout in milliseconds (one minute).
const DEFAULT_TIMEOUT_MS = 60_000;
// Voice used when neither the caller nor the backend options supply one.
const DEFAULT_VOICE = "bm_george";
// Response format requested when neither the caller nor the backend
// options supply one.
const DEFAULT_FORMAT = "mp3";
// One retry on a 5xx / network blip (per the Spark Control dev's
// error-handling guidance: 4xx = real client error, 5xx = retry once).
// Kokoro doesn't truncate, so there's no duration-based retry.
const RETRY_ON_5XX = 1;
// Content-Type reported to callers for each response_format value.
// Formats not listed here are reported as "application/octet-stream"
// by synthesize.
const FORMAT_CONTENT_TYPE = {
wav: "audio/wav",
mp3: "audio/mpeg",
opus: "audio/ogg",
flac: "audio/flac",
};
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<undefined>} Settles once the timer fires.
 */
function sleepMs(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Create the Kokoro text-to-speech backend, which posts to Spark
 * Control's OpenAI-compatible /v1/audio/speech endpoint through lanFetch.
 *
 * @param {object} [options]
 * @param {string} [options.sparkControlBaseURL=""] - Spark Control base
 *   URL. Surrounding whitespace, trailing slashes and a trailing
 *   `/api/endpoints` suffix are stripped.
 * @param {string} [options.defaultVoice] - Voice used when the caller
 *   supplies no usable `voice`.
 * @param {string} [options.defaultFormat] - Format used when the caller
 *   supplies no usable `format`.
 * @param {number} [options.timeoutMs] - Passed to AbortSignal.timeout for
 *   each upstream attempt.
 * @returns {{ hasTts: boolean, kind: string, synthesize: Function }}
 *   `hasTts` is true when a base URL remains after normalization.
 *
 * `synthesize({ text, voice, format })` resolves to
 * `{ audio, contentType, durationSeconds, voice, model, format, attempts }`
 * and rejects with an Error carrying a numeric `status`:
 *   - 503 when no base URL is configured,
 *   - 400 when `text` is empty, whitespace-only, or not a string,
 *   - 502 when the request or the audio body download fails,
 *   - the upstream HTTP status when Kokoro answers non-2xx.
 * Anything other than a 4xx is retried up to RETRY_ON_5XX times.
 */
export function createKokoroBackend({
  // Spark Control base URL (no path) — derived by the caller from
  // relay_spark_control_url with the /api/endpoints suffix stripped.
  sparkControlBaseURL = "",
  defaultVoice = DEFAULT_VOICE,
  defaultFormat = DEFAULT_FORMAT,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  // Trimmed string, or "" for anything that is not a string. Keeps a
  // malformed client or config value from throwing a TypeError.
  const trimmed = (value) => (typeof value === "string" ? value.trim() : "");

  // Build an Error tagged with the HTTP status to report, keeping the
  // original failure (if any) reachable through `cause`.
  const fail = (status, message, cause) => {
    const e =
      cause === undefined ? new Error(message) : new Error(message, { cause });
    e.status = status;
    return e;
  };

  // Strip every trailing slash both before and after removing the
  // discovery suffix, so ".../api/endpoints//" normalizes the same way as
  // ".../api/endpoints".
  const sparkBase = trimmed(sparkControlBaseURL)
    .replace(/\/+$/, "")
    .replace(/\/api\/endpoints$/, "")
    .replace(/\/+$/, "");

  // One upstream attempt: POST the text and return the audio bytes.
  async function callKokoro({ text, voice, format }) {
    const url = `${sparkBase}/v1/audio/speech`;

    let res;
    try {
      res = await lanFetch(url, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        redirect: "follow",
        body: JSON.stringify({
          model: "kokoro",
          input: text,
          voice,
          response_format: format,
        }),
        signal: AbortSignal.timeout(timeoutMs),
      });
    } catch (err) {
      // Surface the nested cause in the message when the error has one.
      const cause = err?.cause?.message || err?.cause?.code || err?.cause || "";
      const detail = cause
        ? `${err.message} (cause: ${cause})`
        : err?.message || String(err);
      throw fail(502, `Kokoro TTS network error at ${url}: ${detail}`, err);
    }

    if (!res.ok) {
      let body = "";
      try {
        body = await res.text();
      } catch {
        // Best effort: the status alone is still a useful error.
      }
      throw fail(
        res.status,
        `Kokoro TTS ${res.status} at ${url}: ${body.slice(0, 300)}`
      );
    }

    // Reading the body can still fail after the headers arrived; tag it
    // 502 so callers always see a status and the retry loop treats it as
    // transient.
    try {
      return Buffer.from(await res.arrayBuffer());
    } catch (err) {
      throw fail(
        502,
        `Kokoro TTS failed reading audio body at ${url}: ${err?.message || err}`,
        err
      );
    }
  }

  return {
    hasTts: !!sparkBase,
    kind: "kokoro",
    async synthesize({ text, voice, format } = {}) {
      if (!sparkBase) {
        throw fail(
          503,
          "Kokoro TTS is not configured — Spark Control discovery isn't reporting a ready kokoro endpoint"
        );
      }

      // Collapse all whitespace runs to single spaces; non-string input
      // is treated as empty.
      const cleaned =
        typeof text === "string" ? text.replace(/\s+/g, " ").trim() : "";
      if (!cleaned) {
        throw fail(400, "TTS input text is empty");
      }

      // Trim each candidate BEFORE falling back, so a blank client value
      // reaches the operator default instead of skipping it.
      const chosenVoice = trimmed(voice) || trimmed(defaultVoice) || DEFAULT_VOICE;
      const fmt = (
        trimmed(format) ||
        trimmed(defaultFormat) ||
        DEFAULT_FORMAT
      ).toLowerCase();

      // Own-property check: a plain lookup would resolve inherited keys
      // such as "constructor" to a function.
      const contentType = Object.hasOwn(FORMAT_CONTENT_TYPE, fmt)
        ? FORMAT_CONTENT_TYPE[fmt]
        : "application/octet-stream";

      // Retry only on transient failures; a 4xx (bad voice/format) is
      // deterministic and surfaces immediately.
      for (let attempt = 0; ; attempt += 1) {
        try {
          const audio = await callKokoro({
            text: cleaned,
            voice: chosenVoice,
            format: fmt,
          });
          return {
            audio,
            contentType,
            durationSeconds: null,
            voice: chosenVoice,
            model: "kokoro",
            format: fmt,
            attempts: attempt + 1,
          };
        } catch (err) {
          const status = err?.status || 0;
          const isClientError = status >= 400 && status < 500;
          if (isClientError || attempt >= RETRY_ON_5XX) throw err;
          console.warn(
            `[kokoro] TTS call failed (${status || "network"}) — retry ${attempt + 1}/${RETRY_ON_5XX}`
          );
          await sleepMs(500);
        }
      }
    },
  };
}