Add TTS backends (ElevenLabs, Kokoro) and /relay/tts
This commit is contained in:
@@ -0,0 +1,140 @@
|
||||
// Kokoro TTS backend — synthesizes a topic summary into speech via Spark
// Control's OpenAI-compatible /v1/audio/speech endpoint.
//
// Kokoro-82M (Apache-2.0, hexgrad/Kokoro-82M) replaced Magpie in Spark
// Control v0.14.0. Magpie's NVIDIA Riva decoder had a structural
// truncation defect that capped end-to-end reliability at ~85% even with
// server-side retries + chunking; Kokoro renders cleanly at any length
// (100% in our testing, ~1s for a ~100-word summary, no truncation). So
// this backend is a single pass-through call — NONE of the Magpie-era
// fragmenting, pacing/recovery-gap, duration-check, duration-based retry,
// or WAV stitching is needed or present. The only retry left is a single
// re-attempt after a 5xx / network failure.
//
// Output: 24kHz mono 16-bit PCM. Kokoro can emit wav/mp3/opus/flac
// directly via response_format, so we request the caller's format (mp3
// by default — small + universally playable for the mobile/offline
// player) and never transcode client-side. durationSeconds is left null:
// Kokoro's WAV header carries a placeholder size field (so a duration
// computed from it is bogus), and for mp3 we'd have to decode — the Recap
// side measures duration off the cached file / <audio> element instead.
|
||||
|
||||
import { lanFetch } from "../lan-fetch.js";
|
||||
|
||||
// Default abort timeout (ms) applied to each speech request.
const DEFAULT_TIMEOUT_MS = 60_000;
// Voice used when neither the caller nor the backend config names one.
const DEFAULT_VOICE = "bm_george";
// Audio container requested when neither the caller nor the config names one.
const DEFAULT_FORMAT = "mp3";
// One retry on a 5xx / network blip (per the Spark Control dev's
// error-handling guidance: 4xx = real client error, 5xx = retry once).
// Kokoro doesn't truncate, so there's no duration-based retry.
const RETRY_ON_5XX = 1;

// Content-Type reported for each known response_format. Frozen because the
// table is shared by every backend instance and must not change at runtime.
const FORMAT_CONTENT_TYPE = Object.freeze({
  wav: "audio/wav",
  mp3: "audio/mpeg",
  opus: "audio/ogg",
  flac: "audio/flac",
});
|
||||
|
||||
/**
 * Resolves after the given delay; used to space out retry attempts.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<void>} Promise that resolves (with no value) once the timer fires.
 */
function sleepMs(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
/**
 * Builds a Kokoro TTS backend that POSTs text to `<base>/v1/audio/speech`
 * through `lanFetch` and returns the raw audio bytes.
 *
 * @param {object} [options]
 * @param {string} [options.sparkControlBaseURL=""] - Base URL of Spark Control.
 *   Surrounding whitespace, trailing slashes and a trailing `/api/endpoints`
 *   suffix are removed. An empty result leaves the backend unconfigured.
 * @param {string} [options.defaultVoice] - Voice used when a request names none.
 * @param {string} [options.defaultFormat] - response_format used when a request names none.
 * @param {number} [options.timeoutMs] - Abort timeout (ms) for each request.
 * @returns {{ hasTts: boolean, kind: string, synthesize: Function }} Backend
 *   object. `synthesize({ text, voice, format })` resolves to
 *   `{ audio, contentType, durationSeconds, voice, model, format, attempts }`
 *   and rejects with an Error carrying a numeric `status`: 503 when
 *   unconfigured, 400 for empty text, 502 for transport failures, or the
 *   upstream HTTP status for non-OK responses.
 */
export function createKokoroBackend({
  // Spark Control base URL (no path) — derived by the caller from
  // relay_spark_control_url with the /api/endpoints suffix stripped.
  sparkControlBaseURL = "",
  defaultVoice = DEFAULT_VOICE,
  defaultFormat = DEFAULT_FORMAT,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  // Strip every trailing slash both before and after removing the discovery
  // suffix, so no doubled "/" can end up in the request URL.
  const sparkBase = String(sparkControlBaseURL || "")
    .trim()
    .replace(/\/+$/, "")
    .replace(/\/api\/endpoints$/, "")
    .replace(/\/+$/, "");

  // Returns the first candidate that is a string with non-whitespace
  // content (trimmed), or "" when none qualifies.
  function firstNonBlank(...candidates) {
    for (const candidate of candidates) {
      if (typeof candidate === "string") {
        const trimmed = candidate.trim();
        if (trimmed !== "") return trimmed;
      }
    }
    return "";
  }

  // Wraps a transport-level failure as a 502 while keeping the original
  // error reachable through `cause`.
  function networkError(url, err) {
    const cause = err?.cause?.message || err?.cause?.code || err?.cause || "";
    const detail = cause ? `${err.message} (cause: ${cause})` : err?.message || String(err);
    const wrapped = new Error(`Kokoro TTS network error at ${url}: ${detail}`, { cause: err });
    wrapped.status = 502;
    return wrapped;
  }

  // Performs one speech request and resolves to the response body as a Buffer.
  async function callKokoro({ text, voice, format }) {
    const url = `${sparkBase}/v1/audio/speech`;
    let res;
    try {
      res = await lanFetch(url, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        redirect: "follow",
        body: JSON.stringify({
          model: "kokoro",
          input: text,
          voice,
          response_format: format,
        }),
        signal: AbortSignal.timeout(timeoutMs),
      });
    } catch (err) {
      throw networkError(url, err);
    }
    if (!res.ok) {
      let body = "";
      try {
        body = await res.text();
      } catch {
        // Best effort only: the status code alone still makes a usable error.
      }
      const e = new Error(`Kokoro TTS ${res.status} at ${url}: ${body.slice(0, 300)}`);
      e.status = res.status;
      throw e;
    }
    try {
      return Buffer.from(await res.arrayBuffer());
    } catch (err) {
      // A failure while downloading the body is a transport failure too, so
      // it gets the same 502 status instead of escaping without one.
      throw networkError(url, err);
    }
  }

  return {
    hasTts: !!sparkBase,
    kind: "kokoro",

    async synthesize({ text, voice, format } = {}) {
      if (!sparkBase) {
        const e = new Error(
          "Kokoro TTS is not configured — Spark Control discovery isn't reporting a ready kokoro endpoint"
        );
        e.status = 503;
        throw e;
      }
      // Non-string input is treated as empty so it surfaces as a 400 rather
      // than a status-less TypeError.
      const cleaned = (typeof text === "string" ? text : "").replace(/\s+/g, " ").trim();
      if (!cleaned) {
        const e = new Error("TTS input text is empty");
        e.status = 400;
        throw e;
      }
      // A blank or non-string request value falls through to the configured
      // default before the hard-coded one.
      const chosenVoice = firstNonBlank(voice, defaultVoice, DEFAULT_VOICE);
      const fmt = firstNonBlank(format, defaultFormat, DEFAULT_FORMAT).toLowerCase();
      // Own-property check: a format such as "constructor" must not resolve
      // to something inherited from Object.prototype.
      const contentType = Object.hasOwn(FORMAT_CONTENT_TYPE, fmt)
        ? FORMAT_CONTENT_TYPE[fmt]
        : "application/octet-stream";

      // Retry only on transient 5xx / transport failures; a 4xx (bad
      // voice/format) is deterministic and surfaces immediately.
      for (let attempt = 0; ; attempt += 1) {
        try {
          const audio = await callKokoro({ text: cleaned, voice: chosenVoice, format: fmt });
          return {
            audio,
            contentType,
            durationSeconds: null,
            voice: chosenVoice,
            model: "kokoro",
            format: fmt,
            attempts: attempt + 1,
          };
        } catch (err) {
          const status = err?.status || 0;
          const isClientError = status >= 400 && status < 500;
          if (isClientError || attempt >= RETRY_ON_5XX) throw err;
          console.warn(
            `[kokoro] TTS call failed (${status || "network"}) — retry ${attempt + 1}/${RETRY_ON_5XX}`
          );
          await sleepMs(500);
        }
      }
    },
  };
}
|
||||
Reference in New Issue
Block a user