Add TTS backends (ElevenLabs, Kokoro) and /relay/tts

2026-06-13 13:36:05 -05:00
parent 0aa648706e
commit 04dcf86fa4
3 changed files with 538 additions and 0 deletions
@@ -0,0 +1,109 @@
+// ElevenLabs TTS backend — the cloud alternative to operator-hardware
+// Kokoro, mirroring how the transcribe/analyze pipelines treat Gemini
+// as a swappable cloud fallback to the Parakeet/vLLM hardware path.
+//
+// Selected when relay_tts_backend_preference routes here (or when
+// Kokoro is unavailable and a fallback is allowed) AND
+// relay_elevenlabs_api_key is configured. Like Kokoro, ElevenLabs
+// handles full multi-sentence paragraphs in one call — one request per
+// topic summary, no chunking.
+//
+// NOTE: This path is implemented to ElevenLabs' documented API but is
+// UNTESTED against a live key (the operator hadn't supplied one at
+// build time). The Kokoro path is the tested default. Before relying on
+// ElevenLabs in production, set relay_elevenlabs_api_key +
+// relay_elevenlabs_voice_id and smoke-test one /relay/tts call.
+//
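+// Output: { audio: Buffer, contentType: "audio/mpeg", durationSeconds,
+//           sentenceCount, attempts, voice, model } — durationSeconds
+//           and sentenceCount are null (we don't decode the MP3 frame
+//           count here; the Recap server measures it when
+//           transcoding/caching), and attempts is always 1 because
+//           this backend makes a single request with no retry.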
+
+// Base URL of the ElevenLabs REST API; the text-to-speech path and
+// voice id are appended per request.
+const API_BASE = "https://api.elevenlabs.io/v1";
+// Model id sent as `model_id` when the caller does not supply one.
+const DEFAULT_MODEL = "eleven_turbo_v2_5";
+// Default request deadline in milliseconds (handed to AbortSignal.timeout).
+const DEFAULT_TIMEOUT_MS = 120_000;
+
+export function createElevenLabsBackend({
+  apiKey = "",
+  voiceId = "",
+  model = DEFAULT_MODEL,
+  timeoutMs = DEFAULT_TIMEOUT_MS,
+} = {}) {
+  const configured = !!(apiKey && voiceId);
+
+  return {
+    hasTts: configured,
+    kind: "elevenlabs",
+
+    async synthesize({ text, voice }) {
+      if (!apiKey) {
+        const e = new Error(
+          "ElevenLabs TTS is not configured — set relay_elevenlabs_api_key"
+        );
+        e.status = 503;
+        throw e;
+      }
+      // `voice` from the client overrides the operator default voice id
+      // when present (the Recap client may let a user pick a voice).
+      const chosenVoice = (voice || voiceId || "").trim();
+      if (!chosenVoice) {
+        const e = new Error(
+          "ElevenLabs TTS has no voice — set relay_elevenlabs_voice_id"
+        );
+        e.status = 503;
+        throw e;
+      }
+      const cleaned = (text || "").replace(/\s+/g, " ").trim();
+      if (!cleaned) {
+        const e = new Error("TTS input text is empty");
+        e.status = 400;
+        throw e;
+      }
+
+      const url = `${API_BASE}/text-to-speech/${encodeURIComponent(chosenVoice)}`;
+      let res;
+      try {
+        // Public-internet call — use the global fetch with full cert
+        // validation (NOT lanFetch, which is scoped to LAN/Spark Control).
+        res = await fetch(url, {
+          method: "POST",
+          headers: {
+            "xi-api-key": apiKey,
+            "Content-Type": "application/json",
+            Accept: "audio/mpeg",
+          },
+          body: JSON.stringify({
+            text: cleaned,
+            model_id: model || DEFAULT_MODEL,
+          }),
+          signal: AbortSignal.timeout(timeoutMs),
+        });
+      } catch (err) {
+        const e = new Error(`ElevenLabs TTS network error: ${err?.message || err}`);
+        e.status = 502;
+        throw e;
+      }
+      if (!res.ok) {
+        let body = "";
+        try {
+          body = await res.text();
+        } catch {}
+        const e = new Error(
+          `ElevenLabs TTS ${res.status}: ${body.slice(0, 300)}`
+        );
+        e.status = res.status;
+        throw e;
+      }
+      const audio = Buffer.from(await res.arrayBuffer());
+      return {
+        audio,
+        contentType: "audio/mpeg",
+        durationSeconds: null,
+        sentenceCount: null,
+        attempts: 1,
+        voice: chosenVoice,
+        model: model || DEFAULT_MODEL,
+      };
+    },
+  };
+}
@@ -0,0 +1,140 @@
+// Kokoro TTS backend — synthesizes a topic summary into speech via Spark
+// Control's OpenAI-compatible /v1/audio/speech endpoint.
+//
+// Kokoro-82M (Apache-2.0, hexgrad/Kokoro-82M) replaced Magpie in Spark
+// Control v0.14.0. Magpie's NVIDIA Riva decoder had a structural
+// truncation defect that capped end-to-end reliability at ~85% even with
+// server-side retries + chunking; Kokoro renders cleanly at any length
+// (100% in our testing, ~1s for a ~100-word summary, no truncation). So
+// this backend is a single pass-through call — NONE of the Magpie-era
+// fragmenting, pacing/recovery-gap, duration-check, retry, or WAV
+// stitching is needed or present.
+//
+// Output: 24kHz mono 16-bit PCM. Kokoro can emit wav/mp3/opus/flac
+// directly via response_format, so we request the caller's format (mp3
+// by default — small + universally playable for the mobile/offline
+// player) and never transcode client-side. durationSeconds is left null:
+// Kokoro's WAV header carries a placeholder size field (bogus computed
+// duration), and for mp3 we'd have to decode — the Recap side measures
+// duration off the cached file / <audio> element instead.
+
+import { lanFetch } from "../lan-fetch.js";
+
+// Default per-request deadline in milliseconds (handed to
+// AbortSignal.timeout for each call).
+const DEFAULT_TIMEOUT_MS = 60_000;
+// Voice used when neither the caller nor the backend options supply one.
+const DEFAULT_VOICE = "bm_george";
+// Response format requested when none is specified.
+const DEFAULT_FORMAT = "mp3";
+// One retry on a 5xx / network blip (per the Spark Control dev's
+// error-handling guidance: 4xx = real client error, 5xx = retry once).
+// Kokoro doesn't truncate, so there's no duration-based retry.
+const RETRY_ON_5XX = 1;
+
+// Maps a requested response format to the content type reported back
+// to the caller; a format missing from this table is reported as
+// "application/octet-stream".
+const FORMAT_CONTENT_TYPE = {
+  wav: "audio/wav",
+  mp3: "audio/mpeg",
+  opus: "audio/ogg",
+  flac: "audio/flac",
+};
+
+function sleepMs(ms) {
+  return new Promise((r) => setTimeout(r, ms));
+}
+
+export function createKokoroBackend({
+  // Spark Control base URL (no path) — derived by the caller from
+  // relay_spark_control_url with the /api/endpoints suffix stripped.
+  sparkControlBaseURL = "",
+  defaultVoice = DEFAULT_VOICE,
+  defaultFormat = DEFAULT_FORMAT,
+  timeoutMs = DEFAULT_TIMEOUT_MS,
+} = {}) {
+  const sparkBase = (sparkControlBaseURL || "")
+    .trim()
+    .replace(/\/$/, "")
+    .replace(/\/api\/endpoints$/, "");
+
+  async function callKokoro({ text, voice, format }) {
+    const url = `${sparkBase}/v1/audio/speech`;
+    let res;
+    try {
+      res = await lanFetch(url, {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        redirect: "follow",
+        body: JSON.stringify({
+          model: "kokoro",
+          input: text,
+          voice,
+          response_format: format,
+        }),
+        signal: AbortSignal.timeout(timeoutMs),
+      });
+    } catch (err) {
+      const cause = err?.cause?.message || err?.cause?.code || err?.cause || "";
+      const detail = cause ? `${err.message} (cause: ${cause})` : err?.message || String(err);
+      const e = new Error(`Kokoro TTS network error at ${url}: ${detail}`);
+      e.status = 502;
+      throw e;
+    }
+    if (!res.ok) {
+      let body = "";
+      try {
+        body = await res.text();
+      } catch {}
+      const e = new Error(`Kokoro TTS ${res.status} at ${url}: ${body.slice(0, 300)}`);
+      e.status = res.status;
+      throw e;
+    }
+    return Buffer.from(await res.arrayBuffer());
+  }
+
+  return {
+    hasTts: !!sparkBase,
+    kind: "kokoro",
+
+    async synthesize({ text, voice, format }) {
+      if (!sparkBase) {
+        const e = new Error(
+          "Kokoro TTS is not configured — Spark Control discovery isn't reporting a ready kokoro endpoint"
+        );
+        e.status = 503;
+        throw e;
+      }
+      const cleaned = (text || "").replace(/\s+/g, " ").trim();
+      if (!cleaned) {
+        const e = new Error("TTS input text is empty");
+        e.status = 400;
+        throw e;
+      }
+      const chosenVoice = (voice || defaultVoice || "").trim() || DEFAULT_VOICE;
+      const fmt = (format || defaultFormat || DEFAULT_FORMAT).toLowerCase();
+      const contentType = FORMAT_CONTENT_TYPE[fmt] || "application/octet-stream";
+
+      let attempt = 0;
+      // Retry only on transient 5xx; a 4xx (bad voice/format) is
+      // deterministic and surfaces immediately.
+      while (true) {
+        try {
+          const audio = await callKokoro({ text: cleaned, voice: chosenVoice, format: fmt });
+          return {
+            audio,
+            contentType,
+            durationSeconds: null,
+            voice: chosenVoice,
+            model: "kokoro",
+            format: fmt,
+            attempts: attempt + 1,
+          };
+        } catch (err) {
+          const status = err?.status || 0;
+          if (status >= 400 && status < 500) throw err; // client error → no retry
+          if (attempt >= RETRY_ON_5XX) throw err;
+          attempt += 1;
+          console.warn(
+            `[kokoro] TTS call failed (${status || "network"}) — retry ${attempt}/${RETRY_ON_5XX}`
+          );
+          await sleepMs(500);
+        }
+      }
+    },
+  };
+}