Add TTS backends (ElevenLabs, Kokoro) and /relay/tts

2026-06-13 13:36:05 -05:00
parent 0aa648706e
commit 04dcf86fa4
3 changed files with 538 additions and 0 deletions
@@ -0,0 +1,109 @@
+// ElevenLabs TTS backend — the cloud alternative to operator-hardware
+// Kokoro, mirroring how the transcribe/analyze pipelines treat Gemini
+// as a swappable cloud fallback to the Parakeet/vLLM hardware path.
+//
+// Selected when relay_tts_backend_preference routes here (or when
+// Kokoro is unavailable and a fallback is allowed) AND
+// relay_elevenlabs_api_key is configured. Like Kokoro, ElevenLabs
+// handles full multi-sentence paragraphs in one call — one request per
+// topic summary, no chunking.
+//
+// NOTE: This path is implemented to ElevenLabs' documented API but is
+// UNTESTED against a live key (the operator hadn't supplied one at
+// build time). The Kokoro path is the tested default. Before relying on
+// ElevenLabs in production, set relay_elevenlabs_api_key +
+// relay_elevenlabs_voice_id and smoke-test one /relay/tts call.
+//
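+// Output: { audio: Buffer, contentType: "audio/mpeg", durationSeconds,
+//           sentenceCount, attempts, voice, model } — durationSeconds
+//           and sentenceCount are always null (we don't decode the MP3
+//           frame count here; the Recap server measures it when
+//           transcoding/caching) and attempts is always 1 (this backend
+//           never retries).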
+
+// Module defaults: the per-request timeout (two minutes), the model
+// used when the caller passes none, and the ElevenLabs v1 REST root.
+const DEFAULT_TIMEOUT_MS = 2 * 60 * 1000;
+const DEFAULT_MODEL = "eleven_turbo_v2_5";
+const API_BASE = "https://api.elevenlabs.io/v1";
+
+/**
+ * Build the ElevenLabs (cloud) TTS backend.
+ *
+ * @param {object} [options]
+ * @param {string} [options.apiKey]    ElevenLabs API key, sent as `xi-api-key`.
+ * @param {string} [options.voiceId]   Operator default voice id.
+ * @param {string} [options.model]     ElevenLabs model id; falls back to DEFAULT_MODEL.
+ * @param {number} [options.timeoutMs] Abort the request after this many ms.
+ * @returns {{ hasTts: boolean, kind: "elevenlabs", synthesize: Function }}
+ *   `hasTts` is true only when both apiKey and voiceId are set.
+ *   `synthesize({ text, voice })` resolves to
+ *   `{ audio, contentType, durationSeconds, sentenceCount, attempts, voice, model }`
+ *   and rejects with an Error carrying a numeric `status`:
+ *   503 (no key / no voice), 400 (empty text), 502 (network failure or
+ *   unreadable body), or the upstream HTTP status on a non-2xx reply.
+ */
+export function createElevenLabsBackend({
+  apiKey = "",
+  voiceId = "",
+  model = DEFAULT_MODEL,
+  timeoutMs = DEFAULT_TIMEOUT_MS,
+} = {}) {
+  const configured = !!(apiKey && voiceId);
+  const modelId = model || DEFAULT_MODEL;
+
+  // Error carrying the HTTP status the caller should answer with; the
+  // underlying failure (if any) is kept as `cause` for diagnostics.
+  function statusError(status, message, cause) {
+    const e =
+      cause === undefined ? new Error(message) : new Error(message, { cause });
+    e.status = status;
+    return e;
+  }
+
+  // Trimmed string, or "" for anything that is not a string.
+  function trimmed(value) {
+    return typeof value === "string" ? value.trim() : "";
+  }
+
+  return {
+    hasTts: configured,
+    kind: "elevenlabs",
+
+    async synthesize({ text, voice }) {
+      if (!apiKey) {
+        throw statusError(
+          503,
+          "ElevenLabs TTS is not configured — set relay_elevenlabs_api_key"
+        );
+      }
+      // `voice` from the client overrides the operator default voice id
+      // when present. Each candidate is trimmed BEFORE the fallback so a
+      // blank/whitespace-only client voice still resolves to the
+      // operator default instead of failing with "no voice".
+      const chosenVoice = trimmed(voice) || trimmed(voiceId);
+      if (!chosenVoice) {
+        throw statusError(
+          503,
+          "ElevenLabs TTS has no voice — set relay_elevenlabs_voice_id"
+        );
+      }
+      // Collapse all whitespace runs; non-string input counts as empty
+      // so it surfaces as a 400 rather than a TypeError.
+      const cleaned = (typeof text === "string" ? text : "")
+        .replace(/\s+/g, " ")
+        .trim();
+      if (!cleaned) {
+        throw statusError(400, "TTS input text is empty");
+      }
+
+      const url = `${API_BASE}/text-to-speech/${encodeURIComponent(chosenVoice)}`;
+      let res;
+      let audio = null;
+      try {
+        // Public-internet call — use the global fetch with full cert
+        // validation (NOT lanFetch, which is scoped to LAN/Spark Control).
+        res = await fetch(url, {
+          method: "POST",
+          headers: {
+            "xi-api-key": apiKey,
+            "Content-Type": "application/json",
+            Accept: "audio/mpeg",
+          },
+          body: JSON.stringify({
+            text: cleaned,
+            model_id: modelId,
+          }),
+          signal: AbortSignal.timeout(timeoutMs),
+        });
+        // Read the body inside the same try: the timeout signal can
+        // still abort mid-download, and that is a network failure too.
+        if (res.ok) {
+          audio = Buffer.from(await res.arrayBuffer());
+        }
+      } catch (err) {
+        throw statusError(
+          502,
+          `ElevenLabs TTS network error: ${err?.message || err}`,
+          err
+        );
+      }
+      if (!res.ok) {
+        let body = "";
+        try {
+          body = await res.text();
+        } catch {
+          // Best effort only — the status line alone is still reported.
+        }
+        // NOTE(review): the upstream status is forwarded verbatim, so an
+        // ElevenLabs 401/402 (operator key or quota problem) reaches the
+        // route with the same code a client auth/credit failure would
+        // use — confirm that is what the Recap client should see.
+        throw statusError(
+          res.status,
+          `ElevenLabs TTS ${res.status}: ${body.slice(0, 300)}`
+        );
+      }
+      return {
+        audio,
+        contentType: "audio/mpeg",
+        durationSeconds: null,
+        sentenceCount: null,
+        attempts: 1,
+        voice: chosenVoice,
+        model: modelId,
+      };
+    },
+  };
+}
@@ -0,0 +1,140 @@
+// Kokoro TTS backend — synthesizes a topic summary into speech via Spark
+// Control's OpenAI-compatible /v1/audio/speech endpoint.
+//
+// Kokoro-82M (Apache-2.0, hexgrad/Kokoro-82M) replaced Magpie in Spark
+// Control v0.14.0. Magpie's NVIDIA Riva decoder had a structural
+// truncation defect that capped end-to-end reliability at ~85% even with
+// server-side retries + chunking; Kokoro renders cleanly at any length
+// (100% in our testing, ~1s for a ~100-word summary, no truncation). So
+// this backend is a single pass-through call — NONE of the Magpie-era
+// fragmenting, pacing/recovery-gap, duration-check, retry, or WAV
+// stitching is needed or present.
+//
+// Output: 24kHz mono 16-bit PCM. Kokoro can emit wav/mp3/opus/flac
+// directly via response_format, so we request the caller's format (mp3
+// by default — small + universally playable for the mobile/offline
+// player) and never transcode client-side. durationSeconds is left null:
+// Kokoro's WAV header carries a placeholder size field (bogus computed
+// duration), and for mp3 we'd have to decode — the Recap side measures
+// duration off the cached file / <audio> element instead.
+
+import { lanFetch } from "../lan-fetch.js";
+
+// Defaults applied when neither the caller nor operator config supplies
+// a value.
+const DEFAULT_FORMAT = "mp3";
+const DEFAULT_VOICE = "bm_george";
+const DEFAULT_TIMEOUT_MS = 60 * 1000;
+// Extra attempts allowed after a 5xx / network blip (Spark Control dev
+// guidance: 4xx = real client error, 5xx = retry once). Kokoro doesn't
+// truncate, so there is no duration-based retry.
+const RETRY_ON_5XX = 1;
+
+// Response Content-Type for each response_format this backend knows.
+// The table has a null prototype so that indexing it with an arbitrary
+// caller-supplied format (e.g. "constructor", "toString") yields
+// undefined — letting the caller's fallback apply — instead of an
+// inherited Object.prototype member. Frozen because it is shared
+// module-level state.
+const FORMAT_CONTENT_TYPE = Object.freeze(
+  Object.assign(Object.create(null), {
+    wav: "audio/wav",
+    mp3: "audio/mpeg",
+    opus: "audio/ogg",
+    flac: "audio/flac",
+  })
+);
+
+// Promise-based delay: settles (with no value) once a timer of `ms`
+// milliseconds fires.
+function sleepMs(ms) {
+  return new Promise((resolve) => {
+    setTimeout(resolve, ms);
+  });
+}
+
+/**
+ * Build the Kokoro TTS backend, which POSTs to
+ * `<sparkControlBaseURL>/v1/audio/speech` through lanFetch.
+ *
+ * @param {object} [options]
+ * @param {string} [options.sparkControlBaseURL] Spark Control base URL;
+ *   trailing slashes and a trailing `/api/endpoints` are stripped.
+ * @param {string} [options.defaultVoice]  Operator default voice.
+ * @param {string} [options.defaultFormat] Operator default response_format.
+ * @param {number} [options.timeoutMs]     Abort each attempt after this many ms.
+ * @returns {{ hasTts: boolean, kind: "kokoro", synthesize: Function }}
+ *   `hasTts` is true when a base URL remains after normalisation.
+ *   `synthesize({ text, voice, format })` resolves to
+ *   `{ audio, contentType, durationSeconds, voice, model, format, attempts }`
+ *   and rejects with an Error carrying `status`: 503 (no base URL),
+ *   400 (empty text), 502 (network failure) or the upstream status.
+ */
+export function createKokoroBackend({
+  // Spark Control base URL (no path) — derived by the caller from
+  // relay_spark_control_url with the /api/endpoints suffix stripped.
+  sparkControlBaseURL = "",
+  defaultVoice = DEFAULT_VOICE,
+  defaultFormat = DEFAULT_FORMAT,
+  timeoutMs = DEFAULT_TIMEOUT_MS,
+} = {}) {
+  // Trimmed string, or "" for anything that is not a string.
+  const trimmed = (value) => (typeof value === "string" ? value.trim() : "");
+
+  // Strip any run of trailing slashes both before and after removing
+  // the discovery suffix, so "http://h/api/endpoints//" and "http://h//"
+  // both normalise to "http://h".
+  const sparkBase = trimmed(sparkControlBaseURL)
+    .replace(/\/+$/, "")
+    .replace(/\/api\/endpoints$/, "")
+    .replace(/\/+$/, "");
+
+  // One HTTP attempt. Resolves to the audio bytes; rejects with a
+  // status-tagged Error (502 + isNetworkError for transport failures,
+  // the upstream status for non-2xx replies).
+  async function callKokoro({ text, voice, format }) {
+    const url = `${sparkBase}/v1/audio/speech`;
+    let res;
+    try {
+      res = await lanFetch(url, {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        redirect: "follow",
+        body: JSON.stringify({
+          model: "kokoro",
+          input: text,
+          voice,
+          response_format: format,
+        }),
+        signal: AbortSignal.timeout(timeoutMs),
+      });
+    } catch (err) {
+      // fetch-style errors often carry the useful detail on `cause`.
+      const cause = err?.cause?.message || err?.cause?.code || err?.cause || "";
+      const detail = cause ? `${err.message} (cause: ${cause})` : err?.message || String(err);
+      const e = new Error(`Kokoro TTS network error at ${url}: ${detail}`, { cause: err });
+      e.status = 502;
+      // Lets the retry log tell a transport failure from a real
+      // upstream 502 response.
+      e.isNetworkError = true;
+      throw e;
+    }
+    if (!res.ok) {
+      let body = "";
+      try {
+        body = await res.text();
+      } catch {
+        // Best effort only — the status line alone is still reported.
+      }
+      const e = new Error(`Kokoro TTS ${res.status} at ${url}: ${body.slice(0, 300)}`);
+      e.status = res.status;
+      throw e;
+    }
+    return Buffer.from(await res.arrayBuffer());
+  }
+
+  return {
+    hasTts: !!sparkBase,
+    kind: "kokoro",
+
+    async synthesize({ text, voice, format }) {
+      if (!sparkBase) {
+        const e = new Error(
+          "Kokoro TTS is not configured — Spark Control discovery isn't reporting a ready kokoro endpoint"
+        );
+        e.status = 503;
+        throw e;
+      }
+      // Collapse whitespace runs; non-string input counts as empty so it
+      // surfaces as a 400 rather than a TypeError.
+      const cleaned = (typeof text === "string" ? text : "")
+        .replace(/\s+/g, " ")
+        .trim();
+      if (!cleaned) {
+        const e = new Error("TTS input text is empty");
+        e.status = 400;
+        throw e;
+      }
+      // Trim each candidate BEFORE falling back, so a blank client value
+      // resolves to the operator default rather than skipping it.
+      const chosenVoice = trimmed(voice) || trimmed(defaultVoice) || DEFAULT_VOICE;
+      const fmt = (
+        trimmed(format) || trimmed(defaultFormat) || DEFAULT_FORMAT
+      ).toLowerCase();
+      // Own-property check: `fmt` can be caller-controlled, and a plain
+      // index would resolve inherited names such as "constructor".
+      const contentType = Object.hasOwn(FORMAT_CONTENT_TYPE, fmt)
+        ? FORMAT_CONTENT_TYPE[fmt]
+        : "application/octet-stream";
+
+      // A 4xx (bad voice/format) is deterministic and surfaces
+      // immediately; anything else (5xx, transport failure, or an error
+      // with no status) is retried up to RETRY_ON_5XX times.
+      for (let attempt = 0; ; attempt += 1) {
+        try {
+          const audio = await callKokoro({ text: cleaned, voice: chosenVoice, format: fmt });
+          return {
+            audio,
+            contentType,
+            durationSeconds: null,
+            voice: chosenVoice,
+            model: "kokoro",
+            format: fmt,
+            attempts: attempt + 1,
+          };
+        } catch (err) {
+          const status = err?.status || 0;
+          const isClientError = status >= 400 && status < 500;
+          if (isClientError || attempt >= RETRY_ON_5XX) throw err;
+          const label = err?.isNetworkError || !status ? "network" : status;
+          console.warn(
+            `[kokoro] TTS call failed (${label}) — retry ${attempt + 1}/${RETRY_ON_5XX}`
+          );
+          await sleepMs(500);
+        }
+      }
+    },
+  };
+}
@@ -0,0 +1,289 @@
+// POST /relay/tts — synthesize a topic summary into speech for the
+// Recap app's audio-first ("walking mode") player. Returns the raw
+// audio bytes (MP3 by default from Kokoro or ElevenLabs) with credit
+// metadata in response headers.
+//
+// Request:
+//   headers:
+//     X-Recap-Install-Id   (required)
+//     X-Recap-Job-Id       (optional but expected — credit dedup key;
+//                           the Recap client sends ONE id per recap so
+//                           synthesizing all N topics of a recap costs
+//                           at most 1 credit, like transcribe+analyze)
+//     Authorization        (optional Bearer LIC1-… for licensed tiers)
+//   body (application/json):
+//     { "text": "the topic summary to speak", "voice": "optional voice id",
+//       "format": "optional wav|mp3|opus|flac (Kokoro only; ElevenLabs is always mp3)" }
+//
+// Response (200):
+//   body:    raw audio bytes
+//   headers:
+//     Content-Type                audio/mpeg (default) | audio/wav | audio/ogg | audio/flac
+//     X-Recap-Tts-Backend         kokoro | elevenlabs
+//     X-Recap-Tts-Voice           voice id used
+//     X-Recap-Audio-Duration      seconds (omitted when the backend reports no duration)
+//     X-Recap-Credits-Remaining   number, or "unlimited"
+//     X-Recap-Tier                core | pro | max
+//     X-Recap-Credit-Charged      0 | 1
+//
+// Errors return the standard JSON errorEnvelope (so the client can keep
+// its credit pill accurate) with an appropriate status.
+//
+// Billing: 1 credit per unique job id, deduped exactly like transcribe.
+// Gated to Max users on the Recap side; the relay still enforces a
+// balance floor so a non-Max install can't drain TTS for free.
+
+import express from "express";
+import { resolveIdentity, identityTier } from "../identity.js";
+import {
+  getOrCreateRow,
+  commitCredit,
+  computeRemaining,
+  licenseFingerprint,
+} from "../credits.js";
+import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
+import { getConfigSnapshot, getTierQuotas } from "../config.js";
+import { resolveHardwareConfig } from "../hardware-config.js";
+import { createKokoroBackend } from "../backends/kokoro.js";
+import { createElevenLabsBackend } from "../backends/elevenlabs.js";
+import { errorEnvelope } from "./envelope.js";
+import { recordCall } from "../audit-log.js";
+
+// Pick which TTS backend serves this call given the operator preference
+// and what's actually available. Returns "kokoro" | "elevenlabs" | null
+// (null = nothing available → caller surfaces a 503).
+// Map the operator preference onto the available backends. An empty
+// preference, or any value other than the three handled explicitly,
+// behaves as "hardware_first".
+function pickTtsBackend({ preference, kokoroReady, elevenConfigured }) {
+  const hardware = kokoroReady ? "kokoro" : null;
+  const cloud = elevenConfigured ? "elevenlabs" : null;
+
+  switch (preference || "hardware_first") {
+    case "hardware_only":
+      return hardware;
+    case "cloud_only":
+      return cloud;
+    case "cloud_first":
+      return cloud ?? hardware;
+    default:
+      // hardware_first
+      return hardware ?? cloud;
+  }
+}
+
+/**
+ * Build the Express router exposing POST /tts.
+ *
+ * Per request the handler: resolves the caller identity, validates the
+ * JSON body (`text` required; `voice` / `format` optional), refuses
+ * with 402 when a finite credit balance is exhausted, picks a backend
+ * from operator preference + availability (503 when none), synthesizes,
+ * charges one credit unless the job id was already charged, writes an
+ * audit row, and sends the raw audio with X-Recap-* metadata headers.
+ * Failures are answered with the JSON body produced by errorEnvelope.
+ *
+ * @returns {import("express").Router}
+ */
+export function ttsRouter() {
+  const router = express.Router();
+
+  router.post("/tts", express.json({ limit: "256kb" }), async (req, res) => {
+    const t0 = Date.now();
+    const jobId = req.header("X-Recap-Job-Id") || null;
+
+    let identity;
+    try {
+      identity = await resolveIdentity(req);
+    } catch (err) {
+      const e = await errorEnvelope({
+        error: err?.message || "auth_error",
+        statusHint: err?.status || 401,
+      });
+      return res.status(e.statusHint || 401).json(e.body);
+    }
+    if (identity.kind === "license" && !identity.installId) {
+      const e = await errorEnvelope({
+        error: "missing X-Recap-Install-Id header",
+        statusHint: 400,
+      });
+      return res.status(400).json(e.body);
+    }
+    const { creditKey, installId, license } = identity;
+
+    const text = typeof req.body?.text === "string" ? req.body.text.trim() : "";
+    if (!text) {
+      const e = await errorEnvelope({
+        error: "missing 'text' in request body",
+        creditKey,
+        installId,
+        statusHint: 400,
+      });
+      return res.status(400).json(e.body);
+    }
+    const clientVoice =
+      typeof req.body?.voice === "string" ? req.body.voice.trim() : "";
+    // Optional output format override (wav|mp3|opus|flac). Kokoro emits
+    // any of these directly; default comes from config (mp3). ElevenLabs
+    // ignores it (always mp3).
+    const clientFormat =
+      typeof req.body?.format === "string" ? req.body.format.trim().toLowerCase() : "";
+
+    const row = await getOrCreateRow({ creditKey, installId, license });
+    const tier = identityTier(identity, row);
+    row.tier_snapshot = tier;
+    const licenseFp = identity.kind === "cloud" ? null : licenseFingerprint(license);
+    const auditInstall = installId || identity.userId || null;
+
+    // Single place that writes this request's audit rows. The shared
+    // identity/tier/job fields live here so every outcome — including
+    // success — is logged under the same install_id (auditInstall);
+    // `fields` overrides the zero/null defaults.
+    const audit = (fields) =>
+      recordCall({
+        install_id: auditInstall,
+        license_fingerprint: licenseFp,
+        tier,
+        pipeline: "tts",
+        backend: null,
+        model: null,
+        credit_charged: 0,
+        duration_ms: Date.now() - t0,
+        audio_seconds: 0,
+        cost_usd: 0,
+        job_id: jobId,
+        ...fields,
+      });
+
+    const cfg = await getConfigSnapshot();
+    const quota = await getTierQuotas();
+
+    // Balance floor — refuse only when the install has a finite balance
+    // that's exhausted (null total = unlimited, e.g. Max). Max users
+    // (the intended audience) always pass; this just stops a credit-less
+    // Core install from synthesizing for free if it bypasses the Recap-
+    // side Max gate.
+    const preBalance = computeRemaining(row, quota);
+    if (preBalance.total !== null && preBalance.total <= 0) {
+      await audit({ status: "refused", error: "no_credits" });
+      const e = await errorEnvelope({
+        error: "no_credits",
+        creditKey,
+        installId,
+        license,
+        tier,
+        statusHint: 402,
+      });
+      return res.status(402).json(e.body);
+    }
+
+    // Resolve availability + choose a backend.
+    const hw = await resolveHardwareConfig(cfg);
+    const kokoroReady = !!hw.tts?.url;
+    const elevenConfigured = !!(
+      cfg.relay_elevenlabs_api_key && cfg.relay_elevenlabs_voice_id
+    );
+    const preference = cfg.relay_tts_backend_preference || "hardware_first";
+    const chosen = pickTtsBackend({ preference, kokoroReady, elevenConfigured });
+    if (!chosen) {
+      const reason = hw.tts?.blocked_reason
+        ? hw.tts.blocked_reason
+        : "No TTS backend available — configure Spark Control (Kokoro) or relay_elevenlabs_api_key.";
+      await audit({ status: "error", error: reason });
+      const e = await errorEnvelope({
+        error: reason,
+        creditKey,
+        installId,
+        license,
+        tier,
+        statusHint: 503,
+      });
+      return res.status(503).json(e.body);
+    }
+
+    // Decouple billing from routing (same reasoning as transcribe): look
+    // up the job to decide whether to charge, but always synthesize.
+    // Awaited like the other credit/job calls: a no-op if lookupJob is
+    // synchronous, and if it returns a promise the un-awaited `!!promise`
+    // would always be true and no request would ever be charged.
+    // NOTE(review): concurrent requests sharing one job id can all pass
+    // this check before any of them is marked charged — confirm whether
+    // job-credits guards against double charging.
+    const reusedJob = !!(await lookupJob({ creditKey, installId, license, jobId }));
+
+    let result;
+    try {
+      if (chosen === "kokoro") {
+        const backend = createKokoroBackend({
+          sparkControlBaseURL: hw.sparkBase || hw.tts.url || "",
+          defaultVoice: cfg.relay_tts_default_voice,
+          defaultFormat: cfg.relay_tts_format,
+        });
+        result = await backend.synthesize({ text, voice: clientVoice, format: clientFormat });
+      } else {
+        const backend = createElevenLabsBackend({
+          apiKey: cfg.relay_elevenlabs_api_key,
+          voiceId: cfg.relay_elevenlabs_voice_id,
+          model: cfg.relay_elevenlabs_model,
+        });
+        result = await backend.synthesize({ text, voice: clientVoice });
+      }
+    } catch (err) {
+      console.error(`[relay/tts] ${chosen} backend error: ${err?.message}`);
+      await audit({
+        backend: chosen,
+        model: chosen === "kokoro" ? "kokoro" : cfg.relay_elevenlabs_model,
+        status: "error",
+        error: (err?.message || String(err)).slice(0, 200),
+      });
+      const e = await errorEnvelope({
+        error: err?.message || "tts_backend_error",
+        creditKey,
+        installId,
+        license,
+        tier,
+        statusHint: err?.status || 502,
+      });
+      // Same defensive fallback as the auth path: never hand
+      // res.status() an undefined code.
+      return res.status(e.statusHint || 502).json(e.body);
+    }
+
+    // Charge once per job id. Operator hardware (Kokoro) is fixed-cost
+    // so cost_usd stays 0; ElevenLabs has a real per-char cost but we
+    // don't have its billing API wired, so 0 here too (audit shows the
+    // call happened; margin tracking for ElevenLabs is a later add).
+    let creditCharged = 0;
+    if (!reusedJob) {
+      await commitCredit({ creditKey, installId, license, backend: chosen, tier });
+      await markJobCharged({ creditKey, installId, license, jobId, backend: chosen, tier });
+      creditCharged = 1;
+    }
+
+    await audit({
+      backend: chosen,
+      model: result?.model || null,
+      status: "success",
+      credit_charged: creditCharged,
+      audio_seconds: result?.durationSeconds || 0,
+      attempts: result?.attempts || null,
+    });
+
+    // Post-charge balance for the client's credit pill.
+    // NOTE(review): this reuses the `row` fetched before the charge —
+    // presumably commitCredit updates it in place; verify.
+    const balance = computeRemaining(row, quota);
+
+    res.set("Content-Type", result.contentType || "audio/wav");
+    res.set("X-Recap-Tts-Backend", chosen);
+    if (result.voice) res.set("X-Recap-Tts-Voice", result.voice);
+    if (result.durationSeconds != null) {
+      res.set("X-Recap-Audio-Duration", String(result.durationSeconds));
+    }
+    res.set(
+      "X-Recap-Credits-Remaining",
+      balance.total == null ? "unlimited" : String(balance.total)
+    );
+    res.set("X-Recap-Tier", tier);
+    res.set("X-Recap-Credit-Charged", String(creditCharged));
+    res.set("Cache-Control", "no-store");
+    return res.send(result.audio);
+  });
+
+  return router;
+}