Add TTS backends (ElevenLabs, Kokoro) and /relay/tts

2026-06-13 13:36:05 -05:00
parent 0aa648706e
commit 04dcf86fa4
3 changed files with 538 additions and 0 deletions
@@ -0,0 +1,109 @@
 // ElevenLabs TTS backend — the cloud alternative to operator-hardware
 // Kokoro, mirroring how the transcribe/analyze pipelines treat Gemini
 // as a swappable cloud fallback to the Parakeet/vLLM hardware path.
 //
 // Selected when relay_tts_backend_preference routes here (or when
 // Kokoro is unavailable and a fallback is allowed) AND both
 // relay_elevenlabs_api_key and relay_elevenlabs_voice_id are
 // configured. Like Kokoro, ElevenLabs handles full multi-sentence
 // paragraphs in one call — one request per topic summary, no chunking.
 //
 // NOTE: This path is implemented to ElevenLabs' documented API but is
 // UNTESTED against a live key (the operator hadn't supplied one at
 // build time). The Kokoro path is the tested default. Before relying on
 // ElevenLabs in production, set relay_elevenlabs_api_key +
 // relay_elevenlabs_voice_id and smoke-test one /relay/tts call.
 //
 // Output: { audio: Buffer, contentType: "audio/mpeg", durationSeconds,
 //           sentenceCount, attempts, voice, model } — durationSeconds
 //           and sentenceCount are null (we don't decode the MP3 frame
 //           count here; the Recap server measures it when
 //           transcoding/caching) and attempts is always 1 (no retry).
 // Root of the ElevenLabs REST API; synthesize() appends
 // /text-to-speech/<voice id> to it.
 const API_BASE = "https://api.elevenlabs.io/v1";
 // Model id sent as `model_id` when the caller supplies no model.
 const DEFAULT_MODEL = "eleven_turbo_v2_5";
 // Default abort timeout for one synthesis request, in milliseconds.
 const DEFAULT_TIMEOUT_MS = 120_000;
 /**
  * Create the ElevenLabs (cloud) text-to-speech backend.
  *
  * @param {object} [options]
  * @param {string} [options.apiKey=""] - Sent as the `xi-api-key` header.
  * @param {string} [options.voiceId=""] - Operator default voice id, used
  *   whenever the caller passes no usable `voice`.
  * @param {string} [options.model=DEFAULT_MODEL] - Sent as `model_id`.
  * @param {number} [options.timeoutMs=DEFAULT_TIMEOUT_MS] - Abort timeout
  *   (ms) for the HTTP request.
  * @returns {{ hasTts: boolean, kind: "elevenlabs", synthesize: Function }}
  *   `hasTts` is true only when both apiKey and voiceId are set.
  *
  * `synthesize({ text, voice })` resolves to
  * `{ audio, contentType, durationSeconds, sentenceCount, attempts, voice, model }`
  * and rejects with an Error carrying a numeric `status`:
  *   503 — no API key / no voice available,
  *   400 — text is empty after whitespace collapsing,
  *   502 — network failure (original error on `cause`) or an upstream
  *         401/402/403,
  *   otherwise the upstream HTTP status. `upstreamStatus` always holds the
  *   raw upstream status for non-2xx replies.
  */
 export function createElevenLabsBackend({
   apiKey = "",
   voiceId = "",
   model = DEFAULT_MODEL,
   timeoutMs = DEFAULT_TIMEOUT_MS,
 } = {}) {
   const configured = !!(apiKey && voiceId);
   // Upstream statuses that describe the OPERATOR's ElevenLabs account
   // (bad key, billing, forbidden). Forwarding them verbatim would make the
   // Recap client believe its own auth/credits failed, so they are
   // reported as a bad-gateway instead.
   const operatorAccountStatuses = new Set([401, 402, 403]);
   // Build an Error tagged with the HTTP status the route should answer with.
   const httpError = (message, status, options) => {
     const e = new Error(message, options);
     e.status = status;
     return e;
   };
   // Trimmed string, or "" for anything that is not a string.
   const trimmedOrEmpty = (value) =>
     typeof value === "string" ? value.trim() : "";
   return {
     hasTts: configured,
     kind: "elevenlabs",
     async synthesize({ text, voice }) {
       if (!apiKey) {
         throw httpError(
           "ElevenLabs TTS is not configured — set relay_elevenlabs_api_key",
           503
         );
       }
       // `voice` from the client overrides the operator default voice id,
       // but only when it is non-blank: a whitespace-only value must fall
       // back to the configured default rather than erase it.
       const chosenVoice = trimmedOrEmpty(voice) || trimmedOrEmpty(voiceId);
       if (!chosenVoice) {
         throw httpError(
           "ElevenLabs TTS has no voice — set relay_elevenlabs_voice_id",
           503
         );
       }
       // Collapse all whitespace runs; non-string input counts as empty.
       const cleaned = (typeof text === "string" ? text : "")
         .replace(/\s+/g, " ")
         .trim();
       if (!cleaned) {
         throw httpError("TTS input text is empty", 400);
       }
       const modelId = model || DEFAULT_MODEL;
       const url = `${API_BASE}/text-to-speech/${encodeURIComponent(chosenVoice)}`;
       let res;
       try {
         // Public-internet call — use the global fetch with full cert
         // validation (NOT lanFetch, which is scoped to LAN/Spark Control).
         res = await fetch(url, {
           method: "POST",
           headers: {
             "xi-api-key": apiKey,
             "Content-Type": "application/json",
             Accept: "audio/mpeg",
           },
           body: JSON.stringify({
             text: cleaned,
             model_id: modelId,
           }),
           signal: AbortSignal.timeout(timeoutMs),
         });
       } catch (err) {
         // Covers DNS/TLS/socket failures and the abort timeout alike.
         throw httpError(
           `ElevenLabs TTS network error: ${err?.message || err}`,
           502,
           { cause: err }
         );
       }
       if (!res.ok) {
         let body = "";
         try {
           body = await res.text();
         } catch {
           // Best effort: the status alone is still a useful error.
         }
         const e = httpError(
           `ElevenLabs TTS ${res.status}: ${body.slice(0, 300)}`,
           operatorAccountStatuses.has(res.status) ? 502 : res.status
         );
         e.upstreamStatus = res.status;
         throw e;
       }
       const audio = Buffer.from(await res.arrayBuffer());
       return {
         audio,
         contentType: "audio/mpeg",
         durationSeconds: null,
         sentenceCount: null,
         attempts: 1,
         voice: chosenVoice,
         model: modelId,
       };
     },
   };
 }
@@ -0,0 +1,140 @@
 // Kokoro TTS backend — synthesizes a topic summary into speech via Spark
 // Control's OpenAI-compatible /v1/audio/speech endpoint.
 //
 // Kokoro-82M (Apache-2.0, hexgrad/Kokoro-82M) replaced Magpie in Spark
 // Control v0.14.0. Magpie's NVIDIA Riva decoder had a structural
 // truncation defect that capped end-to-end reliability at ~85% even with
 // server-side retries + chunking; Kokoro renders cleanly at any length
 // (100% in our testing, ~1s for a ~100-word summary, no truncation). So
 // this backend is a single pass-through call — NONE of the Magpie-era
 // fragmenting, pacing/recovery-gap, duration-check, retry, or WAV
 // stitching is needed or present.
 //
 // Output: 24kHz mono 16-bit PCM. Kokoro can emit wav/mp3/opus/flac
 // directly via response_format, so we request the caller's format (mp3
 // by default — small + universally playable for the mobile/offline
 // player) and never transcode client-side. durationSeconds is left null:
 // Kokoro's WAV header carries a placeholder size field (bogus computed
 // duration), and for mp3 we'd have to decode — the Recap side measures
 // duration off the cached file / <audio> element instead.
 import { lanFetch } from "../lan-fetch.js";
 // Default abort timeout (ms) for a single Kokoro request; used when the
 // factory caller passes no timeoutMs.
 const DEFAULT_TIMEOUT_MS = 60_000;
 // Fallback voice when neither the request nor the factory options name one.
 const DEFAULT_VOICE = "bm_george";
 // Fallback response_format when neither the request nor the factory
 // options name one.
 const DEFAULT_FORMAT = "mp3";
 // One retry on a 5xx / network blip (per the Spark Control dev's
 // error-handling guidance: 4xx = real client error, 5xx = retry once).
 // Kokoro doesn't truncate, so there's no duration-based retry.
 const RETRY_ON_5XX = 1;
 // Maps the requested response_format to the Content-Type reported to the
 // caller; formats missing here are reported as application/octet-stream.
 const FORMAT_CONTENT_TYPE = {
  wav: "audio/wav",
  mp3: "audio/mpeg",
  opus: "audio/ogg",
  flac: "audio/flac",
 };
 /**
  * Pause helper: returns a promise that settles (with no value) once a
  * timer of `ms` milliseconds has fired.
  * @param {number} ms - Delay handed to setTimeout.
  * @returns {Promise<void>}
  */
 function sleepMs(ms) {
   return new Promise((wake) => {
     setTimeout(wake, ms);
   });
 }
 /**
  * Create the Kokoro TTS backend, which posts to Spark Control's
  * OpenAI-compatible /v1/audio/speech endpoint through lanFetch.
  *
  * @param {object} [options]
  * @param {string} [options.sparkControlBaseURL=""] - Spark Control base
  *   URL; one trailing "/" and a trailing "/api/endpoints" are stripped.
  * @param {string} [options.defaultVoice=DEFAULT_VOICE] - Operator default
  *   voice, used when the request names no usable voice.
  * @param {string} [options.defaultFormat=DEFAULT_FORMAT] - Operator
  *   default response_format.
  * @param {number} [options.timeoutMs=DEFAULT_TIMEOUT_MS] - Abort timeout
  *   (ms) applied to each individual attempt.
  * @returns {{ hasTts: boolean, kind: "kokoro", synthesize: Function }}
  *   `hasTts` is true when a base URL is configured.
  *
  * `synthesize({ text, voice, format })` resolves to
  * `{ audio, contentType, durationSeconds, voice, model, format, attempts }`
  * and rejects with an Error carrying `status`: 503 (no base URL), 400
  * (empty text), 502 (network failure, original error on `cause`), or the
  * upstream HTTP status. Non-4xx failures are retried RETRY_ON_5XX times.
  */
 export function createKokoroBackend({
   // Spark Control base URL (no path) — derived by the caller from
   // relay_spark_control_url with the /api/endpoints suffix stripped.
   sparkControlBaseURL = "",
   defaultVoice = DEFAULT_VOICE,
   defaultFormat = DEFAULT_FORMAT,
   timeoutMs = DEFAULT_TIMEOUT_MS,
 } = {}) {
   const sparkBase = (sparkControlBaseURL || "")
     .trim()
     .replace(/\/$/, "")
     .replace(/\/api\/endpoints$/, "");
   // First candidate that is a non-blank string, trimmed; "" if none is.
   // Blank/whitespace-only values are skipped so they cannot shadow a
   // later, real default.
   const firstNonBlank = (...candidates) => {
     for (const candidate of candidates) {
       if (typeof candidate === "string" && candidate.trim()) {
         return candidate.trim();
       }
     }
     return "";
   };
   // One HTTP round-trip to Kokoro; resolves to the audio bytes or throws
   // a status-tagged Error.
   async function callKokoro({ text, voice, format }) {
     const url = `${sparkBase}/v1/audio/speech`;
     let res;
     try {
       res = await lanFetch(url, {
         method: "POST",
         headers: { "Content-Type": "application/json" },
         redirect: "follow",
         body: JSON.stringify({
           model: "kokoro",
           input: text,
           voice,
           response_format: format,
         }),
         signal: AbortSignal.timeout(timeoutMs),
       });
     } catch (err) {
       // Surface the nested cause text in the message: the top-level
       // message alone may omit it.
       const cause = err?.cause?.message || err?.cause?.code || err?.cause || "";
       const detail = cause ? `${err.message} (cause: ${cause})` : err?.message || String(err);
       const e = new Error(`Kokoro TTS network error at ${url}: ${detail}`, { cause: err });
       e.status = 502;
       // Lets the retry log tell a transport failure from an upstream 502.
       e.isNetworkError = true;
       throw e;
     }
     if (!res.ok) {
       let body = "";
       try {
         body = await res.text();
       } catch {
         // Best effort: the status alone is still a useful error.
       }
       const e = new Error(`Kokoro TTS ${res.status} at ${url}: ${body.slice(0, 300)}`);
       e.status = res.status;
       throw e;
     }
     return Buffer.from(await res.arrayBuffer());
   }
   return {
     hasTts: !!sparkBase,
     kind: "kokoro",
     async synthesize({ text, voice, format }) {
       if (!sparkBase) {
         const e = new Error(
           "Kokoro TTS is not configured — Spark Control discovery isn't reporting a ready kokoro endpoint"
         );
         e.status = 503;
         throw e;
       }
       // Collapse all whitespace runs; non-string input counts as empty.
       const cleaned = (typeof text === "string" ? text : "")
         .replace(/\s+/g, " ")
         .trim();
       if (!cleaned) {
         const e = new Error("TTS input text is empty");
         e.status = 400;
         throw e;
       }
       // Request value → operator default → built-in default.
       const chosenVoice = firstNonBlank(voice, defaultVoice, DEFAULT_VOICE);
       const fmt = firstNonBlank(format, defaultFormat, DEFAULT_FORMAT).toLowerCase();
       const contentType = FORMAT_CONTENT_TYPE[fmt] || "application/octet-stream";
       // Retry only on transient failures; a 4xx (bad voice/format) is
       // deterministic and surfaces immediately.
       for (let attempt = 0; ; attempt += 1) {
         try {
           const audio = await callKokoro({ text: cleaned, voice: chosenVoice, format: fmt });
           return {
             audio,
             contentType,
             durationSeconds: null,
             voice: chosenVoice,
             model: "kokoro",
             format: fmt,
             attempts: attempt + 1,
           };
         } catch (err) {
           const status = err?.status || 0;
           const isClientError = status >= 400 && status < 500;
           if (isClientError || attempt >= RETRY_ON_5XX) throw err;
           const label = err?.isNetworkError ? "network" : status || "network";
           console.warn(
             `[kokoro] TTS call failed (${label}) — retry ${attempt + 1}/${RETRY_ON_5XX}`
           );
           await sleepMs(500);
         }
       }
     },
   };
 }
@@ -0,0 +1,289 @@
 // POST /relay/tts — synthesize a topic summary into speech for the
 // Recap app's audio-first ("walking mode") player. Returns the raw
 // audio bytes (MP3 by default from Kokoro or ElevenLabs) with credit
 // metadata in response headers.
 //
 // Request:
 //   headers:
 //     X-Recap-Install-Id   (required)
 //     X-Recap-Job-Id       (optional but expected — credit dedup key;
 //                           the Recap client sends ONE id per recap so
 //                           synthesizing all N topics of a recap costs
 //                           at most 1 credit, like transcribe+analyze)
 //     Authorization        (optional Bearer LIC1-… for licensed tiers)
 //   body (application/json):
 //     { "text": "the topic summary to speak", "voice": "optional voice id",
 //       "format": "optional wav|mp3|opus|flac (Kokoro only; ElevenLabs is always mp3)" }
 //
 // Response (200):
 //   body:    raw audio bytes
 //   headers:
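 //     Content-Type                audio/mpeg by default; audio/wav |
 //                                 audio/ogg | audio/flac when Kokoro is
 //                                 asked for another format
 //     X-Recap-Tts-Backend         kokoro | elevenlabs
 //     X-Recap-Tts-Voice           voice id used
 //     X-Recap-Audio-Duration      seconds (only set when the backend
 //                                 reports a duration; both current
 //                                 backends return null, so normally absent)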
 //     X-Recap-Credits-Remaining   number, or "unlimited"
 //     X-Recap-Tier                core | pro | max
 //     X-Recap-Credit-Charged      0 | 1
 //
 // Errors return the standard JSON errorEnvelope (so the client can keep
 // its credit pill accurate) with an appropriate status.
 //
 // Billing: 1 credit per unique job id, deduped exactly like transcribe.
 // Gated to Max users on the Recap side; the relay still enforces a
 // balance floor so a non-Max install can't drain TTS for free.
 import express from "express";
 import { resolveIdentity, identityTier } from "../identity.js";
 import {
  getOrCreateRow,
  commitCredit,
  computeRemaining,
  licenseFingerprint,
 } from "../credits.js";
 import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
 import { getConfigSnapshot, getTierQuotas } from "../config.js";
 import { resolveHardwareConfig } from "../hardware-config.js";
 import { createKokoroBackend } from "../backends/kokoro.js";
 import { createElevenLabsBackend } from "../backends/elevenlabs.js";
 import { errorEnvelope } from "./envelope.js";
 import { recordCall } from "../audit-log.js";
 /**
  * Decide which TTS backend handles a request.
  *
  * @param {object} args
  * @param {string} [args.preference] - "hardware_only" | "cloud_only" |
  *   "cloud_first" | "hardware_first"; a missing or unrecognised value is
  *   treated as "hardware_first".
  * @param {*} args.kokoroReady - Truthy when Kokoro can serve the call.
  * @param {*} args.elevenConfigured - Truthy when ElevenLabs is configured.
  * @returns {"kokoro" | "elevenlabs" | null} null when nothing is
  *   available, in which case the caller surfaces a 503.
  */
 function pickTtsBackend({ preference, kokoroReady, elevenConfigured }) {
   // Each side resolves to its backend name, or null when unavailable.
   const hardware = kokoroReady ? "kokoro" : null;
   const cloud = elevenConfigured ? "elevenlabs" : null;
   switch (preference || "hardware_first") {
     case "hardware_only":
       return hardware;
     case "cloud_only":
       return cloud;
     case "cloud_first":
       return cloud || hardware;
     default:
       // hardware_first, plus any value not listed above.
       return hardware || cloud;
   }
 }
 /**
  * Build the Express router that serves POST /tts.
  *
  * Per request, in order: resolve identity → validate the JSON body →
  * enforce the balance floor → pick a backend → synthesize → charge one
  * credit unless the job id was already charged → write an audit row →
  * send the raw audio with the X-Recap-* headers. Every failure path
  * answers with the JSON error envelope, and every path after identity
  * resolution and body validation also writes an audit row.
  *
  * @returns {import("express").Router}
  */
 export function ttsRouter() {
   const router = express.Router();
   router.post("/tts", express.json({ limit: "256kb" }), async (req, res) => {
     const t0 = Date.now();
     // Credit dedup key; null when the client sent no job id.
     const jobId = req.header("X-Recap-Job-Id") || null;
     // --- Identity -------------------------------------------------------
     let identity;
     try {
       identity = await resolveIdentity(req);
     } catch (err) {
       const e = await errorEnvelope({
         error: err?.message || "auth_error",
         statusHint: err?.status || 401,
       });
       return res.status(e.statusHint || 401).json(e.body);
     }
     if (identity.kind === "license" && !identity.installId) {
       const e = await errorEnvelope({
         error: "missing X-Recap-Install-Id header",
         statusHint: 400,
       });
       return res.status(400).json(e.body);
     }
     const { creditKey, installId, license } = identity;
     // --- Request body ---------------------------------------------------
     const text = typeof req.body?.text === "string" ? req.body.text.trim() : "";
     if (!text) {
       const e = await errorEnvelope({
         error: "missing 'text' in request body",
         creditKey,
         installId,
         statusHint: 400,
       });
       return res.status(400).json(e.body);
     }
     const clientVoice =
       typeof req.body?.voice === "string" ? req.body.voice.trim() : "";
     // Optional output format override (wav|mp3|opus|flac). Kokoro emits
     // any of these directly; default comes from config (mp3). ElevenLabs
     // ignores it (always mp3).
     const clientFormat =
       typeof req.body?.format === "string" ? req.body.format.trim().toLowerCase() : "";
     // --- Credit row, tier and audit identity ----------------------------
     const row = await getOrCreateRow({ creditKey, installId, license });
     const tier = identityTier(identity, row);
     row.tier_snapshot = tier;
     const licenseFp = identity.kind === "cloud" ? null : licenseFingerprint(license);
     // Audit rows fall back to the user id when there is no install id.
     const auditInstall = installId || identity.userId || null;
     const cfg = await getConfigSnapshot();
     const quota = await getTierQuotas();
     // Balance floor — refuse only when the install has a finite balance
     // that's exhausted (null total = unlimited, e.g. Max). Max users
     // (the intended audience) always pass; this just stops a credit-less
     // Core install from synthesizing for free if it bypasses the Recap-
     // side Max gate.
     const preBalance = computeRemaining(row, quota);
     if (preBalance.total !== null && preBalance.total <= 0) {
       await recordCall({
         install_id: auditInstall,
         license_fingerprint: licenseFp,
         tier,
         pipeline: "tts",
         backend: null,
         model: null,
         status: "refused",
         credit_charged: 0,
         duration_ms: Date.now() - t0,
         audio_seconds: 0,
         cost_usd: 0,
         job_id: jobId,
         error: "no_credits",
       });
       const e = await errorEnvelope({
         error: "no_credits",
         creditKey,
         installId,
         license,
         tier,
         statusHint: 402,
       });
       return res.status(402).json(e.body);
     }
     // --- Backend selection ----------------------------------------------
     const hw = await resolveHardwareConfig(cfg);
     const kokoroReady = !!hw.tts?.url;
     const elevenConfigured = !!(
       cfg.relay_elevenlabs_api_key && cfg.relay_elevenlabs_voice_id
     );
     const preference = cfg.relay_tts_backend_preference || "hardware_first";
     const chosen = pickTtsBackend({ preference, kokoroReady, elevenConfigured });
     if (!chosen) {
       const reason = hw.tts?.blocked_reason
         ? hw.tts.blocked_reason
         : "No TTS backend available — configure Spark Control (Kokoro) or relay_elevenlabs_api_key.";
       await recordCall({
         install_id: auditInstall,
         license_fingerprint: licenseFp,
         tier,
         pipeline: "tts",
         backend: null,
         model: null,
         status: "error",
         credit_charged: 0,
         duration_ms: Date.now() - t0,
         audio_seconds: 0,
         cost_usd: 0,
         job_id: jobId,
         error: reason,
       });
       const e = await errorEnvelope({
         error: reason,
         creditKey,
         installId,
         license,
         tier,
         statusHint: 503,
       });
       return res.status(503).json(e.body);
     }
     // Decouple billing from routing (same reasoning as transcribe): look
     // up the job to decide whether to charge, but always synthesize.
     // NOTE(review): the lookup happens before synthesis and the charge
     // after it, so concurrent requests sharing one job id may each see
     // "not charged yet" — confirm job-credits guards against that.
     const reusedJob = !!lookupJob({ creditKey, installId, license, jobId });
     // --- Synthesis ------------------------------------------------------
     let result;
     try {
       if (chosen === "kokoro") {
         const backend = createKokoroBackend({
           sparkControlBaseURL: hw.sparkBase || hw.tts.url || "",
           defaultVoice: cfg.relay_tts_default_voice,
           defaultFormat: cfg.relay_tts_format,
         });
         result = await backend.synthesize({ text, voice: clientVoice, format: clientFormat });
       } else {
         const backend = createElevenLabsBackend({
           apiKey: cfg.relay_elevenlabs_api_key,
           voiceId: cfg.relay_elevenlabs_voice_id,
           model: cfg.relay_elevenlabs_model,
         });
         result = await backend.synthesize({ text, voice: clientVoice });
       }
     } catch (err) {
       console.error(`[relay/tts] ${chosen} backend error: ${err?.message}`);
       await recordCall({
         install_id: auditInstall,
         license_fingerprint: licenseFp,
         tier,
         pipeline: "tts",
         backend: chosen,
         model: chosen === "kokoro" ? "kokoro" : cfg.relay_elevenlabs_model,
         status: "error",
         credit_charged: 0,
         duration_ms: Date.now() - t0,
         audio_seconds: 0,
         cost_usd: 0,
         job_id: jobId,
         error: (err?.message || String(err)).slice(0, 200),
       });
       const e = await errorEnvelope({
         error: err?.message || "tts_backend_error",
         creditKey,
         installId,
         license,
         tier,
         statusHint: err?.status || 502,
       });
       // Same defensive fallback as the auth path: never hand
       // res.status() an undefined code if the envelope omits the hint.
       return res.status(e.statusHint || err?.status || 502).json(e.body);
     }
     // --- Billing --------------------------------------------------------
     // Charge once per job id. Operator hardware (Kokoro) is fixed-cost
     // so cost_usd stays 0; ElevenLabs has a real per-char cost but we
     // don't have its billing API wired, so 0 here too (audit shows the
     // call happened; margin tracking for ElevenLabs is a later add).
     let creditCharged = 0;
     if (!reusedJob) {
       await commitCredit({ creditKey, installId, license, backend: chosen, tier });
       await markJobCharged({ creditKey, installId, license, jobId, backend: chosen, tier });
       creditCharged = 1;
     }
     await recordCall({
       // Use the same audit identity as the refusal/error rows so a
       // successful call without an install id is still attributable.
       install_id: auditInstall,
       license_fingerprint: licenseFp,
       tier,
       pipeline: "tts",
       backend: chosen,
       model: result?.model || null,
       status: "success",
       credit_charged: creditCharged,
       duration_ms: Date.now() - t0,
       audio_seconds: result?.durationSeconds || 0,
       cost_usd: 0,
       job_id: jobId,
       attempts: result?.attempts || null,
     });
     // --- Response -------------------------------------------------------
     // Post-charge balance for the client's credit pill.
     // NOTE(review): computed from the `row` fetched before the charge —
     // presumably commitCredit updates that same row; confirm.
     const balance = computeRemaining(row, quota);
     res.set("Content-Type", result.contentType || "audio/wav");
     res.set("X-Recap-Tts-Backend", chosen);
     if (result.voice) res.set("X-Recap-Tts-Voice", result.voice);
     if (result.durationSeconds != null) {
       res.set("X-Recap-Audio-Duration", String(result.durationSeconds));
     }
     res.set(
       "X-Recap-Credits-Remaining",
       balance.total == null ? "unlimited" : String(balance.total)
     );
     res.set("X-Recap-Tier", tier);
     res.set("X-Recap-Credit-Charged", String(creditCharged));
     res.set("Cache-Control", "no-store");
     return res.send(result.audio);
   });
   return router;
 }