// recap-relay/server/backends/kokoro.js

// Kokoro TTS backend — synthesizes a topic summary into speech via Spark
// Control's OpenAI-compatible /v1/audio/speech endpoint.
//
// Kokoro-82M (Apache-2.0, hexgrad/Kokoro-82M) replaced Magpie in Spark
// Control v0.14.0. Magpie's NVIDIA Riva decoder had a structural
// truncation defect that capped end-to-end reliability at ~85% even with
// server-side retries + chunking; Kokoro renders cleanly at any length
// (100% in our testing, ~1s for a ~100-word summary, no truncation). So
// this backend is a single pass-through call — NONE of the Magpie-era
// fragmenting, pacing/recovery-gap, duration-check, retry, or WAV
// stitching is needed or present.
//
// Output: 24kHz mono 16-bit PCM. Kokoro can emit wav/mp3/opus/flac
// directly via response_format, so we request the caller's format (mp3
// by default — small + universally playable for the mobile/offline
// player) and never transcode client-side. durationSeconds is left null:
// Kokoro's WAV header carries a placeholder size field (so a duration
// computed from it would be bogus), and for mp3 we'd have to decode the
// audio — the Recap side measures duration off the cached file / <audio>
// element instead.

import { lanFetch } from "../lan-fetch.js";

// Per-request time budget in milliseconds; used as the default for the
// AbortSignal.timeout() attached to each speech request.
const DEFAULT_TIMEOUT_MS = 60_000;
// Voice used when neither the caller nor the backend config names one.
const DEFAULT_VOICE = "bm_george";
// Audio container requested when the caller doesn't specify a format.
const DEFAULT_FORMAT = "mp3";
// One retry on a 5xx / network blip (per the Spark Control dev's
// error-handling guidance: 4xx = real client error, 5xx = retry once).
// Kokoro doesn't truncate, so there's no duration-based retry.
const RETRY_ON_5XX = 1;

// response_format → Content-Type reported alongside the audio bytes.
// Frozen and prototype-less so that a lookup with an arbitrary
// caller-supplied key (e.g. "constructor", "toString") yields undefined
// instead of an inherited Object.prototype member, and so the shared
// table can't be mutated at runtime.
const FORMAT_CONTENT_TYPE = Object.freeze(
  Object.assign(Object.create(null), {
    wav: "audio/wav",
    mp3: "audio/mpeg",
    opus: "audio/ogg",
    flac: "audio/flac",
  })
);

/**
 * Returns a promise that resolves (with no value) once `ms` milliseconds
 * have elapsed, so callers can `await` a pause.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<void>}
 */
function sleepMs(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

// Returns the first candidate that is a non-blank string, trimmed; "" when
// none qualifies. Lets a blank/whitespace-only override fall through to the
// next configured default instead of skipping straight to the last resort.
function firstNonBlank(...candidates) {
  for (const candidate of candidates) {
    if (typeof candidate === "string") {
      const trimmed = candidate.trim();
      if (trimmed) return trimmed;
    }
  }
  return "";
}

// Wraps a transport-level failure (connect error, abort/timeout, broken
// body stream) in an Error carrying status 502, keeping the original error
// reachable via `cause`.
function toKokoroNetworkError(url, err) {
  const cause = err?.cause?.message || err?.cause?.code || err?.cause || "";
  const detail = cause ? `${err.message} (cause: ${cause})` : err?.message || String(err);
  const wrapped = new Error(`Kokoro TTS network error at ${url}: ${detail}`, { cause: err });
  wrapped.status = 502;
  return wrapped;
}

/**
 * Creates the Kokoro TTS backend: a thin client that POSTs text to Spark
 * Control's `/v1/audio/speech` endpoint and returns the rendered audio.
 *
 * @param {object} [options]
 * @param {string} [options.sparkControlBaseURL=""] - Spark Control base URL.
 *   Surrounding whitespace, trailing slashes and a trailing `/api/endpoints`
 *   suffix are stripped. When the result is empty the backend reports
 *   `hasTts: false` and `synthesize` rejects with status 503.
 * @param {string} [options.defaultVoice] - Voice used when a call gives none.
 * @param {string} [options.defaultFormat] - response_format used when a call
 *   gives none.
 * @param {number} [options.timeoutMs] - Per-request timeout in milliseconds
 *   (passed to AbortSignal.timeout).
 * @returns {{ hasTts: boolean, kind: string, synthesize: Function }}
 */
export function createKokoroBackend({
  // Spark Control base URL (no path) — derived by the caller from
  // relay_spark_control_url with the /api/endpoints suffix stripped.
  sparkControlBaseURL = "",
  defaultVoice = DEFAULT_VOICE,
  defaultFormat = DEFAULT_FORMAT,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  // Tolerate the raw discovery URL as well: drop any run of trailing
  // slashes, then the /api/endpoints suffix, then any slashes that exposes.
  const sparkBase = (sparkControlBaseURL || "")
    .trim()
    .replace(/\/+$/, "")
    .replace(/\/api\/endpoints$/, "")
    .replace(/\/+$/, "");

  // Performs one speech request and resolves with the audio bytes as a
  // Buffer. Rejects with an Error whose `status` is the upstream HTTP status
  // for a non-2xx reply, or 502 for any transport-level failure.
  async function callKokoro({ text, voice, format }) {
    const url = `${sparkBase}/v1/audio/speech`;
    let res;
    let audio;
    try {
      res = await lanFetch(url, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        redirect: "follow",
        body: JSON.stringify({
          model: "kokoro",
          input: text,
          voice,
          response_format: format,
        }),
        signal: AbortSignal.timeout(timeoutMs),
      });
      // Read the body inside the same guard: the timeout signal can also
      // fire mid-download, and that failure must surface as the same
      // status-bearing network error rather than a raw, status-less one.
      if (res.ok) {
        audio = Buffer.from(await res.arrayBuffer());
      }
    } catch (err) {
      throw toKokoroNetworkError(url, err);
    }
    if (!res.ok) {
      let body = "";
      try {
        body = await res.text();
      } catch {
        // Best effort only — the status code alone is still a useful error.
      }
      const e = new Error(`Kokoro TTS ${res.status} at ${url}: ${body.slice(0, 300)}`);
      e.status = res.status;
      throw e;
    }
    return audio;
  }

  return {
    hasTts: !!sparkBase,
    kind: "kokoro",

    /**
     * Renders `text` to speech.
     *
     * @param {object} request
     * @param {string} request.text - Text to speak; whitespace runs are
     *   collapsed to single spaces before sending.
     * @param {string} [request.voice] - Voice id; blank falls back to the
     *   backend's default voice, then to DEFAULT_VOICE.
     * @param {string} [request.format] - response_format (case-insensitive);
     *   blank falls back to the backend's default format, then DEFAULT_FORMAT.
     * @returns {Promise<{audio: Buffer, contentType: string,
     *   durationSeconds: null, voice: string, model: string, format: string,
     *   attempts: number}>}
     * @throws {Error} with `status` 503 when no base URL is configured, 400
     *   when the text is empty, the upstream status for a non-2xx reply, or
     *   502 for a transport failure.
     */
    async synthesize({ text, voice, format }) {
      if (!sparkBase) {
        const e = new Error(
          "Kokoro TTS is not configured — Spark Control discovery isn't reporting a ready kokoro endpoint"
        );
        e.status = 503;
        throw e;
      }
      const cleaned = (text || "").replace(/\s+/g, " ").trim();
      if (!cleaned) {
        const e = new Error("TTS input text is empty");
        e.status = 400;
        throw e;
      }
      const chosenVoice = firstNonBlank(voice, defaultVoice, DEFAULT_VOICE);
      const fmt = firstNonBlank(format, defaultFormat, DEFAULT_FORMAT).toLowerCase();
      // Own-property check so a format such as "constructor" can't resolve
      // to an inherited Object.prototype member.
      const contentType = Object.hasOwn(FORMAT_CONTENT_TYPE, fmt)
        ? FORMAT_CONTENT_TYPE[fmt]
        : "application/octet-stream";

      // Retry only on transient 5xx / transport failures; a 4xx (bad
      // voice/format) is deterministic and surfaces immediately.
      for (let attempt = 0; ; attempt += 1) {
        try {
          const audio = await callKokoro({ text: cleaned, voice: chosenVoice, format: fmt });
          return {
            audio,
            contentType,
            durationSeconds: null,
            voice: chosenVoice,
            model: "kokoro",
            format: fmt,
            attempts: attempt + 1,
          };
        } catch (err) {
          const status = err?.status || 0;
          const isClientError = status >= 400 && status < 500;
          if (isClientError || attempt >= RETRY_ON_5XX) throw err;
          console.warn(
            `[kokoro] TTS call failed (${status || "network"}) — retry ${attempt + 1}/${RETRY_ON_5XX}`
          );
          await sleepMs(500);
        }
      }
    },
  };
}