// recap-relay/server/routes/tts.js

// POST /relay/tts — synthesize a topic summary into speech for the
// Recap app's audio-first ("walking mode") player. Returns the raw
// audio bytes (MP3 by default from Kokoro or ElevenLabs) with credit
// metadata in response headers.
//
// Request:
//   headers:
//     X-Recap-Install-Id   (required)
//     X-Recap-Job-Id       (optional but expected — credit dedup key;
//                           the Recap client sends ONE id per recap so
//                           synthesizing all N topics of a recap costs
//                           at most 1 credit, like transcribe+analyze)
//     Authorization        (optional Bearer LIC1-… for licensed tiers)
//   body (application/json):
//     { "text": "the topic summary to speak", "voice": "optional voice id",
//       "format": "optional wav|mp3|opus|flac (Kokoro only; ElevenLabs ignores it)" }
//
// Response (200):
//   body:    raw audio bytes
//   headers:
//     Content-Type                as reported by the backend (e.g. audio/mpeg); audio/wav if unreported
//     X-Recap-Tts-Backend         kokoro | elevenlabs
//     X-Recap-Tts-Voice           voice id used
//     X-Recap-Audio-Duration      seconds (may be absent for ElevenLabs)
//     X-Recap-Credits-Remaining   number, or "unlimited"
//     X-Recap-Tier                core | pro | max
//     X-Recap-Credit-Charged      0 | 1
//
// Errors return the standard JSON errorEnvelope (so the client can keep
// its credit pill accurate) with an appropriate status.
//
// Billing: 1 credit per unique job id, deduped exactly like transcribe.
// Gated to Max users on the Recap side; the relay still enforces a
// balance floor so a non-Max install can't drain TTS for free.

import express from "express";
import { resolveIdentity, identityTier } from "../identity.js";
import {
  getOrCreateRow,
  commitCredit,
  computeRemaining,
  licenseFingerprint,
} from "../credits.js";
import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
import { getConfigSnapshot, getTierQuotas } from "../config.js";
import { resolveHardwareConfig } from "../hardware-config.js";
import { createKokoroBackend } from "../backends/kokoro.js";
import { createElevenLabsBackend } from "../backends/elevenlabs.js";
import { errorEnvelope } from "./envelope.js";
import { recordCall } from "../audit-log.js";

/**
 * Decide which TTS backend should serve a call.
 *
 * @param {object} opts
 * @param {string} [opts.preference] operator preference: "hardware_only",
 *   "cloud_only", "cloud_first" or "hardware_first". A missing or
 *   unrecognised value is treated as "hardware_first".
 * @param {*} opts.kokoroReady truthy when the Kokoro backend is usable.
 * @param {*} opts.elevenConfigured truthy when ElevenLabs is configured.
 * @returns {"kokoro" | "elevenlabs" | null} the backend to use, or null
 *   when nothing suitable is available (the caller surfaces a 503).
 */
function pickTtsBackend({ preference, kokoroReady, elevenConfigured }) {
  // Resolve each candidate once; null marks "not available".
  const hardware = kokoroReady ? "kokoro" : null;
  const cloud = elevenConfigured ? "elevenlabs" : null;

  switch (preference || "hardware_first") {
    case "hardware_only":
      return hardware;
    case "cloud_only":
      return cloud;
    case "cloud_first":
      return cloud ?? hardware;
    default:
      // "hardware_first" and anything unrecognised: prefer Kokoro.
      return hardware ?? cloud;
  }
}

/**
 * Build the Express router exposing `POST /tts`.
 *
 * The handler runs these steps in order and returns early with a JSON
 * error envelope when one fails:
 *   1. resolve the caller's identity (status from the thrown error, 401
 *      by default; 400 when a license identity carries no install id),
 *   2. validate the `text` body field (400),
 *   3. enforce the balance floor (402 `no_credits`),
 *   4. choose a backend (503 when none is available),
 *   5. synthesize (status from the backend error, 502 by default),
 *   6. charge one credit unless this job id was already charged,
 *   7. write the audit record and send the audio with credit headers.
 *
 * @returns {import("express").Router} the configured router.
 */
export function ttsRouter() {
  const router = express.Router();

  router.post("/tts", express.json({ limit: "256kb" }), async (req, res) => {
    const t0 = Date.now();
    const jobId = req.header("X-Recap-Job-Id") || null;

    let identity;
    try {
      identity = await resolveIdentity(req);
    } catch (err) {
      const e = await errorEnvelope({
        error: err?.message || "auth_error",
        statusHint: err?.status || 401,
      });
      return res.status(e.statusHint || 401).json(e.body);
    }
    if (identity.kind === "license" && !identity.installId) {
      const e = await errorEnvelope({
        error: "missing X-Recap-Install-Id header",
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }
    const { creditKey, installId, license } = identity;

    const text = typeof req.body?.text === "string" ? req.body.text.trim() : "";
    if (!text) {
      const e = await errorEnvelope({
        error: "missing 'text' in request body",
        creditKey,
        installId,
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }
    const clientVoice =
      typeof req.body?.voice === "string" ? req.body.voice.trim() : "";
    // Optional output format override (wav|mp3|opus|flac). Kokoro emits
    // any of these directly; default comes from config (mp3). ElevenLabs
    // ignores it (always mp3).
    const clientFormat =
      typeof req.body?.format === "string" ? req.body.format.trim().toLowerCase() : "";

    const row = await getOrCreateRow({ creditKey, installId, license });
    const tier = identityTier(identity, row);
    row.tier_snapshot = tier;
    const licenseFp = identity.kind === "cloud" ? null : licenseFingerprint(license);
    // Identities without an install id are attributed to their user id.
    const auditInstall = installId || identity.userId || null;

    // Single place that stamps the per-request audit fields, so every
    // outcome (refused / error / success) is attributed the same way.
    const audit = (fields) =>
      recordCall({
        install_id: auditInstall,
        license_fingerprint: licenseFp,
        tier,
        pipeline: "tts",
        duration_ms: Date.now() - t0,
        job_id: jobId,
        ...fields,
      });

    const cfg = await getConfigSnapshot();
    const quota = await getTierQuotas();

    // Balance floor — refuse only when the install has a finite balance
    // that's exhausted (null total = unlimited, e.g. Max). Max users
    // (the intended audience) always pass; this just stops a credit-less
    // Core install from synthesizing for free if it bypasses the Recap-
    // side Max gate.
    const preBalance = computeRemaining(row, quota);
    if (preBalance.total !== null && preBalance.total <= 0) {
      await audit({
        backend: null,
        model: null,
        status: "refused",
        credit_charged: 0,
        audio_seconds: 0,
        cost_usd: 0,
        error: "no_credits",
      });
      const e = await errorEnvelope({
        error: "no_credits",
        creditKey,
        installId,
        license,
        tier,
        statusHint: 402,
      });
      return res.status(402).json(e.body);
    }

    // Resolve availability + choose a backend.
    const hw = await resolveHardwareConfig(cfg);
    const kokoroReady = !!hw.tts?.url;
    const elevenConfigured = !!(
      cfg.relay_elevenlabs_api_key && cfg.relay_elevenlabs_voice_id
    );
    const preference = cfg.relay_tts_backend_preference || "hardware_first";
    const chosen = pickTtsBackend({ preference, kokoroReady, elevenConfigured });
    if (!chosen) {
      const reason = hw.tts?.blocked_reason
        ? hw.tts.blocked_reason
        : "No TTS backend available — configure Spark Control (Kokoro) or relay_elevenlabs_api_key.";
      await audit({
        backend: null,
        model: null,
        status: "error",
        credit_charged: 0,
        audio_seconds: 0,
        cost_usd: 0,
        error: reason,
      });
      const e = await errorEnvelope({
        error: reason,
        creditKey,
        installId,
        license,
        tier,
        statusHint: 503,
      });
      return res.status(503).json(e.body);
    }

    // Decouple billing from routing (same reasoning as transcribe): look
    // up the job to decide whether to charge, but always synthesize.
    // Awaited defensively: if lookupJob returns a Promise, coercing it
    // un-awaited would make every job look already charged. Awaiting a
    // plain value is harmless.
    // NOTE(review): the lookup and the later charge are separate steps;
    // concurrent requests sharing one job id may each see "not charged".
    // Confirm whether job-credits offers an atomic claim.
    const reusedJob = Boolean(
      await lookupJob({ creditKey, installId, license, jobId })
    );

    let result;
    try {
      if (chosen === "kokoro") {
        const backend = createKokoroBackend({
          sparkControlBaseURL: hw.sparkBase || hw.tts.url || "",
          defaultVoice: cfg.relay_tts_default_voice,
          defaultFormat: cfg.relay_tts_format,
        });
        result = await backend.synthesize({ text, voice: clientVoice, format: clientFormat });
      } else {
        const backend = createElevenLabsBackend({
          apiKey: cfg.relay_elevenlabs_api_key,
          voiceId: cfg.relay_elevenlabs_voice_id,
          model: cfg.relay_elevenlabs_model,
        });
        result = await backend.synthesize({ text, voice: clientVoice });
      }
    } catch (err) {
      console.error(`[relay/tts] ${chosen} backend error: ${err?.message}`);
      await audit({
        backend: chosen,
        model: chosen === "kokoro" ? "kokoro" : cfg.relay_elevenlabs_model,
        status: "error",
        credit_charged: 0,
        audio_seconds: 0,
        cost_usd: 0,
        // Keep the audit entry bounded; backend messages can be long.
        error: (err?.message || String(err)).slice(0, 200),
      });
      const e = await errorEnvelope({
        error: err?.message || "tts_backend_error",
        creditKey,
        installId,
        license,
        tier,
        statusHint: err?.status || 502,
      });
      // Fall back to 502 if the envelope carries no status, mirroring
      // the auth-error path above.
      return res.status(e.statusHint || 502).json(e.body);
    }

    // Charge once per job id. Operator hardware (Kokoro) is fixed-cost
    // so cost_usd stays 0; ElevenLabs has a real per-char cost but we
    // don't have its billing API wired, so 0 here too (audit shows the
    // call happened; margin tracking for ElevenLabs is a later add).
    let creditCharged = 0;
    if (!reusedJob) {
      await commitCredit({ creditKey, installId, license, backend: chosen, tier });
      await markJobCharged({ creditKey, installId, license, jobId, backend: chosen, tier });
      creditCharged = 1;
    }

    await audit({
      backend: chosen,
      model: result?.model || null,
      status: "success",
      credit_charged: creditCharged,
      audio_seconds: result?.durationSeconds || 0,
      cost_usd: 0,
      attempts: result?.attempts || null,
    });

    // Post-charge balance for the client's credit pill.
    // NOTE(review): this reads the `row` fetched before the charge;
    // presumably commitCredit updates that same object — confirm.
    const balance = computeRemaining(row, quota);

    res.set("Content-Type", result.contentType || "audio/wav");
    res.set("X-Recap-Tts-Backend", chosen);
    if (result.voice) res.set("X-Recap-Tts-Voice", result.voice);
    if (result.durationSeconds != null) {
      res.set("X-Recap-Audio-Duration", String(result.durationSeconds));
    }
    res.set(
      "X-Recap-Credits-Remaining",
      balance.total == null ? "unlimited" : String(balance.total)
    );
    res.set("X-Recap-Tier", tier);
    res.set("X-Recap-Credit-Charged", String(creditCharged));
    res.set("Cache-Control", "no-store");
    return res.send(result.audio);
  });

  return router;
}