// POST /relay/tts — synthesize a topic summary into speech for the
// Recap app's audio-first ("walking mode") player. Returns the raw
// audio bytes (MP3 by default from Kokoro or ElevenLabs) with credit
// metadata in response headers.
//
// Request:
//   headers:
//     X-Recap-Install-Id   (required)
//     X-Recap-Job-Id       (optional but expected — credit dedup key;
//                           the Recap client sends ONE id per recap so
//                           synthesizing all N topics of a recap costs
//                           at most 1 credit, like transcribe+analyze)
//     Authorization        (optional Bearer LIC1-… for licensed tiers)
//   body (application/json):
//     {
//       "text":   "the topic summary to speak",
//       "voice":  "optional voice id",
//       "format": "optional output format: wav | mp3 | opus | flac
//                  (honored by Kokoro; ElevenLabs always returns mp3)"
//     }
//
// Response (200):
//   body: raw audio bytes
//   headers:
//     Content-Type               as reported by the backend (e.g.
//                                audio/mpeg); audio/wav if unreported
//     X-Recap-Tts-Backend        kokoro | elevenlabs
//     X-Recap-Tts-Voice          voice id used (omitted if the backend
//                                reports none)
//     X-Recap-Audio-Duration     seconds (may be absent for ElevenLabs)
//     X-Recap-Credits-Remaining  number, or "unlimited"
//     X-Recap-Tier               core | pro | max
//     X-Recap-Credit-Charged     0 | 1
//     Cache-Control              no-store
//
// Errors return the standard JSON errorEnvelope (so the client can keep
// its credit pill accurate) with an appropriate status.
//
// Billing: 1 credit per unique job id, deduped exactly like transcribe.
// Gated to Max users on the Recap side; the relay still enforces a
// balance floor so a non-Max install can't drain TTS for free.
import express from "express";
import { resolveIdentity, identityTier } from "../identity.js";
import {
  getOrCreateRow,
  commitCredit,
  computeRemaining,
  licenseFingerprint,
} from "../credits.js";
import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
import { getConfigSnapshot, getTierQuotas } from "../config.js";
import { resolveHardwareConfig } from "../hardware-config.js";
import { createKokoroBackend } from "../backends/kokoro.js";
import { createElevenLabsBackend } from "../backends/elevenlabs.js";
import { errorEnvelope } from "./envelope.js";
import { recordCall } from "../audit-log.js";

/**
 * Pick which TTS backend serves this call given the operator preference
 * and what's actually available.
 *
 * @param {object} opts
 * @param {string} [opts.preference] - "hardware_only" | "cloud_only" |
 *   "cloud_first" | "hardware_first". A falsy or unrecognized value is
 *   treated as "hardware_first".
 * @param {boolean} opts.kokoroReady - whether the Kokoro backend is usable.
 * @param {boolean} opts.elevenConfigured - whether ElevenLabs is configured.
 * @returns {"kokoro" | "elevenlabs" | null} null = nothing available →
 *   caller surfaces a 503.
 */
function pickTtsBackend({ preference, kokoroReady, elevenConfigured }) {
  const pref = preference || "hardware_first";
  if (pref === "hardware_only") return kokoroReady ? "kokoro" : null;
  if (pref === "cloud_only") return elevenConfigured ? "elevenlabs" : null;
  if (pref === "cloud_first") {
    if (elevenConfigured) return "elevenlabs";
    if (kokoroReady) return "kokoro";
    return null;
  }
  // hardware_first (default)
  if (kokoroReady) return "kokoro";
  if (elevenConfigured) return "elevenlabs";
  return null;
}

/**
 * Handle one POST /tts request: resolve the caller, validate the body,
 * enforce the balance floor, pick a backend, synthesize, charge at most
 * once per job id, write an audit row, and send the audio bytes.
 *
 * Expected failures (auth, validation, no credits, no backend, backend
 * error) are answered here with a JSON error envelope. Anything else
 * rejects the returned promise; the route wrapper forwards it to Express.
 *
 * @param {import("express").Request} req
 * @param {import("express").Response} res
 * @returns {Promise<unknown>}
 */
async function handleTtsRequest(req, res) {
  const t0 = Date.now();
  const jobId = req.header("X-Recap-Job-Id") || null;

  let identity;
  try {
    identity = await resolveIdentity(req);
  } catch (err) {
    const e = await errorEnvelope({
      error: err?.message || "auth_error",
      statusHint: err?.status || 401,
    });
    return res.status(e.statusHint || 401).json(e.body);
  }

  if (identity.kind === "license" && !identity.installId) {
    const e = await errorEnvelope({
      error: "missing X-Recap-Install-Id header",
      statusHint: 400,
    });
    return res.status(400).json(e.body);
  }

  const { creditKey, installId, license } = identity;

  const text = typeof req.body?.text === "string" ? req.body.text.trim() : "";
  if (!text) {
    const e = await errorEnvelope({
      error: "missing 'text' in request body",
      creditKey,
      installId,
      statusHint: 400,
    });
    return res.status(400).json(e.body);
  }

  const clientVoice =
    typeof req.body?.voice === "string" ? req.body.voice.trim() : "";
  // Optional output format override (wav|mp3|opus|flac). Kokoro emits
  // any of these directly; default comes from config (mp3). ElevenLabs
  // ignores it (always mp3).
  const clientFormat =
    typeof req.body?.format === "string"
      ? req.body.format.trim().toLowerCase()
      : "";

  const row = await getOrCreateRow({ creditKey, installId, license });
  const tier = identityTier(identity, row);
  row.tier_snapshot = tier;
  const licenseFp =
    identity.kind === "cloud" ? null : licenseFingerprint(license);
  const auditInstall = installId || identity.userId || null;
  const cfg = await getConfigSnapshot();
  const quota = await getTierQuotas();

  // Single place that writes audit rows, so every outcome (refused,
  // error, success) records the same identity fields. `extra` carries
  // the outcome-specific trailing fields (`error` or `attempts`).
  const audit = ({
    backend,
    model,
    status,
    creditCharged = 0,
    audioSeconds = 0,
    extra = {},
  }) =>
    recordCall({
      install_id: auditInstall,
      license_fingerprint: licenseFp,
      tier,
      pipeline: "tts",
      backend,
      model,
      status,
      credit_charged: creditCharged,
      duration_ms: Date.now() - t0,
      audio_seconds: audioSeconds,
      cost_usd: 0,
      job_id: jobId,
      ...extra,
    });

  // Balance floor — refuse only when the install has a finite balance
  // that's exhausted (null total = unlimited, e.g. Max). Max users
  // (the intended audience) always pass; this just stops a credit-less
  // Core install from synthesizing for free if it bypasses the Recap-
  // side Max gate.
  // NOTE(review): this runs before the job lookup, so a request for an
  // already-charged job id is still refused once the balance hits 0 —
  // confirm that matches the "one credit per recap" contract.
  const preBalance = computeRemaining(row, quota);
  if (preBalance.total !== null && preBalance.total <= 0) {
    await audit({
      backend: null,
      model: null,
      status: "refused",
      extra: { error: "no_credits" },
    });
    const e = await errorEnvelope({
      error: "no_credits",
      creditKey,
      installId,
      license,
      tier,
      statusHint: 402,
    });
    return res.status(402).json(e.body);
  }

  // Resolve availability + choose a backend.
  const hw = await resolveHardwareConfig(cfg);
  const kokoroReady = !!hw.tts?.url;
  const elevenConfigured = !!(
    cfg.relay_elevenlabs_api_key && cfg.relay_elevenlabs_voice_id
  );
  const preference = cfg.relay_tts_backend_preference || "hardware_first";
  const chosen = pickTtsBackend({ preference, kokoroReady, elevenConfigured });
  if (!chosen) {
    const reason =
      hw.tts?.blocked_reason ||
      "No TTS backend available — configure Spark Control (Kokoro) or relay_elevenlabs_api_key.";
    await audit({
      backend: null,
      model: null,
      status: "error",
      extra: { error: reason },
    });
    const e = await errorEnvelope({
      error: reason,
      creditKey,
      installId,
      license,
      tier,
      statusHint: 503,
    });
    return res.status(503).json(e.body);
  }

  // Decouple billing from routing (same reasoning as transcribe): look
  // up the job to decide whether to charge, but always synthesize.
  // `await` is a no-op if lookupJob is synchronous; if it returns a
  // Promise, coercing it without awaiting would always yield true and
  // silently disable charging.
  // NOTE(review): concurrent requests sharing a job id can all observe
  // "not charged" here before any of them marks the job — verify whether
  // commitCredit/markJobCharged dedupe internally.
  const reusedJob = !!(await lookupJob({ creditKey, installId, license, jobId }));

  let result;
  try {
    if (chosen === "kokoro") {
      const backend = createKokoroBackend({
        sparkControlBaseURL: hw.sparkBase || hw.tts.url || "",
        defaultVoice: cfg.relay_tts_default_voice,
        defaultFormat: cfg.relay_tts_format,
      });
      result = await backend.synthesize({
        text,
        voice: clientVoice,
        format: clientFormat,
      });
    } else {
      const backend = createElevenLabsBackend({
        apiKey: cfg.relay_elevenlabs_api_key,
        voiceId: cfg.relay_elevenlabs_voice_id,
        model: cfg.relay_elevenlabs_model,
      });
      result = await backend.synthesize({ text, voice: clientVoice });
    }
  } catch (err) {
    console.error(`[relay/tts] ${chosen} backend error: ${err?.message}`);
    await audit({
      backend: chosen,
      model: chosen === "kokoro" ? "kokoro" : cfg.relay_elevenlabs_model,
      status: "error",
      extra: { error: (err?.message || String(err)).slice(0, 200) },
    });
    const e = await errorEnvelope({
      error: err?.message || "tts_backend_error",
      creditKey,
      installId,
      license,
      tier,
      statusHint: err?.status || 502,
    });
    // Fall back to 502 if the envelope carries no status, mirroring the
    // `|| 401` fallback on the auth path.
    return res.status(e.statusHint || 502).json(e.body);
  }

  // Charge once per job id. Operator hardware (Kokoro) is fixed-cost
  // so cost_usd stays 0; ElevenLabs has a real per-char cost but we
  // don't have its billing API wired, so 0 here too (audit shows the
  // call happened; margin tracking for ElevenLabs is a later add).
  let creditCharged = 0;
  if (!reusedJob) {
    await commitCredit({ creditKey, installId, license, backend: chosen, tier });
    await markJobCharged({
      creditKey,
      installId,
      license,
      jobId,
      backend: chosen,
      tier,
    });
    creditCharged = 1;
  }

  await audit({
    backend: chosen,
    model: result?.model || null,
    status: "success",
    creditCharged,
    audioSeconds: result?.durationSeconds || 0,
    extra: { attempts: result?.attempts || null },
  });

  // Post-charge balance for the client's credit pill.
  // NOTE(review): recomputed from the `row` object fetched before the
  // charge; assumes commitCredit updates that same object — TODO confirm.
  const balance = computeRemaining(row, quota);

  res.set("Content-Type", result.contentType || "audio/wav");
  res.set("X-Recap-Tts-Backend", chosen);
  if (result.voice) res.set("X-Recap-Tts-Voice", result.voice);
  if (result.durationSeconds != null) {
    res.set("X-Recap-Audio-Duration", String(result.durationSeconds));
  }
  res.set(
    "X-Recap-Credits-Remaining",
    balance.total == null ? "unlimited" : String(balance.total)
  );
  res.set("X-Recap-Tier", tier);
  res.set("X-Recap-Credit-Charged", String(creditCharged));
  res.set("Cache-Control", "no-store");
  return res.send(result.audio);
}

/**
 * Build the Express router exposing POST /tts.
 *
 * The JSON body parser is capped at 256kb. The async handler's rejection
 * is forwarded to `next` so an unexpected failure reaches Express's error
 * handling instead of becoming an unhandled promise rejection.
 *
 * @returns {import("express").Router}
 */
export function ttsRouter() {
  const router = express.Router();
  router.post("/tts", express.json({ limit: "256kb" }), (req, res, next) => {
    handleTtsRequest(req, res).catch(next);
  });
  return router;
}