// recap-relay/server/routes/tts.js
//
// POST /relay/tts — synthesize a topic summary into speech for the
// Recap app's audio-first ("walking mode") player. Returns the raw
// audio bytes (MP3 by default from Kokoro or ElevenLabs) with credit
// metadata in response headers.
//
// Request:
// headers:
// X-Recap-Install-Id (required)
// X-Recap-Job-Id (optional but expected — credit dedup key;
// the Recap client sends ONE id per recap so
// synthesizing all N topics of a recap costs
// at most 1 credit, like transcribe+analyze)
// Authorization (optional Bearer LIC1-… for licensed tiers)
// body (application/json):
// { "text": "the topic summary to speak", "voice": "optional voice id",
// "format": "optional wav|mp3|opus|flac (Kokoro only; ElevenLabs always returns mp3)" }
//
// Response (200):
// body: raw audio bytes
// headers:
// Content-Type audio/wav | audio/mpeg
// X-Recap-Tts-Backend kokoro | elevenlabs
// X-Recap-Tts-Voice voice id used
// X-Recap-Audio-Duration seconds (may be absent for ElevenLabs)
// X-Recap-Credits-Remaining number, or "unlimited"
// X-Recap-Tier core | pro | max
// X-Recap-Credit-Charged 0 | 1
//
// Errors return the standard JSON errorEnvelope (so the client can keep
// its credit pill accurate) with an appropriate status.
//
// Billing: 1 credit per unique job id, deduped exactly like transcribe.
// Gated to Max users on the Recap side; the relay still enforces a
// balance floor so a non-Max install can't drain TTS for free.
import express from "express";
import { resolveIdentity, identityTier } from "../identity.js";
import {
getOrCreateRow,
commitCredit,
computeRemaining,
licenseFingerprint,
} from "../credits.js";
import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
import { getConfigSnapshot, getTierQuotas } from "../config.js";
import { resolveHardwareConfig } from "../hardware-config.js";
import { createKokoroBackend } from "../backends/kokoro.js";
import { createElevenLabsBackend } from "../backends/elevenlabs.js";
import { errorEnvelope } from "./envelope.js";
import { recordCall } from "../audit-log.js";
/**
 * Decide which TTS backend should serve a request.
 *
 * The operator preference determines the order in which backends are
 * tried; the first one that is actually available wins.
 *
 * @param {object} opts
 * @param {string} [opts.preference] - "hardware_only" | "cloud_only" |
 *   "cloud_first" | "hardware_first". Anything else (including a missing
 *   value) is treated as "hardware_first".
 * @param {boolean} opts.kokoroReady - truthy when the Kokoro backend is usable.
 * @param {boolean} opts.elevenConfigured - truthy when ElevenLabs is configured.
 * @returns {"kokoro" | "elevenlabs" | null} the chosen backend, or null when
 *   none of the permitted backends is available (callers surface a 503).
 */
function pickTtsBackend({ preference, kokoroReady, elevenConfigured }) {
  const usable = {
    kokoro: Boolean(kokoroReady),
    elevenlabs: Boolean(elevenConfigured),
  };

  // Candidate backends in the order the preference wants them tried.
  let candidates;
  switch (preference || "hardware_first") {
    case "hardware_only":
      candidates = ["kokoro"];
      break;
    case "cloud_only":
      candidates = ["elevenlabs"];
      break;
    case "cloud_first":
      candidates = ["elevenlabs", "kokoro"];
      break;
    default:
      // hardware_first, and any unrecognised preference value
      candidates = ["kokoro", "elevenlabs"];
  }

  const winner = candidates.find((name) => usable[name]);
  return winner === undefined ? null : winner;
}
/**
 * Build the Express router exposing `POST /tts`.
 *
 * The handler resolves the caller's identity, validates the JSON body,
 * enforces a credit balance floor, picks a TTS backend (Kokoro or
 * ElevenLabs) from operator preference and availability, synthesizes the
 * text, charges at most one credit per job id, writes an audit record for
 * every outcome, and responds with the raw audio plus credit metadata in
 * response headers. Failures respond with the JSON error envelope.
 *
 * @returns {import("express").Router} router with the `/tts` route mounted.
 */
export function ttsRouter() {
  const router = express.Router();

  router.post("/tts", express.json({ limit: "256kb" }), async (req, res) => {
    const t0 = Date.now();
    const jobId = req.header("X-Recap-Job-Id") || null;

    // --- Identity -------------------------------------------------------
    let identity;
    try {
      identity = await resolveIdentity(req);
    } catch (err) {
      const e = await errorEnvelope({
        error: err?.message || "auth_error",
        statusHint: err?.status || 401,
      });
      return res.status(e.statusHint || 401).json(e.body);
    }
    if (identity.kind === "license" && !identity.installId) {
      const e = await errorEnvelope({
        error: "missing X-Recap-Install-Id header",
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }
    const { creditKey, installId, license } = identity;

    // --- Request body ---------------------------------------------------
    const text = typeof req.body?.text === "string" ? req.body.text.trim() : "";
    if (!text) {
      const e = await errorEnvelope({
        error: "missing 'text' in request body",
        creditKey,
        installId,
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }
    const clientVoice =
      typeof req.body?.voice === "string" ? req.body.voice.trim() : "";
    // Optional output format override (wav|mp3|opus|flac). Kokoro emits
    // any of these directly; default comes from config (mp3). ElevenLabs
    // ignores it (always mp3).
    const clientFormat =
      typeof req.body?.format === "string" ? req.body.format.trim().toLowerCase() : "";

    // --- Account state --------------------------------------------------
    const row = await getOrCreateRow({ creditKey, installId, license });
    const tier = identityTier(identity, row);
    row.tier_snapshot = tier;
    const licenseFp = identity.kind === "cloud" ? null : licenseFingerprint(license);
    // Identities without an install id are audited under their user id.
    const auditInstall = installId || identity.userId || null;
    const cfg = await getConfigSnapshot();
    const quota = await getTierQuotas();

    // Single place that shapes an audit record, so every outcome (refused,
    // error, success) is attributed to the same install/user key. Extra
    // fields (`error`, `attempts`) are appended after the common ones.
    const audit = ({
      backend,
      model,
      status,
      creditCharged = 0,
      audioSeconds = 0,
      ...extra
    }) =>
      recordCall({
        install_id: auditInstall,
        license_fingerprint: licenseFp,
        tier,
        pipeline: "tts",
        backend,
        model,
        status,
        credit_charged: creditCharged,
        duration_ms: Date.now() - t0,
        audio_seconds: audioSeconds,
        cost_usd: 0,
        job_id: jobId,
        ...extra,
      });

    // Balance floor — refuse only when the install has a finite balance
    // that's exhausted (null total = unlimited, e.g. Max). Max users
    // (the intended audience) always pass; this just stops a credit-less
    // Core install from synthesizing for free if it bypasses the Recap-
    // side Max gate.
    // NOTE(review): this runs before the job-dedup lookup below, so a
    // request for an already-charged job id is still refused once the
    // balance reaches 0 — confirm that is intended.
    const preBalance = computeRemaining(row, quota);
    if (preBalance.total !== null && preBalance.total <= 0) {
      await audit({
        backend: null,
        model: null,
        status: "refused",
        error: "no_credits",
      });
      const e = await errorEnvelope({
        error: "no_credits",
        creditKey,
        installId,
        license,
        tier,
        statusHint: 402,
      });
      return res.status(402).json(e.body);
    }

    // --- Backend selection ----------------------------------------------
    const hw = await resolveHardwareConfig(cfg);
    const kokoroReady = !!hw.tts?.url;
    const elevenConfigured = !!(
      cfg.relay_elevenlabs_api_key && cfg.relay_elevenlabs_voice_id
    );
    const preference = cfg.relay_tts_backend_preference || "hardware_first";
    const chosen = pickTtsBackend({ preference, kokoroReady, elevenConfigured });
    if (!chosen) {
      const reason = hw.tts?.blocked_reason
        ? hw.tts.blocked_reason
        : "No TTS backend available — configure Spark Control (Kokoro) or relay_elevenlabs_api_key.";
      await audit({ backend: null, model: null, status: "error", error: reason });
      const e = await errorEnvelope({
        error: reason,
        creditKey,
        installId,
        license,
        tier,
        statusHint: 503,
      });
      return res.status(503).json(e.body);
    }

    // Decouple billing from routing (same reasoning as transcribe): look
    // up the job to decide whether to charge, but always synthesize.
    // NOTE(review): lookupJob is called without `await`; if it ever
    // returns a promise, `reusedJob` would always be true and no credit
    // would be charged — confirm it is synchronous.
    const reusedJob = !!lookupJob({ creditKey, installId, license, jobId });

    // --- Synthesis ------------------------------------------------------
    let result;
    try {
      if (chosen === "kokoro") {
        const backend = createKokoroBackend({
          sparkControlBaseURL: hw.sparkBase || hw.tts.url || "",
          defaultVoice: cfg.relay_tts_default_voice,
          defaultFormat: cfg.relay_tts_format,
        });
        result = await backend.synthesize({ text, voice: clientVoice, format: clientFormat });
      } else {
        const backend = createElevenLabsBackend({
          apiKey: cfg.relay_elevenlabs_api_key,
          voiceId: cfg.relay_elevenlabs_voice_id,
          model: cfg.relay_elevenlabs_model,
        });
        result = await backend.synthesize({ text, voice: clientVoice });
      }
    } catch (err) {
      console.error(`[relay/tts] ${chosen} backend error: ${err?.message}`);
      await audit({
        backend: chosen,
        model: chosen === "kokoro" ? "kokoro" : cfg.relay_elevenlabs_model,
        status: "error",
        // Cap the stored message so one backend error can't bloat the audit log.
        error: (err?.message || String(err)).slice(0, 200),
      });
      const e = await errorEnvelope({
        error: err?.message || "tts_backend_error",
        creditKey,
        installId,
        license,
        tier,
        statusHint: err?.status || 502,
      });
      // Fall back like the auth path does: an envelope without a
      // statusHint must not leave the response with an undefined status.
      return res.status(e.statusHint || err?.status || 502).json(e.body);
    }

    // --- Billing --------------------------------------------------------
    // Charge once per job id. Operator hardware (Kokoro) is fixed-cost
    // so cost_usd stays 0; ElevenLabs has a real per-char cost but we
    // don't have its billing API wired, so 0 here too (audit shows the
    // call happened; margin tracking for ElevenLabs is a later add).
    let creditCharged = 0;
    if (!reusedJob) {
      await commitCredit({ creditKey, installId, license, backend: chosen, tier });
      await markJobCharged({ creditKey, installId, license, jobId, backend: chosen, tier });
      creditCharged = 1;
    }
    await audit({
      backend: chosen,
      model: result?.model || null,
      status: "success",
      creditCharged,
      audioSeconds: result?.durationSeconds || 0,
      attempts: result?.attempts || null,
    });

    // --- Response -------------------------------------------------------
    // Post-charge balance for the client's credit pill.
    // NOTE(review): this re-reads the `row` fetched before commitCredit;
    // it assumes commitCredit updates that same object — confirm.
    const balance = computeRemaining(row, quota);
    res.set("Content-Type", result.contentType || "audio/wav");
    res.set("X-Recap-Tts-Backend", chosen);
    if (result.voice) res.set("X-Recap-Tts-Voice", result.voice);
    if (result.durationSeconds != null) {
      res.set("X-Recap-Audio-Duration", String(result.durationSeconds));
    }
    res.set(
      "X-Recap-Credits-Remaining",
      balance.total == null ? "unlimited" : String(balance.total)
    );
    res.set("X-Recap-Tier", tier);
    res.set("X-Recap-Credit-Charged", String(creditCharged));
    res.set("Cache-Control", "no-store");
    return res.send(result.audio);
  });

  return router;
}