290 lines
10 KiB
JavaScript
290 lines
10 KiB
JavaScript
// POST /relay/tts — synthesize a topic summary into speech for the
// Recap app's audio-first ("walking mode") player. Returns the raw
// audio bytes (MP3 by default from Kokoro or ElevenLabs) with credit
// metadata in response headers.
//
// Request:
//   headers:
//     X-Recap-Install-Id   (required)
//     X-Recap-Job-Id       (optional but expected — credit dedup key;
//                           the Recap client sends ONE id per recap so
//                           synthesizing all N topics of a recap costs
//                           at most 1 credit, like transcribe+analyze)
//     Authorization        (optional Bearer LIC1-… for licensed tiers)
//   body (application/json):
//     { "text": "the topic summary to speak", "voice": "optional voice id" }
//     An optional "format" field (wav | mp3 | opus | flac) is honoured by
//     Kokoro only; ElevenLabs always returns mp3.
//
// Response (200):
//   body: raw audio bytes
//   headers:
//     Content-Type              audio/wav | audio/mpeg
//     X-Recap-Tts-Backend       kokoro | elevenlabs
//     X-Recap-Tts-Voice         voice id used
//     X-Recap-Audio-Duration    seconds (may be absent for ElevenLabs)
//     X-Recap-Credits-Remaining number, or "unlimited"
//     X-Recap-Tier              core | pro | max
//     X-Recap-Credit-Charged    0 | 1
//
// Errors return the standard JSON errorEnvelope (so the client can keep
// its credit pill accurate) with an appropriate status.
//
// Billing: 1 credit per unique job id, deduped exactly like transcribe.
// Gated to Max users on the Recap side; the relay still enforces a
// balance floor so a non-Max install can't drain TTS for free.

import express from "express";
|
|
import { resolveIdentity, identityTier } from "../identity.js";
|
|
import {
|
|
getOrCreateRow,
|
|
commitCredit,
|
|
computeRemaining,
|
|
licenseFingerprint,
|
|
} from "../credits.js";
|
|
import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
|
|
import { getConfigSnapshot, getTierQuotas } from "../config.js";
|
|
import { resolveHardwareConfig } from "../hardware-config.js";
|
|
import { createKokoroBackend } from "../backends/kokoro.js";
|
|
import { createElevenLabsBackend } from "../backends/elevenlabs.js";
|
|
import { errorEnvelope } from "./envelope.js";
|
|
import { recordCall } from "../audit-log.js";
|
|
|
|
/**
 * Pick which TTS backend serves this call, given the operator preference
 * and what is actually available.
 *
 * Preferences:
 *   - "hardware_only": Kokoro or nothing.
 *   - "cloud_only":    ElevenLabs or nothing.
 *   - "cloud_first":   ElevenLabs, falling back to Kokoro.
 *   - "hardware_first" (also used for a missing or unrecognised
 *     preference): Kokoro, falling back to ElevenLabs.
 *
 * @param {object} options
 * @param {string} [options.preference] operator routing preference.
 * @param {boolean} options.kokoroReady truthy when Kokoro can be used.
 * @param {boolean} options.elevenConfigured truthy when ElevenLabs can be used.
 * @returns {"kokoro" | "elevenlabs" | null} the chosen backend; null means
 *   nothing is available and the caller should surface a 503.
 */
function pickTtsBackend({ preference, kokoroReady, elevenConfigured }) {
  // Collapse each availability flag into "backend name or null" so every
  // preference reduces to an ordered fallback chain.
  const kokoro = kokoroReady ? "kokoro" : null;
  const eleven = elevenConfigured ? "elevenlabs" : null;

  switch (preference || "hardware_first") {
    case "hardware_only":
      return kokoro;
    case "cloud_only":
      return eleven;
    case "cloud_first":
      return eleven ?? kokoro;
    default:
      // hardware_first, plus any value we do not recognise.
      return kokoro ?? eleven;
  }
}

/**
 * Serve one POST /relay/tts request end to end: resolve the caller's
 * identity, validate the body, enforce the balance floor, pick a backend,
 * synthesize, bill at most once per job id, audit the call, and send the
 * audio bytes back.
 *
 * Every early exit answers with the JSON errorEnvelope body; the success
 * path answers with raw audio plus the X-Recap-* metadata headers.
 *
 * @param {import("express").Request} req incoming request; `req.body` is
 *   expected to have been parsed by express.json().
 * @param {import("express").Response} res response to write to.
 * @returns {Promise<unknown>} settles once a response has been sent. It
 *   rejects if a helper throws outside the guarded sections; the router
 *   forwards that rejection to Express via next().
 */
async function handleTtsRequest(req, res) {
  const t0 = Date.now();
  const jobId = req.header("X-Recap-Job-Id") || null;

  let identity;
  try {
    identity = await resolveIdentity(req);
  } catch (err) {
    const e = await errorEnvelope({
      error: err?.message || "auth_error",
      statusHint: err?.status || 401,
    });
    return res.status(e.statusHint || 401).json(e.body);
  }
  if (identity.kind === "license" && !identity.installId) {
    const e = await errorEnvelope({
      error: "missing X-Recap-Install-Id header",
      statusHint: 400,
    });
    return res.status(400).json(e.body);
  }
  const { creditKey, installId, license } = identity;

  const text = typeof req.body?.text === "string" ? req.body.text.trim() : "";
  if (!text) {
    const e = await errorEnvelope({
      error: "missing 'text' in request body",
      creditKey,
      installId,
      statusHint: 400,
    });
    return res.status(400).json(e.body);
  }
  const clientVoice =
    typeof req.body?.voice === "string" ? req.body.voice.trim() : "";
  // Optional output format override (wav|mp3|opus|flac). Kokoro emits
  // any of these directly; default comes from config (mp3). ElevenLabs
  // ignores it (always mp3).
  const clientFormat =
    typeof req.body?.format === "string" ? req.body.format.trim().toLowerCase() : "";

  const row = await getOrCreateRow({ creditKey, installId, license });
  const tier = identityTier(identity, row);
  row.tier_snapshot = tier;
  const licenseFp = identity.kind === "cloud" ? null : licenseFingerprint(license);
  // Identities without an install id are audited under their user id.
  const auditInstall = installId || identity.userId || null;

  // Write one audit-log entry. The identity, job and timing fields are the
  // same for every outcome of this request, so they are filled in here and
  // callers supply only what differs.
  const audit = (fields) =>
    recordCall({
      install_id: auditInstall,
      license_fingerprint: licenseFp,
      tier,
      pipeline: "tts",
      duration_ms: Date.now() - t0,
      cost_usd: 0,
      job_id: jobId,
      ...fields,
    });

  // Answer with the standard error envelope at a fixed HTTP status.
  const sendError = async (status, error) => {
    const e = await errorEnvelope({
      error,
      creditKey,
      installId,
      license,
      tier,
      statusHint: status,
    });
    return res.status(status).json(e.body);
  };

  const cfg = await getConfigSnapshot();
  const quota = await getTierQuotas();

  // Balance floor — refuse only when the install has a finite balance
  // that's exhausted (null total = unlimited, e.g. Max). Max users
  // (the intended audience) always pass; this just stops a credit-less
  // Core install from synthesizing for free if it bypasses the Recap-
  // side Max gate.
  const preBalance = computeRemaining(row, quota);
  if (preBalance.total !== null && preBalance.total <= 0) {
    await audit({
      backend: null,
      model: null,
      status: "refused",
      credit_charged: 0,
      audio_seconds: 0,
      error: "no_credits",
    });
    return sendError(402, "no_credits");
  }

  // Resolve availability + choose a backend.
  const hw = await resolveHardwareConfig(cfg);
  const kokoroReady = !!hw.tts?.url;
  const elevenConfigured = !!(
    cfg.relay_elevenlabs_api_key && cfg.relay_elevenlabs_voice_id
  );
  const preference = cfg.relay_tts_backend_preference || "hardware_first";
  const chosen = pickTtsBackend({ preference, kokoroReady, elevenConfigured });
  if (!chosen) {
    const reason = hw.tts?.blocked_reason
      ? hw.tts.blocked_reason
      : "No TTS backend available — configure Spark Control (Kokoro) or relay_elevenlabs_api_key.";
    await audit({
      backend: null,
      model: null,
      status: "error",
      credit_charged: 0,
      audio_seconds: 0,
      error: reason,
    });
    return sendError(503, reason);
  }

  // Decouple billing from routing (same reasoning as transcribe): look
  // up the job to decide whether to charge, but always synthesize.
  const reusedJob = !!lookupJob({ creditKey, installId, license, jobId });

  let result;
  try {
    if (chosen === "kokoro") {
      const backend = createKokoroBackend({
        sparkControlBaseURL: hw.sparkBase || hw.tts.url || "",
        defaultVoice: cfg.relay_tts_default_voice,
        defaultFormat: cfg.relay_tts_format,
      });
      result = await backend.synthesize({ text, voice: clientVoice, format: clientFormat });
    } else {
      const backend = createElevenLabsBackend({
        apiKey: cfg.relay_elevenlabs_api_key,
        voiceId: cfg.relay_elevenlabs_voice_id,
        model: cfg.relay_elevenlabs_model,
      });
      result = await backend.synthesize({ text, voice: clientVoice });
    }
  } catch (err) {
    console.error(`[relay/tts] ${chosen} backend error: ${err?.message}`);
    await audit({
      backend: chosen,
      model: chosen === "kokoro" ? "kokoro" : cfg.relay_elevenlabs_model,
      status: "error",
      credit_charged: 0,
      audio_seconds: 0,
      error: (err?.message || String(err)).slice(0, 200),
    });
    const e = await errorEnvelope({
      error: err?.message || "tts_backend_error",
      creditKey,
      installId,
      license,
      tier,
      statusHint: err?.status || 502,
    });
    // Fall back to 502 so a missing statusHint can never reach res.status().
    return res.status(e.statusHint || 502).json(e.body);
  }

  // Charge once per job id. Operator hardware (Kokoro) is fixed-cost
  // so cost_usd stays 0; ElevenLabs has a real per-char cost but we
  // don't have its billing API wired, so 0 here too (audit shows the
  // call happened; margin tracking for ElevenLabs is a later add).
  //
  // Synthesis can take a while, and requests sharing a job id may run
  // concurrently, so look the job up again right before charging: a
  // sibling request may have been billed in the meantime. This narrows
  // the double-charge window; it does not eliminate it.
  const alreadyCharged =
    reusedJob || !!lookupJob({ creditKey, installId, license, jobId });
  let creditCharged = 0;
  if (!alreadyCharged) {
    await commitCredit({ creditKey, installId, license, backend: chosen, tier });
    await markJobCharged({ creditKey, installId, license, jobId, backend: chosen, tier });
    creditCharged = 1;
  }

  await audit({
    backend: chosen,
    model: result?.model || null,
    status: "success",
    credit_charged: creditCharged,
    audio_seconds: result?.durationSeconds || 0,
    attempts: result?.attempts || null,
  });

  // Post-charge balance for the client's credit pill.
  // NOTE(review): this reuses the row fetched before commitCredit, so it
  // presumably relies on commitCredit updating that same object — confirm.
  const balance = computeRemaining(row, quota);

  res.set("Content-Type", result.contentType || "audio/wav");
  res.set("X-Recap-Tts-Backend", chosen);
  if (result.voice) res.set("X-Recap-Tts-Voice", result.voice);
  if (result.durationSeconds != null) {
    res.set("X-Recap-Audio-Duration", String(result.durationSeconds));
  }
  res.set(
    "X-Recap-Credits-Remaining",
    balance.total == null ? "unlimited" : String(balance.total)
  );
  res.set("X-Recap-Tier", tier);
  res.set("X-Recap-Credit-Charged", String(creditCharged));
  res.set("Cache-Control", "no-store");
  return res.send(result.audio);
}

/**
 * Build the Express router exposing POST /tts.
 *
 * JSON bodies are parsed with a 256kb limit. Any rejection from the handler
 * is passed to next() so Express error middleware answers the request
 * instead of leaving it hanging.
 *
 * @returns {import("express").Router} router with the /tts route mounted.
 */
export function ttsRouter() {
  const router = express.Router();

  router.post("/tts", express.json({ limit: "256kb" }), (req, res, next) => {
    handleTtsRequest(req, res).catch(next);
  });

  return router;
}