Add TTS backends (ElevenLabs, Kokoro) and /relay/tts
This commit is contained in:
@@ -0,0 +1,289 @@
|
||||
// POST /relay/tts — synthesize a topic summary into speech for the
// Recap app's audio-first ("walking mode") player. Returns the raw
// audio bytes (MP3 by default from Kokoro or ElevenLabs) with credit
// metadata in response headers.
//
// Request:
//   headers:
//     X-Recap-Install-Id   (required)
//     X-Recap-Job-Id       (optional but expected — credit dedup key;
//                           the Recap client sends ONE id per recap so
//                           synthesizing all N topics of a recap costs
//                           at most 1 credit, like transcribe+analyze)
//     Authorization        (optional Bearer LIC1-… for licensed tiers)
//   body (application/json):
//     {
//       "text":   "the topic summary to speak",
//       "voice":  "optional voice id",
//       "format": "optional output format (wav|mp3|opus|flac); Kokoro only"
//     }
//
// Response (200):
//   body: raw audio bytes
//   headers:
//     Content-Type             audio/wav | audio/mpeg
//     X-Recap-Tts-Backend      kokoro | elevenlabs
//     X-Recap-Tts-Voice        voice id used
//     X-Recap-Audio-Duration   seconds (may be absent for ElevenLabs)
//     X-Recap-Credits-Remaining number, or "unlimited"
//     X-Recap-Tier             core | pro | max
//     X-Recap-Credit-Charged   0 | 1
//
// Errors return the standard JSON errorEnvelope (so the client can keep
// its credit pill accurate) with an appropriate status.
//
// Billing: 1 credit per unique job id, deduped exactly like transcribe.
// Gated to Max users on the Recap side; the relay still enforces a
// balance floor so a non-Max install can't drain TTS for free.
|
||||
|
||||
import express from "express";
|
||||
import { resolveIdentity, identityTier } from "../identity.js";
|
||||
import {
|
||||
getOrCreateRow,
|
||||
commitCredit,
|
||||
computeRemaining,
|
||||
licenseFingerprint,
|
||||
} from "../credits.js";
|
||||
import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
|
||||
import { getConfigSnapshot, getTierQuotas } from "../config.js";
|
||||
import { resolveHardwareConfig } from "../hardware-config.js";
|
||||
import { createKokoroBackend } from "../backends/kokoro.js";
|
||||
import { createElevenLabsBackend } from "../backends/elevenlabs.js";
|
||||
import { errorEnvelope } from "./envelope.js";
|
||||
import { recordCall } from "../audit-log.js";
|
||||
|
||||
/**
 * Pick which TTS backend serves this call, given the operator preference
 * and what is actually available.
 *
 * @param {object} opts
 * @param {string} [opts.preference] - "hardware_only" | "cloud_only" |
 *   "cloud_first" | "hardware_first". A falsy or unrecognized value is
 *   treated as "hardware_first".
 * @param {*} opts.kokoroReady - truthy when the Kokoro backend is usable.
 * @param {*} opts.elevenConfigured - truthy when ElevenLabs is configured.
 * @returns {"kokoro" | "elevenlabs" | null} the backend to use, or null
 *   when nothing is available (the caller surfaces a 503).
 */
function pickTtsBackend({ preference, kokoroReady, elevenConfigured }) {
  // Resolve each candidate once; null means "not available".
  const hardware = kokoroReady ? "kokoro" : null;
  const cloud = elevenConfigured ? "elevenlabs" : null;

  switch (preference || "hardware_first") {
    case "hardware_only":
      return hardware;
    case "cloud_only":
      return cloud;
    case "cloud_first":
      return cloud ?? hardware;
    default:
      // hardware_first (default) — also the fallback for unknown values.
      return hardware ?? cloud;
  }
}
|
||||
|
||||
/**
 * Build the Express router that serves POST /tts.
 *
 * The handler resolves the caller's identity, enforces a balance floor,
 * picks a TTS backend (Kokoro or ElevenLabs), synthesizes the requested
 * text, charges at most one credit per job id, writes an audit record,
 * and responds with the raw audio plus credit metadata headers.
 *
 * @returns {import("express").Router} router with the /tts route mounted.
 */
export function ttsRouter() {
  const router = express.Router();

  // The real handler. Kept separate from the route registration so any
  // rejection can be forwarded to Express' error pipeline in one place.
  const handleTts = async (req, res) => {
    const t0 = Date.now();
    const jobId = req.header("X-Recap-Job-Id") || null;

    let identity;
    try {
      identity = await resolveIdentity(req);
    } catch (err) {
      // No creditKey/installId is known yet, so the envelope carries only
      // the error and status.
      const e = await errorEnvelope({
        error: err?.message || "auth_error",
        statusHint: err?.status || 401,
      });
      return res.status(e.statusHint || 401).json(e.body);
    }
    if (identity.kind === "license" && !identity.installId) {
      const e = await errorEnvelope({
        error: "missing X-Recap-Install-Id header",
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }
    const { creditKey, installId, license } = identity;

    const text = typeof req.body?.text === "string" ? req.body.text.trim() : "";
    if (!text) {
      const e = await errorEnvelope({
        error: "missing 'text' in request body",
        creditKey,
        installId,
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }
    const clientVoice =
      typeof req.body?.voice === "string" ? req.body.voice.trim() : "";
    // Optional output format override (wav|mp3|opus|flac). Kokoro emits
    // any of these directly; default comes from config (mp3). ElevenLabs
    // ignores it (always mp3).
    const clientFormat =
      typeof req.body?.format === "string" ? req.body.format.trim().toLowerCase() : "";

    const row = await getOrCreateRow({ creditKey, installId, license });
    const tier = identityTier(identity, row);
    row.tier_snapshot = tier;
    const licenseFp = identity.kind === "cloud" ? null : licenseFingerprint(license);
    // Identities without an install id fall back to their user id so the
    // audit trail still attributes the call.
    const auditInstall = installId || identity.userId || null;

    // Single place that shapes audit records, so every outcome (refused,
    // error, success) is attributed the same way. Key order matches the
    // records this route has always written.
    const audit = ({ backend, model, status, creditCharged, audioSeconds, ...extra }) =>
      recordCall({
        install_id: auditInstall,
        license_fingerprint: licenseFp,
        tier,
        pipeline: "tts",
        backend,
        model,
        status,
        credit_charged: creditCharged,
        duration_ms: Date.now() - t0,
        audio_seconds: audioSeconds,
        cost_usd: 0,
        job_id: jobId,
        ...extra,
      });

    const cfg = await getConfigSnapshot();
    const quota = await getTierQuotas();

    // Balance floor — refuse only when the install has a finite balance
    // that's exhausted (null total = unlimited, e.g. Max). Max users
    // (the intended audience) always pass; this just stops a credit-less
    // Core install from synthesizing for free if it bypasses the Recap-
    // side Max gate.
    const preBalance = computeRemaining(row, quota);
    if (preBalance.total !== null && preBalance.total <= 0) {
      await audit({
        backend: null,
        model: null,
        status: "refused",
        creditCharged: 0,
        audioSeconds: 0,
        error: "no_credits",
      });
      const e = await errorEnvelope({
        error: "no_credits",
        creditKey,
        installId,
        license,
        tier,
        statusHint: 402,
      });
      return res.status(402).json(e.body);
    }

    // Resolve availability + choose a backend.
    const hw = await resolveHardwareConfig(cfg);
    const kokoroReady = !!hw.tts?.url;
    const elevenConfigured = !!(
      cfg.relay_elevenlabs_api_key && cfg.relay_elevenlabs_voice_id
    );
    const preference = cfg.relay_tts_backend_preference || "hardware_first";
    const chosen = pickTtsBackend({ preference, kokoroReady, elevenConfigured });
    if (!chosen) {
      const reason = hw.tts?.blocked_reason
        ? hw.tts.blocked_reason
        : "No TTS backend available — configure Spark Control (Kokoro) or relay_elevenlabs_api_key.";
      await audit({
        backend: null,
        model: null,
        status: "error",
        creditCharged: 0,
        audioSeconds: 0,
        error: reason,
      });
      const e = await errorEnvelope({
        error: reason,
        creditKey,
        installId,
        license,
        tier,
        statusHint: 503,
      });
      return res.status(503).json(e.body);
    }

    // Decouple billing from routing (same reasoning as transcribe): look
    // up the job to decide whether to charge, but always synthesize.
    const jobKey = { creditKey, installId, license, jobId };
    const reusedJob = !!lookupJob(jobKey);

    let result;
    try {
      if (chosen === "kokoro") {
        const backend = createKokoroBackend({
          sparkControlBaseURL: hw.sparkBase || hw.tts.url || "",
          defaultVoice: cfg.relay_tts_default_voice,
          defaultFormat: cfg.relay_tts_format,
        });
        result = await backend.synthesize({ text, voice: clientVoice, format: clientFormat });
      } else {
        const backend = createElevenLabsBackend({
          apiKey: cfg.relay_elevenlabs_api_key,
          voiceId: cfg.relay_elevenlabs_voice_id,
          model: cfg.relay_elevenlabs_model,
        });
        result = await backend.synthesize({ text, voice: clientVoice });
      }
    } catch (err) {
      console.error(`[relay/tts] ${chosen} backend error: ${err?.message}`);
      await audit({
        backend: chosen,
        model: chosen === "kokoro" ? "kokoro" : cfg.relay_elevenlabs_model,
        status: "error",
        creditCharged: 0,
        audioSeconds: 0,
        // Truncated so a verbose upstream error cannot bloat the audit row.
        error: (err?.message || String(err)).slice(0, 200),
      });
      const e = await errorEnvelope({
        error: err?.message || "tts_backend_error",
        creditKey,
        installId,
        license,
        tier,
        statusHint: err?.status || 502,
      });
      // Fall back the same way the auth path does, in case the envelope
      // does not echo a status hint.
      return res.status(e.statusHint || err?.status || 502).json(e.body);
    }

    // Charge once per job id. Operator hardware (Kokoro) is fixed-cost
    // so cost_usd stays 0; ElevenLabs has a real per-char cost but we
    // don't have its billing API wired, so 0 here too (audit shows the
    // call happened; margin tracking for ElevenLabs is a later add).
    //
    // The ledger is consulted again here because synthesis is awaited:
    // a sibling request carrying the same job id may have been charged
    // in the meantime. This narrows the double-charge window but does
    // not close it (commitCredit is awaited before markJobCharged).
    const alreadyCharged = reusedJob || !!lookupJob(jobKey);
    let creditCharged = 0;
    if (!alreadyCharged) {
      await commitCredit({ creditKey, installId, license, backend: chosen, tier });
      await markJobCharged({ creditKey, installId, license, jobId, backend: chosen, tier });
      creditCharged = 1;
    }

    await audit({
      backend: chosen,
      model: result?.model || null,
      status: "success",
      creditCharged,
      audioSeconds: result?.durationSeconds || 0,
      attempts: result?.attempts || null,
    });

    // Post-charge balance for the client's credit pill.
    // NOTE(review): this reuses the row fetched before commitCredit;
    // presumably commitCredit updates that same row object — confirm.
    const balance = computeRemaining(row, quota);

    res.set("Content-Type", result.contentType || "audio/wav");
    res.set("X-Recap-Tts-Backend", chosen);
    if (result.voice) res.set("X-Recap-Tts-Voice", result.voice);
    if (result.durationSeconds != null) {
      res.set("X-Recap-Audio-Duration", String(result.durationSeconds));
    }
    res.set(
      "X-Recap-Credits-Remaining",
      balance.total == null ? "unlimited" : String(balance.total)
    );
    res.set("X-Recap-Tier", tier);
    res.set("X-Recap-Credit-Charged", String(creditCharged));
    res.set("Cache-Control", "no-store");
    return res.send(result.audio);
  };

  router.post("/tts", express.json({ limit: "256kb" }), (req, res, next) => {
    // Forward any rejection (storage, config, audit, billing) to the
    // Express error handler instead of leaving the promise unhandled
    // and the request hanging.
    handleTts(req, res).catch(next);
  });

  return router;
}
|
||||
Reference in New Issue
Block a user