Add TTS backends (ElevenLabs, Kokoro) and /relay/tts
This commit is contained in:
@@ -0,0 +1,109 @@
|
||||
// ElevenLabs TTS backend — the cloud alternative to operator-hardware
|
||||
// Kokoro, mirroring how the transcribe/analyze pipelines treat Gemini
|
||||
// as a swappable cloud fallback to the Parakeet/vLLM hardware path.
|
||||
//
|
||||
// Selected when relay_tts_backend_preference routes here (or when
|
||||
// Kokoro is unavailable and a fallback is allowed) AND
|
||||
// relay_elevenlabs_api_key is configured. Like Kokoro, ElevenLabs
|
||||
// handles full multi-sentence paragraphs in one call — one request per
|
||||
// topic summary, no chunking.
|
||||
//
|
||||
// NOTE: This path is implemented to ElevenLabs' documented API but is
|
||||
// UNTESTED against a live key (the operator hadn't supplied one at
|
||||
// build time). The Kokoro path is the tested default. Before relying on
|
||||
// ElevenLabs in production, set relay_elevenlabs_api_key +
|
||||
// relay_elevenlabs_voice_id and smoke-test one /relay/tts call.
|
||||
//
|
||||
// Output: { audio: Buffer, contentType: "audio/mpeg", durationSeconds,
|
||||
// voice, model } — durationSeconds is null (we don't decode
|
||||
// the MP3 frame count here; the Recap server measures it when
|
||||
// transcoding/caching).
|
||||
|
||||
// Root of the ElevenLabs REST API (v1); endpoint paths are appended to it.
const API_BASE = "https://api.elevenlabs.io/v1";

// Model id sent as `model_id` when the operator has not configured one.
const DEFAULT_MODEL = "eleven_turbo_v2_5";

// Upper bound for a single synthesis request: two minutes, in milliseconds.
const DEFAULT_TIMEOUT_MS = 120 * 1000;
|
||||
|
||||
/**
 * Create the ElevenLabs (cloud) text-to-speech backend.
 *
 * @param {object} [options]
 * @param {string} [options.apiKey=""] - Sent as the `xi-api-key` request header.
 * @param {string} [options.voiceId=""] - Operator default voice id, used when
 *   the caller supplies no usable voice.
 * @param {string} [options.model] - Sent as `model_id`; DEFAULT_MODEL is used
 *   when this is empty.
 * @param {number} [options.timeoutMs] - Abort the HTTP request after this many
 *   milliseconds.
 * @returns {{ hasTts: boolean, kind: string, synthesize: Function }}
 *   `hasTts` is true only when both `apiKey` and `voiceId` are set.
 */
export function createElevenLabsBackend({
  apiKey = "",
  voiceId = "",
  model = DEFAULT_MODEL,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  const configured = !!(apiKey && voiceId);

  // Build an Error carrying the HTTP-style `status` the route forwards.
  const statusError = (status, message, cause) => {
    const e = cause === undefined ? new Error(message) : new Error(message, { cause });
    e.status = status;
    return e;
  };

  // Trimmed string, or "" for anything that is not a string.
  const trimmed = (value) => (typeof value === "string" ? value.trim() : "");

  return {
    hasTts: configured,
    kind: "elevenlabs",

    /**
     * Synthesize `text` with one POST to /text-to-speech/{voice}.
     *
     * @param {object} [input]
     * @param {string} input.text - Text to speak; whitespace runs are collapsed.
     * @param {string} [input.voice] - Optional voice id overriding the
     *   operator default.
     * @returns {Promise<object>} `{ audio: Buffer, contentType, durationSeconds,
     *   sentenceCount, attempts, voice, model }`; `durationSeconds` and
     *   `sentenceCount` are always null here.
     * @throws {Error} with `status` 503 (missing API key or voice), 400 (empty
     *   text), 502 (fetch failed or timed out), or the upstream HTTP status
     *   for a non-2xx response.
     */
    async synthesize({ text, voice } = {}) {
      if (!apiKey) {
        throw statusError(
          503,
          "ElevenLabs TTS is not configured — set relay_elevenlabs_api_key"
        );
      }
      // `voice` from the client overrides the operator default voice id when
      // present. Each candidate is trimmed BEFORE the fallback so a
      // whitespace-only client value cannot shadow the configured voice.
      const chosenVoice = trimmed(voice) || trimmed(voiceId);
      if (!chosenVoice) {
        throw statusError(
          503,
          "ElevenLabs TTS has no voice — set relay_elevenlabs_voice_id"
        );
      }
      const cleaned = typeof text === "string" ? text.replace(/\s+/g, " ").trim() : "";
      if (!cleaned) {
        throw statusError(400, "TTS input text is empty");
      }

      const modelId = model || DEFAULT_MODEL;
      const url = `${API_BASE}/text-to-speech/${encodeURIComponent(chosenVoice)}`;
      let res;
      try {
        // Public-internet call — use the global fetch with full cert
        // validation (NOT lanFetch, which is scoped to LAN/Spark Control).
        res = await fetch(url, {
          method: "POST",
          headers: {
            "xi-api-key": apiKey,
            "Content-Type": "application/json",
            Accept: "audio/mpeg",
          },
          body: JSON.stringify({
            text: cleaned,
            model_id: modelId,
          }),
          signal: AbortSignal.timeout(timeoutMs),
        });
      } catch (err) {
        // Keep the original failure reachable through `cause` for debugging.
        throw statusError(502, `ElevenLabs TTS network error: ${err?.message || err}`, err);
      }

      if (!res.ok) {
        let body = "";
        try {
          body = await res.text();
        } catch {
          // Best effort only: the status code alone is still reported below.
        }
        throw statusError(res.status, `ElevenLabs TTS ${res.status}: ${body.slice(0, 300)}`);
      }

      const audio = Buffer.from(await res.arrayBuffer());
      return {
        audio,
        contentType: "audio/mpeg",
        durationSeconds: null,
        sentenceCount: null,
        attempts: 1,
        voice: chosenVoice,
        model: modelId,
      };
    },
  };
}
|
||||
@@ -0,0 +1,140 @@
|
||||
// Kokoro TTS backend — synthesizes a topic summary into speech via Spark
|
||||
// Control's OpenAI-compatible /v1/audio/speech endpoint.
|
||||
//
|
||||
// Kokoro-82M (Apache-2.0, hexgrad/Kokoro-82M) replaced Magpie in Spark
|
||||
// Control v0.14.0. Magpie's NVIDIA Riva decoder had a structural
|
||||
// truncation defect that capped end-to-end reliability at ~85% even with
|
||||
// server-side retries + chunking; Kokoro renders cleanly at any length
|
||||
// (100% in our testing, ~1s for a ~100-word summary, no truncation). So
|
||||
// this backend is a single pass-through call — NONE of the Magpie-era
|
||||
// fragmenting, pacing/recovery-gap, duration-check, retry, or WAV
|
||||
// stitching is needed or present.
|
||||
//
|
||||
// Output: 24kHz mono 16-bit PCM. Kokoro can emit wav/mp3/opus/flac
|
||||
// directly via response_format, so we request the caller's format (mp3
|
||||
// by default — small + universally playable for the mobile/offline
|
||||
// player) and never transcode client-side. durationSeconds is left null:
|
||||
// Kokoro's WAV header carries a placeholder size field (bogus computed
|
||||
// duration), and for mp3 we'd have to decode — the Recap side measures
|
||||
// duration off the cached file / <audio> element instead.
|
||||
|
||||
import { lanFetch } from "../lan-fetch.js";
|
||||
|
||||
// Per-request timeout for the speech call: one minute, in milliseconds.
const DEFAULT_TIMEOUT_MS = 60 * 1000;

// Voice and output format used when neither the caller nor the operator
// configuration names one.
const DEFAULT_VOICE = "bm_george";
const DEFAULT_FORMAT = "mp3";

// Number of extra attempts after a failed call that was not a 4xx. Client
// errors are never retried, and there is no duration-based retry.
const RETRY_ON_5XX = 1;

// Response Content-Type for each `response_format` value that may be
// requested.
const FORMAT_CONTENT_TYPE = {
  wav: "audio/wav",
  mp3: "audio/mpeg",
  opus: "audio/ogg",
  flac: "audio/flac",
};
|
||||
|
||||
/**
 * Pause for roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleepMs(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
/**
 * Create the Kokoro text-to-speech backend, which POSTs to Spark Control's
 * `/v1/audio/speech` endpoint through `lanFetch`.
 *
 * @param {object} [options]
 * @param {string} [options.sparkControlBaseURL=""] - Spark Control base URL.
 *   Surrounding whitespace, trailing slashes and a trailing `/api/endpoints`
 *   suffix are stripped.
 * @param {string} [options.defaultVoice] - Operator default voice.
 * @param {string} [options.defaultFormat] - Operator default output format.
 * @param {number} [options.timeoutMs] - Abort each HTTP attempt after this
 *   many milliseconds.
 * @returns {{ hasTts: boolean, kind: string, synthesize: Function }}
 *   `hasTts` is true when a non-empty base URL remains after normalization.
 */
export function createKokoroBackend({
  // Spark Control base URL (no path) — derived by the caller from
  // relay_spark_control_url with the /api/endpoints suffix stripped.
  sparkControlBaseURL = "",
  defaultVoice = DEFAULT_VOICE,
  defaultFormat = DEFAULT_FORMAT,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  // Trimmed string, or "" for anything that is not a string.
  const trimmed = (value) => (typeof value === "string" ? value.trim() : "");

  // Strip ALL trailing slashes, before and after removing the discovery
  // suffix, so inputs such as ".../api/endpoints//" cannot leave a stray
  // slash that would produce "//v1/audio/speech".
  const sparkBase = trimmed(sparkControlBaseURL)
    .replace(/\/+$/, "")
    .replace(/\/api\/endpoints$/, "")
    .replace(/\/+$/, "");

  // Build an Error carrying the HTTP-style `status` the route forwards.
  const statusError = (status, message, cause) => {
    const e = cause === undefined ? new Error(message) : new Error(message, { cause });
    e.status = status;
    return e;
  };

  // One HTTP attempt. Resolves to the audio bytes; rejects with a
  // status-tagged Error (502 when the fetch itself fails, otherwise the
  // upstream status).
  async function callKokoro({ text, voice, format }) {
    const url = `${sparkBase}/v1/audio/speech`;
    let res;
    try {
      res = await lanFetch(url, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        redirect: "follow",
        body: JSON.stringify({
          model: "kokoro",
          input: text,
          voice,
          response_format: format,
        }),
        signal: AbortSignal.timeout(timeoutMs),
      });
    } catch (err) {
      // Fold the nested cause into the message when one is present, since
      // the top-level message alone may not identify the failure.
      const cause = err?.cause?.message || err?.cause?.code || err?.cause || "";
      const detail = cause ? `${err.message} (cause: ${cause})` : err?.message || String(err);
      throw statusError(502, `Kokoro TTS network error at ${url}: ${detail}`, err);
    }
    if (!res.ok) {
      let body = "";
      try {
        body = await res.text();
      } catch {
        // Best effort only: the status code alone is still reported below.
      }
      throw statusError(res.status, `Kokoro TTS ${res.status} at ${url}: ${body.slice(0, 300)}`);
    }
    return Buffer.from(await res.arrayBuffer());
  }

  return {
    hasTts: !!sparkBase,
    kind: "kokoro",

    /**
     * Synthesize `text`, retrying up to RETRY_ON_5XX times when an attempt
     * fails with anything other than a 4xx status.
     *
     * @param {object} [input]
     * @param {string} input.text - Text to speak; whitespace runs are collapsed.
     * @param {string} [input.voice] - Optional voice overriding the default.
     * @param {string} [input.format] - Optional output format (lower-cased
     *   before use).
     * @returns {Promise<object>} `{ audio: Buffer, contentType,
     *   durationSeconds, voice, model, format, attempts }`; `durationSeconds`
     *   is always null here.
     * @throws {Error} with `status` 503 (no base URL), 400 (empty text), or
     *   the status of the last failed attempt.
     */
    async synthesize({ text, voice, format } = {}) {
      if (!sparkBase) {
        throw statusError(
          503,
          "Kokoro TTS is not configured — Spark Control discovery isn't reporting a ready kokoro endpoint"
        );
      }
      const cleaned = typeof text === "string" ? text.replace(/\s+/g, " ").trim() : "";
      if (!cleaned) {
        throw statusError(400, "TTS input text is empty");
      }
      // Trim each candidate BEFORE falling back so a whitespace-only client
      // value defers to the operator default rather than the built-in one.
      const chosenVoice = trimmed(voice) || trimmed(defaultVoice) || DEFAULT_VOICE;
      const fmt = (trimmed(format) || trimmed(defaultFormat) || DEFAULT_FORMAT).toLowerCase();
      const contentType = FORMAT_CONTENT_TYPE[fmt] || "application/octet-stream";

      // Retry only on non-4xx failures; a 4xx (bad voice/format) is
      // deterministic and surfaces immediately.
      for (let attempt = 0; ; attempt += 1) {
        try {
          const audio = await callKokoro({ text: cleaned, voice: chosenVoice, format: fmt });
          return {
            audio,
            contentType,
            durationSeconds: null,
            voice: chosenVoice,
            model: "kokoro",
            format: fmt,
            attempts: attempt + 1,
          };
        } catch (err) {
          const status = err?.status || 0;
          const isClientError = status >= 400 && status < 500;
          if (isClientError || attempt >= RETRY_ON_5XX) throw err;
          console.warn(
            `[kokoro] TTS call failed (${status || "network"}) — retry ${attempt + 1}/${RETRY_ON_5XX}`
          );
          await sleepMs(500);
        }
      }
    },
  };
}
|
||||
@@ -0,0 +1,289 @@
|
||||
// POST /relay/tts — synthesize a topic summary into speech for the
|
||||
// Recap app's audio-first ("walking mode") player. Returns the raw
|
||||
// audio bytes (MP3 by default from Kokoro or ElevenLabs) with credit
|
||||
// metadata in response headers.
|
||||
//
|
||||
// Request:
|
||||
// headers:
|
||||
// X-Recap-Install-Id (required)
|
||||
// X-Recap-Job-Id (optional but expected — credit dedup key;
|
||||
// the Recap client sends ONE id per recap so
|
||||
// synthesizing all N topics of a recap costs
|
||||
// at most 1 credit, like transcribe+analyze)
|
||||
// Authorization (optional Bearer LIC1-… for licensed tiers)
|
||||
// body (application/json):
|
||||
// { "text": "the topic summary to speak", "voice": "optional voice id", "format": "optional wav|mp3|opus|flac (Kokoro only; ElevenLabs always returns mp3)" }
|
||||
//
|
||||
// Response (200):
|
||||
// body: raw audio bytes
|
||||
// headers:
|
||||
// Content-Type audio/mpeg (default) | audio/wav | audio/ogg | audio/flac
|
||||
// X-Recap-Tts-Backend kokoro | elevenlabs
|
||||
// X-Recap-Tts-Voice voice id used
|
||||
// X-Recap-Audio-Duration seconds (omitted when the backend reports no duration — currently both Kokoro and ElevenLabs)
|
||||
// X-Recap-Credits-Remaining number, or "unlimited"
|
||||
// X-Recap-Tier core | pro | max
|
||||
// X-Recap-Credit-Charged 0 | 1
|
||||
//
|
||||
// Errors return the standard JSON errorEnvelope (so the client can keep
|
||||
// its credit pill accurate) with an appropriate status.
|
||||
//
|
||||
// Billing: 1 credit per unique job id, deduped exactly like transcribe.
|
||||
// Gated to Max users on the Recap side; the relay still enforces a
|
||||
// balance floor so a non-Max install can't drain TTS for free.
|
||||
|
||||
import express from "express";
|
||||
import { resolveIdentity, identityTier } from "../identity.js";
|
||||
import {
|
||||
getOrCreateRow,
|
||||
commitCredit,
|
||||
computeRemaining,
|
||||
licenseFingerprint,
|
||||
} from "../credits.js";
|
||||
import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
|
||||
import { getConfigSnapshot, getTierQuotas } from "../config.js";
|
||||
import { resolveHardwareConfig } from "../hardware-config.js";
|
||||
import { createKokoroBackend } from "../backends/kokoro.js";
|
||||
import { createElevenLabsBackend } from "../backends/elevenlabs.js";
|
||||
import { errorEnvelope } from "./envelope.js";
|
||||
import { recordCall } from "../audit-log.js";
|
||||
|
||||
// Pick which TTS backend serves this call given the operator preference
|
||||
// and what's actually available. Returns "kokoro" | "elevenlabs" | null
|
||||
// (null = nothing available → caller surfaces a 503).
|
||||
/**
 * Decide which TTS backend serves a call.
 *
 * @param {object} options
 * @param {string} [options.preference] - "hardware_only", "cloud_only",
 *   "cloud_first" or "hardware_first"; empty or unrecognised values behave as
 *   "hardware_first".
 * @param {boolean} options.kokoroReady - Whether the Kokoro path is usable.
 * @param {boolean} options.elevenConfigured - Whether ElevenLabs is usable.
 * @returns {"kokoro" | "elevenlabs" | null} null when nothing usable matches
 *   the preference.
 */
function pickTtsBackend({ preference, kokoroReady, elevenConfigured }) {
  const hardware = kokoroReady ? "kokoro" : null;
  const cloud = elevenConfigured ? "elevenlabs" : null;

  switch (preference || "hardware_first") {
    case "hardware_only":
      return hardware;
    case "cloud_only":
      return cloud;
    case "cloud_first":
      return cloud ?? hardware;
    default:
      // hardware_first, plus any value not handled above
      return hardware ?? cloud;
  }
}
|
||||
|
||||
/**
 * Build the Express router that serves POST /tts.
 *
 * @returns {import("express").Router} Router with the single /tts route
 *   (JSON body, 256kb limit).
 */
export function ttsRouter() {
  const router = express.Router();

  router.post("/tts", express.json({ limit: "256kb" }), (req, res, next) => {
    // Forward any unexpected rejection (config/credit-store failures, etc.)
    // to Express's error handling instead of leaving the request hanging on
    // an unhandled promise rejection.
    handleTtsRequest(req, res).catch(next);
  });

  return router;
}

/**
 * Handle one POST /tts request: resolve identity, validate the body, enforce
 * the balance floor, pick a backend, synthesize, charge at most once per job
 * id, write an audit record and stream the audio back.
 *
 * Expected failures are answered here with a JSON error envelope; anything
 * unexpected rejects and is forwarded by the route wrapper.
 */
async function handleTtsRequest(req, res) {
  const t0 = Date.now();
  const jobId = req.header("X-Recap-Job-Id") || null;

  let identity;
  try {
    identity = await resolveIdentity(req);
  } catch (err) {
    const e = await errorEnvelope({
      error: err?.message || "auth_error",
      statusHint: err?.status || 401,
    });
    return res.status(e.statusHint || 401).json(e.body);
  }
  if (identity.kind === "license" && !identity.installId) {
    const e = await errorEnvelope({
      error: "missing X-Recap-Install-Id header",
      statusHint: 400,
    });
    return res.status(400).json(e.body);
  }
  const { creditKey, installId, license } = identity;

  const text = typeof req.body?.text === "string" ? req.body.text.trim() : "";
  if (!text) {
    const e = await errorEnvelope({
      error: "missing 'text' in request body",
      creditKey,
      installId,
      statusHint: 400,
    });
    return res.status(400).json(e.body);
  }
  const clientVoice =
    typeof req.body?.voice === "string" ? req.body.voice.trim() : "";
  // Optional output format override (wav|mp3|opus|flac). Only the Kokoro
  // branch below receives it; the ElevenLabs branch is called without it.
  const clientFormat =
    typeof req.body?.format === "string" ? req.body.format.trim().toLowerCase() : "";

  const row = await getOrCreateRow({ creditKey, installId, license });
  const tier = identityTier(identity, row);
  row.tier_snapshot = tier;
  const licenseFp = identity.kind === "cloud" ? null : licenseFingerprint(license);
  const auditInstall = installId || identity.userId || null;

  // Single place that shapes audit records, so every outcome (refused, error,
  // success) is attributed to the same install/user id. `error` and
  // `attempts` are only added when supplied.
  const audit = ({ backend, model, status, creditCharged = 0, audioSeconds = 0, error, attempts }) => {
    const entry = {
      install_id: auditInstall,
      license_fingerprint: licenseFp,
      tier,
      pipeline: "tts",
      backend,
      model,
      status,
      credit_charged: creditCharged,
      duration_ms: Date.now() - t0,
      audio_seconds: audioSeconds,
      cost_usd: 0,
      job_id: jobId,
    };
    if (error !== undefined) entry.error = error;
    if (attempts !== undefined) entry.attempts = attempts;
    return recordCall(entry);
  };

  const cfg = await getConfigSnapshot();
  const quota = await getTierQuotas();

  // Balance floor — refuse only when the install has a finite balance that's
  // exhausted (null total = unlimited). This stops a credit-less install
  // from synthesizing for free if it bypasses the Recap-side gate.
  const preBalance = computeRemaining(row, quota);
  if (preBalance.total !== null && preBalance.total <= 0) {
    await audit({ backend: null, model: null, status: "refused", error: "no_credits" });
    const e = await errorEnvelope({
      error: "no_credits",
      creditKey,
      installId,
      license,
      tier,
      statusHint: 402,
    });
    return res.status(402).json(e.body);
  }

  // Resolve availability + choose a backend.
  const hw = await resolveHardwareConfig(cfg);
  const kokoroReady = !!hw.tts?.url;
  const elevenConfigured = !!(
    cfg.relay_elevenlabs_api_key && cfg.relay_elevenlabs_voice_id
  );
  const preference = cfg.relay_tts_backend_preference || "hardware_first";
  const chosen = pickTtsBackend({ preference, kokoroReady, elevenConfigured });
  if (!chosen) {
    const reason = hw.tts?.blocked_reason
      ? hw.tts.blocked_reason
      : "No TTS backend available — configure Spark Control (Kokoro) or relay_elevenlabs_api_key.";
    await audit({ backend: null, model: null, status: "error", error: reason });
    const e = await errorEnvelope({
      error: reason,
      creditKey,
      installId,
      license,
      tier,
      statusHint: 503,
    });
    return res.status(503).json(e.body);
  }

  // Decouple billing from routing: look up the job to decide whether to
  // charge, but always synthesize. The await is a no-op if lookupJob is
  // synchronous and prevents an always-truthy Promise if it is not.
  const reusedJob = !!(await lookupJob({ creditKey, installId, license, jobId }));

  let result;
  try {
    if (chosen === "kokoro") {
      const backend = createKokoroBackend({
        sparkControlBaseURL: hw.sparkBase || hw.tts.url || "",
        defaultVoice: cfg.relay_tts_default_voice,
        defaultFormat: cfg.relay_tts_format,
      });
      result = await backend.synthesize({ text, voice: clientVoice, format: clientFormat });
    } else {
      const backend = createElevenLabsBackend({
        apiKey: cfg.relay_elevenlabs_api_key,
        voiceId: cfg.relay_elevenlabs_voice_id,
        model: cfg.relay_elevenlabs_model,
      });
      result = await backend.synthesize({ text, voice: clientVoice });
    }
  } catch (err) {
    console.error(`[relay/tts] ${chosen} backend error: ${err?.message}`);
    await audit({
      backend: chosen,
      model: chosen === "kokoro" ? "kokoro" : cfg.relay_elevenlabs_model,
      status: "error",
      error: (err?.message || String(err)).slice(0, 200),
    });
    const e = await errorEnvelope({
      error: err?.message || "tts_backend_error",
      creditKey,
      installId,
      license,
      tier,
      statusHint: err?.status || 502,
    });
    // Fall back when the envelope carries no status, as the auth path does.
    return res.status(e.statusHint || err?.status || 502).json(e.body);
  }

  // Charge once per job id. cost_usd is recorded as 0 for both backends.
  let creditCharged = 0;
  if (!reusedJob) {
    await commitCredit({ creditKey, installId, license, backend: chosen, tier });
    await markJobCharged({ creditKey, installId, license, jobId, backend: chosen, tier });
    creditCharged = 1;
  }

  await audit({
    backend: chosen,
    model: result?.model || null,
    status: "success",
    creditCharged,
    audioSeconds: result?.durationSeconds || 0,
    attempts: result?.attempts || null,
  });

  // Balance for the client's credit pill.
  // NOTE(review): this reuses the `row` fetched before commitCredit; it is
  // only post-charge if commitCredit updates that same object — confirm.
  const balance = computeRemaining(row, quota);

  res.set("Content-Type", result.contentType || "audio/wav");
  res.set("X-Recap-Tts-Backend", chosen);
  if (result.voice) res.set("X-Recap-Tts-Voice", result.voice);
  if (result.durationSeconds != null) {
    res.set("X-Recap-Audio-Duration", String(result.durationSeconds));
  }
  res.set(
    "X-Recap-Credits-Remaining",
    balance.total == null ? "unlimited" : String(balance.total)
  );
  res.set("X-Recap-Tier", tier);
  res.set("X-Recap-Credit-Charged", String(creditCharged));
  res.set("Cache-Control", "no-store");
  return res.send(result.audio);
}
|
||||
Reference in New Issue
Block a user