Add TTS backends (ElevenLabs, Kokoro) and /relay/tts
This commit is contained in:
@@ -0,0 +1,109 @@
|
|||||||
|
// ElevenLabs TTS backend — the cloud alternative to operator-hardware
|
||||||
|
// Kokoro, mirroring how the transcribe/analyze pipelines treat Gemini
|
||||||
|
// as a swappable cloud fallback to the Parakeet/vLLM hardware path.
|
||||||
|
//
|
||||||
|
// Selected when relay_tts_backend_preference routes here (or when
|
||||||
|
// Kokoro is unavailable and a fallback is allowed) AND
|
||||||
|
// relay_elevenlabs_api_key is configured. Like Kokoro, ElevenLabs
|
||||||
|
// handles full multi-sentence paragraphs in one call — one request per
|
||||||
|
// topic summary, no chunking.
|
||||||
|
//
|
||||||
|
// NOTE: This path is implemented to ElevenLabs' documented API but is
|
||||||
|
// UNTESTED against a live key (the operator hadn't supplied one at
|
||||||
|
// build time). The Kokoro path is the tested default. Before relying on
|
||||||
|
// ElevenLabs in production, set relay_elevenlabs_api_key +
|
||||||
|
// relay_elevenlabs_voice_id and smoke-test one /relay/tts call.
|
||||||
|
//
|
||||||
|
// Output: { audio: Buffer, contentType: "audio/mpeg", durationSeconds,
|
||||||
|
// voice, model } — durationSeconds is null (we don't decode
|
||||||
|
// the MP3 frame count here; the Recap server measures it when
|
||||||
|
// transcoding/caching).
|
||||||
|
|
||||||
|
// Root of the ElevenLabs REST API; endpoint paths are appended to this.
const API_BASE = "https://api.elevenlabs.io/v1";

// Model id sent as `model_id` when the caller does not provide one.
const DEFAULT_MODEL = "eleven_turbo_v2_5";

// Default per-request timeout, in milliseconds (two minutes).
const DEFAULT_TIMEOUT_MS = 2 * 60 * 1000;
|
||||||
|
|
||||||
|
/**
 * Create the ElevenLabs (cloud) text-to-speech backend.
 *
 * @param {object} [options]
 * @param {string} [options.apiKey]    ElevenLabs API key, sent as `xi-api-key`.
 * @param {string} [options.voiceId]   Operator default voice id.
 * @param {string} [options.model]     Model id sent as `model_id`; a falsy
 *                                     value falls back to DEFAULT_MODEL.
 * @param {number} [options.timeoutMs] Abort the HTTP request after this many ms.
 * @returns {{ hasTts: boolean, kind: "elevenlabs", synthesize: Function }}
 *   `hasTts` is true only when both `apiKey` and `voiceId` are set.
 */
export function createElevenLabsBackend({
  apiKey = "",
  voiceId = "",
  model = DEFAULT_MODEL,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  const configured = !!(apiKey && voiceId);
  const modelId = model || DEFAULT_MODEL;

  // Every failure is an Error carrying an HTTP-style `status`, which the
  // caller reads to choose its response code.
  function fail(message, status, cause) {
    const e = cause === undefined ? new Error(message) : new Error(message, { cause });
    e.status = status;
    return e;
  }

  // Trimmed string, or "" for anything that is not a string.
  const asTrimmed = (value) => (typeof value === "string" ? value.trim() : "");

  return {
    hasTts: configured,
    kind: "elevenlabs",

    /**
     * Synthesize `text` with one POST to /text-to-speech/{voice}.
     *
     * @param {object} args
     * @param {string} args.text    Text to speak; whitespace runs are collapsed.
     * @param {string} [args.voice] Per-call voice override. A blank or
     *                              whitespace-only value falls back to the
     *                              operator default voice id.
     * @returns {Promise<{audio: Buffer, contentType: string,
     *   durationSeconds: null, sentenceCount: null, attempts: number,
     *   voice: string, model: string}>}
     * @throws {Error} with `status` 503 (no API key / no voice), 400 (empty
     *   text), 502 (request failed before a response arrived), or the
     *   upstream HTTP status when ElevenLabs answers with a non-2xx.
     */
    async synthesize({ text, voice }) {
      if (!apiKey) {
        throw fail("ElevenLabs TTS is not configured — set relay_elevenlabs_api_key", 503);
      }

      // Trim each candidate BEFORE choosing, so a whitespace-only override
      // cannot shadow a configured operator voice.
      const chosenVoice = asTrimmed(voice) || asTrimmed(voiceId);
      if (!chosenVoice) {
        throw fail("ElevenLabs TTS has no voice — set relay_elevenlabs_voice_id", 503);
      }

      const cleaned = asTrimmed(text).replace(/\s+/g, " ");
      if (!cleaned) {
        throw fail("TTS input text is empty", 400);
      }

      const url = `${API_BASE}/text-to-speech/${encodeURIComponent(chosenVoice)}`;
      let res;
      try {
        // Public-internet call — use the global fetch with full cert
        // validation (NOT lanFetch, which is scoped to LAN/Spark Control).
        res = await fetch(url, {
          method: "POST",
          headers: {
            "xi-api-key": apiKey,
            "Content-Type": "application/json",
            Accept: "audio/mpeg",
          },
          body: JSON.stringify({ text: cleaned, model_id: modelId }),
          signal: AbortSignal.timeout(timeoutMs),
        });
      } catch (err) {
        // Keep the original error reachable via `cause` for debugging.
        throw fail(`ElevenLabs TTS network error: ${err?.message || err}`, 502, err);
      }

      if (!res.ok) {
        let body = "";
        try {
          body = await res.text();
        } catch {
          // Best effort: the error body only enriches the message.
        }
        throw fail(`ElevenLabs TTS ${res.status}: ${body.slice(0, 300)}`, res.status);
      }

      const audio = Buffer.from(await res.arrayBuffer());
      return {
        audio,
        contentType: "audio/mpeg",
        // Duration is not computed here; the MP3 is returned as received.
        durationSeconds: null,
        sentenceCount: null,
        attempts: 1,
        voice: chosenVoice,
        model: modelId,
      };
    },
  };
}
|
||||||
@@ -0,0 +1,140 @@
|
|||||||
|
// Kokoro TTS backend — synthesizes a topic summary into speech via Spark
|
||||||
|
// Control's OpenAI-compatible /v1/audio/speech endpoint.
|
||||||
|
//
|
||||||
|
// Kokoro-82M (Apache-2.0, hexgrad/Kokoro-82M) replaced Magpie in Spark
|
||||||
|
// Control v0.14.0. Magpie's NVIDIA Riva decoder had a structural
|
||||||
|
// truncation defect that capped end-to-end reliability at ~85% even with
|
||||||
|
// server-side retries + chunking; Kokoro renders cleanly at any length
|
||||||
|
// (100% in our testing, ~1s for a ~100-word summary, no truncation). So
|
||||||
|
// this backend is a single pass-through call — NONE of the Magpie-era
|
||||||
|
// fragmenting, pacing/recovery-gap, duration-check, duration-based retry, or WAV
|
||||||
|
// stitching is needed or present.
|
||||||
|
//
|
||||||
|
// Output: 24kHz mono 16-bit PCM. Kokoro can emit wav/mp3/opus/flac
|
||||||
|
// directly via response_format, so we request the caller's format (mp3
|
||||||
|
// by default — small + universally playable for the mobile/offline
|
||||||
|
// player) and never transcode client-side. durationSeconds is left null:
|
||||||
|
// Kokoro's WAV header carries a placeholder size field (bogus computed
|
||||||
|
// duration), and for mp3 we'd have to decode — the Recap side measures
|
||||||
|
// duration off the cached file / <audio> element instead.
|
||||||
|
|
||||||
|
import { lanFetch } from "../lan-fetch.js";
|
||||||
|
|
||||||
|
// Per-request timeout for a Kokoro synthesis call, in milliseconds (one minute).
const DEFAULT_TIMEOUT_MS = 60 * 1000;

// Voice used when neither the caller nor the operator config names one.
const DEFAULT_VOICE = "bm_george";

// Response format requested when none is specified.
const DEFAULT_FORMAT = "mp3";

// Extra attempts allowed after a 5xx / network failure. Spark Control's
// error-handling guidance: a 4xx is a real client error (never retried),
// a 5xx is retried once. Kokoro doesn't truncate, so there is no
// duration-based retry.
const RETRY_ON_5XX = 1;
|
||||||
|
|
||||||
|
// Maps a `response_format` value to the MIME type reported for the audio.
//
// The table is indexed with a format string that can originate from the
// request body, so it has a null prototype: names such as "constructor" or
// "toString" must read as "unknown format" (undefined), not resolve to
// inherited Object.prototype members. Frozen because it is shared,
// read-only data.
const FORMAT_CONTENT_TYPE = Object.freeze({
  __proto__: null,
  wav: "audio/wav",
  mp3: "audio/mpeg",
  opus: "audio/ogg",
  flac: "audio/flac",
});
|
||||||
|
|
||||||
|
/**
 * Pause for `ms` milliseconds.
 *
 * @param {number} ms Delay handed to setTimeout.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
function sleepMs(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||||
|
|
||||||
|
/**
 * Create the Kokoro text-to-speech backend, which POSTs to
 * `{sparkControlBaseURL}/v1/audio/speech` through `lanFetch`.
 *
 * @param {object} [options]
 * @param {string} [options.sparkControlBaseURL] Spark Control base URL. Trailing
 *   slashes and a trailing `/api/endpoints` suffix are stripped.
 * @param {string} [options.defaultVoice]  Operator default voice.
 * @param {string} [options.defaultFormat] Operator default response format.
 * @param {number} [options.timeoutMs]     Abort each request after this many ms.
 * @returns {{ hasTts: boolean, kind: "kokoro", synthesize: Function }}
 *   `hasTts` is true when a non-empty base URL remains after normalization.
 */
export function createKokoroBackend({
  // Spark Control base URL (no path) — derived by the caller from
  // relay_spark_control_url with the /api/endpoints suffix stripped.
  sparkControlBaseURL = "",
  defaultVoice = DEFAULT_VOICE,
  defaultFormat = DEFAULT_FORMAT,
  timeoutMs = DEFAULT_TIMEOUT_MS,
} = {}) {
  // Trimmed string, or "" for anything that is not a string.
  const asTrimmed = (value) => (typeof value === "string" ? value.trim() : "");

  // Strip ALL trailing slashes (before and after removing the discovery
  // suffix) so the request URL never contains "//v1/audio/speech".
  const sparkBase = asTrimmed(sparkControlBaseURL)
    .replace(/\/+$/, "")
    .replace(/\/api\/endpoints$/, "")
    .replace(/\/+$/, "");

  // One HTTP round trip. Resolves to the audio bytes; rejects with an Error
  // whose `status` is 502 (request failed before a response arrived) or the
  // upstream HTTP status.
  async function callKokoro({ text, voice, format }) {
    const url = `${sparkBase}/v1/audio/speech`;
    let res;
    try {
      res = await lanFetch(url, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        redirect: "follow",
        body: JSON.stringify({
          model: "kokoro",
          input: text,
          voice,
          response_format: format,
        }),
        signal: AbortSignal.timeout(timeoutMs),
      });
    } catch (err) {
      // Surface the nested cause in the message when present; the top-level
      // fetch error message alone is often too generic to diagnose.
      const cause = err?.cause?.message || err?.cause?.code || err?.cause || "";
      const detail = cause ? `${err.message} (cause: ${cause})` : err?.message || String(err);
      const e = new Error(`Kokoro TTS network error at ${url}: ${detail}`, { cause: err });
      e.status = 502;
      throw e;
    }
    if (!res.ok) {
      let body = "";
      try {
        body = await res.text();
      } catch {
        // Best effort: the error body only enriches the message.
      }
      const e = new Error(`Kokoro TTS ${res.status} at ${url}: ${body.slice(0, 300)}`);
      e.status = res.status;
      throw e;
    }
    return Buffer.from(await res.arrayBuffer());
  }

  return {
    hasTts: !!sparkBase,
    kind: "kokoro",

    /**
     * Synthesize `text`, retrying up to RETRY_ON_5XX times on non-4xx failures.
     *
     * @param {object} args
     * @param {string} args.text     Text to speak; whitespace runs are collapsed.
     * @param {string} [args.voice]  Voice override. Blank or whitespace-only
     *   values fall back to the operator default, then DEFAULT_VOICE.
     * @param {string} [args.format] Format override (trimmed, lower-cased).
     *   Blank values fall back to the operator default, then DEFAULT_FORMAT.
     * @returns {Promise<{audio: Buffer, contentType: string,
     *   durationSeconds: null, voice: string, model: "kokoro", format: string,
     *   attempts: number}>} `contentType` is "application/octet-stream" for a
     *   format not listed in FORMAT_CONTENT_TYPE.
     * @throws {Error} with `status` 503 (no base URL), 400 (empty text), or
     *   the status raised by the HTTP call.
     */
    async synthesize({ text, voice, format }) {
      if (!sparkBase) {
        const e = new Error(
          "Kokoro TTS is not configured — Spark Control discovery isn't reporting a ready kokoro endpoint"
        );
        e.status = 503;
        throw e;
      }

      const cleaned = asTrimmed(text).replace(/\s+/g, " ");
      if (!cleaned) {
        const e = new Error("TTS input text is empty");
        e.status = 400;
        throw e;
      }

      // Trim each candidate BEFORE choosing so a whitespace-only override
      // does not skip the operator default.
      const chosenVoice = asTrimmed(voice) || asTrimmed(defaultVoice) || DEFAULT_VOICE;
      const fmt = (asTrimmed(format) || asTrimmed(defaultFormat) || DEFAULT_FORMAT).toLowerCase();
      // Own-property check: `fmt` can come from the request body, and names
      // like "constructor" must not resolve to inherited members.
      const contentType = Object.hasOwn(FORMAT_CONTENT_TYPE, fmt)
        ? FORMAT_CONTENT_TYPE[fmt]
        : "application/octet-stream";

      // Retry only on transient failures; a 4xx (bad voice/format) is
      // deterministic and surfaces immediately.
      for (let attempt = 0; ; attempt += 1) {
        try {
          const audio = await callKokoro({ text: cleaned, voice: chosenVoice, format: fmt });
          return {
            audio,
            contentType,
            durationSeconds: null,
            voice: chosenVoice,
            model: "kokoro",
            format: fmt,
            attempts: attempt + 1,
          };
        } catch (err) {
          const status = err?.status || 0;
          const isClientError = status >= 400 && status < 500;
          if (isClientError || attempt >= RETRY_ON_5XX) throw err;
          console.warn(
            `[kokoro] TTS call failed (${status || "network"}) — retry ${attempt + 1}/${RETRY_ON_5XX}`
          );
          await sleepMs(500);
        }
      }
    },
  };
}
|
||||||
@@ -0,0 +1,289 @@
|
|||||||
|
// POST /relay/tts — synthesize a topic summary into speech for the
|
||||||
|
// Recap app's audio-first ("walking mode") player. Returns the raw
|
||||||
|
// audio bytes (MP3 by default from Kokoro or ElevenLabs) with credit
|
||||||
|
// metadata in response headers.
|
||||||
|
//
|
||||||
|
// Request:
|
||||||
|
// headers:
|
||||||
|
// X-Recap-Install-Id (required)
|
||||||
|
// X-Recap-Job-Id (optional but expected — credit dedup key;
|
||||||
|
// the Recap client sends ONE id per recap so
|
||||||
|
// synthesizing all N topics of a recap costs
|
||||||
|
// at most 1 credit, like transcribe+analyze)
|
||||||
|
// Authorization (optional Bearer LIC1-… for licensed tiers)
|
||||||
|
// body (application/json):
|
||||||
|
// { "text": "the topic summary to speak", "voice": "optional voice id" }
|
||||||
|
//
|
||||||
|
// Response (200):
|
||||||
|
// body: raw audio bytes
|
||||||
|
// headers:
|
||||||
|
// Content-Type audio/mpeg (default) | audio/wav | audio/ogg | audio/flac
|
||||||
|
// X-Recap-Tts-Backend kokoro | elevenlabs
|
||||||
|
// X-Recap-Tts-Voice voice id used
|
||||||
|
// X-Recap-Audio-Duration seconds (may be absent for ElevenLabs)
|
||||||
|
// X-Recap-Credits-Remaining number, or "unlimited"
|
||||||
|
// X-Recap-Tier core | pro | max
|
||||||
|
// X-Recap-Credit-Charged 0 | 1
|
||||||
|
//
|
||||||
|
// Errors return the standard JSON errorEnvelope (so the client can keep
|
||||||
|
// its credit pill accurate) with an appropriate status.
|
||||||
|
//
|
||||||
|
// Billing: 1 credit per unique job id, deduped exactly like transcribe.
|
||||||
|
// Gated to Max users on the Recap side; the relay still enforces a
|
||||||
|
// balance floor so a non-Max install can't drain TTS for free.
|
||||||
|
|
||||||
|
import express from "express";
|
||||||
|
import { resolveIdentity, identityTier } from "../identity.js";
|
||||||
|
import {
|
||||||
|
getOrCreateRow,
|
||||||
|
commitCredit,
|
||||||
|
computeRemaining,
|
||||||
|
licenseFingerprint,
|
||||||
|
} from "../credits.js";
|
||||||
|
import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
|
||||||
|
import { getConfigSnapshot, getTierQuotas } from "../config.js";
|
||||||
|
import { resolveHardwareConfig } from "../hardware-config.js";
|
||||||
|
import { createKokoroBackend } from "../backends/kokoro.js";
|
||||||
|
import { createElevenLabsBackend } from "../backends/elevenlabs.js";
|
||||||
|
import { errorEnvelope } from "./envelope.js";
|
||||||
|
import { recordCall } from "../audit-log.js";
|
||||||
|
|
||||||
|
// Pick which TTS backend serves this call given the operator preference
|
||||||
|
// and what's actually available. Returns "kokoro" | "elevenlabs" | null
|
||||||
|
// (null = nothing available → caller surfaces a 503).
|
||||||
|
/**
 * Choose the TTS backend for one call from the operator preference and the
 * backends currently usable.
 *
 * @param {object} args
 * @param {string} [args.preference] "hardware_only" | "cloud_only" |
 *   "cloud_first" | "hardware_first". A falsy or unrecognized value is
 *   treated as "hardware_first".
 * @param {boolean} args.kokoroReady      Whether Kokoro can be used.
 * @param {boolean} args.elevenConfigured Whether ElevenLabs can be used.
 * @returns {"kokoro" | "elevenlabs" | null} null when nothing is available
 *   (the caller surfaces a 503).
 */
function pickTtsBackend({ preference, kokoroReady, elevenConfigured }) {
  const hardware = kokoroReady ? "kokoro" : null;
  const cloud = elevenConfigured ? "elevenlabs" : null;

  switch (preference || "hardware_first") {
    case "hardware_only":
      return hardware;
    case "cloud_only":
      return cloud;
    case "cloud_first":
      return cloud ?? hardware;
    default:
      // hardware_first, and anything unrecognized
      return hardware ?? cloud;
  }
}
|
||||||
|
|
||||||
|
/**
 * Build the Express router exposing `POST /tts`.
 *
 * Handler flow, in order:
 *   1. resolve the caller identity (failure → error envelope, default 401);
 *   2. validate the JSON body (`text` required; `voice` / `format` optional);
 *   3. refuse with 402 when the install has a finite, exhausted balance;
 *   4. pick a backend via pickTtsBackend (none available → 503);
 *   5. synthesize (backend failure → error envelope with the backend's
 *      `status`, default 502, and no charge);
 *   6. charge one credit unless lookupJob reports this job id as already
 *      seen, write the audit record, and send the audio with metadata headers.
 *
 * @returns {import("express").Router}
 */
export function ttsRouter() {
  const router = express.Router();

  router.post("/tts", express.json({ limit: "256kb" }), async (req, res) => {
    const t0 = Date.now();
    const jobId = req.header("X-Recap-Job-Id") || null;

    // ---- Identity ------------------------------------------------------
    let identity;
    try {
      identity = await resolveIdentity(req);
    } catch (err) {
      const e = await errorEnvelope({
        error: err?.message || "auth_error",
        statusHint: err?.status || 401,
      });
      return res.status(e.statusHint || 401).json(e.body);
    }
    if (identity.kind === "license" && !identity.installId) {
      const e = await errorEnvelope({
        error: "missing X-Recap-Install-Id header",
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }
    const { creditKey, installId, license } = identity;

    // ---- Request body --------------------------------------------------
    const text = typeof req.body?.text === "string" ? req.body.text.trim() : "";
    if (!text) {
      const e = await errorEnvelope({
        error: "missing 'text' in request body",
        creditKey,
        installId,
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }
    const clientVoice =
      typeof req.body?.voice === "string" ? req.body.voice.trim() : "";
    // Optional output format override (wav|mp3|opus|flac). Only the Kokoro
    // branch below receives it; the ElevenLabs branch is not passed a format.
    const clientFormat =
      typeof req.body?.format === "string" ? req.body.format.trim().toLowerCase() : "";

    // ---- Credit row + audit context -------------------------------------
    const row = await getOrCreateRow({ creditKey, installId, license });
    const tier = identityTier(identity, row);
    row.tier_snapshot = tier;
    const licenseFp = identity.kind === "cloud" ? null : licenseFingerprint(license);
    // Identities without an install id are audited under their user id.
    const auditInstall = installId || identity.userId || null;

    // Single place that builds audit records, so every outcome (refused /
    // error / success) is logged with the same identity fields. `overrides`
    // supplies the per-outcome values.
    const audit = (overrides) =>
      recordCall({
        install_id: auditInstall,
        license_fingerprint: licenseFp,
        tier,
        pipeline: "tts",
        backend: null,
        model: null,
        credit_charged: 0,
        duration_ms: Date.now() - t0,
        audio_seconds: 0,
        cost_usd: 0,
        job_id: jobId,
        ...overrides,
      });

    const cfg = await getConfigSnapshot();
    const quota = await getTierQuotas();

    // ---- Balance floor ---------------------------------------------------
    // Refuse only when the install has a finite balance that's exhausted
    // (null total = unlimited). This stops a credit-less install from
    // synthesizing for free if it bypasses the client-side gate.
    const preBalance = computeRemaining(row, quota);
    if (preBalance.total !== null && preBalance.total <= 0) {
      await audit({ status: "refused", error: "no_credits" });
      const e = await errorEnvelope({
        error: "no_credits",
        creditKey,
        installId,
        license,
        tier,
        statusHint: 402,
      });
      return res.status(402).json(e.body);
    }

    // ---- Backend selection -------------------------------------------------
    const hw = await resolveHardwareConfig(cfg);
    const kokoroReady = !!hw.tts?.url;
    const elevenConfigured = !!(
      cfg.relay_elevenlabs_api_key && cfg.relay_elevenlabs_voice_id
    );
    const preference = cfg.relay_tts_backend_preference || "hardware_first";
    const chosen = pickTtsBackend({ preference, kokoroReady, elevenConfigured });
    if (!chosen) {
      const reason = hw.tts?.blocked_reason
        ? hw.tts.blocked_reason
        : "No TTS backend available — configure Spark Control (Kokoro) or relay_elevenlabs_api_key.";
      await audit({ status: "error", error: reason });
      const e = await errorEnvelope({
        error: reason,
        creditKey,
        installId,
        license,
        tier,
        statusHint: 503,
      });
      return res.status(503).json(e.body);
    }

    // Decouple billing from routing: look up the job to decide whether to
    // charge, but always synthesize.
    const reusedJob = !!lookupJob({ creditKey, installId, license, jobId });

    // ---- Synthesis ---------------------------------------------------------
    let result;
    try {
      if (chosen === "kokoro") {
        const backend = createKokoroBackend({
          sparkControlBaseURL: hw.sparkBase || hw.tts.url || "",
          defaultVoice: cfg.relay_tts_default_voice,
          defaultFormat: cfg.relay_tts_format,
        });
        result = await backend.synthesize({ text, voice: clientVoice, format: clientFormat });
      } else {
        const backend = createElevenLabsBackend({
          apiKey: cfg.relay_elevenlabs_api_key,
          voiceId: cfg.relay_elevenlabs_voice_id,
          model: cfg.relay_elevenlabs_model,
        });
        result = await backend.synthesize({ text, voice: clientVoice });
      }
    } catch (err) {
      console.error(`[relay/tts] ${chosen} backend error: ${err?.message}`);
      await audit({
        backend: chosen,
        model: chosen === "kokoro" ? "kokoro" : cfg.relay_elevenlabs_model,
        status: "error",
        error: (err?.message || String(err)).slice(0, 200),
      });
      const fallbackStatus = err?.status || 502;
      const e = await errorEnvelope({
        error: err?.message || "tts_backend_error",
        creditKey,
        installId,
        license,
        tier,
        statusHint: fallbackStatus,
      });
      // Never call res.status() with an undefined code if the envelope
      // omits the hint.
      return res.status(e.statusHint || fallbackStatus).json(e.body);
    }

    // ---- Billing -----------------------------------------------------------
    // Charge once per job id. cost_usd is recorded as 0 for both backends:
    // no per-call cost is computed here.
    let creditCharged = 0;
    if (!reusedJob) {
      await commitCredit({ creditKey, installId, license, backend: chosen, tier });
      await markJobCharged({ creditKey, installId, license, jobId, backend: chosen, tier });
      creditCharged = 1;
    }

    await audit({
      backend: chosen,
      model: result?.model || null,
      status: "success",
      credit_charged: creditCharged,
      audio_seconds: result?.durationSeconds || 0,
      attempts: result?.attempts || null,
    });

    // ---- Response ------------------------------------------------------------
    // Post-charge balance for the client's credit pill.
    const balance = computeRemaining(row, quota);

    res.set("Content-Type", result.contentType || "audio/wav");
    res.set("X-Recap-Tts-Backend", chosen);
    if (result.voice) res.set("X-Recap-Tts-Voice", result.voice);
    if (result.durationSeconds != null) {
      res.set("X-Recap-Audio-Duration", String(result.durationSeconds));
    }
    res.set(
      "X-Recap-Credits-Remaining",
      balance.total == null ? "unlimited" : String(balance.total)
    );
    res.set("X-Recap-Tier", tier);
    res.set("X-Recap-Credit-Charged", String(creditCharged));
    res.set("Cache-Control", "no-store");
    return res.send(result.audio);
  });

  return router;
}
|
||||||
Reference in New Issue
Block a user