// Audio-first ("walking mode") TTS routes. Turns a saved recap's per-topic
// summaries into spoken MP3 clips (via the relay's /relay/tts → Kokoro),
// caches them next to the session JSON, and serves them to the player.
//
// Endpoints (all scope-isolated to the requesting user's library):
//   GET  /api/tts/availability         — { has_tts, tts_backend, allowed,
//                                          default_voice }
//   POST /api/tts/generate/:id/:index  — synthesize + cache ONE topic clip
//                                        (idempotent, retried, deduped by job
//                                        id). The player calls this on demand
//                                        as it reaches each topic + prefetches
//                                        the next, so clips are generated when
//                                        needed and retried until they succeed
//                                        rather than skipped.
//   GET  /api/tts/status/:id           — { total, ready:[idx...], done }
//   GET  /api/tts/audio/:id/:index     — serve a cached topic clip (mp3)
//
// Access policy (the "Max gate"):
//   - single mode: the operator owns the box AND the TTS hardware, so no
//     tier gate — TTS is available whenever the relay advertises has_tts.
//   - multi mode admin: the operator; allowed.
//   - multi-tenant cloud users: any paid subscription (Pro or Max). The
//     operator can tighten this to Max-only here if shared TTS hardware
//     throughput becomes a constraint.
//
// Billing: all of a recap's topics share ONE relay job id (`tts:<id>`), so
// the relay charges at most 1 credit to voice an entire recap.

import fs from "fs/promises";
import path from "path";
import {
  scopeForRequest,
  sessionAudioDir,
  loadSession,
  patchSession,
} from "./history.js";
import { getProvider, resolveProviderOpts } from "./providers/index.js";
import { getRelayCapabilities } from "./relay-capabilities.js";

// Audio format requested from the provider, and the extension used for the
// cached clip files.
const CLIP_FORMAT = "mp3";
const CLIP_EXT = "mp3";

/**
 * Whether THIS request's user may generate TTS. See the access-policy note in
 * the file header.
 *
 * @param {object} [req] Request object; reads `recapMode`, `user.is_admin`
 *   and `user.tier`.
 * @returns {boolean} true when the user is allowed to generate audio.
 */
export function userHasTtsAccess(req) {
  // Single mode (or no request context): operator owns the hardware.
  if (!req || req.recapMode !== "multi") return true;

  // Multi-mode admin = the operator.
  if (req.user?.is_admin) return true;

  // Multi-tenant cloud user: Pro or Max. Core-decoupling — the tier is the
  // relay-owned subscription tier, cached on the Recaps account
  // (req.user.tier), kept in sync by the operator grant flow.
  const tier = req.user?.tier;
  return tier === "pro" || tier === "max";
}

/**
 * The text we speak for a topic: its title as a lead-in, then the summary,
 * so an eyes-free listener hears what the topic is before its recap.
 *
 * @param {object} [chunk] Topic record; reads `title` and `summary`.
 * @returns {string} `"<title>. <summary>"` when both are present, otherwise
 *   whichever one is non-empty, or `""` when neither is.
 */
export function chunkSpeechText(chunk) {
  // Session JSON comes from disk, so tolerate malformed fields: only strings
  // (trimmed) and finite numbers are spoken; anything else counts as empty
  // instead of throwing on `.trim()`.
  const toPart = (value) => {
    if (typeof value === "string") return value.trim();
    if (typeof value === "number" && Number.isFinite(value)) return String(value);
    return "";
  };

  const title = toPart(chunk?.title);
  const summary = toPart(chunk?.summary);
  if (title && summary) return `${title}. ${summary}`;
  return summary || title || "";
}

// File name of the cached clip for topic `index` inside the session's audio
// directory.
function clipFileName(index) {
  return `topic-${index}.${CLIP_EXT}`;
}

// Server-side retries per clip on a transient (5xx/network) relay failure,
// on top of any retry the relay itself does.
const GEN_RETRIES = 2;

// Generate + cache ONE topic clip. Idempotent: returns {cached:true} if the
// file already exists. Retries transient failures; a 4xx (e.g. bad voice) or
// empty summary is permanent (no retry). Returns
// { ok, cached?, empty?, error?, voice? }.
async function generateClip({ scope, id, index, chunk, provider, jobId, voice }) { const dir = sessionAudioDir(scope, id); const file = path.join(dir, clipFileName(index)); try { await fs.access(file); return { ok: true, cached: true }; } catch {} const text = chunkSpeechText(chunk); if (!text) return { ok: false, empty: true, error: "empty_summary" }; await fs.mkdir(dir, { recursive: true }).catch(() => {}); let lastErr = null; for (let attempt = 1; attempt <= GEN_RETRIES + 1; attempt++) { try { const r = await provider.tts({ text, voice, format: CLIP_FORMAT, jobId }); await fs.writeFile(file, r.audio); return { ok: true, voice: r.voice, backend: r.backend }; } catch (err) { lastErr = err; const status = err?.status || 0; console.warn( `[tts] clip ${index} attempt ${attempt}/${GEN_RETRIES + 1} failed (${status || "net"}): ${err?.message || err}`, ); if (status >= 400 && status < 500) break; // client error → permanent if (attempt <= GEN_RETRIES) await new Promise((r2) => setTimeout(r2, 600)); } } return { ok: false, error: (lastErr?.message || "tts_failed").slice(0, 200) }; } function resolveScope(req, res) { try { return scopeForRequest(req); } catch { res.status(401).json({ error: "no_scope" }); return null; } } export function setupTtsRoutes(app) { // Lightweight probe for the frontend: should it show the "Listen" // affordance, and what's the default voice? app.get("/api/tts/availability", (req, res) => { const caps = getRelayCapabilities(); res.json({ has_tts: !!caps.has_tts, tts_backend: caps.tts_backend || null, default_voice: caps.tts_default_voice || null, allowed: userHasTtsAccess(req) && !!caps.has_tts, }); }); // Generate (or return cached) the audio for ONE topic. The player calls // this on demand as it reaches each topic — and prefetches the next — so a // clip is generated when needed and RETRIED until it succeeds, rather than // skipped. Idempotent + deduped by the shared job id (≤1 credit/recap). 
// // Responses: // 200 { ok:true, index, cached } — clip is ready to play // 200 { ok:false, empty:true } — topic has no summary text (permanent; // client should not retry) // 502 { ok:false, error } — transient failure; client retries app.post("/api/tts/generate/:id/:index", async (req, res) => { const scope = resolveScope(req, res); if (!scope) return; if (!userHasTtsAccess(req)) { return res.status(403).json({ error: "tts_requires_subscription", message: "Audio recaps are available to Pro and Max subscribers.", }); } const caps = getRelayCapabilities(); if (!caps.has_tts) { return res.status(503).json({ error: "tts_unavailable", message: "Text-to-speech isn't available on this relay right now.", }); } const id = req.params.id; const index = parseInt(req.params.index, 10); const session = await loadSession(scope, id); if (!session) return res.status(404).json({ error: "session_not_found" }); const chunks = Array.isArray(session.chunks) ? session.chunks : []; if (!Number.isInteger(index) || index < 0 || index >= chunks.length) { return res.status(400).json({ error: "bad_index" }); } let provider; try { provider = getProvider("relay", resolveProviderOpts("relay", { req })); } catch (err) { return res.status(503).json({ error: "relay_unavailable", message: err?.message || "Relay is not configured.", }); } const voice = typeof req.query.voice === "string" && req.query.voice.trim() ? 
req.query.voice.trim() : undefined; const result = await generateClip({ scope, id, index, chunk: chunks[index], provider, jobId: `tts:${id}`, // one credit for the whole recap voice, }); if (result.ok) { patchSession(scope, id, { summaryAudio: { ready: true, total: chunks.length, voice: result.voice || caps.tts_default_voice || null, format: CLIP_FORMAT, updatedAt: new Date().toISOString(), }, }).catch(() => {}); return res.json({ ok: true, index, cached: !!result.cached, voice: result.voice || null }); } if (result.empty) { return res.json({ ok: false, index, empty: true, error: "empty_summary" }); } return res.status(502).json({ ok: false, index, error: result.error || "tts_failed" }); }); // Which topics are already synthesized for a recap. app.get("/api/tts/status/:id", async (req, res) => { const scope = resolveScope(req, res); if (!scope) return; const session = await loadSession(scope, req.params.id); if (!session) return res.status(404).json({ error: "session_not_found" }); const total = Array.isArray(session.chunks) ? session.chunks.length : 0; const dir = sessionAudioDir(scope, req.params.id); let files = []; try { files = await fs.readdir(dir); } catch {} const ready = files .map((f) => { const m = new RegExp(`^topic-(\\d+)\\.${CLIP_EXT}$`).exec(f); return m ? Number(m[1]) : null; }) .filter((n) => n !== null) .sort((a, b) => a - b); const caps = getRelayCapabilities(); res.json({ total, ready, done: total > 0 && ready.length >= total, allowed: userHasTtsAccess(req) && !!caps.has_tts, voice: session.summaryAudio?.voice || caps.tts_default_voice || null, }); }); // Serve one cached topic clip. sendFile handles Range requests (so the //