// POST /relay/analyze — forwards an analysis prompt to the chosen
// backend and returns the standard envelope.
//
// Request body (application/json):
//   { prompt: string }
//
// Headers: same as /relay/transcribe (X-Recap-Install-Id required,
// X-Recap-Job-Id optional, Authorization optional Bearer license).
//
// Same charge-once-per-job semantics: a Recap summarize job pairs
// transcribe + analyze with the same X-Recap-Job-Id. The first call
// (whichever endpoint) charges 1 credit; the second is free.
//
// Every outcome (success / quota-refused / backend-error) writes one
// row to the audit log so the admin dashboard can compute cost,
// margin, and speed metrics.

import express from "express";
import { resolveIdentity, identityTier } from "../identity.js";
import {
  getOrCreateRow,
  planBackend,
  commitCredit,
  licenseFingerprint,
} from "../credits.js";
import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
import { getConfigSnapshot, getTierQuotas } from "../config.js";
import { createGeminiBackend } from "../backends/gemini.js";
import { createHardwareBackend } from "../backends/hardware.js";
import { envelope, errorEnvelope } from "./envelope.js";
import { recordCall } from "../audit-log.js";
import { resolveHardwareConfig } from "../hardware-config.js";
import { reportHealthEvent } from "../spark-control-events.js";
import { calcGeminiCost } from "../pricing.js";

/**
 * Instantiate the backend client for the routing decision made by planBackend.
 *
 * @param {string} chosenBackend - `plan.backend`; "gemini" selects the Gemini
 *   client, any other value selects the hardware client.
 * @param {object} cfg - Config snapshot (source of the relay_gemini_* settings).
 * @param {object} hw - Resolved hardware config (transcribe/analyze url + model,
 *   sparkBase).
 * @returns {object} A backend exposing `analyzeText({ prompt })`.
 */
function createAnalysisBackend(chosenBackend, cfg, hw) {
  if (chosenBackend === "gemini") {
    return createGeminiBackend({
      apiKey: cfg.relay_gemini_api_key,
      transcriptionModel: cfg.relay_gemini_transcription_model,
      analysisModel: cfg.relay_gemini_analysis_model,
    });
  }
  return createHardwareBackend({
    parakeetBaseURL: hw.transcribe.url || "",
    gemmaBaseURL: hw.analyze.url || "",
    sparkControlBaseURL: hw.sparkBase || "",
    parakeetModel: hw.transcribe.model || "",
    gemmaModel: hw.analyze.model || "",
  });
}

/**
 * Serve one POST /analyze request: authenticate, validate the prompt, pick a
 * backend, run the analysis, settle the credit, write one audit row, respond.
 *
 * Responses sent from here:
 *   - auth failure          → err.status (default 401)
 *   - missing install id    → 400
 *   - missing/bad prompt    → 400
 *   - planBackend refusal   → 402 (audited as "refused")
 *   - backend failure       → err.status (default 502) (audited as "error")
 *   - success               → 200 with the standard envelope
 *
 * Anything else that throws (storage, config, audit log, ...) propagates to
 * the caller, which hands it to Express.
 *
 * @param {import("express").Request} req
 * @param {import("express").Response} res
 * @returns {Promise<unknown>}
 */
async function handleAnalyze(req, res) {
  const t0 = Date.now();
  const jobId = req.header("X-Recap-Job-Id") || null;

  let identity;
  try {
    identity = await resolveIdentity(req);
  } catch (err) {
    const e = await errorEnvelope({
      error: err?.message || "auth_error",
      statusHint: err?.status || 401,
    });
    return res.status(e.statusHint || 401).json(e.body);
  }

  if (identity.kind === "license" && !identity.installId) {
    const e = await errorEnvelope({
      error: "missing X-Recap-Install-Id header",
      statusHint: 400,
    });
    return res.status(400).json(e.body);
  }

  const { creditKey, installId, license } = identity;

  const prompt = req.body?.prompt;
  if (!prompt || typeof prompt !== "string") {
    const e = await errorEnvelope({
      error: "missing or non-string body.prompt",
      creditKey,
      installId,
      statusHint: 400,
    });
    return res.status(400).json(e.body);
  }

  const row = await getOrCreateRow({ creditKey, installId, license });
  const tier = identityTier(identity, row);
  row.tier_snapshot = tier;
  const licenseFp =
    identity.kind === "cloud" ? null : licenseFingerprint(license);
  const auditInstall = installId || identity.userId || null;

  // Fields shared by every audit row written for this request. Spread first
  // in each recordCall so the emitted key order matches the long-hand form.
  const auditBase = {
    install_id: auditInstall,
    license_fingerprint: licenseFp,
    tier,
    pipeline: "analyze",
  };

  // Two separate decisions on every call:
  //   1. Billing: did we already charge a credit for this job? (look
  //      up by job_id; reused → don't charge again.)
  //   2. Routing: which backend serves THIS pipeline step's request?
  //      (always per-pipeline preference + planBackend, even when
  //      the job has a prior transcribe call that routed elsewhere.)
  //
  // The old code conflated the two — it copied `backend` from the
  // existing job, which meant analyze would silently inherit
  // transcribe's backend choice even when the operator's analyze
  // preference said something different. Fixed: routing is decided
  // fresh per pipeline step, regardless of job history.
  const reusedJob = !!lookupJob({ creditKey, installId, license, jobId });

  const cfg = await getConfigSnapshot();
  const hw = await resolveHardwareConfig(cfg);

  // Operator-only diagnostic — see summarize-url.js for the full
  // reasoning. We don't 503 here on blocked_reason because doing
  // so pre-empts planBackend and would surface operator-internal
  // Spark Control / vLLM wording to clients even when Gemini was
  // the configured preference. planBackend correctly routes around
  // an unavailable hardware path via hasHardware = false.
  if (hw.analyze.blocked_reason) {
    console.warn(
      `[analyze] hardware analyze currently blocked (planBackend will route to Gemini if available): ${hw.analyze.blocked_reason}`,
    );
  }

  const hasHardware = !!hw.analyze.url;
  const quota = await getTierQuotas();
  const preference = cfg.relay_analyze_backend_preference || "gemini_first";

  // NOTE(review): planBackend is consulted even when reusedJob is true (i.e.
  // this call will not be charged). Confirm it cannot refuse the free second
  // half of a job whose first half used up the remaining quota.
  const plan = planBackend(row, quota, { hasHardware, preference });

  if (!plan.allowed) {
    await recordCall({
      ...auditBase,
      backend: null,
      model: null,
      status: "refused",
      credit_charged: 0,
      duration_ms: Date.now() - t0,
      cost_usd: 0,
      job_id: jobId,
      error: plan.reason,
    });
    const e = await errorEnvelope({
      error: plan.reason,
      creditKey,
      installId,
      tier,
      statusHint: 402,
    });
    return res.status(402).json(e.body);
  }

  const chosenBackend = plan.backend;

  let result;
  try {
    // Construction stays inside the try so a failure while building the
    // client is reported the same way as a failure of the call itself.
    const backend = createAnalysisBackend(chosenBackend, cfg, hw);
    result = await backend.analyzeText({ prompt });
  } catch (err) {
    // Only a job that already has a charge on record is refunded; a fresh
    // job has not been charged yet at this point.
    // NOTE(review): presumably refundJob reverses the earlier charge for the
    // whole job — confirm against job-credits.js.
    if (reusedJob) await refundJob({ creditKey, installId, license, jobId });

    console.error(`[relay/analyze] backend error: ${err?.message}`);

    const detail = err?.message || String(err);

    // Passive health-event report to Spark Control so the
    // operator's dashboard surfaces the failure immediately
    // (without waiting for its own polling cycle to catch it).
    // Only fired for hardware-side calls — Gemini failures are a
    // separate observability surface (Google's API health).
    // Not awaited: the report must not delay the client response.
    if (chosenBackend === "hardware") {
      reportHealthEvent({
        service: "vllm",
        ok: false,
        error: detail.slice(0, 280),
        ms: Date.now() - t0,
      });
    }

    await recordCall({
      ...auditBase,
      backend: chosenBackend,
      model:
        chosenBackend === "gemini"
          ? cfg.relay_gemini_analysis_model
          : hw.analyze.model || "(auto)",
      status: "error",
      credit_charged: 0,
      duration_ms: Date.now() - t0,
      cost_usd: 0,
      job_id: jobId,
      error: detail.slice(0, 200),
    });

    const fallbackStatus = err?.status || 502;
    const e = await errorEnvelope({
      error: err?.message || "backend_error",
      creditKey,
      installId,
      tier,
      statusHint: fallbackStatus,
    });
    // Same defensive fallback as the auth path: never hand res.status() an
    // undefined code if the envelope omits statusHint.
    return res.status(e.statusHint || fallbackStatus).json(e.body);
  }

  // Charge only after the backend succeeded, and only when no earlier call
  // of this job has been recorded.
  let creditCharged = 0;
  if (!reusedJob) {
    await commitCredit({
      creditKey,
      installId,
      license,
      backend: chosenBackend,
      tier,
    });
    await markJobCharged({
      creditKey,
      installId,
      license,
      jobId,
      backend: chosenBackend,
      tier,
    });
    creditCharged = 1;
  }

  // `result` is read through ?. here for the same reason `model` and
  // `attempts` are below: a backend resolving with nothing must not throw
  // after the work has already been done and charged.
  const costDetails =
    chosenBackend === "gemini" && result?.usage
      ? calcGeminiCost(result.model, result.usage)
      : {
          input_tokens: 0,
          output_tokens: 0,
          thinking_tokens: 0,
          cost_usd: 0,
        };

  await recordCall({
    ...auditBase,
    backend: chosenBackend,
    model: result?.model || null,
    status: "success",
    credit_charged: creditCharged,
    duration_ms: Date.now() - t0,
    job_id: jobId,
    // Surface the cascade so the dashboard can show "served by
    // 2.5-flash after 3-flash 503'd" — Gemini backend returns this;
    // hardware backend doesn't (no per-model fallback there).
    attempts: result?.attempts || null,
    ...costDetails,
  });

  const body = await envelope({
    result,
    creditKey,
    installId,
    license,
    tier,
    creditCharged,
  });
  return res.json(body);
}

/**
 * Build the router exposing POST /analyze (JSON body, 10mb limit).
 *
 * The handler is async; any rejection that is not converted into an error
 * envelope inside handleAnalyze is forwarded to Express via `next(err)`
 * instead of being left as an unhandled promise rejection.
 *
 * @returns {import("express").Router}
 */
export function analyzeRouter() {
  const router = express.Router();

  router.post(
    "/analyze",
    express.json({ limit: "10mb" }),
    async (req, res, next) => {
      try {
        await handleAnalyze(req, res);
      } catch (err) {
        next(err);
      }
    },
  );

  return router;
}