// File: recap-relay/server/routes/analyze.js

// POST /relay/analyze — forwards an analysis prompt to the chosen
// backend and returns the standard envelope.
//
// Request body (application/json):
// { prompt: string }
//
// Headers: same as /relay/transcribe (X-Recap-Install-Id required,
// X-Recap-Job-Id optional, Authorization optional Bearer license).
//
// Same charge-once-per-job semantics: a Recap summarize job pairs
// transcribe + analyze with the same X-Recap-Job-Id. The first call
// (whichever endpoint) charges 1 credit; the second is free.
//
// Every outcome (success / quota-refused / backend-error) writes one
// row to the audit log so the admin dashboard can compute cost,
// margin, and speed metrics.
import express from "express";
import { resolveIdentity, identityTier } from "../identity.js";
import {
getOrCreateRow,
planBackend,
commitCredit,
licenseFingerprint,
} from "../credits.js";
import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
import { getConfigSnapshot, getTierQuotas } from "../config.js";
import { createGeminiBackend } from "../backends/gemini.js";
import { createHardwareBackend } from "../backends/hardware.js";
import { envelope, errorEnvelope } from "./envelope.js";
import { recordCall } from "../audit-log.js";
import { resolveHardwareConfig } from "../hardware-config.js";
import { reportHealthEvent } from "../spark-control-events.js";
import { calcGeminiCost } from "../pricing.js";
/**
 * Builds the Express router that serves `POST /analyze`.
 *
 * The handler resolves the caller's identity, validates `body.prompt`,
 * picks a backend via `planBackend`, forwards the prompt to that backend's
 * `analyzeText`, and responds with the standard envelope. Every outcome
 * that reaches the planning step (refused / error / success) writes one
 * row through `recordCall`.
 *
 * Status codes produced by this handler:
 *   - identity failure: `err.status` (via the error envelope) or 401
 *   - missing install id for a license identity, or bad prompt: 400
 *   - plan not allowed: 402
 *   - backend failure: the error envelope's `statusHint`, or 502
 *   - success: Express default (200)
 *
 * @returns {import("express").Router} router with the `/analyze` route mounted.
 */
export function analyzeRouter() {
  const router = express.Router();

  router.post("/analyze", express.json({ limit: "10mb" }), async (req, res) => {
    const t0 = Date.now();
    const jobId = req.header("X-Recap-Job-Id") || null;

    let identity;
    try {
      identity = await resolveIdentity(req);
    } catch (err) {
      const e = await errorEnvelope({
        error: err?.message || "auth_error",
        statusHint: err?.status || 401,
      });
      return res.status(e.statusHint || 401).json(e.body);
    }

    if (identity.kind === "license" && !identity.installId) {
      const e = await errorEnvelope({
        error: "missing X-Recap-Install-Id header",
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }

    const { creditKey, installId, license } = identity;

    const prompt = req.body?.prompt;
    if (!prompt || typeof prompt !== "string") {
      const e = await errorEnvelope({
        error: "missing or non-string body.prompt",
        creditKey,
        installId,
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }

    const row = await getOrCreateRow({ creditKey, installId, license });
    const tier = identityTier(identity, row);
    row.tier_snapshot = tier;
    // Cloud identities carry no license fingerprint in the audit row.
    const licenseFp =
      identity.kind === "cloud" ? null : licenseFingerprint(license);
    const auditInstall = installId || identity.userId || null;

    // Two separate decisions on every call:
    //   1. Billing: did we already charge a credit for this job? (look
    //      up by job_id; reused → don't charge again.)
    //   2. Routing: which backend serves THIS pipeline step's request?
    //      (always per-pipeline preference + planBackend, even when
    //      the job has a prior transcribe call that routed elsewhere.)
    //
    // Routing is deliberately decided fresh per pipeline step and never
    // copied from the existing job, so analyze does not silently inherit
    // transcribe's backend choice when the operator's analyze preference
    // says something different.
    const reusedJob = !!lookupJob({ creditKey, installId, license, jobId });

    const cfg = await getConfigSnapshot();
    const hw = await resolveHardwareConfig(cfg);

    // Operator-only diagnostic — see summarize-url.js for the full
    // reasoning. We don't 503 here on blocked_reason because doing
    // so pre-empts planBackend and would surface operator-internal
    // Spark Control / vLLM wording to clients even when Gemini was
    // the configured preference. planBackend correctly routes around
    // an unavailable hardware path via hasHardware = false.
    if (hw.analyze.blocked_reason) {
      console.warn(
        `[analyze] hardware analyze currently blocked (planBackend will route to Gemini if available): ${hw.analyze.blocked_reason}`,
      );
    }

    const hasHardware = !!hw.analyze.url;
    const quota = await getTierQuotas();
    const preference =
      cfg.relay_analyze_backend_preference || "gemini_first";
    const plan = planBackend(row, quota, { hasHardware, preference });

    if (!plan.allowed) {
      await recordCall({
        install_id: auditInstall,
        license_fingerprint: licenseFp,
        tier,
        pipeline: "analyze",
        backend: null,
        model: null,
        status: "refused",
        credit_charged: 0,
        duration_ms: Date.now() - t0,
        cost_usd: 0,
        job_id: jobId,
        error: plan.reason,
      });
      const e = await errorEnvelope({
        error: plan.reason,
        creditKey,
        installId,
        tier,
        statusHint: 402,
      });
      return res.status(402).json(e.body);
    }

    const chosenBackend = plan.backend;

    let result;
    try {
      // Anything other than "gemini" is served by the hardware backend.
      const backend =
        chosenBackend === "gemini"
          ? createGeminiBackend({
              apiKey: cfg.relay_gemini_api_key,
              transcriptionModel: cfg.relay_gemini_transcription_model,
              analysisModel: cfg.relay_gemini_analysis_model,
            })
          : createHardwareBackend({
              parakeetBaseURL: hw.transcribe.url || "",
              gemmaBaseURL: hw.analyze.url || "",
              sparkControlBaseURL: hw.sparkBase || "",
              parakeetModel: hw.transcribe.model || "",
              gemmaModel: hw.analyze.model || "",
            });
      result = await backend.analyzeText({ prompt });
    } catch (err) {
      // Only a job that was already charged by an earlier call has
      // something to refund; a fresh job is not charged until success.
      if (reusedJob) await refundJob({ creditKey, installId, license, jobId });
      console.error(`[relay/analyze] backend error: ${err?.message}`);

      // Passive health-event report to Spark Control so the
      // operator's dashboard surfaces the failure immediately
      // (without waiting for its own polling cycle to catch it).
      // Only fired for hardware-side calls — Gemini failures are a
      // separate observability surface (Google's API health).
      if (chosenBackend === "hardware") {
        reportHealthEvent({
          service: "vllm",
          ok: false,
          error: (err?.message || String(err)).slice(0, 280),
          ms: Date.now() - t0,
        });
      }

      await recordCall({
        install_id: auditInstall,
        license_fingerprint: licenseFp,
        tier,
        pipeline: "analyze",
        backend: chosenBackend,
        model:
          chosenBackend === "gemini"
            ? cfg.relay_gemini_analysis_model
            : hw.analyze.model || "(auto)",
        status: "error",
        credit_charged: 0,
        duration_ms: Date.now() - t0,
        cost_usd: 0,
        job_id: jobId,
        error: (err?.message || String(err)).slice(0, 200),
      });

      const e = await errorEnvelope({
        error: err?.message || "backend_error",
        creditKey,
        installId,
        tier,
        statusHint: err?.status || 502,
      });
      // Fall back to 502 when the envelope carries no statusHint, matching
      // the fallback used on the identity-failure path; passing undefined
      // to res.status() would produce an invalid status code.
      return res.status(e.statusHint || 502).json(e.body);
    }

    // Charge once per job: skip when an earlier call already charged it.
    let creditCharged = 0;
    if (!reusedJob) {
      await commitCredit({ creditKey, installId, license, backend: chosenBackend, tier });
      await markJobCharged({ creditKey, installId, license, jobId, backend: chosenBackend, tier });
      creditCharged = 1;
    }

    // `result` is read defensively everywhere below (`result?.…`): a backend
    // that resolves with nothing must not throw here, after the credit has
    // already been committed and before the audit row is written.
    const costDetails =
      chosenBackend === "gemini" && result?.usage
        ? calcGeminiCost(result.model, result.usage)
        : {
            input_tokens: 0,
            output_tokens: 0,
            thinking_tokens: 0,
            cost_usd: 0,
          };

    await recordCall({
      install_id: auditInstall,
      license_fingerprint: licenseFp,
      tier,
      pipeline: "analyze",
      backend: chosenBackend,
      model: result?.model || null,
      status: "success",
      credit_charged: creditCharged,
      duration_ms: Date.now() - t0,
      job_id: jobId,
      // Surface the cascade so the dashboard can show "served by
      // 2.5-flash after 3-flash 503'd" — Gemini backend returns this;
      // hardware backend doesn't (no per-model fallback there).
      attempts: result?.attempts || null,
      ...costDetails,
    });

    const body = await envelope({ result, creditKey, installId, license, tier, creditCharged });
    res.json(body);
  });

  return router;
}