Wire new routes; identity, summarize-url, dashboard, admin

2026-06-13 13:36:30 -05:00
parent 04dcf86fa4
commit 318c6c4b81
20 changed files with 12407 additions and 499 deletions
@@ -10,6 +10,13 @@
 //   {
 //     ts:           ms-epoch when the request landed
 //     install_id:   X-Recap-Install-Id (truncated for log readability)
+//     license_fingerprint: stable 16-hex hash of the licenseUuid for
+//                   paid-tier calls; null for anonymous/Core. Added
+//                   in the license-keyed-credits refactor so spend can
+//                   be aggregated by license-pool (since one license
+//                   may span multiple installs). install_id is STILL
+//                   logged on every entry — license_fingerprint is
+//                   additive forensic visibility, not a replacement.
 //     tier:         "core" | "pro" | "max"
 //     pipeline:     "transcribe" | "analyze"
 //     backend:      "gemini" | "hardware"
@@ -36,9 +43,16 @@ import path from "path";
 let dataDir = "/data";
 let logPath = "/data/relay-calls.ndjson";

+// Size at which we rotate the live ndjson to a dated archive. Picked
+// to roughly match a year of high-volume relay traffic — a typical
+// entry is ~400 bytes, so 50MB ≈ 130k entries. Rotation runs once at
+// boot; the operator can also rotate manually any time.
+const ROTATION_THRESHOLD_BYTES = 50 * 1024 * 1024;
+
 export async function initAuditLog({ dataDir: dd }) {
  if (dd) dataDir = dd;
  logPath = path.join(dataDir, "relay-calls.ndjson");
+  await maybeRotateLog();
  // Ensure the file exists so the streaming read path doesn't trip.
  try {
    await fs.access(logPath);
@@ -48,6 +62,46 @@ export async function initAuditLog({ dataDir: dd }) {
  console.log(`[audit-log] writing to ${logPath}`);
 }

+// Rotate the live ndjson to a dated archive when it grows past the
+// threshold. The dashboard's `readEntries` always reads the live file
+// only — archived entries fall out of the rolling 30-day window
+// naturally and are kept around as raw files for ad-hoc analysis or
+// long-term storage / CSV export. If a same-day archive already exists
+// (e.g. operator restarts the relay mid-rotation), append a counter.
+async function maybeRotateLog() {
+  let stat;
+  try {
+    stat = await fs.stat(logPath);
+  } catch {
+    return; // No file yet — nothing to rotate.
+  }
+  if (stat.size < ROTATION_THRESHOLD_BYTES) return;
+
+  const ymd = new Date().toISOString().slice(0, 10);
+  let archive = path.join(dataDir, `relay-calls-${ymd}.ndjson`);
+  let counter = 1;
+  while (true) {
+    try {
+      await fs.access(archive);
+      // Exists; pick a new name with a counter suffix.
+      archive = path.join(dataDir, `relay-calls-${ymd}.${counter}.ndjson`);
+      counter += 1;
+      if (counter > 99) return; // pathological — give up rotating
+    } catch {
+      break; // Free name found.
+    }
+  }
+  try {
+    await fs.rename(logPath, archive);
+    await fs.writeFile(logPath, "", { mode: 0o600 });
+    console.log(
+      `[audit-log] rotated ${(stat.size / 1024 / 1024).toFixed(1)}MB → ${archive}`
+    );
+  } catch (err) {
+    console.warn(`[audit-log] rotation failed: ${err?.message || err}`);
+  }
+}
+
 // Best-effort append. Errors are logged but never rethrown — losing
 // an audit line shouldn't fail the relay call that caused it.
 export async function recordCall(entry) {
@@ -59,6 +113,55 @@ export async function recordCall(entry) {
  }
 }

+// Truncate the entire audit log. Used by the dashboard's "Delete all"
+// button for cleanup before going-live or after a string of bad-data
+// test runs (relay re-installed mid-run, config tweaks producing
+// inconsistent measurements, etc.). Destructive — no undo.
+export async function clearAllAuditEntries() {
+  try {
+    await fs.writeFile(logPath, "", { mode: 0o600 });
+    return { ok: true };
+  } catch (err) {
+    return { ok: false, error: err?.message || String(err) };
+  }
+}
+
+// Delete audit rows matching specific job_ids. Reads the whole log,
+// filters out lines belonging to the target jobs, writes the remainder
+// back. O(N) on the file size; fine for any plausible audit log (it is
+// rotated at boot once past 50MB). Returns { deleted: <rows removed> }.
+export async function deleteAuditRowsByJobIds(jobIds) {
+  if (!Array.isArray(jobIds) || jobIds.length === 0) return { deleted: 0 };
+  const idSet = new Set(jobIds);
+  const lines = [];
+  let deleted = 0;
+  try {
+    const stream = createReadStream(logPath, { encoding: "utf8" });
+    const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });
+    for await (const line of rl) {
+      if (!line.trim()) continue;
+      try {
+        const r = JSON.parse(line);
+        if (r.job_id && idSet.has(r.job_id)) {
+          deleted += 1;
+          continue;
+        }
+        lines.push(line);
+      } catch {
+        // Bad line — preserve it rather than dropping; matches the
+        // skip-and-continue behavior of readEntries.
+        lines.push(line);
+      }
+    }
+  } catch (err) {
+    if (err.code !== "ENOENT") throw err;
+  }
+  await fs.writeFile(logPath, lines.join("\n") + (lines.length ? "\n" : ""), {
+    mode: 0o600,
+  });
+  return { deleted };
+}
+
 // Read all entries since `sinceMs` (default: 30 days). Streamed
 // line-by-line so the whole file doesn't sit in memory at once.
 // Returned array is newest-first.
@@ -114,13 +217,21 @@ export function aggregate(entries) {
  }

  // ── By tier ──
+  // unique_installs is install-count for Core (no license to dedup against)
+  // and distinct-license-count for paid tiers (so a Pro license active
+  // on two installs counts ONCE here, matching the post-refactor credit
+  // model where they share one monthly pool). Falls back to install_id
+  // for paid entries that predate the license_fingerprint field.
  const byTier = groupBy(entries, (e) => e.tier || "unknown");
  const tierRows = Object.entries(byTier).map(([tier, list]) => ({
    tier,
    calls: list.length,
    cost_usd: sumBy(list, "cost_usd"),
    avg_duration_ms: avgBy(list, "duration_ms"),
-    unique_installs: new Set(list.map((e) => e.install_id)).size,
+    unique_installs:
+      tier === "core" || tier === "unknown"
+        ? new Set(list.map((e) => e.install_id)).size
+        : new Set(list.map((e) => e.license_fingerprint || e.install_id)).size,
  }));

  // ── By model ──
@@ -170,6 +281,30 @@ export function aggregate(entries) {
    .sort((a, b) => b.cost_usd - a.cost_usd)
    .slice(0, 20);

+  // ── By license fingerprint (top 20 by spend, paid tiers only) ──
+  // One license may span multiple installs (cloud account + self-host),
+  // and the post-refactor credit ledger aggregates their spend onto a
+  // single pool. This view mirrors that — operators get a "by paid
+  // user" rollup that doesn't double-count multi-install Pros, plus an
+  // install-count column to see distribution per license.
+  const byLicense = groupBy(
+    entries.filter((e) => e.license_fingerprint),
+    (e) => e.license_fingerprint
+  );
+  const licenseRows = Object.entries(byLicense)
+    .map(([fp, list]) => ({
+      license_fingerprint: fp,
+      tier_snapshot: list[0]?.tier || "core",
+      calls: list.length,
+      cost_usd: sumBy(list, "cost_usd"),
+      summaries: new Set(list.map((e) => e.job_id).filter(Boolean)).size,
+      unique_installs: new Set(list.map((e) => e.install_id).filter(Boolean)).size,
+      avg_duration_ms: avgBy(list, "duration_ms"),
+      last_active_at: Math.max(...list.map((e) => e.ts || 0)),
+    }))
+    .sort((a, b) => b.cost_usd - a.cost_usd)
+    .slice(0, 20);
+
  // ── By hour-of-day (for traffic-pattern view) ──
  const byHour = groupBy(entries, (e) => new Date(e.ts).getUTCHours());
  const hourRows = Array.from({ length: 24 }, (_, h) => {
@@ -193,6 +328,198 @@ export function aggregate(entries) {
    }))
    .sort((a, b) => a.avg_duration_ms - b.avg_duration_ms);

+  // ── Per-summary rollup (collapse transcribe + analyze pairs) ──
+  // Every "summarize" produces 2 audit entries — one transcribe, one
+  // analyze — sharing a job_id. The dashboard's call-level views show
+  // them separately, which is useful for backend-vs-pipeline tuning but
+  // confusing as "how many summaries did I serve". Group by job_id so
+  // operators see one row per summary with combined cost/duration.
+  // Entries without a job_id (older relay versions, or balance pings)
+  // are excluded from this rollup; they still count in the call-level views.
+  const byJob = groupBy(entries, (e) => e.job_id || "__no_jobid__");
+  const summaryRows = Object.entries(byJob)
+    .filter(([k]) => k !== "__no_jobid__")
+    .map(([jobId, list]) => {
+      const transcribe = list.find((e) => e.pipeline === "transcribe");
+      const analyze = list.find((e) => e.pipeline === "analyze");
+      return {
+        job_id: jobId,
+        install_id: list[0]?.install_id || null,
+        tier: list[0]?.tier || null,
+        started_at: Math.min(...list.map((e) => e.ts || Infinity)),
+        completed_at: Math.max(...list.map((e) => e.ts || 0)),
+        transcribe_backend: transcribe?.backend || null,
+        transcribe_model: transcribe?.model || null,
+        analyze_backend: analyze?.backend || null,
+        analyze_model: analyze?.model || null,
+        total_cost_usd: sumBy(list, "cost_usd"),
+        total_duration_ms: sumBy(list, "duration_ms"),
+        status:
+          list.every((e) => e.status === "success")
+            ? "success"
+            : list.some((e) => e.status === "error")
+            ? "error"
+            : "partial",
+        had_transcribe: !!transcribe,
+        had_analyze: !!analyze,
+      };
+    })
+    .sort((a, b) => b.completed_at - a.completed_at);
+
+  // ── Recent errors (newest 50) ──
+  // Quick triage view — when something is failing, the operator needs
+  // to see the offending error strings without scrolling the full
+  // call log.
+  // Surface any audit row carrying an error message — that catches
+  // status="error" (true backend failures) AND status="partial"
+  // (e.g. transcribe-with-truncated-chunks, which records the
+  // missing-speech message in the error field). Operators rely on
+  // this view to triage all degraded behavior, not just outright
+  // 5xx-class failures, so the broader filter is the right default.
+  const errorRows = entries
+    .filter((e) => e.error)
+    .slice(0, 50)
+    .map((e) => ({
+      ts: e.ts,
+      install_id: e.install_id || null,
+      tier: e.tier || null,
+      pipeline: e.pipeline || null,
+      backend: e.backend || null,
+      model: e.model || null,
+      duration_ms: e.duration_ms || 0,
+      error: (e.error || "").slice(0, 280),
+      attempts: Array.isArray(e.attempts) ? e.attempts : null,
+    }));
+
+  // ── Per-(pipeline, model) performance + failure tables ──
+  // Normalizes raw duration_ms by audio_seconds so different models
+  // can be compared on a backend-agnostic benchmark: how many ms of
+  // wall-clock time does this model take per minute of audio? Analyze
+  // calls don't have audio (they consume the transcript text), so we
+  // report ms-per-1k-input-tokens for those instead.
+  //
+  // Success rate is computed against `attempted` (success + error)
+  // and excludes `refused` calls — refused requests never reached the
+  // backend, so they shouldn't count against the model's reliability.
+  const byPipelineModel = {};
+  for (const e of entries) {
+    const pipeline = e.pipeline || "unknown";
+    const model = e.model || "unknown";
+    if (model === "unknown" && e.status === "refused") continue; // refused entries often have no model
+    const key = `${pipeline}::${model}`;
+    if (!byPipelineModel[key]) {
+      byPipelineModel[key] = {
+        pipeline,
+        model,
+        calls: 0,
+        success: 0,
+        errors: 0,
+        refused: 0,
+        partials: 0,
+        sum_duration_ms: 0,
+        sum_audio_seconds: 0,
+        sum_input_tokens: 0,
+        sum_output_tokens: 0,
+        error_counts: {}, // { error_signature: count }
+      };
+    }
+    const row = byPipelineModel[key];
+    row.calls += 1;
+    if (e.status === "success") row.success += 1;
+    if (e.status === "error") row.errors += 1;
+    if (e.status === "refused") row.refused += 1;
+    if (e.status === "partial") row.partials += 1;
+    row.sum_duration_ms += e.duration_ms || 0;
+    if (typeof e.audio_seconds === "number" && e.audio_seconds > 0) {
+      row.sum_audio_seconds += e.audio_seconds;
+    }
+    row.sum_input_tokens += e.input_tokens || 0;
+    row.sum_output_tokens += e.output_tokens || 0;
+    // Aggregate the top-error counts off ANY row that has a populated
+    // error message — not just status="error" rows. Partial (truncated
+    // transcribe) and refused (out-of-credits, capacity-gated) rows
+    // also carry useful error strings the operator wants to see in
+    // the "Top failure modes" table. The old gate `status === "error"`
+    // hid all truncations because they're recorded as status="partial".
+    if (e.error) {
+      const sig = errorSignature(e.error);
+      row.error_counts[sig] = (row.error_counts[sig] || 0) + 1;
+    }
+  }
+  const perfByModel = Object.values(byPipelineModel).map((r) => {
+    const attempted = r.success + r.errors;
+    const successRate = attempted > 0 ? r.success / attempted : null;
+    const audioMin = r.sum_audio_seconds / 60;
+    const msPerAudioMin = audioMin > 0 ? r.sum_duration_ms / audioMin : null;
+    const msPer1kInputTokens =
+      r.sum_input_tokens > 0
+        ? r.sum_duration_ms / (r.sum_input_tokens / 1000)
+        : null;
+    // Top 3 error signatures by frequency for this model.
+    const topErrors = Object.entries(r.error_counts)
+      .map(([signature, count]) => ({ signature, count }))
+      .sort((a, b) => b.count - a.count)
+      .slice(0, 3);
+    return {
+      pipeline: r.pipeline,
+      model: r.model,
+      calls: r.calls,
+      success: r.success,
+      errors: r.errors,
+      refused: r.refused,
+      partials: r.partials,
+      // "failures" = total signal worth surfacing in failure tables.
+      // Includes partials so a TX that lost minutes of speech via a
+      // truncated chunk is counted as a failure mode, not silently
+      // tucked away under a "success" pipe. The errors-by-model
+      // dashboard table reads this; the per-call "errors" field stays
+      // available for stricter computations.
+      failures: r.errors + r.partials,
+      success_rate: successRate,
+      // Speed benchmark fields. Either or both may be null when no row
+      // in the group carried audio_seconds / input_tokens metadata.
+      ms_per_audio_minute: msPerAudioMin,
+      ms_per_1k_input_tokens: msPer1kInputTokens,
+      total_audio_minutes: audioMin > 0 ? audioMin : null,
+      top_errors: topErrors,
+    };
+  });
+
+  // ── Revenue / margin (requires tier prices supplied by caller) ──
+  // Distinct paying USERS in the window × the operator's per-tier
+  // monthly price. For Core (free) we count distinct installs — that's
+  // still the right grain for free-tier "active users", since Core has
+  // no license to dedup against. For Pro/Max we count distinct license
+  // fingerprints so a single Pro license activated on two installs
+  // (cloud + self-host) counts ONCE toward monthly revenue, matching
+  // the post-refactor credit model where they share one monthly pool.
+  // Falls back to install_id for paid entries missing a fingerprint
+  // (legacy pre-refactor audit rows) so historical ranges stay
+  // approximately correct rather than dropping to zero.
+  //
+  // Strictly an *estimate* — the relay doesn't know if a Pro user
+  // actually paid this month, just that they touched a request.
+  // Underestimates churned customers (who paid but didn't call) and
+  // overestimates trial users (who haven't paid yet). Hooked in by
+  // the dashboard route, not here, so tests can pass an empty prices
+  // map and get zero.
+  const tierActiveInstalls = {
+    core: new Set(),
+    pro: new Set(),
+    max: new Set(),
+  };
+  for (const e of entries) {
+    const t = e.tier || "core";
+    if (!tierActiveInstalls[t]) continue;
+    if (t === "core") {
+      if (e.install_id) tierActiveInstalls.core.add(e.install_id);
+    } else {
+      // Paid: prefer fingerprint, fall back to install_id for legacy rows.
+      const id = e.license_fingerprint || e.install_id;
+      if (id) tierActiveInstalls[t].add(id);
+    }
+  }
+
  return {
    summary: {
      calls,
@@ -206,14 +533,89 @@ export function aggregate(entries) {
      total_input_tokens: totalInputTokens,
      total_output_tokens: totalOutputTokens,
      total_thinking_tokens: totalThinkingTokens,
+      total_summaries: summaryRows.length,
+      // active_installs_by_tier name retained for dashboard compatibility,
+      // but the paid-tier counts here are actually DISTINCT LICENSES,
+      // not distinct installs (see the comment on tierActiveInstalls
+      // above). Core remains install-based. The dashboard label is
+      // "Active users by tier" which fits either grain.
+      active_installs_by_tier: {
+        core: tierActiveInstalls.core.size,
+        pro: tierActiveInstalls.pro.size,
+        max: tierActiveInstalls.max.size,
+      },
    },
    by_tier: tierRows,
    by_model: modelRows,
    by_pipeline: pipelineRows,
    by_backend: backendRows,
    by_install: installRows,
+    by_license: licenseRows,
    by_hour_utc: hourRows,
    cost_vs_speed: costSpeedRows,
+    by_summary: summaryRows,
+    errors: errorRows,
+    perf_by_model: perfByModel,
+  };
+}
+
+// Normalize a raw error string into a stable signature so two
+// near-identical messages bucket together. The audit log stores
+// truncated raw messages — we want the bucket key to be coarse enough
+// that small variations (a different request-id, file name, port
+// number, etc.) collapse into a single error class.
+//
+// Heuristics:
+//   - Strip ISO timestamps and timestamps with offsets
+//   - Replace UUIDs and long hex hashes (32+ hex chars) with placeholders
+//   - Replace ports (":<digits>") and 4+ digit numbers with placeholders
+//   - Replace whole URLs with a single <url> placeholder
+//   - Trim to first 120 chars after normalization
+function errorSignature(raw) {
+  if (!raw) return "(unknown)";
+  let s = String(raw);
+  s = s.replace(/\b\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:?\d{2})?\b/g, "<ts>");
+  s = s.replace(/\b[0-9a-f]{32,}\b/gi, "<hex>");
+  s = s.replace(/\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b/gi, "<uuid>");
+  s = s.replace(/https?:\/\/[^\s)"']+/g, "<url>");
+  s = s.replace(/:\d{2,5}\b/g, ":<port>");
+  s = s.replace(/\b\d{4,}\b/g, "<n>");
+  return s.trim().slice(0, 120);
+}
+
+// Derived revenue/margin numbers. Pulled out of aggregate() because it
+// needs prices the operator sets in config — keeping the core
+// aggregator config-agnostic. Returns:
+//   {
+//     monthly_revenue_usd:  pro_count * pro_price + max_count * max_price
+//                            + core_count * core_price,
+//     gemini_cost_usd_in_range: summary.total_cost_usd (passed through),
+//     margin_usd:           revenue - cost,  // approximate
+//     by_tier_revenue:      [{ tier, active_installs, price_usd, revenue_usd }],
+//   }
+//
+// `activeInstallsByTier` should come from the aggregate summary's
+// `active_installs_by_tier` (Set sizes already computed there). `prices`
+// is the {core,pro,max} USD-per-month map. `geminiCostInRange` is
+// total_cost_usd from the summary.
+export function computeRevenue({ activeInstallsByTier, prices, geminiCostInRange }) {
+  const tiers = ["core", "pro", "max"];
+  const byTier = tiers.map((tier) => {
+    const installs = activeInstallsByTier?.[tier] || 0;
+    const price = Math.max(0, Number(prices?.[tier] ?? 0));
+    return {
+      tier,
+      active_installs: installs,
+      price_usd: price,
+      revenue_usd: installs * price,
+    };
+  });
+  const revenue = byTier.reduce((s, r) => s + r.revenue_usd, 0);
+  return {
+    monthly_revenue_usd: revenue,
+    gemini_cost_usd_in_range: geminiCostInRange,
+    margin_usd: revenue - geminiCostInRange,
+    by_tier_revenue: byTier,
  };
 }