Add multi-tenant cloud mode: self-serve purchase, credit metering, core-decoupling

Introduces RECAP_MODE=multi alongside single-mode self-host:
- Tenant auth + accounts (magic-link via System SMTP), per-tenant credit pool, anonymous trial minting with per-IP/-64 caps
- Self-serve Pro/Max purchase: inline Lightning (BTCPay) + card (Zaprite), prepaid 30-day periods, expiry-reminder emails
- Core-decoupling: relay owns cloud tier/expiry keyed by Recaps user-id
- SQLite (better-sqlite3) schema for multi-mode; filesystem unchanged for single
- StartOS actions/versions through 0.2.155
2026-06-13 14:25:05 -05:00
parent db580abad7
commit 0ae59f3550
176 changed files with 23823 additions and 803 deletions
@@ -24,22 +24,27 @@ import { calcCost } from "../gemini-helpers.js";

 // Models exposed to the analysis fallback chain. Order matters — first
 // is the preferred default, the rest are tried in order if it fails.
+// The five Gemini models we expose. Verified valid against
+// ai.google.dev/gemini-api/docs/models — older IDs (gemini-3-pro-preview
+// shut down 2026-03-09, gemini-2.0-flash deprecated, non-lite
+// gemini-3.1-flash* never existed) are intentionally not in either list.
 export const GEMINI_ANALYSIS_MODELS = [
  "gemini-3.1-pro-preview",
-  "gemini-3-pro-preview",
+  "gemini-2.5-pro",
  "gemini-3-flash-preview",
  "gemini-2.5-flash",
+  "gemini-3.1-flash-lite",
 ];

-// Transcription models, in fallback order. Flash is best speed/cost
-// for audio → text; 2.5 Flash is the stable previous-gen multimodal
-// model and works well as a fallback when Gemini 3 Flash returns 503
-// (capacity / overload). The orchestration layer in server/index.js
-// iterates this list, retrying with the next model when one fails.
+// Transcription fallback order: Flash first (Flash is Google's
+// natural audio fit), Pro only as last-resort because Pro on audio
+// is significantly more expensive than Flash.
 export const GEMINI_TRANSCRIPTION_MODELS = [
  "gemini-3-flash-preview",
  "gemini-2.5-flash",
-  "gemini-2.0-flash",
+  "gemini-3.1-flash-lite",
+  "gemini-2.5-pro",
+  "gemini-3.1-pro-preview",
 ];

 // Empty-response retries: when the SDK returns 200 with no text (which
@@ -178,9 +183,15 @@ export function createGeminiProvider({ apiKey, timeoutMs = 900_000 } = {}) {
        chapters: chaptersHint,
      });

-      // thinkingLevel: "minimal" is only valid for Flash. Pro models
-      // reject it. Match prior behavior precisely.
-      const txConfig = model.includes("flash")
+      // thinkingLevel is a Gemini 3.x param — Gemini 2.5 models use
+      // a different shape (`thinkingBudget`, integer) and 400 on
+      // `thinkingLevel`. Pro models reject thinking config entirely
+      // for the transcribe pipeline. Only send for Gemini 3.x flash
+      // variants where it's a valid latency/cost knob.
+      // Gemini 3.x IDs start with "gemini-3-" (e.g. gemini-3-flash-preview)
+      // or "gemini-3." (e.g. gemini-3.1-flash-lite); accept both prefixes.
+      const isGemini3Flash =
+        model.includes("flash") &&
+        (model.startsWith("gemini-3-") || model.startsWith("gemini-3."));
+      const txConfig = isGemini3Flash
        ? { thinkingConfig: { thinkingLevel: "minimal" } }
        : {};

@@ -199,6 +210,20 @@ export function createGeminiProvider({ apiKey, timeoutMs = 900_000 } = {}) {
                config: {
                  ...txConfig,
                  safetySettings: TRANSCRIPTION_SAFETY,
+                  // Transcripts of long audio are output-token-bound.
+                  // Gemini's default is small (commonly 8192) which is
+                  // enough for ~10-15 min of dense speech but truncates
+                  // 30-45 min chunks mid-transcript with no warning.
+                  // Observed (May 2026): a 45-min chunk transcribed by
+                  // gemini-3.1-flash-lite ended at local 31:05, losing
+                  // 14 minutes of speech silently; another chunk lost
+                  // 43 of 45 minutes after the model output 5 segments
+                  // and stopped. Setting this high gives the model room
+                  // to emit the full transcript; models that don't
+                  // support values this large will clamp internally to
+                  // their max. 65,536 is the upper bound for Gemini 3.x
+                  // flash variants per Google's docs.
+                  maxOutputTokens: 65536,
                },
                contents: [
                  {
@@ -272,8 +297,13 @@ export function createGeminiProvider({ apiKey, timeoutMs = 900_000 } = {}) {
      // UX: fail fast on a single model and let the outer fallback
      // chain in server/index.js walk to the next model (Pro → Pro
      // older → Flash → Flash 2.5) immediately. Caller can override
-      // with retries: 2 if they want the old behavior.
-      retries = 1,
+      // via `retries`. Bumped 1 → 2 in 0.2.76 alongside the
+      // responseMimeType:json change. Analyze is by far the cheapest
+      // pipeline phase (~few seconds per call), so a third total attempt
+      // (1 initial + 2 retries on caught error) is essentially free in
+      // wall time but materially reduces "lost window" failures on
+      // transient 503/429 blips.
+      retries = 2,
      signal,
    }) {
      const result = await retryGemini(
@@ -281,6 +311,19 @@ export function createGeminiProvider({ apiKey, timeoutMs = 900_000 } = {}) {
          withAbort(
            aiAnalyze.models.generateContent({
              model,
+              config: {
+                // JSON mode — Gemini guarantees the response body is
+                // valid JSON when this is set. Eliminates the entire
+                // class of "invalid JSON in window response" failures
+                // that came from the model occasionally wrapping its
+                // sections array in a prose preamble, a ```json```
+                // markdown fence, or truncating the closing brace.
+                // The prompt already asks for JSON; this turns that
+                // into a hard server-enforced constraint on the
+                // model's decoder. Mirrors recap-relay 0.2.69's
+                // change for the relay-mode analyze path.
+                responseMimeType: "application/json",
+              },
              contents: [
                {
                  role: "user",
@@ -356,7 +399,7 @@ Format each line as:
 Rules:
 - Transcribe EVERY word spoken, do not skip or summarize anything.
 - Use [MM:SS] or [H:MM:SS] timestamp format at the start of each line.
- Start a new timestamped line every 15-30 seconds or at natural speech pauses.
+- Start a new timestamped line every 15-30 seconds or at natural speech pauses or speaker changes.
 - Include filler words (um, uh, you know) for accuracy.
 - Speaker identification: FIRST consult the metadata above — descriptions and chapter titles usually name the host(s) and guest(s) explicitly, and the channel name is often the host's name. Match those names to the voices in the audio (introductions, "I'm Dax", "this is Will", first-person references) and use them as speaker labels. Format as: [MM:SS] Name: text. Only fall back to "Host"/"Guest" if no names appear in the metadata AND nobody is introduced by name in the audio.