v0.2.9 Gemini model selects + fallback chain

2026-05-12 00:45:41 -05:00
parent 05ebeb5d51
commit 8ffc3ffb73
5 changed files with 209 additions and 72 deletions
@@ -27,6 +27,46 @@ const DEFAULT_TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
 const DEFAULT_ANALYSIS_MODEL = "gemini-3.1-pro-preview";
 const EMPTY_RETRIES = 3;

+// Per-pipeline fallback chains, ordered newest/most-expensive →
+// older/cheaper. When the operator-selected primary model returns a
+// retryable error (503 capacity, 429 rate limit, etc.) the relay
+// walks DOWN this list — never up, since the operator's choice
+// reflects their preferred price/quality point. The chain is sliced
+// from the primary forward, so a transcription primary of 2.5-flash
+// falls back to only 2.0-flash, never back up to 3-flash.
+const TRANSCRIPTION_FALLBACK_CHAIN = [
+  "gemini-3-flash-preview",
+  "gemini-2.5-flash",
+  "gemini-2.0-flash", // last entry: selecting it leaves no fallback
+];
+const ANALYSIS_FALLBACK_CHAIN = [
+  "gemini-3.1-pro-preview",
+  "gemini-3-pro-preview",
+  "gemini-3-flash-preview", // the chain crosses from Pro to Flash here
+  "gemini-2.5-flash", // last entry: selecting it leaves no fallback
+];
+
+// Build the ordered list of models to try for one request: the
+// primary first, then every chain entry that comes after it. A
+// primary that is not in the chain (unknown id / typo) gets no
+// fallback and is returned alone. The result is always a new array,
+// so callers can iterate it without touching the shared chain.
+function fallbackChain(chain, primary) {
+  const start = chain.indexOf(primary);
+  return start === -1 ? [primary] : chain.slice(start);
+}
+
+// Detect errors that warrant trying the next model in the chain.
+// 503/429/529 → yes. Any other 4xx (auth, bad request) → no, even if
+// its message matches a keyword below. Otherwise match on the message.
+function isFallbackEligibleError(err) {
+  const status = Number(err?.status || err?.httpStatusCode) || 0;
+  const msg = err?.message || String(err);
+  if (status === 503 || status === 429 || status === 529) return true;
+  if (status >= 400 && status < 500) return false;
+  return /overloaded|unavailable|capacity|high demand|rate limit|fetch failed|ECONNRESET|ETIMEDOUT|socket hang up|network/i.test(msg);
+}
+
 const TRANSCRIPTION_SAFETY = [
  { category: "HARM_CATEGORY_HARASSMENT", threshold: "BLOCK_NONE" },
  { category: "HARM_CATEGORY_HATE_SPEECH", threshold: "BLOCK_NONE" },
@@ -47,10 +87,13 @@ export function createGeminiBackend({
    apiKey,
    httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
  });
-  // Flash models accept `thinkingLevel: "minimal"`; Pro models reject
-  // it. Detect from the model id so the operator can flip flash <-> pro
-  // via the StartOS action without breaking the request.
-  const txIsFlash = /flash/i.test(transcriptionModel);
+  // Build the per-call fallback chains. The primary is whatever the
+  // operator selected via the StartOS action; subsequent entries are
+  // the lower-tier members of the chain (we never fall back UP). When
+  // the primary returns a 503/capacity/rate-limit error, the loops
+  // below try the next model.
+  const txChain = fallbackChain(TRANSCRIPTION_FALLBACK_CHAIN, transcriptionModel);
+  const anChain = fallbackChain(ANALYSIS_FALLBACK_CHAIN, analysisModel);

  async function transcribeAudio({
    audio,
@@ -84,67 +127,109 @@ export function createGeminiBackend({
      }

      const prompt = buildTranscriptionPrompt({ title, channel, description, chapters });
-      let result;
-      for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
-        result = await ai.models.generateContent({
-          model: transcriptionModel,
-          config: {
-            // thinkingLevel: "minimal" is only valid for Flash. Pro
-            // models reject it. Skip when the operator picks a Pro
-            // model for transcription (slower but valid).
-            ...(txIsFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}),
-            safetySettings: TRANSCRIPTION_SAFETY,
-          },
-          contents: [
-            {
-              role: "user",
-              parts: [
-                { fileData: { fileUri: f.uri, mimeType } },
-                { text: prompt },
+
+      // Walk the fallback chain: try the primary model first; on a
+      // retryable error (capacity / 503 / rate-limit), try the next
+      // model in the chain. Non-retryable errors bubble up to the
+      // caller — they'd just fail the same way on every model.
+      let lastErr;
+      for (let modelIdx = 0; modelIdx < txChain.length; modelIdx++) {
+        const model = txChain[modelIdx];
+        const isFlash = /flash/i.test(model);
+        try {
+          let result;
+          // Empty-response retries: when the SDK returns 200 with no
+          // text (which happens periodically with audio inputs), try
+          // up to EMPTY_RETRIES times with the SAME model. A result
+          // that is still empty is returned as-is — no model fallback.
+          for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
+            result = await ai.models.generateContent({
+              model,
+              config: {
+                // thinkingLevel: "minimal" is only valid for Flash.
+                // Pro models reject it. Skip when the chain hop
+                // landed on a Pro model.
+                ...(isFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}),
+                safetySettings: TRANSCRIPTION_SAFETY,
+              },
+              contents: [
+                {
+                  role: "user",
+                  parts: [
+                    { fileData: { fileUri: f.uri, mimeType } },
+                    { text: prompt },
+                  ],
+                },
              ],
-            },
-          ],
-        });
-        if (safeText(result)) break;
+            });
+            if (safeText(result)) break;
+          }
+
+          // Best-effort cleanup of the uploaded File API artifact.
+          try { await ai.files.delete({ name: f.name }); } catch {}
+
+          const text = safeText(result) || "";
+          return {
+            text,
+            segments: [],
+            duration_seconds: 0,
+            usage: result?.usageMetadata || null,
+            // Return the model that ACTUALLY served the request — so
+            // the audit log records what was used, not just what was
+            // requested. Lets the operator see "this call fell back
+            // from 3-flash to 2.5-flash" via the dashboard.
+            model,
+          };
+        } catch (err) {
+          lastErr = err;
+          const canFallback =
+            isFallbackEligibleError(err) && modelIdx < txChain.length - 1;
+          console.warn(
+            `[gemini] transcribe with ${model} failed (${err?.status || "?"}): ${err?.message || err}${canFallback ? ` — falling back to ${txChain[modelIdx + 1]}` : ""}`
+          );
+          if (!canFallback) {
+            try { await ai.files.delete({ name: f.name }); } catch {}
+            throw err;
+          }
+          // loop continues with next model
+        }
      }
-
-      // Best-effort cleanup of the uploaded File API artifact.
-      try { await ai.files.delete({ name: f.name }); } catch {}
-
-      const text = safeText(result) || "";
-      return {
-        text,
-        // Gemini returns a single timestamped blob — segments are
-        // parsed client-side by the orchestration layer. We could
-        // pre-parse here but Recap already has parseTimestampedTranscript
-        // that handles this exact shape.
-        segments: [],
-        duration_seconds: 0,
-        // Pass usage + the model id back to the route so audit-log
-        // entries can include token counts + computed cost.
-        usage: result?.usageMetadata || null,
-        model: transcriptionModel,
-      };
+      throw lastErr || new Error("transcribe: all models in fallback chain failed");
    } finally {
      try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
    }
  }

   async function analyzeText({ prompt }) {
-    const result = await ai.models.generateContent({
-      model: analysisModel,
-      contents: [
-        {
-          role: "user",
-          parts: [{ text: prompt }],
-        },
-      ],
-    });
-    return {
-      text: safeText(result) || "",
-      usage: result?.usageMetadata || null,
-      model: analysisModel,
-    };
+    let lastErr; // most recent failure, kept for the defensive throw below
+    for (let modelIdx = 0; modelIdx < anChain.length; modelIdx++) {
+      const model = anChain[modelIdx]; // index 0 is the operator-selected primary
+      try {
+        const result = await ai.models.generateContent({
+          model,
+          contents: [
+            {
+              role: "user",
+              parts: [{ text: prompt }],
+            },
+          ],
+        });
+        return {
+          text: safeText(result) || "", // an empty response is returned as "", not retried
+          usage: result?.usageMetadata || null,
+          model, // the model that served this call, which may differ from the primary
+        };
+      } catch (err) {
+        lastErr = err;
+        const canFallback =
+          isFallbackEligibleError(err) && modelIdx < anChain.length - 1;
+        console.warn(
+          `[gemini] analyze with ${model} failed (${err?.status || "?"}): ${err?.message || err}${canFallback ? ` — falling back to ${anChain[modelIdx + 1]}` : ""}`
+        );
+        if (!canFallback) throw err; // non-retryable error, or no model left to try
+      }
+    }
+    throw lastErr || new Error("analyze: all models in fallback chain failed"); // defensive: with a non-empty chain the last entry returns or rethrows above
   }

  return { transcribeAudio, analyzeText };
@@ -1,6 +1,6 @@
 {
  "name": "recap-relay-server",
-  "version": "0.2.8",
+  "version": "0.2.9",
  "type": "module",
  "private": true,
  "dependencies": {
@@ -11,23 +11,40 @@ const { InputSpec, Value } = sdk

 const inputSpec = InputSpec.of({
  // ── Gemini model selection ──
-  relay_gemini_transcription_model: Value.text({
+  // Both fields are radio-select with curated options. The relay's
+  // Gemini backend automatically falls back to lower-tier models in
+  // this same list when the chosen one returns a 503 / capacity /
+  // rate-limit error — see server/backends/gemini.js for the
+  // fallback-chain logic.
+  relay_gemini_transcription_model: Value.select({
    name: 'Gemini Transcription Model',
    description:
-      "The Gemini SKU used when a transcription request is routed to Gemini. Flash is recommended (cheap, fast, multimodal). Examples: gemini-3-flash-preview (default), gemini-2.5-flash, gemini-2.0-flash, gemini-3-pro-preview (slower + pricier but higher quality on edge cases).",
-    required: true,
+      "Primary Gemini SKU used when a transcription request is routed to Gemini. On 503/capacity/rate-limit failure, the relay falls back to lower-tier models in order (e.g. 3-flash → 2.5-flash → 2.0-flash).",
    default: 'gemini-3-flash-preview',
-    minLength: 1,
-    maxLength: 128,
+    values: {
+      'gemini-3-flash-preview':
+        'Gemini 3 Flash — latest, recommended (~$0.30/M in, $2.50/M out)',
+      'gemini-2.5-flash':
+        'Gemini 2.5 Flash — prior gen (same pricing as 3-flash)',
+      'gemini-2.0-flash':
+        'Gemini 2.0 Flash — older + cheapest (~$0.10/M in, $0.40/M out)',
+    },
  }),
-  relay_gemini_analysis_model: Value.text({
+  relay_gemini_analysis_model: Value.select({
    name: 'Gemini Analysis Model',
    description:
-      "The Gemini SKU used when an analysis request is routed to Gemini. Pro is the default for higher-quality structured output. Swap to a flash SKU (e.g. gemini-3-flash-preview) for faster + cheaper analysis at some loss of section-boundary precision.",
-    required: true,
+      "Primary Gemini SKU used when an analysis request is routed to Gemini. On 503/capacity/rate-limit failure, the relay falls back to lower-tier models in order (e.g. 3.1-pro → 3-pro → 3-flash → 2.5-flash).",
    default: 'gemini-3.1-pro-preview',
-    minLength: 1,
-    maxLength: 128,
+    values: {
+      'gemini-3.1-pro-preview':
+        'Gemini 3.1 Pro — best quality on structured-JSON output ($5/M in, $25/M out)',
+      'gemini-3-pro-preview':
+        'Gemini 3 Pro — prior Pro gen (same pricing as 3.1)',
+      'gemini-3-flash-preview':
+        'Gemini 3 Flash — faster + ~20× cheaper than Pro; some loss of section-boundary precision on long transcripts',
+      'gemini-2.5-flash':
+        'Gemini 2.5 Flash — prior Flash gen',
+    },
  }),

  // ── Backend routing preference per pipeline ──
@@ -78,11 +95,32 @@ export const setBackendRouting = sdk.Action.withInput(

  async ({ effects }) => {
    const config = await configFile.read().once()
+    // Coerce any previously-saved model name to a value in the new
+    // select's options. Older 0.2.7-era saved configs could hold a
+    // free-text value that's no longer in the dropdown — clamp to a
+    // sensible default rather than presenting an invalid radio.
+    const TX_OPTIONS = [
+      'gemini-3-flash-preview',
+      'gemini-2.5-flash',
+      'gemini-2.0-flash',
+    ] as const
+    const AN_OPTIONS = [
+      'gemini-3.1-pro-preview',
+      'gemini-3-pro-preview',
+      'gemini-3-flash-preview',
+      'gemini-2.5-flash',
+    ] as const
+    const tx = config?.relay_gemini_transcription_model as
+      | (typeof TX_OPTIONS)[number]
+      | undefined
+    const an = config?.relay_gemini_analysis_model as
+      | (typeof AN_OPTIONS)[number]
+      | undefined
    return {
      relay_gemini_transcription_model:
-        config?.relay_gemini_transcription_model || 'gemini-3-flash-preview',
+        tx && TX_OPTIONS.includes(tx) ? tx : 'gemini-3-flash-preview',
      relay_gemini_analysis_model:
-        config?.relay_gemini_analysis_model || 'gemini-3.1-pro-preview',
+        an && AN_OPTIONS.includes(an) ? an : 'gemini-3.1-pro-preview',
      relay_transcribe_backend_preference:
        (config?.relay_transcribe_backend_preference as
          | 'gemini_first'
@@ -9,8 +9,9 @@ import { v_0_2_5 } from './v0.2.5'
 import { v_0_2_6 } from './v0.2.6'
 import { v_0_2_7 } from './v0.2.7'
 import { v_0_2_8 } from './v0.2.8'
+import { v_0_2_9 } from './v0.2.9'

 export const versionGraph = VersionGraph.of({
-  current: v_0_2_8,
-  other: [v_0_2_7, v_0_2_6, v_0_2_5, v_0_2_4, v_0_2_3, v_0_2_2, v_0_2_1, v_0_2_0, v_0_1_0],
+  current: v_0_2_9,
+  other: [v_0_2_8, v_0_2_7, v_0_2_6, v_0_2_5, v_0_2_4, v_0_2_3, v_0_2_2, v_0_2_1, v_0_2_0, v_0_1_0], // earlier versions, listed newest first
 })
@@ -0,0 +1,13 @@
+import { VersionInfo } from '@start9labs/start-sdk'
+
+export const v_0_2_9 = VersionInfo.of({
+  version: '0.2.9:0',
+  releaseNotes: {
+    en_US:
+      'Set Backend Routing & Models action: Gemini transcription and analysis fields are now radio-select dropdowns with curated options (transcribe: 3-flash, 2.5-flash, 2.0-flash; analyze: 3.1-pro, 3-pro, 3-flash, 2.5-flash). Gemini backend automatically falls back to lower-tier models in the same chain when the primary returns a 503/capacity/rate-limit error. Audit log records the model that actually served each call, so dashboard reflects fallback behavior accurately.',
+  },
+  migrations: {
+    up: async ({ effects }) => {}, // no-op. NOTE(review): model ids saved by the old free-text fields are not rewritten here — confirm that is intended
+    down: async ({ effects }) => {}, // no-op: nothing to undo on downgrade
+  },
+})