From 9af70302b1975bbfdebb50d101efaef0b0742287 Mon Sep 17 00:00:00 2001
From: local <local@local>
Date: Tue, 12 May 2026 00:15:07 -0500
Subject: [PATCH] v0.2.7 configurable Gemini models + per-pipeline backend
 preference

---
 server/backends/gemini.js            |  29 +++++--
 server/config.js                     |   4 +
 server/credits.js                    |  77 +++++++++++++++---
 server/package.json                  |   2 +-
 server/routes/analyze.js             |  10 ++-
 server/routes/transcribe.js          |  10 ++-
 startos/actions/index.ts             |   2 +
 startos/actions/setBackendRouting.ts | 116 +++++++++++++++++++++++++++
 startos/file-models/config.json.ts   |  27 +++++++
 startos/versions/index.ts            |   5 +-
 startos/versions/v0.2.7.ts           |  13 +++
 11 files changed, 273 insertions(+), 22 deletions(-)
 create mode 100644 startos/actions/setBackendRouting.ts
 create mode 100644 startos/versions/v0.2.7.ts

diff --git a/server/backends/gemini.js b/server/backends/gemini.js
index 311ab4f..a1d4351 100644
--- a/server/backends/gemini.js
+++ b/server/backends/gemini.js
@@ -18,8 +18,13 @@ import fs from "fs/promises";
 import os from "os";
 import path from "path";
 
-const TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
-const ANALYSIS_MODEL = "gemini-3.1-pro-preview";
+// Defaults used only when the caller doesn't supply explicit model
+// names. Production callers should pass models pulled from
+// relay_gemini_transcription_model / relay_gemini_analysis_model in
+// the relay config so the operator can swap SKUs (e.g. flash for
+// analysis) without rebuilding the relay.
+const DEFAULT_TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
+const DEFAULT_ANALYSIS_MODEL = "gemini-3.1-pro-preview";
 const EMPTY_RETRIES = 3;
 
 const TRANSCRIPTION_SAFETY = [
@@ -29,7 +34,12 @@ const TRANSCRIPTION_SAFETY = [
   { category: "HARM_CATEGORY_DANGEROUS_CONTENT", threshold: "BLOCK_NONE" },
 ];
 
-export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {
+export function createGeminiBackend({
+  apiKey,
+  transcriptionModel = DEFAULT_TRANSCRIPTION_MODEL,
+  analysisModel = DEFAULT_ANALYSIS_MODEL,
+  timeoutMs = 900_000,
+} = {}) {
   if (!apiKey) {
     throw new Error("createGeminiBackend: apiKey is required");
   }
@@ -37,6 +47,10 @@ export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {
     apiKey,
     httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
   });
+  // Only Gemini 3+ Flash ids accept `thinkingLevel: "minimal"`. Pro ids
+  // reject it and pre-3 Flash ids (e.g. gemini-2.5-flash) do not use
+  // thinkingLevel, so match family + tier rather than just "flash".
+  const txIsFlash = /\bgemini-[3-9](?:\.\d+)?-flash/i.test(transcriptionModel);
 
   async function transcribeAudio({
     audio,
@@ -73,9 +87,12 @@ export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {
       let result;
       for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
         result = await ai.models.generateContent({
-          model: TRANSCRIPTION_MODEL,
+          model: transcriptionModel,
           config: {
-            thinkingConfig: { thinkingLevel: "minimal" },
+            // thinkingLevel: "minimal" is only valid for Flash. Pro
+            // models reject it. Skip when the operator picks a Pro
+            // model for transcription (slower but valid).
+            ...(txIsFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}),
             safetySettings: TRANSCRIPTION_SAFETY,
           },
           contents: [
@@ -111,7 +128,7 @@ export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {
 
   async function analyzeText({ prompt }) {
     const result = await ai.models.generateContent({
-      model: ANALYSIS_MODEL,
+      model: analysisModel,
       contents: [
         {
           role: "user",
diff --git a/server/config.js b/server/config.js
index d62a6cb..f5cf1fa 100644
--- a/server/config.js
+++ b/server/config.js
@@ -18,6 +18,10 @@ function defaultConfig() {
     relay_gemma_base_url: "",
     relay_parakeet_model: "parakeet-tdt-0.6b-v3",
     relay_gemma_model: "gemma3:27b",
+    relay_gemini_transcription_model: "gemini-3-flash-preview",
+    relay_gemini_analysis_model: "gemini-3.1-pro-preview",
+    relay_transcribe_backend_preference: "gemini_first",
+    relay_analyze_backend_preference: "gemini_first",
     relay_keysat_base_url: "https://keysat.xyz",
     relay_admin_username: "",
     relay_admin_password_hash: "",
diff --git a/server/credits.js b/server/credits.js
index be02f4f..c8625cb 100644
--- a/server/credits.js
+++ b/server/credits.js
@@ -250,7 +250,24 @@ export function computeRemaining(row, quota) {
 // served at all. Returns { allowed, backend: "gemini"|"hardware",
 // reason }. Does NOT debit — that's a separate commit step after the
 // backend call succeeds.
-export function planBackend(row, quota, { hasHardware }) {
+//
+// `preference` is the operator-configured routing strategy for the
+// current pipeline step (transcribe or analyze), one of:
+//   - "gemini_first"   try Gemini until cap is exceeded, then hardware
+//                      (default — best quality routing on operator's
+//                      Gemini budget, hardware as overflow)
+//   - "hardware_first" try hardware first, fall back to Gemini when
+//                      hardware isn't configured (lets the operator
+//                      conserve Gemini budget for premium use cases)
+//   - "gemini_only"    Gemini only, fail when cap exceeded (caps the
+//                      operator's spend at the per-tier limit)
+//   - "hardware_only"  Hardware only, fail when not configured (good
+//                      for fully local / offline deployments)
+//
+// The Gemini cap (geminiCapMonthly / geminiCapLifetime on the tier
+// quota) still gates every Gemini pick — preference only decides which
+// backends are eligible and the order in which they are considered.
+export function planBackend(row, quota, { hasHardware, preference = "gemini_first" }) {
   const balance = computeRemaining(row, quota);
 
   // Out of credits entirely?
@@ -258,15 +275,57 @@ export function planBackend(row, quota, { hasHardware }) {
     return { allowed: false, backend: null, reason: "out_of_credits" };
   }
 
-  // Pick backend: Gemini if there's room under the Gemini cap; else
-  // fall back to hardware if configured; else 402.
-  if (balance.gemini_remaining === null || balance.gemini_remaining > 0) {
-    return { allowed: true, backend: "gemini", reason: null };
+  const geminiAvailable =
+    balance.gemini_remaining === null || balance.gemini_remaining > 0;
+
+  switch (preference) {
+    case "hardware_only":
+      if (hasHardware) {
+        return { allowed: true, backend: "hardware", reason: null };
+      }
+      return {
+        allowed: false,
+        backend: null,
+        reason: "hardware_only_not_configured",
+      };
+
+    case "gemini_only":
+      if (geminiAvailable) {
+        return { allowed: true, backend: "gemini", reason: null };
+      }
+      return {
+        allowed: false,
+        backend: null,
+        reason: "gemini_cap_exceeded_no_fallback",
+      };
+
+    case "hardware_first":
+      if (hasHardware) {
+        return { allowed: true, backend: "hardware", reason: null };
+      }
+      if (geminiAvailable) {
+        return { allowed: true, backend: "gemini", reason: null };
+      }
+      return {
+        allowed: false,
+        backend: null,
+        reason: "no_backend_available",
+      };
+
+    case "gemini_first":
+    default:
+      if (geminiAvailable) {
+        return { allowed: true, backend: "gemini", reason: null };
+      }
+      if (hasHardware) {
+        return { allowed: true, backend: "hardware", reason: null };
+      }
+      return {
+        allowed: false,
+        backend: null,
+        reason: "gemini_cap_exceeded_no_hardware",
+      };
   }
-  if (hasHardware) {
-    return { allowed: true, backend: "hardware", reason: null };
-  }
-  return { allowed: false, backend: null, reason: "gemini_cap_exceeded_no_hardware" };
 }
 
 // Debit one credit on a successful call. Persists immediately.
diff --git a/server/package.json b/server/package.json
index 13f06ce..550c817 100644
--- a/server/package.json
+++ b/server/package.json
@@ -1,6 +1,6 @@
 {
   "name": "recap-relay-server",
-  "version": "0.2.6",
+  "version": "0.2.7",
   "type": "module",
   "private": true,
   "dependencies": {
diff --git a/server/routes/analyze.js b/server/routes/analyze.js
index 517c037..4aa5608 100644
--- a/server/routes/analyze.js
+++ b/server/routes/analyze.js
@@ -61,7 +61,9 @@ export function analyzeRouter() {
       const cfg = await getConfigSnapshot();
       const hasHardware = !!cfg.relay_gemma_base_url;
       const quota = await getTierQuotas();
-      const plan = planBackend(row, quota, { hasHardware });
+      const preference =
+        cfg.relay_analyze_backend_preference || "gemini_first";
+      const plan = planBackend(row, quota, { hasHardware, preference });
       if (!plan.allowed) {
         const e = await errorEnvelope({
           error: plan.reason,
@@ -78,7 +80,11 @@ export function analyzeRouter() {
     let result;
     try {
       if (chosenBackend === "gemini") {
-        const backend = createGeminiBackend({ apiKey: cfg.relay_gemini_api_key });
+        const backend = createGeminiBackend({
+          apiKey: cfg.relay_gemini_api_key,
+          transcriptionModel: cfg.relay_gemini_transcription_model,
+          analysisModel: cfg.relay_gemini_analysis_model,
+        });
         result = await backend.analyzeText({ prompt });
       } else {
         const backend = createHardwareBackend({
diff --git a/server/routes/transcribe.js b/server/routes/transcribe.js
index bb5b03c..700ee7c 100644
--- a/server/routes/transcribe.js
+++ b/server/routes/transcribe.js
@@ -78,7 +78,9 @@ export function transcribeRouter() {
       const cfg = await getConfigSnapshot();
       const hasHardware = !!cfg.relay_parakeet_base_url;
       const quota = await getTierQuotas();
-      const plan = planBackend(row, quota, { hasHardware });
+      const preference =
+        cfg.relay_transcribe_backend_preference || "gemini_first";
+      const plan = planBackend(row, quota, { hasHardware, preference });
       if (!plan.allowed) {
         const e = await errorEnvelope({
           error: plan.reason,
@@ -96,7 +98,11 @@ export function transcribeRouter() {
     let result;
     try {
       if (chosenBackend === "gemini") {
-        const backend = createGeminiBackend({ apiKey: cfg.relay_gemini_api_key });
+        const backend = createGeminiBackend({
+          apiKey: cfg.relay_gemini_api_key,
+          transcriptionModel: cfg.relay_gemini_transcription_model,
+          analysisModel: cfg.relay_gemini_analysis_model,
+        });
         result = await backend.transcribeAudio({
           audio: req.file.buffer,
           mimeType: req.body?.mime_type || req.file.mimetype || "application/octet-stream",
diff --git a/startos/actions/index.ts b/startos/actions/index.ts
index ed9e1e7..81321cf 100644
--- a/startos/actions/index.ts
+++ b/startos/actions/index.ts
@@ -5,11 +5,13 @@ import { setParakeetUrl } from './setParakeetUrl'
 import { setGemmaUrl } from './setGemmaUrl'
 import { setAdminPassword } from './setAdminPassword'
 import { adjustTierQuotas } from './adjustTierQuotas'
+import { setBackendRouting } from './setBackendRouting'
 
 export const actions = sdk.Actions.of()
   .addAction(setGeminiKey)
   .addAction(setKeysatBaseUrl)
   .addAction(setParakeetUrl)
   .addAction(setGemmaUrl)
+  .addAction(setBackendRouting)
   .addAction(setAdminPassword)
   .addAction(adjustTierQuotas)
diff --git a/startos/actions/setBackendRouting.ts b/startos/actions/setBackendRouting.ts
new file mode 100644
index 0000000..0d0ea13
--- /dev/null
+++ b/startos/actions/setBackendRouting.ts
@@ -0,0 +1,116 @@
+import { sdk } from '../sdk'
+import { configFile } from '../file-models/config.json'
+
+const { InputSpec, Value } = sdk
+
+// Lets the operator tune which backend gets tried first per pipeline
+// step (transcribe vs analyze) AND which Gemini SKU is used when
+// Gemini is the backend. All four knobs live-reload — change them
+// via this action and the next relay request honors the new values
+// without a daemon restart.
+
+const inputSpec = InputSpec.of({
+  // ── Gemini model selection: required free-text model ids, 1-128 chars ──
+  relay_gemini_transcription_model: Value.text({
+    name: 'Gemini Transcription Model',
+    description:
+      "The Gemini SKU used when a transcription request is routed to Gemini. Flash is recommended (cheap, fast, multimodal). Examples: gemini-3-flash-preview (default), gemini-2.5-flash, gemini-2.0-flash, gemini-3-pro-preview (slower + pricier but higher quality on edge cases).",
+    required: true,
+    default: 'gemini-3-flash-preview',
+    minLength: 1,
+    maxLength: 128,
+  }),
+  relay_gemini_analysis_model: Value.text({
+    name: 'Gemini Analysis Model',
+    description:
+      "The Gemini SKU used when an analysis request is routed to Gemini. Pro is the default for higher-quality structured output. Swap to a flash SKU (e.g. gemini-3-flash-preview) for faster + cheaper analysis at some loss of section-boundary precision.",
+    required: true,
+    default: 'gemini-3.1-pro-preview',
+    minLength: 1,
+    maxLength: 128,
+  }),
+
+  // ── Backend routing preference per pipeline: one select per step, both default to gemini_first ──
+  relay_transcribe_backend_preference: Value.select({
+    name: 'Transcribe Backend Preference',
+    description:
+      'Routing strategy for transcription requests. The selected option controls the ORDER in which the relay tries each backend. The Gemini per-tier cap still applies regardless of this setting.',
+    default: 'gemini_first',
+    values: {
+      gemini_first:
+        'Gemini first → operator hardware (Parakeet) when cap exceeded',
+      hardware_first: 'Operator hardware first → Gemini as fallback',
+      gemini_only: 'Gemini only — fail when cap is exceeded',
+      hardware_only:
+        'Hardware only — fail when no Parakeet endpoint is configured',
+    },
+  }),
+  relay_analyze_backend_preference: Value.select({
+    name: 'Analyze Backend Preference',
+    description:
+      'Routing strategy for analysis requests. Same options as transcription but applies to the analyze step independently — you can route transcribe to hardware and analyze to Gemini, or vice versa.',
+    default: 'gemini_first',
+    values: {
+      gemini_first:
+        'Gemini first → operator hardware (Gemma) when cap exceeded',
+      hardware_first: 'Operator hardware first → Gemini as fallback',
+      gemini_only: 'Gemini only — fail when cap is exceeded',
+      hardware_only:
+        'Hardware only — fail when no Gemma endpoint is configured',
+    },
+  }),
+})
+
+// Routing-preference literals offered by the two select inputs.
+type BackendPreference = 'gemini_first' | 'hardware_first' | 'gemini_only' | 'hardware_only'
+
+// StartOS action that edits the two Gemini model ids and the two
+// per-pipeline backend preferences stored in the relay config file.
+export const setBackendRouting = sdk.Action.withInput(
+  'set-backend-routing',
+
+  async ({ effects }) => ({
+    name: 'Set Backend Routing & Models',
+    description:
+      "Tune which Gemini SKUs the relay uses and the per-pipeline backend pecking order. Live-reloaded — changes take effect on the next request, no restart.",
+    warning: null,
+    allowedStatuses: 'any',
+    group: 'AI Backends',
+    visibility: 'enabled',
+  }),
+
+  inputSpec,
+
+  // Prefill the form from the saved config; unset or empty values fall
+  // back to the shipped defaults.
+  async ({ effects }) => {
+    const config = await configFile.read().once()
+    return {
+      relay_gemini_transcription_model:
+        config?.relay_gemini_transcription_model || 'gemini-3-flash-preview',
+      relay_gemini_analysis_model:
+        config?.relay_gemini_analysis_model || 'gemini-3.1-pro-preview',
+      relay_transcribe_backend_preference:
+        (config?.relay_transcribe_backend_preference as BackendPreference | undefined) ||
+        'gemini_first',
+      relay_analyze_backend_preference:
+        (config?.relay_analyze_backend_preference as BackendPreference | undefined) ||
+        'gemini_first',
+    }
+  },
+
+  // Persist the submitted values. Model ids are trimmed BEFORE the
+  // default fallback: a whitespace-only entry passes minLength: 1, and
+  // trimming after the fallback would save it as an empty model id.
+  async ({ effects, input }) => {
+    await configFile.merge(effects, {
+      relay_gemini_transcription_model:
+        input.relay_gemini_transcription_model?.trim() || 'gemini-3-flash-preview',
+      relay_gemini_analysis_model:
+        input.relay_gemini_analysis_model?.trim() || 'gemini-3.1-pro-preview',
+      relay_transcribe_backend_preference: input.relay_transcribe_backend_preference,
+      relay_analyze_backend_preference: input.relay_analyze_backend_preference,
+    })
+    return null
+  },
+)
diff --git a/startos/file-models/config.json.ts b/startos/file-models/config.json.ts
index 39bc986..1acfda6 100644
--- a/startos/file-models/config.json.ts
+++ b/startos/file-models/config.json.ts
@@ -36,6 +36,33 @@ export const configFile = FileHelper.json(
     relay_parakeet_model: z.string().default('parakeet-tdt-0.6b-v3'),
     relay_gemma_model: z.string().default('gemma3:27b'),
 
+    // ── Gemini model selection ──
+    // Operator can pick which Gemini SKU is used per pipeline step
+    // without rebuilding the relay. Defaults match Google's typical
+    // recommendations: Flash for transcription (cheap, fast,
+    // multimodal-capable), Pro for analysis (higher quality on
+    // structured-JSON outputs). Operators can swap to flash for
+    // analysis when they want faster + cheaper at the cost of some
+    // section-boundary precision.
+    relay_gemini_transcription_model: z.string().default('gemini-3-flash-preview'),
+    relay_gemini_analysis_model: z.string().default('gemini-3.1-pro-preview'),
+
+    // ── Backend routing preference per pipeline ──
+    // Controls whether the relay tries Gemini first (current default —
+    // best quality, costs operator's Gemini API budget) or the
+    // operator-hardware backend first (saves Gemini budget, may be
+    // slower depending on the operator's hardware). One of:
+    //   - "gemini_first"   try Gemini until per-tier cap, then hardware
+    //   - "hardware_first" try hardware first, fall back to Gemini
+    //   - "gemini_only"    Gemini only, fail when cap is exceeded
+    //   - "hardware_only"  Hardware only, fail when not configured
+    relay_transcribe_backend_preference: z
+      .enum(['gemini_first', 'hardware_first', 'gemini_only', 'hardware_only'])
+      .default('gemini_first'),
+    relay_analyze_backend_preference: z
+      .enum(['gemini_first', 'hardware_first', 'gemini_only', 'hardware_only'])
+      .default('gemini_first'),
+
     // ── License server ──
     // URL of the Keysat license server used for the cached online
     // license-validation check. Defaults to the public endpoint;
diff --git a/startos/versions/index.ts b/startos/versions/index.ts
index c823859..0329fef 100644
--- a/startos/versions/index.ts
+++ b/startos/versions/index.ts
@@ -7,8 +7,9 @@ import { v_0_2_3 } from './v0.2.3'
 import { v_0_2_4 } from './v0.2.4'
 import { v_0_2_5 } from './v0.2.5'
 import { v_0_2_6 } from './v0.2.6'
+import { v_0_2_7 } from './v0.2.7'
 
 export const versionGraph = VersionGraph.of({
-  current: v_0_2_6,
-  other: [v_0_2_5, v_0_2_4, v_0_2_3, v_0_2_2, v_0_2_1, v_0_2_0, v_0_1_0],
+  current: v_0_2_7,
+  other: [v_0_2_6, v_0_2_5, v_0_2_4, v_0_2_3, v_0_2_2, v_0_2_1, v_0_2_0, v_0_1_0],
 })
diff --git a/startos/versions/v0.2.7.ts b/startos/versions/v0.2.7.ts
new file mode 100644
index 0000000..112b74e
--- /dev/null
+++ b/startos/versions/v0.2.7.ts
@@ -0,0 +1,13 @@
+import { VersionInfo } from '@start9labs/start-sdk'
+
+export const v_0_2_7 = VersionInfo.of({
+  version: '0.2.7:0',
+  releaseNotes: {
+    en_US:
+      'New "Set Backend Routing & Models" action exposes four operator knobs: Gemini transcription model, Gemini analysis model, transcribe backend preference (gemini_first / hardware_first / gemini_only / hardware_only), and analyze backend preference. Routing strategies are honored by planBackend per-pipeline, so the operator can route transcribe to Parakeet for speed AND analyze through Gemini Flash for faster + cheaper structured output, or any other combination.',
+  },
+  migrations: {
+    up: async ({ effects }) => {}, // no-op: the four new config keys declare schema defaults
+    down: async ({ effects }) => {}, // no-op
+  },
+})