v0.2.7 configurable Gemini models + per-pipeline backend preference

2026-05-12 00:15:07 -05:00
parent cd377683fb
commit 9af70302b1
11 changed files with 273 additions and 22 deletions
@@ -18,8 +18,13 @@ import fs from "fs/promises";
 import os from "os";
 import path from "path";

-const TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
-const ANALYSIS_MODEL = "gemini-3.1-pro-preview";
+// Defaults used only when the caller doesn't supply explicit model
+// names. Production callers should pass models pulled from
+// relay_gemini_transcription_model / relay_gemini_analysis_model in
+// the relay config so the operator can swap SKUs (e.g. flash for
+// analysis) without rebuilding the relay.
+const DEFAULT_TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
+const DEFAULT_ANALYSIS_MODEL = "gemini-3.1-pro-preview";
 const EMPTY_RETRIES = 3;

 const TRANSCRIPTION_SAFETY = [
@@ -29,7 +34,12 @@ const TRANSCRIPTION_SAFETY = [
  { category: "HARM_CATEGORY_DANGEROUS_CONTENT", threshold: "BLOCK_NONE" },
 ];

-export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {
+export function createGeminiBackend({
+  apiKey,
+  transcriptionModel = DEFAULT_TRANSCRIPTION_MODEL,
+  analysisModel = DEFAULT_ANALYSIS_MODEL,
+  timeoutMs = 900_000,
+} = {}) {
  if (!apiKey) {
    throw new Error("createGeminiBackend: apiKey is required");
  }
@@ -37,6 +47,10 @@ export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {
    apiKey,
    httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
  });
+  // Flash models accept `thinkingLevel: "minimal"`; Pro models reject
+  // it. Detect from the model id so the operator can flip flash <-> pro
+  // via the StartOS action without breaking the request.
+  const txIsFlash = /flash/i.test(transcriptionModel);

  async function transcribeAudio({
    audio,
@@ -73,9 +87,12 @@ export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {
      let result;
      for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
        result = await ai.models.generateContent({
-          model: TRANSCRIPTION_MODEL,
+          model: transcriptionModel,
          config: {
-            thinkingConfig: { thinkingLevel: "minimal" },
+            // thinkingLevel: "minimal" is only valid for Flash. Pro
+            // models reject it. Skip when the operator picks a Pro
+            // model for transcription (slower but valid).
+            ...(txIsFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}),
            safetySettings: TRANSCRIPTION_SAFETY,
          },
          contents: [
@@ -111,7 +128,7 @@ export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {

  async function analyzeText({ prompt }) {
    const result = await ai.models.generateContent({
-      model: ANALYSIS_MODEL,
+      model: analysisModel,
      contents: [
        {
          role: "user",
@@ -18,6 +18,10 @@ function defaultConfig() {
    relay_gemma_base_url: "",
    relay_parakeet_model: "parakeet-tdt-0.6b-v3",
    relay_gemma_model: "gemma3:27b",
+    relay_gemini_transcription_model: "gemini-3-flash-preview",
+    relay_gemini_analysis_model: "gemini-3.1-pro-preview",
+    relay_transcribe_backend_preference: "gemini_first",
+    relay_analyze_backend_preference: "gemini_first",
    relay_keysat_base_url: "https://keysat.xyz",
    relay_admin_username: "",
    relay_admin_password_hash: "",
@@ -250,7 +250,24 @@ export function computeRemaining(row, quota) {
 // served at all. Returns { allowed, backend: "gemini"|"hardware",
 // reason }. Does NOT debit — that's a separate commit step after the
 // backend call succeeds.
-export function planBackend(row, quota, { hasHardware }) {
+//
+// `preference` is the operator-configured routing strategy for the
+// current pipeline step (transcribe or analyze), one of:
+//   - "gemini_first"   try Gemini until cap is exceeded, then hardware
+//                      (default — best quality routing on operator's
+//                      Gemini budget, hardware as overflow)
+//   - "hardware_first" try hardware first, fall back to Gemini when
+//                      hardware isn't configured (lets the operator
+//                      conserve Gemini budget for premium use cases)
+//   - "gemini_only"    Gemini only, fail when cap exceeded (caps the
+//                      operator's spend at the per-tier limit)
+//   - "hardware_only"  Hardware only, fail when not configured (good
+//                      for fully local / offline deployments)
+//
+// The Gemini cap (geminiCapMonthly / geminiCapLifetime on the tier
+// quota) is enforced whenever Gemini is a candidate — preference only
+// controls which backends are eligible and in what order.
+export function planBackend(row, quota, { hasHardware, preference = "gemini_first" }) {
  const balance = computeRemaining(row, quota);

  // Out of credits entirely?
@@ -258,15 +275,57 @@ export function planBackend(row, quota, { hasHardware }) {
    return { allowed: false, backend: null, reason: "out_of_credits" };
  }

-  // Pick backend: Gemini if there's room under the Gemini cap; else
-  // fall back to hardware if configured; else 402.
-  if (balance.gemini_remaining === null || balance.gemini_remaining > 0) {
-    return { allowed: true, backend: "gemini", reason: null };
+  const geminiAvailable =
+    balance.gemini_remaining === null || balance.gemini_remaining > 0;
+
+  switch (preference) {
+    case "hardware_only":
+      if (hasHardware) {
+        return { allowed: true, backend: "hardware", reason: null };
+      }
+      return {
+        allowed: false,
+        backend: null,
+        reason: "hardware_only_not_configured",
+      };
+
+    case "gemini_only":
+      if (geminiAvailable) {
+        return { allowed: true, backend: "gemini", reason: null };
+      }
+      return {
+        allowed: false,
+        backend: null,
+        reason: "gemini_cap_exceeded_no_fallback",
+      };
+
+    case "hardware_first":
+      if (hasHardware) {
+        return { allowed: true, backend: "hardware", reason: null };
+      }
+      if (geminiAvailable) {
+        return { allowed: true, backend: "gemini", reason: null };
+      }
+      return {
+        allowed: false,
+        backend: null,
+        reason: "no_backend_available",
+      };
+
+    case "gemini_first":
+    default:
+      if (geminiAvailable) {
+        return { allowed: true, backend: "gemini", reason: null };
+      }
+      if (hasHardware) {
+        return { allowed: true, backend: "hardware", reason: null };
+      }
+      return {
+        allowed: false,
+        backend: null,
+        reason: "gemini_cap_exceeded_no_hardware",
+      };
  }
-  if (hasHardware) {
-    return { allowed: true, backend: "hardware", reason: null };
-  }
-  return { allowed: false, backend: null, reason: "gemini_cap_exceeded_no_hardware" };
 }

 // Debit one credit on a successful call. Persists immediately.
@@ -1,6 +1,6 @@
 {
  "name": "recap-relay-server",
-  "version": "0.2.6",
+  "version": "0.2.7",
  "type": "module",
  "private": true,
  "dependencies": {
@@ -61,7 +61,9 @@ export function analyzeRouter() {
      const cfg = await getConfigSnapshot();
      const hasHardware = !!cfg.relay_gemma_base_url;
      const quota = await getTierQuotas();
-      const plan = planBackend(row, quota, { hasHardware });
+      const preference =
+        cfg.relay_analyze_backend_preference || "gemini_first";
+      const plan = planBackend(row, quota, { hasHardware, preference });
      if (!plan.allowed) {
        const e = await errorEnvelope({
          error: plan.reason,
@@ -78,7 +80,11 @@ export function analyzeRouter() {
    let result;
    try {
      if (chosenBackend === "gemini") {
-        const backend = createGeminiBackend({ apiKey: cfg.relay_gemini_api_key });
+        const backend = createGeminiBackend({
+          apiKey: cfg.relay_gemini_api_key,
+          transcriptionModel: cfg.relay_gemini_transcription_model,
+          analysisModel: cfg.relay_gemini_analysis_model,
+        });
        result = await backend.analyzeText({ prompt });
      } else {
        const backend = createHardwareBackend({
@@ -78,7 +78,9 @@ export function transcribeRouter() {
      const cfg = await getConfigSnapshot();
      const hasHardware = !!cfg.relay_parakeet_base_url;
      const quota = await getTierQuotas();
-      const plan = planBackend(row, quota, { hasHardware });
+      const preference =
+        cfg.relay_transcribe_backend_preference || "gemini_first";
+      const plan = planBackend(row, quota, { hasHardware, preference });
      if (!plan.allowed) {
        const e = await errorEnvelope({
          error: plan.reason,
@@ -96,7 +98,11 @@ export function transcribeRouter() {
    let result;
    try {
      if (chosenBackend === "gemini") {
-        const backend = createGeminiBackend({ apiKey: cfg.relay_gemini_api_key });
+        const backend = createGeminiBackend({
+          apiKey: cfg.relay_gemini_api_key,
+          transcriptionModel: cfg.relay_gemini_transcription_model,
+          analysisModel: cfg.relay_gemini_analysis_model,
+        });
        result = await backend.transcribeAudio({
          audio: req.file.buffer,
          mimeType: req.body?.mime_type || req.file.mimetype || "application/octet-stream",