From 9af70302b1975bbfdebb50d101efaef0b0742287 Mon Sep 17 00:00:00 2001 From: local Date: Tue, 12 May 2026 00:15:07 -0500 Subject: [PATCH] v0.2.7 configurable Gemini models + per-pipeline backend preference --- server/backends/gemini.js | 29 +++++-- server/config.js | 4 + server/credits.js | 77 +++++++++++++++--- server/package.json | 2 +- server/routes/analyze.js | 10 ++- server/routes/transcribe.js | 10 ++- startos/actions/index.ts | 2 + startos/actions/setBackendRouting.ts | 116 +++++++++++++++++++++++++++ startos/file-models/config.json.ts | 27 +++++++ startos/versions/index.ts | 5 +- startos/versions/v0.2.7.ts | 13 +++ 11 files changed, 273 insertions(+), 22 deletions(-) create mode 100644 startos/actions/setBackendRouting.ts create mode 100644 startos/versions/v0.2.7.ts diff --git a/server/backends/gemini.js b/server/backends/gemini.js index 311ab4f..a1d4351 100644 --- a/server/backends/gemini.js +++ b/server/backends/gemini.js @@ -18,8 +18,13 @@ import fs from "fs/promises"; import os from "os"; import path from "path"; -const TRANSCRIPTION_MODEL = "gemini-3-flash-preview"; -const ANALYSIS_MODEL = "gemini-3.1-pro-preview"; +// Defaults used only when the caller doesn't supply explicit model +// names. Production callers should pass models pulled from +// relay_gemini_transcription_model / relay_gemini_analysis_model in +// the relay config so the operator can swap SKUs (e.g. flash for +// analysis) without rebuilding the relay. +const DEFAULT_TRANSCRIPTION_MODEL = "gemini-3-flash-preview"; +const DEFAULT_ANALYSIS_MODEL = "gemini-3.1-pro-preview"; const EMPTY_RETRIES = 3; const TRANSCRIPTION_SAFETY = [ @@ -29,7 +34,12 @@ const TRANSCRIPTION_SAFETY = [ { category: "HARM_CATEGORY_DANGEROUS_CONTENT", threshold: "BLOCK_NONE" }, ]; -export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) { +export function createGeminiBackend({ + apiKey, + transcriptionModel = DEFAULT_TRANSCRIPTION_MODEL, + analysisModel = DEFAULT_ANALYSIS_MODEL, + timeoutMs = 900_000, +} = {}) { if (!apiKey) { throw new Error("createGeminiBackend: apiKey is required"); } @@ -37,6 +47,10 @@ export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) { apiKey, httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs }, }); + // Flash models accept `thinkingLevel: "minimal"`; Pro models reject + // it. Detect from the model id so the operator can flip flash <-> pro + // via the StartOS action without breaking the request. + const txIsFlash = /flash/i.test(transcriptionModel); async function transcribeAudio({ audio, @@ -73,9 +87,12 @@ export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) { let result; for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) { result = await ai.models.generateContent({ - model: TRANSCRIPTION_MODEL, + model: transcriptionModel, config: { - thinkingConfig: { thinkingLevel: "minimal" }, + // thinkingLevel: "minimal" is only valid for Flash. Pro + // models reject it. Skip when the operator picks a Pro + // model for transcription (slower but valid). + ...(txIsFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}), safetySettings: TRANSCRIPTION_SAFETY, }, contents: [ @@ -111,7 +128,7 @@ export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) { async function analyzeText({ prompt }) { const result = await ai.models.generateContent({ - model: ANALYSIS_MODEL, + model: analysisModel, contents: [ { role: "user", diff --git a/server/config.js b/server/config.js index d62a6cb..f5cf1fa 100644 --- a/server/config.js +++ b/server/config.js @@ -18,6 +18,10 @@ function defaultConfig() { relay_gemma_base_url: "", relay_parakeet_model: "parakeet-tdt-0.6b-v3", relay_gemma_model: "gemma3:27b", + relay_gemini_transcription_model: "gemini-3-flash-preview", + relay_gemini_analysis_model: "gemini-3.1-pro-preview", + relay_transcribe_backend_preference: "gemini_first", + relay_analyze_backend_preference: "gemini_first", relay_keysat_base_url: "https://keysat.xyz", relay_admin_username: "", relay_admin_password_hash: "", diff --git a/server/credits.js b/server/credits.js index be02f4f..c8625cb 100644 --- a/server/credits.js +++ b/server/credits.js @@ -250,7 +250,24 @@ export function computeRemaining(row, quota) { // served at all. Returns { allowed, backend: "gemini"|"hardware", // reason }. Does NOT debit — that's a separate commit step after the // backend call succeeds. -export function planBackend(row, quota, { hasHardware }) { +// +// `preference` is the operator-configured routing strategy for the +// current pipeline step (transcribe or analyze), one of: +// - "gemini_first" try Gemini until cap is exceeded, then hardware +// (default — best quality routing on operator's +// Gemini budget, hardware as overflow) +// - "hardware_first" try hardware first, fall back to Gemini when +// hardware isn't configured (lets the operator +// conserve Gemini budget for premium use cases) +// - "gemini_only" Gemini only, fail when cap exceeded (caps the +// operator's spend at the per-tier limit) +// - "hardware_only" Hardware only, fail when not configured (good +// for fully local / offline deployments) +// +// The Gemini cap (geminiCapMonthly / geminiCapLifetime on the tier +// quota) still applies regardless of preference — preference just +// controls the order in which backends are tried. +export function planBackend(row, quota, { hasHardware, preference = "gemini_first" }) { const balance = computeRemaining(row, quota); // Out of credits entirely? @@ -258,15 +275,57 @@ export function planBackend(row, quota, { hasHardware }) { return { allowed: false, backend: null, reason: "out_of_credits" }; } - // Pick backend: Gemini if there's room under the Gemini cap; else - // fall back to hardware if configured; else 402. - if (balance.gemini_remaining === null || balance.gemini_remaining > 0) { - return { allowed: true, backend: "gemini", reason: null }; + const geminiAvailable = + balance.gemini_remaining === null || balance.gemini_remaining > 0; + + switch (preference) { + case "hardware_only": + if (hasHardware) { + return { allowed: true, backend: "hardware", reason: null }; + } + return { + allowed: false, + backend: null, + reason: "hardware_only_not_configured", + }; + + case "gemini_only": + if (geminiAvailable) { + return { allowed: true, backend: "gemini", reason: null }; + } + return { + allowed: false, + backend: null, + reason: "gemini_cap_exceeded_no_fallback", + }; + + case "hardware_first": + if (hasHardware) { + return { allowed: true, backend: "hardware", reason: null }; + } + if (geminiAvailable) { + return { allowed: true, backend: "gemini", reason: null }; + } + return { + allowed: false, + backend: null, + reason: "no_backend_available", + }; + + case "gemini_first": + default: + if (geminiAvailable) { + return { allowed: true, backend: "gemini", reason: null }; + } + if (hasHardware) { + return { allowed: true, backend: "hardware", reason: null }; + } + return { + allowed: false, + backend: null, + reason: "gemini_cap_exceeded_no_hardware", + }; } - if (hasHardware) { - return { allowed: true, backend: "hardware", reason: null }; - } - return { allowed: false, backend: null, reason: "gemini_cap_exceeded_no_hardware" }; } // Debit one credit on a successful call. Persists immediately. diff --git a/server/package.json b/server/package.json index 13f06ce..550c817 100644 --- a/server/package.json +++ b/server/package.json @@ -1,6 +1,6 @@ { "name": "recap-relay-server", - "version": "0.2.6", + "version": "0.2.7", "type": "module", "private": true, "dependencies": { diff --git a/server/routes/analyze.js b/server/routes/analyze.js index 517c037..4aa5608 100644 --- a/server/routes/analyze.js +++ b/server/routes/analyze.js @@ -61,7 +61,9 @@ export function analyzeRouter() { const cfg = await getConfigSnapshot(); const hasHardware = !!cfg.relay_gemma_base_url; const quota = await getTierQuotas(); - const plan = planBackend(row, quota, { hasHardware }); + const preference = + cfg.relay_analyze_backend_preference || "gemini_first"; + const plan = planBackend(row, quota, { hasHardware, preference }); if (!plan.allowed) { const e = await errorEnvelope({ error: plan.reason, @@ -78,7 +80,11 @@ export function analyzeRouter() { let result; try { if (chosenBackend === "gemini") { - const backend = createGeminiBackend({ apiKey: cfg.relay_gemini_api_key }); + const backend = createGeminiBackend({ + apiKey: cfg.relay_gemini_api_key, + transcriptionModel: cfg.relay_gemini_transcription_model, + analysisModel: cfg.relay_gemini_analysis_model, + }); result = await backend.analyzeText({ prompt }); } else { const backend = createHardwareBackend({ diff --git a/server/routes/transcribe.js b/server/routes/transcribe.js index bb5b03c..700ee7c 100644 --- a/server/routes/transcribe.js +++ b/server/routes/transcribe.js @@ -78,7 +78,9 @@ export function transcribeRouter() { const cfg = await getConfigSnapshot(); const hasHardware = !!cfg.relay_parakeet_base_url; const quota = await getTierQuotas(); - const plan = planBackend(row, quota, { hasHardware }); + const preference = + cfg.relay_transcribe_backend_preference || "gemini_first"; + const plan = planBackend(row, quota, { hasHardware, preference }); if (!plan.allowed) { const e = await errorEnvelope({ error: plan.reason, @@ -96,7 +98,11 @@ export function transcribeRouter() { let result; try { if (chosenBackend === "gemini") { - const backend = createGeminiBackend({ apiKey: cfg.relay_gemini_api_key }); + const backend = createGeminiBackend({ + apiKey: cfg.relay_gemini_api_key, + transcriptionModel: cfg.relay_gemini_transcription_model, + analysisModel: cfg.relay_gemini_analysis_model, + }); result = await backend.transcribeAudio({ audio: req.file.buffer, mimeType: req.body?.mime_type || req.file.mimetype || "application/octet-stream", diff --git a/startos/actions/index.ts b/startos/actions/index.ts index ed9e1e7..81321cf 100644 --- a/startos/actions/index.ts +++ b/startos/actions/index.ts @@ -5,11 +5,13 @@ import { setParakeetUrl } from './setParakeetUrl' import { setGemmaUrl } from './setGemmaUrl' import { setAdminPassword } from './setAdminPassword' import { adjustTierQuotas } from './adjustTierQuotas' +import { setBackendRouting } from './setBackendRouting' export const actions = sdk.Actions.of() .addAction(setGeminiKey) .addAction(setKeysatBaseUrl) .addAction(setParakeetUrl) .addAction(setGemmaUrl) + .addAction(setBackendRouting) .addAction(setAdminPassword) .addAction(adjustTierQuotas) diff --git a/startos/actions/setBackendRouting.ts b/startos/actions/setBackendRouting.ts new file mode 100644 index 0000000..0d0ea13 --- /dev/null +++ b/startos/actions/setBackendRouting.ts @@ -0,0 +1,116 @@ +import { sdk } from '../sdk' +import { configFile } from '../file-models/config.json' + +const { InputSpec, Value } = sdk + +// Lets the operator tune which backend gets tried first per pipeline +// step (transcribe vs analyze) AND which Gemini SKU is used when +// Gemini is the backend. All four knobs live-reload — change them +// via this action and the next relay request honors the new values +// without a daemon restart. + +const inputSpec = InputSpec.of({ + // ── Gemini model selection ── + relay_gemini_transcription_model: Value.text({ + name: 'Gemini Transcription Model', + description: + "The Gemini SKU used when a transcription request is routed to Gemini. Flash is recommended (cheap, fast, multimodal). Examples: gemini-3-flash-preview (default), gemini-2.5-flash, gemini-2.0-flash, gemini-3-pro-preview (slower + pricier but higher quality on edge cases).", + required: true, + default: 'gemini-3-flash-preview', + minLength: 1, + maxLength: 128, + }), + relay_gemini_analysis_model: Value.text({ + name: 'Gemini Analysis Model', + description: + "The Gemini SKU used when an analysis request is routed to Gemini. Pro is the default for higher-quality structured output. Swap to a flash SKU (e.g. gemini-3-flash-preview) for faster + cheaper analysis at some loss of section-boundary precision.", + required: true, + default: 'gemini-3.1-pro-preview', + minLength: 1, + maxLength: 128, + }), + + // ── Backend routing preference per pipeline ── + relay_transcribe_backend_preference: Value.select({ + name: 'Transcribe Backend Preference', + description: + 'Routing strategy for transcription requests. The selected option controls the ORDER in which the relay tries each backend. The Gemini per-tier cap still applies regardless of this setting.', + default: 'gemini_first', + values: { + gemini_first: + 'Gemini first → operator hardware (Parakeet) when cap exceeded', + hardware_first: 'Operator hardware first → Gemini as fallback', + gemini_only: 'Gemini only — fail when cap is exceeded', + hardware_only: + 'Hardware only — fail when no Parakeet endpoint is configured', + }, + }), + relay_analyze_backend_preference: Value.select({ + name: 'Analyze Backend Preference', + description: + 'Routing strategy for analysis requests. Same options as transcription but applies to the analyze step independently — you can route transcribe to hardware and analyze to Gemini, or vice versa.', + default: 'gemini_first', + values: { + gemini_first: + 'Gemini first → operator hardware (Gemma) when cap exceeded', + hardware_first: 'Operator hardware first → Gemini as fallback', + gemini_only: 'Gemini only — fail when cap is exceeded', + hardware_only: + 'Hardware only — fail when no Gemma endpoint is configured', + }, + }), +}) + +export const setBackendRouting = sdk.Action.withInput( + 'set-backend-routing', + + async ({ effects }) => ({ + name: 'Set Backend Routing & Models', + description: + "Tune which Gemini SKUs the relay uses and the per-pipeline backend pecking order. Live-reloaded — changes take effect on the next request, no restart.", + warning: null, + allowedStatuses: 'any', + group: 'AI Backends', + visibility: 'enabled', + }), + + inputSpec, + + async ({ effects }) => { + const config = await configFile.read().once() + return { + relay_gemini_transcription_model: + config?.relay_gemini_transcription_model || 'gemini-3-flash-preview', + relay_gemini_analysis_model: + config?.relay_gemini_analysis_model || 'gemini-3.1-pro-preview', + relay_transcribe_backend_preference: + (config?.relay_transcribe_backend_preference as + | 'gemini_first' + | 'hardware_first' + | 'gemini_only' + | 'hardware_only' + | undefined) || 'gemini_first', + relay_analyze_backend_preference: + (config?.relay_analyze_backend_preference as + | 'gemini_first' + | 'hardware_first' + | 'gemini_only' + | 'hardware_only' + | undefined) || 'gemini_first', + } + }, + + async ({ effects, input }) => { + await configFile.merge(effects, { + relay_gemini_transcription_model: ( + input.relay_gemini_transcription_model || 'gemini-3-flash-preview' + ).trim(), + relay_gemini_analysis_model: ( + input.relay_gemini_analysis_model || 'gemini-3.1-pro-preview' + ).trim(), + relay_transcribe_backend_preference: input.relay_transcribe_backend_preference, + relay_analyze_backend_preference: input.relay_analyze_backend_preference, + }) + return null + }, +) diff --git a/startos/file-models/config.json.ts b/startos/file-models/config.json.ts index 39bc986..1acfda6 100644 --- a/startos/file-models/config.json.ts +++ b/startos/file-models/config.json.ts @@ -36,6 +36,33 @@ export const configFile = FileHelper.json( relay_parakeet_model: z.string().default('parakeet-tdt-0.6b-v3'), relay_gemma_model: z.string().default('gemma3:27b'), + // ── Gemini model selection ── + // Operator can pick which Gemini SKU is used per pipeline step + // without rebuilding the relay. Defaults match Google's typical + // recommendations: Flash for transcription (cheap, fast, + // multimodal-capable), Pro for analysis (higher quality on + // structured-JSON outputs). Operators can swap to flash for + // analysis when they want faster + cheaper at the cost of some + // section-boundary precision. + relay_gemini_transcription_model: z.string().default('gemini-3-flash-preview'), + relay_gemini_analysis_model: z.string().default('gemini-3.1-pro-preview'), + + // ── Backend routing preference per pipeline ── + // Controls whether the relay tries Gemini first (current default — + // best quality, costs operator's Gemini API budget) or the + // operator-hardware backend first (saves Gemini budget, may be + // slower depending on the operator's hardware). One of: + // - "gemini_first" try Gemini until per-tier cap, then hardware + // - "hardware_first" try hardware first, fall back to Gemini + // - "gemini_only" Gemini only, fail when cap is exceeded + // - "hardware_only" Hardware only, fail when not configured + relay_transcribe_backend_preference: z + .enum(['gemini_first', 'hardware_first', 'gemini_only', 'hardware_only']) + .default('gemini_first'), + relay_analyze_backend_preference: z + .enum(['gemini_first', 'hardware_first', 'gemini_only', 'hardware_only']) + .default('gemini_first'), + // ── License server ── // URL of the Keysat license server used for the cached online // license-validation check. Defaults to the public endpoint; diff --git a/startos/versions/index.ts b/startos/versions/index.ts index c823859..0329fef 100644 --- a/startos/versions/index.ts +++ b/startos/versions/index.ts @@ -7,8 +7,9 @@ import { v_0_2_3 } from './v0.2.3' import { v_0_2_4 } from './v0.2.4' import { v_0_2_5 } from './v0.2.5' import { v_0_2_6 } from './v0.2.6' +import { v_0_2_7 } from './v0.2.7' export const versionGraph = VersionGraph.of({ - current: v_0_2_6, - other: [v_0_2_5, v_0_2_4, v_0_2_3, v_0_2_2, v_0_2_1, v_0_2_0, v_0_1_0], + current: v_0_2_7, + other: [v_0_2_6, v_0_2_5, v_0_2_4, v_0_2_3, v_0_2_2, v_0_2_1, v_0_2_0, v_0_1_0], }) diff --git a/startos/versions/v0.2.7.ts b/startos/versions/v0.2.7.ts new file mode 100644 index 0000000..112b74e --- /dev/null +++ b/startos/versions/v0.2.7.ts @@ -0,0 +1,13 @@ +import { VersionInfo } from '@start9labs/start-sdk' + +export const v_0_2_7 = VersionInfo.of({ + version: '0.2.7:0', + releaseNotes: { + en_US: + 'New "Set Backend Routing & Models" action exposes four operator knobs: Gemini transcription model, Gemini analysis model, transcribe backend preference (gemini_first / hardware_first / gemini_only / hardware_only), and analyze backend preference. Routing strategies are honored by planBackend per-pipeline, so the operator can route transcribe to Parakeet for speed AND analyze through Gemini Flash for faster + cheaper structured output, or any other combination.', + }, + migrations: { + up: async ({ effects }) => {}, + down: async ({ effects }) => {}, + }, +})