const DEFAULT_TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
const DEFAULT_ANALYSIS_MODEL = "gemini-3.1-pro-preview";
const EMPTY_RETRIES = 3;

// Fallback order for each pipeline, listed from the newest / most
// expensive model down to the older / cheaper ones. A retryable failure
// on the operator-selected primary moves the relay one step DOWN the
// list. It never moves up, because the operator's pick is treated as
// their preferred price/quality point. Example: a primary of 2.5-flash
// can only fall back to 2.0-flash, never to 3-flash.
const TRANSCRIPTION_FALLBACK_CHAIN = [
  "gemini-3-flash-preview",
  "gemini-2.5-flash",
  "gemini-2.0-flash",
];
const ANALYSIS_FALLBACK_CHAIN = [
  "gemini-3.1-pro-preview",
  "gemini-3-pro-preview",
  "gemini-3-flash-preview",
  "gemini-2.5-flash",
];

/**
 * Return the portion of `chain` that starts at `primary`.
 *
 * @param {string[]} chain - Ordered model ids, preferred first.
 * @param {string} primary - The model id to start from.
 * @returns {string[]} A new array: `primary` followed by every entry
 *   after it in `chain`, or just `[primary]` when `primary` does not
 *   appear in `chain` (so no fallback is available).
 */
function fallbackChain(chain, primary) {
  const start = chain.indexOf(primary);
  return start === -1 ? [primary] : chain.slice(start);
}

// Detect errors that warrant trying the next model in the chain.
// Capacity / rate-limit / network blips → yes. Auth failures / 400s
// → no, those would just keep failing with the same root cause.
/**
 * Decide whether a failed call should be retried on the next model in
 * the fallback chain.
 *
 * Checks run in this order:
 *   1. Status 503, 429 or 529            → eligible.
 *   2. Status 400, 401 or 403            → never eligible. These are
 *      bad-request / auth failures that would fail the same way on every
 *      model, so the status is authoritative and the message is ignored.
 *      Without this guard, such an error whose text merely contains a
 *      word like "network" or "capacity" would be misread as transient.
 *   3. Anything else (no status, a non-numeric status, other codes)
 *      → eligible only if the message matches a known
 *      capacity / rate-limit / network-failure phrase.
 *
 * @param {unknown} err - The thrown value. `status` or `httpStatusCode`
 *   is read when present; `message` is used for the text match, falling
 *   back to `String(err)` for non-Error values.
 * @returns {boolean} `true` when trying the next model is worthwhile.
 */
function isFallbackEligibleError(err) {
  const status = err?.status || err?.httpStatusCode || 0;
  if (status === 503 || status === 429 || status === 529) return true;
  // An explicit client-side rejection outranks anything the message says.
  if (status === 400 || status === 401 || status === 403) return false;
  const msg = err?.message || String(err);
  return /overloaded|unavailable|capacity|high demand|rate limit|fetch failed|ECONNRESET|ETIMEDOUT|socket hang up|network/i.test(msg);
}
{ thinkingConfig: { thinkingLevel: "minimal" } } : {}), - safetySettings: TRANSCRIPTION_SAFETY, - }, - contents: [ - { - role: "user", - parts: [ - { fileData: { fileUri: f.uri, mimeType } }, - { text: prompt }, + + // Walk the fallback chain: try the primary model first; on a + // retryable error (capacity / 503 / rate-limit), try the next + // model in the chain. Non-retryable errors bubble up to the + // caller — they'd just fail the same way on every model. + let lastErr; + for (let modelIdx = 0; modelIdx < txChain.length; modelIdx++) { + const model = txChain[modelIdx]; + const isFlash = /flash/i.test(model); + try { + let result; + // Empty-response retries: when the SDK returns 200 with no + // text (which happens periodically with audio inputs), + // retry up to N times with the SAME model before falling + // back. + for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) { + result = await ai.models.generateContent({ + model, + config: { + // thinkingLevel: "minimal" is only valid for Flash. + // Pro models reject it. Skip when the chain hop + // landed on a Pro model. + ...(isFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}), + safetySettings: TRANSCRIPTION_SAFETY, + }, + contents: [ + { + role: "user", + parts: [ + { fileData: { fileUri: f.uri, mimeType } }, + { text: prompt }, + ], + }, ], - }, - ], - }); - if (safeText(result)) break; + }); + if (safeText(result)) break; + } + + // Best-effort cleanup of the uploaded File API artifact. + try { await ai.files.delete({ name: f.name }); } catch {} + + const text = safeText(result) || ""; + return { + text, + segments: [], + duration_seconds: 0, + usage: result?.usageMetadata || null, + // Return the model that ACTUALLY served the request — so + // the audit log records what was used, not just what was + // requested. Lets the operator see "this call fell back + // from 3-flash to 2.5-flash" via the dashboard. 
+ model, + }; + } catch (err) { + lastErr = err; + const canFallback = + isFallbackEligibleError(err) && modelIdx < txChain.length - 1; + console.warn( + `[gemini] transcribe with ${model} failed (${err?.status || "?"}): ${err?.message || err}${canFallback ? ` — falling back to ${txChain[modelIdx + 1]}` : ""}` + ); + if (!canFallback) { + try { await ai.files.delete({ name: f.name }); } catch {} + throw err; + } + // loop continues with next model + } } - - // Best-effort cleanup of the uploaded File API artifact. - try { await ai.files.delete({ name: f.name }); } catch {} - - const text = safeText(result) || ""; - return { - text, - // Gemini returns a single timestamped blob — segments are - // parsed client-side by the orchestration layer. We could - // pre-parse here but Recap already has parseTimestampedTranscript - // that handles this exact shape. - segments: [], - duration_seconds: 0, - // Pass usage + the model id back to the route so audit-log - // entries can include token counts + computed cost. 
/**
 * Run a text-only analysis prompt, walking the analysis fallback chain.
 *
 * Each model in `anChain` is tried in order. The first call that
 * resolves produces the return value. A rejected call is logged; if the
 * error is fallback-eligible and another model remains, the next model
 * is tried, otherwise the error is rethrown unchanged.
 *
 * @param {{ prompt: string }} args - `prompt` is sent as the single user
 *   text part.
 * @returns {Promise<{ text: string, usage: object|null, model: string }>}
 *   `text` is the extracted response text ("" when none), `usage` is the
 *   response's `usageMetadata` (or null), and `model` is the id of the
 *   model that actually served the call.
 * @throws Whatever the last attempted model call threw.
 */
async function analyzeText({ prompt }) {
  let failure;
  for (const [hop, model] of anChain.entries()) {
    try {
      const response = await ai.models.generateContent({
        model,
        contents: [{ role: "user", parts: [{ text: prompt }] }],
      });
      const text = safeText(response) || "";
      const usage = response?.usageMetadata || null;
      return { text, usage, model };
    } catch (err) {
      failure = err;
      // Fall back only for eligible errors, and only if a model remains.
      const willFallBack =
        isFallbackEligibleError(err) && hop < anChain.length - 1;
      const statusLabel = err?.status || "?";
      const detail = err?.message || err;
      const suffix = willFallBack
        ? ` — falling back to ${anChain[hop + 1]}`
        : "";
      console.warn(
        `[gemini] analyze with ${model} failed (${statusLabel}): ${detail}${suffix}`
      );
      if (!willFallBack) throw err;
    }
  }
  // Only reachable when the chain is empty.
  throw failure || new Error("analyze: all models in fallback chain failed");
}
3-flash → 2.5-flash → 2.0-flash).", default: 'gemini-3-flash-preview', - minLength: 1, - maxLength: 128, + values: { + 'gemini-3-flash-preview': + 'Gemini 3 Flash — latest, recommended (~$0.30/M in, $2.50/M out)', + 'gemini-2.5-flash': + 'Gemini 2.5 Flash — prior gen (same pricing as 3-flash)', + 'gemini-2.0-flash': + 'Gemini 2.0 Flash — older + cheapest (~$0.10/M in, $0.40/M out)', + }, }), - relay_gemini_analysis_model: Value.text({ + relay_gemini_analysis_model: Value.select({ name: 'Gemini Analysis Model', description: - "The Gemini SKU used when an analysis request is routed to Gemini. Pro is the default for higher-quality structured output. Swap to a flash SKU (e.g. gemini-3-flash-preview) for faster + cheaper analysis at some loss of section-boundary precision.", - required: true, + "Primary Gemini SKU used when an analysis request is routed to Gemini. On 503/capacity/rate-limit failure, the relay falls back to lower-tier models in order (e.g. 3.1-pro → 3-pro → 3-flash → 2.5-flash).", default: 'gemini-3.1-pro-preview', - minLength: 1, - maxLength: 128, + values: { + 'gemini-3.1-pro-preview': + 'Gemini 3.1 Pro — best quality on structured-JSON output ($5/M in, $25/M out)', + 'gemini-3-pro-preview': + 'Gemini 3 Pro — prior Pro gen (same pricing as 3.1)', + 'gemini-3-flash-preview': + 'Gemini 3 Flash — faster + ~20× cheaper than Pro; some loss of section-boundary precision on long transcripts', + 'gemini-2.5-flash': + 'Gemini 2.5 Flash — prior Flash gen', + }, }), // ── Backend routing preference per pipeline ── @@ -78,11 +95,32 @@ export const setBackendRouting = sdk.Action.withInput( async ({ effects }) => { const config = await configFile.read().once() + // Coerce any previously-saved model name to a value in the new + // select's options. Older 0.2.7-era saved configs could hold a + // free-text value that's no longer in the dropdown — clamp to a + // sensible default rather than presenting an invalid radio. 
+ const TX_OPTIONS = [ + 'gemini-3-flash-preview', + 'gemini-2.5-flash', + 'gemini-2.0-flash', + ] as const + const AN_OPTIONS = [ + 'gemini-3.1-pro-preview', + 'gemini-3-pro-preview', + 'gemini-3-flash-preview', + 'gemini-2.5-flash', + ] as const + const tx = config?.relay_gemini_transcription_model as + | (typeof TX_OPTIONS)[number] + | undefined + const an = config?.relay_gemini_analysis_model as + | (typeof AN_OPTIONS)[number] + | undefined return { relay_gemini_transcription_model: - config?.relay_gemini_transcription_model || 'gemini-3-flash-preview', + tx && TX_OPTIONS.includes(tx) ? tx : 'gemini-3-flash-preview', relay_gemini_analysis_model: - config?.relay_gemini_analysis_model || 'gemini-3.1-pro-preview', + an && AN_OPTIONS.includes(an) ? an : 'gemini-3.1-pro-preview', relay_transcribe_backend_preference: (config?.relay_transcribe_backend_preference as | 'gemini_first' diff --git a/startos/versions/index.ts b/startos/versions/index.ts index c329daa..8605fc6 100644 --- a/startos/versions/index.ts +++ b/startos/versions/index.ts @@ -9,8 +9,9 @@ import { v_0_2_5 } from './v0.2.5' import { v_0_2_6 } from './v0.2.6' import { v_0_2_7 } from './v0.2.7' import { v_0_2_8 } from './v0.2.8' +import { v_0_2_9 } from './v0.2.9' export const versionGraph = VersionGraph.of({ - current: v_0_2_8, - other: [v_0_2_7, v_0_2_6, v_0_2_5, v_0_2_4, v_0_2_3, v_0_2_2, v_0_2_1, v_0_2_0, v_0_1_0], + current: v_0_2_9, + other: [v_0_2_8, v_0_2_7, v_0_2_6, v_0_2_5, v_0_2_4, v_0_2_3, v_0_2_2, v_0_2_1, v_0_2_0, v_0_1_0], }) diff --git a/startos/versions/v0.2.9.ts b/startos/versions/v0.2.9.ts new file mode 100644 index 0000000..6e203e7 --- /dev/null +++ b/startos/versions/v0.2.9.ts @@ -0,0 +1,13 @@ +import { VersionInfo } from '@start9labs/start-sdk' + +export const v_0_2_9 = VersionInfo.of({ + version: '0.2.9:0', + releaseNotes: { + en_US: + 'Set Backend Routing & Models action: Gemini transcription and analysis fields are now radio-select dropdowns with curated options (transcribe: 
3-flash, 2.5-flash, 2.0-flash; analyze: 3.1-pro, 3-pro, 3-flash, 2.5-flash). Gemini backend automatically falls back to lower-tier models in the same chain when the primary returns a 503/capacity/rate-limit error. Audit log records the model that actually served each call, so dashboard reflects fallback behavior accurately.', + }, + migrations: { + up: async ({ effects }) => {}, + down: async ({ effects }) => {}, + }, +})