v0.2.9 Gemini model selects + fallback chain
This commit is contained in:
+103
-18
@@ -27,6 +27,46 @@ const DEFAULT_TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
|
|||||||
const DEFAULT_ANALYSIS_MODEL = "gemini-3.1-pro-preview";
|
const DEFAULT_ANALYSIS_MODEL = "gemini-3.1-pro-preview";
|
||||||
const EMPTY_RETRIES = 3;
|
const EMPTY_RETRIES = 3;
|
||||||
|
|
||||||
|
// Per-pipeline fallback chains, ordered newest/most-expensive →
// older/cheaper. When the operator-selected primary model returns a
// retryable error (503 capacity, 429 rate limit, etc.) the relay
// walks DOWN this list — never up, since the operator's choice
// reflects their preferred price/quality point. The chain is sliced
// from the primary forward, so picking 2.5-flash falls back to only
// 2.0-flash, never back up to 3-flash.
//
// Frozen because the array is shared module state: an accidental
// push/sort/splice by any caller would otherwise silently change the
// fallback order for every subsequent request.
const TRANSCRIPTION_FALLBACK_CHAIN = Object.freeze([
  "gemini-3-flash-preview",
  "gemini-2.5-flash",
  "gemini-2.0-flash",
]);
|
||||||
|
// Fallback order for the analysis pipeline, most capable first.
// Frozen because the array is shared module state and must not be
// reordered or extended at runtime by a caller.
const ANALYSIS_FALLBACK_CHAIN = Object.freeze([
  "gemini-3.1-pro-preview",
  "gemini-3-pro-preview",
  "gemini-3-flash-preview",
  "gemini-2.5-flash",
]);
|
||||||
|
|
||||||
|
// Produce the ordered list of models to attempt for one call.
//
//   chain   - full fallback list for the pipeline, best model first
//   primary - the model id the operator selected
//
// When `primary` appears in `chain`, the result is `primary` followed
// by every entry after it. When it does not (unknown id / typo), the
// result is just `[primary]`, i.e. no fallback. Either way the caller
// receives a new array; `chain` itself is never modified.
function fallbackChain(chain, primary) {
  const start = chain.indexOf(primary);
  return start >= 0 ? chain.slice(start) : [primary];
}
|
||||||
|
|
||||||
|
// Detect errors that warrant trying the next model in the chain.
// Capacity / rate-limit / network blips → yes. Auth failures / 400s
// → no, those would just keep failing with the same root cause.
//
//   err - whatever was thrown: an Error, an SDK error object carrying
//         `status` or `httpStatusCode`, a plain string, or null.
//
// Returns true when the next model should be tried, false otherwise.
//
// Decision order:
//   1. A numeric HTTP status of 429 / 503 / 529 is always eligible.
//   2. A numeric HTTP status of 400 / 401 / 403 is never eligible,
//      even if the message happens to contain a word such as
//      "network" or "unavailable" — the request or credentials are
//      at fault and every model would reject them the same way.
//   3. Otherwise the message text decides.
function isFallbackEligibleError(err) {
  // Use the first field that holds a positive integer. This accepts
  // numeric strings ("503") and skips symbolic values such as
  // "UNAVAILABLE" so a numeric `httpStatusCode` is still honoured.
  const status =
    [err?.status, err?.httpStatusCode]
      .map((value) => Number(value))
      .find((code) => Number.isInteger(code) && code > 0) ?? 0;

  if (status === 503 || status === 429 || status === 529) return true;
  if (status === 400 || status === 401 || status === 403) return false;

  const msg = err?.message || String(err);
  return /overloaded|unavailable|capacity|high demand|rate limit|fetch failed|ECONNRESET|ETIMEDOUT|socket hang up|network/i.test(msg);
}
|
||||||
|
|
||||||
const TRANSCRIPTION_SAFETY = [
|
const TRANSCRIPTION_SAFETY = [
|
||||||
{ category: "HARM_CATEGORY_HARASSMENT", threshold: "BLOCK_NONE" },
|
{ category: "HARM_CATEGORY_HARASSMENT", threshold: "BLOCK_NONE" },
|
||||||
{ category: "HARM_CATEGORY_HATE_SPEECH", threshold: "BLOCK_NONE" },
|
{ category: "HARM_CATEGORY_HATE_SPEECH", threshold: "BLOCK_NONE" },
|
||||||
@@ -47,10 +87,13 @@ export function createGeminiBackend({
|
|||||||
apiKey,
|
apiKey,
|
||||||
httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
|
httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
|
||||||
});
|
});
|
||||||
// Flash models accept `thinkingLevel: "minimal"`; Pro models reject
|
// Build the per-call fallback chains. The primary is whatever the
|
||||||
// it. Detect from the model id so the operator can flip flash <-> pro
|
// operator selected via the StartOS action; subsequent entries are
|
||||||
// via the StartOS action without breaking the request.
|
// the lower-tier members of the chain (we never fall back UP). When
|
||||||
const txIsFlash = /flash/i.test(transcriptionModel);
|
// the primary returns a 503/capacity/rate-limit error, the loops
|
||||||
|
// below try the next model.
|
||||||
|
const txChain = fallbackChain(TRANSCRIPTION_FALLBACK_CHAIN, transcriptionModel);
|
||||||
|
const anChain = fallbackChain(ANALYSIS_FALLBACK_CHAIN, analysisModel);
|
||||||
|
|
||||||
async function transcribeAudio({
|
async function transcribeAudio({
|
||||||
audio,
|
audio,
|
||||||
@@ -84,15 +127,29 @@ export function createGeminiBackend({
|
|||||||
}
|
}
|
||||||
|
|
||||||
const prompt = buildTranscriptionPrompt({ title, channel, description, chapters });
|
const prompt = buildTranscriptionPrompt({ title, channel, description, chapters });
|
||||||
|
|
||||||
|
// Walk the fallback chain: try the primary model first; on a
|
||||||
|
// retryable error (capacity / 503 / rate-limit), try the next
|
||||||
|
// model in the chain. Non-retryable errors bubble up to the
|
||||||
|
// caller — they'd just fail the same way on every model.
|
||||||
|
let lastErr;
|
||||||
|
for (let modelIdx = 0; modelIdx < txChain.length; modelIdx++) {
|
||||||
|
const model = txChain[modelIdx];
|
||||||
|
const isFlash = /flash/i.test(model);
|
||||||
|
try {
|
||||||
let result;
|
let result;
|
||||||
|
// Empty-response retries: when the SDK returns 200 with no
|
||||||
|
// text (which happens periodically with audio inputs),
|
||||||
|
// retry up to N times with the SAME model before falling
|
||||||
|
// back.
|
||||||
for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
|
for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
|
||||||
result = await ai.models.generateContent({
|
result = await ai.models.generateContent({
|
||||||
model: transcriptionModel,
|
model,
|
||||||
config: {
|
config: {
|
||||||
// thinkingLevel: "minimal" is only valid for Flash. Pro
|
// thinkingLevel: "minimal" is only valid for Flash.
|
||||||
// models reject it. Skip when the operator picks a Pro
|
// Pro models reject it. Skip when the chain hop
|
||||||
// model for transcription (slower but valid).
|
// landed on a Pro model.
|
||||||
...(txIsFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}),
|
...(isFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}),
|
||||||
safetySettings: TRANSCRIPTION_SAFETY,
|
safetySettings: TRANSCRIPTION_SAFETY,
|
||||||
},
|
},
|
||||||
contents: [
|
contents: [
|
||||||
@@ -114,25 +171,42 @@ export function createGeminiBackend({
|
|||||||
const text = safeText(result) || "";
|
const text = safeText(result) || "";
|
||||||
return {
|
return {
|
||||||
text,
|
text,
|
||||||
// Gemini returns a single timestamped blob — segments are
|
|
||||||
// parsed client-side by the orchestration layer. We could
|
|
||||||
// pre-parse here but Recap already has parseTimestampedTranscript
|
|
||||||
// that handles this exact shape.
|
|
||||||
segments: [],
|
segments: [],
|
||||||
duration_seconds: 0,
|
duration_seconds: 0,
|
||||||
// Pass usage + the model id back to the route so audit-log
|
|
||||||
// entries can include token counts + computed cost.
|
|
||||||
usage: result?.usageMetadata || null,
|
usage: result?.usageMetadata || null,
|
||||||
model: transcriptionModel,
|
// Return the model that ACTUALLY served the request — so
|
||||||
|
// the audit log records what was used, not just what was
|
||||||
|
// requested. Lets the operator see "this call fell back
|
||||||
|
// from 3-flash to 2.5-flash" via the dashboard.
|
||||||
|
model,
|
||||||
};
|
};
|
||||||
|
} catch (err) {
|
||||||
|
lastErr = err;
|
||||||
|
const canFallback =
|
||||||
|
isFallbackEligibleError(err) && modelIdx < txChain.length - 1;
|
||||||
|
console.warn(
|
||||||
|
`[gemini] transcribe with ${model} failed (${err?.status || "?"}): ${err?.message || err}${canFallback ? ` — falling back to ${txChain[modelIdx + 1]}` : ""}`
|
||||||
|
);
|
||||||
|
if (!canFallback) {
|
||||||
|
try { await ai.files.delete({ name: f.name }); } catch {}
|
||||||
|
throw err;
|
||||||
|
}
|
||||||
|
// loop continues with next model
|
||||||
|
}
|
||||||
|
}
|
||||||
|
throw lastErr || new Error("transcribe: all models in fallback chain failed");
|
||||||
} finally {
|
} finally {
|
||||||
try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
|
try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function analyzeText({ prompt }) {
|
async function analyzeText({ prompt }) {
|
||||||
|
let lastErr;
|
||||||
|
for (let modelIdx = 0; modelIdx < anChain.length; modelIdx++) {
|
||||||
|
const model = anChain[modelIdx];
|
||||||
|
try {
|
||||||
const result = await ai.models.generateContent({
|
const result = await ai.models.generateContent({
|
||||||
model: analysisModel,
|
model,
|
||||||
contents: [
|
contents: [
|
||||||
{
|
{
|
||||||
role: "user",
|
role: "user",
|
||||||
@@ -143,8 +217,19 @@ export function createGeminiBackend({
|
|||||||
return {
|
return {
|
||||||
text: safeText(result) || "",
|
text: safeText(result) || "",
|
||||||
usage: result?.usageMetadata || null,
|
usage: result?.usageMetadata || null,
|
||||||
model: analysisModel,
|
model,
|
||||||
};
|
};
|
||||||
|
} catch (err) {
|
||||||
|
lastErr = err;
|
||||||
|
const canFallback =
|
||||||
|
isFallbackEligibleError(err) && modelIdx < anChain.length - 1;
|
||||||
|
console.warn(
|
||||||
|
`[gemini] analyze with ${model} failed (${err?.status || "?"}): ${err?.message || err}${canFallback ? ` — falling back to ${anChain[modelIdx + 1]}` : ""}`
|
||||||
|
);
|
||||||
|
if (!canFallback) throw err;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
throw lastErr || new Error("analyze: all models in fallback chain failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
return { transcribeAudio, analyzeText };
|
return { transcribeAudio, analyzeText };
|
||||||
|
|||||||
+1
-1
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "recap-relay-server",
|
"name": "recap-relay-server",
|
||||||
"version": "0.2.8",
|
"version": "0.2.9",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"private": true,
|
"private": true,
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
|||||||
@@ -11,23 +11,40 @@ const { InputSpec, Value } = sdk
|
|||||||
|
|
||||||
const inputSpec = InputSpec.of({
|
const inputSpec = InputSpec.of({
|
||||||
// ── Gemini model selection ──
|
// ── Gemini model selection ──
|
||||||
relay_gemini_transcription_model: Value.text({
|
// Both fields are radio-select with curated options. The relay's
|
||||||
|
// Gemini backend automatically falls back to lower-tier models in
|
||||||
|
// this same list when the chosen one returns a 503 / capacity /
|
||||||
|
// rate-limit error — see server/backends/gemini.js for the
|
||||||
|
// fallback-chain logic.
|
||||||
|
relay_gemini_transcription_model: Value.select({
|
||||||
name: 'Gemini Transcription Model',
|
name: 'Gemini Transcription Model',
|
||||||
description:
|
description:
|
||||||
"The Gemini SKU used when a transcription request is routed to Gemini. Flash is recommended (cheap, fast, multimodal). Examples: gemini-3-flash-preview (default), gemini-2.5-flash, gemini-2.0-flash, gemini-3-pro-preview (slower + pricier but higher quality on edge cases).",
|
"Primary Gemini SKU used when a transcription request is routed to Gemini. On 503/capacity/rate-limit failure, the relay falls back to lower-tier models in order (e.g. 3-flash → 2.5-flash → 2.0-flash).",
|
||||||
required: true,
|
|
||||||
default: 'gemini-3-flash-preview',
|
default: 'gemini-3-flash-preview',
|
||||||
minLength: 1,
|
values: {
|
||||||
maxLength: 128,
|
'gemini-3-flash-preview':
|
||||||
|
'Gemini 3 Flash — latest, recommended (~$0.30/M in, $2.50/M out)',
|
||||||
|
'gemini-2.5-flash':
|
||||||
|
'Gemini 2.5 Flash — prior gen (same pricing as 3-flash)',
|
||||||
|
'gemini-2.0-flash':
|
||||||
|
'Gemini 2.0 Flash — older + cheapest (~$0.10/M in, $0.40/M out)',
|
||||||
|
},
|
||||||
}),
|
}),
|
||||||
relay_gemini_analysis_model: Value.text({
|
relay_gemini_analysis_model: Value.select({
|
||||||
name: 'Gemini Analysis Model',
|
name: 'Gemini Analysis Model',
|
||||||
description:
|
description:
|
||||||
"The Gemini SKU used when an analysis request is routed to Gemini. Pro is the default for higher-quality structured output. Swap to a flash SKU (e.g. gemini-3-flash-preview) for faster + cheaper analysis at some loss of section-boundary precision.",
|
"Primary Gemini SKU used when an analysis request is routed to Gemini. On 503/capacity/rate-limit failure, the relay falls back to lower-tier models in order (e.g. 3.1-pro → 3-pro → 3-flash → 2.5-flash).",
|
||||||
required: true,
|
|
||||||
default: 'gemini-3.1-pro-preview',
|
default: 'gemini-3.1-pro-preview',
|
||||||
minLength: 1,
|
values: {
|
||||||
maxLength: 128,
|
'gemini-3.1-pro-preview':
|
||||||
|
'Gemini 3.1 Pro — best quality on structured-JSON output ($5/M in, $25/M out)',
|
||||||
|
'gemini-3-pro-preview':
|
||||||
|
'Gemini 3 Pro — prior Pro gen (same pricing as 3.1)',
|
||||||
|
'gemini-3-flash-preview':
|
||||||
|
'Gemini 3 Flash — faster + ~20× cheaper than Pro; some loss of section-boundary precision on long transcripts',
|
||||||
|
'gemini-2.5-flash':
|
||||||
|
'Gemini 2.5 Flash — prior Flash gen',
|
||||||
|
},
|
||||||
}),
|
}),
|
||||||
|
|
||||||
// ── Backend routing preference per pipeline ──
|
// ── Backend routing preference per pipeline ──
|
||||||
@@ -78,11 +95,32 @@ export const setBackendRouting = sdk.Action.withInput(
|
|||||||
|
|
||||||
async ({ effects }) => {
|
async ({ effects }) => {
|
||||||
const config = await configFile.read().once()
|
const config = await configFile.read().once()
|
||||||
|
// Coerce any previously-saved model name to a value in the new
|
||||||
|
// select's options. Older 0.2.7-era saved configs could hold a
|
||||||
|
// free-text value that's no longer in the dropdown — clamp to a
|
||||||
|
// sensible default rather than presenting an invalid radio.
|
||||||
|
const TX_OPTIONS = [
|
||||||
|
'gemini-3-flash-preview',
|
||||||
|
'gemini-2.5-flash',
|
||||||
|
'gemini-2.0-flash',
|
||||||
|
] as const
|
||||||
|
const AN_OPTIONS = [
|
||||||
|
'gemini-3.1-pro-preview',
|
||||||
|
'gemini-3-pro-preview',
|
||||||
|
'gemini-3-flash-preview',
|
||||||
|
'gemini-2.5-flash',
|
||||||
|
] as const
|
||||||
|
const tx = config?.relay_gemini_transcription_model as
|
||||||
|
| (typeof TX_OPTIONS)[number]
|
||||||
|
| undefined
|
||||||
|
const an = config?.relay_gemini_analysis_model as
|
||||||
|
| (typeof AN_OPTIONS)[number]
|
||||||
|
| undefined
|
||||||
return {
|
return {
|
||||||
relay_gemini_transcription_model:
|
relay_gemini_transcription_model:
|
||||||
config?.relay_gemini_transcription_model || 'gemini-3-flash-preview',
|
tx && TX_OPTIONS.includes(tx) ? tx : 'gemini-3-flash-preview',
|
||||||
relay_gemini_analysis_model:
|
relay_gemini_analysis_model:
|
||||||
config?.relay_gemini_analysis_model || 'gemini-3.1-pro-preview',
|
an && AN_OPTIONS.includes(an) ? an : 'gemini-3.1-pro-preview',
|
||||||
relay_transcribe_backend_preference:
|
relay_transcribe_backend_preference:
|
||||||
(config?.relay_transcribe_backend_preference as
|
(config?.relay_transcribe_backend_preference as
|
||||||
| 'gemini_first'
|
| 'gemini_first'
|
||||||
|
|||||||
@@ -9,8 +9,9 @@ import { v_0_2_5 } from './v0.2.5'
|
|||||||
import { v_0_2_6 } from './v0.2.6'
|
import { v_0_2_6 } from './v0.2.6'
|
||||||
import { v_0_2_7 } from './v0.2.7'
|
import { v_0_2_7 } from './v0.2.7'
|
||||||
import { v_0_2_8 } from './v0.2.8'
|
import { v_0_2_8 } from './v0.2.8'
|
||||||
|
import { v_0_2_9 } from './v0.2.9'
|
||||||
|
|
||||||
export const versionGraph = VersionGraph.of({
|
export const versionGraph = VersionGraph.of({
|
||||||
current: v_0_2_8,
|
current: v_0_2_9,
|
||||||
other: [v_0_2_7, v_0_2_6, v_0_2_5, v_0_2_4, v_0_2_3, v_0_2_2, v_0_2_1, v_0_2_0, v_0_1_0],
|
other: [v_0_2_8, v_0_2_7, v_0_2_6, v_0_2_5, v_0_2_4, v_0_2_3, v_0_2_2, v_0_2_1, v_0_2_0, v_0_1_0],
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -0,0 +1,13 @@
|
|||||||
|
import { VersionInfo } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
|
// Release descriptor for 0.2.9. Both migration hooks are empty
// functions, so installing or rolling back this version rewrites no
// stored data.
const releaseNotesEnUS =
  'Set Backend Routing & Models action: Gemini transcription and analysis fields are now radio-select dropdowns with curated options (transcribe: 3-flash, 2.5-flash, 2.0-flash; analyze: 3.1-pro, 3-pro, 3-flash, 2.5-flash). Gemini backend automatically falls back to lower-tier models in the same chain when the primary returns a 503/capacity/rate-limit error. Audit log records the model that actually served each call, so dashboard reflects fallback behavior accurately.'

export const v_0_2_9 = VersionInfo.of({
  version: '0.2.9:0',
  releaseNotes: { en_US: releaseNotesEnUS },
  migrations: {
    up: async ({ effects }) => {},
    down: async ({ effects }) => {},
  },
})
|
||||||
Reference in New Issue
Block a user