v0.2.9 Gemini model selects + fallback chain
This commit is contained in:
+142
-57
@@ -27,6 +27,46 @@ const DEFAULT_TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
|
||||
const DEFAULT_ANALYSIS_MODEL = "gemini-3.1-pro-preview";
|
||||
const EMPTY_RETRIES = 3;
|
||||
|
||||
// Per-pipeline fallback chains, ordered newest/most-expensive →
|
||||
// older/cheaper. When the operator-selected primary model returns a
|
||||
// retryable error (503 capacity, 429 rate limit, etc.) the relay
|
||||
// walks DOWN this list — never up, since the operator's choice
|
||||
// reflects their preferred price/quality point. The chain is sliced
|
||||
// from the primary forward, so picking 2.5-flash falls back to only
|
||||
// 2.0-flash, never back up to 3-flash.
|
||||
// Transcription models in fallback order (first entry is tried first when
// it is the operator's primary). Frozen so no caller can reorder or extend
// the shared chain by accident; consumers only read it via indexOf/slice.
const TRANSCRIPTION_FALLBACK_CHAIN = Object.freeze([
  "gemini-3-flash-preview",
  "gemini-2.5-flash",
  "gemini-2.0-flash",
]);
|
||||
// Analysis models in fallback order (first entry is tried first when it is
// the operator's primary). Frozen so no caller can reorder or extend the
// shared chain by accident; consumers only read it via indexOf/slice.
const ANALYSIS_FALLBACK_CHAIN = Object.freeze([
  "gemini-3.1-pro-preview",
  "gemini-3-pro-preview",
  "gemini-3-flash-preview",
  "gemini-2.5-flash",
]);
|
||||
|
||||
// Slice the chain starting at the primary model. If the primary isn't
|
||||
// in the chain (unknown / typo), return just the primary — no
|
||||
// fallback possible. Returns a fresh array so callers can iterate
|
||||
// safely.
|
||||
/**
 * Build the ordered list of models to try for one call.
 *
 * @param {readonly string[]} chain - Full fallback chain for a pipeline.
 * @param {string} primary - Model id selected as the primary.
 * @returns {string[]} A new array: `primary` and every entry that follows
 *   it in `chain`, or `[primary]` alone when `primary` is not in `chain`.
 */
function fallbackChain(chain, primary) {
  const start = chain.indexOf(primary);
  // Unknown model id: there is no position to fall back from, so the
  // primary is the only candidate.
  if (start === -1) return [primary];
  // slice() copies, so the caller never holds the shared chain itself.
  return chain.slice(start);
}
|
||||
|
||||
// Detect errors that warrant trying the next model in the chain.
|
||||
// Capacity / rate-limit / network blips → yes. Auth failures / 400s
|
||||
// → no, those would just keep failing with the same root cause.
|
||||
/**
 * Decide whether a failed call should be retried on the next model.
 *
 * @param {unknown} err - Whatever the failed call threw; may be an Error,
 *   a plain object, a string, or null/undefined.
 * @returns {boolean} true when the HTTP status is 503, 429 or 529, or when
 *   the error text matches a known capacity / rate-limit / network
 *   pattern; false otherwise.
 */
function isFallbackEligibleError(err) {
  // Coerce so a status delivered as a string ("503") is still recognised;
  // non-numeric values become NaN and simply fail the comparisons below.
  const status = Number(err?.status || err?.httpStatusCode || 0);
  if (status === 503 || status === 429 || status === 529) return true;

  // No usable status: fall back to matching the message text. String(err)
  // covers thrown strings and null/undefined.
  const msg = err?.message || String(err);
  return /overloaded|unavailable|capacity|high demand|rate limit|fetch failed|ECONNRESET|ETIMEDOUT|socket hang up|network/i.test(msg);
}
|
||||
|
||||
const TRANSCRIPTION_SAFETY = [
|
||||
{ category: "HARM_CATEGORY_HARASSMENT", threshold: "BLOCK_NONE" },
|
||||
{ category: "HARM_CATEGORY_HATE_SPEECH", threshold: "BLOCK_NONE" },
|
||||
@@ -47,10 +87,13 @@ export function createGeminiBackend({
|
||||
apiKey,
|
||||
httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
|
||||
});
|
||||
// Flash models accept `thinkingLevel: "minimal"`; Pro models reject
|
||||
// it. Detect from the model id so the operator can flip flash <-> pro
|
||||
// via the StartOS action without breaking the request.
|
||||
const txIsFlash = /flash/i.test(transcriptionModel);
|
||||
// Build the per-call fallback chains. The primary is whatever the
|
||||
// operator selected via the StartOS action; subsequent entries are
|
||||
// the lower-tier members of the chain (we never fall back UP). When
|
||||
// the primary returns a 503/capacity/rate-limit error, the loops
|
||||
// below try the next model.
|
||||
const txChain = fallbackChain(TRANSCRIPTION_FALLBACK_CHAIN, transcriptionModel);
|
||||
const anChain = fallbackChain(ANALYSIS_FALLBACK_CHAIN, analysisModel);
|
||||
|
||||
async function transcribeAudio({
|
||||
audio,
|
||||
@@ -84,67 +127,109 @@ export function createGeminiBackend({
|
||||
}
|
||||
|
||||
const prompt = buildTranscriptionPrompt({ title, channel, description, chapters });
|
||||
let result;
|
||||
for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
|
||||
result = await ai.models.generateContent({
|
||||
model: transcriptionModel,
|
||||
config: {
|
||||
// thinkingLevel: "minimal" is only valid for Flash. Pro
|
||||
// models reject it. Skip when the operator picks a Pro
|
||||
// model for transcription (slower but valid).
|
||||
...(txIsFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}),
|
||||
safetySettings: TRANSCRIPTION_SAFETY,
|
||||
},
|
||||
contents: [
|
||||
{
|
||||
role: "user",
|
||||
parts: [
|
||||
{ fileData: { fileUri: f.uri, mimeType } },
|
||||
{ text: prompt },
|
||||
|
||||
// Walk the fallback chain: try the primary model first; on a
|
||||
// retryable error (capacity / 503 / rate-limit), try the next
|
||||
// model in the chain. Non-retryable errors bubble up to the
|
||||
// caller — they'd just fail the same way on every model.
|
||||
let lastErr;
|
||||
for (let modelIdx = 0; modelIdx < txChain.length; modelIdx++) {
|
||||
const model = txChain[modelIdx];
|
||||
const isFlash = /flash/i.test(model);
|
||||
try {
|
||||
let result;
|
||||
// Empty-response retries: when the SDK returns 200 with no
|
||||
// text (which happens periodically with audio inputs),
|
||||
// retry up to N times with the SAME model before falling
|
||||
// back.
|
||||
for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
|
||||
result = await ai.models.generateContent({
|
||||
model,
|
||||
config: {
|
||||
// thinkingLevel: "minimal" is only valid for Flash.
|
||||
// Pro models reject it. Skip when the chain hop
|
||||
// landed on a Pro model.
|
||||
...(isFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}),
|
||||
safetySettings: TRANSCRIPTION_SAFETY,
|
||||
},
|
||||
contents: [
|
||||
{
|
||||
role: "user",
|
||||
parts: [
|
||||
{ fileData: { fileUri: f.uri, mimeType } },
|
||||
{ text: prompt },
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
if (safeText(result)) break;
|
||||
});
|
||||
if (safeText(result)) break;
|
||||
}
|
||||
|
||||
// Best-effort cleanup of the uploaded File API artifact.
|
||||
try { await ai.files.delete({ name: f.name }); } catch {}
|
||||
|
||||
const text = safeText(result) || "";
|
||||
return {
|
||||
text,
|
||||
segments: [],
|
||||
duration_seconds: 0,
|
||||
usage: result?.usageMetadata || null,
|
||||
// Return the model that ACTUALLY served the request — so
|
||||
// the audit log records what was used, not just what was
|
||||
// requested. Lets the operator see "this call fell back
|
||||
// from 3-flash to 2.5-flash" via the dashboard.
|
||||
model,
|
||||
};
|
||||
} catch (err) {
|
||||
lastErr = err;
|
||||
const canFallback =
|
||||
isFallbackEligibleError(err) && modelIdx < txChain.length - 1;
|
||||
console.warn(
|
||||
`[gemini] transcribe with ${model} failed (${err?.status || "?"}): ${err?.message || err}${canFallback ? ` — falling back to ${txChain[modelIdx + 1]}` : ""}`
|
||||
);
|
||||
if (!canFallback) {
|
||||
try { await ai.files.delete({ name: f.name }); } catch {}
|
||||
throw err;
|
||||
}
|
||||
// loop continues with next model
|
||||
}
|
||||
}
|
||||
|
||||
// Best-effort cleanup of the uploaded File API artifact.
|
||||
try { await ai.files.delete({ name: f.name }); } catch {}
|
||||
|
||||
const text = safeText(result) || "";
|
||||
return {
|
||||
text,
|
||||
// Gemini returns a single timestamped blob — segments are
|
||||
// parsed client-side by the orchestration layer. We could
|
||||
// pre-parse here but Recap already has parseTimestampedTranscript
|
||||
// that handles this exact shape.
|
||||
segments: [],
|
||||
duration_seconds: 0,
|
||||
// Pass usage + the model id back to the route so audit-log
|
||||
// entries can include token counts + computed cost.
|
||||
usage: result?.usageMetadata || null,
|
||||
model: transcriptionModel,
|
||||
};
|
||||
throw lastErr || new Error("transcribe: all models in fallback chain failed");
|
||||
} finally {
|
||||
try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Send a text-only prompt to the analysis model, walking `anChain` when a
 * model fails with a fallback-eligible error.
 *
 * The earlier single-model body has been removed: it returned before the
 * loop below, which made the fallback chain unreachable.
 *
 * @param {{ prompt: string }} args - `prompt` is sent as the only user part.
 * @returns {Promise<{ text: string, usage: object|null, model: string }>}
 *   `text` is `safeText(result)` or "" when that is falsy; `usage` is the
 *   response's `usageMetadata` or null; `model` is the chain entry that
 *   actually produced the response.
 * @throws The error from the failing model when it is not
 *   fallback-eligible, or when it came from the last model in the chain.
 */
async function analyzeText({ prompt }) {
  let lastErr;
  for (let modelIdx = 0; modelIdx < anChain.length; modelIdx++) {
    const model = anChain[modelIdx];
    try {
      const result = await ai.models.generateContent({
        model,
        contents: [
          {
            role: "user",
            parts: [{ text: prompt }],
          },
        ],
      });
      return {
        text: safeText(result) || "",
        usage: result?.usageMetadata || null,
        model,
      };
    } catch (err) {
      lastErr = err;
      // Only hop to the next model for eligible errors, and only when
      // there is a next model to hop to.
      const canFallback =
        isFallbackEligibleError(err) && modelIdx < anChain.length - 1;
      console.warn(
        `[gemini] analyze with ${model} failed (${err?.status || "?"}): ${err?.message || err}${canFallback ? ` — falling back to ${anChain[modelIdx + 1]}` : ""}`
      );
      if (!canFallback) throw err;
      // Otherwise the loop continues with the next model.
    }
  }
  // Defensive: only reachable when the loop body never ran (empty chain),
  // since a failure on the last model is rethrown inside the loop.
  throw lastErr || new Error("analyze: all models in fallback chain failed");
}
|
||||
|
||||
return { transcribeAudio, analyzeText };
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "recap-relay-server",
|
||||
"version": "0.2.8",
|
||||
"version": "0.2.9",
|
||||
"type": "module",
|
||||
"private": true,
|
||||
"dependencies": {
|
||||
|
||||
Reference in New Issue
Block a user