285 lines
11 KiB
JavaScript
285 lines
11 KiB
JavaScript
// Gemini backend forwarder. Receives a transcribe or analyze request
|
|
// from a route handler, calls the corresponding Gemini API, and
|
|
// returns a normalized result the route can wrap in the standard
|
|
// envelope.
|
|
//
|
|
// v0.1 implements:
|
|
// - transcribeAudio({ audio: Buffer, mimeType, title?, channel?,
// description?, chapters?, offsetSeconds? }) → { text, segments,
// duration_seconds, usage, model }
// - analyzeText({ prompt }) → { text, usage, model }
|
|
//
|
|
// Both go through @google/genai with similar prompts to Recap's
|
|
// gemini.js provider, so output shapes line up with what Recap's
|
|
// orchestration layer expects.
|
|
|
|
import { GoogleGenAI } from "@google/genai";
|
|
import fs from "fs/promises";
|
|
import os from "os";
|
|
import path from "path";
|
|
|
|
// Model names applied only when the caller omits them. Production
// callers are expected to pass the models configured under
// relay_gemini_transcription_model / relay_gemini_analysis_model in
// the relay config, so the operator can switch SKUs (e.g. flash for
// analysis) without rebuilding the relay.
const DEFAULT_TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
const DEFAULT_ANALYSIS_MODEL = "gemini-3.1-pro-preview";

// Upper bound on generateContent attempts made against a single model
// while the transcription response keeps coming back without text.
const EMPTY_RETRIES = 3;
|
|
|
|
// Fallback order for each pipeline, listed newest/most-expensive first
// and older/cheaper last. When the operator-selected primary model
// fails with a retryable error (503 capacity, 429 rate limit, etc.)
// the relay moves DOWN the list and never up, because the operator's
// choice reflects their preferred price/quality point. The list is
// sliced starting at the primary, so choosing 2.5-flash can only fall
// back to 2.0-flash and never climbs back to 3-flash.
const TRANSCRIPTION_FALLBACK_CHAIN = ["gemini-3-flash-preview", "gemini-2.5-flash", "gemini-2.0-flash"];

const ANALYSIS_FALLBACK_CHAIN = [
  "gemini-3.1-pro-preview", "gemini-3-pro-preview",
  "gemini-3-flash-preview", "gemini-2.5-flash",
];
|
|
|
|
/**
 * Return the portion of `chain` that begins at `primary`.
 *
 * When `primary` is not a member of the chain (unknown name or typo)
 * the result is a one-element list holding only `primary`, i.e. no
 * fallback is available. The returned array is always a new one, so
 * callers may iterate or modify it without touching `chain`.
 *
 * @param {string[]} chain - Ordered list of model names.
 * @param {string} primary - Model to start from.
 * @returns {string[]} `primary` followed by the entries after it.
 */
function fallbackChain(chain, primary) {
  const start = chain.indexOf(primary);
  return start === -1 ? [primary] : chain.slice(start);
}
|
|
|
|
/**
 * Decide whether an error justifies moving on to the next model in a
 * fallback chain.
 *
 * Capacity problems, rate limits and network blips qualify. Auth
 * failures and other 4xx errors do not: they would fail the same way
 * on every model.
 *
 * @param {unknown} err - Whatever was thrown by the SDK call.
 * @returns {boolean} True when trying another model is worthwhile.
 */
function isFallbackEligibleError(err) {
  const retryableStatuses = [503, 429, 529];
  const transientMessage =
    /overloaded|unavailable|capacity|high demand|rate limit|fetch failed|ECONNRESET|ETIMEDOUT|socket hang up|network/i;

  // Status may live on either property; 0 stands for "no status known".
  const status = err?.status || err?.httpStatusCode || 0;
  // Non-Error throwables (strings, null, ...) are matched on their string form.
  const msg = err?.message || String(err);

  return retryableStatuses.includes(status) || transientMessage.test(msg);
}
|
|
|
|
// Safety settings sent with every transcription request: each listed
// harm category is paired with the "BLOCK_NONE" threshold.
const TRANSCRIPTION_SAFETY = [
  "HARM_CATEGORY_HARASSMENT",
  "HARM_CATEGORY_HATE_SPEECH",
  "HARM_CATEGORY_SEXUALLY_EXPLICIT",
  "HARM_CATEGORY_DANGEROUS_CONTENT",
].map((category) => ({ category, threshold: "BLOCK_NONE" }));
|
|
|
|
/**
 * Create a Gemini backend bound to one API key and one pair of model
 * fallback chains.
 *
 * @param {object} [options]
 * @param {string} options.apiKey - Gemini API key. Required.
 * @param {string} [options.transcriptionModel] - Primary model for
 *   transcribeAudio; defaults to DEFAULT_TRANSCRIPTION_MODEL.
 * @param {string} [options.analysisModel] - Primary model for
 *   analyzeText; defaults to DEFAULT_ANALYSIS_MODEL.
 * @param {number} [options.timeoutMs=900000] - Forwarded to the SDK's
 *   httpOptions as both `timeout` and `headersTimeout`.
 * @returns {{ transcribeAudio: Function, analyzeText: Function }}
 * @throws {Error} When `apiKey` is missing.
 */
export function createGeminiBackend({
  apiKey,
  transcriptionModel = DEFAULT_TRANSCRIPTION_MODEL,
  analysisModel = DEFAULT_ANALYSIS_MODEL,
  timeoutMs = 900_000,
} = {}) {
  if (!apiKey) {
    throw new Error("createGeminiBackend: apiKey is required");
  }

  const ai = new GoogleGenAI({
    apiKey,
    httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
  });

  // Per-call fallback chains. Each starts at the model supplied by the
  // caller and continues with the lower-tier entries after it (we never
  // fall back UP). A primary that is not in the chain yields a
  // one-element chain, i.e. no fallback.
  const txChain = fallbackChain(TRANSCRIPTION_FALLBACK_CHAIN, transcriptionModel);
  const anChain = fallbackChain(ANALYSIS_FALLBACK_CHAIN, analysisModel);

  // Polling cadence and ceiling while an uploaded file is PROCESSING.
  const FILE_POLL_INTERVAL_MS = 3000;
  const FILE_PROCESSING_TIMEOUT_MS = 5 * 60 * 1000;

  /**
   * Run `attempt(model)` against each model in `chain` until one
   * succeeds. A fallback-eligible error (see isFallbackEligibleError)
   * moves on to the next model; any other error, or an eligible error
   * on the last model, is rethrown unchanged. Every failure is logged
   * with console.warn.
   *
   * @param {string[]} chain - Models to try, in order.
   * @param {string} label - Operation name used in log/error text.
   * @param {(model: string) => Promise<object>} attempt
   * @returns {Promise<object>} Whatever `attempt` resolved to.
   */
  async function runWithFallback(chain, label, attempt) {
    let lastErr;
    for (let modelIdx = 0; modelIdx < chain.length; modelIdx++) {
      const model = chain[modelIdx];
      try {
        // `await` is required so rejections are caught by this try.
        return await attempt(model);
      } catch (err) {
        lastErr = err;
        const canFallback =
          isFallbackEligibleError(err) && modelIdx < chain.length - 1;
        console.warn(
          `[gemini] ${label} with ${model} failed (${err?.status || "?"}): ${err?.message || err}${canFallback ? ` — falling back to ${chain[modelIdx + 1]}` : ""}`
        );
        if (!canFallback) throw err;
        // Otherwise the loop continues with the next model.
      }
    }
    // Only reachable with an empty chain; kept as a defensive guard.
    throw lastErr || new Error(`${label}: all models in fallback chain failed`);
  }

  /**
   * Poll the Files API until `file` leaves the PROCESSING state.
   *
   * @param {object} file - File object returned by ai.files.upload.
   * @returns {Promise<object>} The latest file object.
   * @throws {Error} After 5 minutes of PROCESSING, or on state FAILED.
   */
  async function waitUntilProcessed(file) {
    let current = file;
    const startedAt = Date.now();
    while (current.state === "PROCESSING") {
      if (Date.now() - startedAt > FILE_PROCESSING_TIMEOUT_MS) {
        throw new Error("Gemini file processing exceeded 5 min");
      }
      await new Promise((resolve) => setTimeout(resolve, FILE_POLL_INTERVAL_MS));
      current = await ai.files.get({ name: current.name });
    }
    if (current.state === "FAILED") {
      throw new Error("Gemini failed to process audio file");
    }
    return current;
  }

  /**
   * Upload `audio` to the Gemini Files API and ask a model from the
   * transcription chain for a timestamped transcript.
   *
   * `segments` is always [] and `duration_seconds` is always 0 in the
   * returned object. `model` is the model that ACTUALLY served the
   * request, so the caller can record a fallback rather than just the
   * requested model.
   *
   * `offsetSeconds` is accepted so existing callers keep working, but
   * this function never reads it.
   *
   * @param {object} request
   * @param {Buffer} request.audio - Audio bytes, written to a temp file.
   * @param {string} request.mimeType - MIME type sent with the upload
   *   and with the generate request.
   * @param {string} [request.title]
   * @param {string} [request.channel]
   * @param {string} [request.description]
   * @param {Array<object>} [request.chapters]
   * @param {number} [request.offsetSeconds] - Currently unused.
   * @returns {Promise<{text: string, segments: Array, duration_seconds: number, usage: (object|null), model: string}>}
   */
  async function transcribeAudio({
    audio,
    mimeType,
    title = "",
    channel = "",
    description = "",
    chapters = [],
    offsetSeconds = 0,
  }) {
    // The upload call is given a path on disk, so stage the bytes in a
    // private temp directory first.
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "relay-tx-"));
    let remoteFile = null;
    try {
      // The write happens inside the try so a failure here still
      // removes tmpDir in the finally below.
      const tmpPath = path.join(tmpDir, "audio.bin");
      await fs.writeFile(tmpPath, audio);

      remoteFile = await ai.files.upload({
        file: tmpPath,
        config: { mimeType },
      });
      remoteFile = await waitUntilProcessed(remoteFile);
      const fileUri = remoteFile.uri;

      const prompt = buildTranscriptionPrompt({ title, channel, description, chapters });

      return await runWithFallback(txChain, "transcribe", async (model) => {
        const isFlash = /flash/i.test(model);
        let result;
        // Empty-response retries: when the SDK returns 200 with no
        // text, ask the SAME model again, up to EMPTY_RETRIES attempts.
        // NOTE(review): if every attempt is empty, the empty result is
        // returned as-is; an empty response does not trigger a model
        // fallback. Confirm that is the intended contract.
        for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
          result = await ai.models.generateContent({
            model,
            config: {
              // thinkingLevel: "minimal" is only valid for Flash.
              // Pro models reject it, so it is omitted whenever the
              // model name does not contain "flash".
              ...(isFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}),
              safetySettings: TRANSCRIPTION_SAFETY,
            },
            contents: [
              {
                role: "user",
                parts: [
                  { fileData: { fileUri, mimeType } },
                  { text: prompt },
                ],
              },
            ],
          });
          if (safeText(result)) break;
        }

        return {
          text: safeText(result) || "",
          segments: [],
          duration_seconds: 0,
          usage: result?.usageMetadata || null,
          model,
        };
      });
    } finally {
      // Best-effort cleanup of the uploaded Files API artifact. Running
      // it here covers every exit path: success, generate failure,
      // processing timeout/FAILED state, and prompt-building errors.
      if (remoteFile?.name) {
        try {
          await ai.files.delete({ name: remoteFile.name });
        } catch {
          // Deliberately ignored: cleanup must not mask the real outcome.
        }
      }
      try {
        await fs.rm(tmpDir, { recursive: true, force: true });
      } catch {
        // Deliberately ignored: temp-dir removal is best-effort.
      }
    }
  }

  /**
   * Send a text-only prompt to a model from the analysis chain.
   *
   * @param {object} request
   * @param {string} request.prompt - Sent as the single user part.
   * @returns {Promise<{text: string, usage: (object|null), model: string}>}
   *   `model` is the model that actually served the request.
   */
  async function analyzeText({ prompt }) {
    return runWithFallback(anChain, "analyze", async (model) => {
      const result = await ai.models.generateContent({
        model,
        contents: [
          {
            role: "user",
            parts: [{ text: prompt }],
          },
        ],
      });
      return {
        text: safeText(result) || "",
        usage: result?.usageMetadata || null,
        model,
      };
    });
  }

  return { transcribeAudio, analyzeText };
}
|
|
|
|
/**
 * Pull the text out of a generateContent response without ever
 * throwing.
 *
 * Tries `r.text` first; if that is falsy or reading it throws, joins
 * the `text` of every part of the first candidate (parts without text
 * contribute ""). Returns "" when neither source yields anything.
 *
 * @param {object|null|undefined} r - Response object from the SDK.
 * @returns {string} Extracted text, or "".
 */
function safeText(r) {
  const NOTHING = Symbol("nothing");

  const extractors = [
    // Direct `text` property/accessor on the response.
    () => (r?.text ? r.text : NOTHING),
    // Concatenation of the first candidate's parts.
    () => {
      const parts = r?.candidates?.[0]?.content?.parts;
      return parts ? parts.map((p) => p.text || "").join("") : NOTHING;
    },
  ];

  for (const extract of extractors) {
    try {
      const value = extract();
      if (value !== NOTHING) return value;
    } catch {
      // Any failure in one extractor just means "try the next one".
    }
  }
  return "";
}
|
|
|
|
/**
 * Assemble the transcription prompt: an optional metadata preamble
 * followed by fixed transcription instructions.
 *
 * The preamble contains, in order and only when present: the title,
 * the channel, the description (cut to 1500 characters plus "…"), and
 * up to 30 chapter markers rendered as "[M:SS] title". A chapter whose
 * `start_time` is not a number is rendered at 0:00.
 *
 * @param {object} [meta]
 * @param {string} [meta.title]
 * @param {string} [meta.channel]
 * @param {string} [meta.description]
 * @param {Array<{start_time?: number, title?: string}>} [meta.chapters]
 * @returns {string} The complete prompt text.
 */
function buildTranscriptionPrompt({ title, channel, description, chapters } = {}) {
  const sections = [];

  if (title) sections.push(`Video title: "${title}"`);
  if (channel) sections.push(`Channel: ${channel}`);

  if (description) {
    const clipped =
      description.length > 1500 ? description.slice(0, 1500) + "…" : description;
    sections.push(`Video description (use to identify speakers by name):\n${clipped}`);
  }

  if (Array.isArray(chapters) && chapters.length > 0) {
    const formatMarker = (chapter) => {
      const seconds = typeof chapter.start_time === "number" ? chapter.start_time : 0;
      const minutes = Math.floor(seconds / 60);
      const remainder = String(Math.floor(seconds % 60)).padStart(2, "0");
      return ` [${minutes}:${remainder}] ${chapter.title || ""}`;
    };
    const markers = chapters.slice(0, 30).map(formatMarker).join("\n");
    sections.push(`Chapter markers:\n${markers}`);
  }

  // Each section ends with a newline, and a blank line separates the
  // preamble from the instructions. No preamble means no blank line.
  const context = sections.length > 0 ? `${sections.join("\n")}\n\n` : "";

  const instructions = [
    "Transcribe this audio completely and verbatim. Include timestamps at regular intervals (every 15-30 seconds or at natural pauses).",
    "",
    "Format each line as:",
    "[MM:SS] The spoken text here...",
    "",
    "Rules:",
    "- Transcribe EVERY word spoken, do not skip or summarize anything.",
    "- Use [MM:SS] or [H:MM:SS] timestamp format at the start of each line.",
    "- Start a new timestamped line every 15-30 seconds or at natural speech pauses.",
    "- Include filler words (um, uh, you know) for accuracy.",
    "- Speaker identification: FIRST consult the metadata above — descriptions and chapter titles usually name the host(s) and guest(s) explicitly. Format as: [MM:SS] Name: text. Only fall back to \"Host\"/\"Guest\" if no names appear.",
    "",
    "Return ONLY the timestamped transcript, nothing else.",
  ].join("\n");

  return context + instructions;
}
|