// Gemini backend forwarder. Receives a transcribe or analyze request
// from a route handler, calls the corresponding Gemini API, and
// returns a normalized result the route can wrap in the standard
// envelope.
//
// v0.1 implements:
//   - transcribeAudio({ audio: Buffer, mimeType, title?, channel?,
//       description?, chapters?, offsetSeconds? }) → { text, segments,
//       duration_seconds }
//   - analyzeText({ prompt }) → { text }
//
// Both go through @google/genai with similar prompts to Recap's
// gemini.js provider, so output shapes line up with what Recap's
// orchestration layer expects.

import { GoogleGenAI } from "@google/genai";
import fs from "fs/promises";
import os from "os";
import path from "path";

// Defaults used only when the caller doesn't supply explicit model
// names. Production callers should pass models pulled from
// relay_gemini_transcription_model / relay_gemini_analysis_model in
// the relay config so the operator can swap SKUs (e.g. flash for
// analysis) without rebuilding the relay.
const DEFAULT_TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
const DEFAULT_ANALYSIS_MODEL = "gemini-3.1-pro-preview";

// Maximum number of generateContent attempts when the response text
// comes back empty.
const EMPTY_RETRIES = 3;

// How long to wait for an uploaded file to leave the PROCESSING state,
// and how often to poll for its state while waiting.
const FILE_PROCESSING_TIMEOUT_MS = 5 * 60 * 1000;
const FILE_POLL_INTERVAL_MS = 3000;

// Safety settings sent with every transcription request: all four
// categories are set to BLOCK_NONE.
const TRANSCRIPTION_SAFETY = [
  { category: "HARM_CATEGORY_HARASSMENT", threshold: "BLOCK_NONE" },
  { category: "HARM_CATEGORY_HATE_SPEECH", threshold: "BLOCK_NONE" },
  { category: "HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold: "BLOCK_NONE" },
  { category: "HARM_CATEGORY_DANGEROUS_CONTENT", threshold: "BLOCK_NONE" },
];

/**
 * Build a Gemini backend bound to one API key and a pair of model ids.
 *
 * @param {object} [options]
 * @param {string} options.apiKey - Gemini API key (required).
 * @param {string} [options.transcriptionModel] - Model id used by transcribeAudio.
 * @param {string} [options.analysisModel] - Model id used by analyzeText.
 * @param {number} [options.timeoutMs=900000] - Passed to the client's httpOptions.
 * @returns {{ transcribeAudio: Function, analyzeText: Function }}
 * @throws {Error} When apiKey is missing.
 */
export function createGeminiBackend({
  apiKey,
  transcriptionModel = DEFAULT_TRANSCRIPTION_MODEL,
  analysisModel = DEFAULT_ANALYSIS_MODEL,
  timeoutMs = 900_000,
} = {}) {
  if (!apiKey) {
    throw new Error("createGeminiBackend: apiKey is required");
  }

  const ai = new GoogleGenAI({
    apiKey,
    httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
  });

  // Flash models accept `thinkingLevel: "minimal"`; Pro models reject
  // it. Detect from the model id so the operator can flip flash <-> pro
  // via the StartOS action without breaking the request.
  const txIsFlash = /flash/i.test(transcriptionModel);

  /**
   * Upload an audio buffer to the Gemini Files API and ask the
   * transcription model for a timestamped transcript.
   *
   * @param {object} request
   * @param {Buffer} request.audio - Raw audio bytes.
   * @param {string} request.mimeType - MIME type sent with the upload and the prompt.
   * @param {string} [request.title] - Optional context for the prompt.
   * @param {string} [request.channel] - Optional context for the prompt.
   * @param {string} [request.description] - Optional context for the prompt.
   * @param {Array<{start_time?: number, title?: string}>} [request.chapters]
   * @param {number} [request.offsetSeconds] - Accepted for interface
   *   compatibility; not read by this function.
   * @returns {Promise<{text: string, segments: Array, duration_seconds: number}>}
   *   `text` is "" when every attempt returned an empty response.
   * @throws {Error} When file processing fails or exceeds the timeout,
   *   or when any underlying API / filesystem call rejects.
   */
  async function transcribeAudio({
    audio,
    mimeType,
    title = "",
    channel = "",
    description = "",
    chapters = [],
    // eslint-disable-next-line no-unused-vars
    offsetSeconds = 0,
  }) {
    // The Files API requires a path on disk; write to a temp file.
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "relay-tx-"));
    // Set once the upload succeeds so `finally` knows there is a remote
    // artifact to delete.
    let uploadedName;

    try {
      // Inside the try so a failed write still removes tmpDir.
      const tmpPath = path.join(tmpDir, "audio.bin");
      await fs.writeFile(tmpPath, audio);

      const uploaded = await ai.files.upload({
        file: tmpPath,
        config: { mimeType },
      });
      uploadedName = uploaded.name;

      // Poll until the file leaves PROCESSING or the deadline passes.
      let file = uploaded;
      const pollStart = Date.now();
      while (file.state === "PROCESSING") {
        if (Date.now() - pollStart > FILE_PROCESSING_TIMEOUT_MS) {
          throw new Error("Gemini file processing exceeded 5 min");
        }
        await new Promise((resolve) => setTimeout(resolve, FILE_POLL_INTERVAL_MS));
        file = await ai.files.get({ name: file.name });
      }
      if (file.state === "FAILED") {
        throw new Error("Gemini failed to process audio file");
      }

      const prompt = buildTranscriptionPrompt({ title, channel, description, chapters });

      // Retry only on an empty response; a thrown error propagates.
      let result;
      for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
        result = await ai.models.generateContent({
          model: transcriptionModel,
          config: {
            // thinkingLevel: "minimal" is only valid for Flash. Pro
            // models reject it. Skip when the operator picks a Pro
            // model for transcription (slower but valid).
            ...(txIsFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}),
            safetySettings: TRANSCRIPTION_SAFETY,
          },
          contents: [
            {
              role: "user",
              parts: [
                { fileData: { fileUri: file.uri, mimeType } },
                { text: prompt },
              ],
            },
          ],
        });
        if (safeText(result)) break;
      }

      return {
        text: safeText(result) || "",
        // Gemini returns a single timestamped blob — segments are
        // parsed client-side by the orchestration layer. We could
        // pre-parse here but Recap already has parseTimestampedTranscript
        // that handles this exact shape.
        segments: [],
        duration_seconds: 0,
      };
    } finally {
      // Best-effort cleanup of the uploaded File API artifact. Done in
      // `finally` so failures after the upload do not leave it behind.
      if (uploadedName) {
        try {
          await ai.files.delete({ name: uploadedName });
        } catch {
          // Deliberately ignored: cleanup must not mask the real outcome.
        }
      }
      // Best-effort cleanup of the local temp directory.
      try {
        await fs.rm(tmpDir, { recursive: true, force: true });
      } catch {
        // Deliberately ignored: cleanup must not mask the real outcome.
      }
    }
  }

  /**
   * Send a text-only prompt to the analysis model.
   *
   * @param {object} request
   * @param {string} request.prompt - Prompt text sent as a single user turn.
   * @returns {Promise<{text: string}>} `text` is "" when the response has no text.
   */
  async function analyzeText({ prompt }) {
    const result = await ai.models.generateContent({
      model: analysisModel,
      contents: [
        {
          role: "user",
          parts: [{ text: prompt }],
        },
      ],
    });
    return {
      text: safeText(result) || "",
    };
  }

  return { transcribeAudio, analyzeText };
}

/**
 * Extract response text without throwing.
 *
 * Tries `r.text` first, then falls back to joining the `text` of every
 * part in the first candidate. Each access is wrapped in try/catch so a
 * throwing accessor yields "" instead of an exception.
 *
 * @param {object} [r] - A generateContent response (may be undefined).
 * @returns {string} The extracted text, or "" when none is available.
 */
function safeText(r) {
  try {
    if (r?.text) return r.text;
  } catch {
    // Fall through to the candidates-based extraction.
  }
  try {
    const parts = r?.candidates?.[0]?.content?.parts;
    if (parts) return parts.map((p) => p?.text || "").join("");
  } catch {
    // Fall through to the empty default.
  }
  return "";
}

/**
 * Build the transcription prompt, prefixed with whatever video
 * metadata is available.
 *
 * @param {object} [meta]
 * @param {string} [meta.title]
 * @param {string} [meta.channel]
 * @param {string} [meta.description] - Truncated to 1500 characters plus "…".
 * @param {Array<{start_time?: number, title?: string}>} [meta.chapters]
 *   Only the first 30 entries are used. A missing, non-finite, or
 *   negative start_time is rendered as 0:00; null entries are tolerated.
 * @returns {string} The full prompt text.
 */
function buildTranscriptionPrompt({ title, channel, description, chapters } = {}) {
  let ctx = "";
  if (title) ctx += `Video title: "${title}"\n`;
  if (channel) ctx += `Channel: ${channel}\n`;
  if (description) {
    const d = description.length > 1500 ? description.slice(0, 1500) + "…" : description;
    ctx += `Video description (use to identify speakers by name):\n${d}\n`;
  }
  if (Array.isArray(chapters) && chapters.length > 0) {
    const lines = chapters
      .slice(0, 30)
      .map((c) => {
        const raw = c?.start_time;
        // Guard against NaN / Infinity / negatives, which would render
        // as nonsense timestamps such as "NaN:NaN".
        const start = Number.isFinite(raw) && raw >= 0 ? raw : 0;
        const mm = Math.floor(start / 60);
        const ss = Math.floor(start % 60).toString().padStart(2, "0");
        return ` [${mm}:${ss}] ${c?.title || ""}`;
      })
      .join("\n");
    ctx += `Chapter markers:\n${lines}\n`;
  }
  if (ctx) ctx += "\n";

  return `${ctx}Transcribe this audio completely and verbatim. Include timestamps at regular intervals (every 15-30 seconds or at natural pauses).

Format each line as:
[MM:SS] The spoken text here...

Rules:
- Transcribe EVERY word spoken, do not skip or summarize anything.
- Use [MM:SS] or [H:MM:SS] timestamp format at the start of each line.
- Start a new timestamped line every 15-30 seconds or at natural speech pauses.
- Include filler words (um, uh, you know) for accuracy.
- Speaker identification: FIRST consult the metadata above — descriptions and chapter titles usually name the host(s) and guest(s) explicitly. Format as: [MM:SS] Name: text. Only fall back to "Host"/"Guest" if no names appear.

Return ONLY the timestamped transcript, nothing else.`;
}