// recap-relay/server/backends/gemini.js

// Gemini backend forwarder. Receives a transcribe or analyze request
// from a route handler, calls the corresponding Gemini API, and
// returns a normalized result the route can wrap in the standard
// envelope.
//
// v0.1 implements:
//   - transcribeAudio({ audio: Buffer, mimeType, title?, channel?,
//     description?, chapters?, offsetSeconds? }) → { text, segments,
//     duration_seconds, usage, model }
//   - analyzeText({ prompt }) → { text, usage, model }
//
// Both go through @google/genai with similar prompts to Recap's
// gemini.js provider, so output shapes line up with what Recap's
// orchestration layer expects.

import { GoogleGenAI } from "@google/genai";
import fs from "fs/promises";
import os from "os";
import path from "path";

// Defaults used only when the caller doesn't supply explicit model
// names. Production callers should pass models pulled from
// relay_gemini_transcription_model / relay_gemini_analysis_model in
// the relay config so the operator can swap SKUs (e.g. flash for
// analysis) without rebuilding the relay.
const DEFAULT_TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
const DEFAULT_ANALYSIS_MODEL = "gemini-3.1-pro-preview";
// Upper bound on generateContent attempts per transcription: the call is
// repeated while the response text is empty, so this counts total tries
// (the first call included), not additional retries.
const EMPTY_RETRIES = 3;

// Safety settings sent with every transcription request: each listed
// harm category gets the BLOCK_NONE threshold.
// NOTE(review): presumably so verbatim transcripts of sensitive speech
// are not filtered out by the model — confirm with the operator.
const TRANSCRIPTION_SAFETY = [
  "HARM_CATEGORY_HARASSMENT",
  "HARM_CATEGORY_HATE_SPEECH",
  "HARM_CATEGORY_SEXUALLY_EXPLICIT",
  "HARM_CATEGORY_DANGEROUS_CONTENT",
].map((category) => ({ category, threshold: "BLOCK_NONE" }));

/**
 * Build a Gemini backend bound to one API key and a pair of model ids.
 *
 * @param {object} [options]
 * @param {string} options.apiKey - Gemini API key; required.
 * @param {string} [options.transcriptionModel] - Model id used by
 *   `transcribeAudio`.
 * @param {string} [options.analysisModel] - Model id used by `analyzeText`.
 * @param {number} [options.timeoutMs=900000] - Forwarded to the SDK's
 *   `httpOptions` as both `timeout` and `headersTimeout`.
 * @returns {{ transcribeAudio: Function, analyzeText: Function }}
 * @throws {Error} When `apiKey` is missing.
 */
export function createGeminiBackend({
  apiKey,
  transcriptionModel = DEFAULT_TRANSCRIPTION_MODEL,
  analysisModel = DEFAULT_ANALYSIS_MODEL,
  timeoutMs = 900_000,
} = {}) {
  if (!apiKey) {
    throw new Error("createGeminiBackend: apiKey is required");
  }
  const ai = new GoogleGenAI({
    apiKey,
    httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
  });
  // Flash models accept `thinkingLevel: "minimal"`; Pro models reject
  // it. Detect from the model id so the operator can flip flash <-> pro
  // via the StartOS action without breaking the request.
  const txIsFlash = /flash/i.test(transcriptionModel);

  // Polling cadence and overall budget while Gemini processes an upload.
  const FILE_POLL_INTERVAL_MS = 3000;
  const FILE_PROCESSING_TIMEOUT_MS = 5 * 60 * 1000;

  // Poll the Files API until the upload leaves the PROCESSING state.
  // Throws when processing exceeds the budget or ends in FAILED.
  async function waitForFileReady(uploaded) {
    let file = uploaded;
    const startedAt = Date.now();
    while (file.state === "PROCESSING") {
      if (Date.now() - startedAt > FILE_PROCESSING_TIMEOUT_MS) {
        throw new Error("Gemini file processing exceeded 5 min");
      }
      await new Promise((resolve) => setTimeout(resolve, FILE_POLL_INTERVAL_MS));
      file = await ai.files.get({ name: file.name });
    }
    if (file.state === "FAILED") {
      throw new Error("Gemini failed to process audio file");
    }
    return file;
  }

  /**
   * Upload audio to the Gemini Files API and request a timestamped
   * transcript from the transcription model.
   *
   * @param {object} params
   * @param {Buffer} params.audio - Raw audio bytes.
   * @param {string} params.mimeType - MIME type sent with the upload and
   *   the generate request.
   * @param {string} [params.title] - Prompt context.
   * @param {string} [params.channel] - Prompt context.
   * @param {string} [params.description] - Prompt context.
   * @param {Array} [params.chapters] - Prompt context.
   * @param {number} [params.offsetSeconds] - Accepted for interface
   *   compatibility; not read by this function.
   * @returns {Promise<{ text: string, segments: Array,
   *   duration_seconds: number, usage: object|null, model: string }>}
   *   `text` is "" when every attempt came back empty; `usage` is the
   *   usageMetadata of the last attempt only.
   * @throws {Error} On upload, processing or generation failure.
   */
  async function transcribeAudio({
    audio,
    mimeType,
    title = "",
    channel = "",
    description = "",
    chapters = [],
    offsetSeconds = 0,
  }) {
    // The Files API requires a path on disk; write to a temp file.
    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "relay-tx-"));
    // Name of the remote upload, recorded as soon as it exists so the
    // finally block can delete it on failure paths as well.
    let uploadedName = null;
    try {
      const tmpPath = path.join(tmpDir, "audio.bin");
      // Inside the try so a failed write still removes the temp dir.
      await fs.writeFile(tmpPath, audio);

      const uploaded = await ai.files.upload({
        file: tmpPath,
        config: { mimeType },
      });
      uploadedName = uploaded.name;
      const file = await waitForFileReady(uploaded);

      const prompt = buildTranscriptionPrompt({ title, channel, description, chapters });
      let result;
      // Repeat the request while the model returns empty text.
      for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
        result = await ai.models.generateContent({
          model: transcriptionModel,
          config: {
            // thinkingLevel: "minimal" is only valid for Flash. Pro
            // models reject it. Skip when the operator picks a Pro
            // model for transcription (slower but valid).
            ...(txIsFlash ? { thinkingConfig: { thinkingLevel: "minimal" } } : {}),
            safetySettings: TRANSCRIPTION_SAFETY,
          },
          contents: [
            {
              role: "user",
              parts: [
                { fileData: { fileUri: file.uri, mimeType } },
                { text: prompt },
              ],
            },
          ],
        });
        if (safeText(result)) break;
      }

      const text = safeText(result) || "";
      return {
        text,
        // Gemini returns a single timestamped blob — segments are
        // parsed client-side by the orchestration layer. We could
        // pre-parse here but Recap already has parseTimestampedTranscript
        // that handles this exact shape.
        segments: [],
        duration_seconds: 0,
        // Pass usage + the model id back to the route so audit-log
        // entries can include token counts + computed cost.
        usage: result?.usageMetadata || null,
        model: transcriptionModel,
      };
    } finally {
      // Best-effort cleanup of the uploaded File API artifact. Runs on
      // success and failure; a cleanup error must not mask the outcome
      // of the transcription itself, so it is deliberately ignored.
      if (uploadedName) {
        try { await ai.files.delete({ name: uploadedName }); } catch {}
      }
      try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
    }
  }

  /**
   * Send a text-only prompt to the analysis model.
   *
   * @param {object} params
   * @param {string} params.prompt - Prompt text sent as a single user part.
   * @returns {Promise<{ text: string, usage: object|null, model: string }>}
   */
  async function analyzeText({ prompt }) {
    const result = await ai.models.generateContent({
      model: analysisModel,
      contents: [
        {
          role: "user",
          parts: [{ text: prompt }],
        },
      ],
    });
    return {
      text: safeText(result) || "",
      usage: result?.usageMetadata || null,
      model: analysisModel,
    };
  }

  return { transcribeAudio, analyzeText };
}

/**
 * Extract response text from a generateContent result without throwing.
 *
 * Tries the `text` accessor first; when that is empty or throws, joins
 * the `text` of every part of the first candidate (parts without text
 * contribute nothing). Any failure along the way yields "".
 *
 * @param {object|null|undefined} r - Result object from generateContent.
 * @returns {string} The extracted text, or "" when none is available.
 */
function safeText(r) {
  // Stage 1: the convenience accessor.
  try {
    const hasDirectText = Boolean(r?.text);
    if (hasDirectText) {
      return r.text;
    }
  } catch {
    // Accessor threw — fall back to walking the candidate parts.
  }

  // Stage 2: concatenate the parts of the first candidate.
  let parts;
  try {
    parts = r?.candidates?.[0]?.content?.parts;
  } catch {
    return "";
  }
  if (!parts) {
    return "";
  }
  try {
    return parts.map(({ text }) => text || "").join("");
  } catch {
    return "";
  }
}

// Cap a description at `limit` UTF-16 code units, appending "…" when cut.
// If the cut would land between the two halves of a surrogate pair, the
// dangling high surrogate is dropped so the prompt stays well-formed.
// Non-string values are returned untouched.
function truncateDescriptionForPrompt(description, limit = 1500) {
  if (typeof description !== "string" || description.length <= limit) {
    return description;
  }
  const lastUnit = description.charCodeAt(limit - 1);
  const splitsSurrogatePair = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  const end = splitsSurrogatePair ? limit - 1 : limit;
  return `${description.slice(0, end)}…`;
}

// Render one chapter as "  [M:SS] title". A start_time that is missing,
// non-finite or negative is rendered as 0:00.
function formatChapterLineForPrompt(chapter) {
  const raw = chapter.start_time;
  const start = Number.isFinite(raw) && raw > 0 ? raw : 0;
  const mm = Math.floor(start / 60);
  const ss = String(Math.floor(start % 60)).padStart(2, "0");
  return `  [${mm}:${ss}] ${chapter.title || ""}`;
}

/**
 * Build the transcription prompt, prefixed with whatever video metadata
 * is available.
 *
 * @param {object} [meta]
 * @param {string} [meta.title] - Video title.
 * @param {string} [meta.channel] - Channel name.
 * @param {string} [meta.description] - Video description; at most 1500
 *   UTF-16 code units are embedded.
 * @param {Array<{ start_time?: number, title?: string }>} [meta.chapters]
 *   Chapter markers; `start_time` is formatted as seconds. Null/undefined
 *   entries are skipped and at most 30 markers are embedded.
 * @returns {string} The full prompt text.
 */
function buildTranscriptionPrompt({ title, channel, description, chapters } = {}) {
  let ctx = "";
  if (title) ctx += `Video title: "${title}"\n`;
  if (channel) ctx += `Channel: ${channel}\n`;
  if (description) {
    const excerpt = truncateDescriptionForPrompt(description);
    ctx += `Video description (use to identify speakers by name):\n${excerpt}\n`;
  }
  if (Array.isArray(chapters) && chapters.length > 0) {
    // Chapters are only a hint for the model, so a malformed entry must
    // not abort the whole transcription: drop null/undefined entries.
    const lines = chapters
      .filter((chapter) => chapter !== null && chapter !== undefined)
      .slice(0, 30)
      .map(formatChapterLineForPrompt);
    if (lines.length > 0) {
      ctx += `Chapter markers:\n${lines.join("\n")}\n`;
    }
  }
  if (ctx) ctx += "\n";

  return `${ctx}Transcribe this audio completely and verbatim. Include timestamps at regular intervals (every 15-30 seconds or at natural pauses).

Format each line as:
[MM:SS] The spoken text here...

Rules:
- Transcribe EVERY word spoken, do not skip or summarize anything.
- Use [MM:SS] or [H:MM:SS] timestamp format at the start of each line.
- Start a new timestamped line every 15-30 seconds or at natural speech pauses.
- Include filler words (um, uh, you know) for accuracy.
- Speaker identification: FIRST consult the metadata above — descriptions and chapter titles usually name the host(s) and guest(s) explicitly. Format as: [MM:SS] Name: text. Only fall back to "Host"/"Guest" if no names appear.

Return ONLY the timestamped transcript, nothing else.`;
}