initial relay scaffold

2026-05-11 20:03:27 -05:00
commit b9d86fa303
58 changed files with 7609 additions and 0 deletions
@@ -0,0 +1,176 @@
+// Gemini backend forwarder. Receives a transcribe or analyze request
+// from a route handler, calls the corresponding Gemini API, and
+// returns a normalized result the route can wrap in the standard
+// envelope.
+//
+// v0.1 implements:
+//   - transcribeAudio({ audio: Buffer, mimeType, title?, channel?,
+//     description?, chapters?, offsetSeconds? }) → { text, segments,
+//     duration_seconds }
+//   - analyzeText({ prompt }) → { text }
+//
+// Both go through @google/genai with similar prompts to Recap's
+// gemini.js provider, so output shapes line up with what Recap's
+// orchestration layer expects.
+
+import { GoogleGenAI } from "@google/genai";
+import fs from "fs/promises";
+import os from "os";
+import path from "path";
+
+// Model identifiers passed as `model` to generateContent: one for audio
+// transcription, one for text analysis.
+const TRANSCRIPTION_MODEL = "gemini-3-flash-preview";
+const ANALYSIS_MODEL = "gemini-3.1-pro-preview";
+
+// Maximum number of transcription attempts made while the model keeps
+// returning empty text.
+const EMPTY_RETRIES = 3;
+
+// Safety settings sent with transcription requests: every listed harm
+// category gets the BLOCK_NONE threshold. Presumably this keeps verbatim
+// transcripts of arbitrary speech from being withheld — confirm intent.
+const TRANSCRIPTION_SAFETY = [
+  "HARM_CATEGORY_HARASSMENT",
+  "HARM_CATEGORY_HATE_SPEECH",
+  "HARM_CATEGORY_SEXUALLY_EXPLICIT",
+  "HARM_CATEGORY_DANGEROUS_CONTENT",
+].map((category) => ({ category, threshold: "BLOCK_NONE" }));
+
+/**
+ * Create the Gemini backend used by the route handlers.
+ *
+ * @param {object} [options]
+ * @param {string} options.apiKey - Gemini API key. Required.
+ * @param {number} [options.timeoutMs=900000] - Forwarded to the SDK client as
+ *   both `httpOptions.timeout` and `httpOptions.headersTimeout`.
+ * @returns {{ transcribeAudio: Function, analyzeText: Function }}
+ * @throws {Error} When `apiKey` is missing.
+ */
+export function createGeminiBackend({ apiKey, timeoutMs = 900_000 } = {}) {
+  if (!apiKey) {
+    throw new Error("createGeminiBackend: apiKey is required");
+  }
+  const ai = new GoogleGenAI({
+    apiKey,
+    httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
+  });
+
+  // How long to wait for an uploaded file to leave the PROCESSING state,
+  // and how often to re-check it.
+  const PROCESSING_TIMEOUT_MS = 5 * 60 * 1000;
+  const POLL_INTERVAL_MS = 3000;
+
+  /**
+   * Upload an audio buffer to Gemini and request a timestamped transcript.
+   *
+   * @param {object} params
+   * @param {Buffer} params.audio - Raw audio bytes.
+   * @param {string} params.mimeType - MIME type sent with the upload and the
+   *   generate request.
+   * @param {string} [params.title] - Prompt context.
+   * @param {string} [params.channel] - Prompt context.
+   * @param {string} [params.description] - Prompt context.
+   * @param {Array<object>} [params.chapters] - Prompt context.
+   * @param {number} [params.offsetSeconds] - Accepted for interface
+   *   compatibility; this implementation does not read it.
+   * @returns {Promise<{ text: string, segments: Array, duration_seconds: number }>}
+   *   `text` is "" when every attempt came back empty; `segments` is always
+   *   empty and `duration_seconds` is always 0.
+   * @throws {Error} When file processing fails or exceeds 5 minutes, or when
+   *   any SDK or filesystem call rejects.
+   */
+  async function transcribeAudio({
+    audio,
+    mimeType,
+    title = "",
+    channel = "",
+    description = "",
+    chapters = [],
+    offsetSeconds = 0,
+  }) {
+    // The Files API requires a path on disk; write to a temp file.
+    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "relay-tx-"));
+    // Tracked outside the try block so the finally clause can delete the
+    // remote copy no matter where the request fails.
+    let f;
+    try {
+      // Written inside the try so a failed write still removes tmpDir.
+      const tmpPath = path.join(tmpDir, "audio.bin");
+      await fs.writeFile(tmpPath, audio);
+
+      f = await ai.files.upload({
+        file: tmpPath,
+        config: { mimeType },
+      });
+      const pStart = Date.now();
+      while (f.state === "PROCESSING") {
+        if (Date.now() - pStart > PROCESSING_TIMEOUT_MS) {
+          throw new Error("Gemini file processing exceeded 5 min");
+        }
+        await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
+        f = await ai.files.get({ name: f.name });
+      }
+      if (f.state === "FAILED") {
+        throw new Error("Gemini failed to process audio file");
+      }
+
+      const prompt = buildTranscriptionPrompt({ title, channel, description, chapters });
+      let result;
+      // Retry only on an empty response; a thrown error propagates at once.
+      for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
+        result = await ai.models.generateContent({
+          model: TRANSCRIPTION_MODEL,
+          config: {
+            thinkingConfig: { thinkingLevel: "minimal" },
+            safetySettings: TRANSCRIPTION_SAFETY,
+          },
+          contents: [
+            {
+              role: "user",
+              parts: [
+                { fileData: { fileUri: f.uri, mimeType } },
+                { text: prompt },
+              ],
+            },
+          ],
+        });
+        if (safeText(result)) break;
+      }
+
+      const text = safeText(result) || "";
+      return {
+        text,
+        // Gemini returns a single timestamped blob — segments are
+        // parsed client-side by the orchestration layer. We could
+        // pre-parse here but Recap already has parseTimestampedTranscript
+        // that handles this exact shape.
+        segments: [],
+        duration_seconds: 0,
+      };
+    } finally {
+      // Best-effort cleanup of the uploaded File API artifact. Runs on the
+      // failure paths too, so an aborted request does not leave the audio
+      // behind on the remote side.
+      if (f?.name) {
+        try { await ai.files.delete({ name: f.name }); } catch {}
+      }
+      // Best-effort cleanup of the local temp directory.
+      try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
+    }
+  }
+
+  /**
+   * Send a single user prompt to the analysis model.
+   *
+   * @param {object} params
+   * @param {string} params.prompt - Prompt text sent as the only user part.
+   * @returns {Promise<{ text: string }>} Response text, or "" when empty.
+   */
+  async function analyzeText({ prompt }) {
+    const result = await ai.models.generateContent({
+      model: ANALYSIS_MODEL,
+      contents: [
+        {
+          role: "user",
+          parts: [{ text: prompt }],
+        },
+      ],
+    });
+    return {
+      text: safeText(result) || "",
+    };
+  }
+
+  return { transcribeAudio, analyzeText };
+}
+
+/**
+ * Extract the text of a generateContent result without ever throwing.
+ *
+ * Tries the result's `text` property first. When that is empty, not a
+ * string, or throws on access, falls back to joining the text of the first
+ * candidate's parts.
+ *
+ * @param {object|null|undefined} r - Result returned by generateContent.
+ * @returns {string} The extracted text, or "" when none is available.
+ */
+function safeText(r) {
+  try {
+    // Read the property once; it may be an accessor with side effects.
+    const direct = r?.text;
+    if (typeof direct === "string" && direct) return direct;
+  } catch {
+    // Accessor threw — fall through to the manual extraction below.
+  }
+  try {
+    const parts = r?.candidates?.[0]?.content?.parts;
+    if (Array.isArray(parts)) {
+      return parts
+        // Skip holes, non-text parts, and parts flagged as model "thought"
+        // so reasoning is never returned as answer text and an otherwise
+        // empty response is still seen as empty by callers.
+        .filter((p) => p && p.thought !== true && typeof p.text === "string")
+        .map((p) => p.text)
+        .join("");
+    }
+  } catch {
+    // Malformed response shape — treated as "no text".
+  }
+  return "";
+}
+
+/**
+ * Build the transcription prompt, prefixed with whatever video metadata is
+ * available.
+ *
+ * @param {object} [meta]
+ * @param {string} [meta.title] - Video title; omitted from the prompt when falsy.
+ * @param {string} [meta.channel] - Channel name; omitted when falsy.
+ * @param {string} [meta.description] - Video description; cut to 1500
+ *   UTF-16 units followed by "…" when longer.
+ * @param {Array<{start_time?: number, title?: string}>} [meta.chapters] -
+ *   Chapter markers; only the first 30 usable entries are included.
+ * @returns {string} The full prompt text.
+ */
+function buildTranscriptionPrompt({ title, channel, description, chapters } = {}) {
+  const MAX_DESCRIPTION_LENGTH = 1500;
+  const MAX_CHAPTERS = 30;
+
+  let ctx = "";
+  if (title) ctx += `Video title: "${title}"\n`;
+  if (channel) ctx += `Channel: ${channel}\n`;
+  if (description) {
+    const full = String(description);
+    let d = full;
+    if (full.length > MAX_DESCRIPTION_LENGTH) {
+      let cut = MAX_DESCRIPTION_LENGTH;
+      // Do not cut between the two halves of a surrogate pair; that would
+      // leave a lone high surrogate in the prompt.
+      const last = full.charCodeAt(cut - 1);
+      if (last >= 0xd800 && last <= 0xdbff) cut -= 1;
+      d = `${full.slice(0, cut)}…`;
+    }
+    ctx += `Video description (use to identify speakers by name):\n${d}\n`;
+  }
+  if (Array.isArray(chapters) && chapters.length > 0) {
+    const lines = chapters
+      // Ignore null / non-object entries instead of throwing on them.
+      .filter((c) => c !== null && typeof c === "object")
+      .slice(0, MAX_CHAPTERS)
+      .map((c) => {
+        // Missing, NaN, infinite or negative start times render as 0:00
+        // rather than "[NaN:NaN]" or a negative stamp.
+        const start =
+          Number.isFinite(c.start_time) && c.start_time > 0 ? c.start_time : 0;
+        const mm = Math.floor(start / 60);
+        const ss = Math.floor(start % 60).toString().padStart(2, "0");
+        return `  [${mm}:${ss}] ${c.title || ""}`;
+      });
+    if (lines.length > 0) {
+      ctx += `Chapter markers:\n${lines.join("\n")}\n`;
+    }
+  }
+  // Blank line between the metadata and the instructions.
+  if (ctx) ctx += "\n";
+
+  return `${ctx}Transcribe this audio completely and verbatim. Include timestamps at regular intervals (every 15-30 seconds or at natural pauses).
+
+Format each line as:
+[MM:SS] The spoken text here...
+
+Rules:
+- Transcribe EVERY word spoken, do not skip or summarize anything.
+- Use [MM:SS] or [H:MM:SS] timestamp format at the start of each line.
+- Start a new timestamped line every 15-30 seconds or at natural speech pauses.
+- Include filler words (um, uh, you know) for accuracy.
+- Speaker identification: FIRST consult the metadata above — descriptions and chapter titles usually name the host(s) and guest(s) explicitly. Format as: [MM:SS] Name: text. Only fall back to "Host"/"Guest" if no names appear.
+
+Return ONLY the timestamped transcript, nothing else.`;
+}