// recap/server/providers/gemini.js

// Gemini provider — wraps @google/genai behind the shared Provider
// interface. Stateless helpers + a per-request factory: each call to
// createGeminiProvider({ apiKey }) returns a provider instance bound to
// that key, mirroring how `new GoogleGenAI({ apiKey })` was used before.
//
// What lives here:
//   - SDK init + per-request HTTP timeouts
//   - File API upload + processing-state polling
//   - generateContent calls for transcription + analysis
//   - Empty-response retry loop
//   - Safety settings + thinking-config selection
//   - Cost calculation (delegated to gemini-helpers.calcCost)
//   - Model lists for the two pipelines (transcription vs. analysis)
//
// What does NOT live here (stays in server/index.js as orchestration):
//   - Audio chunking decisions + transcript merging
//   - Analysis-output JSON parsing
//   - Topic-analysis prompt construction (provider-neutral, in
//     gemini-helpers.js)

import { GoogleGenAI } from "@google/genai";
import { safeText, retryGemini, formatTime } from "../util.js";
import { calcCost } from "../gemini-helpers.js";

// Models exposed to the analysis fallback chain. Order matters — the
// first entry is the preferred default; the rest are tried in order if
// it fails.
//
// The same five Gemini model IDs appear in both lists in this file (only
// the order differs). They were verified against
// ai.google.dev/gemini-api/docs/models — older IDs (gemini-3-pro-preview
// shut down 2026-03-09, gemini-2.0-flash deprecated, gemini-3.1-flash*
// never existed) are intentionally not in either list.
// NOTE(review): the `gemini-3.1-flash*` wildcard above would also match
// "gemini-3.1-flash-lite", which IS listed below — presumably only the
// non-lite ID is meant; confirm and tighten the wording.
export const GEMINI_ANALYSIS_MODELS = [
  "gemini-3.1-pro-preview",
  "gemini-2.5-pro",
  "gemini-3-flash-preview",
  "gemini-2.5-flash",
  "gemini-3.1-flash-lite",
];

// Transcription fallback order: the Flash-family models come first
// (Flash is Google's natural audio fit) and the Pro models are kept only
// as a last resort, because Pro on audio is significantly more
// expensive than Flash. As with the analysis list, the first entry is
// the one tried first.
export const GEMINI_TRANSCRIPTION_MODELS = [
  "gemini-3-flash-preview",
  "gemini-2.5-flash",
  "gemini-3.1-flash-lite",
  "gemini-2.5-pro",
  "gemini-3.1-pro-preview",
];

// Empty-response handling: when the SDK returns 200 with no text (which
// happens periodically with audio inputs), the transcription loop makes
// up to this many attempts IN TOTAL (i.e. N - 1 retries) before giving
// up. The wait between attempts grows linearly: 10s after the first
// empty response, 20s after the second.
const EMPTY_RETRIES = 3;

// The @google/genai SDK does not accept a per-call AbortSignal, so when
// the user cancels a request we need to interrupt the in-flight promise
// ourselves. Race the SDK call against a promise that rejects when the
// caller's signal aborts — the rejection bubbles up immediately, while
// the underlying HTTP request is NOT cancelled: it is left to finish or
// hit the SDK's own timeout in the background.
//
// Params:
//   promise — the already-started SDK call (any thenable or value).
//   signal  — optional AbortSignal; without it this is a no-op
//             passthrough that returns `promise` itself.
// Returns a promise that settles like `promise`, or rejects with an
// Error whose `name` is "AbortError" as soon as `signal` aborts.
function withAbort(promise, signal) {
  if (!signal) return promise;

  const abortError = () =>
    Object.assign(new Error("aborted"), { name: "AbortError" });

  if (signal.aborted) {
    // The caller has already kicked off `promise` (arguments are
    // evaluated before we run). We are about to abandon it, so give it
    // a no-op rejection handler — otherwise a later failure of the
    // abandoned call surfaces as an unhandledRejection, which is fatal
    // under Node's default settings.
    Promise.resolve(promise).catch(() => {});
    return Promise.reject(abortError());
  }

  return new Promise((resolve, reject) => {
    const onAbort = () => reject(abortError());
    signal.addEventListener("abort", onAbort, { once: true });

    // Both handlers are attached here, so a rejection that arrives
    // after an abort is still "handled" (the extra reject() is a no-op
    // on an already-settled promise).
    Promise.resolve(promise).then(
      (value) => {
        signal.removeEventListener("abort", onAbort);
        resolve(value);
      },
      (err) => {
        signal.removeEventListener("abort", onAbort);
        reject(err);
      }
    );
  });
}

// Safety settings sent with every transcription request: each harm
// category below is set to BLOCK_NONE so the model doesn't refuse to
// transcribe sensitive but legitimate spoken content. Analysis requests
// do not send these and so inherit whatever Gemini's defaults are.
const TRANSCRIPTION_SAFETY = [
  "HARM_CATEGORY_HARASSMENT",
  "HARM_CATEGORY_HATE_SPEECH",
  "HARM_CATEGORY_SEXUALLY_EXPLICIT",
  "HARM_CATEGORY_DANGEROUS_CONTENT",
].map((category) => ({ category, threshold: "BLOCK_NONE" }));

// Build a Gemini provider instance bound to one API key.
//
// Params (single options object):
//   apiKey    — required; throws if missing.
//   timeoutMs — passed to the SDK client as both httpOptions.timeout and
//               httpOptions.headersTimeout. Defaults to 900_000 (15 min).
// Returns an object with `name`, `capabilities`, the two model-list
// getters, `transcribeAudio` and `analyzeText`.
export function createGeminiProvider({ apiKey, timeoutMs = 900_000 } = {}) {
  if (!apiKey) {
    throw new Error("createGeminiProvider: apiKey is required");
  }
  const ai = new GoogleGenAI({
    apiKey,
    httpOptions: { timeout: timeoutMs, headersTimeout: timeoutMs },
  });
  // Analysis uses the same client (and therefore the same long
  // timeout) — legitimate analysis on long transcripts can genuinely
  // take 3–5+ minutes, so an aggressive timeout cuts off real work.
  // Overloaded-model handling is left to analyzeText's `retries`
  // option plus the caller's model fallback chain.
  const aiAnalyze = ai;

  const abortError = () =>
    Object.assign(new Error("aborted"), { name: "AbortError" });

  // Abort-aware delay: rejects with AbortError as soon as `signal`
  // aborts instead of sitting out the remaining wait.
  const sleep = (ms, signal) =>
    withAbort(new Promise((resolve) => setTimeout(resolve, ms)), signal);

  // Best-effort cleanup of an uploaded file. Never rejects — failure
  // here is harmless, Gemini garbage-collects on its own schedule.
  const deleteUploadedFile = async (name) => {
    try {
      await ai.files.delete({ name });
    } catch {
      // Intentionally ignored: cleanup is best-effort.
    }
  };

  return {
    name: "gemini",

    capabilities: {
      transcribe: true,
      analyze: true,
      listModels: true,
    },

    // Both getters return a fresh copy so callers can't mutate the
    // module-level lists.
    listAnalysisModels() {
      return [...GEMINI_ANALYSIS_MODELS];
    },

    listTranscriptionModels() {
      return [...GEMINI_TRANSCRIPTION_MODELS];
    },

    // Transcribe a single audio file. The caller handles chunking +
    // merging — this is the atomic unit. Returns:
    //   { text, usage, cost, finishReason, blockReason, raw }
    // `text` is the raw model output (with [MM:SS] markers); the caller
    // parses it into entries. `cost` is whatever calcCost(model, usage)
    // produces. `finishReason` / `blockReason` describe the response
    // that is actually returned ("UNKNOWN" / "none" when the SDK did not
    // report one), so callers can detect e.g. a truncated transcript.
    // Rejects with an AbortError if `signal` aborts, and with the
    // underlying error if upload, processing or generation fails.
    async transcribeAudio({
      filePath,
      mimeType,
      titleHint,
      channelHint = "",
      descriptionHint = "",
      chaptersHint = [],
      model,
      offsetSeconds = 0,
      onProgress = () => {},
      signal,
    }) {
      const upStart = Date.now();
      onProgress(
        `Uploading audio${offsetSeconds > 0 ? ` (offset ${formatTime(offsetSeconds)})` : ""} to Gemini File API...`
      );
      const uploaded = await withAbort(
        ai.files.upload({
          file: filePath,
          config: { mimeType },
        }),
        signal
      );
      const upTime = ((Date.now() - upStart) / 1000).toFixed(1);
      onProgress(`Audio uploaded in ${upTime}s`);

      let f = uploaded;
      let result;
      let finishReason = "UNKNOWN";
      let blockReason = "none";

      try {
        // Wait for the File API to finish ingesting before generation.
        const pStart = Date.now();
        while (f.state === "PROCESSING") {
          if (signal?.aborted) {
            throw abortError();
          }
          const ws = ((Date.now() - pStart) / 1000).toFixed(0);
          onProgress(`Waiting for Gemini to process audio... (${ws}s)`);
          await sleep(3000, signal);
          f = await withAbort(ai.files.get({ name: f.name }), signal);
        }
        if (f.state === "FAILED") {
          throw new Error("Gemini failed to process audio file.");
        }

        const pTime = ((Date.now() - pStart) / 1000).toFixed(1);
        onProgress(`Audio processed in ${pTime}s. Transcribing with ${model}...`);

        const prompt = buildTranscriptionPrompt({
          title: titleHint,
          channel: channelHint,
          description: descriptionHint,
          chapters: chaptersHint,
        });

        // thinkingLevel is a Gemini 3.x param — Gemini 2.5 models use
        // a different shape (`thinkingBudget`, integer) and 400 on
        // `thinkingLevel`. Pro models reject thinking config entirely
        // for the transcribe pipeline. Only send for Gemini 3.x flash
        // variants where it's a valid latency/cost knob.
        const isGemini3Flash =
          model.includes("flash") &&
          (model.startsWith("gemini-3-") || model.startsWith("gemini-3."));
        const txConfig = isGemini3Flash
          ? { thinkingConfig: { thinkingLevel: "minimal" } }
          : {};

        for (let attempt = 0; attempt < EMPTY_RETRIES; attempt++) {
          if (signal?.aborted) {
            throw abortError();
          }
          result = await retryGemini(
            () =>
              withAbort(
                ai.models.generateContent({
                  model,
                  config: {
                    ...txConfig,
                    safetySettings: TRANSCRIPTION_SAFETY,
                    // Transcripts of long audio are output-token-bound.
                    // Gemini's default is small (commonly 8192) which is
                    // enough for ~10-15 min of dense speech but truncates
                    // 30-45 min chunks mid-transcript with no warning.
                    // Observed (May 2026): a 45-min chunk transcribed by
                    // gemini-3.1-flash-lite ended at local 31:05, losing
                    // 14 minutes of speech silently; another chunk lost
                    // 43 of 45 minutes after the model output 5 segments
                    // and stopped. Setting this high gives the model room
                    // to emit the full transcript; models that don't
                    // support values this large will clamp internally to
                    // their max. 65,536 is the upper bound for Gemini 3.x
                    // flash variants per Google's docs.
                    maxOutputTokens: 65536,
                  },
                  contents: [
                    {
                      role: "user",
                      parts: [
                        { fileData: { fileUri: f.uri, mimeType } },
                        { text: prompt },
                      ],
                    },
                  ],
                }),
                signal
              ),
            {
              retries: 3,
              delayMs: 5000,
              label: `Transcription${offsetSeconds > 0 ? ` (chunk@${formatTime(offsetSeconds)})` : ""}`,
              log: (msg) => onProgress(msg),
            }
          );

          // Record these for EVERY attempt, not only empty ones: the
          // values returned to the caller must describe the response we
          // actually return (e.g. MAX_TOKENS on a truncated transcript),
          // not a stale value from an earlier empty attempt.
          finishReason = result?.candidates?.[0]?.finishReason || "UNKNOWN";
          blockReason = result?.promptFeedback?.blockReason || "none";

          if (safeText(result)) break;

          onProgress(
            `⚠ Empty response (attempt ${attempt + 1}/${EMPTY_RETRIES}) — finishReason: ${finishReason}, blockReason: ${blockReason}`
          );

          if (attempt < EMPTY_RETRIES - 1) {
            const waitSec = 10 * (attempt + 1);
            onProgress(`Waiting ${waitSec}s before retry...`);
            await sleep(waitSec * 1000, signal);
          }
        }
      } catch (err) {
        // Don't leak the uploaded file on failure/abort, but don't make
        // the caller wait for cleanup either — the helper never rejects,
        // so it is safe to leave un-awaited.
        void deleteUploadedFile(f?.name);
        throw err;
      }

      await deleteUploadedFile(f.name);

      const usage = result?.usageMetadata || {};
      const cost = calcCost(model, usage);

      return {
        text: safeText(result) || "",
        usage,
        cost,
        finishReason,
        blockReason,
        // Pass-through for callers that still want the raw SDK response
        // (e.g. existing logging code). Will be removed once nothing
        // depends on it.
        raw: result,
      };
    },

    // Generate text from a prompt (no audio). Used by the topic-analysis
    // step today, but generic enough for any text→text model call.
    // Returns: { text, usage, cost, finishReason, raw }
    // (`finishReason` is null when the SDK did not report one).
    async analyzeText({
      prompt,
      model,
      onProgress = () => {},
      // Retry budget handed to retryGemini; callers can override.
      // History: this used to default to a single attempt, on the
      // theory that analysis-step 503s ("model overloaded") are
      // capacity-shaped and the outer fallback chain in
      // server/index.js should just move on to the next model. It was
      // bumped 1 → 2 in 0.2.76 alongside the responseMimeType:json
      // change: analyze is by far the cheapest pipeline phase (~few
      // seconds per call), so a third total attempt (1 initial + 2
      // retries on caught error) is essentially free in wall time but
      // materially reduces "lost window" failures on transient 503/429
      // blips.
      retries = 2,
      signal,
    }) {
      const result = await retryGemini(
        () =>
          withAbort(
            aiAnalyze.models.generateContent({
              model,
              config: {
                // JSON mode — Gemini guarantees the response body is
                // valid JSON when this is set. Eliminates the entire
                // class of "invalid JSON in window response" failures
                // that came from the model occasionally wrapping its
                // sections array in a prose preamble, a ```json```
                // markdown fence, or truncating the closing brace.
                // The prompt already asks for JSON; this turns that
                // into a hard server-enforced constraint on the
                // model's decoder. Mirrors recap-relay 0.2.69's
                // change for the relay-mode analyze path.
                responseMimeType: "application/json",
              },
              contents: [
                {
                  role: "user",
                  parts: [{ text: prompt }],
                },
              ],
            }),
            signal
          ),
        {
          retries,
          delayMs: 5000,
          label: "Analysis",
          log: (msg) => onProgress(msg),
        }
      );

      const text = safeText(result);
      const usage = result?.usageMetadata || {};
      const cost = calcCost(model, usage);
      const finishReason = result?.candidates?.[0]?.finishReason || null;

      return {
        text: text || "",
        usage,
        cost,
        finishReason,
        raw: result,
      };
    },
  };
}

// Transcription prompt — Gemini-specific because it relies on
// timestamp-formatted output we then parse. Other providers may need a
// differently-shaped prompt, so each provider owns its own.
//
// Accepts richer context than just a title: channel name, video
// description, and YouTube chapter markers. These dramatically improve
// speaker-name extraction — most podcast descriptions list host and
// guest by name, channel names are often the host's name, and chapter
// titles sometimes label introductions ("Conversation with John Doe").
// Without this context, the model falls back to "Host"/"Guest".
//
// Params (single options object, all optional):
//   title       — quoted on a "Video title:" line.
//   channel     — emitted on a "Channel:" line.
//   description — emitted after a header line; truncated to 1500 UTF-16
//                 units plus "…" when longer.
//   chapters    — array of { start_time (seconds), title }; only the
//                 first 30 are used, each rendered as "  [M:SS] title".
// Returns the full prompt string: the metadata block (if any), a blank
// line, then the fixed transcription instructions.
function buildTranscriptionPrompt({ title, channel, description, chapters } = {}) {
  const MAX_DESCRIPTION_CHARS = 1500;
  const MAX_CHAPTERS = 30;

  let context = "";
  if (title) context += `Video title: "${title}"\n`;
  if (channel) context += `Channel: ${channel}\n`;
  if (description) {
    // Trim to keep prompt size sane on hours-long podcasts whose
    // descriptions can include full sponsor lists + show notes.
    let desc = description;
    if (description.length > MAX_DESCRIPTION_CHARS) {
      let end = MAX_DESCRIPTION_CHARS;
      if (typeof description === "string") {
        // Don't cut an emoji (or any astral character) in half: if the
        // last kept unit is a high surrogate, its partner would be
        // sliced off and leave a malformed string in the prompt.
        const lastUnit = description.charCodeAt(end - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
      }
      desc = `${description.slice(0, end)}…`;
    }
    context += `Video description (use to identify speakers by name):\n${desc}\n`;
  }
  if (Array.isArray(chapters) && chapters.length > 0) {
    const lines = chapters
      .slice(0, MAX_CHAPTERS)
      .map((c) => {
        // `?.` so a null/garbage chapter entry degrades to "[0:00] "
        // instead of throwing and killing the whole transcription.
        const start = typeof c?.start_time === "number" ? c.start_time : 0;
        const mm = Math.floor(start / 60);
        const ss = Math.floor(start % 60).toString().padStart(2, "0");
        return `  [${mm}:${ss}] ${c?.title || ""}`;
      })
      .join("\n");
    context += `Chapter markers (titles often name speakers or topics):\n${lines}\n`;
  }
  if (context) context += "\n";

  return `${context}Transcribe this audio completely and verbatim. Include timestamps at regular intervals (every 15-30 seconds or at natural pauses).

Format each line as:
[MM:SS] The spoken text here...

Rules:
- Transcribe EVERY word spoken, do not skip or summarize anything.
- Use [MM:SS] or [H:MM:SS] timestamp format at the start of each line.
- Start a new timestamped line every 15-30 seconds or at natural speech pauses or speaker changes.
- Include filler words (um, uh, you know) for accuracy.
- Speaker identification: FIRST consult the metadata above — descriptions and chapter titles usually name the host(s) and guest(s) explicitly, and the channel name is often the host's name. Match those names to the voices in the audio (introductions, "I'm Dax", "this is Will", first-person references) and use them as speaker labels. Format as: [MM:SS] Name: text. Only fall back to "Host"/"Guest" if no names appear in the metadata AND nobody is introduced by name in the audio.

Return ONLY the timestamped transcript, nothing else.`;
}