// recap/server/gemini-helpers.js

// Gemini-specific helpers: pricing table, cost calculation, prompt
// builder. Pure module — no state, no I/O. When we add other providers,
// each provider gets its own equivalent of this file.

import { formatTime } from "./util.js";

// ── Pricing (per 1M tokens) ─────────────────────────────────────────────────
// Only the models we actually use as analysis fallbacks. Keep flat — the
// numbers are operational data, not configuration. Update when Google
// changes published rates.
export const PRICING = {
  // Supported Gemini models, in USD per 1M tokens. Each row carries the
  // prompt ("input"), response ("output") and reasoning ("thinking") rate.
  // Verified against Google's official docs on 2026-05-12; retired or
  // never-existed model IDs are intentionally left out.
  "gemini-3.1-pro-preview": { input: 2.0, output: 12.0, thinking: 12.0 },
  "gemini-2.5-pro": { input: 1.25, output: 10.0, thinking: 10.0 },
  "gemini-3-flash-preview": { input: 0.5, output: 3.0, thinking: 3.0 },
  "gemini-2.5-flash": { input: 0.15, output: 0.6, thinking: 0.6 },
  "gemini-3.1-flash-lite": { input: 0.1, output: 0.4, thinking: 0.4 },
  // Catch-all row for unknown / future model IDs, so a rough estimate is
  // still available when a model has no row of its own.
  default: { input: 1.0, output: 5.0, thinking: 5.0 },
};

// ── Cost calculation ────────────────────────────────────────────────────────
// Takes a Gemini SDK `usage` object (response.usageMetadata) and produces
// a structured cost record. Display strings are formatted at extraction
// time so callers don't reformat. Unknown models are priced with the
// "default" rates rather than reported as zero cost.
/**
 * Builds a structured cost record from a Gemini usage object.
 *
 * @param {string} modelName - Model ID used to look up rates in PRICING.
 *   IDs without their own row are priced with the "default" row.
 * @param {object} [usage] - Gemini SDK usage object
 *   (response.usageMetadata). Reads promptTokenCount,
 *   candidatesTokenCount, thoughtsTokenCount and totalTokenCount; a
 *   missing object or missing fields count as zero tokens.
 * @returns {{
 *   inputTokens: number, outputTokens: number, thinkingTokens: number,
 *   totalTokens: number, inputCost: string, outputCost: string,
 *   thinkingCost: string, totalCost: string, totalCostDisplay: string
 * }} Token counts as numbers; costs as fixed 6-decimal dollar strings;
 *   totalCostDisplay as a ready-to-show label.
 */
export function calcCost(modelName, usage) {
  // Own-property lookup: a model name such as "constructor" must not
  // resolve to an inherited Object.prototype member (which would yield
  // NaN costs) instead of falling back to the default rates.
  const hasOwnRates = Object.prototype.hasOwnProperty.call(PRICING, modelName);
  const rates = (hasOwnRates && PRICING[modelName]) || PRICING["default"];

  // A response can arrive without usage metadata; treat that as zero
  // tokens instead of throwing on property access.
  const counts = usage ?? {};
  const inputTokens = counts.promptTokenCount || 0;
  const outputTokens = counts.candidatesTokenCount || 0;
  const thinkingTokens = counts.thoughtsTokenCount || 0;

  const inputCost = (inputTokens / 1_000_000) * rates.input;
  const outputCost = (outputTokens / 1_000_000) * rates.output;
  const thinkingCost = (thinkingTokens / 1_000_000) * rates.thinking;
  const totalCost = inputCost + outputCost + thinkingCost;

  // Sub-cent totals are shown in cents. The label carries only the cent
  // sign: pairing "$" with "¢" (e.g. "$0.100¢") made the unit ambiguous.
  const totalCostDisplay = totalCost < 0.01
    ? `${(totalCost * 100).toFixed(3)}¢`
    : `$${totalCost.toFixed(4)}`;

  return {
    inputTokens,
    outputTokens,
    thinkingTokens,
    // Prefer the SDK-reported total; fall back to the sum of the parts.
    totalTokens: counts.totalTokenCount || (inputTokens + outputTokens + thinkingTokens),
    inputCost: inputCost.toFixed(6),
    outputCost: outputCost.toFixed(6),
    thinkingCost: thinkingCost.toFixed(6),
    totalCost: totalCost.toFixed(6),
    totalCostDisplay,
  };
}

// ── Section-count target by VIDEO duration ─────────────────────────────────
// Mirrors recap-relay's computePerWindowTarget() (server/chunked-analyze.js).
// Operator-tunable on the relay; baked into code defaults here on the
// Recap-app direct path. The defaults match the relay's defaults so
// segmentation density is consistent across both pipelines.
//
// Buckets are TOTAL video duration in minutes:
//   <30 → 6 sections / 30-60 → 8 / 60-90 → 9 / 90-120 → 10
//   120-150 → 11 / 150-180 → 12 / >=180 → 12
// Per-window target = total_target × window_sec / total_audio_sec
// (clamped to ≥1 for single-shot runs).
/**
 * Picks the whole-video section-count target from the total audio length.
 *
 * @param {number} totalAudioSec - Full audio duration in seconds; a falsy
 *   value is treated as 0.
 * @returns {number} Target number of sections for the entire video.
 */
function pickTotalSectionsTarget(totalAudioSec) {
  const minutes = (totalAudioSec || 0) / 60;
  // Rows are [exclusive upper bound in minutes, target]; first match wins.
  const buckets = [
    [30, 6],
    [60, 8],
    [90, 9],
    [120, 10],
    [150, 11],
    [180, 12],
  ];
  for (const [upperBound, target] of buckets) {
    if (minutes < upperBound) return target;
  }
  // 180 minutes and beyond share the last bucket's target.
  return 12;
}
/**
 * Turns an average per-window section count into the phrase used in the
 * prompt's "aim for …" clause.
 *
 * @param {number} avg - Average number of sections wanted for one window.
 * @returns {string} "1 section" for averages up to 1.2, "around N sections"
 *   for whole numbers, otherwise a "lo–hi sections" range.
 */
function formatTargetSectionsLabel(avg) {
  const isSingleSection = avg <= 1.2;
  if (isSingleSection) {
    return "1 section";
  }
  // Bracket the average with its neighbouring whole numbers, never below 1.
  const lowerBound = Math.max(1, Math.floor(avg));
  const upperBound = Math.max(lowerBound, Math.ceil(avg));
  return lowerBound === upperBound
    ? `around ${lowerBound} sections`
    : `${lowerBound}–${upperBound} sections`;
}

// ── Topic-analysis prompt builder ───────────────────────────────────────────
// Takes the parsed transcript entries for a WINDOW and builds the
// JSON-output prompt fed to the analysis model. Indices in the response
// are positional into the same window-entries array — the caller relies
// on that contract.
//
// `opts.totalAudioSec` is the FULL audio duration (not just this window),
// used to scale the section-count target via the per-video-duration table
// above. When omitted, falls back to the time span covered by `entries`
// itself (legacy callers / unit tests / single-shot path).
/**
 * Builds the JSON-output topic-analysis prompt for one transcript window.
 *
 * @param {Array<{offset: number, text: string}>} entries - Transcript
 *   entries of the window, in order. Each becomes one "[index] (time) text"
 *   line; the indices are positions in this array. Offsets are treated as
 *   seconds when computing the window length.
 * @param {{totalAudioSec?: number}} [opts] - `totalAudioSec` is the full
 *   audio duration; when falsy the window's own span (or 60) is used.
 * @returns {string} The complete prompt text.
 */
export function buildAnalysisPrompt(entries, opts = {}) {
  // One numbered, timestamped line per entry.
  const renderLine = (entry, position) =>
    `[${position}] (${formatTime(entry.offset)}) ${entry.text}`;
  const transcriptLines = entries.map(renderLine);
  const transcript = transcriptLines.join("\n");

  // Span covered by this window: last offset minus first offset.
  let spanSec = 0;
  if (entries.length > 1) {
    const lastOffset = entries[entries.length - 1].offset || 0;
    const firstOffset = entries[0].offset || 0;
    spanSec = lastOffset - firstOffset;
  }
  const spanMinutes = Math.max(1, Math.round(spanSec / 60));
  const lastIndex = Math.max(0, entries.length - 1);

  // Without a caller-supplied total, treat the window as the whole audio
  // (single-shot run); 60 is the floor when the span is unknown.
  const { totalAudioSec: suppliedTotalSec } = opts;
  const fullAudioSec = suppliedTotalSec || spanSec || 60;
  const wholeVideoTarget = pickTotalSectionsTarget(fullAudioSec);

  // Share the whole-video target evenly across the estimated window count.
  const effectiveWindowSec = Math.max(60, spanSec || 60);
  const windowCount = Math.max(1, fullAudioSec / effectiveWindowSec);
  const sectionsLabel = formatTargetSectionsLabel(wholeVideoTarget / windowCount);

  return `You are analyzing a ~${spanMinutes}-minute section of a longer transcript. Your job is to identify natural topic boundaries and group the transcript into discussion-based sections — aim for ${sectionsLabel}.

TRANSCRIPT (each line is numbered with a timestamp):
${transcript}

INSTRUCTIONS:
1. Read the entire transcript carefully.
2. Identify where the discussion naturally shifts from one topic to another.
3. Group consecutive transcript segments by topic. Some sections may be short (a quick aside) and some may be long (an extended deep-dive). Let the content dictate the length.
4. For each section, write:
   - A short, specific topic title (3-8 words)
   - A 1-3 sentence summary of what's discussed
   - The start and end segment indices (inclusive), counted as the bracketed [N] number at the start of each transcript line above.

IMPORTANT:
- Sections must be chronological and non-overlapping.
- Every segment index from 0 to ${lastIndex} must belong to exactly one section.
- startIndex of section N+1 must equal endIndex of section N plus 1.
- Create as many or as few sections as the content naturally requires — but lean toward broad, substantive topics rather than minute-by-minute breakdowns. A natural topic that spans several minutes of dialogue should be one section, not several.
- Titles should be descriptive and specific, not generic like "Introduction" unless it truly is one.

Respond with ONLY valid JSON in this exact format, no other text:
{
  "sections": [
    {
      "title": "Brief Topic Title",
      "summary": "1-3 sentence summary of this discussion section.",
      "startIndex": 0,
      "endIndex": 15
    }
  ]
}`;
}