recap-relay/server/routes/summarize-url.js

// POST /relay/summarize-url — unified transcribe + analyze pipeline.
//
// Why this exists: the older /relay/transcribe-url + /relay/analyze
// pair required the Recap client to do its OWN analyze windowing
// (per-window POST to /relay/analyze, then client-side stitch).
// That round-trips the full transcript back to the client only for
// the client to slice it into 12 windows and ship those slices back
// to the relay, one per HTTP request. Bandwidth + latency tax on long
// content, and the chunking knobs in the operator's Settings tab
// only affected benchmarks — production traffic ignored them because
// Recap had its own hardcoded constants.
//
// This route does both steps server-side in one async job:
//   1. Download audio (yt-dlp / direct HTTP) — same code as transcribe-url
//   2. Transcribe with chunking (Gemini or Hardware) — same backends
//   3. Run chunked-analyze on the transcript using the operator's
//      Settings-tab knobs (window body / overlap / concurrency)
//   4. Per-window section results stream to the client via SSE
//   5. Final transcript + stitched analysis returned in the
//      complete job result
//
// Credit policy: charges ONE credit on full success (same as
// transcribe-url alone — Recap's billing model treats one summarize
// as one credit, regardless of which steps it touches). If transcribe
// fails before analyze starts, no credit is charged. If transcribe
// succeeds but ALL analyze windows fail, the credit IS charged (the
// expensive part — transcribe — completed, and partial-window
// failures are tolerated upstream of this).
//
// Endpoints exposed:
//   POST /relay/summarize-url                 → kick off, returns job_id
//   GET  /relay/summarize-url/:jobId/events   → SSE stream of progress + window events
//   GET  /relay/jobs/:id (existing)           → poll-based fallback

import express from "express";
import fs from "fs/promises";
import os from "os";
import path from "path";
import { getConfigSnapshot } from "../config.js";
import { createGeminiBackend } from "../backends/gemini.js";
import { createHardwareBackend } from "../backends/hardware.js";
import { resolveHardwareConfig } from "../hardware-config.js";
import { recordCall } from "../audit-log.js";
import { calcGeminiCost } from "../pricing.js";
import { getAudioDurationSeconds } from "../audio-meta.js";
import {
  createJob,
  getJob,
  markRunning,
  setProgress,
  markComplete,
  markFailed,
  appendEvent,
  subscribeToJob,
} from "../jobs.js";
import { resolveIdentity, identityTier } from "../identity.js";
import {
  getOrCreateRow,
  planBackend,
  commitCredit,
  licenseFingerprint,
} from "../credits.js";
import { lookupJob, markJobCharged, refundJob } from "../job-credits.js";
import { getTierQuotas } from "../config.js";
import { envelope, errorEnvelope } from "./envelope.js";
import {
  runChunkedAnalysis,
  runPipelinedAnalysis,
  parseBracketedTranscript,
  firstEntryAtOrAfter,
  lastEntryBefore,
  canonicalIndexForOffset,
  stitchAnalysisResults,
} from "../chunked-analyze.js";
import { createChunkBuffer } from "../chunk-buffer.js";
import {
  runNameInference,
  runSummaryPolish,
} from "../post-cluster-polish.js";
import { acquireHardwareSlot } from "../hardware-queue.js";
import { saveJobOutput } from "../output-store.js";
import {
  looksLikeYouTube,
  downloadDirect,
  downloadYouTube,
} from "./transcribe-url.js";
import { reportHealthEvent } from "../spark-control-events.js";

export function summarizeUrlRouter() {
  const router = express.Router();

  // POST /relay/summarize-url — kicks off the combined pipeline.
  // Same request shape as /relay/transcribe-url (media_url + optional
  // hints). Same auth (X-Recap-Install-Id header + license proof).
  router.post("/summarize-url", express.json({ limit: "1mb" }), async (req, res) => {
    const summaryJobId = req.header("X-Recap-Job-Id") || null;

    let identity;
    try {
      identity = await resolveIdentity(req);
    } catch (err) {
      const e = await errorEnvelope({
        error: err?.message || "auth_error",
        statusHint: err?.status || 401,
      });
      return res.status(e.statusHint || 401).json(e.body);
    }
    if (identity.kind === "license" && !identity.installId) {
      const e = await errorEnvelope({
        error: "missing X-Recap-Install-Id header",
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }
    const { creditKey, installId, license } = identity;
    // `title` is `let` rather than `const` because the worker may
    // backfill it from yt-dlp metadata after the download completes
    // (when the client didn't pre-fetch the title — typical for the
    // Recap-app relay-mode branch which submits the URL alone).
    let title;
    // channel/description/chapters are declared with `let` (not const)
    // because the worker's download step may backfill any of them from
    // the downloaded audio's metadata when the client sent empty
    // values — see the "Same fallback pattern for channel /
    // description / chapters" block after the download completes.
    let channel, description, chapters;
    const {
      media_url: mediaUrl,
      type,
      mime_type: bodyMime,
      title: bodyTitle,
      channel: bodyChannel,
      description: bodyDescription,
      chapters: bodyChapters,
    } = req.body || {};
    channel = bodyChannel;
    description = bodyDescription;
    chapters = bodyChapters;
    // Treat empty strings and the literal sentinel "Untitled" as
    // "no title supplied" so the yt-dlp metadata fallback below
    // (`if (!title && audio.title) title = audio.title`) fires.
    // Older Recap clients (< 0.2.71) pass "Untitled" verbatim when
    // the operator hasn't pre-fetched metadata — without this
    // normalization the relay would echo "Untitled" back as if it
    // were a real title, and the resulting library entry would
    // stay "Untitled" forever despite yt-dlp having the real one.
    const trimmedBodyTitle = typeof bodyTitle === "string" ? bodyTitle.trim() : bodyTitle;
    title = (trimmedBodyTitle === "" || trimmedBodyTitle === "Untitled")
      ? null
      : trimmedBodyTitle;
    if (!mediaUrl || typeof mediaUrl !== "string") {
      const e = await errorEnvelope({
        error: "missing or non-string body.media_url",
        creditKey,
        installId,
        statusHint: 400,
      });
      return res.status(400).json(e.body);
    }

    const row = await getOrCreateRow({ creditKey, installId, license });
    const tier = identityTier(identity, row);
    row.tier_snapshot = tier;
    const licenseFp = identity.kind === "cloud" ? null : licenseFingerprint(license);
    const auditInstall = installId || identity.userId || null;

    const reusedSummaryJob = !!lookupJob({ creditKey, installId, license, jobId: summaryJobId });
    const cfgPlan = await getConfigSnapshot();
    const hw = await resolveHardwareConfig(cfgPlan);
    // Operator-only diagnostic. We do NOT 503 the request here on
    // blocked_reason — that pre-empts the backend-preference choice
    // and would return a hardware-down error even when Gemini was
    // selected as the routing preference (it'd also leak the
    // operator-internal Spark Control / parakeet wording to the
    // client). Instead: log the detail for the operator's eyes,
    // let `hasHardware = !!hw.transcribe.url` go to false, and let
    // planBackend route to Gemini under the normal preference
    // rules. If the operator chose hardware_only mode, planBackend
    // refuses with the generic `hardware_only_not_configured`
    // reason — which is client-safe.
    if (hw.transcribe.blocked_reason) {
      console.warn(
        `[summarize-url] hardware transcribe currently blocked (planBackend will route to Gemini if available): ${hw.transcribe.blocked_reason}`,
      );
    }
    if (hw.analyze.blocked_reason) {
      console.warn(
        `[summarize-url] hardware analyze currently blocked (planBackend will route to Gemini if available): ${hw.analyze.blocked_reason}`,
      );
    }
    const hasHardware = !!hw.transcribe.url;
    const quota = await getTierQuotas();
    const preference =
      cfgPlan.relay_transcribe_backend_preference || "gemini_first";
    const plan = planBackend(row, quota, { hasHardware, preference });
    if (!plan.allowed) {
      await recordCall({
        install_id: auditInstall,
        license_fingerprint: licenseFp,
        tier,
        pipeline: "transcribe",
        backend: null,
        model: null,
        status: "refused",
        credit_charged: 0,
        duration_ms: 0,
        cost_usd: 0,
        job_id: summaryJobId,
        media_url: mediaUrl || null,
        title: title || null,
        error: plan.reason,
      });
      const e = await errorEnvelope({
        error: plan.reason,
        installId,
        license,
        tier,
        statusHint: 402,
      });
      return res.status(402).json(e.body);
    }
    const chosenBackend = plan.backend;

    // The analyze backend follows the operator's analyze-side
    // preference, NOT the transcribe-side preference. They're set
    // independently in StartOS so the operator can route, e.g.,
    // transcribe-to-Gemini + analyze-to-hardware on tight Gemini
    // budgets. planBackend evaluates the same quota/preference rules
    // for analyze and returns a separate decision.
    const analyzePreference =
      cfgPlan.relay_analyze_backend_preference || "gemini_first";
    const hasAnalyzeHardware = !!hw.analyze.url;
    const analyzePlan = planBackend(row, quota, {
      hasHardware: hasAnalyzeHardware,
      preference: analyzePreference,
    });
    if (!analyzePlan.allowed) {
      const e = await errorEnvelope({
        error: analyzePlan.reason,
        installId,
        license,
        tier,
        statusHint: 402,
      });
      return res.status(402).json(e.body);
    }
    const chosenAnalyzeBackend = analyzePlan.backend;

    const job = createJob({
      kind: "summarize-url",
      installId: auditInstall,
      metadata: {
        owner: creditKey, // authorizes the SSE /events stream (per-identity)
        media_url: mediaUrl,
        transcribe_backend: chosenBackend,
        analyze_backend: chosenAnalyzeBackend,
        summary_job_id: summaryJobId,
      },
    });

    // Background worker — same try/catch discipline as transcribe-url.
    // All progress + per-window events go through appendEvent(job.id, ...)
    // so any SSE subscriber gets them in real time.
    (async () => {
      const workerT0 = Date.now();
      markRunning(job.id);
      appendEvent(job.id, "progress", { message: "downloading media…" });
      setProgress(job.id, "downloading media…");
      console.log(
        `[summarize-url ${job.id.slice(0, 8)}] downloading media from ${(mediaUrl || "(no url)").slice(0, 120)}`
      );

      const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "relay-sum-"));
      const isYT = type === "youtube" || (!type && looksLikeYouTube(mediaUrl));
      let audio;
      let downloadMs = 0;
      try {
        const dlStart = Date.now();
        audio = isYT
          ? await downloadYouTube(mediaUrl, tmpDir)
          : await downloadDirect(mediaUrl, tmpDir);
        downloadMs = Date.now() - dlStart;
        audio.seconds = await getAudioDurationSeconds(audio.filePath);
        console.log(
          `[summarize-url ${job.id.slice(0, 8)}] download done in ${(downloadMs / 1000).toFixed(1)}s — ${Math.round((audio.seconds || 0) / 60)} min audio`
        );
        // Fall back to the yt-dlp-extracted title when the client
        // didn't pass one. Recap-app's relay-mode branch doesn't
        // pre-fetch YouTube metadata, so without this the Jobs table
        // shows "Untitled" for every Recap submission.
        if (!title && audio.title) {
          title = audio.title;
        }
        // Same fallback pattern for channel / description / chapters.
        // These power speaker-identification in the transcribe prompt
        // (the model uses them to assign names like "Brandon Karpeles:"
        // and "Matt Hill:" to dialogue turns). Recap-app's direct-to-
        // Gemini path always fetches these via fetchYouTubeMetadata
        // before transcribing; the relay path either gets them from
        // the Recap client (if Recap fetched them and passed them
        // through) OR — when the client sent empty fields — falls back
        // here to whatever yt-dlp extracted during download. Defense-in-
        // depth: works for ANY client that hits /relay/summarize-url,
        // not just Recap, and survives a client upgrade gap where an
        // older Recap is still sending empty channelHint/descriptionHint.
        if (!channel && audio.channel) {
          channel = audio.channel;
        }
        if (!description && audio.description) {
          description = audio.description;
        }
        if ((!Array.isArray(chapters) || chapters.length === 0) && Array.isArray(audio.chapters) && audio.chapters.length > 0) {
          chapters = audio.chapters;
        }
        appendEvent(job.id, "progress", {
          message: `transcribing ${Math.round((audio.seconds || 0) / 60)} min audio…`,
        });
        setProgress(job.id, `transcribing ${Math.round((audio.seconds || 0) / 60)} min audio…`);
      } catch (err) {
        try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
        const msg = (err?.message || String(err)).slice(0, 300);
        await recordCall({
          install_id: auditInstall,
          license_fingerprint: licenseFp,
          tier,
          pipeline: "transcribe",
          backend: chosenBackend,
          model: null,
          status: "error",
          credit_charged: 0,
          duration_ms: Date.now() - workerT0,
          download_ms: Date.now() - workerT0,
          audio_seconds: null,
          cost_usd: 0,
          job_id: summaryJobId,
          media_url: mediaUrl || null,
          title: title || null,
          error: "download_failed: " + msg,
        });
        markFailed(job.id, "download_failed: " + msg);
        return;
      }

      // ── Hardware FIFO queue acquire ─────────────────────────────
      // Hardware jobs serialize through a single-slot semaphore so
      // the operator's GPU box doesn't get N jobs hammering it at
      // once. Gemini-backend jobs bypass — Google's API handles
      // concurrency at scale. The acquire awaits if there's an
      // active hardware job ahead; we emit a "queued" SSE event so
      // Recap can show "Queued — N jobs ahead" while we wait. See
      // server/hardware-queue.js.
      const usesHardwareForAnything =
        chosenBackend === "hardware" || chosenAnalyzeBackend === "hardware";
      let releaseHardwareSlot = null;
      if (usesHardwareForAnything) {
        releaseHardwareSlot = await acquireHardwareSlot({
          jobId: job.id,
          onWait: ({ position, activeJobId }) => {
            // Tell the SSE client we're waiting so the UI can show
            // a queue position instead of a stalled "Transcribing…"
            // state. Best-effort — appendEvent is fire-and-forget.
            try {
              appendEvent(job.id, "queued", {
                position,
                activeJobId: activeJobId || null,
              });
              setProgress(
                job.id,
                `queued — ${position} job(s) ahead on operator hardware`
              );
            } catch {}
            console.log(
              `[summarize-url ${job.id.slice(0, 8)}] queued for hardware — position ${position}`
            );
          },
        });
      }

      // Wrap from here to end-of-IIFE in try/finally so the hardware
      // slot ALWAYS releases — on success, on early-return failure,
      // and on uncaught exceptions. Finally executes before the
      // throw propagates so the next queued job unblocks promptly.
      try {

      // ── Transcribe ──────────────────────────────────────────────
      // Pipelined-analyze design (v0.2.89): when the hardware backend
      // is transcribing, fire each analyze window's call AS SOON AS
      // its required transcribe chunks have completed — in parallel
      // with later chunks still being transcribed. The total wall
      // time savings are modest (~10s for a 94-min video), but the
      // user-perceived "first topic visible" time drops from
      // ~T=160s to ~T=80s because window 1's sections render while
      // chunks 5-21 are still in-flight. See server/chunk-buffer.js
      // and runPipelinedAnalysis in chunked-analyze.js for the
      // mechanics. Gemini transcribe doesn't expose per-chunk
      // callbacks (single big call) so its path stays sequential.
      const cfg = await getConfigSnapshot();
      let txResult;
      const txPhaseStart = Date.now();

      // Build the analyze backend UP FRONT — pipelined mode needs it
      // before transcribe starts. Sequential mode doesn't strictly
      // need it this early, but building it here works for both.
      const anStart = Date.now();
      let analyzeBackend = null;
      try {
        if (chosenAnalyzeBackend === "gemini") {
          analyzeBackend = createGeminiBackend({
            apiKey: cfg.relay_gemini_api_key,
            transcriptionModel: cfg.relay_gemini_transcription_model,
            analysisModel: cfg.relay_gemini_analysis_model,
            txChunkSeconds: (cfg.relay_gemini_tx_chunk_minutes || 30) * 60,
            txConcurrency: cfg.relay_gemini_tx_concurrency || 12,
            txMaxOutputTokens: cfg.relay_gemini_tx_max_output_tokens || 65536,
            anMaxOutputTokens: cfg.relay_gemini_an_max_output_tokens || 8192,
          });
        } else {
          if (!hw.analyze.url) {
            throw new Error("hardware analyze URL not configured");
          }
          analyzeBackend = createHardwareBackend({
            parakeetBaseURL: hw.transcribe.url || "",
            gemmaBaseURL: hw.analyze.url,
            sparkControlBaseURL: hw.sparkBase || "",
            parakeetModel: hw.transcribe.model || "",
            gemmaModel: hw.analyze.model || "",
            txChunkSeconds: (cfg.relay_hardware_tx_chunk_minutes || 5) * 60,
            txChunkOverlapSeconds:
              cfg.relay_hardware_tx_chunk_overlap_seconds ?? 30,
            diarizationEnabled: !!cfg.relay_hardware_diarization_enabled,
            clusterThresholdPct:
              cfg.relay_hardware_voice_clustering_threshold ?? 70,
            anchorMinSpeakingSec:
              cfg.relay_hardware_anchor_min_speaking_sec ?? 30,
            smallClusterMaxSpeakingSec:
              cfg.relay_hardware_small_cluster_max_speaking_sec ?? 15,
            uncertainMarginPct:
              cfg.relay_hardware_uncertain_margin_pct ?? 10,
            txConcurrency: cfg.relay_hardware_tx_concurrency || 4,
            anMaxTokens: cfg.relay_hardware_an_max_tokens || 16000,
          });
        }
      } catch (err) {
        const msg = (err?.message || String(err)).slice(0, 400);
        await recordCall({
          install_id: auditInstall,
          license_fingerprint: licenseFp,
          tier,
          pipeline: "analyze",
          backend: chosenAnalyzeBackend,
          model: null,
          status: "error",
          duration_ms: 0,
          audio_seconds: 0,
          cost_usd: 0,
          job_id: summaryJobId,
          media_url: mediaUrl || null,
          title: title || null,
          error: "analyze_backend_init_failed: " + msg,
          window_idx: 0,
          window_count: 1,
        });
        markFailed(job.id, "analyze_init_failed: " + msg);
        try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
        return;
      }

      // Analyze tunables (used by both pipelined + sequential paths).
      const bodyMin = chosenAnalyzeBackend === "gemini"
        ? (cfg.relay_gemini_analyze_window_minutes || 18)
        : (cfg.relay_hardware_analyze_window_minutes || 18);
      const overlapMin = chosenAnalyzeBackend === "gemini"
        ? (cfg.relay_gemini_analyze_overlap_minutes || 2)
        : (cfg.relay_hardware_analyze_overlap_minutes || 2);
      const anConcurrency = chosenAnalyzeBackend === "gemini"
        ? (cfg.relay_gemini_analyze_concurrency || 12)
        : (cfg.relay_hardware_analyze_concurrency || 8);
      const cutoffMin = cfg.relay_analyze_cutoff_minutes || 25;
      const targetTotalsByBucket = {
        under_30: cfg.relay_analyze_total_sections_under_30,
        "30_60": cfg.relay_analyze_total_sections_30_60,
        "60_90": cfg.relay_analyze_total_sections_60_90,
        "90_120": cfg.relay_analyze_total_sections_90_120,
        "120_150": cfg.relay_analyze_total_sections_120_150,
        "150_180": cfg.relay_analyze_total_sections_150_180,
        over_180: cfg.relay_analyze_total_sections_over_180,
      };
      const analyzePromptOverride =
        cfg.relay_analyze_prompt || cfg.relay_analyze_prompt_default || "";
      // Cost breakdown for a single analyze call. Only a Gemini analyze
      // backend that reported a usage record is priced (through
      // calcGeminiCost); every other combination yields an all-zero
      // breakdown, freshly allocated per call.
      const computeCostDetails = (model, usage) => {
        if (chosenAnalyzeBackend !== "gemini" || !usage) {
          return { input_tokens: 0, output_tokens: 0, thinking_tokens: 0, cost_usd: 0 };
        }
        return calcGeminiCost(model, usage);
      };

      // Pipelined mode requires hardware transcribe (per-chunk
      // onChunkComplete callback) + a known audio duration. When both
      // hold, we start the analyze workers BEFORE the transcribe
      // promise so they're already waiting on chunkBuffer when the
      // first chunks land.
      const usePipelining =
        chosenBackend === "hardware" && (audio.seconds || 0) > 0;
      const chunkBuffer = usePipelining ? createChunkBuffer() : null;
      // Callback handed to the analysis runner as `onWindowComplete`.
      // For each finished analyze window it, in this order:
      //   1. appends a "window_complete" event to the job,
      //   2. updates the job's progress line,
      //   3. logs a one-line summary for the operator.
      const onWindowCompleteHandler = (cb) => {
        const {
          windowIdx,
          totalWindows,
          ownedSections,
          windowEntries,
          windowBodySeconds,
          model,
          durationMs,
        } = cb;

        // `windowEntries` (pipelined mode) carries this window's local
        // entries so the SDK / recap-app handler can render partial
        // sections without waiting for transcribe_complete to populate
        // streamedRelayEntries. When it is absent (sequential mode) the
        // key is still sent, as undefined, and the SDK falls back to
        // streamedRelayEntries.
        const eventPayload = {
          windowIdx,
          totalWindows,
          ownedSections,
          windowEntries: windowEntries || undefined,
          windowBodySeconds,
          model,
          durationMs,
        };
        appendEvent(job.id, "window_complete", eventPayload);
        setProgress(job.id, `analyze window ${windowIdx + 1}/${totalWindows} done`);

        // Anything other than an array counts as zero sections.
        let sectionCount = 0;
        if (Array.isArray(ownedSections)) {
          sectionCount = ownedSections.length;
        }
        console.log(
          `[summarize-url ${job.id.slice(0, 8)}] analyze window ${windowIdx + 1}/${totalWindows} done in ${((durationMs || 0) / 1000).toFixed(1)}s (${sectionCount} sections, model=${model || "?"})`
        );
      };

      // Kick off the pipelined-analyze promise BEFORE we await
      // transcribe. Workers block on chunkBuffer.waitForTime() until
      // their windows' required chunks land. If transcribe throws,
      // we call chunkBuffer.fail() in the catch to release the
      // pending waiters.
      let pipelinedAnalyzePromise = null;
      if (usePipelining) {
        pipelinedAnalyzePromise = runPipelinedAnalysis({
          audioDurationSec: audio.seconds || 0,
          waitForTime: (sec) => chunkBuffer.waitForTime(sec),
          getReadySegments: (s, e) => chunkBuffer.getSegments(s, e),
          bodySeconds: bodyMin * 60,
          overlapSeconds: overlapMin * 60,
          cutoffSeconds: cutoffMin * 60,
          concurrency: anConcurrency,
          backend: analyzeBackend,
          pipelineBackend: chosenAnalyzeBackend,
          jobId: summaryJobId,
          batchId: null,
          mediaUrl,
          title: title || null,
          installId,
          licenseFingerprint: licenseFp,
          source: "summarize-url",
          computeCostDetails,
          analyzePromptOverride,
          targetTotalsByBucket,
          onWindowComplete: onWindowCompleteHandler,
        }).catch((err) => ({ __error: err }));
      }

      try {
        const audioBuf = await fs.readFile(audio.filePath);
        const mimeType = bodyMime || audio.mimeType;
        if (chosenBackend === "gemini") {
          const backend = createGeminiBackend({
            apiKey: cfg.relay_gemini_api_key,
            transcriptionModel: cfg.relay_gemini_transcription_model,
            analysisModel: cfg.relay_gemini_analysis_model,
            txChunkSeconds: (cfg.relay_gemini_tx_chunk_minutes || 30) * 60,
            txConcurrency: cfg.relay_gemini_tx_concurrency || 12,
            // Three-layer prompt resolution: per-session override
            // wins; operator-promoted default is next; gemini.js's
            // hardcoded DEFAULT_TRANSCRIBE_PROMPT_BODY is the
            // factory fallback. Empty string here means "fall back
            // to the code-side default" inside the backend.
            transcribePromptOverride:
              cfg.relay_transcribe_prompt ||
              cfg.relay_transcribe_prompt_default ||
              "",
            txMaxOutputTokens: cfg.relay_gemini_tx_max_output_tokens || 65536,
            anMaxOutputTokens: cfg.relay_gemini_an_max_output_tokens || 8192,
          });
          txResult = await backend.transcribeAudio({
            audio: audioBuf,
            mimeType,
            title: title || "",
            channel: channel || "",
            description: description || "",
            chapters: Array.isArray(chapters) ? chapters : [],
            offsetSeconds: 0,
          });
        } else {
          const backend = createHardwareBackend({
            parakeetBaseURL: hw.transcribe.url || "",
            gemmaBaseURL: hw.analyze.url || "",
            sparkControlBaseURL: hw.sparkBase || "",
            parakeetModel: hw.transcribe.model || "",
            gemmaModel: hw.analyze.model || "",
            txChunkSeconds: (cfg.relay_hardware_tx_chunk_minutes || 5) * 60,
            txChunkOverlapSeconds:
              cfg.relay_hardware_tx_chunk_overlap_seconds ?? 30,
            diarizationEnabled: !!cfg.relay_hardware_diarization_enabled,
            clusterThresholdPct:
              cfg.relay_hardware_voice_clustering_threshold ?? 70,
            anchorMinSpeakingSec:
              cfg.relay_hardware_anchor_min_speaking_sec ?? 30,
            smallClusterMaxSpeakingSec:
              cfg.relay_hardware_small_cluster_max_speaking_sec ?? 15,
            uncertainMarginPct:
              cfg.relay_hardware_uncertain_margin_pct ?? 10,
            txConcurrency: cfg.relay_hardware_tx_concurrency || 4,
            anMaxTokens: cfg.relay_hardware_an_max_tokens || 16000,
            // Pipelined-mode hook: feed each completed chunk into
            // the shared buffer so the analyze workers can unblock.
            onChunkComplete: chunkBuffer
              ? (cd) => chunkBuffer.add(cd)
              : null,
          });
          txResult = await backend.transcribeAudio({
            audio: audioBuf,
            mimeType,
            offsetSeconds: 0,
          });
        }
      } catch (err) {
        if (chunkBuffer) chunkBuffer.fail(err);
        try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
        if (reusedSummaryJob) await refundJob({ creditKey, installId, license, jobId: summaryJobId });
        const msg = (err?.message || String(err)).slice(0, 400);
        if (chosenBackend === "hardware") {
          reportHealthEvent({
            service: "parakeet",
            ok: false,
            error: msg.slice(0, 280),
            ms: Date.now() - workerT0,
          });
        }
        await recordCall({
          install_id: auditInstall,
          license_fingerprint: licenseFp,
          tier,
          pipeline: "transcribe",
          backend: chosenBackend,
          model: chosenBackend === "gemini"
            ? cfg.relay_gemini_transcription_model
            : hw.transcribe.model || "(auto)",
          status: "error",
          credit_charged: 0,
          duration_ms: Date.now() - txPhaseStart,
          download_ms: downloadMs,
          audio_seconds: audio?.seconds || null,
          audio_bytes: audio?.bytes || null,
          cost_usd: 0,
          job_id: summaryJobId,
          media_url: mediaUrl || null,
          title: title || null,
          error: msg,
        });
        markFailed(job.id, msg);
        return;
      } finally {
        try { await fs.rm(tmpDir, { recursive: true, force: true }); } catch {}
      }

      // Audit the transcribe row (success path).
      const txCostDetails =
        chosenBackend === "gemini" && txResult.usage
          ? calcGeminiCost(txResult.model, txResult.usage)
          : { input_tokens: 0, output_tokens: 0, thinking_tokens: 0, cost_usd: 0 };
      // Detect output-token truncation. The backend returns a
      // truncated_chunks array when any chunk's last emitted
      // timestamp falls short of the chunk's expected duration —
      // a silent failure mode where the model hit its output cap
      // mid-transcript. We mark the audit row status="partial"
      // and stamp an error string so the Jobs table doesn't
      // mislead the operator into thinking the run succeeded.
      const truncatedChunks = Array.isArray(txResult?.truncated_chunks)
        ? txResult.truncated_chunks
        : [];
      const wasTruncated = truncatedChunks.length > 0;
      const truncationError = wasTruncated
        ? `transcribe: ${truncatedChunks.length} chunk(s) truncated — missing ~${truncatedChunks.reduce((s, c) => s + (c.missingSec || 0), 0)}s of speech (model: ${txResult.model || "unknown"}). Likely hit maxOutputTokens.`
        : null;
      await recordCall({
        install_id: auditInstall,
        license_fingerprint: licenseFp,
        tier,
        pipeline: "transcribe",
        backend: chosenBackend,
        model: txResult?.model || null,
        status: wasTruncated ? "partial" : "success",
        credit_charged: 0, // credit committed AFTER analyze succeeds too
        // duration_ms = transcribe phase ONLY (txPhaseStart was
        // stamped after the download completed). Previously this used
        // Date.now() - workerT0 which included the download time too,
        // so the Jobs table's "TX wall time" column appeared inflated
        // by ~50s on long YouTube fetches. download_ms below carries
        // the download phase as a separate field so the operator can
        // see both numbers on the same row.
        duration_ms: Date.now() - txPhaseStart,
        download_ms: downloadMs,
        audio_bytes: audio.bytes,
        audio_seconds: audio.seconds || null,
        job_id: summaryJobId,
        attempts: txResult?.attempts || null,
        media_url: mediaUrl || null,
        title: title || null,
        chunk_count: txResult?.chunk_count ?? null,
        chunk_durations_ms: txResult?.chunk_durations_ms || null,
        truncated_chunks: wasTruncated ? truncatedChunks : null,
        error: truncationError,
        ...txCostDetails,
      });

      // Emit a transcribe_complete event so the SSE client can show
      // the full transcript while analyze is still in flight (useful
      // for a "transcript preview" UI affordance).
      appendEvent(job.id, "transcribe_complete", {
        transcript: txResult.text || "",
        model: txResult.model || null,
        chunk_count: txResult.chunk_count ?? null,
        audio_seconds: audio.seconds || null,
        audio_bytes: audio.bytes || null,
        duration_ms: Date.now() - workerT0,
      });
      setProgress(job.id, "analyzing topics…");
      console.log(
        `[summarize-url ${job.id.slice(0, 8)}] transcribe done in ${((Date.now() - txPhaseStart) / 1000).toFixed(1)}s — starting analyze`
      );

      // ── Analyze ─────────────────────────────────────────────────
      // Pipelined mode: await the promise we kicked off BEFORE
      // transcribe; remap window-local section indices to global
      // canonical indices; stitch.
      // Sequential mode: call runChunkedAnalysis on the full
      // transcript as today.
      let analyzeResult;
      try {
        if (pipelinedAnalyzePromise) {
          const pipelinedRaw = await pipelinedAnalyzePromise;
          if (pipelinedRaw && pipelinedRaw.__error) {
            throw pipelinedRaw.__error;
          }
          // Build canonical entries from the final stitched transcript.
          // The pipelined windows were each analyzed against their own
          // segment-only view; we now map their LOCAL section indices
          // to GLOBAL canonical-entry indices (which is what the
          // stitcher + downstream stitching code expects).
          const canonicalEntries = parseBracketedTranscript(txResult.text || "");
          const syntheticResults = (pipelinedRaw.windowResults || []).map((wr) => {
            if (!wr || !wr.ok) return wr;
            const w = wr.window;
            const globalStartIdx = firstEntryAtOrAfter(canonicalEntries, w.startSec);
            const globalBodyStartIdx = firstEntryAtOrAfter(canonicalEntries, w.bodyStartSec);
            const globalEndIdx = lastEntryBefore(canonicalEntries, w.windowEndSec + 0.5);
            // Edge case: this window's time range produced no canonical
            // entries (audio gap, severe truncation). Mark as failed
            // so the stitcher skips it.
            if (
              globalStartIdx >= canonicalEntries.length ||
              globalEndIdx < globalStartIdx
            ) {
              return {
                window: { startIdx: 0, endIdx: -1, bodyStartIdx: 0 },
                ok: false,
                error: new Error("pipelined window had no canonical entries — audio gap"),
              };
            }
            // Remap each section's window-LOCAL indices to GLOBAL
            // canonical indices via time matching, then express as
            // window-local for the stitcher's offset math (which
            // adds window.startIdx to each section's startIndex).
            const remapped = [];
            for (const s of wr.sections || []) {
              const localStartEntry = wr.windowEntries?.[s.startIndex];
              const localEndEntry = wr.windowEntries?.[s.endIndex];
              if (!localStartEntry || !localEndEntry) continue;
              const globalStart = canonicalIndexForOffset(
                canonicalEntries,
                localStartEntry.offset || 0
              );
              const globalEnd = canonicalIndexForOffset(
                canonicalEntries,
                localEndEntry.offset || 0
              );
              if (globalStart < 0 || globalEnd < 0) continue;
              remapped.push({
                startIndex: globalStart - globalStartIdx,
                endIndex: globalEnd - globalStartIdx,
                title: s.title,
                summary: s.summary,
              });
            }
            return {
              window: {
                startIdx: globalStartIdx,
                endIdx: globalEndIdx,
                bodyStartIdx: globalBodyStartIdx,
              },
              ok: true,
              sections: remapped,
              model: wr.model,
            };
          });
          const stitched = stitchAnalysisResults(syntheticResults);
          analyzeResult = {
            text: JSON.stringify({ sections: stitched }),
            model: pipelinedRaw.dominantModel,
            attempts: pipelinedRaw.attempts,
            usage: null,
          };
        } else {
          analyzeResult = await runChunkedAnalysis({
            transcriptText: txResult.text || "",
            backend: analyzeBackend,
            pipelineBackend: chosenAnalyzeBackend,
            jobId: summaryJobId,
            batchId: null,
            mediaUrl,
            title: title || null,
            installId,
            licenseFingerprint: licenseFp,
            source: "summarize-url",
            computeCostDetails,
            bodySeconds: bodyMin * 60,
            overlapSeconds: overlapMin * 60,
            concurrency: anConcurrency,
            cutoffSeconds: cutoffMin * 60,
            analyzePromptOverride,
            totalAudioSec: audio.seconds || 0,
            targetTotalsByBucket,
            onWindowComplete: ({ windowIdx, totalWindows, ownedSections, windowBodySeconds, model, durationMs }) => {
              appendEvent(job.id, "window_complete", {
                windowIdx,
                totalWindows,
                ownedSections,
                windowBodySeconds,
                model,
                durationMs,
              });
              setProgress(job.id, `analyze window ${windowIdx + 1}/${totalWindows} done`);
              const sectionCount = Array.isArray(ownedSections)
                ? ownedSections.length
                : (typeof ownedSections === "number" ? ownedSections : 0);
              console.log(
                `[summarize-url ${job.id.slice(0, 8)}] analyze window ${windowIdx + 1}/${totalWindows} done in ${(durationMs / 1000).toFixed(1)}s (${sectionCount} sections, model=${model || "?"})`
              );
            },
          });
        }
      } catch (err) {
        // All analyze windows failed (pipelined OR sequential).
        // Refund credit, mark job failed.
        if (reusedSummaryJob) await refundJob({ creditKey, installId, license, jobId: summaryJobId });
        const msg = (err?.message || String(err)).slice(0, 400);
        markFailed(job.id, "analyze_failed: " + msg);
        return;
      }

      // ── Post-cluster polish (Phase 2) ───────────────────────────
      // After diarization + clustering produced speaker IDs AND
      // analyze produced topic sections, run a two-stage LLM pass
      // that (1) infers real speaker names from the labeled
      // transcript + episode metadata, and (2) rewrites section
      // summaries to attribute statements to those speakers.
      // Skipped when:
      //   - Operator disabled via Settings
      //   - Diarization didn't run (no speakers to attribute)
      //   - Fewer than 2 speakers detected (nothing to differentiate)
      //   - Analyze had no sections to polish
      //
      // Failure handling: both stages are best-effort. If either
      // fails entirely, we keep the unpolished analyze output and
      // null speaker names. The user still gets a working result.
      let speakerNames = null;
      const polishEnabled = cfg.relay_post_cluster_polish_enabled !== false;
      const detectedSpeakerCount = Object.keys(txResult?.speakers || {}).length;
      const parsedAnalysisForPolish = tryParseJson(analyzeResult.text);
      const polishableSections = Array.isArray(parsedAnalysisForPolish?.sections)
        ? parsedAnalysisForPolish.sections
        : null;
      if (
        polishEnabled &&
        detectedSpeakerCount >= 2 &&
        Array.isArray(txResult?.segments) &&
        polishableSections &&
        polishableSections.length > 0
      ) {
        const polishStart = Date.now();
        try {
          // Stage 1 — global speaker name inference. Single call,
          // returns map { Speaker_A: "Matt Hill", Speaker_B: null, ... }.
          speakerNames = await runNameInference({
            speakers: txResult.speakers,
            transcriptSegments: txResult.segments,
            channelHint: channel || "",
            titleHint: title || "",
            descriptionHint: description || "",
            // Three-layer prompt resolution — per-session override
            // wins, then operator-promoted default, then the
            // hardcoded default inside post-cluster-polish.js.
            promptOverride:
              cfg.relay_polish_name_inference_prompt ||
              cfg.relay_polish_name_inference_prompt_default ||
              "",
            backend: analyzeBackend,
            pipelineBackend: chosenAnalyzeBackend,
            jobId: summaryJobId,
            batchId: null,
            mediaUrl,
            installId,
            licenseFingerprint: licenseFp,
            source: "summarize-url",
            computeCostDetails,
          });

          // Stage 2 — per-window summary polish (parallel). Rewrites
          // section summaries to attribute statements to speakers.
          // Sections whose window's polish fails keep their
          // original summary. Titles + indices are never modified.
          const canonicalEntriesForPolish = parseBracketedTranscript(txResult.text || "");
          const { planWindowsByDuration } = await import("../chunked-analyze.js");
          const windowsForPolish = planWindowsByDuration({
            totalAudioSec: audio.seconds || 0,
            bodySeconds: bodyMin * 60,
            overlapSeconds: overlapMin * 60,
            cutoffSeconds: cutoffMin * 60,
          });
          const polishedSections = await runSummaryPolish({
            sections: polishableSections,
            canonicalEntries: canonicalEntriesForPolish,
            windows: windowsForPolish,
            transcriptSegments: txResult.segments,
            speakerNames,
            speakerStats: txResult.speakers,
            promptOverride:
              cfg.relay_polish_summary_rewrite_prompt ||
              cfg.relay_polish_summary_rewrite_prompt_default ||
              "",
            backend: analyzeBackend,
            concurrency: anConcurrency,
            pipelineBackend: chosenAnalyzeBackend,
            jobId: summaryJobId,
            batchId: null,
            mediaUrl,
            installId,
            licenseFingerprint: licenseFp,
            source: "summarize-url",
            computeCostDetails,
          });
          // Overwrite analyzeResult.text with the polished sections
          // so downstream code paths (saveJobOutput, markComplete)
          // see the polished version.
          analyzeResult = {
            ...analyzeResult,
            text: JSON.stringify({ sections: polishedSections }),
          };
          const polishMs = Date.now() - polishStart;
          console.log(
            `[summarize-url ${job.id.slice(0, 8)}] post-cluster polish done in ${(polishMs / 1000).toFixed(1)}s — ${Object.values(speakerNames || {}).filter(Boolean).length}/${detectedSpeakerCount} speakers named`
          );
        } catch (polishErr) {
          // Polish failure should never kill the request — keep the
          // unpolished output. Operator can see what went wrong in
          // the audit log; the user gets a working result either way.
          console.warn(
            `[summarize-url ${job.id.slice(0, 8)}] polish pass failed (keeping unpolished output): ${polishErr?.message || polishErr}`
          );
          speakerNames = null;
        }
      } else if (polishEnabled) {
        // Diagnostic — surface why we skipped so operator can see
        // it without grep-ing the source.
        const reason = !Array.isArray(txResult?.segments)
          ? "no transcript segments"
          : detectedSpeakerCount < 2
            ? `only ${detectedSpeakerCount} speaker(s) detected`
            : !polishableSections
              ? "no parseable sections"
              : "no sections to polish";
        console.log(
          `[summarize-url ${job.id.slice(0, 8)}] polish skipped — ${reason}`
        );
      }

      // ── Commit credit + finalize ────────────────────────────────
      // Credit policy: charged ONLY when the full pipeline completes
      // cleanly. "Cleanly" means EVERY analyze window succeeded —
      // any per-window failure voids the charge even if the stitched
      // output is still usable. Rationale: a partial result is
      // surfacing degraded quality to the user; charging full price
      // for that would erode trust. The user keeps the partial
      // output (saved to disk + returned in the result envelope),
      // the operator eats the cost of the compute they paid for on
      // the windows that did succeed.
      const anyWindowFailed = (analyzeResult?.attempts?.failed ?? 0) > 0;
      let creditCharged = 0;
      if (anyWindowFailed) {
        // Don't charge. If a previous endpoint (legacy two-call path)
        // pre-charged on the same X-Recap-Job-Id, refund it.
        if (reusedSummaryJob) await refundJob({ creditKey, installId, license, jobId: summaryJobId });
        console.log(
          `[summarize-url ${job.id.slice(0, 8)}] partial analyze (${analyzeResult.attempts.failed}/${analyzeResult.attempts.windows} windows failed) — credit NOT charged`
        );
      } else if (!reusedSummaryJob) {
        await commitCredit({ creditKey, installId, license, backend: chosenBackend, tier });
        await markJobCharged({ creditKey, installId, license, jobId: summaryJobId, backend: chosenBackend, tier });
        creditCharged = 1;
      }

      // Phase 1D: build the speaker-tagged transcript segments
      // ONCE so both the operator-output store AND the Recap-facing
      // markComplete envelope share the same payload. Each segment
      // carries `start`, `end`, `text`, and (when diarization ran)
      // `speaker` + `speaker_confidence`. Null when diarization was
      // off and txResult.segments isn't an array.
      const transcriptSegments = Array.isArray(txResult?.segments)
        ? txResult.segments.map((s) => ({
            start: s.start || 0,
            end: s.end || 0,
            text: s.text || "",
            speaker: s.speaker || null,
            speaker_confidence: s.speaker_confidence ?? null,
            // Phase 2 — set when the post-cluster suppression pass
            // reassigned this segment's source cluster to an anchor
            // (best-guess attribution). Recap chip will show "?".
            speaker_uncertain: !!s.speaker_uncertain,
          }))
        : null;

      // Save the full transcript+analysis to disk for the dashboard
      // when the operator has opted in via relay_save_user_outputs.
      if (cfg.relay_save_user_outputs) {
        await saveJobOutput(summaryJobId || job.id, {
          batch_id: null,
          source: "summarize-url",
          transcript: txResult?.text || "",
          analysis: tryParseJson(analyzeResult.text),
          analysis_raw_text: analyzeResult.text || null,
          // Persist diarization output too so it shows up in the
          // operator dashboard's per-job viewer. Both keys are null
          // when diarization was off.
          speakers: txResult?.speakers || null,
          // Phase 2 — inferred speaker names from the post-cluster
          // polish pass. Null if polish was skipped or all names
          // came back null. Recap renders these in place of
          // "Speaker A" in the legend.
          speaker_names: speakerNames || null,
          diarization: txResult?.diarization || null,
          transcript_segments: transcriptSegments,
          meta: {
            title: title || null,
            media_url: mediaUrl,
            audio_seconds: audio.seconds || null,
            audio_bytes: audio.bytes,
            captions_mode: null,
            transcribe_backend: chosenBackend,
            transcribe_model: txResult?.model || null,
            analyze_backend: chosenAnalyzeBackend,
            analyze_model: analyzeResult.model || null,
          },
        });
      }

      const wallTimeMs = Date.now() - workerT0;
      markComplete(job.id, {
        result: {
          transcript: txResult.text || "",
          analysis: tryParseJson(analyzeResult.text),
          analysis_raw_text: analyzeResult.text || "",
          // Echo back the resolved title (either operator-supplied via
          // the kickoff body or extracted by yt-dlp during download).
          // Recap-app's relay-mode branch was previously saving every
          // submission as "Untitled" because the title only existed
          // inside the relay's worker — this envelope field is how
          // we hand it back across the SSE done event.
          title: title || null,
          transcribe_model: txResult.model || null,
          analyze_model: analyzeResult.model || null,
          audio_seconds: audio.seconds || null,
          audio_bytes: audio.bytes || null,
          chunk_count: txResult.chunk_count ?? null,
          analyze_windows: analyzeResult.attempts?.windows ?? null,
          analyze_windows_failed: analyzeResult.attempts?.failed ?? null,
          wall_time_ms: wallTimeMs,
          // Phase 1D fields (null when diarization didn't run):
          speakers: txResult?.speakers || null,
          transcript_segments: transcriptSegments,
          // Phase 2 — speaker name map from the post-cluster polish
          // pass. Null when polish was skipped or all names came
          // back null. Recap merges this into the speakers legend
          // (showing "Matt Hill · 24:42" instead of "Speaker A").
          speaker_names: speakerNames || null,
        },
        credit_charged: creditCharged,
        tier,
      });
      console.log(
        `[summarize-url ${job.id.slice(0, 8)}] complete in ${(wallTimeMs / 1000).toFixed(1)}s (TX ${chosenBackend}, AN ${chosenAnalyzeBackend})`
      );
      } finally {
        // Release the hardware FIFO slot so the next queued job can
        // start. Runs on success path, on every early-return inside
        // the wrapped block, and on uncaught exceptions (in which
        // case the throw propagates to the .catch below after this
        // finally executes). releaseHardwareSlot is null when the
        // job was Gemini-only and never acquired a slot — no-op.
        if (releaseHardwareSlot) releaseHardwareSlot();
      }
    })().catch((err) => {
      markFailed(job.id, "worker_crashed: " + (err?.message || String(err)));
      console.error(`[summarize-url ${job.id.slice(0, 8)}] worker crashed:`, err);
    });

    const body = await envelope({
      result: {
        job_id: job.id,
        status: "queued",
        kind: "summarize-url",
      },
      creditKey,
      installId,
      license,
      tier,
    });
    res.json(body);
  });

  // GET /relay/summarize-url/:jobId/events — SSE stream of progress
  // and per-window events.
  //
  // Flow:
  //   1. Resolve the caller's identity (401 if resolution throws, 400
  //      if it yields no creditKey).
  //   2. Look up the job and authorize the caller against it. Both
  //      "no such job" and "not your job" answer with the same 404 so
  //      the existence of another identity's job is never leaked.
  //   3. Replay the job's existing event log (so a client that
  //      connects 2s after the kickoff still sees the download +
  //      transcribe events).
  //   4. If the job is already complete/failed, end the response.
  //      Otherwise subscribe for live events and keep the connection
  //      warm with a heartbeat comment frame.
  //
  // NOTE(review): once subscribed, this handler never ends the
  // response itself — only `send` is handed to subscribeToJob, so the
  // stream stays open until the client disconnects. Confirm clients
  // close the connection when they see the terminal event.
  router.get("/summarize-url/:jobId/events", async (req, res) => {
    let identity;
    try {
      identity = await resolveIdentity(req);
    } catch {
      return res.status(401).json({ error: "unauthorized" });
    }
    const ownerKey = identity.creditKey;
    if (!ownerKey) {
      return res.status(400).json({ error: "missing identity" });
    }
    const job = getJob(req.params.jobId);
    if (!job) return res.status(404).json({ error: "job not found or expired" });
    // New jobs carry metadata.owner = creditKey; older jobs only carry
    // install_id. Authorize by whichever the job has — and never leak the
    // existence of another identity's job.
    const allowed = job.metadata?.owner
      ? job.metadata.owner === ownerKey
      : identity.installId && job.install_id === identity.installId;
    if (!allowed) {
      return res.status(404).json({ error: "job not found or expired" });
    }

    // SSE headers.
    res.setHeader("Content-Type", "text/event-stream");
    res.setHeader("Cache-Control", "no-cache, no-transform");
    res.setHeader("Connection", "keep-alive");
    res.setHeader("X-Accel-Buffering", "no"); // disable nginx buffering
    res.flushHeaders?.();

    // Serialize one job event ({ type, data, ts }) as a single SSE
    // frame. The frame is written with ONE res.write call so it is
    // handed to the socket as a unit rather than as three fragments.
    const send = (event) => {
      // JSON.stringify(undefined) returns undefined, which would put
      // the literal text "undefined" on the wire — not valid JSON for
      // the client to parse. Fall back to null for data-less events.
      const payload = JSON.stringify(event.data ?? null);
      try {
        res.write(`event: ${event.type}\ndata: ${payload}\nid: ${event.ts}\n\n`);
      } catch {
        // Connection closed mid-write — caller's cleanup handles it.
      }
    };

    // Replay history. The client may have missed events that fired
    // between the POST returning and the GET arriving.
    for (const ev of job.events) {
      send(ev);
    }
    // If the job is already terminal, close immediately.
    if (job.status === "complete" || job.status === "failed") {
      res.end();
      return;
    }

    // Subscribe to live events. Returned function MUST be called on
    // any disconnect path or the subscriber leaks.
    const unsubscribe = subscribeToJob(job.id, send);

    // Heartbeat every 25s. Many proxies (nginx default 60s, others
    // ~30s) close idle SSE connections — a periodic comment frame
    // keeps the conn alive between sparse events without polluting
    // the event stream.
    const heartbeat = setInterval(() => {
      try {
        res.write(": ping\n\n");
      } catch {}
    }, 25_000);

    // A single disconnect can fire several of the events below, so
    // cleanup is guarded to run exactly once. Without the guard
    // unsubscribe() would be invoked up to three times, which is only
    // safe if the unsubscribe implementation happens to be idempotent.
    let cleanedUp = false;
    const cleanup = () => {
      if (cleanedUp) return;
      cleanedUp = true;
      clearInterval(heartbeat);
      if (unsubscribe) unsubscribe();
    };
    req.on("close", cleanup);
    req.on("aborted", cleanup);
    res.on("close", cleanup);
  });

  return router;
}

// Best-effort JSON parse for model output.
//
// Accepts either a bare JSON payload or a payload wrapped in a
// Markdown code fence (``` or ```json), optionally surrounded by
// prose. Returns the parsed value, or null when `text` is empty, not
// a string, or contains nothing parseable. Never throws.
//
// Parse order matters: the whole (trimmed) payload is tried FIRST.
// Looking for a fence first would mis-handle valid JSON whose string
// values themselves contain a fenced snippet (e.g. a section summary
// quoting code) — the fence regex would carve out the snippet and the
// parse of that fragment would fail or return the wrong value.
function tryParseJson(text) {
  if (!text || typeof text !== "string") return null;
  const trimmed = text.trim();
  const candidates = [trimmed];
  // Fallback: contents of the first fenced block, if any.
  const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (fenced) candidates.push(fenced[1].trim());
  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch {
      // Not valid JSON — fall through to the next candidate.
    }
  }
  return null;
}