Add internal-meetings pipeline and post-hoc speaker tools

2026-06-13 13:35:53 -05:00
parent 9a2dbf69df
commit 705807e286
15 changed files with 7375 additions and 0 deletions
@@ -0,0 +1,655 @@
+// Post-cluster polish pass: after transcribe + diarize + clustering
+// have produced a speaker-labeled transcript, AND after pipelined
+// analyze has produced section objects (titles + summaries), run a
+// two-stage LLM pass that:
+//
+//   Stage 1 — Global name inference. One LLM call with the
+//     speaker-labeled transcript + episode metadata (channel name,
+//     title, description) → JSON map { Speaker_A: "Matt Hill",
+//     Speaker_B: "Sarah Jones", Speaker_C: null }. The "_C: null"
+//     case is essential: when the LLM can't confidently identify a
+//     speaker, it must return null instead of guessing.
+//
+//   Stage 2 — Per-window summary polish. N parallel LLM calls, one
+//     per analyze window. Each call sees that window's sections
+//     (original summaries) + that window's transcript with speaker
+//     labels + the global name map from Stage 1, and rewrites each
+//     section's SUMMARY to attribute statements to specific
+//     speakers ("Matt Hill explains..." vs "the discussion
+//     centers..."). Section TITLES and start/end indices are kept
+//     unchanged — polish only touches summary text.
+//
+// Why two stages: name inference benefits from the FULL transcript
+// view (name introductions like "welcome Matt" tend to appear in
+// window 1 but Matt keeps speaking throughout); per-window polish
+// benefits from parallelism (matches the existing analyze pattern).
+// Running them as one batched call would either lose parallelism
+// or send the full transcript N times.
+//
+// Failure modes:
+//   - Stage 1 returns invalid JSON → all names default to null;
+//     Stage 2 still runs and produces "Speaker A explains..." etc.
+//   - Stage 2 fails for a particular window → keep the original
+//     analyze summary for that window's sections. Per-window
+//     failure shouldn't kill the whole polish.
+//   - Both stages fail → fall back to the unpolished analyzeResult.
+//     The caller sees the same output as a polish-disabled run.
+//
+// Cost: Stage 1 ~5-10s; Stage 2 ~10-15s (parallel); total ~15-25s
+// added to end of pipeline. On a 200s pipelined pipeline that's a
+// ~10% slowdown for the speaker-attribution UX win.
+
+import { recordCall } from "./audit-log.js";
+
+// Maximum LLM attempts for the single Stage 1 (name inference) call.
+// A thrown backend error and an unparseable response each consume one
+// attempt.
+const STAGE_1_MAX_ATTEMPTS = 3;
+// Maximum LLM attempts per window in Stage 2 (summary polish); counted
+// independently for every window.
+const STAGE_2_MAX_ATTEMPTS = 3;
+
+// ─── Default prompts (operator-editable via Settings tab) ───────────
+//
+// Same three-layer override pattern as the analyze + transcribe
+// prompts: per-session operator override → operator-promoted default
+// → these hardcoded defaults. Both are validated on save —
+// `DEFAULT_NAME_INFERENCE_PROMPT_TEMPLATE` must contain {{transcript}}
+// and JSON output instructions; `DEFAULT_SUMMARY_POLISH_PROMPT_TEMPLATE`
+// must contain {{sections}} and JSON output instructions. Template
+// variables (interpolated at request time):
+//
+// Name inference prompt:
+//   {{channel}}        — operator-supplied or yt-dlp-extracted channel name
+//   {{title}}          — episode/video title
+//   {{description}}    — episode description (capped at 800 chars)
+//   {{operatorContext}} — pre-rendered OPERATOR HINTS block that
+//                        runNameInference builds from participantHints
+//                        and operatorNotes; an empty string when
+//                        neither is supplied. A template that omits
+//                        this variable never shows the hints to the
+//                        model.
+//   {{speakerStats}}   — pre-formatted block listing each speaker's
+//                        chip letter, total speaking time, turn count
+//   {{transcript}}     — speaker-labeled bracketed transcript, capped
+//                        at 25k chars (middle truncated when over)
+//   {{speakerKeys}}    — JSON-schema-friendly key list for the
+//                        response shape (one line per Speaker_X)
+//
+// Summary polish prompt:
+//   {{speakerRoster}}  — pre-formatted block listing each speaker
+//                        with their inferred name (or "(unknown)")
+//                        and stats
+//   {{transcript}}     — this window's slice of the labeled transcript
+//   {{sections}}       — pre-formatted block listing each section
+//                        with title + original summary + time range
+export const DEFAULT_NAME_INFERENCE_PROMPT_TEMPLATE = `You are identifying real-world speaker names in an interview/podcast/meeting transcript. The transcript below has been pre-tagged with speaker labels like [A], [B], [C] — these are anonymous labels assigned by voice clustering. Your job: infer the real names of each speaker from contextual clues in the transcript.
+
+EPISODE METADATA:
+- Channel/show: {{channel}}
+- Episode title: {{title}}
+- Description: {{description}}
+
+{{operatorContext}}SPEAKER STATISTICS (cluster output):
+{{speakerStats}}
+
+TRANSCRIPT (each line is "[<letter> <MM:SS>] text"):
+{{transcript}}
+
+INSTRUCTIONS:
+1. For each Speaker_X in the speaker statistics, infer the real name from contextual clues:
+   - Direct introductions ("welcome Matt", "I'm joined by Sarah")
+   - Self-introductions ("my name is", "I'm Sarah, founder of...")
+   - References between speakers ("what do you think Matt?", "as Sarah was saying")
+   - Channel name or episode title hints
+   - Operator hints in the OPERATOR HINTS section above, IF PRESENT — but see rule 6 for how to weight those.
+2. Use the speaker statistics to help — the host typically speaks more turns; guests speak less.
+3. Use first + last name if confidently identifiable. Use first name only if that's all you have.
+4. RETURN null IF YOU CANNOT CONFIDENTLY IDENTIFY THE SPEAKER. Do not guess. A null is better than a wrong name.
+5. For brief speakers (under 30s of speaking time, e.g. an intro music VO or a passing comment) it's expected that you'll often return null.
+6. WEIGHTING OPERATOR HINTS: When an OPERATOR HINTS section appears above, treat it as informed suggestion, NOT authoritative truth. The operator may have listed people who turned out not to speak, omitted people who did, or guessed wrong on who matches which voice. ALWAYS verify hints against the transcript. Specifically:
+   - A name in the hints is only a candidate; if the transcript provides no signal that THIS Speaker_X is that person, return null instead of guessing.
+   - If the transcript clearly identifies a speaker as someone NOT in the hints, use the transcript's name.
+   - If the hints describe what each named person did ("Steve gave the update, John asked questions"), use that as a soft signal for mapping names to chip letters, but still verify with the transcript before committing.
+   - It is better to leave a speaker as null than to confidently map a hint to the wrong chip letter.
+
+Respond with ONLY valid JSON in this exact format, no other text:
+{
+  "speakers": {
+{{speakerKeys}}
+  }
+}`;
+
+// Default Stage 2 prompt. runSummaryPolish fills {{speakerRoster}},
+// {{transcript}} and {{sections}} once per window and uses this text
+// whenever no prompt override is supplied. The reply is parsed by
+// safeParsePolishedSections, which expects the { "sections": [...] }
+// shape requested at the end of the prompt; the "index" values are
+// window-local positions, matching the [0], [1], ... labels rendered
+// into {{sections}}.
+export const DEFAULT_SUMMARY_POLISH_PROMPT_TEMPLATE = `You wrote section summaries for a podcast/interview transcript window. We've now identified the speakers via voice clustering and (where possible) inferred their real names. Your job: rewrite each section's SUMMARY to attribute statements to specific speakers where it improves clarity, naturalness, and information density.
+
+SPEAKERS (from voice clustering across the full episode):
+{{speakerRoster}}
+
+WINDOW TRANSCRIPT (this window's slice; each line is labeled with the speaker's name, or a chip letter when their name is unknown):
+{{transcript}}
+
+ORIGINAL SECTIONS IN THIS WINDOW (re-write the summary of each):
+{{sections}}
+
+INSTRUCTIONS:
+1. The SPEAKERS roster and the WINDOW TRANSCRIPT are the AUTHORITATIVE source of who said what. The ORIGINAL summaries were written in an earlier pass and may attribute statements to OUTDATED or WRONG speaker names — your job includes CORRECTING those.
+2. Rewrite each section's SUMMARY so every speaker attribution matches the transcript + roster. If an original summary credits a statement to a person who, per the transcript, was actually said by someone else, REPLACE the name with the correct one. Never keep a name that does not appear in the roster.
+3. Use real names when available ("Matt Hill explains..."); fall back to a chip letter only for a speaker who has no name ("Speaker A explains...").
+4. Keep summaries 1-3 sentences — same length range as the original.
+5. KEEP THE TITLE EXACTLY AS GIVEN. Do not rewrite titles.
+6. Return the sections in the SAME ORDER as given, with the SAME INDEX numbers in the array.
+7. If a section is primarily one speaker, lean into their name ("Matt explains..."). If it is back-and-forth, name both ("Matt and Sarah debate...").
+8. If the transcript for a section genuinely has no speaker signal, keep the original summary's wording — but still fix or drop any name in it that conflicts with the roster. DO NOT invent attribution that the transcript does not support.
+
+Respond with ONLY valid JSON in this exact format:
+{
+  "sections": [
+    { "index": 0, "summary": "Polished summary text..." },
+    { "index": 1, "summary": "..." }
+  ]
+}
+
+Return only the sections in this window. Use the same indices as the input ([0], [1], ...). Only the summary field — title and indices stay as given.`;
+
+// Substitute {{key}} placeholders (optional whitespace inside the
+// braces) in a prompt template.
+//
+//   template — template text; null/undefined/"" is treated as "".
+//   vars     — plain object of replacement values; each value is
+//              stringified with String(). A missing `vars` behaves
+//              like an empty object.
+//
+// Returns the rendered string. Unknown keys are left as the literal
+// {{key}} so an operator's edit that drops a variable doesn't crash
+// the run — the model just sees the placeholder.
+//
+// Only OWN properties of `vars` count as known keys: with a plain `in`
+// check, a template containing {{toString}} or {{constructor}} would be
+// "filled" with the source text of an inherited Object.prototype
+// member. Substitution is a single pass, so placeholder-looking text
+// inside a substituted value (e.g. in the transcript) is not expanded.
+function fillTemplate(template, vars) {
+  const values = vars ?? {};
+  return String(template || "").replace(/\{\{\s*(\w+)\s*\}\}/g, (_match, key) =>
+    Object.prototype.hasOwnProperty.call(values, key)
+      ? String(values[key])
+      : `{{${key}}}`
+  );
+}
+
+// Build a transcript representation with speaker labels prefixed.
+// Each line: `[A 0:08] So Matt, tell us how you got started…`
+//   - The bracketed prefix is `[<label> <MM:SS>]`; minutes are not
+//     wrapped at 60, so 1h05m renders as `65:00`.
+//   - <label> is the speaker chip letter (Speaker_A → A) by default;
+//     when the segment has no speaker (gap, unmatched), uses "?" — the
+//     model is instructed to ignore those lines for name inference.
+//   - When opts.speakerNames is provided (the SUMMARY-POLISH pass), a
+//     named speaker is labeled with their RESOLVED name instead of the
+//     letter (`[Matt 0:08] …`). This matters for the re-polish flow: the
+//     operator's corrected names land directly on every transcript line,
+//     so the polish model attributes to the right person without having
+//     to resolve chip letters through the roster — and without trusting
+//     any stale names already baked into the original summaries. The
+//     name-INFERENCE pass deliberately omits speakerNames (it's inferring
+//     them) so it still sees plain chip letters.
+//
+// Parameters:
+//   segments          — array of { start, text, speaker }; anything that
+//                       is not a non-empty array yields "".
+//   opts.startSec     — inclusive lower bound on segment.start
+//                       (default: no lower bound).
+//   opts.endSec       — exclusive upper bound on segment.start
+//                       (default: no upper bound).
+//   opts.speakerNames — optional map Speaker_X → display name.
+//
+// Returns the lines joined with "\n". Segments with blank text and
+// entries that are not objects are skipped.
+export function formatSpeakerLabeledTranscript(segments, opts = {}) {
+  if (!Array.isArray(segments) || segments.length === 0) return "";
+  // The default parameter only covers `undefined`; tolerate an explicit null.
+  const options = opts ?? {};
+  const startSec = options.startSec != null ? options.startSec : -Infinity;
+  const endSec = options.endSec != null ? options.endSec : Infinity;
+  const speakerNames =
+    options.speakerNames && typeof options.speakerNames === "object"
+      ? options.speakerNames
+      : null;
+  const lines = [];
+  for (const seg of segments) {
+    // A hole in the segment list must not abort the whole prompt build.
+    if (!seg || typeof seg !== "object") continue;
+    const t = seg.start || 0;
+    if (t < startSec || t >= endSec) continue;
+    // Fold embedded line breaks into spaces: the prompts promise one
+    // labeled line per segment, and a continuation line would carry no
+    // speaker label.
+    const text = String(seg.text || "")
+      .trim()
+      .replace(/\s*[\r\n]+\s*/g, " ");
+    if (!text) continue;
+    let label = "?";
+    const m = String(seg.speaker || "").match(/^Speaker_([A-Z]+)$/);
+    if (m) {
+      label = m[1];
+      // Prefer the operator-corrected name when we have one.
+      const nm = speakerNames ? speakerNames[seg.speaker] : undefined;
+      if (typeof nm === "string") {
+        // Strip brackets (they delimit the prefix), collapse whitespace,
+        // and trim AFTER stripping so "[ Matt ]" becomes "Matt". If
+        // nothing is left, keep the chip letter rather than emitting an
+        // empty label.
+        const cleaned = nm.replace(/[\[\]]/g, "").replace(/\s+/g, " ").trim();
+        if (cleaned) label = cleaned;
+      }
+    }
+    const sec = Math.floor(t);
+    const mm = Math.floor(sec / 60);
+    const ss = sec % 60;
+    lines.push(`[${label} ${mm}:${String(ss).padStart(2, "0")}] ${text}`);
+  }
+  return lines.join("\n");
+}
+
+// ─── Stage 1: global speaker name inference ─────────────────────────
+//
+// Returns: { Speaker_A: "Matt Hill" | null, Speaker_B: ..., ... }
+// On total failure returns an object with all values null so Stage 2
+// can still run and produce generic speaker-attributed summaries
+// ("Speaker A explains...").
+
+// Keep roughly the first and last `maxChars / 2` characters of `text`
+// when it is longer than `maxChars`, joined by a truncation marker.
+// Both cut points are snapped to line breaks: a line sliced in half
+// would leave a fragment with no `[<label> <MM:SS>]` prefix right after
+// the marker, i.e. speech with no speaker attached. Text with no line
+// breaks falls back to a plain character cut.
+function truncateMiddleOnLineBreaks(text, maxChars) {
+  if (text.length <= maxChars) return text;
+  const half = Math.floor(maxChars / 2);
+  const tailStart = text.length - half;
+  let head = text.slice(0, half);
+  let tail = text.slice(tailStart);
+  // Head ends mid-line unless the very next character is a line break.
+  if (text[half] !== "\n") {
+    const lastBreak = head.lastIndexOf("\n");
+    if (lastBreak > 0) head = head.slice(0, lastBreak);
+  }
+  // Tail starts mid-line unless the character before it is a line break.
+  if (text[tailStart - 1] !== "\n") {
+    const firstBreak = tail.indexOf("\n");
+    if (firstBreak >= 0 && firstBreak < tail.length - 1) {
+      tail = tail.slice(firstBreak + 1);
+    }
+  }
+  return `${head}\n\n…[middle truncated for prompt length]…\n\n${tail}`;
+}
+
+// Runs one LLM request (retried up to STAGE_1_MAX_ATTEMPTS times on a
+// thrown error or an unparseable reply), writes exactly one audit row
+// via recordCall, and resolves to the Speaker_X → name-or-null map.
+// Keys of `speakers` that do not match /^Speaker_[A-Z]+$/ are ignored;
+// when none match, resolves to {} without calling the backend or the
+// audit log. Errors thrown by recordCall or computeCostDetails are not
+// caught here and reject the returned promise.
+export async function runNameInference({
+  speakers, // map from clusterSpeakers (Speaker_A → stats)
+  transcriptSegments, // array of { start, end, text, speaker } with speakers attached
+  channelHint = "",
+  titleHint = "",
+  descriptionHint = "",
+  // Operator-editable prompt override (Settings → LLM prompts).
+  // Three-layer resolution at the caller: per-session override →
+  // operator-promoted default → DEFAULT_NAME_INFERENCE_PROMPT_TEMPLATE.
+  // Empty string means "use the hardcoded default" inside this fn.
+  promptOverride = "",
+  // Free-form operator context — internal meetings path uses this to
+  // pass participant hints + notes that the LLM should treat as
+  // suggestions, not authoritative truth. Empty string → no
+  // OPERATOR HINTS block appears in the rendered prompt. The block
+  // is composed here (not by the caller) so the warning language
+  // stays consistent across pipelines.
+  participantHints = "",
+  operatorNotes = "",
+  backend,
+  // audit params
+  pipelineBackend,
+  jobId,
+  batchId,
+  mediaUrl,
+  installId,
+  licenseFingerprint = null,
+  source,
+  computeCostDetails,
+}) {
+  const speakerLetters = Object.keys(speakers || {})
+    .filter((k) => /^Speaker_[A-Z]+$/.test(k))
+    .sort();
+  // Build the null-default map up front so any early-return path
+  // still returns the right shape.
+  const nullMap = Object.fromEntries(speakerLetters.map((k) => [k, null]));
+  if (speakerLetters.length === 0) return nullMap;
+
+  // For name inference we send the WHOLE labeled transcript (no
+  // time window), capped at ~25k chars to bound the prompt size. Over
+  // the cap, the head and tail are kept and the middle is dropped.
+  // NOTE(review): an earlier comment estimated 25k chars ≈ 200 minutes
+  // of transcript; that looks optimistic for conversational speech —
+  // measure on real transcripts before relying on it.
+  const cappedTranscript = truncateMiddleOnLineBreaks(
+    formatSpeakerLabeledTranscript(transcriptSegments),
+    25000,
+  );
+
+  // Speaker stats block — gives the model a sense of who speaks
+  // how much, which helps it map names confidently.
+  const speakerStatsBlock = speakerLetters
+    .map((k) => {
+      const stats = speakers[k] || {};
+      const secs = Math.round(stats.total_speaking_seconds || 0);
+      const mins = Math.floor(secs / 60);
+      const rem = secs % 60;
+      const timeStr = mins > 0 ? `${mins}m ${rem}s` : `${rem}s`;
+      const letter = k.replace("Speaker_", "");
+      return `- ${k} (chip "${letter}"): ${timeStr} of speaking time, ${stats.turns || 0} turns`;
+    })
+    .join("\n");
+
+  // One `"Speaker_X": "Real Name or null"` line per speaker, comma
+  // separated, for the response-shape example in the prompt.
+  const speakerKeysBlock = speakerLetters
+    .map((k, i) => `    "${k}": "Real Name or null"${i < speakerLetters.length - 1 ? "," : ""}`)
+    .join("\n");
+
+  // Compose the OPERATOR HINTS block. Renders only when at least one
+  // of participantHints / operatorNotes is non-empty. The wrapping
+  // heading + warning language lives here (not in the template) so
+  // the "treat as suggestions" framing always accompanies the hints
+  // whenever the template renders {{operatorContext}}.
+  const hintsParts = [];
+  if (participantHints && String(participantHints).trim()) {
+    hintsParts.push(
+      `Possible participants in this conversation (operator-supplied — may be incomplete or include people who don't actually speak):\n${String(participantHints).trim()}`,
+    );
+  }
+  if (operatorNotes && String(operatorNotes).trim()) {
+    // Cap notes at 4000 chars to leave room for the transcript in
+    // the prompt window. Only the head is kept (anything past 4000
+    // chars is dropped) — fine because notes are usually short.
+    const trimmed = String(operatorNotes).trim().slice(0, 4000);
+    hintsParts.push(
+      `Operator notes on the conversation (may describe what each named participant did — use as a soft signal for mapping names to chip letters):\n${trimmed}`,
+    );
+  }
+  const operatorContextBlock = hintsParts.length
+    ? `OPERATOR HINTS (treat as suggestions only — verify against the transcript before assigning names):\n\n${hintsParts.join("\n\n")}\n\n`
+    : "";
+
+  const templateSource =
+    typeof promptOverride === "string" && promptOverride.trim()
+      ? promptOverride
+      : DEFAULT_NAME_INFERENCE_PROMPT_TEMPLATE;
+  const prompt = fillTemplate(templateSource, {
+    channel: channelHint || "(unknown)",
+    title: titleHint || "(unknown)",
+    // Coerce before slicing: a non-string description (e.g. a number
+    // from loosely-typed metadata) would otherwise throw here, outside
+    // the retry/fallback path below.
+    description: String(descriptionHint || "").slice(0, 800) || "(none)",
+    operatorContext: operatorContextBlock,
+    speakerStats: speakerStatsBlock,
+    transcript: cappedTranscript,
+    speakerKeys: speakerKeysBlock,
+  });
+
+  const t0 = Date.now();
+  let lastErr = null;
+  let r = null;
+  let parsed = null;
+  for (let attempt = 0; attempt < STAGE_1_MAX_ATTEMPTS; attempt++) {
+    try {
+      r = await backend.analyzeText({ prompt });
+      parsed = safeParseSpeakers(r.text, speakerLetters);
+      if (parsed) {
+        lastErr = null;
+        break;
+      }
+      lastErr = "invalid JSON in name-inference response";
+    } catch (err) {
+      // Keep the audit row's error field short.
+      lastErr = (err?.message || String(err)).slice(0, 280);
+      r = null;
+    }
+    if (attempt < STAGE_1_MAX_ATTEMPTS - 1) {
+      console.warn(
+        `[polish/names] attempt ${attempt + 1} failed (${lastErr}) — retrying`
+      );
+    }
+  }
+  const dur = Date.now() - t0;
+  // Cost is attributed only to a successful run; a failed run is
+  // recorded with zeroed usage. duration_ms covers all attempts.
+  const cost = parsed && r ? computeCostDetails(r.model, r.usage) : { input_tokens: 0, output_tokens: 0, thinking_tokens: 0, cost_usd: 0 };
+  await recordCall({
+    install_id: installId,
+    license_fingerprint: licenseFingerprint,
+    tier: "core",
+    pipeline: "polish_names",
+    backend: pipelineBackend,
+    model: r?.model || null,
+    status: parsed ? "success" : "error",
+    duration_ms: dur,
+    audio_seconds: 0,
+    job_id: jobId,
+    batch_id: batchId,
+    source,
+    media_url: mediaUrl,
+    error: parsed ? null : (lastErr || "name inference failed"),
+    ...cost,
+  });
+  if (!parsed) {
+    console.warn(
+      `[polish/names] all ${STAGE_1_MAX_ATTEMPTS} attempts failed (${lastErr}) — falling back to null names`
+    );
+    return nullMap;
+  }
+  // Copy over only usable names: non-empty strings that are not the
+  // literal text "null" (the response example shows the value quoted,
+  // so the model sometimes returns the word instead of JSON null).
+  const filled = { ...nullMap };
+  for (const k of speakerLetters) {
+    const v = parsed[k];
+    if (typeof v === "string" && v.trim() && v.trim().toLowerCase() !== "null") {
+      filled[k] = v.trim();
+    }
+  }
+  const namedCount = Object.values(filled).filter((v) => v).length;
+  console.log(
+    `[polish/names] inferred ${namedCount}/${speakerLetters.length} speakers in ${(dur / 1000).toFixed(1)}s`
+  );
+  return filled;
+}
+
+// Extract the `speakers` object from a name-inference reply.
+//
+//   text         — raw model output.
+//   expectedKeys — array of Speaker_X keys that must all be present
+//                  (null values are fine).
+//
+// Returns the parsed `speakers` object, or null when no usable payload
+// is found (which the caller treats as a failed attempt).
+//
+// Three readings of the text are tried in order, first usable wins:
+//   1. the trimmed text as-is — tried first so valid JSON whose string
+//      values happen to contain ``` is not mangled by fence stripping;
+//   2. the body of the first ``` / ```json fenced block;
+//   3. the span from the first "{" to the last "}", which recovers a
+//      payload wrapped in prose ("Here is the JSON: …").
+function safeParseSpeakers(text, expectedKeys) {
+  if (!text || typeof text !== "string") return null;
+  const trimmed = text.trim();
+  const candidates = [trimmed];
+  const fence = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);
+  if (fence) candidates.push(fence[1].trim());
+  const open = trimmed.indexOf("{");
+  const close = trimmed.lastIndexOf("}");
+  if (open >= 0 && close > open) candidates.push(trimmed.slice(open, close + 1));
+
+  for (const candidate of candidates) {
+    let parsed;
+    try {
+      parsed = JSON.parse(candidate);
+    } catch {
+      continue;
+    }
+    const speakers = parsed?.speakers;
+    // typeof [] is "object" too; an array is never a valid name map.
+    if (!speakers || typeof speakers !== "object" || Array.isArray(speakers)) continue;
+    // Require all expected keys present (null values OK)
+    const complete = expectedKeys.every((k) =>
+      Object.prototype.hasOwnProperty.call(speakers, k)
+    );
+    if (complete) return speakers;
+  }
+  return null;
+}
+
+// ─── Stage 2: per-window summary polish ─────────────────────────────
+//
+// `sections` is the FULL stitched section list with global entry
+// indices (output of stitchAnalysisResults). We need per-window
+// groupings to fire N parallel LLM calls — each call sees only the
+// sections within its window's time range so the prompt stays
+// bounded.
+//
+// `windows` is the planned-windows array. Each window's bodyStartSec
+// decides which sections belong to it; its startSec / windowEndSec
+// (falling back to bodyStartSec / bodyEndSec) bound the transcript
+// slice sent in that window's prompt.
+//
+// `canonicalEntries` is the parsed transcript (the same array
+// stitcher uses for indexing). We need it to convert section
+// startIndex/endIndex into time ranges for grouping.
+//
+// `concurrency` is the maximum number of window calls in flight;
+// anything that is not a number >= 1 falls back to 4.
+//
+// Returns: a NEW array the same length and order as `sections`. A
+// section whose summary was rewritten is a shallow copy with the new
+// `summary`; every other section (including all sections of a window
+// whose polish failed) is the original object. The input array and
+// its objects are not mutated, and titles / start / end indices are
+// NEVER modified. When `sections`, `windows` or `canonicalEntries` is
+// missing or empty, `sections` itself is returned untouched.
+//
+// One audit row is written via recordCall per window that has at
+// least one section. Errors thrown by recordCall or computeCostDetails
+// are not caught here and reject the returned promise.
+export async function runSummaryPolish({
+  sections,
+  canonicalEntries,
+  windows,
+  transcriptSegments,
+  speakerNames,
+  speakerStats,
+  // Operator-editable polish prompt override. Same three-layer
+  // resolution at caller as the name-inference override.
+  promptOverride = "",
+  backend,
+  concurrency,
+  // audit params
+  pipelineBackend,
+  jobId,
+  batchId,
+  mediaUrl,
+  installId,
+  licenseFingerprint = null,
+  source,
+  computeCostDetails,
+}) {
+  if (!Array.isArray(sections) || sections.length === 0) return sections;
+  if (!Array.isArray(windows) || windows.length === 0) return sections;
+  if (!Array.isArray(canonicalEntries) || canonicalEntries.length === 0) return sections;
+
+  // Group sections by window — assign each section to the window
+  // whose body contains its start time. Sections whose start time
+  // falls before any window's body (shouldn't happen in practice)
+  // get assigned to window 0. Sections whose startIndex has no
+  // canonical entry are left out and keep their original summary.
+  //
+  // Each slot remembers the section's position in `sections`, so a
+  // polished summary can be written back without searching the array —
+  // a search is quadratic and picks the wrong slot when the same
+  // section object appears twice.
+  const sectionsByWindow = windows.map(() => []);
+  sections.forEach((sec, origIdx) => {
+    const startEntry = canonicalEntries[sec.startIndex];
+    if (!startEntry) return;
+    const startSec = startEntry.offset || 0;
+    const hit = windows.findIndex((w, i) => {
+      const nextW = windows[i + 1];
+      const upper = nextW ? nextW.bodyStartSec : Infinity;
+      return startSec >= (w.bodyStartSec || 0) && startSec < upper;
+    });
+    sectionsByWindow[hit < 0 ? 0 : hit].push({ sec, origIdx });
+  });
+
+  // Build a speaker-roster block reused across all window prompts.
+  const speakerRoster = Object.entries(speakerNames || {})
+    .filter(([k]) => /^Speaker_[A-Z]+$/.test(k))
+    .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0))
+    .map(([k, name]) => {
+      const letter = k.replace("Speaker_", "");
+      const stats = speakerStats?.[k] || {};
+      const secs = Math.round(stats.total_speaking_seconds || 0);
+      const mins = Math.floor(secs / 60);
+      const rem = secs % 60;
+      const timeStr = mins > 0 ? `${mins}m ${rem}s` : `${rem}s`;
+      const nameLabel = name ? `"${name}"` : "(unknown name)";
+      return `- ${k} (chip [${letter}], ${timeStr} speaking, ${stats.turns || 0} turns): ${nameLabel}`;
+    })
+    .join("\n");
+
+  // Per-window worker — runs one LLM call to polish all sections
+  // assigned to that window. Returns a Map<position in `sections`,
+  // newSummary> covering only the sections it successfully rewrote.
+  const worker = async (windowIdx) => {
+    const w = windows[windowIdx];
+    const winSections = sectionsByWindow[windowIdx];
+    if (winSections.length === 0) return new Map();
+
+    // Transcript slice for this window. A window without any end bound
+    // is treated as open-ended (null → no upper limit) rather than
+    // ending at 0, which would send the model an empty transcript.
+    const winStartSec = w.startSec ?? w.bodyStartSec ?? 0;
+    const winEndSec = w.windowEndSec || w.bodyEndSec || null;
+    const windowTranscript = formatSpeakerLabeledTranscript(transcriptSegments, {
+      startSec: winStartSec,
+      endSec: winEndSec,
+      // Label lines with the corrected names so attributions in the
+      // rewritten summaries follow the operator's roster, not whatever
+      // (possibly stale) names the original summaries were written with.
+      speakerNames,
+    });
+
+    // Format each section for the prompt, numbered by its window-local
+    // position — the index the model is asked to echo back.
+    const sectionsBlock = winSections
+      .map(({ sec }, i) => {
+        const startEntry = canonicalEntries[sec.startIndex];
+        const endEntry = canonicalEntries[sec.endIndex];
+        const tStart = startEntry ? (startEntry.offset || 0) : 0;
+        const tEnd = endEntry ? (endEntry.offset || 0) : 0;
+        const tStartStr = fmtMmSs(tStart);
+        const tEndStr = fmtMmSs(tEnd);
+        return `[${i}] Title: "${sec.title}" (${tStartStr}-${tEndStr})\n    Original summary: ${sec.summary}`;
+      })
+      .join("\n\n");
+
+    const templateSource =
+      typeof promptOverride === "string" && promptOverride.trim()
+        ? promptOverride
+        : DEFAULT_SUMMARY_POLISH_PROMPT_TEMPLATE;
+    const prompt = fillTemplate(templateSource, {
+      speakerRoster: speakerRoster || "(no speakers identified)",
+      transcript: windowTranscript || "(empty)",
+      sections: sectionsBlock,
+    });
+
+    const t0 = Date.now();
+    let r = null;
+    let parsed = null;
+    let lastErr = null;
+    for (let attempt = 0; attempt < STAGE_2_MAX_ATTEMPTS; attempt++) {
+      try {
+        r = await backend.analyzeText({ prompt });
+        parsed = safeParsePolishedSections(r.text, winSections.length);
+        if (parsed) {
+          lastErr = null;
+          break;
+        }
+        lastErr = "invalid JSON in polish response";
+      } catch (err) {
+        // Keep the audit row's error field short.
+        lastErr = (err?.message || String(err)).slice(0, 280);
+        r = null;
+      }
+      if (attempt < STAGE_2_MAX_ATTEMPTS - 1) {
+        console.warn(
+          `[polish/window ${windowIdx + 1}/${windows.length}] attempt ${attempt + 1} failed (${lastErr}) — retrying`
+        );
+      }
+    }
+    const dur = Date.now() - t0;
+    // Cost is attributed only to a successful run; a failed window is
+    // recorded with zeroed usage. duration_ms covers all attempts.
+    const cost = parsed && r ? computeCostDetails(r.model, r.usage) : { input_tokens: 0, output_tokens: 0, thinking_tokens: 0, cost_usd: 0 };
+    await recordCall({
+      install_id: installId,
+      license_fingerprint: licenseFingerprint,
+      tier: "core",
+      pipeline: "polish_summaries",
+      backend: pipelineBackend,
+      model: r?.model || null,
+      status: parsed ? "success" : "error",
+      duration_ms: dur,
+      audio_seconds: 0,
+      job_id: jobId,
+      batch_id: batchId,
+      source,
+      media_url: mediaUrl,
+      error: parsed ? null : (lastErr || "polish failed"),
+      window_idx: windowIdx,
+      window_count: windows.length,
+      ...cost,
+    });
+    if (!parsed) {
+      console.warn(
+        `[polish/window ${windowIdx + 1}/${windows.length}] all attempts failed (${lastErr}) — keeping original summaries`
+      );
+      return new Map();
+    }
+    // Map polish output back to positions in `sections` through the
+    // window-local index the model echoed. Out-of-range or blank
+    // entries are ignored; if an index repeats, the last one wins.
+    const out = new Map();
+    for (const p of parsed) {
+      const localIdx = p.index;
+      if (!Number.isInteger(localIdx) || localIdx < 0 || localIdx >= winSections.length) continue;
+      const newSummary = (p.summary || "").trim();
+      if (!newSummary) continue;
+      out.set(winSections[localIdx].origIdx, newSummary);
+    }
+    console.log(
+      `[polish/window ${windowIdx + 1}/${windows.length}] polished ${out.size}/${winSections.length} sections in ${(dur / 1000).toFixed(1)}s`
+    );
+    return out;
+  };
+
+  // Concurrent worker pool (same shape as runPipelinedAnalysis): each
+  // pool worker claims the next unprocessed window index until none
+  // are left.
+  let nextIdx = 0;
+  const updates = new Map();
+  async function poolWorker() {
+    while (true) {
+      const my = nextIdx++;
+      if (my >= windows.length) return;
+      const result = await worker(my);
+      for (const [k, v] of result) updates.set(k, v);
+    }
+  }
+  // A zero, negative or non-numeric concurrency must not produce an
+  // empty pool — that would skip every window without any error.
+  const requested = Math.floor(Number(concurrency));
+  const poolSize = Math.min(requested >= 1 ? requested : 4, windows.length);
+  await Promise.all(Array.from({ length: poolSize }, () => poolWorker()));
+
+  // Apply polished summaries onto a shallow copy of sections so the
+  // caller's reference doesn't mutate unexpectedly.
+  const polished = sections.map((sec, i) => {
+    const newSummary = updates.get(i);
+    return newSummary ? { ...sec, summary: newSummary } : sec;
+  });
+  console.log(
+    `[polish] applied ${updates.size}/${sections.length} polished summaries`
+  );
+  return polished;
+}
+
+// Extract the polished-section list from a Stage 2 reply.
+//
+//   text     — raw model output.
+//   maxIndex — number of sections sent in the prompt; valid indices are
+//              0 .. maxIndex - 1.
+//
+// Returns the array of entries that have an integer in-range `index`
+// and a non-blank string `summary` (entries are returned as parsed,
+// untrimmed), or null when no such entry is found — which the caller
+// treats as a failed attempt.
+//
+// Three readings of the text are tried in order, first usable wins:
+//   1. the trimmed text as-is — tried first so valid JSON whose
+//      summaries happen to contain ``` is not mangled by fence
+//      stripping;
+//   2. the body of the first ``` / ```json fenced block;
+//   3. the span from the first "{" to the last "}", which recovers a
+//      payload wrapped in prose ("Here is the JSON: …").
+function safeParsePolishedSections(text, maxIndex) {
+  if (!text || typeof text !== "string") return null;
+  const trimmed = text.trim();
+  const candidates = [trimmed];
+  const fence = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);
+  if (fence) candidates.push(fence[1].trim());
+  const open = trimmed.indexOf("{");
+  const close = trimmed.lastIndexOf("}");
+  if (open >= 0 && close > open) candidates.push(trimmed.slice(open, close + 1));
+
+  for (const candidate of candidates) {
+    let parsed;
+    try {
+      parsed = JSON.parse(candidate);
+    } catch {
+      continue;
+    }
+    const arr = parsed?.sections;
+    if (!Array.isArray(arr)) continue;
+    // Validate each entry has {index: int, summary: string} and indices
+    // are in-range; invalid entries are dropped, not fatal.
+    const valid = arr.filter(
+      (item) =>
+        item !== null &&
+        typeof item === "object" &&
+        Number.isInteger(item.index) &&
+        item.index >= 0 &&
+        item.index < maxIndex &&
+        typeof item.summary === "string" &&
+        item.summary.trim() !== ""
+    );
+    if (valid.length > 0) return valid;
+  }
+  return null;
+}
+
+// Render a duration in seconds as "M:SS", or "H:MM:SS" once it
+// reaches an hour. Fractions are floored; negative, NaN and missing
+// inputs render as "0:00".
+function fmtMmSs(seconds) {
+  const total = Math.max(0, Math.floor(seconds || 0));
+  const two = (n) => n.toString().padStart(2, "0");
+  const hours = Math.floor(total / 3600);
+  const minutes = Math.floor((total % 3600) / 60);
+  const secPart = two(total % 60);
+  if (hours > 0) {
+    return `${hours}:${two(minutes)}:${secPart}`;
+  }
+  return `${minutes}:${secPart}`;
+}