Add internal-meetings pipeline and post-hoc speaker tools

2026-06-13 13:35:53 -05:00
parent 9a2dbf69df
commit 705807e286
15 changed files with 7375 additions and 0 deletions
@@ -0,0 +1,142 @@
+// Chunk-buffer state used by the pipelined-analyze path in
+// routes/summarize-url.js. The hardware backend fires
+// onChunkComplete(chunkData) as each transcribe chunk finishes;
+// this buffer:
+//   - drains chunks in INDEX ORDER (chunks may arrive out of order
+//     when concurrency > 1; we hold them in `pending` until the
+//     next-expected index lands so dedup against the prior chunk's
+//     overlap boundary is deterministic)
+//   - dedupes each new chunk's segments against the prior chunk's
+//     overlapBoundarySec — same logic that runs at end-of-transcribe
+//     in hardware.js, but applied incrementally so analyze can read
+//     a clean, no-duplicates segment view per window
+//   - tracks coveredEndSec (the maximum global timestamp the deduped
+//     buffer extends to, considering ONLY in-order chunks)
+//   - lets the analyze workers await `waitForTime(targetSec)` and
+//     query `getSegments(startSec, endSec)` to build per-window
+//     analyze inputs as soon as the required chunks are in
+//
+// Failure modes:
+//   - A chunk fails entirely → its segments are empty / undefined.
+//     The buffer still advances nextExpected past it so later chunks
+//     aren't stuck behind. The window covering that chunk's range
+//     gets a shorter transcript and may yield no sections (or fewer
+//     than expected). Downstream stitcher tolerates gaps.
+//   - waitForTime can wait forever if the relevant chunk index
+//     never arrives. Caller is responsible for racing this against
+//     the transcribe Promise so a transcribe failure unblocks all
+//     pending waiters via reject.
+
/**
 * Create the incremental chunk buffer used by the pipelined-analyze path.
 *
 * Chunks are handed to `add()` in whatever order they finish; the buffer
 * drains them strictly in index order, drops segments that duplicate the
 * previous chunk's overlap region, and lets callers wait until a given
 * global timestamp is covered before slicing segments for a window.
 *
 * All methods rely on `this`, so call them on the returned object
 * (`buffer.add(chunk)`), or bind them before passing them as callbacks.
 *
 * @returns {object} Buffer state plus the methods `add`, `checkWaiters`,
 *   `waitForTime`, `getSegments`, `fail` and `isComplete`.
 */
export function createChunkBuffer() {
  return {
    // Sparse staging area for chunks that arrived out of index order.
    pending: new Map(),
    // Drained, deduped segments in drain order. Append-only.
    segments: [],
    // Index of the next chunk we're waiting on to drain.
    nextExpected: 0,
    // Total chunk count, taken from the first chunk that reports an
    // integer `totalChunks`.
    totalChunks: null,
    // Greatest global end-time covered by DRAINED chunks. Chunks still
    // parked in `pending` don't count until their predecessors land, so
    // dedup stays consistent.
    coveredEndSec: 0,
    // The previous drained chunk's overlap boundary in GLOBAL seconds.
    // Segments in the next chunk with start < this are duplicates of
    // segments already in the prior chunk's tail and get dropped.
    prevOverlapBoundary: 0,
    // Async waiters: { targetSec, resolve, reject }
    waiters: [],
    // Set true on terminal failure so future waiters reject immediately
    // instead of hanging.
    failed: false,
    failedReason: null,

    /**
     * True once every chunk announced via `totalChunks` has been drained.
     * After that point `coveredEndSec` can no longer grow.
     *
     * @returns {boolean}
     */
    isComplete() {
      return this.totalChunks != null && this.nextExpected >= this.totalChunks;
    },

    /**
     * Accept one finished chunk and drain every chunk that is now
     * contiguous with what has already been drained.
     *
     * Ignored: calls after `fail()`, nullish `chunkData`, chunks whose
     * `chunkIndex` is not an integer, and chunks whose index was already
     * drained (late re-delivery). The last two could never be drained and
     * would otherwise sit in `pending` forever.
     *
     * @param {object} chunkData Reads `chunkIndex`, `totalChunks`,
     *   `segments`, `overlapBoundarySec`, `startSeconds`, `durationSeconds`.
     */
    add(chunkData) {
      if (this.failed) return;
      if (chunkData == null) return;
      if (this.totalChunks == null && Number.isInteger(chunkData.totalChunks)) {
        this.totalChunks = chunkData.totalChunks;
      }

      const index = chunkData.chunkIndex;
      if (!Number.isInteger(index) || index < this.nextExpected) {
        // Undrainable chunk: don't stage it. Still re-check waiters,
        // because this call may have been the one that taught us
        // totalChunks.
        this.checkWaiters();
        return;
      }
      this.pending.set(index, chunkData);

      // Drain consecutive chunks starting from nextExpected.
      while (this.pending.has(this.nextExpected)) {
        const c = this.pending.get(this.nextExpected);
        this.pending.delete(this.nextExpected);
        // A chunk that failed entirely has empty / undefined segments;
        // it still advances nextExpected so later chunks aren't stuck.
        const segs = Array.isArray(c.segments) ? c.segments : [];
        // Dedup against the global overlap boundary set by the prior
        // chunk: keep a segment only if it starts at or after it.
        for (const s of segs) {
          // A null entry must not throw here: an exception mid-drain
          // would leave nextExpected un-advanced and wedge the buffer.
          if (s == null) continue;
          if ((s.start || 0) >= this.prevOverlapBoundary) {
            this.segments.push(s);
          }
        }
        // overlapBoundarySec is ALREADY a global timestamp, NOT a
        // chunk-relative offset. Adding c.startSeconds to it would
        // double-count and push the boundary past every later chunk's
        // segments, dropping them all.
        this.prevOverlapBoundary = c.overlapBoundarySec || 0;
        const endHere = (c.startSeconds || 0) + (c.durationSeconds || 0);
        if (endHere > this.coveredEndSec) this.coveredEndSec = endHere;
        this.nextExpected += 1;
      }
      this.checkWaiters();
    },

    /**
     * Resolve every waiter whose target is now covered. Once all chunks
     * have been drained, resolve the remaining waiters too: no further
     * coverage can arrive, so they would otherwise wait forever.
     */
    checkWaiters() {
      const complete = this.isComplete();
      const stillWaiting = [];
      for (const w of this.waiters) {
        if (complete || this.coveredEndSec >= w.targetSec) {
          w.resolve();
        } else {
          stillWaiting.push(w);
        }
      }
      this.waiters = stillWaiting;
    },

    /**
     * Wait until `coveredEndSec` reaches `targetSec`, or until every
     * chunk has been drained (whichever comes first).
     *
     * @param {number} targetSec Global timestamp, in seconds.
     * @returns {Promise<void>} Rejects with `failedReason` if the buffer
     *   has been (or later gets) poisoned via `fail()`.
     */
    waitForTime(targetSec) {
      if (this.failed) return Promise.reject(this.failedReason);
      if (this.coveredEndSec >= targetSec || this.isComplete()) {
        return Promise.resolve();
      }
      return new Promise((resolve, reject) => {
        this.waiters.push({ targetSec, resolve, reject });
      });
    },

    /**
     * Snapshot the drained segments whose start lies in
     * [startSec, endSec).
     *
     * @param {number} startSec Inclusive lower bound.
     * @param {number} endSec Exclusive upper bound.
     * @returns {Array<object>} A fresh array; the segment objects inside
     *   are the same references held by the buffer.
     */
    getSegments(startSec, endSec) {
      const out = [];
      for (const s of this.segments) {
        const t = s.start || 0;
        if (t >= startSec && t < endSec) out.push(s);
      }
      return out;
    },

    /**
     * Mark the buffer dead so all current and future waiters reject.
     * Meant to be called when transcribe throws; without it, workers
     * blocked in `waitForTime` would wait for a window that never
     * becomes ready.
     *
     * @param {*} reason An Error is used as-is; anything else is wrapped
     *   in `new Error(String(reason || "transcribe failed"))`.
     */
    fail(reason) {
      this.failed = true;
      this.failedReason = reason instanceof Error
        ? reason
        : new Error(String(reason || "transcribe failed"));
      // Detach the list first so the buffer is consistent even if a
      // waiter entry misbehaves.
      const doomed = this.waiters;
      this.waiters = [];
      for (const w of doomed) {
        try {
          w.reject(this.failedReason);
        } catch {
          // Best-effort: one bad waiter entry must not stop the
          // remaining waiters from being rejected.
        }
      }
    },
  };
}