Add internal-meetings pipeline and post-hoc speaker tools

This commit is contained in:
Keysat
2026-06-13 13:35:53 -05:00
parent 9a2dbf69df
commit 705807e286
15 changed files with 7375 additions and 0 deletions
+142
View File
@@ -0,0 +1,142 @@
// Chunk-buffer state used by the pipelined-analyze path in
// routes/summarize-url.js. The hardware backend fires
// onChunkComplete(chunkData) as each transcribe chunk finishes;
// this buffer:
// - drains chunks in INDEX ORDER (chunks may arrive out of order
// when concurrency > 1; we hold them in `pending` until the
// next-expected index lands so dedup against the prior chunk's
// overlap boundary is deterministic)
// - dedupes each new chunk's segments against the prior chunk's
// overlapBoundarySec — same logic that runs at end-of-transcribe
// in hardware.js, but applied incrementally so analyze can read
// a clean, no-duplicates segment view per window
// - tracks coveredEndSec (the maximum global timestamp the deduped
// buffer extends to, considering ONLY in-order chunks)
// - lets the analyze workers await `waitForTime(targetSec)` and
// query `getSegments(startSec, endSec)` to build per-window
// analyze inputs as soon as the required chunks are in
//
// Failure modes:
// - A chunk fails entirely → its segments are empty / undefined.
// The buffer still advances nextExpected past it so later chunks
// aren't stuck behind. The window covering that chunk's range
// gets a shorter transcript and may yield no sections (or fewer
// than expected). Downstream stitcher tolerates gaps.
// - waitForTime can wait forever if the relevant chunk index
// never arrives. Caller is responsible for racing this against
// the transcribe Promise so a transcribe failure unblocks all
// pending waiters via reject.
/**
 * Create an ordered, de-duplicating buffer for transcribe chunks.
 *
 * Chunks are handed to `add()` in any order; they are drained strictly in
 * index order so each chunk can be de-duplicated against the overlap
 * boundary left by its predecessor. Consumers `await waitForTime(sec)` and
 * then read `getSegments(startSec, endSec)`.
 *
 * Methods rely on `this`: call them on the buffer object
 * (`buffer.add(chunk)`), not as detached function references.
 *
 * @returns {object} Buffer with state fields (`pending`, `segments`,
 *   `nextExpected`, `totalChunks`, `coveredEndSec`, `prevOverlapBoundary`,
 *   `waiters`, `failed`, `failedReason`) and methods (`add`, `isComplete`,
 *   `checkWaiters`, `waitForTime`, `getSegments`, `fail`).
 */
export function createChunkBuffer() {
  return {
    // Sparse staging area for chunks that arrived out of index order.
    pending: new Map(),
    // Drained, deduped segments in drain order. Append-only.
    segments: [],
    // Index of the next chunk we're waiting on to drain.
    nextExpected: 0,
    // Total chunk count, populated by the first add() call that carries
    // an integer `totalChunks`.
    totalChunks: null,
    // Greatest global end-time covered by DRAINED chunks. Out-of-order
    // pending chunks don't count until their predecessors land, so dedup
    // stays consistent.
    coveredEndSec: 0,
    // The previous chunk's overlap boundary in GLOBAL seconds. Segments in
    // the next chunk with start < this are duplicates of segments already
    // in the prior chunk's tail and get dropped.
    prevOverlapBoundary: 0,
    // Async waiters: { targetSec, resolve, reject }
    waiters: [],
    // Set true on terminal failure so future waiters reject immediately
    // instead of hanging.
    failed: false,
    failedReason: null,

    /**
     * Stage one finished chunk and drain every consecutive chunk that is
     * now available, then wake any waiters that became satisfiable.
     *
     * Fields read from `chunkData`: `chunkIndex`, `totalChunks`,
     * `segments`, `overlapBoundarySec`, `startSeconds`, `durationSeconds`.
     * No-op after `fail()` or when `chunkData` is null/undefined.
     *
     * @param {object|null|undefined} chunkData
     */
    add(chunkData) {
      if (this.failed) return;
      if (chunkData == null) return;
      if (this.totalChunks == null && Number.isInteger(chunkData.totalChunks)) {
        this.totalChunks = chunkData.totalChunks;
      }

      // Only stage chunks that can still be drained. An index below
      // nextExpected was already consumed (re-delivery), and a non-integer
      // index can never equal nextExpected; either would otherwise sit in
      // `pending` forever.
      const index = chunkData.chunkIndex;
      if (Number.isInteger(index) && index >= this.nextExpected) {
        this.pending.set(index, chunkData);
      }

      // Drain consecutive chunks starting from nextExpected.
      while (this.pending.has(this.nextExpected)) {
        const chunk = this.pending.get(this.nextExpected);
        this.pending.delete(this.nextExpected);

        // A chunk that failed entirely has empty / undefined segments; it
        // still advances nextExpected so later chunks aren't stuck.
        const segs = Array.isArray(chunk.segments) ? chunk.segments : [];

        // Dedup against the global overlap boundary set by the prior
        // chunk. Same predicate hardware.js uses at end-of-transcribe for
        // the global stitch: `seg.start >= prevOverlapBoundary`.
        for (const seg of segs) {
          // Skip holes so one malformed entry can't throw mid-drain and
          // leave the buffer half-advanced.
          if (seg == null) continue;
          if ((seg.start || 0) >= this.prevOverlapBoundary) {
            this.segments.push(seg);
          }
        }

        // overlapBoundarySec from audio-meta.js is ALREADY a global
        // timestamp (= startSec + overlapSeconds at chunking time), NOT a
        // chunk-relative offset. An earlier `startSeconds +
        // overlapBoundarySec` double-counted, pushing the boundary past
        // every later chunk's segments so they were all dropped. Matches
        // the formula hardware.js uses at end-of-transcribe (with
        // offsetSeconds=0 for summarize-url callers).
        this.prevOverlapBoundary = chunk.overlapBoundarySec || 0;

        const endHere = (chunk.startSeconds || 0) + (chunk.durationSeconds || 0);
        if (endHere > this.coveredEndSec) this.coveredEndSec = endHere;
        this.nextExpected += 1;
      }

      this.checkWaiters();
    },

    /**
     * True once `totalChunks` is known and every chunk index below it has
     * been drained, i.e. no further data can extend `coveredEndSec`.
     * Assumes chunk indices are 0-based, consistent with `nextExpected`
     * starting at 0.
     *
     * @returns {boolean}
     */
    isComplete() {
      return this.totalChunks != null && this.nextExpected >= this.totalChunks;
    },

    /**
     * Resolve every waiter whose target is covered. When the buffer is
     * complete, resolve ALL waiters: a target past the end of the final
     * chunk can never be reached, and leaving it pending would hang the
     * consumer even though transcription succeeded.
     */
    checkWaiters() {
      const done = this.isComplete();
      const stillWaiting = [];
      for (const w of this.waiters) {
        if (done || this.coveredEndSec >= w.targetSec) {
          w.resolve();
        } else {
          stillWaiting.push(w);
        }
      }
      this.waiters = stillWaiting;
    },

    /**
     * Wait until `coveredEndSec` reaches `targetSec`, or until all chunks
     * have drained (whichever comes first).
     *
     * @param {number} targetSec Global timestamp, in seconds.
     * @returns {Promise<void>} Rejects with `failedReason` if the buffer
     *   has been (or later is) poisoned via `fail()`.
     */
    waitForTime(targetSec) {
      if (this.failed) return Promise.reject(this.failedReason);
      if (this.coveredEndSec >= targetSec || this.isComplete()) {
        return Promise.resolve();
      }
      return new Promise((resolve, reject) => {
        this.waiters.push({ targetSec, resolve, reject });
      });
    },

    /**
     * Snapshot the drained segments whose start lies in
     * [startSec, endSec). A missing/falsy `start` is treated as 0.
     *
     * @param {number} startSec Inclusive lower bound (global seconds).
     * @param {number} endSec Exclusive upper bound (global seconds).
     * @returns {Array<object>} Fresh array (the segment objects themselves
     *   are shared, not cloned).
     */
    getSegments(startSec, endSec) {
      const out = [];
      for (const seg of this.segments) {
        const t = seg.start || 0;
        if (t >= startSec && t < endSec) out.push(seg);
      }
      return out;
    },

    /**
     * Mark the buffer dead so all current + future waiters reject. Called
     * when transcribe throws; without this, workers would hang forever
     * waiting for a window that will never become ready.
     *
     * @param {*} reason Used as-is when it is an Error; otherwise wrapped
     *   in `new Error(String(reason || "transcribe failed"))`.
     */
    fail(reason) {
      this.failed = true;
      this.failedReason = reason instanceof Error
        ? reason
        : new Error(String(reason || "transcribe failed"));
      for (const w of this.waiters) {
        // Deliberate best-effort: `waiters` is a public array, so one
        // misbehaving entry must not stop the remaining waiters from
        // being rejected.
        try { w.reject(this.failedReason); } catch {}
      }
      this.waiters = [];
    },
  };
}