Add internal-meetings pipeline and post-hoc speaker tools
This commit is contained in:
@@ -0,0 +1,142 @@
// Chunk-buffer state used by the pipelined-analyze path in
// routes/summarize-url.js. The hardware backend fires
// onChunkComplete(chunkData) as each transcribe chunk finishes;
// this buffer:
//   - drains chunks in INDEX ORDER (chunks may arrive out of order
//     when concurrency > 1; we hold them in `pending` until the
//     next-expected index lands so dedup against the prior chunk's
//     overlap boundary is deterministic)
//   - dedupes each new chunk's segments against the prior chunk's
//     overlapBoundarySec — same logic that runs at end-of-transcribe
//     in hardware.js, but applied incrementally so analyze can read
//     a clean, no-duplicates segment view per window
//   - tracks coveredEndSec (the maximum global timestamp the deduped
//     buffer extends to, considering ONLY in-order chunks)
//   - lets the analyze workers await `waitForTime(targetSec)` and
//     query `getSegments(startSec, endSec)` to build per-window
//     analyze inputs as soon as the required chunks are in
//
// Failure modes:
//   - A chunk fails entirely → its segments are empty / undefined.
//     The buffer still advances nextExpected past it so later chunks
//     aren't stuck behind. The window covering that chunk's range
//     gets a shorter transcript and may yield no sections (or fewer
//     than expected). Downstream stitcher tolerates gaps.
//   - waitForTime can wait forever if the relevant chunk index
//     never arrives. Caller is responsible for racing this against
//     the transcribe Promise so a transcribe failure unblocks all
//     pending waiters via reject.

/**
 * Create an in-order, de-duplicating buffer for transcript chunks that may
 * arrive out of index order.
 *
 * Chunks passed to `add()` are staged in `pending` and drained strictly by
 * ascending `chunkIndex`. While draining, segments that start before the
 * previously drained chunk's overlap boundary are dropped, and
 * `coveredEndSec` is advanced. Consumers can `waitForTime()` until enough
 * audio is covered and then read it with `getSegments()`.
 *
 * A chunk object is read for these fields (all optional except the index):
 * `chunkIndex`, `totalChunks`, `segments` (array of objects with a numeric
 * `start`), `startSeconds`, `durationSeconds`, `overlapBoundarySec`.
 *
 * @returns {object} A buffer object exposing its state fields plus the
 *   methods `add`, `isComplete`, `checkWaiters`, `waitForTime`,
 *   `getSegments` and `fail`.
 */
export function createChunkBuffer() {
  return {
    // Sparse staging area for chunks that arrived out of index order.
    pending: new Map(),
    // Drained, deduped segments in drain order. Append-only.
    segments: [],
    // Index of the next chunk we're waiting on to drain.
    nextExpected: 0,
    // Total chunk count, taken from the first chunk that reports an
    // integer `totalChunks`.
    totalChunks: null,
    // Greatest end-time covered by DRAINED chunks. Out-of-order pending
    // chunks don't count until their predecessors land.
    coveredEndSec: 0,
    // Overlap boundary of the most recently drained chunk. Segments of the
    // next chunk that start before it are treated as duplicates and dropped.
    prevOverlapBoundary: 0,
    // Async waiters: { targetSec, resolve, reject }
    waiters: [],
    // Set true by fail() so later waiters reject immediately.
    failed: false,
    failedReason: null,

    /**
     * Whether every chunk announced via `totalChunks` has been drained.
     * Always false until some chunk has reported an integer `totalChunks`.
     *
     * @returns {boolean}
     */
    isComplete() {
      return this.totalChunks != null && this.nextExpected >= this.totalChunks;
    },

    /**
     * Stage a finished chunk and drain every consecutive chunk that is now
     * ready, then settle any waiters that became satisfiable.
     *
     * No-op once the buffer has failed or when `chunkData` is null/undefined.
     * Chunks whose `chunkIndex` is not an integer, or is below
     * `nextExpected` (already drained), are ignored: they could never be
     * drained and would otherwise sit in `pending` forever.
     *
     * @param {object} chunkData - Chunk descriptor (see factory docs).
     * @returns {void}
     */
    add(chunkData) {
      if (this.failed) return;
      if (chunkData == null) return;

      if (this.totalChunks == null && Number.isInteger(chunkData.totalChunks)) {
        this.totalChunks = chunkData.totalChunks;
      }

      const { chunkIndex } = chunkData;
      if (Number.isInteger(chunkIndex) && chunkIndex >= this.nextExpected) {
        this.pending.set(chunkIndex, chunkData);
      }

      // Drain consecutive chunks starting from nextExpected.
      while (this.pending.has(this.nextExpected)) {
        const chunk = this.pending.get(this.nextExpected);
        this.pending.delete(this.nextExpected);

        // A failed chunk may carry no segments; it still advances the cursor
        // so later chunks are not stuck behind it.
        const segs = Array.isArray(chunk.segments) ? chunk.segments : [];
        for (const seg of segs) {
          if ((seg.start || 0) >= this.prevOverlapBoundary) {
            this.segments.push(seg);
          }
        }

        // NOTE (carried over from the original comment): overlapBoundarySec
        // is already a GLOBAL timestamp, not a chunk-relative offset. Do not
        // add startSeconds to it — an earlier version did and the boundary
        // outran every later chunk's segments, dropping them all.
        this.prevOverlapBoundary = chunk.overlapBoundarySec || 0;

        const endHere = (chunk.startSeconds || 0) + (chunk.durationSeconds || 0);
        if (endHere > this.coveredEndSec) this.coveredEndSec = endHere;

        this.nextExpected += 1;
      }

      this.checkWaiters();
    },

    /**
     * Resolve every waiter whose target time is now covered, or all waiters
     * once the buffer is complete (no further chunk can extend coverage, so
     * waiting longer would only hang). Unsatisfied waiters stay queued.
     *
     * @returns {void}
     */
    checkWaiters() {
      const complete = this.isComplete();
      const stillWaiting = [];
      for (const waiter of this.waiters) {
        if (complete || this.coveredEndSec >= waiter.targetSec) {
          waiter.resolve();
        } else {
          stillWaiting.push(waiter);
        }
      }
      this.waiters = stillWaiting;
    },

    /**
     * Wait until `coveredEndSec` reaches `targetSec`, or until all announced
     * chunks have been drained.
     *
     * @param {number} targetSec - Timestamp that must be covered.
     * @returns {Promise<void>} Resolves when satisfied; rejects with
     *   `failedReason` if the buffer has failed or fails while waiting.
     */
    waitForTime(targetSec) {
      if (this.failed) return Promise.reject(this.failedReason);
      if (this.coveredEndSec >= targetSec || this.isComplete()) {
        return Promise.resolve();
      }
      return new Promise((resolve, reject) => {
        this.waiters.push({ targetSec, resolve, reject });
      });
    },

    /**
     * Snapshot the drained segments whose start lies in [startSec, endSec).
     * A missing/falsy `start` is treated as 0.
     *
     * @param {number} startSec - Inclusive lower bound.
     * @param {number} endSec - Exclusive upper bound.
     * @returns {object[]} Fresh array (the segment objects themselves are
     *   shared with the buffer, not copied).
     */
    getSegments(startSec, endSec) {
      return this.segments.filter((seg) => {
        const t = seg.start || 0;
        return t >= startSec && t < endSec;
      });
    },

    /**
     * Mark the buffer dead: reject all current waiters and make every future
     * `waitForTime()` reject and every future `add()` a no-op.
     *
     * @param {*} reason - Error to reject with; non-Error values are wrapped
     *   in an Error (falsy values become "transcribe failed").
     * @returns {void}
     */
    fail(reason) {
      this.failed = true;
      this.failedReason = reason instanceof Error
        ? reason
        : new Error(String(reason || "transcribe failed"));
      for (const waiter of this.waiters) {
        waiter.reject(this.failedReason);
      }
      this.waiters = [];
    },
  };
}
|
||||
Reference in New Issue
Block a user