// Chunk-buffer state used by the pipelined-analyze path in
// routes/summarize-url.js. The hardware backend fires
// onChunkComplete(chunkData) as each transcribe chunk finishes;
// this buffer:
//   - drains chunks in INDEX ORDER (chunks may arrive out of order
//     when concurrency > 1; we hold them in `pending` until the
//     next-expected index lands so dedup against the prior chunk's
//     overlap boundary is deterministic)
//   - dedupes each new chunk's segments against the prior chunk's
//     overlapBoundarySec — same logic that runs at end-of-transcribe
//     in hardware.js, but applied incrementally so analyze can read
//     a clean, no-duplicates segment view per window
//   - tracks coveredEndSec (the maximum global timestamp the deduped
//     buffer extends to, considering ONLY in-order chunks)
//   - lets the analyze workers await `waitForTime(targetSec)` and
//     query `getSegments(startSec, endSec)` to build per-window
//     analyze inputs as soon as the required chunks are in
//
// Failure modes:
//   - A chunk fails entirely → its segments are empty / undefined.
//     The buffer still advances nextExpected past it so later chunks
//     aren't stuck behind. The window covering that chunk's range
//     gets a shorter transcript and may yield no sections (or fewer
//     than expected). Downstream stitcher tolerates gaps.
//   - waitForTime can wait forever if the relevant chunk index
//     never arrives. Caller is responsible for racing this against
//     the transcribe Promise so a transcribe failure unblocks all
//     pending waiters via reject.

/**
 * Create an incremental, index-ordered buffer of transcribe chunks.
 *
 * Chunks are handed to `add()` in any order; they are staged in `pending`
 * and drained strictly by ascending `chunkIndex`. While draining, each
 * chunk's segments are deduplicated against the previous chunk's overlap
 * boundary and appended to `segments`. Consumers await `waitForTime()` and
 * then read a window with `getSegments()`.
 *
 * All methods rely on `this`, so call them on the returned object (or bind
 * them) rather than passing them around as bare function references.
 *
 * @returns {object} Buffer state plus the methods `add`, `checkWaiters`,
 *   `waitForTime`, `getSegments` and `fail`.
 */
export function createChunkBuffer() {
  // True once every chunk announced via `totalChunks` has been drained.
  // After that point coverage can never grow again, so anything still
  // waiting must be released instead of hanging.
  const allChunksDrained = (buf) =>
    buf.totalChunks != null && buf.nextExpected >= buf.totalChunks;

  return {
    // Sparse staging area for chunks that arrived out of index order.
    pending: new Map(),
    // Drained, deduped segments in drain order. Append-only.
    segments: [],
    // Index of the next chunk we're waiting on to drain.
    nextExpected: 0,
    // Total chunk count, populated from the first chunk that reports an
    // integer `totalChunks`.
    totalChunks: null,
    // Greatest global end-time covered by drained chunks. NOT just
    // max(pending) — out-of-order pending chunks don't count until
    // their predecessors land, so dedup is consistent.
    coveredEndSec: 0,
    // The previous chunk's overlap boundary in GLOBAL seconds.
    // Segments in the next chunk with start < this are duplicates of
    // segments already in the prior chunk's tail and get dropped.
    prevOverlapBoundary: 0,
    // Async waiters: { targetSec, resolve, reject }
    waiters: [],
    // Set true on terminal failure so future waiters reject immediately
    // instead of hanging.
    failed: false,
    failedReason: null,

    /**
     * Stage one finished chunk and drain every chunk that is now
     * consecutive with `nextExpected`.
     *
     * No-op once the buffer has failed or when `chunkData` is nullish.
     * Chunks whose `chunkIndex` is not an integer, or is below
     * `nextExpected` (already drained / re-delivered), are ignored: they
     * could never be drained and would otherwise sit in `pending` forever.
     *
     * @param {object} chunkData Reads `chunkIndex`, `totalChunks`,
     *   `segments`, `overlapBoundarySec`, `startSeconds`, `durationSeconds`.
     */
    add(chunkData) {
      if (this.failed) return;
      if (chunkData == null) return;
      if (this.totalChunks == null && Number.isInteger(chunkData.totalChunks)) {
        this.totalChunks = chunkData.totalChunks;
      }

      const { chunkIndex } = chunkData;
      if (Number.isInteger(chunkIndex) && chunkIndex >= this.nextExpected) {
        this.pending.set(chunkIndex, chunkData);
      }

      // Drain consecutive chunks starting from nextExpected.
      while (this.pending.has(this.nextExpected)) {
        const c = this.pending.get(this.nextExpected);
        this.pending.delete(this.nextExpected);

        // A chunk that failed entirely has no segment array; it still
        // advances nextExpected so later chunks aren't stuck behind it.
        const segs = Array.isArray(c.segments) ? c.segments : [];

        // Dedup against the global overlap boundary set by the prior
        // chunk: keep only segments with `start >= prevOverlapBoundary`.
        for (const s of segs) {
          if ((s.start || 0) >= this.prevOverlapBoundary) {
            this.segments.push(s);
          }
        }

        // overlapBoundarySec is used as-is, as a GLOBAL timestamp. Adding
        // c.startSeconds to it double-counts the offset, pushing the
        // boundary past every later chunk's segments so they all get
        // dropped.
        this.prevOverlapBoundary = c.overlapBoundarySec || 0;

        const endHere = (c.startSeconds || 0) + (c.durationSeconds || 0);
        if (endHere > this.coveredEndSec) this.coveredEndSec = endHere;
        this.nextExpected += 1;
      }

      this.checkWaiters();
    },

    /**
     * Resolve every waiter whose target time is now covered, and keep the
     * rest queued. Once all announced chunks have drained, every remaining
     * waiter is resolved too, because coverage cannot grow any further.
     */
    checkWaiters() {
      const done = allChunksDrained(this);
      const stillWaiting = [];
      for (const w of this.waiters) {
        if (done || this.coveredEndSec >= w.targetSec) {
          w.resolve();
        } else {
          stillWaiting.push(w);
        }
      }
      this.waiters = stillWaiting;
    },

    /**
     * Wait until `coveredEndSec` reaches `targetSec`.
     *
     * @param {number} targetSec Global timestamp (seconds) to wait for.
     * @returns {Promise<void>} Resolves once the target is covered, or
     *   once all announced chunks have drained (nothing more will arrive).
     *   Rejects with `failedReason` if the buffer has failed or fails
     *   while waiting.
     */
    waitForTime(targetSec) {
      if (this.failed) return Promise.reject(this.failedReason);
      if (this.coveredEndSec >= targetSec || allChunksDrained(this)) {
        return Promise.resolve();
      }
      return new Promise((resolve, reject) => {
        this.waiters.push({ targetSec, resolve, reject });
      });
    },

    /**
     * Snapshot the drained segments whose start time falls in
     * [startSec, endSec). A missing or zero `start` is treated as 0.
     *
     * @param {number} startSec Inclusive lower bound, global seconds.
     * @param {number} endSec Exclusive upper bound, global seconds.
     * @returns {Array<object>} Fresh array; the segment objects themselves
     *   are shared with the buffer.
     */
    getSegments(startSec, endSec) {
      return this.segments.filter((s) => {
        const t = s.start || 0;
        return t >= startSec && t < endSec;
      });
    },

    /**
     * Mark the buffer dead: reject all current waiters and make every
     * future `waitForTime` reject immediately. Later `add` calls are
     * ignored.
     *
     * @param {unknown} reason Used as-is when it is an Error; otherwise
     *   wrapped in an Error (default message "transcribe failed").
     */
    fail(reason) {
      this.failed = true;
      this.failedReason = reason instanceof Error
        ? reason
        : new Error(String(reason || "transcribe failed"));
      for (const w of this.waiters) {
        w.reject(this.failedReason);
      }
      this.waiters = [];
    },
  };
}