Chunk overlap + overlap-aware stitching

Chunks used to be contiguous (each start = the previous end) and were stitched by naïve offset concatenation, with no overlap. That cut sentences at chunk boundaries, denied the diarizer context at the edges, and let one voice be split across chunks (the MH/Unknown_0 problem). Now each ~150s body is sliced with a 15s margin on both sides ([bodyStart-15, bodyEnd+15], clamped to the recording); the stitcher keeps a segment only in the chunk whose body region contains the segment's MIDPOINT and drops it from the neighbour's margin — so boundary-spanning speech is seen whole by the backend and kept exactly once.
- SessionPackager.PlannedChunk gains bodyStart/bodyEnd; planChunks adds an overlapSeconds parameter (default 15).
- TranscriptAssembler.ChunkResult carries the body bounds (the defaults keep every segment, so the no-overlap behaviour is preserved for existing callers); assemble dedups by midpoint-in-body.
- TranscriptPipeline passes the body bounds through.
This complements (doesn't replace) the fragment-smoothing and reconciliation safety nets; it is the upstream fix. Cost: ~+20% backend audio per interior chunk. 63/63 XCTests pass (new: overlap window layout + boundary-segment dedup).
2026-06-08 13:03:56 -05:00
parent 1c133c8970
commit ab910cf742
4 changed files with 60 additions and 18 deletions
@@ -10,24 +10,35 @@ import AVFoundation
 enum SessionPackager {
    struct PlannedChunk: Equatable {
        let index: Int
-        let start: Double      // global seconds
-        let end: Double
+        let start: Double      // sliced window start (global seconds, incl. overlap margin)
+        let end: Double        // sliced window end (incl. overlap margin)
+        let bodyStart: Double  // the region this chunk OWNS (no overlap) — for stitch dedup
+        let bodyEnd: Double
    }

-    /// One chunk if short; otherwise even ~`chunkSeconds` windows.
+    /// One chunk if short; otherwise ~`chunkSeconds` bodies, each sliced with an
+    /// `overlapSeconds` margin on both sides. The margin gives the backend context at
+    /// boundaries (so a sentence isn't cut and the diarizer attributes edge speech
+    /// correctly and keeps a voice consistent across chunks); the stitcher keeps only
+    /// each chunk's owned `body` region, deduping the overlap.
    static func planChunks(durationSec: Double,
                           chunkSeconds: Double = 150,
+                           overlapSeconds: Double = 15,
                           thresholdSec: Double = 180) -> [PlannedChunk] {
        guard durationSec > thresholdSec else {
-            return [PlannedChunk(index: 0, start: 0, end: durationSec)]
+            return [PlannedChunk(index: 0, start: 0, end: durationSec, bodyStart: 0, bodyEnd: durationSec)]
        }
        var chunks: [PlannedChunk] = []
-        var start = 0.0
+        var bodyStart = 0.0
        var index = 0
-        while start < durationSec - 0.001 {
-            let end = min(start + chunkSeconds, durationSec)
-            chunks.append(PlannedChunk(index: index, start: start, end: end))
-            start = end
+        while bodyStart < durationSec - 0.001 {
+            let bodyEnd = min(bodyStart + chunkSeconds, durationSec)
+            chunks.append(PlannedChunk(
+                index: index,
+                start: max(0, bodyStart - overlapSeconds),
+                end: min(durationSec, bodyEnd + overlapSeconds),
+                bodyStart: bodyStart, bodyEnd: bodyEnd))
+            bodyStart = bodyEnd
            index += 1
        }
        return chunks
@@ -5,8 +5,13 @@ import Foundation
 /// name, and fingerprints collected for the voiceprint store.
 enum TranscriptAssembler {
    struct ChunkResult {
-        let chunkStart: Double           // global seconds
+        let chunkStart: Double           // global seconds (the sliced window start)
        let response: LabelMergeResponse
+        // The region this chunk OWNS; segments whose midpoint falls outside it are the
+        // neighbour's (overlap margin) and are dropped here. Defaults keep everything
+        // (no-overlap behaviour).
+        var bodyStart: Double = -.greatestFiniteMagnitude
+        var bodyEnd: Double = .greatestFiniteMagnitude
    }

    struct Assembled {
@@ -40,13 +45,16 @@ enum TranscriptAssembler {

        for chunk in chunks {
            let offset = chunk.chunkStart
-            // Audio length from the chunk window, so silent/all-unknown calls still
-            // report a real duration (not just the last segment's end).
-            duration = max(duration, offset + chunk.response.duration)
+            // Body end bounds the real session length even on silent/all-unknown calls.
+            duration = max(duration, min(chunk.bodyEnd, offset + chunk.response.duration))

            for seg in chunk.response.segments {
                let start = seg.startSeconds + offset
                let end = seg.endSeconds + offset
+                // Overlap dedup: keep a segment only in the chunk that OWNS its midpoint;
+                // the other chunk saw it only in its margin (for context) and drops it.
+                let mid = (start + end) / 2
+                guard mid >= chunk.bodyStart, mid < chunk.bodyEnd else { continue }
                segments.append(.init(start: start, end: end, speaker: seg.speaker, text: seg.text))
                duration = max(duration, end)
            }
@@ -91,7 +91,8 @@ final class TranscriptPipeline {
                known[name] = fp
            }
            voiceprints.update(with: response)
-            results.append(.init(chunkStart: chunk.start, response: response))
+            results.append(.init(chunkStart: chunk.start, response: response,
+                                 bodyStart: chunk.bodyStart, bodyEnd: chunk.bodyEnd))
        }
        await progress?(plan.count, plan.count)

@@ -8,11 +8,33 @@ final class Phase5Tests: XCTestCase {
        XCTAssertEqual(c[0].end, 70, accuracy: 0.001)
    }

-    func testPlanChunksLong() {
-        let c = SessionPackager.planChunks(durationSec: 400, chunkSeconds: 150)
+    func testPlanChunksLongOverlapsWindowsWithContiguousBodies() {
+        let c = SessionPackager.planChunks(durationSec: 400, chunkSeconds: 150, overlapSeconds: 15)
        XCTAssertEqual(c.count, 3)
-        XCTAssertEqual(c[0].start, 0); XCTAssertEqual(c[0].end, 150)
-        XCTAssertEqual(c[1].start, 150); XCTAssertEqual(c[2].end, 400)
+        // Owned bodies tile the call with no gaps/overlap.
+        XCTAssertEqual(c[0].bodyStart, 0);   XCTAssertEqual(c[0].bodyEnd, 150)
+        XCTAssertEqual(c[1].bodyStart, 150); XCTAssertEqual(c[1].bodyEnd, 300)
+        XCTAssertEqual(c[2].bodyEnd, 400)
+        // Sliced windows overlap by the margin (and clamp at the ends).
+        XCTAssertEqual(c[0].start, 0); XCTAssertEqual(c[0].end, 165)   // +15 trailing
+        XCTAssertEqual(c[1].start, 135)                                // -15 leading
+        XCTAssertLessThan(c[1].start, c[0].end)                        // windows overlap
+        XCTAssertEqual(c[2].end, 400)                                  // clamped
+    }
+
+    func testAssembleDedupsOverlapByBody() {
+        // A segment at global 152–156 sits in chunk1's body but also in chunk0's
+        // trailing margin (overlap). It must be kept exactly once (by chunk1).
+        let r0 = #"{"duration":165,"speakers":[{"cluster":"S0","name":"A","source":"visual","overlap_confidence":0.9}],"segments":[{"start_ms":152000,"end_ms":156000,"speaker":"A","text":"boundary"}],"fingerprints":{},"models":{}}"#
+        let r1 = #"{"duration":180,"speakers":[{"cluster":"S0","name":"A","source":"visual","overlap_confidence":0.9}],"segments":[{"start_ms":17000,"end_ms":21000,"speaker":"A","text":"boundary"}],"fingerprints":{},"models":{}}"#
+        let c0 = try! JSONDecoder().decode(LabelMergeResponse.self, from: Data(r0.utf8))
+        let c1 = try! JSONDecoder().decode(LabelMergeResponse.self, from: Data(r1.utf8))
+        let asm = TranscriptAssembler.assemble(sessionId: "s", app: "meet", chunks: [
+            .init(chunkStart: 0, response: c0, bodyStart: 0, bodyEnd: 150),
+            .init(chunkStart: 135, response: c1, bodyStart: 150, bodyEnd: 300),
+        ])
+        XCTAssertEqual(asm.speakersFile.segments.count, 1)                       // deduped
+        XCTAssertEqual(asm.speakersFile.segments[0].start, 152, accuracy: 0.01)
    }

    func testRebaseClipsAndRebases() throws {