diff --git a/Ten31Transcripts/Session/SessionPackager.swift b/Ten31Transcripts/Session/SessionPackager.swift index cc8bb6a..b36c62f 100644 --- a/Ten31Transcripts/Session/SessionPackager.swift +++ b/Ten31Transcripts/Session/SessionPackager.swift @@ -10,24 +10,35 @@ import AVFoundation enum SessionPackager { struct PlannedChunk: Equatable { let index: Int - let start: Double // global seconds - let end: Double + let start: Double // sliced window start (global seconds, incl. overlap margin) + let end: Double // sliced window end (incl. overlap margin) + let bodyStart: Double // the region this chunk OWNS (no overlap) — for stitch dedup + let bodyEnd: Double } - /// One chunk if short; otherwise even ~`chunkSeconds` windows. + /// One chunk if short; otherwise ~`chunkSeconds` bodies, each sliced with an + /// `overlapSeconds` margin on both sides. The margin gives the backend context at + /// boundaries (so a sentence isn't cut and the diarizer attributes edge speech + /// correctly and keeps a voice consistent across chunks); the stitcher keeps only + /// each chunk's owned `body` region, deduping the overlap. static func planChunks(durationSec: Double, chunkSeconds: Double = 150, + overlapSeconds: Double = 15, thresholdSec: Double = 180) -> [PlannedChunk] { guard durationSec > thresholdSec else { - return [PlannedChunk(index: 0, start: 0, end: durationSec)] + return [PlannedChunk(index: 0, start: 0, end: durationSec, bodyStart: 0, bodyEnd: durationSec)] } var chunks: [PlannedChunk] = [] - var start = 0.0 + var bodyStart = 0.0 var index = 0 - while start < durationSec - 0.001 { - let end = min(start + chunkSeconds, durationSec) - chunks.append(PlannedChunk(index: index, start: start, end: end)) - start = end + while bodyStart < durationSec - 0.001 { + let bodyEnd = min(bodyStart + chunkSeconds, durationSec) + chunks.append(PlannedChunk( + index: index, + start: max(0, bodyStart - overlapSeconds), + end: min(durationSec, bodyEnd + overlapSeconds), + bodyStart: bodyStart, bodyEnd: bodyEnd)) + bodyStart = bodyEnd index += 1 } return chunks diff --git a/Ten31Transcripts/Session/TranscriptAssembler.swift b/Ten31Transcripts/Session/TranscriptAssembler.swift index 1373b21..9d5cfaa 100644 --- a/Ten31Transcripts/Session/TranscriptAssembler.swift +++ b/Ten31Transcripts/Session/TranscriptAssembler.swift @@ -5,8 +5,13 @@ import Foundation /// name, and fingerprints collected for the voiceprint store. enum TranscriptAssembler { struct ChunkResult { - let chunkStart: Double // global seconds + let chunkStart: Double // global seconds (the sliced window start) let response: LabelMergeResponse + // The region this chunk OWNS; segments whose midpoint falls outside it are the + // neighbour's (overlap margin) and are dropped here. Defaults keep everything + // (no-overlap behaviour). + var bodyStart: Double = -.greatestFiniteMagnitude + var bodyEnd: Double = .greatestFiniteMagnitude } struct Assembled { @@ -40,13 +45,16 @@ enum TranscriptAssembler { for chunk in chunks { let offset = chunk.chunkStart - // Audio length from the chunk window, so silent/all-unknown calls still - // report a real duration (not just the last segment's end). - duration = max(duration, offset + chunk.response.duration) + // Body end bounds the real session length even on silent/all-unknown calls. + duration = max(duration, min(chunk.bodyEnd, offset + chunk.response.duration)) for seg in chunk.response.segments { let start = seg.startSeconds + offset let end = seg.endSeconds + offset + // Overlap dedup: keep a segment only in the chunk that OWNS its midpoint; + // the other chunk saw it only in its margin (for context) and drops it. + let mid = (start + end) / 2 + guard mid >= chunk.bodyStart, mid < chunk.bodyEnd else { continue } segments.append(.init(start: start, end: end, speaker: seg.speaker, text: seg.text)) duration = max(duration, end) } diff --git a/Ten31Transcripts/Session/TranscriptPipeline.swift b/Ten31Transcripts/Session/TranscriptPipeline.swift index 83a8361..08472ef 100644 --- a/Ten31Transcripts/Session/TranscriptPipeline.swift +++ b/Ten31Transcripts/Session/TranscriptPipeline.swift @@ -91,7 +91,8 @@ final class TranscriptPipeline { known[name] = fp } voiceprints.update(with: response) - results.append(.init(chunkStart: chunk.start, response: response)) + results.append(.init(chunkStart: chunk.start, response: response, + bodyStart: chunk.bodyStart, bodyEnd: chunk.bodyEnd)) } await progress?(plan.count, plan.count) diff --git a/Ten31TranscriptsTests/Phase5Tests.swift b/Ten31TranscriptsTests/Phase5Tests.swift index 39012c6..0fd916c 100644 --- a/Ten31TranscriptsTests/Phase5Tests.swift +++ b/Ten31TranscriptsTests/Phase5Tests.swift @@ -8,11 +8,33 @@ final class Phase5Tests: XCTestCase { XCTAssertEqual(c[0].end, 70, accuracy: 0.001) } - func testPlanChunksLong() { - let c = SessionPackager.planChunks(durationSec: 400, chunkSeconds: 150) + func testPlanChunksLongOverlapsWindowsWithContiguousBodies() { + let c = SessionPackager.planChunks(durationSec: 400, chunkSeconds: 150, overlapSeconds: 15) XCTAssertEqual(c.count, 3) - XCTAssertEqual(c[0].start, 0); XCTAssertEqual(c[0].end, 150) - XCTAssertEqual(c[1].start, 150); XCTAssertEqual(c[2].end, 400) + // Owned bodies tile the call with no gaps/overlap. + XCTAssertEqual(c[0].bodyStart, 0); XCTAssertEqual(c[0].bodyEnd, 150) + XCTAssertEqual(c[1].bodyStart, 150); XCTAssertEqual(c[1].bodyEnd, 300) + XCTAssertEqual(c[2].bodyEnd, 400) + // Sliced windows overlap by the margin (and clamp at the ends). + XCTAssertEqual(c[0].start, 0); XCTAssertEqual(c[0].end, 165) // +15 trailing + XCTAssertEqual(c[1].start, 135) // -15 leading + XCTAssertLessThan(c[1].start, c[0].end) // windows overlap + XCTAssertEqual(c[2].end, 400) // clamped + } + + func testAssembleDedupsOverlapByBody() { + // A segment at global 152–156 sits in chunk1's body but also in chunk0's + // trailing margin (overlap). It must be kept exactly once (by chunk1). + let r0 = #"{"duration":165,"speakers":[{"cluster":"S0","name":"A","source":"visual","overlap_confidence":0.9}],"segments":[{"start_ms":152000,"end_ms":156000,"speaker":"A","text":"boundary"}],"fingerprints":{},"models":{}}"# + let r1 = #"{"duration":180,"speakers":[{"cluster":"S0","name":"A","source":"visual","overlap_confidence":0.9}],"segments":[{"start_ms":17000,"end_ms":21000,"speaker":"A","text":"boundary"}],"fingerprints":{},"models":{}}"# + let c0 = try! JSONDecoder().decode(LabelMergeResponse.self, from: Data(r0.utf8)) + let c1 = try! JSONDecoder().decode(LabelMergeResponse.self, from: Data(r1.utf8)) + let asm = TranscriptAssembler.assemble(sessionId: "s", app: "meet", chunks: [ + .init(chunkStart: 0, response: c0, bodyStart: 0, bodyEnd: 150), + .init(chunkStart: 135, response: c1, bodyStart: 150, bodyEnd: 300), + ]) + XCTAssertEqual(asm.speakersFile.segments.count, 1) // deduped + XCTAssertEqual(asm.speakersFile.segments[0].start, 152, accuracy: 0.01) } func testRebaseClipsAndRebases() throws {