From 1b6bb8ab674754c84e4ad8d53093aae539557b20 Mon Sep 17 00:00:00 2001 From: Grant Gilliam Date: Mon, 8 Jun 2026 16:21:45 -0500 Subject: [PATCH] Drop stuck whole-call visual spans at processing time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defense-in-depth + salvage for sessions captured before the adapter fix: drop any vision-source span whose single unbroken duration covers ≥60% of the call. No one speaks that long without a break, so it's a stuck/false active-speaker cue that would dominate backend name attribution. Self (mic_vad) spans are never dropped. Applied to both the live and re-process paths. Test added; 66 pass. --- .../Session/TranscriptPipeline.swift | 23 ++++++++++++++++--- Ten31TranscriptsTests/Phase5Tests.swift | 12 ++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/Ten31Transcripts/Session/TranscriptPipeline.swift b/Ten31Transcripts/Session/TranscriptPipeline.swift index 08472ef..3df6e0e 100644 --- a/Ten31Transcripts/Session/TranscriptPipeline.swift +++ b/Ten31Transcripts/Session/TranscriptPipeline.swift @@ -50,13 +50,20 @@ final class TranscriptPipeline { try? fm.createDirectory(at: chunksDir, withIntermediateDirectories: true) defer { try? fm.removeItem(at: chunksDir) } // cleanup on success OR throw + // Defensive: drop any visual span covering most of the call in one unbroken + // segment — the signature of a stuck/false active-speaker cue (e.g. a solid + // camera-off tile read as "speaking" the whole call). Such a span would + // dominate the backend's name attribution and collapse every voice onto one + // name. Also salvages sessions captured before the adapter fix landed. + let vis = Self.dropStuckSpans(timeline, duration: duration) + // Start from stored voiceprints; accumulate this call's prints across chunks // for within-call unification (the store only persists high-confidence ones). var known = voiceprints.knownVoiceprints() var results: [TranscriptAssembler.ChunkResult] = [] // Mono fallback needs self folded into the timeline; dual sends it separately. - let monoTimeline = dual ? timeline - : timeline + Self.timeline(fromSelfSpans: selfSpans, selfName: selfName) + let monoTimeline = dual ? vis + : vis + Self.timeline(fromSelfSpans: selfSpans, selfName: selfName) for chunk in plan { try Task.checkCancellation() @@ -70,7 +77,7 @@ final class TranscriptPipeline { try SessionPackager.sliceAudio(from: micURL, startSec: chunk.start, endSec: chunk.end, to: micChunk) try SessionPackager.sliceAudio(from: systemURL, startSec: chunk.start, endSec: chunk.end, to: sysChunk) guard fm.fileExists(atPath: micChunk.path), fm.fileExists(atPath: sysChunk.path) else { continue } - let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end) + let timelineData = try SessionPackager.rebasedTimelineData(vis, start: chunk.start, end: chunk.end) let selfVadData = try SessionPackager.rebasedSelfVadData(selfSpans, start: chunk.start, end: chunk.end) response = try await client.labelMergeDual( micURL: micChunk, systemURL: sysChunk, selfName: selfName, selfVad: selfVadData, @@ -113,4 +120,14 @@ final class TranscriptPipeline { static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] { spans.map { .init(start: $0.start, end: $0.end, name: selfName, confidence: $0.confidence, source: "mic_vad") } } + + /// Drop visual (vision-source) spans whose single unbroken duration covers at + /// least `maxFraction` of the whole call — no one legitimately speaks that long + /// without a break, so it's a stuck/false cue. Self spans (mic_vad) are kept. + static func dropStuckSpans(_ timeline: [VisualTimeline.Segment], duration: Double, + maxFraction: Double = 0.6) -> [VisualTimeline.Segment] { + guard duration > 0 else { return timeline } + let limit = maxFraction * duration + return timeline.filter { $0.source != "vision" || ($0.end - $0.start) < limit } + } } diff --git a/Ten31TranscriptsTests/Phase5Tests.swift b/Ten31TranscriptsTests/Phase5Tests.swift index 0fd916c..8acbf02 100644 --- a/Ten31TranscriptsTests/Phase5Tests.swift +++ b/Ten31TranscriptsTests/Phase5Tests.swift @@ -37,6 +37,18 @@ final class Phase5Tests: XCTestCase { XCTAssertEqual(asm.speakersFile.segments[0].start, 152, accuracy: 0.01) } + func testDropStuckSpansRemovesWholeCallCue() { + let segs = [ + VisualTimeline.Segment(start: 0, end: 1900, name: "Grant Gilliam", confidence: 1, source: "vision"), // stuck whole-call tile + VisualTimeline.Segment(start: 100, end: 130, name: "Matt Odell", confidence: 0.9, source: "vision"), // real + VisualTimeline.Segment(start: 0, end: 1900, name: "Grant", confidence: 1, source: "mic_vad"), // self span: keep + ] + let out = TranscriptPipeline.dropStuckSpans(segs, duration: 1976) + XCTAssertFalse(out.contains { $0.name == "Grant Gilliam" }) // 96% of call in one span → dropped + XCTAssertTrue(out.contains { $0.name == "Matt Odell" }) // short real span kept + XCTAssertTrue(out.contains { $0.source == "mic_vad" }) // self never dropped + } + func testRebaseClipsAndRebases() throws { let segs = [ VisualTimeline.Segment(start: 140, end: 160, name: "A", confidence: 0.9, source: "vision"),