Drop stuck whole-call visual spans at processing time
Defense in depth, plus salvage for sessions captured before the adapter fix: drop any vision-source span whose single unbroken duration covers ≥60% of the call. No one speaks that long without a break, so such a span is a stuck/false active-speaker cue that would dominate backend name attribution. Self (mic_vad) spans are never dropped. Applied to both the live and re-process paths. Test added; 66 tests pass.
This commit is contained in:
@@ -50,13 +50,20 @@ final class TranscriptPipeline {
|
|||||||
try? fm.createDirectory(at: chunksDir, withIntermediateDirectories: true)
|
try? fm.createDirectory(at: chunksDir, withIntermediateDirectories: true)
|
||||||
defer { try? fm.removeItem(at: chunksDir) } // cleanup on success OR throw
|
defer { try? fm.removeItem(at: chunksDir) } // cleanup on success OR throw
|
||||||
|
|
||||||
|
// Defensive: drop any visual span covering most of the call in one unbroken
|
||||||
|
// segment — the signature of a stuck/false active-speaker cue (e.g. a solid
|
||||||
|
// camera-off tile read as "speaking" the whole call). Such a span would
|
||||||
|
// dominate the backend's name attribution and collapse every voice onto one
|
||||||
|
// name. Also salvages sessions captured before the adapter fix landed.
|
||||||
|
let vis = Self.dropStuckSpans(timeline, duration: duration)
|
||||||
|
|
||||||
// Start from stored voiceprints; accumulate this call's prints across chunks
|
// Start from stored voiceprints; accumulate this call's prints across chunks
|
||||||
// for within-call unification (the store only persists high-confidence ones).
|
// for within-call unification (the store only persists high-confidence ones).
|
||||||
var known = voiceprints.knownVoiceprints()
|
var known = voiceprints.knownVoiceprints()
|
||||||
var results: [TranscriptAssembler.ChunkResult] = []
|
var results: [TranscriptAssembler.ChunkResult] = []
|
||||||
// Mono fallback needs self folded into the timeline; dual sends it separately.
|
// Mono fallback needs self folded into the timeline; dual sends it separately.
|
||||||
let monoTimeline = dual ? timeline
|
let monoTimeline = dual ? vis
|
||||||
: timeline + Self.timeline(fromSelfSpans: selfSpans, selfName: selfName)
|
: vis + Self.timeline(fromSelfSpans: selfSpans, selfName: selfName)
|
||||||
|
|
||||||
for chunk in plan {
|
for chunk in plan {
|
||||||
try Task.checkCancellation()
|
try Task.checkCancellation()
|
||||||
@@ -70,7 +77,7 @@ final class TranscriptPipeline {
|
|||||||
try SessionPackager.sliceAudio(from: micURL, startSec: chunk.start, endSec: chunk.end, to: micChunk)
|
try SessionPackager.sliceAudio(from: micURL, startSec: chunk.start, endSec: chunk.end, to: micChunk)
|
||||||
try SessionPackager.sliceAudio(from: systemURL, startSec: chunk.start, endSec: chunk.end, to: sysChunk)
|
try SessionPackager.sliceAudio(from: systemURL, startSec: chunk.start, endSec: chunk.end, to: sysChunk)
|
||||||
guard fm.fileExists(atPath: micChunk.path), fm.fileExists(atPath: sysChunk.path) else { continue }
|
guard fm.fileExists(atPath: micChunk.path), fm.fileExists(atPath: sysChunk.path) else { continue }
|
||||||
let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
|
let timelineData = try SessionPackager.rebasedTimelineData(vis, start: chunk.start, end: chunk.end)
|
||||||
let selfVadData = try SessionPackager.rebasedSelfVadData(selfSpans, start: chunk.start, end: chunk.end)
|
let selfVadData = try SessionPackager.rebasedSelfVadData(selfSpans, start: chunk.start, end: chunk.end)
|
||||||
response = try await client.labelMergeDual(
|
response = try await client.labelMergeDual(
|
||||||
micURL: micChunk, systemURL: sysChunk, selfName: selfName, selfVad: selfVadData,
|
micURL: micChunk, systemURL: sysChunk, selfName: selfName, selfVad: selfVadData,
|
||||||
@@ -113,4 +120,14 @@ final class TranscriptPipeline {
|
|||||||
/// Builds timeline segments from self VAD spans.
///
/// Each span becomes one segment that keeps the span's `start`, `end` and
/// `confidence`, is named `selfName`, and is tagged with the `"mic_vad"` source.
/// - Parameters:
///   - spans: The VAD spans to convert.
///   - selfName: The name assigned to every produced segment.
/// - Returns: One segment per input span, in the same order.
static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
    return spans.map { span in
        VisualTimeline.Segment(
            start: span.start,
            end: span.end,
            name: selfName,
            confidence: span.confidence,
            source: "mic_vad"
        )
    }
}
|
||||||
|
|
||||||
|
/// Removes "stuck" visual cues from a timeline.
///
/// A segment whose source is `"vision"` is discarded when its own length
/// (`end - start`) reaches or exceeds `maxFraction * duration`; a single unbroken
/// span that long is treated as a false active-speaker cue rather than real speech.
/// Segments from any other source (e.g. `"mic_vad"` self spans) are always kept.
/// - Parameters:
///   - timeline: The segments to filter.
///   - duration: Total call length, in the same unit as the segment bounds.
///     When it is not greater than zero the timeline is returned unchanged.
///   - maxFraction: Fraction of `duration` at or above which a vision span is dropped.
/// - Returns: The surviving segments, in their original order.
static func dropStuckSpans(_ timeline: [VisualTimeline.Segment], duration: Double,
                           maxFraction: Double = 0.6) -> [VisualTimeline.Segment] {
    // Without a positive call length there is no meaningful threshold to apply.
    if !(duration > 0) { return timeline }

    let threshold = maxFraction * duration
    return timeline.filter { segment in
        // Only vision-sourced cues are candidates for removal.
        guard segment.source == "vision" else { return true }
        return (segment.end - segment.start) < threshold
    }
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -37,6 +37,18 @@ final class Phase5Tests: XCTestCase {
|
|||||||
XCTAssertEqual(asm.speakersFile.segments[0].start, 152, accuracy: 0.01)
|
XCTAssertEqual(asm.speakersFile.segments[0].start, 152, accuracy: 0.01)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A vision span covering almost the entire call is removed, while a short vision
/// span and a whole-call `mic_vad` span both survive.
func testDropStuckSpansRemovesWholeCallCue() {
    // Stuck whole-call tile: 1900 of 1976 seconds (~96%) in one unbroken vision span.
    let stuckTile = VisualTimeline.Segment(start: 0, end: 1900, name: "Grant Gilliam", confidence: 1, source: "vision")
    // Genuine short active-speaker cue.
    let realSpeaker = VisualTimeline.Segment(start: 100, end: 130, name: "Matt Odell", confidence: 0.9, source: "vision")
    // Self span with the same extent as the stuck tile; must be kept regardless of length.
    let selfSpan = VisualTimeline.Segment(start: 0, end: 1900, name: "Grant", confidence: 1, source: "mic_vad")

    let kept = TranscriptPipeline.dropStuckSpans([stuckTile, realSpeaker, selfSpan], duration: 1976)

    XCTAssertFalse(kept.contains(where: { $0.name == "Grant Gilliam" })) // stuck span dropped
    XCTAssertTrue(kept.contains(where: { $0.name == "Matt Odell" }))     // short real span kept
    XCTAssertTrue(kept.contains(where: { $0.source == "mic_vad" }))      // self never dropped
}
|
||||||
|
|
||||||
func testRebaseClipsAndRebases() throws {
|
func testRebaseClipsAndRebases() throws {
|
||||||
let segs = [
|
let segs = [
|
||||||
VisualTimeline.Segment(start: 140, end: 160, name: "A", confidence: 0.9, source: "vision"),
|
VisualTimeline.Segment(start: 140, end: 160, name: "A", confidence: 0.9, source: "vision"),
|
||||||
|
|||||||
Reference in New Issue
Block a user