Files
Grant Gilliam a3e3406b28 Make diarization chunk length configurable (Auto + presets)
Chunk size was hardcoded at 2.5-min bodies. Add a Settings control:
Auto / Standard 2.5min / Large group 60s / Fine 90s. Shorter chunks keep fewer
simultaneous speakers per window (Sortformer resolves ~4/chunk), useful for large
calls, at some cost to speed and cross-chunk voice matching.

- ChunkMode (new, pure/testable): mode → body seconds; Auto picks 60s when >4
  participants were detected, else 150s; overlap + single-chunk threshold scale
  with the body length.
- AppSettings.chunkMode (+ typed `chunk`); SettingsView picker with explanation.
- TranscriptPipeline.process gains chunkSeconds; derives overlap/threshold from it.
- SessionController resolves the body from the setting + the session's detected
  participant count (visual_timeline participants) for both send + re-process.
- Participant roster now counts EVERY tile OCR'd, not just who spoke
  (TimelineBuilder.observedNames → VisualObserver → VisualCapture), so the Auto
  call-size signal is meaningful even though speaking-detection is sparse.

Tests: ChunkMode resolution, overlap scaling, short-body re-chunking. 69 pass.
2026-06-09 10:15:16 -05:00

134 lines
8.7 KiB
Swift
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import XCTest
@testable import Ten31Transcripts
final class Phase5Tests: XCTestCase {
/// A 70 s call planned with the default parameters yields exactly one chunk
/// whose window ends at the call duration.
func testPlanChunksShort() throws {
    let chunks = SessionPackager.planChunks(durationSec: 70)
    XCTAssertEqual(chunks.count, 1)
    // `first` + XCTUnwrap turns an unexpectedly empty plan into a reported test
    // failure; subscripting `chunks[0]` would trap the test process instead,
    // because a failed XCTAssertEqual does not stop the test by default.
    let only = try XCTUnwrap(chunks.first)
    XCTAssertEqual(only.end, 70, accuracy: 0.001)
}
/// A 400 s call planned with 150 s bodies and a 15 s overlap yields three chunks:
/// the owned bodies sit back-to-back, while the sliced windows extend past each
/// body by the overlap margin and are clamped at the ends of the call.
func testPlanChunksLongOverlapsWindowsWithContiguousBodies() {
    let plan = SessionPackager.planChunks(durationSec: 400, chunkSeconds: 150, overlapSeconds: 15)
    XCTAssertEqual(plan.count, 3)

    // Bodies: tile the call with no gaps and no overlap.
    XCTAssertEqual(plan[0].bodyStart, 0)
    XCTAssertEqual(plan[0].bodyEnd, 150)
    XCTAssertEqual(plan[1].bodyStart, 150)
    XCTAssertEqual(plan[1].bodyEnd, 300)
    XCTAssertEqual(plan[2].bodyEnd, 400)

    // Windows: each body widened by the 15 s margin, clamped to [0, 400].
    XCTAssertEqual(plan[0].start, 0)
    XCTAssertEqual(plan[0].end, 165)               // body end + 15 s trailing margin
    XCTAssertEqual(plan[1].start, 135)             // body start - 15 s leading margin
    XCTAssertLessThan(plan[1].start, plan[0].end)  // adjacent windows overlap
    XCTAssertEqual(plan[2].end, 400)               // clamped to the call duration
}
/// A segment reported by two overlapping chunks is kept exactly once, by the
/// chunk whose owned body contains it.
func testAssembleDedupsOverlapByBody() throws {
    // A segment at global 152-156 s sits in chunk 1's body (150-300 s) but also in
    // chunk 0's trailing margin (overlap). It must be kept exactly once (by chunk 1).
    // Chunk 1 reports it at 17-21 s relative to its own start (135 s).
    let r0 = #"{"duration":165,"speakers":[{"cluster":"S0","name":"A","source":"visual","overlap_confidence":0.9}],"segments":[{"start_ms":152000,"end_ms":156000,"speaker":"A","text":"boundary"}],"fingerprints":{},"models":{}}"#
    let r1 = #"{"duration":180,"speakers":[{"cluster":"S0","name":"A","source":"visual","overlap_confidence":0.9}],"segments":[{"start_ms":17000,"end_ms":21000,"speaker":"A","text":"boundary"}],"fingerprints":{},"models":{}}"#

    // `try` rather than `try!`: a fixture that no longer decodes fails this one
    // test instead of crashing the whole test process.
    let decoder = JSONDecoder()
    let c0 = try decoder.decode(LabelMergeResponse.self, from: Data(r0.utf8))
    let c1 = try decoder.decode(LabelMergeResponse.self, from: Data(r1.utf8))

    let asm = TranscriptAssembler.assemble(sessionId: "s", app: "meet", chunks: [
        .init(chunkStart: 0, response: c0, bodyStart: 0, bodyEnd: 150),
        .init(chunkStart: 135, response: c1, bodyStart: 150, bodyEnd: 300),
    ])

    XCTAssertEqual(asm.speakersFile.segments.count, 1) // deduped
    // Unwrap instead of subscripting so an empty result is a failure, not a trap.
    let kept = try XCTUnwrap(asm.speakersFile.segments.first)
    XCTAssertEqual(kept.start, 152, accuracy: 0.01)
}
/// Checks the body length (in seconds) each `ChunkMode` resolves to for a given
/// detected participant count.
func testChunkModeResolvesBodyLength() {
    // Fixed presets: the participant count has no effect on the result.
    let standardBody = ChunkMode.standard.bodySeconds(participantCount: 99)
    let largeGroupBody = ChunkMode.largeGroup.bodySeconds(participantCount: 2)
    let fineBody = ChunkMode.fine.bodySeconds(participantCount: nil)
    XCTAssertEqual(standardBody, 150)
    XCTAssertEqual(largeGroupBody, 60)
    XCTAssertEqual(fineBody, 90)

    // Auto: more than 4 detected -> 60 s; exactly 4 -> 150 s; unknown (nil) -> 150 s.
    let autoSixBody = ChunkMode.auto.bodySeconds(participantCount: 6)
    let autoFourBody = ChunkMode.auto.bodySeconds(participantCount: 4)
    let autoUnknownBody = ChunkMode.auto.bodySeconds(participantCount: nil)
    XCTAssertEqual(autoSixBody, 60)
    XCTAssertEqual(autoFourBody, 150)
    XCTAssertEqual(autoUnknownBody, 150)
}
/// Checks the overlap margin `ChunkMode.overlapSeconds(forBody:)` derives from a
/// body length.
/// NOTE(review): the expected values look like 12% of the body kept within
/// [8, 15] seconds; confirm the exact rounding/clamping against ChunkMode.
func testChunkOverlapScalesWithBody() {
    let overlapFor150 = ChunkMode.overlapSeconds(forBody: 150)
    let overlapFor60 = ChunkMode.overlapSeconds(forBody: 60)
    let overlapFor90 = ChunkMode.overlapSeconds(forBody: 90)

    XCTAssertEqual(overlapFor150, 15) // capped
    XCTAssertEqual(overlapFor60, 8)   // floored (60 * 0.12 = 7.2 -> 8)
    XCTAssertEqual(overlapFor90, 11)  // 90 * 0.12 = 10.8 -> 11
}
/// With a 60 s body (8 s overlap, 72 s threshold) a 100 s call is split into two
/// chunks, so "Large group" actually re-chunks medium calls that would be ONE
/// chunk at the 2.5-min default.
func testPlanChunksShortBodyChunksAShortCall() {
    let plan = SessionPackager.planChunks(
        durationSec: 100,
        chunkSeconds: 60,
        overlapSeconds: 8,
        thresholdSec: 72
    )
    XCTAssertEqual(plan.count, 2)

    // First body is a full 60 s; the second owns the remaining 40 s.
    XCTAssertEqual(plan[0].bodyStart, 0)
    XCTAssertEqual(plan[0].bodyEnd, 60)
    XCTAssertEqual(plan[1].bodyStart, 60)
    XCTAssertEqual(plan[1].bodyEnd, 100)
}
/// `dropStuckSpans` removes a vision span that covers almost the whole call,
/// while keeping a short vision span and an equally long `mic_vad` (self) span.
func testDropStuckSpansRemovesWholeCallCue() {
    // Stuck whole-call tile: 1900 s of a 1976 s call (~96%) in a single span.
    let stuckTile = VisualTimeline.Segment(start: 0, end: 1900, name: "Grant Gilliam", confidence: 1, source: "vision")
    // Real, short speaking span.
    let realSpan = VisualTimeline.Segment(start: 100, end: 130, name: "Matt Odell", confidence: 0.9, source: "vision")
    // Self span from the mic VAD: same length as the stuck tile, but must be kept.
    let selfSpan = VisualTimeline.Segment(start: 0, end: 1900, name: "Grant", confidence: 1, source: "mic_vad")

    let kept = TranscriptPipeline.dropStuckSpans([stuckTile, realSpan, selfSpan], duration: 1976)

    XCTAssertFalse(kept.contains(where: { $0.name == "Grant Gilliam" })) // 96% of call in one span -> dropped
    XCTAssertTrue(kept.contains(where: { $0.name == "Matt Odell" }))     // short real span kept
    XCTAssertTrue(kept.contains(where: { $0.source == "mic_vad" }))      // self never dropped
}
/// `rebasedTimelineData` clips timeline segments to the [150, 300] window and
/// re-expresses their times relative to the window start.
func testRebaseClipsAndRebases() throws {
    let timeline = [
        VisualTimeline.Segment(start: 140, end: 160, name: "A", confidence: 0.9, source: "vision"),
        VisualTimeline.Segment(start: 200, end: 260, name: "B", confidence: 0.8, source: "vision"),
    ]

    let encoded = try SessionPackager.rebasedTimelineData(timeline, start: 150, end: 300)
    let json = try JSONSerialization.jsonObject(with: encoded)
    let rows = try XCTUnwrap(json as? [[String: Any]])

    XCTAssertEqual(rows.count, 2)
    // "A" (140-160) is clipped at the window start and rebased to 0-10.
    XCTAssertEqual(rows[0]["start"] as? Double, 0)
    XCTAssertEqual(rows[0]["end"] as? Double, 10)
    // "B" (200-260) lies fully inside the window and is only rebased, to 50-110.
    XCTAssertEqual(rows[1]["start"] as? Double, 50)
    XCTAssertEqual(rows[1]["end"] as? Double, 110)
}
/// `rebasedSelfVadData` clips VAD spans to the [150, 300] window, re-expresses
/// their times relative to the window start, and emits no `name` key.
func testRebaseSelfVadClipsAndRebases() throws {
    let vadSpans = [
        VADSpan(start: 140, end: 160, confidence: 0.9),
        VADSpan(start: 200, end: 260, confidence: 0.8),
    ]

    let encoded = try SessionPackager.rebasedSelfVadData(vadSpans, start: 150, end: 300)
    let json = try JSONSerialization.jsonObject(with: encoded)
    let rows = try XCTUnwrap(json as? [[String: Any]])

    XCTAssertEqual(rows.count, 2)
    XCTAssertEqual(rows[0]["start"] as? Double, 0)
    XCTAssertEqual(rows[0]["end"] as? Double, 10) // 140-160 clipped at the 150 base -> 0-10
    XCTAssertEqual(rows[1]["start"] as? Double, 50)
    XCTAssertEqual(rows[1]["end"] as? Double, 110)
    XCTAssertNil(rows[0]["name"]) // self_vad carries no name
}
/// When the same person is resolved by `visual` in one chunk and by
/// `mic_channel` in another, the assembled speaker entry reports `mic_channel`.
func testAssembleRanksMicChannelOverVisual() throws {
    // Same person resolved by visual in one chunk and by the mic channel in
    // another: the mic-channel attribution (the user's own track) wins.
    let visualJSON = #"{"duration":100,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1]}],"segments":[],"fingerprints":{},"models":{}}"#
    let micJSON = #"{"duration":100,"speakers":[{"cluster":"mic","name":"Grant","source":"mic_channel","fingerprint":[0.2]}],"segments":[],"fingerprints":{"Grant":[0.2]},"models":{}}"#

    let decoder = JSONDecoder()
    let visualResponse = try decoder.decode(LabelMergeResponse.self, from: Data(visualJSON.utf8))
    let micResponse = try decoder.decode(LabelMergeResponse.self, from: Data(micJSON.utf8))

    let assembled = TranscriptAssembler.assemble(
        sessionId: "s",
        app: "meet",
        chunks: [
            .init(chunkStart: 0, response: visualResponse),
            .init(chunkStart: 100, response: micResponse),
        ]
    )

    let grant = assembled.speakersFile.speakers.first(where: { $0.name == "Grant" })
    XCTAssertEqual(grant?.source, "mic_channel")
}
/// Assembling two chunks shifts each segment by its chunk's start, merges the
/// speakers of both chunks into one list, and exposes fingerprints for the
/// named speakers only.
func testAssembleOffsetsAndUnifies() throws {
    let firstJSON = #"{"duration":150,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1,0.2]}],"segments":[{"start_ms":1000,"end_ms":2000,"speaker":"Grant","text":"hi"}],"fingerprints":{"Grant":[0.1,0.2]},"models":{"diarization":"x"}}"#
    let secondJSON = #"{"duration":100,"speakers":[{"cluster":"Speaker_0","name":"Sarah","source":"voiceprint","match_similarity":0.7,"fingerprint":[0.3,0.4]},{"cluster":"Speaker_1","name":"Unknown_0","source":"unmatched"}],"segments":[{"start_ms":500,"end_ms":1500,"speaker":"Sarah","text":"hello"}],"fingerprints":{"Sarah":[0.3,0.4]},"models":{"diarization":"x"}}"#

    let decoder = JSONDecoder()
    let firstResponse = try decoder.decode(LabelMergeResponse.self, from: Data(firstJSON.utf8))
    let secondResponse = try decoder.decode(LabelMergeResponse.self, from: Data(secondJSON.utf8))

    let assembled = TranscriptAssembler.assemble(
        sessionId: "s",
        app: "meet",
        chunks: [
            .init(chunkStart: 0, response: firstResponse),
            .init(chunkStart: 150, response: secondResponse),
        ]
    )

    // Segments: 1.0 s in chunk 0 stays at 1.0 s; 0.5 s in chunk 1 lands at 150.5 s.
    XCTAssertEqual(assembled.speakersFile.segments.count, 2)
    XCTAssertEqual(assembled.speakersFile.segments[0].start, 1, accuracy: 0.001)
    XCTAssertEqual(assembled.speakersFile.segments[1].start, 150.5, accuracy: 0.001)

    // Speakers: Grant, Sarah and the unmatched Unknown_0.
    XCTAssertEqual(assembled.speakersFile.speakers.count, 3)

    // Fingerprints: present for the named speakers, absent for Unknown_0.
    XCTAssertNotNil(assembled.fingerprints["Grant"])
    XCTAssertNotNil(assembled.fingerprints["Sarah"])
    XCTAssertNil(assembled.fingerprints["Unknown_0"])
}
}