a3e3406b28
Chunk size was hardcoded at 2.5-min bodies. Add a Settings control: Auto / Standard 2.5min / Large group 60s / Fine 90s. Shorter chunks keep fewer simultaneous speakers per window (Sortformer resolves ~4/chunk), useful for large calls, at some cost to speed and cross-chunk voice matching. - ChunkMode (new, pure/testable): mode → body seconds; Auto picks 60s when >4 participants were detected, else 150s; overlap + single-chunk threshold scale with the body length. - AppSettings.chunkMode (+ typed `chunk`); SettingsView picker with explanation. - TranscriptPipeline.process gains chunkSeconds; derives overlap/threshold from it. - SessionController resolves the body from the setting + the session's detected participant count (visual_timeline participants) for both send + re-process. - Participant roster now counts EVERY tile OCR'd, not just who spoke (TimelineBuilder.observedNames → VisualObserver → VisualCapture), so the Auto call-size signal is meaningful even though speaking-detection is sparse. Tests: ChunkMode resolution, overlap scaling, short-body re-chunking. 69 pass.
134 lines
8.7 KiB
Swift
134 lines
8.7 KiB
Swift
import XCTest
|
||
@testable import Ten31Transcripts
|
||
|
||
/// Unit tests covering chunk planning (`SessionPackager.planChunks`), chunk-mode
/// resolution (`ChunkMode`), stuck-span filtering, per-chunk timeline rebasing,
/// and multi-chunk assembly (`TranscriptAssembler.assemble`).
///
/// Conventions used throughout:
/// - Tests that decode JSON fixtures are `throws` and use plain `try`, so a bad
///   fixture is reported as a test failure rather than trapping the process.
/// - A count assertion is followed by a `guard`/`XCTUnwrap` before indexing, so
///   an unexpected count fails the test instead of crashing with an
///   out-of-range subscript.
final class Phase5Tests: XCTestCase {

    // MARK: - Chunk planning

    /// A 70s call planned with default parameters yields a single chunk ending at 70s.
    func testPlanChunksShort() throws {
        let c = SessionPackager.planChunks(durationSec: 70)
        XCTAssertEqual(c.count, 1)
        let firstChunk = try XCTUnwrap(c.first)
        XCTAssertEqual(firstChunk.end, 70, accuracy: 0.001)
    }

    /// A 400s call at a 150s body / 15s overlap yields three chunks whose owned
    /// bodies tile the call while the sliced windows overlap by the margin.
    func testPlanChunksLongOverlapsWindowsWithContiguousBodies() {
        let c = SessionPackager.planChunks(durationSec: 400, chunkSeconds: 150, overlapSeconds: 15)
        XCTAssertEqual(c.count, 3)
        // The count failure is already recorded above; bail out before indexing.
        guard c.count >= 3 else { return }
        // Owned bodies tile the call with no gaps/overlap.
        XCTAssertEqual(c[0].bodyStart, 0); XCTAssertEqual(c[0].bodyEnd, 150)
        XCTAssertEqual(c[1].bodyStart, 150); XCTAssertEqual(c[1].bodyEnd, 300)
        XCTAssertEqual(c[2].bodyEnd, 400)
        // Sliced windows overlap by the margin (and clamp at the ends).
        XCTAssertEqual(c[0].start, 0); XCTAssertEqual(c[0].end, 165)   // +15 trailing
        XCTAssertEqual(c[1].start, 135)                                // -15 leading
        XCTAssertLessThan(c[1].start, c[0].end)                        // windows overlap
        XCTAssertEqual(c[2].end, 400)                                  // clamped
    }

    /// With a 60s body (8s overlap, 72s single-chunk threshold) a 100s call is
    /// split into two chunks with bodies 0–60 and 60–100.
    func testPlanChunksShortBodyChunksAShortCall() {
        // A 100s call would be ONE chunk at the 2.5-min default, but at a 60s body it
        // splits — so "Large group" actually re-chunks medium calls.
        let c = SessionPackager.planChunks(durationSec: 100, chunkSeconds: 60,
                                           overlapSeconds: 8, thresholdSec: 72)
        XCTAssertEqual(c.count, 2)
        // The count failure is already recorded above; bail out before indexing.
        guard c.count >= 2 else { return }
        XCTAssertEqual(c[0].bodyStart, 0); XCTAssertEqual(c[0].bodyEnd, 60)
        XCTAssertEqual(c[1].bodyStart, 60); XCTAssertEqual(c[1].bodyEnd, 100)
    }

    // MARK: - Chunk mode

    /// Fixed presets return their own body length; `.auto` depends on the
    /// participant count passed in.
    func testChunkModeResolvesBodyLength() {
        // Fixed presets ignore participant count.
        XCTAssertEqual(ChunkMode.standard.bodySeconds(participantCount: 99), 150)
        XCTAssertEqual(ChunkMode.largeGroup.bodySeconds(participantCount: 2), 60)
        XCTAssertEqual(ChunkMode.fine.bodySeconds(participantCount: nil), 90)
        // Auto: >4 detected → 60s, ≤4 → 150s, unknown → 150s.
        XCTAssertEqual(ChunkMode.auto.bodySeconds(participantCount: 6), 60)
        XCTAssertEqual(ChunkMode.auto.bodySeconds(participantCount: 4), 150)
        XCTAssertEqual(ChunkMode.auto.bodySeconds(participantCount: nil), 150)
    }

    /// Pins the overlap returned for the three preset body lengths.
    func testChunkOverlapScalesWithBody() {
        XCTAssertEqual(ChunkMode.overlapSeconds(forBody: 150), 15)  // capped
        XCTAssertEqual(ChunkMode.overlapSeconds(forBody: 60), 8)    // floored (60*0.12=7.2→8)
        XCTAssertEqual(ChunkMode.overlapSeconds(forBody: 90), 11)   // 90*0.12=10.8→11
    }

    // MARK: - Visual timeline filtering

    /// A vision span covering almost the whole call is dropped, while a short
    /// vision span and a `mic_vad` span of the same length are kept.
    func testDropStuckSpansRemovesWholeCallCue() {
        let segs = [
            VisualTimeline.Segment(start: 0, end: 1900, name: "Grant Gilliam", confidence: 1, source: "vision"),  // stuck whole-call tile
            VisualTimeline.Segment(start: 100, end: 130, name: "Matt Odell", confidence: 0.9, source: "vision"),  // real
            VisualTimeline.Segment(start: 0, end: 1900, name: "Grant", confidence: 1, source: "mic_vad"),         // self span: keep
        ]
        let out = TranscriptPipeline.dropStuckSpans(segs, duration: 1976)
        XCTAssertFalse(out.contains { $0.name == "Grant Gilliam" })  // 96% of call in one span → dropped
        XCTAssertTrue(out.contains { $0.name == "Matt Odell" })      // short real span kept
        XCTAssertTrue(out.contains { $0.source == "mic_vad" })       // self never dropped
    }

    // MARK: - Rebasing onto a chunk window

    /// Segments are clipped to the 150–300 window and re-expressed relative to
    /// its start (140–160 → 0–10, 200–260 → 50–110).
    func testRebaseClipsAndRebases() throws {
        let segs = [
            VisualTimeline.Segment(start: 140, end: 160, name: "A", confidence: 0.9, source: "vision"),
            VisualTimeline.Segment(start: 200, end: 260, name: "B", confidence: 0.8, source: "vision"),
        ]
        let data = try SessionPackager.rebasedTimelineData(segs, start: 150, end: 300)
        let arr = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [[String: Any]])
        XCTAssertEqual(arr.count, 2)
        // The count failure is already recorded above; bail out before indexing.
        guard arr.count >= 2 else { return }
        XCTAssertEqual(arr[0]["start"] as? Double, 0)
        XCTAssertEqual(arr[0]["end"] as? Double, 10)
        XCTAssertEqual(arr[1]["start"] as? Double, 50)
        XCTAssertEqual(arr[1]["end"] as? Double, 110)
    }

    /// Same clip-and-rebase expectations for self-VAD spans, whose serialized
    /// entries carry no `name` key.
    func testRebaseSelfVadClipsAndRebases() throws {
        let spans = [VADSpan(start: 140, end: 160, confidence: 0.9),
                     VADSpan(start: 200, end: 260, confidence: 0.8)]
        let data = try SessionPackager.rebasedSelfVadData(spans, start: 150, end: 300)
        let arr = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [[String: Any]])
        XCTAssertEqual(arr.count, 2)
        // The count failure is already recorded above; bail out before indexing.
        guard arr.count >= 2 else { return }
        XCTAssertEqual(arr[0]["start"] as? Double, 0)
        XCTAssertEqual(arr[0]["end"] as? Double, 10)   // 160 clipped at 150 base → 0–10
        XCTAssertEqual(arr[1]["start"] as? Double, 50)
        XCTAssertEqual(arr[1]["end"] as? Double, 110)
        XCTAssertNil(arr[0]["name"])                   // self_vad carries no name
    }

    // MARK: - Assembly

    /// A segment reported by two overlapping chunks is emitted once, by the
    /// chunk whose body contains it.
    func testAssembleDedupsOverlapByBody() throws {
        // A segment at global 152–156 sits in chunk1's body but also in chunk0's
        // trailing margin (overlap). It must be kept exactly once (by chunk1).
        let r0 = #"{"duration":165,"speakers":[{"cluster":"S0","name":"A","source":"visual","overlap_confidence":0.9}],"segments":[{"start_ms":152000,"end_ms":156000,"speaker":"A","text":"boundary"}],"fingerprints":{},"models":{}}"#
        let r1 = #"{"duration":180,"speakers":[{"cluster":"S0","name":"A","source":"visual","overlap_confidence":0.9}],"segments":[{"start_ms":17000,"end_ms":21000,"speaker":"A","text":"boundary"}],"fingerprints":{},"models":{}}"#
        // Plain `try`: a malformed fixture fails this test instead of trapping the runner.
        let c0 = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(r0.utf8))
        let c1 = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(r1.utf8))
        let asm = TranscriptAssembler.assemble(sessionId: "s", app: "meet", chunks: [
            .init(chunkStart: 0, response: c0, bodyStart: 0, bodyEnd: 150),
            .init(chunkStart: 135, response: c1, bodyStart: 150, bodyEnd: 300),
        ])
        XCTAssertEqual(asm.speakersFile.segments.count, 1)  // deduped
        let kept = try XCTUnwrap(asm.speakersFile.segments.first)
        XCTAssertEqual(kept.start, 152, accuracy: 0.01)
    }

    /// When the same name is attributed via `visual` in one chunk and via
    /// `mic_channel` in another, the assembled speaker reports `mic_channel`.
    func testAssembleRanksMicChannelOverVisual() throws {
        // Same person resolved by visual in one chunk and by the mic channel in
        // another → the mic-channel attribution (the user's own track) wins.
        let visual = #"{"duration":100,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1]}],"segments":[],"fingerprints":{},"models":{}}"#
        let mic = #"{"duration":100,"speakers":[{"cluster":"mic","name":"Grant","source":"mic_channel","fingerprint":[0.2]}],"segments":[],"fingerprints":{"Grant":[0.2]},"models":{}}"#
        let rv = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(visual.utf8))
        let rm = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(mic.utf8))
        let asm = TranscriptAssembler.assemble(sessionId: "s", app: "meet",
            chunks: [.init(chunkStart: 0, response: rv), .init(chunkStart: 100, response: rm)])
        XCTAssertEqual(asm.speakersFile.speakers.first { $0.name == "Grant" }?.source, "mic_channel")
    }

    /// Segment times are offset by each chunk's start, speakers from both chunks
    /// are merged, and fingerprints exist for named speakers only.
    func testAssembleOffsetsAndUnifies() throws {
        let resp0 = #"{"duration":150,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1,0.2]}],"segments":[{"start_ms":1000,"end_ms":2000,"speaker":"Grant","text":"hi"}],"fingerprints":{"Grant":[0.1,0.2]},"models":{"diarization":"x"}}"#
        let resp1 = #"{"duration":100,"speakers":[{"cluster":"Speaker_0","name":"Sarah","source":"voiceprint","match_similarity":0.7,"fingerprint":[0.3,0.4]},{"cluster":"Speaker_1","name":"Unknown_0","source":"unmatched"}],"segments":[{"start_ms":500,"end_ms":1500,"speaker":"Sarah","text":"hello"}],"fingerprints":{"Sarah":[0.3,0.4]},"models":{"diarization":"x"}}"#
        let r0 = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(resp0.utf8))
        let r1 = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(resp1.utf8))
        let asm = TranscriptAssembler.assemble(sessionId: "s", app: "meet",
            chunks: [.init(chunkStart: 0, response: r0), .init(chunkStart: 150, response: r1)])
        XCTAssertEqual(asm.speakersFile.segments.count, 2)
        // Only the indexed segment checks depend on the count; the speaker and
        // fingerprint assertions below still run if it is wrong.
        if asm.speakersFile.segments.count >= 2 {
            XCTAssertEqual(asm.speakersFile.segments[0].start, 1, accuracy: 0.001)
            XCTAssertEqual(asm.speakersFile.segments[1].start, 150.5, accuracy: 0.001)
        }
        XCTAssertEqual(asm.speakersFile.speakers.count, 3)
        XCTAssertNotNil(asm.fingerprints["Grant"])
        XCTAssertNotNil(asm.fingerprints["Sarah"])
        XCTAssertNil(asm.fingerprints["Unknown_0"])
    }
}
|