53d7fcdac0
The backend shipped dual-channel mode; wire the client to it. We already capture
mic (you) and system (others) separately, so send them as two files instead of the
mono mix — fixing the misattribution at the source.
- SparkControlClient: labelMergeDual(mic_file, system_file, self_name, self_vad);
multipart generalized to N files; shared POST/retry/decode extracted.
- SessionPackager.rebasedSelfVadData: chunk-local [{start,end}] for self_vad;
sliceAudio reused for both tracks.
- TranscriptPipeline.process: dual-channel chunking (slice mic+system, rebase
timeline + self_vad per chunk) when system audio is healthy; mono mixed-file
fallback (self folded into the timeline) otherwise.
- VisualCapture.finish: write the full visual_timeline.json (remote + self merged)
but return REMOTE (vision) segments only — self travels via the mic channel.
- TranscriptAssembler: rank mic_channel highest (the user's own track wins).
- VoiceprintStore: store the clean mic_channel self voiceprint.
- SessionController: pass mic/system URLs + remote timeline + channel self-spans +
self_name + systemHealthy; self_vad.json now reflects the channel-verified spans.
Validated END-TO-END against the live backend on the real misattributing session:
'Go Bitcoin' (remote) is now attributed to Unknown_0, NOT the user; the user's own
lines come back source=mic_channel; per-channel ASR recovered fuller remote text.
36/36 XCTest (4 new: self_vad rebase, mic_channel ranking + voiceprint storage).
73 lines
4.5 KiB
Swift
73 lines
4.5 KiB
Swift
import XCTest
|
||
@testable import Ten31Transcripts
|
||
|
||
/// Tests for chunk planning, per-chunk rebasing of timeline / self-VAD spans,
/// and assembly of per-chunk label-merge responses into one transcript.
///
/// Several tests index into a collection right after asserting its size.
/// `XCTAssertEqual` records a failure but does not stop the test, so every such
/// access is guarded: a wrong count is reported as an ordinary test failure
/// instead of an index-out-of-range trap that would abort the whole test run.
final class Phase5Tests: XCTestCase {

    // MARK: - Chunk planning

    /// A 70-second duration with the default chunk size yields one chunk ending at 70.
    func testPlanChunksShort() {
        let chunks = SessionPackager.planChunks(durationSec: 70)
        XCTAssertEqual(chunks.count, 1)
        // The count failure (if any) is already recorded above; bail out
        // rather than trapping on an empty result.
        guard let only = chunks.first else { return }
        XCTAssertEqual(only.end, 70, accuracy: 0.001)
    }

    /// A 400-second duration with 150-second chunks yields three chunks:
    /// the first spans 0–150, the second starts at 150, the last ends at 400.
    func testPlanChunksLong() {
        let chunks = SessionPackager.planChunks(durationSec: 400, chunkSeconds: 150)
        XCTAssertEqual(chunks.count, 3)
        // Indices 0...2 are read below; skip them if the plan is too short.
        guard chunks.count >= 3 else { return }
        XCTAssertEqual(chunks[0].start, 0)
        XCTAssertEqual(chunks[0].end, 150)
        XCTAssertEqual(chunks[1].start, 150)
        XCTAssertEqual(chunks[2].end, 400)
    }

    // MARK: - Rebasing

    /// Timeline segments overlapping the 150–300 window are clipped to the
    /// window and expressed relative to its start.
    func testRebaseClipsAndRebases() throws {
        let segs = [
            VisualTimeline.Segment(start: 140, end: 160, name: "A", confidence: 0.9, source: "vision"),
            VisualTimeline.Segment(start: 200, end: 260, name: "B", confidence: 0.8, source: "vision"),
        ]
        let data = try SessionPackager.rebasedTimelineData(segs, start: 150, end: 300)
        let arr = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [[String: Any]])
        XCTAssertEqual(arr.count, 2)
        // Indices 0 and 1 are read below; skip them if fewer entries came back.
        guard arr.count >= 2 else { return }
        XCTAssertEqual(arr[0]["start"] as? Double, 0)   // 140 clipped to 150 → 0
        XCTAssertEqual(arr[0]["end"] as? Double, 10)    // 160 − 150
        XCTAssertEqual(arr[1]["start"] as? Double, 50)  // 200 − 150
        XCTAssertEqual(arr[1]["end"] as? Double, 110)   // 260 − 150
    }

    /// Self-VAD spans get the same clip-and-rebase treatment as timeline
    /// segments, and the emitted entries carry no `name` key.
    func testRebaseSelfVadClipsAndRebases() throws {
        let spans = [
            VADSpan(start: 140, end: 160, confidence: 0.9),
            VADSpan(start: 200, end: 260, confidence: 0.8),
        ]
        let data = try SessionPackager.rebasedSelfVadData(spans, start: 150, end: 300)
        let arr = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [[String: Any]])
        XCTAssertEqual(arr.count, 2)
        // Indices 0 and 1 are read below; skip them if fewer entries came back.
        guard arr.count >= 2 else { return }
        XCTAssertEqual(arr[0]["start"] as? Double, 0)
        XCTAssertEqual(arr[0]["end"] as? Double, 10)  // 140–160 clipped to the 150 window start, then rebased → 0–10
        XCTAssertEqual(arr[1]["start"] as? Double, 50)
        XCTAssertEqual(arr[1]["end"] as? Double, 110)
        XCTAssertNil(arr[0]["name"])  // self_vad carries no name
    }

    // MARK: - Assembly

    /// Same person resolved by visual in one chunk and by the mic channel in
    /// another → the mic-channel attribution (the user's own track) wins.
    func testAssembleRanksMicChannelOverVisual() throws {
        let visual = #"{"duration":100,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1]}],"segments":[],"fingerprints":{},"models":{}}"#
        let mic = #"{"duration":100,"speakers":[{"cluster":"mic","name":"Grant","source":"mic_channel","fingerprint":[0.2]}],"segments":[],"fingerprints":{"Grant":[0.2]},"models":{}}"#
        let rv = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(visual.utf8))
        let rm = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(mic.utf8))
        let asm = TranscriptAssembler.assemble(
            sessionId: "s",
            app: "meet",
            chunks: [.init(chunkStart: 0, response: rv), .init(chunkStart: 100, response: rm)]
        )
        let grant = asm.speakersFile.speakers.first(where: { $0.name == "Grant" })
        XCTAssertEqual(grant?.source, "mic_channel")
    }

    /// Two chunk responses are combined: segment times are shifted by each
    /// chunk's start (1000 ms in chunk 0 → 1 s; 500 ms in the chunk starting at
    /// 150 s → 150.5 s), three speakers are reported in total, and fingerprints
    /// exist for the named speakers but not for `Unknown_0`.
    func testAssembleOffsetsAndUnifies() throws {
        let resp0 = #"{"duration":150,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1,0.2]}],"segments":[{"start_ms":1000,"end_ms":2000,"speaker":"Grant","text":"hi"}],"fingerprints":{"Grant":[0.1,0.2]},"models":{"diarization":"x"}}"#
        let resp1 = #"{"duration":100,"speakers":[{"cluster":"Speaker_0","name":"Sarah","source":"voiceprint","match_similarity":0.7,"fingerprint":[0.3,0.4]},{"cluster":"Speaker_1","name":"Unknown_0","source":"unmatched"}],"segments":[{"start_ms":500,"end_ms":1500,"speaker":"Sarah","text":"hello"}],"fingerprints":{"Sarah":[0.3,0.4]},"models":{"diarization":"x"}}"#
        let r0 = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(resp0.utf8))
        let r1 = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(resp1.utf8))
        let asm = TranscriptAssembler.assemble(
            sessionId: "s",
            app: "meet",
            chunks: [.init(chunkStart: 0, response: r0), .init(chunkStart: 150, response: r1)]
        )
        let segments = asm.speakersFile.segments
        XCTAssertEqual(segments.count, 2)
        // Only touch the individual segments when both are actually present;
        // the remaining assertions do not index and always run.
        if segments.count >= 2 {
            XCTAssertEqual(segments[0].start, 1, accuracy: 0.001)
            XCTAssertEqual(segments[1].start, 150.5, accuracy: 0.001)
        }
        XCTAssertEqual(asm.speakersFile.speakers.count, 3)
        XCTAssertNotNil(asm.fingerprints["Grant"])
        XCTAssertNotNil(asm.fingerprints["Sarah"])
        XCTAssertNil(asm.fingerprints["Unknown_0"])
    }
}
|