Phases 2-6: detection, visual timeline, backend hand-off, voiceprints
Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
This commit is contained in:
@@ -0,0 +1,62 @@
|
||||
import XCTest
|
||||
import CoreGraphics
|
||||
import CoreText
|
||||
@testable import Ten31Transcripts
|
||||
|
||||
/// Validates the visual adapter against synthetic call frames (no real
/// screenshots needed): OCR anchors the tiles and the highlight is attributed to
/// the correct speaker, tracking it as it moves.
///
/// Fixture construction uses `XCTUnwrap` instead of force-unwraps so that a
/// failure to build the synthetic frame is reported as a test failure instead
/// of trapping the test process.
final class GridCallAnalyzerTests: XCTestCase {

    /// Draws `s` in white bold Helvetica with its line bounds centred on `center`.
    ///
    /// - Parameters:
    ///   - s: Text to render (a participant name in these tests).
    ///   - ctx: Bitmap context to draw into.
    ///   - center: Point, in the context's coordinate space, the text is centred on.
    ///   - size: Font size in points.
    /// - Throws: If the attributed string cannot be created.
    private func drawText(_ s: String, _ ctx: CGContext, center: CGPoint, size: CGFloat) throws {
        let font = CTFontCreateWithName("Helvetica-Bold" as CFString, size, nil)
        let attrs = [kCTFontAttributeName: font,
                     kCTForegroundColorAttributeName: CGColor(red: 1, green: 1, blue: 1, alpha: 1)] as CFDictionary
        let attributed = try XCTUnwrap(CFAttributedStringCreate(nil, s as CFString, attrs))
        let line = CTLineCreateWithAttributedString(attributed)
        let b = CTLineGetBoundsWithOptions(line, [])
        // The bounds are expressed relative to the text position, and their origin
        // is not necessarily (0, 0), so centre on the rect's midpoint rather than
        // on half of its size.
        ctx.textPosition = CGPoint(x: center.x - b.midX, y: center.y - b.midY)
        CTLineDraw(line, ctx)
    }

    /// Renders an 800x600 synthetic call frame: a dark background with a 2x2
    /// grid of name-labelled tiles, where the tile at `speakingIndex` is given
    /// a thick green border.
    ///
    /// Tile order is GRANT, SARAH, DMITRI, ALEX. The rects are given in the
    /// bitmap context's own coordinates; the tests below state the expected
    /// centres in top-left pixel coordinates, i.e. with y flipped (600 - y).
    ///
    /// - Parameter speakingIndex: Index into the tile list of the highlighted
    ///   tile; an index outside 0...3 highlights nothing.
    /// - Throws: If the bitmap context or the final image cannot be created.
    private func frame(speakingIndex: Int) throws -> CGImage {
        let W = 800, H = 600
        let ctx = try XCTUnwrap(CGContext(data: nil, width: W, height: H, bitsPerComponent: 8, bytesPerRow: 0,
                                          space: CGColorSpaceCreateDeviceRGB(),
                                          bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue))
        ctx.setFillColor(CGColor(red: 0.1, green: 0.1, blue: 0.12, alpha: 1))
        ctx.fill(CGRect(x: 0, y: 0, width: W, height: H))
        let rects: [(String, CGRect)] = [
            ("GRANT", CGRect(x: 40, y: 320, width: 340, height: 230)),
            ("SARAH", CGRect(x: 420, y: 320, width: 340, height: 230)),
            ("DMITRI", CGRect(x: 40, y: 50, width: 340, height: 230)),
            ("ALEX", CGRect(x: 420, y: 50, width: 340, height: 230)),
        ]
        for (i, (name, rect)) in rects.enumerated() {
            ctx.setFillColor(CGColor(red: 0.18, green: 0.18, blue: 0.2, alpha: 1))
            ctx.fill(rect)
            if i == speakingIndex {
                // 14-pt stroke centred on a 7-pt inset keeps the whole border inside the tile.
                ctx.setStrokeColor(CGColor(red: 0.1, green: 0.85, blue: 0.2, alpha: 1))
                ctx.setLineWidth(14)
                ctx.stroke(rect.insetBy(dx: 7, dy: 7))
            }
            try drawText(name, ctx, center: CGPoint(x: rect.midX, y: rect.midY), size: 54)
        }
        return try XCTUnwrap(ctx.makeImage())
    }

    /// With SARAH's tile highlighted, at least two observations are produced and
    /// exactly one is marked speaking, located near SARAH's tile.
    func testReadsNamesAndPicksHighlightedSpeaker() throws {
        let obs = SignalAdapter().analyze(cgImage: try frame(speakingIndex: 1), at: 0) // SARAH
        XCTAssertGreaterThanOrEqual(obs.count, 2)
        let speaking = obs.filter { $0.speaking }
        XCTAssertEqual(speaking.count, 1)
        // Unwrap once so a missing speaker fails clearly instead of comparing 0 against 590.
        let active = try XCTUnwrap(speaking.first)
        // SARAH tile center in top-left pixels ≈ (590, 165)
        XCTAssertEqual(active.bbox.midX, 590, accuracy: 160)
        XCTAssertEqual(active.bbox.midY, 165, accuracy: 160)
    }

    /// With DMITRI's tile highlighted instead, the single speaking observation
    /// moves to DMITRI's tile.
    func testHighlightTracksToAnotherTile() throws {
        let obs = SignalAdapter().analyze(cgImage: try frame(speakingIndex: 2), at: 1) // DMITRI
        let speaking = obs.filter { $0.speaking }
        XCTAssertEqual(speaking.count, 1)
        let active = try XCTUnwrap(speaking.first)
        // DMITRI tile center in top-left pixels ≈ (210, 435)
        XCTAssertEqual(active.bbox.midX, 210, accuracy: 160)
        XCTAssertEqual(active.bbox.midY, 435, accuracy: 160)
    }
}
|
||||
@@ -0,0 +1,47 @@
|
||||
import XCTest
|
||||
@testable import Ten31Transcripts
|
||||
|
||||
/// Tests for the backend hand-off pieces: chunk planning, timeline
/// clipping/rebasing, and assembling per-chunk responses into one transcript.
///
/// Count assertions are non-fatal, so each test guards before indexing into a
/// collection; otherwise a wrong count would trap with an out-of-range index
/// and abort the test run instead of recording a failure.
final class Phase5Tests: XCTestCase {

    /// A 70-second session is planned as a single chunk that ends at 70 s.
    func testPlanChunksShort() throws {
        let c = SessionPackager.planChunks(durationSec: 70)
        XCTAssertEqual(c.count, 1)
        let only = try XCTUnwrap(c.first)
        XCTAssertEqual(only.end, 70, accuracy: 0.001)
    }

    /// A 400-second session with 150-second chunks is planned as three chunks,
    /// the first covering 0-150, the second starting at 150, the last ending at 400.
    func testPlanChunksLong() {
        let c = SessionPackager.planChunks(durationSec: 400, chunkSeconds: 150)
        XCTAssertEqual(c.count, 3)
        guard c.count == 3 else { return } // failure already recorded above
        XCTAssertEqual(c[0].start, 0, accuracy: 0.001)
        XCTAssertEqual(c[0].end, 150, accuracy: 0.001)
        XCTAssertEqual(c[1].start, 150, accuracy: 0.001)
        XCTAssertEqual(c[2].end, 400, accuracy: 0.001)
    }

    /// Segments are clipped to the [150, 300] window and shifted so the window
    /// start becomes 0: 140-160 becomes 0-10 and 200-260 becomes 50-110.
    func testRebaseClipsAndRebases() throws {
        let segs = [
            VisualTimeline.Segment(start: 140, end: 160, name: "A", confidence: 0.9, source: "vision"),
            VisualTimeline.Segment(start: 200, end: 260, name: "B", confidence: 0.8, source: "vision"),
        ]
        let data = try SessionPackager.rebasedTimelineData(segs, start: 150, end: 300)
        let arr = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [[String: Any]])
        XCTAssertEqual(arr.count, 2)
        guard arr.count == 2 else { return } // failure already recorded above
        XCTAssertEqual(arr[0]["start"] as? Double, 0)
        XCTAssertEqual(arr[0]["end"] as? Double, 10)
        XCTAssertEqual(arr[1]["start"] as? Double, 50)
        XCTAssertEqual(arr[1]["end"] as? Double, 110)
    }

    /// Two chunk responses (starting at 0 s and 150 s) are merged: segment
    /// times are offset by their chunk start, three speakers are reported in
    /// total, and fingerprints exist for the named speakers but not `Unknown_0`.
    func testAssembleOffsetsAndUnifies() throws {
        let resp0 = #"{"duration":150,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1,0.2]}],"segments":[{"start_ms":1000,"end_ms":2000,"speaker":"Grant","text":"hi"}],"fingerprints":{"Grant":[0.1,0.2]},"models":{"diarization":"x"}}"#
        let resp1 = #"{"duration":100,"speakers":[{"cluster":"Speaker_0","name":"Sarah","source":"voiceprint","match_similarity":0.7,"fingerprint":[0.3,0.4]},{"cluster":"Speaker_1","name":"Unknown_0","source":"unmatched"}],"segments":[{"start_ms":500,"end_ms":1500,"speaker":"Sarah","text":"hello"}],"fingerprints":{"Sarah":[0.3,0.4]},"models":{"diarization":"x"}}"#
        let r0 = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(resp0.utf8))
        let r1 = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(resp1.utf8))
        let asm = TranscriptAssembler.assemble(sessionId: "s", app: "meet",
                                               chunks: [.init(chunkStart: 0, response: r0), .init(chunkStart: 150, response: r1)])
        let segments = asm.speakersFile.segments
        XCTAssertEqual(segments.count, 2)
        if segments.count == 2 {
            // 1000 ms in chunk 0 -> 1 s; 500 ms in the chunk starting at 150 s -> 150.5 s.
            XCTAssertEqual(segments[0].start, 1, accuracy: 0.001)
            XCTAssertEqual(segments[1].start, 150.5, accuracy: 0.001)
        }
        XCTAssertEqual(asm.speakersFile.speakers.count, 3)
        XCTAssertNotNil(asm.fingerprints["Grant"])
        XCTAssertNotNil(asm.fingerprints["Sarah"])
        XCTAssertNil(asm.fingerprints["Unknown_0"])
    }
}
|
||||
@@ -0,0 +1,60 @@
|
||||
import XCTest
|
||||
@testable import Ten31Transcripts
|
||||
|
||||
/// Exercises `TimelineBuilder`: open/close frame thresholds, overlapping
/// speakers, merging of self (mic VAD) spans, and alias normalisation.
final class TimelineBuilderTests: XCTestCase {

    /// Builds a `SpeakerObservation` with a zero bounding box.
    private func obs(_ name: String, _ speaking: Bool, _ t: Double, _ conf: Double = 0.9) -> SpeakerObservation {
        return SpeakerObservation(name: name, speaking: speaking, bbox: .zero, confidence: conf, t: t)
    }

    /// Three speaking frames followed by two empty frames produce one "vision"
    /// segment for "A" running from 0 to 2.
    func testOpensAfterKFramesAndClosesAfterMQuiet() {
        let builder = TimelineBuilder(openFrames: 2, closeFrames: 2)

        // Speaking at t = 0, 1, 2 ...
        builder.ingest([obs("A", true, 0)], at: 0)
        builder.ingest([obs("A", true, 1)], at: 1)
        builder.ingest([obs("A", true, 2)], at: 2)
        // ... then nothing observed at t = 3, 4.
        builder.ingest([], at: 3)
        builder.ingest([], at: 4)
        builder.finish()

        XCTAssertEqual(builder.segments.count, 1)

        let segment = builder.segments.first
        XCTAssertEqual(segment?.name, "A")
        XCTAssertEqual(segment?.start ?? -1, 0, accuracy: 0.001)
        XCTAssertEqual(segment?.end ?? -1, 2, accuracy: 0.001)
        XCTAssertEqual(segment?.source, "vision")
    }

    /// A single speaking frame followed by an empty frame produces no segment
    /// when two frames are required to open.
    func testSingleFlickerDoesNotOpen() {
        let builder = TimelineBuilder(openFrames: 2, closeFrames: 2)

        builder.ingest([obs("A", true, 0)], at: 0)
        builder.ingest([], at: 1)
        builder.finish()

        XCTAssertTrue(builder.segments.isEmpty)
    }

    /// Two speakers observed in the same frames each get their own segment.
    func testAllowsOverlap() {
        let builder = TimelineBuilder(openFrames: 1, closeFrames: 1)

        builder.ingest([obs("A", true, 0), obs("B", true, 0)], at: 0)
        builder.ingest([obs("A", true, 1), obs("B", true, 1)], at: 1)
        builder.ingest([], at: 2)
        builder.finish()

        XCTAssertEqual(builder.segments.count, 2)

        let names = Set(builder.segments.map { segment in segment.name })
        XCTAssertEqual(names, ["A", "B"])
    }

    /// A merged VAD span becomes one segment named after `selfName` with source
    /// "mic_vad".
    func testMergesSelfSpans() {
        let builder = TimelineBuilder()
        let micSpan = VADSpan(start: 0, end: 4.5, confidence: 0.97)

        builder.mergeSelfSpans([micSpan], selfName: "Grant")
        builder.finish()

        XCTAssertEqual(builder.segments.count, 1)

        let segment = builder.segments.first
        XCTAssertEqual(segment?.name, "Grant")
        XCTAssertEqual(segment?.source, "mic_vad")
    }

    /// Observations under a registered alias are reported under the canonical name.
    func testNormalizesAlias() {
        let builder = TimelineBuilder(openFrames: 1, closeFrames: 1)
        builder.addAlias("Sarah J", canonical: "Sarah Jones")

        builder.ingest([obs("Sarah J", true, 0)], at: 0)
        builder.ingest([obs("Sarah J", true, 1)], at: 1)
        builder.ingest([], at: 2)
        builder.finish()

        let segment = builder.segments.first
        XCTAssertEqual(segment?.name, "Sarah Jones")
    }
}
|
||||
@@ -0,0 +1,45 @@
|
||||
import XCTest
|
||||
@testable import Ten31Transcripts
|
||||
|
||||
/// Exercises `VoiceprintStore`: which speakers are stored, persistence across
/// instances, call counting, and rename/remove/reset.
final class VoiceprintStoreTests: XCTestCase {

    /// Returns a unique JSON file URL inside the temporary directory.
    private func tempURL() -> URL {
        let fileName = "vp_\(UUID().uuidString).json"
        return FileManager.default.temporaryDirectory.appendingPathComponent(fileName)
    }

    /// Decodes a fixed label-merge response with four speakers: Grant (visual,
    /// overlap 0.99), Sarah (voiceprint match), Bob (visual, overlap 0.5) and
    /// an unmatched `Unknown_0`.
    private func response() throws -> LabelMergeResponse {
        let json = #"{"duration":10,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1,0.2,0.3]},{"cluster":"Speaker_1","name":"Sarah","source":"voiceprint","match_similarity":0.7,"fingerprint":[0.4,0.5,0.6]},{"cluster":"Speaker_2","name":"Bob","source":"visual","overlap_confidence":0.5,"fingerprint":[0.7,0.8,0.9]},{"cluster":"Speaker_3","name":"Unknown_0","source":"unmatched"}],"segments":[],"fingerprints":{"Grant":[0.1,0.2,0.3],"Sarah":[0.4,0.5,0.6]},"models":{}}"#
        let payload = Data(json.utf8)
        return try JSONDecoder().decode(LabelMergeResponse.self, from: payload)
    }

    /// After one update, Grant and Sarah are stored while Bob and `Unknown_0`
    /// are not; Grant's fingerprint is retrievable and his call count is 1.
    func testStoresOnlyConfidentNamedSpeakers() throws {
        let url = tempURL()
        defer { try? FileManager.default.removeItem(at: url) }

        let store = VoiceprintStore(fileURL: url)
        let merged = try response()
        store.update(with: merged)

        XCTAssertNotNil(store.entries["Grant"]) // visual, high overlap
        XCTAssertNotNil(store.entries["Sarah"]) // voiceprint match
        XCTAssertNil(store.entries["Bob"]) // overlap 0.5 < 0.8
        XCTAssertNil(store.entries["Unknown_0"])

        XCTAssertEqual(store.knownVoiceprints()["Grant"], [0.1, 0.2, 0.3])
        XCTAssertEqual(store.entries["Grant"]?.calls, 1)
    }

    /// Two updates raise Grant's call count to 2, and a second store opened on
    /// the same file sees two known voiceprints.
    func testPersistsAcrossInstancesAndIncrementsCalls() throws {
        let url = tempURL()
        defer { try? FileManager.default.removeItem(at: url) }

        let store = VoiceprintStore(fileURL: url)
        let firstCall = try response()
        store.update(with: firstCall)
        let secondCall = try response()
        store.update(with: secondCall)
        XCTAssertEqual(store.entries["Grant"]?.calls, 2)

        let reopened = VoiceprintStore(fileURL: url)
        XCTAssertEqual(reopened.knownVoiceprints().count, 2)
    }

    /// Rename moves an entry to its new key, remove deletes an entry, and reset
    /// empties the store.
    func testRenameRemoveReset() throws {
        let url = tempURL()
        defer { try? FileManager.default.removeItem(at: url) }

        let store = VoiceprintStore(fileURL: url)
        let merged = try response()
        store.update(with: merged)

        store.rename("Sarah", to: "Sarah Jones")
        XCTAssertNotNil(store.entries["Sarah Jones"])
        XCTAssertNil(store.entries["Sarah"])

        store.remove("Grant")
        XCTAssertNil(store.entries["Grant"])

        store.reset()
        XCTAssertTrue(store.entries.isEmpty)
    }
}
|
||||
Reference in New Issue
Block a user