Phases 2-6: detection, visual timeline, backend hand-off, voiceprints

Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
2026-06-06 00:15:49 -05:00
parent fd7e1a5907
commit 863136aeec
27 changed files with 2108 additions and 22 deletions
@@ -0,0 +1,179 @@
+import Foundation
+
+/// Decoded body of a `POST /api/audio/label-merge` reply (verified against the
+/// live backend). Two segment shapes are accepted: `transcribe=true` sends
+/// `start_ms`/`end_ms` plus `text`, while `transcribe=false` sends
+/// `start_s`/`end_s` plus `confidence`.
+struct LabelMergeResponse: Decodable {
+    let duration: Double
+    let speakers: [Speaker]
+    let segments: [Segment]
+    let fingerprints: [String: [Float]]
+    let models: [String: String]?
+
+    /// Returns `true` for the backend's "unmatched" labels, which must never be
+    /// persisted as a named voiceprint.
+    static func isUnknownName(_ name: String) -> Bool {
+        if name == "Speaker_unknown" { return true }
+        return name.hasPrefix("Unknown_")
+    }
+
+    /// One speaker entry of the response.
+    struct Speaker: Decodable {
+        let cluster: String
+        let name: String
+        let source: String                 // visual | voiceprint | unmatched
+        let overlapConfidence: Double?
+        let matchSimilarity: Double?
+        let fingerprint: [Float]?
+
+        /// Maps the snake_case JSON keys onto the camelCase properties.
+        enum CodingKeys: String, CodingKey {
+            case cluster
+            case name
+            case source
+            case overlapConfidence = "overlap_confidence"
+            case matchSimilarity = "match_similarity"
+            case fingerprint
+        }
+    }
+
+    /// One segment; only the time fields of the shape the backend chose are set.
+    struct Segment: Decodable {
+        let startMs: Int?
+        let endMs: Int?
+        let startS: Double?
+        let endS: Double?
+        let speaker: String
+        let text: String?
+        let confidence: Double?
+
+        /// Maps the snake_case JSON keys onto the camelCase properties.
+        enum CodingKeys: String, CodingKey {
+            case startMs = "start_ms"
+            case endMs = "end_ms"
+            case startS = "start_s"
+            case endS = "end_s"
+            case speaker
+            case text
+            case confidence
+        }
+
+        /// Start time in seconds regardless of which shape the backend used:
+        /// the seconds field wins, then milliseconds / 1000, else 0.
+        var startSeconds: Double {
+            if let seconds = startS { return seconds }
+            if let millis = startMs { return Double(millis) / 1000 }
+            return 0
+        }
+
+        /// End time in seconds, resolved the same way as `startSeconds`.
+        var endSeconds: Double {
+            if let seconds = endS { return seconds }
+            if let millis = endMs { return Double(millis) / 1000 }
+            return 0
+        }
+    }
+}
+
+/// Failures reported by the SparkControl backend client, each with a
+/// user-presentable description.
+enum SparkControlError: Error, LocalizedError {
+    case invalidHost
+    case tooLarge                       // 413
+    case server(Int, String)            // other non-2xx with {"detail":...}
+    case decode(String)
+    case retriesExhausted
+
+    /// Human-readable message for the error.
+    var errorDescription: String? {
+        let message: String
+        switch self {
+        case .invalidHost:
+            message = "Invalid backend host URL."
+        case .tooLarge:
+            message = "Audio chunk exceeds the backend's 200 MB limit."
+        case let .server(code, detail):
+            message = "Backend error \(code): \(detail)"
+        case let .decode(reason):
+            message = "Couldn't decode backend response: \(reason)"
+        case .retriesExhausted:
+            message = "Backend stayed busy (503) after retries."
+        }
+        return message
+    }
+}
+
+/// Talks to SparkControl's `label-merge`. **Callers must invoke sequentially**
+/// (one audio request in flight) — concurrent audio requests trip a GPU race
+/// (503). The Phase-5 pipeline drives one chunk at a time, satisfying this.
+final class SparkControlClient {
+    /// Back-off (seconds) used when a 503 carries no usable `Retry-After`.
+    private static let defaultRetryDelay: Double = 5
+    /// Upper bound (seconds) on a single back-off. Keeps a hostile or buggy
+    /// `Retry-After` from stalling the pipeline or overflowing the
+    /// seconds -> nanoseconds conversion.
+    private static let maxRetryDelay: Double = 300
+
+    private let baseURL: String
+    private let urlSession: URLSession
+
+    /// Creates a client for the backend at `baseURL`.
+    ///
+    /// - Parameters:
+    ///   - baseURL: Backend root; surrounding whitespace and one trailing `/`
+    ///     are stripped.
+    ///   - skipTLS: When `true`, the session is given an `InsecureTrustDelegate`.
+    init(baseURL: String, skipTLS: Bool) {
+        let trimmed = baseURL.trimmingCharacters(in: .whitespacesAndNewlines)
+        self.baseURL = trimmed.hasSuffix("/") ? String(trimmed.dropLast()) : trimmed
+        let config = URLSessionConfiguration.ephemeral
+        config.timeoutIntervalForRequest = 600     // diarization can take up to ~600s
+        config.timeoutIntervalForResource = 900
+        config.waitsForConnectivity = false
+        let delegate: URLSessionDelegate? = skipTLS ? InsecureTrustDelegate() : nil
+        self.urlSession = URLSession(configuration: config, delegate: delegate, delegateQueue: nil)
+    }
+
+    // Invalidate the session when the client goes away.
+    deinit { urlSession.finishTasksAndInvalidate() }
+
+    /// One `label-merge` call. `timeline` is the flat `[{start,end,name,confidence}]`
+    /// JSON (chunk-local seconds). Retries on `503 + Retry-After`.
+    ///
+    /// - Parameters:
+    ///   - audioURL: Local audio file; read fully into memory and sent as the
+    ///     `file` part.
+    ///   - timeline: UTF-8 JSON sent as the `timeline` field (omitted if the
+    ///     bytes are not valid UTF-8).
+    ///   - knownVoiceprints: Sent as `known_voiceprints` JSON when non-empty.
+    ///     Best-effort: omitted if it cannot be serialized (e.g. NaN values).
+    ///   - transcribe: Sent as the `transcribe` field (`"true"`/`"false"`).
+    ///   - minOverlap: Sent as `min_overlap` when non-nil.
+    ///   - voiceprintThreshold: Sent as `voiceprint_threshold` when non-nil.
+    ///   - maxRetries: Number of 503 retries allowed before giving up.
+    /// - Returns: The decoded response of the first 2xx reply.
+    /// - Throws: `SparkControlError.invalidHost` for a base URL without an
+    ///   http(s) scheme and host, `.tooLarge` on 413, `.retriesExhausted` after
+    ///   too many 503s, `.server` for other non-2xx replies, `.decode` for an
+    ///   undecodable 2xx body; file-read, transport and cancellation errors
+    ///   propagate unchanged.
+    func labelMerge(audioURL: URL,
+                    timeline: Data,
+                    knownVoiceprints: [String: [Float]]?,
+                    transcribe: Bool,
+                    minOverlap: Double? = nil,
+                    voiceprintThreshold: Double? = nil,
+                    maxRetries: Int = 3) async throws -> LabelMergeResponse {
+        // `URL(string:)` alone accepts an empty or scheme-less base (it yields a
+        // relative URL), so insist on http(s) + host to fail early and clearly.
+        guard let url = URL(string: baseURL + "/api/audio/label-merge"),
+              let scheme = url.scheme?.lowercased(),
+              scheme == "http" || scheme == "https",
+              url.host?.isEmpty == false else {
+            throw SparkControlError.invalidHost
+        }
+
+        var fields: [String: String] = ["transcribe": transcribe ? "true" : "false"]
+        if let timelineString = String(data: timeline, encoding: .utf8) {
+            fields["timeline"] = timelineString
+        }
+        if let known = knownVoiceprints, !known.isEmpty {
+            let payload = known.mapValues { $0.map { Double($0) } }
+            // JSONSerialization raises an Objective-C exception (which `try?`
+            // cannot catch) for NaN/infinite numbers, so validate first.
+            if JSONSerialization.isValidJSONObject(payload),
+               let data = try? JSONSerialization.data(withJSONObject: payload),
+               let str = String(data: data, encoding: .utf8) {
+                fields["known_voiceprints"] = str
+            }
+        }
+        if let minOverlap { fields["min_overlap"] = String(minOverlap) }
+        if let voiceprintThreshold { fields["voiceprint_threshold"] = String(voiceprintThreshold) }
+
+        let audio = try Data(contentsOf: audioURL)
+        // Body doesn't change between retries — build it once.
+        let (body, contentType) = Self.multipart(fields: fields, fileField: "file",
+                                                 filename: audioURL.lastPathComponent, fileData: audio)
+
+        var attempt = 0
+        while true {
+            var request = URLRequest(url: url)
+            request.httpMethod = "POST"
+            request.setValue(contentType, forHTTPHeaderField: "Content-Type")
+            request.httpBody = body
+
+            let (data, response) = try await urlSession.data(for: request)
+            guard let http = response as? HTTPURLResponse else {
+                throw SparkControlError.decode("no HTTP response")
+            }
+
+            switch http.statusCode {
+            case 200..<300:
+                do {
+                    return try JSONDecoder().decode(LabelMergeResponse.self, from: data)
+                } catch {
+                    throw SparkControlError.decode(error.localizedDescription)
+                }
+            case 503:
+                attempt += 1
+                if attempt > maxRetries { throw SparkControlError.retriesExhausted }
+                let delay = Self.retryDelayNanoseconds(
+                    retryAfter: http.value(forHTTPHeaderField: "Retry-After"))
+                try await Task.sleep(nanoseconds: delay)
+            case 413:
+                throw SparkControlError.tooLarge
+            default:
+                throw SparkControlError.server(http.statusCode, Self.detail(from: data))
+            }
+        }
+    }
+
+    // MARK: - Helpers
+
+    /// Converts a `Retry-After` header value into a bounded sleep in nanoseconds.
+    /// Missing, non-numeric or non-finite values fall back to the default
+    /// delay; the result is clamped to `1...maxRetryDelay` seconds so the
+    /// `UInt64` conversion can never trap.
+    private static func retryDelayNanoseconds(retryAfter header: String?) -> UInt64 {
+        var seconds = defaultRetryDelay
+        if let header, let parsed = Double(header), parsed.isFinite {
+            seconds = parsed
+        }
+        let clamped = min(max(1, seconds), maxRetryDelay)
+        return UInt64(clamped * 1_000_000_000)
+    }
+
+    /// Extracts the `detail` string from a JSON error body, falling back to the
+    /// raw body text (or `"unknown error"` if it is not UTF-8).
+    private static func detail(from data: Data) -> String {
+        if let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
+           let detail = obj["detail"] as? String { return detail }
+        return String(data: data, encoding: .utf8) ?? "unknown error"
+    }
+
+    /// Percent-escapes the characters that would terminate a quoted
+    /// `Content-Disposition` parameter or inject a new header line.
+    private static func headerSafe(_ value: String) -> String {
+        var escaped = ""
+        // Walk scalars, not Characters: "\r\n" is a single Character in Swift.
+        for scalar in value.unicodeScalars {
+            switch scalar {
+            case "\r": escaped += "%0D"
+            case "\n": escaped += "%0A"
+            case "\"": escaped += "%22"
+            default: escaped.unicodeScalars.append(scalar)
+            }
+        }
+        return escaped
+    }
+
+    /// Builds a `multipart/form-data` body: one part per text field, then the
+    /// file part (declared as `audio/wav`).
+    ///
+    /// - Returns: The body bytes and the matching `Content-Type` header value.
+    private static func multipart(fields: [String: String], fileField: String,
+                                  filename: String, fileData: Data) -> (Data, String) {
+        let boundary = "Boundary-\(UUID().uuidString)"
+        var body = Data()
+        func append(_ s: String) { body.append(contentsOf: s.utf8) }
+
+        for (name, value) in fields {
+            append("--\(boundary)\r\n")
+            append("Content-Disposition: form-data; name=\"\(headerSafe(name))\"\r\n\r\n")
+            append("\(value)\r\n")
+        }
+        append("--\(boundary)\r\n")
+        append("Content-Disposition: form-data; name=\"\(headerSafe(fileField))\"; filename=\"\(headerSafe(filename))\"\r\n")
+        append("Content-Type: audio/wav\r\n\r\n")
+        body.append(fileData)
+        append("\r\n--\(boundary)--\r\n")
+        return (body, "multipart/form-data; boundary=\(boundary)")
+    }
+}