Phases 2-6: detection, visual timeline, backend hand-off, voiceprints
Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
This commit is contained in:
@@ -0,0 +1,179 @@
|
||||
import Foundation
|
||||
|
||||
/// Payload returned by `POST /api/audio/label-merge` (shape verified against the
/// live backend).
///
/// Segments arrive in one of two shapes, both of which decode into `Segment`:
/// - `transcribe=true`: `start_ms` / `end_ms` plus `text`
/// - `transcribe=false`: `start_s` / `end_s` plus `confidence`
struct LabelMergeResponse: Decodable {
    let duration: Double
    let speakers: [Speaker]
    let segments: [Segment]
    let fingerprints: [String: [Float]]
    let models: [String: String]?

    /// Returns `true` for the backend's "unmatched" labels, which are never
    /// persisted as a named voiceprint.
    static func isUnknownName(_ name: String) -> Bool {
        if name == "Speaker_unknown" { return true }
        return name.hasPrefix("Unknown_")
    }

    /// One diarized speaker cluster and the name the backend resolved for it.
    struct Speaker: Decodable {
        let cluster: String
        let name: String
        /// How the name was resolved: `visual`, `voiceprint` or `unmatched`.
        let source: String
        let overlapConfidence: Double?
        let matchSimilarity: Double?
        let fingerprint: [Float]?

        enum CodingKeys: String, CodingKey {
            case cluster
            case name
            case source
            case fingerprint
            case overlapConfidence = "overlap_confidence"
            case matchSimilarity = "match_similarity"
        }
    }

    /// One labelled span of audio. Only the fields of the shape the backend
    /// actually used are non-nil.
    struct Segment: Decodable {
        let startMs: Int?
        let endMs: Int?
        let startS: Double?
        let endS: Double?
        let speaker: String
        let text: String?
        let confidence: Double?

        enum CodingKeys: String, CodingKey {
            case startMs = "start_ms"
            case endMs = "end_ms"
            case startS = "start_s"
            case endS = "end_s"
            case speaker
            case text
            case confidence
        }

        /// Start time in seconds, whichever shape the backend used. The seconds
        /// field wins when both are present; `0` when neither is.
        var startSeconds: Double {
            if let startS { return startS }
            guard let startMs else { return 0 }
            return Double(startMs) / 1000
        }

        /// End time in seconds, resolved the same way as `startSeconds`.
        var endSeconds: Double {
            if let endS { return endS }
            guard let endMs else { return 0 }
            return Double(endMs) / 1000
        }
    }
}
|
||||
|
||||
/// Errors thrown by the SparkControl client, each with a user-presentable
/// description.
enum SparkControlError: Error, LocalizedError {
    /// The configured backend host could not be turned into a request URL.
    case invalidHost
    /// The backend answered 413.
    case tooLarge
    /// Any other non-2xx answer: status code plus the `{"detail": ...}` text.
    case server(Int, String)
    /// The response could not be decoded; carries the underlying message.
    case decode(String)
    /// The backend kept answering 503 until the retry budget ran out.
    case retriesExhausted

    var errorDescription: String? {
        let message: String
        switch self {
        case .invalidHost:
            message = "Invalid backend host URL."
        case .tooLarge:
            message = "Audio chunk exceeds the backend's 200 MB limit."
        case let .server(code, detail):
            message = "Backend error \(code): \(detail)"
        case let .decode(msg):
            message = "Couldn't decode backend response: \(msg)"
        case .retriesExhausted:
            message = "Backend stayed busy (503) after retries."
        }
        return message
    }
}
|
||||
|
||||
/// Talks to SparkControl's `label-merge`. **Callers must invoke sequentially**
/// (one audio request in flight) — concurrent audio requests trip a GPU race
/// (503). The Phase-5 pipeline drives one chunk at a time, satisfying this.
final class SparkControlClient {
    /// Delay, in seconds, used when a 503 carries no usable `Retry-After`.
    private static let defaultRetryDelay: Double = 5
    /// Upper bound, in seconds, on a server-supplied `Retry-After`. Keeps an
    /// absurd header value from stalling the pipeline or overflowing the
    /// nanosecond conversion.
    private static let maxRetryDelay: Double = 600

    private let baseURL: String
    private let urlSession: URLSession

    /// - Parameters:
    ///   - baseURL: Backend root; surrounding whitespace and one trailing `/`
    ///     are stripped.
    ///   - skipTLS: When `true`, the session is given an `InsecureTrustDelegate`.
    init(baseURL: String, skipTLS: Bool) {
        let trimmed = baseURL.trimmingCharacters(in: .whitespacesAndNewlines)
        self.baseURL = trimmed.hasSuffix("/") ? String(trimmed.dropLast()) : trimmed
        let config = URLSessionConfiguration.ephemeral
        config.timeoutIntervalForRequest = 600   // diarization can take up to ~600s
        config.timeoutIntervalForResource = 900
        config.waitsForConnectivity = false
        let delegate: URLSessionDelegate? = skipTLS ? InsecureTrustDelegate() : nil
        self.urlSession = URLSession(configuration: config, delegate: delegate, delegateQueue: nil)
    }

    deinit { urlSession.finishTasksAndInvalidate() }

    /// One `label-merge` call. Retries on 503, honouring `Retry-After` when it
    /// is a usable number of seconds.
    ///
    /// - Parameters:
    ///   - audioURL: Audio file to upload; read fully into memory.
    ///   - timeline: The flat `[{start,end,name,confidence}]` JSON (chunk-local
    ///     seconds). Omitted from the request if it is not valid UTF-8.
    ///   - knownVoiceprints: Sent as `known_voiceprints` when non-empty.
    ///     Vectors containing NaN or infinity are dropped because they cannot
    ///     be represented in JSON.
    ///   - transcribe: Sent as the `transcribe` form field.
    ///   - minOverlap: Sent as `min_overlap` when provided.
    ///   - voiceprintThreshold: Sent as `voiceprint_threshold` when provided.
    ///   - maxRetries: Number of retries allowed after a 503.
    /// - Returns: The decoded response of the first 2xx answer.
    /// - Throws: `SparkControlError` for host, status and decoding failures;
    ///   errors from reading `audioURL`, from `URLSession`, and from a cancelled
    ///   retry sleep propagate unchanged.
    func labelMerge(audioURL: URL,
                    timeline: Data,
                    knownVoiceprints: [String: [Float]]?,
                    transcribe: Bool,
                    minOverlap: Double? = nil,
                    voiceprintThreshold: Double? = nil,
                    maxRetries: Int = 3) async throws -> LabelMergeResponse {
        guard let url = URL(string: baseURL + "/api/audio/label-merge") else {
            throw SparkControlError.invalidHost
        }

        var fields: [String: String] = ["transcribe": transcribe ? "true" : "false"]
        if let timelineString = String(data: timeline, encoding: .utf8) {
            fields["timeline"] = timelineString
        }
        if let known = knownVoiceprints, !known.isEmpty,
           let encoded = Self.voiceprintsJSON(known) {
            fields["known_voiceprints"] = encoded
        }
        if let minOverlap { fields["min_overlap"] = String(minOverlap) }
        if let voiceprintThreshold { fields["voiceprint_threshold"] = String(voiceprintThreshold) }

        let audio = try Data(contentsOf: audioURL)
        // Body doesn't change between retries — build it once.
        let (body, contentType) = Self.multipart(fields: fields, fileField: "file",
                                                 filename: audioURL.lastPathComponent, fileData: audio)

        var attempt = 0
        while true {
            var request = URLRequest(url: url)
            request.httpMethod = "POST"
            request.setValue(contentType, forHTTPHeaderField: "Content-Type")
            request.httpBody = body

            let (data, response) = try await urlSession.data(for: request)
            guard let http = response as? HTTPURLResponse else {
                throw SparkControlError.decode("no HTTP response")
            }

            switch http.statusCode {
            case 200..<300:
                do {
                    return try JSONDecoder().decode(LabelMergeResponse.self, from: data)
                } catch {
                    throw SparkControlError.decode(error.localizedDescription)
                }
            case 503:
                attempt += 1
                if attempt > maxRetries { throw SparkControlError.retriesExhausted }
                let delay = Self.retryDelayNanoseconds(
                    retryAfter: http.value(forHTTPHeaderField: "Retry-After"))
                try await Task.sleep(nanoseconds: delay)
            case 413:
                throw SparkControlError.tooLarge
            default:
                throw SparkControlError.server(http.statusCode, Self.detail(from: data))
            }
        }
    }

    // MARK: - Helpers

    /// Extracts the `detail` string from a JSON error body, falling back to the
    /// raw body text.
    private static func detail(from data: Data) -> String {
        if let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
           let detail = obj["detail"] as? String { return detail }
        return String(data: data, encoding: .utf8) ?? "unknown error"
    }

    /// Converts a `Retry-After` header value into a sleep duration.
    ///
    /// A missing, non-numeric or non-finite value falls back to
    /// `defaultRetryDelay`. The result is clamped to `1...maxRetryDelay` seconds
    /// so the `UInt64` conversion can never trap on a huge or infinite value.
    private static func retryDelayNanoseconds(retryAfter header: String?) -> UInt64 {
        var seconds = defaultRetryDelay
        if let header,
           let parsed = Double(header.trimmingCharacters(in: .whitespaces)),
           parsed.isFinite {
            seconds = parsed
        }
        let clamped = min(max(1, seconds), maxRetryDelay)
        return UInt64(clamped * 1_000_000_000)
    }

    /// Serialises voiceprints for the `known_voiceprints` field.
    ///
    /// Vectors with non-finite components are skipped, and the object is
    /// validated before serialising: `JSONSerialization` raises an exception
    /// (not a Swift error) on invalid input, which `try?` cannot catch.
    /// Returns `nil` when nothing serialisable remains.
    private static func voiceprintsJSON(_ voiceprints: [String: [Float]]) -> String? {
        let object = voiceprints
            .filter { $0.value.allSatisfy(\.isFinite) }
            .mapValues { $0.map { Double($0) } }
        guard !object.isEmpty,
              JSONSerialization.isValidJSONObject(object),
              let data = try? JSONSerialization.data(withJSONObject: object) else { return nil }
        return String(data: data, encoding: .utf8)
    }

    /// Percent-escapes the characters that would terminate or split a quoted
    /// `Content-Disposition` parameter: `"`, CR and LF.
    private static func quotedParameter(_ value: String) -> String {
        var escaped = ""
        // Walk scalars, not Characters: Swift treats "\r\n" as one Character.
        for scalar in value.unicodeScalars {
            switch scalar {
            case "\r": escaped += "%0D"
            case "\n": escaped += "%0A"
            case "\"": escaped += "%22"
            default: escaped.unicodeScalars.append(scalar)
            }
        }
        return escaped
    }

    /// Builds a `multipart/form-data` body: the text fields in key order,
    /// followed by one `audio/wav` file part.
    ///
    /// - Returns: The body and the matching `Content-Type` header value.
    private static func multipart(fields: [String: String], fileField: String,
                                  filename: String, fileData: Data) -> (Data, String) {
        let boundary = "Boundary-\(UUID().uuidString)"
        var body = Data()
        func append(_ s: String) { body.append(Data(s.utf8)) }

        // Sorted so the body is deterministic; dictionary order is not.
        for (name, value) in fields.sorted(by: { $0.key < $1.key }) {
            append("--\(boundary)\r\n")
            append("Content-Disposition: form-data; name=\"\(quotedParameter(name))\"\r\n\r\n")
            append("\(value)\r\n")
        }
        append("--\(boundary)\r\n")
        append("Content-Disposition: form-data; name=\"\(quotedParameter(fileField))\"; "
               + "filename=\"\(quotedParameter(filename))\"\r\n")
        append("Content-Type: audio/wav\r\n\r\n")
        body.append(fileData)
        append("\r\n--\(boundary)--\r\n")
        return (body, "multipart/form-data; boundary=\(boundary)")
    }
}
|
||||
@@ -0,0 +1,104 @@
|
||||
import Foundation
|
||||
|
||||
/// Local persistence of named voiceprints — the compounding-identity layer.
///
/// File `~/Ten31Transcripts/voiceprints.json`:
///   `{ "<name>": { "vector": [192 floats], "updated": <iso>, "calls": <int> } }`
///
/// On send → `knownVoiceprints()` feeds `label-merge`. On response → `update(with:)`
/// stores/refreshes vectors for speakers resolved by **visual** (overlap ≥ ~0.8)
/// or **voiceprint** match. Never stores `Unknown_N` / `Speaker_unknown`.
///
/// Thread-safe (lock-guarded); the sequential pipeline is the only writer.
final class VoiceprintStore {
    /// One persisted voiceprint.
    struct Entry: Codable, Equatable {
        /// The fingerprint vector most recently stored for this name.
        var vector: [Float]
        /// ISO 8601 timestamp of the last `update(with:)` that touched this entry.
        var updated: String
        /// Number of `update(with:)` passes that stored a vector for this name.
        var calls: Int
    }

    private let url: URL
    private let minOverlapToStore: Double
    private let lock = NSLock()
    private var entriesStore: [String: Entry] = [:]

    /// - Parameters:
    ///   - fileURL: Location of the JSON file; loaded immediately if readable.
    ///   - minOverlapToStore: Minimum `overlapConfidence` a `visual` speaker
    ///     needs before its fingerprint is persisted.
    init(fileURL: URL, minOverlapToStore: Double = 0.8) {
        self.url = fileURL
        self.minOverlapToStore = minOverlapToStore
        load()
    }

    /// Snapshot of every stored entry, keyed by name.
    var entries: [String: Entry] {
        lock.lock(); defer { lock.unlock() }
        return entriesStore
    }

    /// Vectors keyed by name, for the `known_voiceprints` field.
    func knownVoiceprints() -> [String: [Float]] {
        lock.lock(); defer { lock.unlock() }
        return entriesStore.mapValues { $0.vector }
    }

    /// Persist fingerprints from a `label-merge` response for confidently-named
    /// speakers only.
    ///
    /// A speaker is stored when its name is not an "unknown" label, its source
    /// is `voiceprint` or a `visual` match at or above `minOverlapToStore`, and
    /// a non-empty vector is available — the speaker's own `fingerprint`, or
    /// else `response.fingerprints` looked up by name. The file is rewritten
    /// after every call.
    func update(with response: LabelMergeResponse) {
        lock.lock(); defer { lock.unlock() }
        let now = ISO8601DateFormatter().string(from: Date())
        for sp in response.speakers {
            guard !Self.isUnknown(sp.name) else { continue }
            let acceptable: Bool
            switch sp.source {
            case "visual": acceptable = (sp.overlapConfidence ?? 0) >= minOverlapToStore
            case "voiceprint": acceptable = true   // already matched a known print
            default: acceptable = false            // unmatched
            }
            guard acceptable, let vector = sp.fingerprint ?? response.fingerprints[sp.name],
                  !vector.isEmpty else { continue }
            var entry = entriesStore[sp.name] ?? Entry(vector: vector, updated: now, calls: 0)
            entry.vector = vector
            entry.updated = now
            entry.calls += 1
            entriesStore[sp.name] = entry
        }
        save()
    }

    /// Moves the entry stored under `old` to `new`, replacing any entry already
    /// under `new`. Does nothing when `old` is not stored.
    func rename(_ old: String, to new: String) {
        lock.lock(); defer { lock.unlock() }
        guard let e = entriesStore.removeValue(forKey: old) else { return }
        entriesStore[new] = e
        save()
    }

    /// Deletes the entry stored under `name`, if any, and rewrites the file.
    func remove(_ name: String) {
        lock.lock(); defer { lock.unlock() }
        entriesStore.removeValue(forKey: name)
        save()
    }

    /// Deletes every entry and rewrites the file.
    func reset() {
        lock.lock(); defer { lock.unlock() }
        entriesStore = [:]
        save()
    }

    // MARK: - Persistence (call with lock held)

    /// Replaces the in-memory entries with the file's contents. A missing or
    /// undecodable file leaves the store empty. Called only from `init`.
    private func load() {
        guard let data = try? Data(contentsOf: url),
              let decoded = try? JSONDecoder().decode([String: Entry].self, from: data) else { return }
        entriesStore = decoded
    }

    /// Best-effort write of all entries as pretty-printed, key-sorted JSON,
    /// creating the parent directory if needed. Failures are ignored.
    private func save() {
        let encoder = JSONEncoder()
        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
        try? FileManager.default.createDirectory(at: url.deletingLastPathComponent(),
                                                 withIntermediateDirectories: true)
        // Atomic: an interrupted in-place write would leave truncated JSON,
        // which `load()` discards and the next `save()` would then overwrite,
        // losing every stored voiceprint.
        if let data = try? encoder.encode(entriesStore) {
            try? data.write(to: url, options: .atomic)
        }
    }

    private static func isUnknown(_ name: String) -> Bool {
        LabelMergeResponse.isUnknownName(name)
    }
}
|
||||
Reference in New Issue
Block a user