From 863136aeecb7dd244a5b133a8d6b9783dfcb08c6 Mon Sep 17 00:00:00 2001 From: Grant Gilliam Date: Sat, 6 Jun 2026 00:15:49 -0500 Subject: [PATCH] Phases 2-6: detection, visual timeline, backend hand-off, voiceprints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass. 
--- Ten31Transcripts/Adapters/SignalAdapter.swift | 31 +++ Ten31Transcripts/Audio/AudioRecorder.swift | 45 ++-- Ten31Transcripts/Audio/Resampler.swift | 7 +- .../Backend/SparkControlClient.swift | 179 ++++++++++++++ .../Backend/VoiceprintStore.swift | 104 ++++++++ .../Detection/AudioInputProcesses.swift | 61 +++++ Ten31Transcripts/Detection/CallDetector.swift | 226 ++++++++++++++++++ .../Detection/MicActivityMonitor.swift | 125 ++++++++++ .../Session/SessionController.swift | 174 +++++++++++++- .../Session/SessionPackager.swift | 85 +++++++ Ten31Transcripts/Session/SpeakersFile.swift | 45 ++++ .../Session/TranscriptAssembler.swift | 78 ++++++ .../Session/TranscriptPipeline.swift | 75 ++++++ Ten31Transcripts/Settings/AppSettings.swift | 22 ++ Ten31Transcripts/UI/MenuBarView.swift | 42 ++++ Ten31Transcripts/UI/SettingsView.swift | 16 ++ Ten31Transcripts/Visual/FrameSampler.swift | 82 +++++++ .../Visual/GridCallAnalyzer.swift | 94 ++++++++ .../Visual/SpeakerObservation.swift | 36 +++ Ten31Transcripts/Visual/TextRecognizer.swift | 59 +++++ Ten31Transcripts/Visual/TimelineBuilder.swift | 127 ++++++++++ Ten31Transcripts/Visual/VisualObserver.swift | 131 ++++++++++ Ten31Transcripts/Visual/VisualTimeline.swift | 72 ++++++ .../GridCallAnalyzerTests.swift | 62 +++++ Ten31TranscriptsTests/Phase5Tests.swift | 47 ++++ .../TimelineBuilderTests.swift | 60 +++++ .../VoiceprintStoreTests.swift | 45 ++++ 27 files changed, 2108 insertions(+), 22 deletions(-) create mode 100644 Ten31Transcripts/Adapters/SignalAdapter.swift create mode 100644 Ten31Transcripts/Backend/SparkControlClient.swift create mode 100644 Ten31Transcripts/Backend/VoiceprintStore.swift create mode 100644 Ten31Transcripts/Detection/AudioInputProcesses.swift create mode 100644 Ten31Transcripts/Detection/CallDetector.swift create mode 100644 Ten31Transcripts/Detection/MicActivityMonitor.swift create mode 100644 Ten31Transcripts/Session/SessionPackager.swift create mode 100644 
Ten31Transcripts/Session/SpeakersFile.swift create mode 100644 Ten31Transcripts/Session/TranscriptAssembler.swift create mode 100644 Ten31Transcripts/Session/TranscriptPipeline.swift create mode 100644 Ten31Transcripts/Visual/FrameSampler.swift create mode 100644 Ten31Transcripts/Visual/GridCallAnalyzer.swift create mode 100644 Ten31Transcripts/Visual/SpeakerObservation.swift create mode 100644 Ten31Transcripts/Visual/TextRecognizer.swift create mode 100644 Ten31Transcripts/Visual/TimelineBuilder.swift create mode 100644 Ten31Transcripts/Visual/VisualObserver.swift create mode 100644 Ten31Transcripts/Visual/VisualTimeline.swift create mode 100644 Ten31TranscriptsTests/GridCallAnalyzerTests.swift create mode 100644 Ten31TranscriptsTests/Phase5Tests.swift create mode 100644 Ten31TranscriptsTests/TimelineBuilderTests.swift create mode 100644 Ten31TranscriptsTests/VoiceprintStoreTests.swift diff --git a/Ten31Transcripts/Adapters/SignalAdapter.swift b/Ten31Transcripts/Adapters/SignalAdapter.swift new file mode 100644 index 0000000..5024b62 --- /dev/null +++ b/Ten31Transcripts/Adapters/SignalAdapter.swift @@ -0,0 +1,31 @@ +import Foundation +import CoreVideo + +/// Signal Desktop adapter. Signal shows avatars/initials with a coloured ring +/// around the active speaker; names may also be available via the Electron +/// Accessibility tree (preferred over OCR when we enable it). Geometry/threshold +/// here are first-pass and will be calibrated against real Signal screenshots. +struct SignalAdapter: AppAdapter { + static let bundleIDs = ["org.whispersystems.signal-desktop"] + let adapterVersion = "signal-0.1.0" + let preferredFPS = 3 + + private let analyzer: GridCallAnalyzer + + init() { + var config = GridCallAnalyzer.Config() + // Signal tiles are squarish with initials centred; tune with fixtures. 
+ config.tileExpandX = 1.6 + config.tileExpandY = 1.8 + self.analyzer = GridCallAnalyzer(config: config) + } + + func analyze(frame: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] { + analyzer.analyze(pixelBuffer: frame, at: t) + } + + // Exposed for fixture/synthetic tests. + func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] { + analyzer.analyze(cgImage: cgImage, at: t) + } +} diff --git a/Ten31Transcripts/Audio/AudioRecorder.swift b/Ten31Transcripts/Audio/AudioRecorder.swift index 9af33a3..e1a7490 100644 --- a/Ten31Transcripts/Audio/AudioRecorder.swift +++ b/Ten31Transcripts/Audio/AudioRecorder.swift @@ -156,31 +156,44 @@ final class AudioRecorder: NSObject, SCStreamDelegate, SCStreamOutput { // MARK: - Ingest (ioQueue only) + /// Write audio CONTINUOUSLY; re-anchor to the timestamp only when drift is a + /// real gap (> ~100 ms), not per-buffer timestamp jitter. Correcting every + /// buffer injects/strips a few samples each time → audible rhythmic glitching. + /// The shared t0 still bounds mic/system skew to the tolerance, well within + /// what the backend merge needs. + private static let driftTolerance: Int64 = 1600 // 100 ms @ 16 kHz + private func ingestMic(_ buffer: AVAudioPCMBuffer, startHost: Double) { guard !tornDown, let writer = micWriter, let vad else { return } - let expected = max(0, Int64(((startHost - t0Host) * 16_000).rounded())) - if expected > writer.framesWritten { - let padded = writer.padSilence(expected - writer.framesWritten) + let drift = max(0, Int64(((startHost - t0Host) * 16_000).rounded())) - writer.framesWritten + var chunk: AVAudioPCMBuffer? 
= buffer + if drift > Self.driftTolerance { // real gap → pad to realign + let padded = writer.padSilence(drift) if padded > 0 { vad.feedSilence(padded) } + } else if drift < -Self.driftTolerance { // far ahead → trim overlap + let trim = Int(-drift) + if trim >= Int(buffer.frameLength) { return } + chunk = Self.trimFront(buffer, by: trim) } - let startIdx = max(0, Int(writer.framesWritten - expected)) - if startIdx >= Int(buffer.frameLength) { return } - guard let chunk = Self.trimFront(buffer, by: startIdx) else { return } - updateLevel(chunk, isMic: true) - if writer.write(chunk) > 0 { vad.feed(chunk) } + guard let out = chunk else { return } + updateLevel(out, isMic: true) + if writer.write(out) > 0 { vad.feed(out) } } private func ingestSystem(_ buffer: AVAudioPCMBuffer, startHost: Double) { guard !tornDown, let writer = systemWriter else { return } - let expected = max(0, Int64(((startHost - t0Host) * 16_000).rounded())) - if expected > writer.framesWritten { - writer.padSilence(expected - writer.framesWritten) + let drift = max(0, Int64(((startHost - t0Host) * 16_000).rounded())) - writer.framesWritten + var chunk: AVAudioPCMBuffer? 
= buffer + if drift > Self.driftTolerance { + writer.padSilence(drift) + } else if drift < -Self.driftTolerance { + let trim = Int(-drift) + if trim >= Int(buffer.frameLength) { return } + chunk = Self.trimFront(buffer, by: trim) } - let startIdx = max(0, Int(writer.framesWritten - expected)) - if startIdx >= Int(buffer.frameLength) { return } - guard let chunk = Self.trimFront(buffer, by: startIdx) else { return } - updateLevel(chunk, isMic: false) - writer.write(chunk) + guard let out = chunk else { return } + updateLevel(out, isMic: false) + writer.write(out) } // MARK: - Mic (AVAudioEngine) diff --git a/Ten31Transcripts/Audio/Resampler.swift b/Ten31Transcripts/Audio/Resampler.swift index 87b53da..747bc74 100644 --- a/Ten31Transcripts/Audio/Resampler.swift +++ b/Ten31Transcripts/Audio/Resampler.swift @@ -24,7 +24,12 @@ final class Resampler { guard !ended, input.frameLength > 0 else { return nil } if converter == nil || sourceFormat != input.format { - converter = AVAudioConverter(from: input.format, to: Self.targetFormat) + let c = AVAudioConverter(from: input.format, to: Self.targetFormat) + // Highest-quality sample-rate conversion: best anti-aliasing on the + // 48k→16k downsample, which avoids harsh artifacts on loud/bright speech. + c?.sampleRateConverterQuality = .max + c?.sampleRateConverterAlgorithm = AVSampleRateConverterAlgorithm_Mastering + converter = c sourceFormat = input.format } guard let converter else { return nil } diff --git a/Ten31Transcripts/Backend/SparkControlClient.swift b/Ten31Transcripts/Backend/SparkControlClient.swift new file mode 100644 index 0000000..f474b6a --- /dev/null +++ b/Ten31Transcripts/Backend/SparkControlClient.swift @@ -0,0 +1,179 @@ +import Foundation + +/// Decoded `POST /api/audio/label-merge` response (verified against the live +/// backend). Handles both `transcribe=true` (start_ms/end_ms + text) and +/// `transcribe=false` (start_s/end_s + confidence) segment shapes. 
+struct LabelMergeResponse: Decodable { + let duration: Double + let speakers: [Speaker] + let segments: [Segment] + let fingerprints: [String: [Float]] + let models: [String: String]? + + /// The backend's "unmatched" labels — never persisted as a named voiceprint. + static func isUnknownName(_ name: String) -> Bool { + name.hasPrefix("Unknown_") || name == "Speaker_unknown" + } + + struct Speaker: Decodable { + let cluster: String + let name: String + let source: String // visual | voiceprint | unmatched + let overlapConfidence: Double? + let matchSimilarity: Double? + let fingerprint: [Float]? + enum CodingKeys: String, CodingKey { + case cluster, name, source, fingerprint + case overlapConfidence = "overlap_confidence" + case matchSimilarity = "match_similarity" + } + } + + struct Segment: Decodable { + let startMs: Int? + let endMs: Int? + let startS: Double? + let endS: Double? + let speaker: String + let text: String? + let confidence: Double? + enum CodingKeys: String, CodingKey { + case startMs = "start_ms" + case endMs = "end_ms" + case startS = "start_s" + case endS = "end_s" + case speaker, text, confidence + } + /// Start time in seconds regardless of which shape the backend used. + var startSeconds: Double { startS ?? startMs.map { Double($0) / 1000 } ?? 0 } + var endSeconds: Double { endS ?? endMs.map { Double($0) / 1000 } ?? 0 } + } +} + +enum SparkControlError: Error, LocalizedError { + case invalidHost + case tooLarge // 413 + case server(Int, String) // other non-2xx with {"detail":...} + case decode(String) + case retriesExhausted + + var errorDescription: String? { + switch self { + case .invalidHost: return "Invalid backend host URL." + case .tooLarge: return "Audio chunk exceeds the backend's 200 MB limit." + case .server(let code, let detail): return "Backend error \(code): \(detail)" + case .decode(let msg): return "Couldn't decode backend response: \(msg)" + case .retriesExhausted: return "Backend stayed busy (503) after retries." 
+ } + } +} + +/// Talks to SparkControl's `label-merge`. **Callers must invoke sequentially** +/// (one audio request in flight) — concurrent audio requests trip a GPU race +/// (503). The Phase-5 pipeline drives one chunk at a time, satisfying this. +final class SparkControlClient { + private let baseURL: String + private let urlSession: URLSession + + init(baseURL: String, skipTLS: Bool) { + let trimmed = baseURL.trimmingCharacters(in: .whitespacesAndNewlines) + self.baseURL = trimmed.hasSuffix("/") ? String(trimmed.dropLast()) : trimmed + let config = URLSessionConfiguration.ephemeral + config.timeoutIntervalForRequest = 600 // diarization can take up to ~600s + config.timeoutIntervalForResource = 900 + config.waitsForConnectivity = false + let delegate: URLSessionDelegate? = skipTLS ? InsecureTrustDelegate() : nil + self.urlSession = URLSession(configuration: config, delegate: delegate, delegateQueue: nil) + } + + deinit { urlSession.finishTasksAndInvalidate() } + + /// One `label-merge` call. `timeline` is the flat `[{start,end,name,confidence}]` + /// JSON (chunk-local seconds). Retries on `503 + Retry-After`. + func labelMerge(audioURL: URL, + timeline: Data, + knownVoiceprints: [String: [Float]]?, + transcribe: Bool, + minOverlap: Double? = nil, + voiceprintThreshold: Double? = nil, + maxRetries: Int = 3) async throws -> LabelMergeResponse { + guard let url = URL(string: baseURL + "/api/audio/label-merge") else { + throw SparkControlError.invalidHost + } + + var fields: [String: String] = ["transcribe": transcribe ? "true" : "false"] + if let timelineString = String(data: timeline, encoding: .utf8) { + fields["timeline"] = timelineString + } + if let known = knownVoiceprints, !known.isEmpty, + let data = try? 
JSONSerialization.data(withJSONObject: known.mapValues { $0.map { Double($0) } }), + let str = String(data: data, encoding: .utf8) { + fields["known_voiceprints"] = str + } + if let minOverlap { fields["min_overlap"] = String(minOverlap) } + if let voiceprintThreshold { fields["voiceprint_threshold"] = String(voiceprintThreshold) } + + let audio = try Data(contentsOf: audioURL) + // Body doesn't change between retries — build it once. + let (body, contentType) = Self.multipart(fields: fields, fileField: "file", + filename: audioURL.lastPathComponent, fileData: audio) + + var attempt = 0 + while true { + var request = URLRequest(url: url) + request.httpMethod = "POST" + request.setValue(contentType, forHTTPHeaderField: "Content-Type") + request.httpBody = body + + let (data, response) = try await urlSession.data(for: request) + guard let http = response as? HTTPURLResponse else { + throw SparkControlError.decode("no HTTP response") + } + + switch http.statusCode { + case 200..<300: + do { + return try JSONDecoder().decode(LabelMergeResponse.self, from: data) + } catch { + throw SparkControlError.decode(error.localizedDescription) + } + case 503: + attempt += 1 + if attempt > maxRetries { throw SparkControlError.retriesExhausted } + let retryAfter = (http.value(forHTTPHeaderField: "Retry-After")).flatMap(Double.init) ?? 5 + try await Task.sleep(nanoseconds: UInt64(max(1, retryAfter) * 1_000_000_000)) + case 413: + throw SparkControlError.tooLarge + default: + throw SparkControlError.server(http.statusCode, Self.detail(from: data)) + } + } + } + + // MARK: - Helpers + + private static func detail(from data: Data) -> String { + if let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any], + let detail = obj["detail"] as? String { return detail } + return String(data: data, encoding: .utf8) ?? 
"unknown error" + } + + private static func multipart(fields: [String: String], fileField: String, + filename: String, fileData: Data) -> (Data, String) { + let boundary = "Boundary-\(UUID().uuidString)" + var body = Data() + func append(_ s: String) { body.append(s.data(using: .utf8)!) } + + for (name, value) in fields { + append("--\(boundary)\r\n") + append("Content-Disposition: form-data; name=\"\(name)\"\r\n\r\n") + append("\(value)\r\n") + } + append("--\(boundary)\r\n") + append("Content-Disposition: form-data; name=\"\(fileField)\"; filename=\"\(filename)\"\r\n") + append("Content-Type: audio/wav\r\n\r\n") + body.append(fileData) + append("\r\n--\(boundary)--\r\n") + return (body, "multipart/form-data; boundary=\(boundary)") + } +} diff --git a/Ten31Transcripts/Backend/VoiceprintStore.swift b/Ten31Transcripts/Backend/VoiceprintStore.swift new file mode 100644 index 0000000..143e154 --- /dev/null +++ b/Ten31Transcripts/Backend/VoiceprintStore.swift @@ -0,0 +1,104 @@ +import Foundation + +/// Local persistence of named voiceprints — the compounding-identity layer. +/// +/// File `~/Ten31Transcripts/voiceprints.json`: +/// `{ "<name>": { "vector": [192 floats], "updated": "<ISO-8601 timestamp>", "calls": <count> } }` +/// +/// On send → `knownVoiceprints()` feeds `label-merge`. On response → `update(with:)` +/// stores/refreshes vectors for speakers resolved by **visual** (overlap ≥ ~0.8) +/// or **voiceprint** match. Never stores `Unknown_N` / `Speaker_unknown`. +/// +/// Thread-safe (lock-guarded); the sequential pipeline is the only writer. 
+final class VoiceprintStore { + struct Entry: Codable, Equatable { + var vector: [Float] + var updated: String + var calls: Int + } + + private let url: URL + private let minOverlapToStore: Double + private let lock = NSLock() + private var entriesStore: [String: Entry] = [:] + + init(fileURL: URL, minOverlapToStore: Double = 0.8) { + self.url = fileURL + self.minOverlapToStore = minOverlapToStore + load() + } + + var entries: [String: Entry] { + lock.lock(); defer { lock.unlock() } + return entriesStore + } + + /// Vectors keyed by name, for the `known_voiceprints` field. + func knownVoiceprints() -> [String: [Float]] { + lock.lock(); defer { lock.unlock() } + return entriesStore.mapValues { $0.vector } + } + + /// Persist fingerprints from a `label-merge` response for confidently-named + /// speakers only. + func update(with response: LabelMergeResponse) { + lock.lock(); defer { lock.unlock() } + let now = ISO8601DateFormatter().string(from: Date()) + for sp in response.speakers { + guard !Self.isUnknown(sp.name) else { continue } + let acceptable: Bool + switch sp.source { + case "visual": acceptable = (sp.overlapConfidence ?? 0) >= minOverlapToStore + case "voiceprint": acceptable = true // already matched a known print + default: acceptable = false // unmatched + } + guard acceptable, let vector = sp.fingerprint ?? response.fingerprints[sp.name], + !vector.isEmpty else { continue } + var entry = entriesStore[sp.name] ?? 
Entry(vector: vector, updated: now, calls: 0) + entry.vector = vector + entry.updated = now + entry.calls += 1 + entriesStore[sp.name] = entry + } + save() + } + + func rename(_ old: String, to new: String) { + lock.lock(); defer { lock.unlock() } + guard let e = entriesStore.removeValue(forKey: old) else { return } + entriesStore[new] = e + save() + } + + func remove(_ name: String) { + lock.lock(); defer { lock.unlock() } + entriesStore.removeValue(forKey: name) + save() + } + + func reset() { + lock.lock(); defer { lock.unlock() } + entriesStore = [:] + save() + } + + // MARK: - Persistence (call with lock held) + + private func load() { + guard let data = try? Data(contentsOf: url), + let decoded = try? JSONDecoder().decode([String: Entry].self, from: data) else { return } + entriesStore = decoded + } + + private func save() { + let encoder = JSONEncoder() + encoder.outputFormatting = [.prettyPrinted, .sortedKeys] + try? FileManager.default.createDirectory(at: url.deletingLastPathComponent(), + withIntermediateDirectories: true) + if let data = try? encoder.encode(entriesStore) { try? data.write(to: url) } + } + + private static func isUnknown(_ name: String) -> Bool { + LabelMergeResponse.isUnknownName(name) + } +} diff --git a/Ten31Transcripts/Detection/AudioInputProcesses.swift b/Ten31Transcripts/Detection/AudioInputProcesses.swift new file mode 100644 index 0000000..7261af3 --- /dev/null +++ b/Ten31Transcripts/Detection/AudioInputProcesses.swift @@ -0,0 +1,61 @@ +import CoreAudio +import Foundation + +/// Lists the PIDs of processes currently using an audio **input** (the mic), via +/// the CoreAudio process-object API (macOS 14+). +/// +/// This is how we attribute mic usage to a *specific* app — e.g. "is Signal in a +/// call?" — which is far more robust than matching window titles, and it works +/// uniformly for Zoom/Teams/Signal and browser calls (Meet). 
It also lets us +/// ignore our own recording: we look at the *call app's* PID, not the global mic, +/// so a call's end is detected even while we keep the mic open. +/// +/// Approach mirrors fastrepl/anarlog's `list_mic_using_apps`. +@available(macOS 14.0, *) +enum AudioInputProcesses { + static func micUsingPIDs() -> Set { + var listAddr = AudioObjectPropertyAddress( + mSelector: kAudioHardwarePropertyProcessObjectList, + mScope: kAudioObjectPropertyScopeGlobal, + mElement: kAudioObjectPropertyElementMain) + + var dataSize: UInt32 = 0 + guard AudioObjectGetPropertyDataSize( + AudioObjectID(kAudioObjectSystemObject), &listAddr, 0, nil, &dataSize) == noErr, + dataSize > 0 else { return [] } + + let count = Int(dataSize) / MemoryLayout.size + var processes = [AudioObjectID](repeating: 0, count: count) + guard AudioObjectGetPropertyData( + AudioObjectID(kAudioObjectSystemObject), &listAddr, 0, nil, &dataSize, &processes) == noErr + else { return [] } + + var pids = Set() + for process in processes where isRunningInput(process) { + if let pid = pid(of: process) { pids.insert(pid) } + } + return pids + } + + private static func isRunningInput(_ process: AudioObjectID) -> Bool { + var addr = AudioObjectPropertyAddress( + mSelector: kAudioProcessPropertyIsRunningInput, + mScope: kAudioObjectPropertyScopeGlobal, + mElement: kAudioObjectPropertyElementMain) + var value: UInt32 = 0 + var size = UInt32(MemoryLayout.size) + guard AudioObjectGetPropertyData(process, &addr, 0, nil, &size, &value) == noErr else { return false } + return value != 0 + } + + private static func pid(of process: AudioObjectID) -> pid_t? 
{ + var addr = AudioObjectPropertyAddress( + mSelector: kAudioProcessPropertyPID, + mScope: kAudioObjectPropertyScopeGlobal, + mElement: kAudioObjectPropertyElementMain) + var value: pid_t = 0 + var size = UInt32(MemoryLayout.size) + guard AudioObjectGetPropertyData(process, &addr, 0, nil, &size, &value) == noErr else { return nil } + return value + } +} diff --git a/Ten31Transcripts/Detection/CallDetector.swift b/Ten31Transcripts/Detection/CallDetector.swift new file mode 100644 index 0000000..7cc8b34 --- /dev/null +++ b/Ten31Transcripts/Detection/CallDetector.swift @@ -0,0 +1,226 @@ +import AppKit +import CoreGraphics +import Combine + +/// Detects when the user joins/leaves a call and reports it via callbacks. +/// +/// Heuristic: the mic is live system-wide AND a known call app is present — +/// Zoom/Teams/Signal by bundle ID, or Google Meet by a browser window whose +/// title looks like a Meet call (read via `CGWindowList`, using the Screen +/// Recording permission). Debounced so a quick unrelated mic use doesn't trigger. +/// +/// Main-actor: all evaluation runs on the main thread. +@MainActor +final class CallDetector: ObservableObject { + + enum DetectedApp: String, Equatable { + case zoom, teams, signal, meet + var label: String { rawValue } + var display: String { + switch self { + case .zoom: return "Zoom" + case .teams: return "Microsoft Teams" + case .signal: return "Signal" + case .meet: return "Google Meet" + } + } + } + + enum Status: Equatable { + case disabled + case listening + case inCall(DetectedApp) + } + + @Published private(set) var status: Status = .disabled + + var onCallStart: ((DetectedApp) -> Void)? + var onCallEnd: (() -> Void)? + + private let mic = MicActivityMonitor() + private var pollTimer: Timer? + private var openTimer: Timer? + private var closeTimer: Timer? + private var inCall = false + private var currentApp: DetectedApp? 
+ private var enabled = false + + private let openDelay: TimeInterval = 2.0 + private let closeDelay: TimeInterval = 4.0 + private let pollInterval: TimeInterval = 3.0 + + private static let nativeApps: [(id: String, app: DetectedApp)] = [ + ("us.zoom.xos", .zoom), + ("com.microsoft.teams2", .teams), + ("com.microsoft.teams", .teams), + ("org.whispersystems.signal-desktop", .signal), + ] + private static let browserIDs: Set = [ + "org.mozilla.firefox", "com.google.Chrome", "com.apple.Safari", + "company.thebrowser.Browser", "com.brave.Browser", "com.microsoft.edgemac", + ] + + func enable() { + guard !enabled else { return } + enabled = true + mic.onChange = { [weak self] _ in self?.evaluate() } + mic.start() + status = .listening + pollTimer = Timer.scheduledTimer(withTimeInterval: pollInterval, repeats: true) { [weak self] _ in + Task { @MainActor in self?.evaluate() } + } + evaluate() + } + + func disable() { + guard enabled else { return } + enabled = false + mic.stop() + pollTimer?.invalidate(); pollTimer = nil + cancelOpen(); cancelClose() + inCall = false + currentApp = nil + status = .disabled + } + + // MARK: - Evaluation + + private func evaluate() { + guard enabled else { return } + let candidate = mic.isRunning ? detectApp() : nil + + if let candidate { + cancelClose() + if inCall { + currentApp = candidate + status = .inCall(candidate) + } else if openTimer == nil { + openTimer = Timer.scheduledTimer(withTimeInterval: openDelay, repeats: false) { [weak self] _ in + Task { @MainActor in self?.fireOpen() } + } + } + } else { + cancelOpen() + if inCall && closeTimer == nil { + closeTimer = Timer.scheduledTimer(withTimeInterval: closeDelay, repeats: false) { [weak self] _ in + Task { @MainActor in self?.fireClose() } + } + } + } + } + + private func fireOpen() { + openTimer = nil + // Re-resolve the app at fire time (the debounce window may have changed it). 
+ guard enabled, mic.isRunning, let app = detectApp(), !inCall else { return } + inCall = true + currentApp = app + status = .inCall(app) + onCallStart?(app) + } + + private func fireClose() { + closeTimer = nil + guard enabled, inCall else { return } + inCall = false + currentApp = nil + status = .listening + onCallEnd?() + } + + private func cancelOpen() { openTimer?.invalidate(); openTimer = nil } + private func cancelClose() { closeTimer?.invalidate(); closeTimer = nil } + + // MARK: - App detection + + /// A call is active when a known call app is actually using the mic. + /// On macOS 14+ we attribute mic usage per-process (robust start AND stop, + /// works for Signal/Zoom/Teams/Meet, ignores our own recording). On macOS 13 + /// we fall back to the per-app call-window heuristic. + private func detectApp() -> DetectedApp? { + if #available(macOS 14.0, *) { + return detectViaMicAttribution() + } + return detectViaWindowTitle() + } + + @available(macOS 14.0, *) + private func detectViaMicAttribution() -> DetectedApp? { + let micPIDs = AudioInputProcesses.micUsingPIDs() + guard !micPIDs.isEmpty else { return nil } + let selfPID = NSRunningApplication.current.processIdentifier + + for app in NSWorkspace.shared.runningApplications { + let pid = app.processIdentifier + guard pid != selfPID, micPIDs.contains(pid), let id = app.bundleIdentifier else { continue } + if let native = Self.nativeApps.first(where: { $0.id == id }) { + return native.app // Signal/Zoom/Teams using the mic = in a call + } + // A browser using the mic + a Meet window = a Meet call. The mic state + // gives reliable start/stop; the window check keeps non-Meet browser + // mic use (other web apps) from being mislabeled as a Meet recording. + if Self.browserIDs.contains(id), pidHasMeetWindow(pid) { + return .meet + } + } + return nil + } + + private func pidHasMeetWindow(_ pid: pid_t) -> Bool { + guard let info = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? 
[[String: Any]] + else { return false } + for w in info { + guard let wpid = w[kCGWindowOwnerPID as String] as? pid_t, wpid == pid, + let title = w[kCGWindowName as String] as? String else { continue } + if Self.looksLikeMeet(title) { return true } + } + return false + } + + /// macOS 13 fallback: detect by the presence of a call WINDOW per app. + private func detectViaWindowTitle() -> DetectedApp? { + var pidToApp: [pid_t: DetectedApp] = [:] + var browserPIDs = Set() + for app in NSWorkspace.shared.runningApplications { + guard let id = app.bundleIdentifier else { continue } + if let native = Self.nativeApps.first(where: { $0.id == id }) { + pidToApp[app.processIdentifier] = native.app + } else if Self.browserIDs.contains(id) { + browserPIDs.insert(app.processIdentifier) + } + } + guard !pidToApp.isEmpty || !browserPIDs.isEmpty else { return nil } + guard let infoList = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]] else { + return nil + } + for info in infoList { + guard let pid = info[kCGWindowOwnerPID as String] as? pid_t, + let title = info[kCGWindowName as String] as? String, + !title.isEmpty else { continue } + if browserPIDs.contains(pid), Self.looksLikeMeet(title) { return .meet } + if let app = pidToApp[pid], Self.isCallWindow(app, title) { return app } + } + return nil + } + + /// Per-app in-call window-title signatures (macOS 13 fallback only). + private static func isCallWindow(_ app: DetectedApp, _ title: String) -> Bool { + let t = title.lowercased() + switch app { + case .zoom: return t.contains("zoom meeting") || t.contains("meeting") + case .teams: return t.contains("meeting") + case .signal: return t.contains("signal call") || t.contains("group call") + case .meet: return false // handled via the browser path above + } + } + + /// Match an ACTIVE Google Meet call. Verified against real Firefox titles: + /// in a call the title is "Meet - <meeting-code>" (e.g. 
"Meet - tjh-pixe-ier"), while + /// the home/lobby/"you left" pages are bare "Meet" or "Google Meet". Matching + /// only the "Meet - …" form is what lets auto-STOP fire when you leave (and + /// avoids false-starting on the home page). Also excludes "Zoom Meeting" etc. + private static func looksLikeMeet(_ title: String) -> Bool { + let t = title.lowercased() + return t.hasPrefix("meet - ") || t.hasPrefix("meet – ") || t.hasPrefix("meet — ") + } +} diff --git a/Ten31Transcripts/Detection/MicActivityMonitor.swift b/Ten31Transcripts/Detection/MicActivityMonitor.swift new file mode 100644 index 0000000..10e492b --- /dev/null +++ b/Ten31Transcripts/Detection/MicActivityMonitor.swift @@ -0,0 +1,125 @@ +import CoreAudio +import Foundation + +/// Watches whether *any* app is using the default input device (the system-wide +/// "mic is live" signal), via CoreAudio property listeners. Re-binds when the +/// default input device changes (e.g. you plug in a headset mid-call). +/// +/// Threading: ALL CoreAudio state (deviceID, listener blocks, `started`) and all +/// Add/Remove calls are confined to the serial `queue`. `isRunning` is written +/// and read only on the main thread (via `deliver`). `onChange` fires on main. +final class MicActivityMonitor { + private(set) var isRunning = false // main-thread only + var onChange: ((Bool) -> Void)? + + private let queue = DispatchQueue(label: "xyz.ten31.micmonitor") + + // queue-confined: + private var deviceID = AudioObjectID(kAudioObjectUnknown) + private var runningBlock: AudioObjectPropertyListenerBlock? + private var defaultDeviceBlock: AudioObjectPropertyListenerBlock? 
+    private var started = false
+    /// Main-thread-confined run counter. `stop()` bumps it so a `deliver` that
+    /// the previous run already queued on the main queue is recognised as stale
+    /// and dropped instead of flipping `isRunning` back after the stop.
+    private var epoch = 0
+    /// Queue-confined copy of `epoch`, captured when the current run began.
+    private var boundEpoch = 0
+
+    private static let runningAddr = AudioObjectPropertyAddress(
+        mSelector: kAudioDevicePropertyDeviceIsRunningSomewhere,
+        mScope: kAudioObjectPropertyScopeGlobal,
+        mElement: kAudioObjectPropertyElementMain)
+
+    private static let defaultDeviceAddr = AudioObjectPropertyAddress(
+        mSelector: kAudioHardwarePropertyDefaultInputDevice,
+        mScope: kAudioObjectPropertyScopeGlobal,
+        mElement: kAudioObjectPropertyElementMain)
+
+    /// Begins monitoring asynchronously on `queue`.
+    /// NOTE(review): reads the main-confined `epoch`, so this is expected to be
+    /// called on the main thread like `stop()` — confirm against the caller.
+    func start() {
+        let epoch = self.epoch
+        queue.async { self.begin(epoch: epoch) }
+    }
+
+    /// Called on the main thread (by the @MainActor CallDetector). Resets
+    /// `isRunning` so a subsequent enable()'s synchronous evaluation can't read a
+    /// stale `true` before the fresh reading arrives. Bumping `epoch` also
+    /// invalidates any delivery still waiting on the main queue.
+    func stop() {
+        queue.sync { self.end() }
+        epoch &+= 1
+        isRunning = false
+    }
+
+    // MARK: - queue-confined
+
+    /// Installs the default-input-device listener and binds the running
+    /// listener. No-op when already started.
+    private func begin(epoch: Int) {
+        guard !started else { return }
+        started = true
+        boundEpoch = epoch
+        var addr = Self.defaultDeviceAddr
+        let block: AudioObjectPropertyListenerBlock = { [weak self] _, _ in
+            self?.rebindRunning()   // delivered on `queue`
+        }
+        let status = AudioObjectAddPropertyListenerBlock(
+            AudioObjectID(kAudioObjectSystemObject), &addr, queue, block)
+        // Remember the block only when it was really installed, so `end()` never
+        // tries to remove a listener that does not exist.
+        if status == noErr { defaultDeviceBlock = block }
+        bindRunning()
+    }
+
+    /// Removes both listeners and marks the monitor stopped.
+    private func end() {
+        started = false
+        if let block = defaultDeviceBlock {
+            var addr = Self.defaultDeviceAddr
+            AudioObjectRemovePropertyListenerBlock(AudioObjectID(kAudioObjectSystemObject), &addr, queue, block)
+            defaultDeviceBlock = nil
+        }
+        unbindRunning()
+    }
+
+    /// Attaches the "is running somewhere" listener to the current default input
+    /// device and delivers an initial reading.
+    private func bindRunning() {
+        guard started else { return }
+        deviceID = Self.defaultInputDevice()
+        guard deviceID != AudioObjectID(kAudioObjectUnknown) else { return }
+        var addr = Self.runningAddr
+        let block: AudioObjectPropertyListenerBlock = { [weak self] _, _ in
+            guard let self else { return }
+            self.deliver(Self.isDeviceRunning(self.deviceID))   // on `queue`
+        }
+        // Install the listener BEFORE the initial read so a flip during setup is
+        // caught (either by the now-installed listener or the post-install read).
+        let status = AudioObjectAddPropertyListenerBlock(deviceID, &addr, queue, block)
+        if status == noErr { runningBlock = block }
+        deliver(Self.isDeviceRunning(deviceID))
+    }
+
+    /// Detaches the running listener (if one was installed) and forgets the device.
+    private func unbindRunning() {
+        if deviceID != AudioObjectID(kAudioObjectUnknown), let block = runningBlock {
+            var addr = Self.runningAddr
+            AudioObjectRemovePropertyListenerBlock(deviceID, &addr, queue, block)
+        }
+        runningBlock = nil
+        deviceID = AudioObjectID(kAudioObjectUnknown)
+    }
+
+    /// Re-targets the running listener after the default input device changed.
+    private func rebindRunning() {
+        guard started else { return }
+        unbindRunning()
+        bindRunning()
+    }
+
+    /// Publishes a reading on the main queue; `onChange` fires only on a flip.
+    /// Called on `queue`.
+    private func deliver(_ running: Bool) {
+        let epoch = boundEpoch
+        DispatchQueue.main.async {
+            // Dropped when `stop()` has run since this reading was taken.
+            guard epoch == self.epoch else { return }
+            let changed = running != self.isRunning
+            self.isRunning = running
+            if changed { self.onChange?(running) }
+        }
+    }
+
+    // MARK: - CoreAudio reads (use local address copies)
+
+    /// Current default input device, or `kAudioObjectUnknown` when the query fails.
+    private static func defaultInputDevice() -> AudioObjectID {
+        var addr = defaultDeviceAddr
+        var device = AudioObjectID(kAudioObjectUnknown)
+        var size = UInt32(MemoryLayout<AudioObjectID>.size)
+        let status = AudioObjectGetPropertyData(
+            AudioObjectID(kAudioObjectSystemObject), &addr, 0, nil, &size, &device)
+        return status == noErr ? device : AudioObjectID(kAudioObjectUnknown)
+    }
+
+    /// Whether `device` reports `kAudioDevicePropertyDeviceIsRunningSomewhere`;
+    /// `false` for an unknown device or a failed read.
+    private static func isDeviceRunning(_ device: AudioObjectID) -> Bool {
+        guard device != AudioObjectID(kAudioObjectUnknown) else { return false }
+        var addr = runningAddr
+        var value: UInt32 = 0
+        var size = UInt32(MemoryLayout<UInt32>.size)
+        let status = AudioObjectGetPropertyData(device, &addr, 0, nil, &size, &value)
+        return status == noErr && value != 0
+    }
+}
diff --git a/Ten31Transcripts/Session/SessionController.swift b/Ten31Transcripts/Session/SessionController.swift
index 2637aa1..6b355b9 100644
--- a/Ten31Transcripts/Session/SessionController.swift
+++ b/Ten31Transcripts/Session/SessionController.swift
@@ -1,6 +1,7 @@
 import Foundation
 import Combine
 import AppKit
+import CoreGraphics
 
 struct SessionInfo: Equatable {
     let folder: URL
@@ -25,6 +26,14 @@ final class SessionController: ObservableObject {
         case error(String)
     }
 
+    /// Backend transcription status for the most recent session.
+    enum TranscriptStatus: Equatable {
+        case idle
+        case processing(Int, Int)   // chunk done, total
+        case done(speakers: Int, segments: Int)
+        case failed(String)
+    }
+
     /// Set in init so `AppDelegate.applicationShouldTerminate` can finalize a
     /// recording in progress before the app quits.
     static weak var shared: SessionController?
@@ -37,12 +46,34 @@ final class SessionController: ObservableObject {
     @Published private(set) var systemLevel: Float = 0
     /// Surfaced after a session if system audio stopped early.
     @Published private(set) var warning: String?
+    /// Mirrored from `CallDetector` for the UI.
+    @Published private(set) var detectionStatus: CallDetector.Status = .disabled
+    /// Backend transcription status for the last session.
+ @Published private(set) var transcriptStatus: TranscriptStatus = .idle private let settings: AppSettings + private var voiceprints: VoiceprintStore + private let detector = CallDetector() + private var cancellables = Set() + private var currentLabel = "manual" + /// Inputs needed to (re)process the last finished session through the backend. + private struct ProcessInputs { + let folder: URL + let sessionId: String + let app: String + let mixedURL: URL + let selfSpans: [VADSpan] + } + private var lastProcess: ProcessInputs? + private var processTask: Task? private var recorder: AudioRecorder? private var currentFolder: URL? private var startTime: Date? private var timer: Timer? + /// True when the current session was started by call detection (not the user). + private var autoStarted = false + /// Set if a detected call ends while we're still in `.starting`. + private var pendingAutoStop = false /// The in-flight start or stop Task, so `prepareForTermination` can await it. private var lifecycleTask: Task? /// Bumped each time a start/stop Task is spawned (Task is a value type, so this @@ -51,7 +82,64 @@ final class SessionController: ObservableObject { init(settings: AppSettings) { self.settings = settings + self.voiceprints = VoiceprintStore( + fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json")) SessionController.shared = self + + detector.onCallStart = { [weak self] app in self?.handleCallStart(app) } + detector.onCallEnd = { [weak self] in self?.handleCallEnd() } + detector.$status + .sink { [weak self] status in self?.detectionStatus = status } + .store(in: &cancellables) + // Re-point the voiceprint DB if the output folder changes. The in-flight + // pipeline keeps its own captured reference, so this can't disrupt a run. 
+ settings.$outputFolderPath + .dropFirst() + .sink { [weak self] path in + guard let self else { return } + let dir = URL(fileURLWithPath: (path as NSString).expandingTildeInPath, isDirectory: true) + self.voiceprints = VoiceprintStore(fileURL: dir.appendingPathComponent("voiceprints.json")) + } + .store(in: &cancellables) + settings.$autoRecordOnDetection + .sink { [weak self] on in + guard let self else { return } + if on { + self.detector.enable() + } else { + self.detector.disable() + // Don't leave an auto-started session running with no detector — + // handle both .recording and the in-flight .starting case. + if self.autoStarted { + switch self.state { + case .recording: self.stop() + case .starting: self.pendingAutoStop = true + default: break + } + } + } + } + .store(in: &cancellables) + } + + // MARK: - Auto-detection + + private func handleCallStart(_ app: CallDetector.DetectedApp) { + guard settings.autoRecordOnDetection else { return } + switch state { + case .idle, .error: start(label: app.label, auto: true) + case .starting, .recording, .finishing: break // don't disturb an active session + } + } + + private func handleCallEnd() { + // Only auto-stop a session we auto-started; never a manual recording. 
+ guard autoStarted else { return } + switch state { + case .recording: stop() + case .starting: pendingAutoStop = true // resolved when start() completes + case .idle, .error, .finishing: break + } } var isBusy: Bool { @@ -68,15 +156,18 @@ final class SessionController: ObservableObject { // MARK: - Start / Stop - private func start() { + private func start(label: String = "manual", auto: Bool = false) { let folder: URL do { - folder = try makeSessionFolder() + folder = try makeSessionFolder(label: label) } catch { fail("Couldn't create session folder: \(error.localizedDescription)") return } currentFolder = folder + currentLabel = label + autoStarted = auto + pendingAutoStop = false let recorder = AudioRecorder( micURL: folder.appendingPathComponent("mic.wav"), systemURL: folder.appendingPathComponent("system.wav"), @@ -92,12 +183,36 @@ final class SessionController: ObservableObject { self.state = .recording self.startTime = Date() self.startTimer() + // A detected call may have ended while we were still starting. + if self.pendingAutoStop { + self.pendingAutoStop = false + self.stop() + } } catch { - self.fail("Couldn't start recording: \(error.localizedDescription)") + self.handleStartFailure(error) } } } + /// Map a recorder start failure to an actionable message. The common case is + /// Screen Recording getting re-checked after a rebuild (the SCStream auth + /// check fails even though CGPreflight reports granted), so re-prompt and open + /// the right Settings pane rather than show a cryptic TCC error. 
+ private func handleStartFailure(_ error: Error) { + let msg = error.localizedDescription.lowercased() + let screenIssue = msg.contains("declined") || msg.contains("tcc") + || msg.contains("screen") || msg.contains("permission") + if screenIssue { + _ = CGRequestScreenCaptureAccess() + if let url = URL(string: "x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture") { + NSWorkspace.shared.open(url) + } + fail("Screen Recording needs re-approval for this build. Toggle Ten31Transcripts off then on in System Settings ▸ Screen Recording, then restart the app.") + } else { + fail("Couldn't start recording: \(error.localizedDescription)") + } + } + private func stop() { guard let recorder else { return } state = .finishing @@ -114,20 +229,66 @@ final class SessionController: ObservableObject { micLevel = 0 systemLevel = 0 warning = result.systemNote.map { "System audio stopped early: \($0)" } + transcriptStatus = .idle if let folder = currentFolder { writeSelfSpans(result, to: folder) lastSession = SessionInfo( folder: folder, mixedURL: result.mixedURL, duration: result.duration, selfSpanCount: result.selfSpans.count) + lastProcess = ProcessInputs( + folder: folder, sessionId: folder.lastPathComponent, app: currentLabel, + mixedURL: result.mixedURL, selfSpans: result.selfSpans) } + let autoSend = settings.autoSendOnStop currentFolder = nil + autoStarted = false + pendingAutoStop = false elapsed = 0 state = .idle + if autoSend { processLastSession() } + } + + // MARK: - Backend transcription + + /// Send the last finished session to the backend → `speakers.json`. Uses the + /// mic-VAD self spans as the timeline for now; visual segments (Phase 3–4) get + /// merged in once the adapters land. Safe to call manually ("Send to backend") + /// or automatically on stop. 
+ func processLastSession() { + guard let inputs = lastProcess else { return } + if case .processing = transcriptStatus { return } + transcriptStatus = .processing(0, 1) + + let settings = self.settings + let voiceprints = self.voiceprints + processTask = Task { + let pipeline = TranscriptPipeline( + baseURL: settings.backendBaseURL, + skipTLS: settings.skipTLSVerification, + voiceprints: voiceprints) + let timeline = TranscriptPipeline.timeline( + fromSelfSpans: inputs.selfSpans, selfName: settings.selfName) + do { + let speakers = try await pipeline.process( + sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app, + mixedURL: inputs.mixedURL, timeline: timeline, + progress: { done, total in + await MainActor.run { self.transcriptStatus = .processing(done, total) } + }) + self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count) + } catch is CancellationError { + self.transcriptStatus = .idle + } catch { + self.transcriptStatus = .failed(error.localizedDescription) + } + } } private func fail(_ message: String) { recorder = nil currentFolder = nil + autoStarted = false + pendingAutoStop = false stopTimer() micLevel = 0 systemLevel = 0 @@ -139,6 +300,9 @@ final class SessionController: ObservableObject { /// its WAV headers are finalized before the process exits. Handles quit while /// `.starting` and `.finishing`, not just `.recording`. func prepareForTermination() async { + // Cancel any in-flight backend transcription (audio is already saved; the + // user can resend). The pipeline's checkCancellation + defer clean up chunks. + processTask?.cancel() // Drain whatever lifecycle Task is in flight until nothing is busy. A Stop // click landing in an await window can spawn a new stop Task, so loop // rather than awaiting a single captured task. 
@@ -178,9 +342,9 @@ final class SessionController: ObservableObject { // MARK: - Files - private func makeSessionFolder() throws -> URL { + private func makeSessionFolder(label: String) throws -> URL { let base = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true) - let folder = base.appendingPathComponent("\(Self.timestamp())_manual", isDirectory: true) + let folder = base.appendingPathComponent("\(Self.timestamp())_\(label)", isDirectory: true) try FileManager.default.createDirectory(at: folder, withIntermediateDirectories: true) return folder } diff --git a/Ten31Transcripts/Session/SessionPackager.swift b/Ten31Transcripts/Session/SessionPackager.swift new file mode 100644 index 0000000..473441c --- /dev/null +++ b/Ten31Transcripts/Session/SessionPackager.swift @@ -0,0 +1,85 @@ +import Foundation +import AVFoundation + +/// Splits a long session into backend-sized chunks and produces, per chunk, the +/// sliced audio and the timeline rebased to chunk-local seconds. +/// +/// The diarizer caps at 4 speakers/chunk and has request limits, so calls > ~3 +/// min are chunked into ~2–3 min windows; names + voiceprints unify speakers +/// across chunks (handled in the pipeline). +enum SessionPackager { + struct PlannedChunk: Equatable { + let index: Int + let start: Double // global seconds + let end: Double + } + + /// One chunk if short; otherwise even ~`chunkSeconds` windows. 
+    ///
+    /// - Parameters:
+    ///   - durationSec: Total session length in seconds.
+    ///   - chunkSeconds: Window length used once the session is chunked.
+    ///   - thresholdSec: Sessions no longer than this stay a single chunk.
+    /// - Returns: Contiguous, index-ordered windows starting at 0.
+    static func planChunks(durationSec: Double,
+                           chunkSeconds: Double = 150,
+                           thresholdSec: Double = 180) -> [PlannedChunk] {
+        // A non-positive (or NaN) window would never advance `start` below and
+        // loop forever, so such input falls back to a single whole-session chunk.
+        guard durationSec > thresholdSec, chunkSeconds > 0 else {
+            return [PlannedChunk(index: 0, start: 0, end: durationSec)]
+        }
+        var chunks: [PlannedChunk] = []
+        var start = 0.0
+        var index = 0
+        while start < durationSec - 0.001 {
+            let end = min(start + chunkSeconds, durationSec)
+            // Window too small to move `start` at this magnitude: stop rather than spin.
+            guard end > start else { break }
+            chunks.append(PlannedChunk(index: index, start: start, end: end))
+            start = end
+            index += 1
+        }
+        return chunks
+    }
+
+    /// Clip segments to `[start, end)` and rebase to chunk-local seconds, then
+    /// emit the flat `label-merge` array `[{start,end,name,confidence}]`.
+    ///
+    /// - Throws: `EncodingError.invalidValue` when a value (e.g. a NaN
+    ///   confidence) cannot be represented in JSON, or any error raised by
+    ///   `JSONSerialization`.
+    static func rebasedTimelineData(_ segments: [VisualTimeline.Segment],
+                                    start: Double, end: Double) throws -> Data {
+        let flat: [[String: Any]] = segments.compactMap { seg in
+            let clippedStart = max(seg.start, start)
+            let clippedEnd = min(seg.end, end)
+            guard clippedEnd > clippedStart else { return nil }
+            return ["start": clippedStart - start, "end": clippedEnd - start,
+                    "name": seg.name, "confidence": seg.confidence]
+        }
+        // `data(withJSONObject:)` raises an Objective-C exception (not a Swift
+        // error) on invalid input, so validate first and fail with a throw.
+        guard JSONSerialization.isValidJSONObject(flat) else {
+            throw EncodingError.invalidValue(flat, EncodingError.Context(
+                codingPath: [], debugDescription: "Timeline contains a value that is not valid JSON"))
+        }
+        return try JSONSerialization.data(withJSONObject: flat, options: [])
+    }
+
+    /// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
+    ///
+    /// Writes nothing (and creates no file) when the requested window is empty
+    /// or lies outside the source.
+    ///
+    /// - Throws: Errors from opening, reading or writing the audio files, or
+    ///   `CocoaError(.fileReadUnknown)` when the transfer buffer cannot be created.
+    static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
+        // Non-finite seconds would trap in the integer conversions below.
+        guard startSec.isFinite, endSec.isFinite else { return }
+        let input = try AVAudioFile(forReading: source)
+        let sr = input.fileFormat.sampleRate
+        // Clamp to the file: a negative start is not a valid frame position.
+        let startFrame = max(0, AVAudioFramePosition((startSec * sr).rounded()))
+        let endFrame = min(input.length, AVAudioFramePosition((endSec * sr).rounded()))
+        guard endFrame > startFrame else { return }
+
+        // One reusable transfer buffer, created before the output file so a
+        // failure here cannot leave a header-only WAV behind.
+        let block: AVAudioFrameCount = 16_000
+        guard let buffer = AVAudioPCMBuffer(pcmFormat: input.processingFormat, frameCapacity: block) else {
+            throw CocoaError(.fileReadUnknown)
+        }
+
+        let settings: [String: Any] = [
+            AVFormatIDKey: kAudioFormatLinearPCM,
+            AVSampleRateKey: sr,
+            AVNumberOfChannelsKey: 1,
+            AVLinearPCMBitDepthKey: 16,
+            AVLinearPCMIsFloatKey: false,
+            AVLinearPCMIsBigEndianKey: false,
+        ]
+        let output = try AVAudioFile(forWriting: dest, settings: settings,
+                                     commonFormat: .pcmFormatFloat32, interleaved: false)
+        input.framePosition = startFrame
+        var remaining = AVAudioFrameCount(endFrame - startFrame)
+        while remaining > 0 {
+            try input.read(into: buffer, frameCount: min(block, remaining))
+            if buffer.frameLength == 0 { break }   // source ended early
+            try output.write(from: buffer)
+            remaining -= buffer.frameLength
+        }
+    }
+
+    /// Duration (seconds) of a WAV; 0 when the file cannot be opened or reports
+    /// a non-positive sample rate.
+    static func duration(of url: URL) -> Double {
+        guard let file = try? AVAudioFile(forReading: url) else { return 0 }
+        let sampleRate = file.fileFormat.sampleRate
+        guard sampleRate > 0 else { return 0 }
+        return Double(file.length) / sampleRate
+    }
+}
diff --git a/Ten31Transcripts/Session/SpeakersFile.swift b/Ten31Transcripts/Session/SpeakersFile.swift
new file mode 100644
index 0000000..4132022
--- /dev/null
+++ b/Ten31Transcripts/Session/SpeakersFile.swift
@@ -0,0 +1,45 @@
+import Foundation
+
+/// `speakers.json` — the final stored output (docs §6): per-chunk `label-merge`
+/// results concatenated, timestamps offset back to global seconds, names unified.
+/// This is the hand-off to the downstream summarizer; the app stops here.
+struct SpeakersFile: Codable {
+    let sessionId: String
+    let app: String
+    let durationSec: Double
+    let speakers: [Speaker]
+    let segments: [Segment]
+    let models: [String: String]
+
+    /// One unified speaker; the optional scores are encoded in snake_case.
+    struct Speaker: Codable, Equatable {
+        let name: String
+        let source: String
+        let overlapConfidence: Double?
+        let matchSimilarity: Double?
+        enum CodingKeys: String, CodingKey {
+            case name, source
+            case overlapConfidence = "overlap_confidence"
+            case matchSimilarity = "match_similarity"
+        }
+    }
+
+    /// One attributed span of the session, with optional transcript text.
+    struct Segment: Codable, Equatable {
+        let start: Double
+        let end: Double
+        let speaker: String
+        let text: String?
+    }
+
+    enum CodingKeys: String, CodingKey {
+        case sessionId = "session_id"
+        case app
+        case durationSec = "duration_sec"
+        case speakers, segments, models
+    }
+
+    /// Encodes the file as pretty-printed JSON with sorted keys and writes it to `url`.
+    ///
+    /// The write is atomic, so an interrupted write cannot leave a truncated
+    /// `speakers.json` for a downstream reader.
+    /// - Throws: Encoding or file-system errors.
+    func write(to url: URL) throws {
+        let encoder = JSONEncoder()
+        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
+        try encoder.encode(self).write(to: url, options: .atomic)
+    }
+}
diff --git a/Ten31Transcripts/Session/TranscriptAssembler.swift b/Ten31Transcripts/Session/TranscriptAssembler.swift
new file mode 100644
index 0000000..bb44a60
--- /dev/null
+++ b/Ten31Transcripts/Session/TranscriptAssembler.swift
@@ -0,0 +1,78 @@
+import Foundation
+
+/// Concatenates per-chunk `label-merge` results into one global `speakers.json`:
+/// segment times offset back to global seconds, speakers unified across chunks by
+/// name, and fingerprints collected for the voiceprint store.
+enum TranscriptAssembler {
+    struct ChunkResult {
+        let chunkStart: Double   // global seconds
+        let response: LabelMergeResponse
+    }
+
+    struct Assembled {
+        let speakersFile: SpeakersFile
+        let fingerprints: [String: [Float]]   // name -> 192-dim, for VoiceprintStore
+    }
+
+    /// Source ranking when the same name appears across chunks with different sources.
+    private static func rank(_ source: String) -> Int {
+        switch source {
+        case "visual": return 3
+        case "voiceprint": return 2
+        default: return 1   // unmatched
+        }
+    }
+
+    /// Whether `name` is one of the backend's placeholder (unidentified) names.
+    private static func isUnknown(_ name: String) -> Bool {
+        LabelMergeResponse.isUnknownName(name)
+    }
+
+    /// Merges per-chunk results into one `SpeakersFile` plus the named
+    /// fingerprints seen along the way.
+    ///
+    /// Segment times are shifted by each chunk's `chunkStart`; for a name seen
+    /// with several sources the highest-ranked source wins; a later chunk's
+    /// fingerprint replaces an earlier one for the same name; `models` comes
+    /// from the last chunk.
+    static func assemble(sessionId: String, app: String, chunks: [ChunkResult]) -> Assembled {
+        var segments: [SpeakersFile.Segment] = []
+        var bestSpeaker: [String: SpeakersFile.Speaker] = [:]
+        var fingerprints: [String: [Float]] = [:]
+        var duration = 0.0
+
+        for chunk in chunks {
+            let offset = chunk.chunkStart
+            // Audio length from the chunk window, so silent/all-unknown calls still
+            // report a real duration (not just the last segment's end).
+            duration = max(duration, offset + chunk.response.duration)
+
+            for seg in chunk.response.segments {
+                let start = seg.startSeconds + offset
+                let end = seg.endSeconds + offset
+                segments.append(.init(start: start, end: end, speaker: seg.speaker, text: seg.text))
+                duration = max(duration, end)
+            }
+
+            for sp in chunk.response.speakers {
+                let candidate = SpeakersFile.Speaker(
+                    name: sp.name, source: sp.source,
+                    overlapConfidence: sp.overlapConfidence, matchSimilarity: sp.matchSimilarity)
+                if let existing = bestSpeaker[sp.name] {
+                    if rank(sp.source) > rank(existing.source) { bestSpeaker[sp.name] = candidate }
+                } else {
+                    bestSpeaker[sp.name] = candidate
+                }
+                // Collect named fingerprints only (never Unknown_N / Speaker_unknown).
+                if !isUnknown(sp.name), let fp = sp.fingerprint, !fp.isEmpty {
+                    fingerprints[sp.name] = fp
+                }
+            }
+            for (name, fp) in chunk.response.fingerprints where !isUnknown(name) && !fp.isEmpty {
+                fingerprints[name] = fp
+            }
+        }
+
+        segments.sort { $0.start < $1.start }
+        let speakers = bestSpeaker.values.sorted { $0.name < $1.name }
+        let models = chunks.last?.response.models ?? [:]
+
+        let file = SpeakersFile(
+            sessionId: sessionId, app: app, durationSec: duration,
+            speakers: speakers, segments: segments, models: models)
+        return Assembled(speakersFile: file, fingerprints: fingerprints)
+    }
+}
diff --git a/Ten31Transcripts/Session/TranscriptPipeline.swift b/Ten31Transcripts/Session/TranscriptPipeline.swift
new file mode 100644
index 0000000..911033a
--- /dev/null
+++ b/Ten31Transcripts/Session/TranscriptPipeline.swift
@@ -0,0 +1,75 @@
+import Foundation
+
+/// Drives a finished session through the backend: chunk → sequential
+/// `label-merge` (accumulating voiceprints) → assemble `speakers.json` → persist
+/// fingerprints. Requests are sequential by construction (one chunk at a time).
+final class TranscriptPipeline {
+    private let client: SparkControlClient
+    private let voiceprints: VoiceprintStore
+
+    init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
+        self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
+        self.voiceprints = voiceprints
+    }
+
+    /// Process `mixedURL` against `timeline` (visual + self spans). Writes
+    /// `speakers.json` into `sessionFolder` and returns it. `progress(done,total)`
+    /// is called per chunk.
+    ///
+    /// - Throws: `CancellationError` when the task is cancelled between chunks,
+    ///   plus any error from creating the scratch folder, slicing audio, the
+    ///   backend request, or writing the result.
+    func process(sessionFolder: URL,
+                 sessionId: String,
+                 app: String,
+                 mixedURL: URL,
+                 timeline: [VisualTimeline.Segment],
+                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
+        let duration = SessionPackager.duration(of: mixedURL)
+        let plan = SessionPackager.planChunks(durationSec: duration)
+        let outputURL = sessionFolder.appendingPathComponent("speakers.json")
+
+        // Zero-duration / empty session → a valid empty speakers.json, no backend call.
+        if plan.isEmpty || duration <= 0 {
+            let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
+            try empty.speakersFile.write(to: outputURL)
+            await progress?(0, 0)
+            return empty.speakersFile
+        }
+
+        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
+        // Surface the real cause here instead of a later, less clear audio-write failure.
+        try FileManager.default.createDirectory(at: chunksDir, withIntermediateDirectories: true)
+        defer { try? FileManager.default.removeItem(at: chunksDir) }   // cleanup on success OR throw
+
+        // Start from stored voiceprints; accumulate this call's prints across chunks
+        // for within-call unification (the store only persists high-confidence ones).
+        var known = voiceprints.knownVoiceprints()
+        var results: [TranscriptAssembler.ChunkResult] = []
+
+        for chunk in plan {
+            try Task.checkCancellation()
+            await progress?(chunk.index, plan.count)
+            let chunkURL = chunksDir.appendingPathComponent("chunk_\(String(format: "%03d", chunk.index)).wav")
+            try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
+            guard FileManager.default.fileExists(atPath: chunkURL.path) else { continue }   // empty slice → skip
+
+            let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
+            let response = try await client.labelMerge(
+                audioURL: chunkURL, timeline: timelineData,
+                knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
+
+            for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
+                known[name] = fp
+            }
+            voiceprints.update(with: response)
+            results.append(.init(chunkStart: chunk.start, response: response))
+            try? FileManager.default.removeItem(at: chunkURL)
+        }
+        await progress?(plan.count, plan.count)
+
+        let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
+        try assembled.speakersFile.write(to: outputURL)
+        return assembled.speakersFile
+    }
+
+    /// Build the `label-merge` timeline from mic-VAD self spans (Phase 1/2). Once
+    /// the visual adapters land (Phase 3–4), their segments are merged in too.
+ static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] { + spans.map { .init(start: $0.start, end: $0.end, name: selfName, confidence: $0.confidence, source: "mic_vad") } + } +} diff --git a/Ten31Transcripts/Settings/AppSettings.swift b/Ten31Transcripts/Settings/AppSettings.swift index 445f899..a2c153b 100644 --- a/Ten31Transcripts/Settings/AppSettings.swift +++ b/Ten31Transcripts/Settings/AppSettings.swift @@ -32,6 +32,21 @@ final class AppSettings: ObservableObject { didSet { defaults.set(adapterEnabled, forKey: Keys.adapterEnabled) } } + @Published var autoRecordOnDetection: Bool { + didSet { defaults.set(autoRecordOnDetection, forKey: Keys.autoRecord) } + } + + /// The user's name, pre-seeded into the timeline for mic-VAD "self" spans. + @Published var selfName: String { + didSet { defaults.set(selfName, forKey: Keys.selfName) } + } + + /// Auto-send a finished recording to the backend for transcription. Default + /// off while developing; flip on for hands-free transcripts. + @Published var autoSendOnStop: Bool { + didSet { defaults.set(autoSendOnStop, forKey: Keys.autoSend) } + } + /// Output folder as a resolved file URL (expands a leading `~`). var outputFolderURL: URL { URL(fileURLWithPath: (outputFolderPath as NSString).expandingTildeInPath, @@ -55,6 +70,10 @@ final class AppSettings: ObservableObject { self.adapterEnabled = stored ?? Dictionary( uniqueKeysWithValues: Self.adapterKeys.map { ($0.key, true) } ) + + self.autoRecordOnDetection = defaults.object(forKey: Keys.autoRecord) as? Bool ?? true + self.selfName = defaults.string(forKey: Keys.selfName) ?? "Me" + self.autoSendOnStop = defaults.object(forKey: Keys.autoSend) as? Bool ?? 
false } private enum Keys { @@ -62,5 +81,8 @@ final class AppSettings: ObservableObject { static let skipTLS = "skipTLSVerification" static let outputFolder = "outputFolderPath" static let adapterEnabled = "adapterEnabled" + static let autoRecord = "autoRecordOnDetection" + static let selfName = "selfName" + static let autoSend = "autoSendOnStop" } } diff --git a/Ten31Transcripts/UI/MenuBarView.swift b/Ten31Transcripts/UI/MenuBarView.swift index 4ee27ef..e6361bc 100644 --- a/Ten31Transcripts/UI/MenuBarView.swift +++ b/Ten31Transcripts/UI/MenuBarView.swift @@ -46,6 +46,9 @@ struct MenuBarView: View { .foregroundStyle(.secondary) } } + Text(detectionText) + .font(.caption) + .foregroundStyle(.secondary) Button { session.toggle() @@ -84,6 +87,15 @@ struct MenuBarView: View { .font(.caption) } .buttonStyle(.link) + + HStack { + Button("Send to backend") { session.processLastSession() } + .disabled(transcriptProcessing) + Spacer() + } + if !transcriptText.isEmpty { + Text(transcriptText).font(.caption).foregroundStyle(transcriptColor) + } } } } @@ -114,6 +126,36 @@ struct MenuBarView: View { return String(format: "%02d:%02d", total / 60, total % 60) } + private var detectionText: String { + switch session.detectionStatus { + case .disabled: return "Auto-detect off" + case .listening: return "Listening for calls…" + case .inCall(let app): return "In call: \(app.display)" + } + } + + private var transcriptProcessing: Bool { + if case .processing = session.transcriptStatus { return true } + return false + } + + private var transcriptText: String { + switch session.transcriptStatus { + case .idle: return "" + case .processing(let d, let t): return "Transcribing… chunk \(d)/\(t)" + case .done(let s, let seg): return "Transcript ready · \(s) speakers · \(seg) segments" + case .failed(let m): return "Transcript failed: \(m)" + } + } + + private var transcriptColor: Color { + switch session.transcriptStatus { + case .failed: return .red + case .done: return .green + default: 
return .secondary + } + } + private var header: some View { VStack(alignment: .leading, spacing: 2) { Text("Ten31 Transcripts").font(.headline) diff --git a/Ten31Transcripts/UI/SettingsView.swift b/Ten31Transcripts/UI/SettingsView.swift index 7d81868..0f2d97f 100644 --- a/Ten31Transcripts/UI/SettingsView.swift +++ b/Ten31Transcripts/UI/SettingsView.swift @@ -14,6 +14,22 @@ struct SettingsView: View { isOn: $settings.skipTLSVerification) } + Section("Call detection") { + Toggle("Auto-record when a call is detected", isOn: $settings.autoRecordOnDetection) + Text("Detects Zoom, Teams, Signal, and Google Meet (any browser).") + .font(.caption) + .foregroundStyle(.secondary) + } + + Section("Transcription") { + TextField("Your name", text: $settings.selfName) + .textFieldStyle(.roundedBorder) + Toggle("Auto-send recordings to backend", isOn: $settings.autoSendOnStop) + Text("Your name labels the mic-VAD \"self\" spans. Auto-send transcribes each recording on stop.") + .font(.caption) + .foregroundStyle(.secondary) + } + Section("Output") { HStack { Text(settings.outputFolderPath) diff --git a/Ten31Transcripts/Visual/FrameSampler.swift b/Ten31Transcripts/Visual/FrameSampler.swift new file mode 100644 index 0000000..bd943c3 --- /dev/null +++ b/Ten31Transcripts/Visual/FrameSampler.swift @@ -0,0 +1,82 @@ +import Foundation +import CoreGraphics + +/// Renders a CGImage to an RGBA8 buffer once, then answers cheap colour queries +/// over pixel regions. Used to score the active-speaker highlight (a saturated +/// coloured border/ring) around participant tiles. 
+struct FrameSampler {
+    let width: Int
+    let height: Int
+    private let pixels: [UInt8]   // RGBA8, row-major, top-left origin
+
+    /// Renders `cgImage` into an RGBA8 buffer. Returns `nil` for an empty image
+    /// or when the bitmap context cannot be created.
+    init?(cgImage: CGImage) {
+        let w = cgImage.width, h = cgImage.height
+        guard w > 0, h > 0 else { return nil }
+        var buffer = [UInt8](repeating: 0, count: w * h * 4)
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        let info = CGImageAlphaInfo.premultipliedLast.rawValue
+        // The pointer is only valid inside the closure, so the context is
+        // created, drawn into and released there — it must never escape.
+        let rendered = buffer.withUnsafeMutableBytes { raw -> Bool in
+            guard let ctx = CGContext(data: raw.baseAddress, width: w, height: h, bitsPerComponent: 8,
+                                      bytesPerRow: w * 4, space: colorSpace, bitmapInfo: info) else {
+                return false
+            }
+            ctx.draw(cgImage, in: CGRect(x: 0, y: 0, width: w, height: h))
+            return true
+        }
+        guard rendered else { return nil }
+        self.width = w
+        self.height = h
+        self.pixels = buffer
+    }
+
+    /// Mean HSV saturation (0…1) over a pixel rect (top-left origin), sampled on a grid.
+    ///
+    /// - Parameters:
+    ///   - rect: Region in pixels; it is clipped to the image.
+    ///   - samples: Target number of samples per axis (values below 1 count as 1).
+    /// - Returns: 0 when the clipped region is empty or `rect` is null/non-finite.
+    func meanSaturation(inPixelRect rect: CGRect, samples: Int = 24) -> Double {
+        // `CGRect.null` and infinite rects have non-finite edges, which would
+        // trap in the integer conversion below.
+        guard rect.minX.isFinite, rect.minY.isFinite,
+              rect.maxX.isFinite, rect.maxY.isFinite else { return 0 }
+        // Clamp before converting so huge finite coordinates cannot overflow Int.
+        func clamped(_ value: CGFloat, upper: Int) -> Int {
+            Int(min(max(value, 0), CGFloat(upper)))
+        }
+        let x0 = clamped(rect.minX, upper: width), x1 = clamped(rect.maxX, upper: width)
+        let y0 = clamped(rect.minY, upper: height), y1 = clamped(rect.maxY, upper: height)
+        guard x1 > x0, y1 > y0 else { return 0 }
+        let perAxis = max(1, samples)   // avoids dividing by zero
+        let stepX = max(1, (x1 - x0) / perAxis)
+        let stepY = max(1, (y1 - y0) / perAxis)
+        var sum = 0.0, count = 0
+        for y in stride(from: y0, to: y1, by: stepY) {
+            for x in stride(from: x0, to: x1, by: stepX) {
+                let i = (y * width + x) * 4
+                let r = Double(pixels[i]), g = Double(pixels[i + 1]), b = Double(pixels[i + 2])
+                let mx = max(r, g, b), mn = min(r, g, b)
+                sum += mx > 0 ? (mx - mn) / mx : 0
+                count += 1
+            }
+        }
+        return count > 0 ? sum / Double(count) : 0
+    }
+
+    /// Mean saturation of a ring just inside `rect`'s edges (the tile border),
+    /// excluding the interior — that's where the speaking highlight lives.
+    ///
+    /// Returns the highest mean saturation among the four edge strips. Each
+    /// strip is at least 2 px thick, otherwise `thicknessFraction` of the
+    /// rect's shorter side.
+    func borderSaturation(inPixelRect rect: CGRect, thicknessFraction: Double = 0.12) -> Double {
+        let t = max(2.0, min(rect.width, rect.height) * thicknessFraction)
+        let strips = [
+            CGRect(x: rect.minX, y: rect.minY, width: rect.width, height: t),       // top
+            CGRect(x: rect.minX, y: rect.maxY - t, width: rect.width, height: t),   // bottom
+            CGRect(x: rect.minX, y: rect.minY, width: t, height: rect.height),      // left
+            CGRect(x: rect.maxX - t, y: rect.minY, width: t, height: rect.height),  // right
+        ]
+        return strips.map { meanSaturation(inPixelRect: $0) }.max() ?? 0
+    }
+
+    /// Grid-sampled pixel positions (top-left origin) that are strongly saturated
+    /// AND bright enough to be a UI highlight — i.e. the speaking ring/border.
+    ///
+    /// - Parameters:
+    ///   - threshold: Saturation a pixel must exceed.
+    ///   - minBrightness: Value (0…255) the brightest channel must exceed.
+    ///   - gridStep: Pixel spacing of the sampling grid; values below 1 count as 1.
+    func saturatedPoints(threshold: Double = 0.5, minBrightness: Double = 60, gridStep: Int = 6) -> [CGPoint] {
+        // A zero step would never advance and a negative one would index out of bounds.
+        let step = max(1, gridStep)
+        var points: [CGPoint] = []
+        for y in stride(from: 0, to: height, by: step) {
+            for x in stride(from: 0, to: width, by: step) {
+                let i = (y * width + x) * 4
+                let r = Double(pixels[i]), g = Double(pixels[i + 1]), b = Double(pixels[i + 2])
+                let mx = max(r, g, b), mn = min(r, g, b)
+                let sat = mx > 0 ? (mx - mn) / mx : 0
+                if sat > threshold && mx > minBrightness { points.append(CGPoint(x: x, y: y)) }
+            }
+        }
+        return points
+    }
+}
diff --git a/Ten31Transcripts/Visual/GridCallAnalyzer.swift b/Ten31Transcripts/Visual/GridCallAnalyzer.swift
new file mode 100644
index 0000000..bf841e1
--- /dev/null
+++ b/Ten31Transcripts/Visual/GridCallAnalyzer.swift
@@ -0,0 +1,94 @@
+import Foundation
+import CoreGraphics
+import CoreVideo
+import CoreImage
+
+/// Shared engine for tile-grid conferencing UIs (Signal/Zoom/Teams): OCR the
+/// name/initials on each tile, then mark the active speaker(s) by the saturated
+/// coloured highlight around their tile.
+///
+/// Geometry (`Config`) is a first pass; the exact tile expansion and saturation
+/// threshold get calibrated per app against real screenshot fixtures.
+/// The detection *logic* (read names; pick the highlighted tile) is validated with
+/// synthetic frames.
+struct GridCallAnalyzer {
+    struct Config {
+        var tileExpandX = 1.8          // grow text bbox → approx tile (for the reported bbox)
+        var tileExpandY = 2.6
+        var minTextConfidence: Float = 0.3
+        var maxNameLength = 40
+        /// Highlight detection: a name is "speaking" if enough strongly-saturated
+        /// highlight pixels sit within `highlightRadiusFraction` of its label.
+        var highlightRadiusFraction = 0.22   // of max(frame W,H)
+        var minHighlightPoints = 6
+        var highlightShareOfMax = 0.35       // must be ≥ this fraction of the busiest tile
+    }
+
+    var config = Config()
+    var recognizer = TextRecognizer()
+
+    /// Analyze a captured frame. Returns no observations when the pixel buffer
+    /// cannot be converted to a `CGImage`.
+    func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
+        guard let cg = Self.cgImage(from: pixelBuffer) else { return [] }
+        return analyze(cgImage: cg, at: t)
+    }
+
+    /// OCR the frame, then flag as speaking every label that has enough saturated
+    /// highlight pixels near it. Returns one observation per accepted text label
+    /// (confidence ≥ `minTextConfidence`, non-empty after `cleaned`), or `[]` when
+    /// no label is accepted or the frame cannot be sampled.
+    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
+        let w = cgImage.width, h = cgImage.height
+        // Clean each recognized string once; drop low-confidence and empty/over-long text.
+        let tiles = recognizer.recognize(in: cgImage).compactMap {
+            r -> (name: String, center: CGPoint, rect: CGRect, conf: Double)? in
+            guard r.confidence >= config.minTextConfidence else { return nil }
+            let name = cleaned(r.text)
+            guard !name.isEmpty else { return nil }
+            let rect = tileRect(r.boundingBox, imageW: w, imageH: h)
+            let cx = r.boundingBox.midX * Double(w)
+            let cy = (1 - r.boundingBox.midY) * Double(h)   // flip Y to top-left origin
+            return (name, CGPoint(x: cx, y: cy), rect, Double(r.confidence))
+        }
+        guard !tiles.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }
+
+        // Find highlight pixels once, then count, per label, those lying within
+        // `radius` of the label centre. A pixel can count toward several labels when
+        // their circles overlap; the share-of-max rule below keeps only the
+        // dominant label(s).
+        let points = sampler.saturatedPoints()
+        let radius = Double(max(w, h)) * config.highlightRadiusFraction
+        let r2 = radius * radius
+        let counts = tiles.map { tile -> Int in
+            points.reduce(0) { acc, p in
+                let dx = Double(p.x) - tile.center.x, dy = Double(p.y) - tile.center.y
+                return acc + (dx * dx + dy * dy <= r2 ? 1 : 0)
+            }
+        }
+        let maxCount = counts.max() ?? 0
+        let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax))
+
+        return tiles.enumerated().map { idx, tile in
+            let speaking = maxCount >= config.minHighlightPoints && counts[idx] >= need
+            return SpeakerObservation(name: tile.name, speaking: speaking,
+                                      bbox: tile.rect, confidence: tile.conf, t: t)
+        }
+    }
+
+    /// Vision normalized bbox (bottom-left origin) → pixel tile rect (top-left),
+    /// expanded around the text centre to approximate the whole tile and clipped
+    /// to the image. Returns `.zero` if the expanded rect misses the image entirely,
+    /// so callers never see `CGRect.null`'s infinite origin.
+    private func tileRect(_ box: CGRect, imageW: Int, imageH: Int) -> CGRect {
+        let W = Double(imageW), H = Double(imageH)
+        let pw = box.width * W
+        let ph = box.height * H
+        let cx = (box.midX) * W
+        let cy = (1 - box.midY) * H   // flip Y to top-left origin
+        let nw = pw * config.tileExpandX
+        let nh = ph * config.tileExpandY
+        let rect = CGRect(x: cx - nw / 2, y: cy - nh / 2, width: nw, height: nh)
+        let clipped = rect.intersection(CGRect(x: 0, y: 0, width: W, height: H))
+        return clipped.isNull ? .zero : clipped
+    }
+
+    /// Trimmed text, or "" when it is longer than `maxNameLength` (too long to be a name).
+    private func cleaned(_ s: String) -> String {
+        let t = s.trimmingCharacters(in: .whitespacesAndNewlines)
+        return t.count <= config.maxNameLength ? t : ""
+    }
+
+    private static let ciContext = CIContext()
+
+    static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
+        let ci = CIImage(cvPixelBuffer: pixelBuffer)
+        return ciContext.createCGImage(ci, from: ci.extent)   // reuse; allocating per frame is costly
+    }
+}
diff --git a/Ten31Transcripts/Visual/SpeakerObservation.swift b/Ten31Transcripts/Visual/SpeakerObservation.swift
new file mode 100644
index 0000000..6f5eb2a
--- /dev/null
+++ b/Ten31Transcripts/Visual/SpeakerObservation.swift
@@ -0,0 +1,36 @@
+import Foundation
+import CoreGraphics
+import CoreVideo
+
+/// One per-frame observation from an app adapter: a participant tile, whether its
+/// active-speaker cue is showing, and where it is. `name` may be a full name,
+/// initials (Signal), or "" if unknown. `t` is seconds relative to the session t0.
+struct SpeakerObservation: Equatable {
+    let name: String
+    let speaking: Bool
+    let bbox: CGRect
+    let confidence: Double   // 0…1
+    let t: TimeInterval
+}
+
+/// Per-app screen-reading strategy. Each conferencing app gets one implementation
+/// that knows that app's tile layout, name placement, and active-speaker cue.
+/// Adapters must be testable offline against still-image fixtures.
+protocol AppAdapter {
+    static var bundleIDs: [String] { get }
+    var adapterVersion: String { get }
+    var preferredFPS: Int { get }
+
+    /// Analyze one frame; return the speakers visible and whether each is speaking.
+    /// Must process in-memory and never persist the frame.
+    func analyze(frame: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation]
+
+    /// Optional: participant names from the app's Accessibility tree (Electron
+    /// apps like Signal expose these), preferred over OCR when available.
+    func namesFromAccessibility() -> [String]?
+}
+
+/// Defaults: no Accessibility names, and 3 frames per second.
+extension AppAdapter {
+    func namesFromAccessibility() -> [String]? { nil }
+    var preferredFPS: Int { 3 }
+}
diff --git a/Ten31Transcripts/Visual/TextRecognizer.swift b/Ten31Transcripts/Visual/TextRecognizer.swift
new file mode 100644
index 0000000..c19787b
--- /dev/null
+++ b/Ten31Transcripts/Visual/TextRecognizer.swift
@@ -0,0 +1,59 @@
+import Foundation
+import Vision
+import CoreVideo
+import CoreGraphics
+
+/// Thin wrapper over Vision's text recognition, used by adapters to read names /
+/// initials off participant tiles. Runs on the Neural Engine; no permission
+/// needed. Works on any frame, so adapters can be developed against still images.
+struct TextRecognizer {
+    struct Result {
+        let text: String
+        let confidence: Float
+        /// Normalized Vision bounding box (origin bottom-left, 0…1).
+        let boundingBox: CGRect
+    }
+
+    var recognitionLevel: VNRequestTextRecognitionLevel = .accurate
+    var minimumTextHeight: Float = 0        // 0 = Vision default
+    var usesLanguageCorrection = false      // names/initials aren't dictionary words
+
+    /// Recognize text in `pixelBuffer`, optionally limited to a normalized region
+    /// of interest (origin bottom-left, matching Vision's coordinate space).
+    /// Returns `[]` when Vision fails or finds nothing.
+    func recognize(in pixelBuffer: CVPixelBuffer, regionOfInterest: CGRect? = nil) -> [Result] {
+        run(VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:]),
+            regionOfInterest: regionOfInterest)
+    }
+
+    /// Convenience for fixtures/tests: recognize text in a CGImage.
+    /// Returns `[]` when Vision fails or finds nothing.
+    func recognize(in cgImage: CGImage, regionOfInterest: CGRect? = nil) -> [Result] {
+        run(VNImageRequestHandler(cgImage: cgImage, options: [:]),
+            regionOfInterest: regionOfInterest)
+    }
+
+    /// Shared path for both entry points: configure one text request from this
+    /// recognizer's settings, perform it on `handler`, and keep the top candidate
+    /// of each observation. A failed request is deliberately reported as "no text".
+    private func run(_ handler: VNImageRequestHandler, regionOfInterest: CGRect?) -> [Result] {
+        let request = VNRecognizeTextRequest()
+        request.recognitionLevel = recognitionLevel
+        request.usesLanguageCorrection = usesLanguageCorrection
+        if minimumTextHeight > 0 { request.minimumTextHeight = minimumTextHeight }
+        if let roi = regionOfInterest { request.regionOfInterest = roi }
+
+        guard (try? handler.perform([request])) != nil, let observations = request.results else { return [] }
+        return observations.compactMap { obs in
+            guard let top = obs.topCandidates(1).first else { return nil }
+            return Result(text: top.string, confidence: top.confidence, boundingBox: obs.boundingBox)
+        }
+    }
+}
diff --git a/Ten31Transcripts/Visual/TimelineBuilder.swift b/Ten31Transcripts/Visual/TimelineBuilder.swift
new file mode 100644
index 0000000..162eb2b
--- /dev/null
+++ b/Ten31Transcripts/Visual/TimelineBuilder.swift
@@ -0,0 +1,127 @@
+import Foundation
+
+/// Turns noisy per-frame `SpeakerObservation`s into clean
+/// `(start, end, name, confidence)` segments.
+///
+/// - Hysteresis: open a segment after `openFrames` consecutive speaking frames,
+///   close after `closeFrames` quiet frames — rides out UI-cue lag/flicker.
+/// - Overlaps allowed: each name is tracked independently (crosstalk).
+/// - mic-VAD "self" spans are merged in as high-confidence segments.
+/// - OCR name variants are normalized via an alias table.
+///
+/// Pure logic, no UI/capture deps — fully unit-testable offline.
+final class TimelineBuilder {
+    private let openFrames: Int
+    private let closeFrames: Int
+    private var aliases: [String: String] = [:]   // normalized variant -> canonical
+    /// Per-name hysteresis state. Only names with an open segment or a pending
+    /// voiced run are kept, so one-off OCR misreads don't accumulate over a long call.
+    private var states: [String: NameState] = [:]
+    private(set) var segments: [VisualTimeline.Segment] = []
+
+    /// Both thresholds are clamped to at least 1 frame.
+    init(openFrames: Int = 2, closeFrames: Int = 2) {
+        self.openFrames = max(1, openFrames)
+        self.closeFrames = max(1, closeFrames)
+    }
+
+    /// Register that `variant` (e.g. "Sarah J") should map to `canonical`
+    /// (e.g. "Sarah Jones"). Matching ignores case and surrounding whitespace.
+    func addAlias(_ variant: String, canonical: String) {
+        aliases[Self.normalize(variant)] = canonical
+    }
+
+    /// Ingest one frame's observations (all sharing time `t`). Names not present
+    /// (or present but not speaking) count as a quiet frame for any open segment.
+    func ingest(_ observations: [SpeakerObservation], at t: TimeInterval) {
+        // Best confidence per canonical name that is speaking this frame.
+        var speaking: [String: Double] = [:]
+        for obs in observations where obs.speaking && !obs.name.isEmpty {
+            let name = canonical(obs.name)
+            // Whitespace-only OCR text would otherwise become an empty-named segment.
+            guard !name.isEmpty else { continue }
+            speaking[name] = max(speaking[name] ?? 0, obs.confidence)
+        }
+
+        let names = Set(states.keys).union(speaking.keys)
+        for name in names {
+            var st = states[name] ?? NameState()
+            if let conf = speaking[name] {
+                if st.voiced == 0 { st.runStart = t }
+                st.voiced += 1
+                st.silent = 0
+                st.lastVoicedT = t
+                if !st.open && st.voiced >= openFrames {
+                    // Backdate the segment to the first frame of the voiced run.
+                    st.open = true
+                    st.segStart = st.runStart
+                    st.confSum = 0
+                    st.confN = 0
+                }
+                if st.open { st.confSum += conf; st.confN += 1 }
+            } else {
+                st.silent += 1
+                st.voiced = 0
+                if st.open && st.silent >= closeFrames {
+                    closeSegment(name: name, state: st)
+                    st.open = false
+                }
+            }
+            // A closed state with no voiced run behaves exactly like a fresh one
+            // (every field it keeps is overwritten on the next voiced frame), so
+            // drop it instead of re-visiting it on every later frame.
+            states[name] = (st.open || st.voiced > 0) ? st : nil
+        }
+    }
+
+    /// Merge mic-VAD self spans (the user) as high-confidence segments.
+    /// Empty or inverted spans are ignored.
+    func mergeSelfSpans(_ spans: [VADSpan], selfName: String) {
+        for span in spans where span.end > span.start {
+            segments.append(.init(start: span.start, end: span.end,
+                                  name: selfName, confidence: span.confidence, source: "mic_vad"))
+        }
+    }
+
+    /// Force-close any open segments (used when a visual gap begins, so a segment
+    /// isn't carried across the gap). Each segment ends at its last *voiced* frame,
+    /// not at `t`; `t` is accepted for call-site symmetry and is currently unused.
+    func closeOpenSegments(at t: TimeInterval) {
+        for (name, st) in states where st.open {
+            closeSegment(name: name, state: st)
+            // Fully reset: the next voiced frame starts a fresh run.
+            states[name] = nil
+        }
+    }
+
+    /// Close any still-open segments at end of capture and sort all segments by start.
+    func finish() {
+        for (name, st) in states where st.open {
+            closeSegment(name: name, state: st)
+            states[name]?.open = false
+        }
+        segments.sort { $0.start < $1.start }
+    }
+
+    // MARK: - Internal
+
+    private struct NameState {
+        var voiced = 0                 // consecutive speaking frames in the current run
+        var silent = 0                 // consecutive quiet frames
+        var open = false               // a segment is currently open for this name
+        var runStart: Double = 0       // time of the first frame of the current voiced run
+        var segStart: Double = 0       // start time of the open segment
+        var lastVoicedT: Double = 0    // time of the most recent speaking frame
+        var confSum: Double = 0        // sum of confidences since the segment opened
+        var confN = 0                  // number of confidences in `confSum`
+    }
+
+    /// Append the segment described by `st`, ending at its last voiced frame.
+    /// Zero-length segments (a single voiced frame) are dropped.
+    private func closeSegment(name: String, state st: NameState) {
+        guard st.lastVoicedT > st.segStart else { return }
+        let confidence = st.confN > 0 ? st.confSum / Double(st.confN) : 0.8
+        segments.append(.init(start: st.segStart, end: st.lastVoicedT,
+                              name: name, confidence: confidence, source: "vision"))
+    }
+
+    /// Alias-table lookup; unknown names are returned trimmed but otherwise unchanged.
+    private func canonical(_ raw: String) -> String {
+        let key = Self.normalize(raw)
+        return aliases[key] ?? raw.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    private static func normalize(_ s: String) -> String {
+        s.lowercased().trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+}
diff --git a/Ten31Transcripts/Visual/VisualObserver.swift b/Ten31Transcripts/Visual/VisualObserver.swift
new file mode 100644
index 0000000..4e54559
--- /dev/null
+++ b/Ten31Transcripts/Visual/VisualObserver.swift
@@ -0,0 +1,131 @@
+import Foundation
+import ScreenCaptureKit
+import CoreMedia
+import QuartzCore
+import AppKit
+
+/// Window-scoped visual capture: streams the call window's own rendered content
+/// at ~`fps`, hands each frame to the app adapter, and **releases it immediately
+/// — frames are never written to disk**. Builds the speaker timeline and records
+/// `visual_gap`s when the window is minimized (SCK delivers non-live frames).
+///
+/// Window visibility/focus is NOT required — SCK captures a window even when it's
+/// occluded or on another Space; only minimization freezes the backing buffer.
+@available(macOS 13.0, *)
+final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
+    private let bundleID: String
+    private let adapter: any AppAdapter
+    private let t0Host: Double
+    private let fps: Int
+    private let queue = DispatchQueue(label: "xyz.ten31.visual")
+
+    private var stream: SCStream?
+    // `builder`, `gaps` and `gapStart` are only touched on `queue`.
+    private let builder = TimelineBuilder()
+    private var gaps: [VisualTimeline.Gap] = []
+    private var gapStart: Double?
+
+    /// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
+    /// Invoked on the capture queue.
+    var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?
+
+    /// - Parameters:
+    ///   - t0Host: session start on the `CACurrentMediaTime()` clock; all reported
+    ///     times are relative to it.
+    ///   - fps: sampling rate, clamped to at least 1.
+    init(bundleID: String, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
+        self.bundleID = bundleID
+        self.adapter = adapter
+        self.t0Host = t0Host
+        self.fps = max(1, fps)
+    }
+
+    /// Find the target app's largest window and start streaming it.
+    /// - Throws: an `NSError` (domain "Ten31", code 2) when the app has no window,
+    ///   or whatever ScreenCaptureKit throws while enumerating/starting.
+    func start() async throws {
+        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
+        // The call window: the largest window owned by the target app.
+        let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }
+        guard let window = candidates.max(by: { $0.frame.width * $0.frame.height < $1.frame.width * $1.frame.height }) else {
+            throw NSError(domain: "Ten31", code: 2,
+                          userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
+        }
+
+        let filter = SCContentFilter(desktopIndependentWindow: window)
+        let config = SCStreamConfiguration()
+        config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(fps))
+        config.queueDepth = 3
+        config.showsCursor = false
+        config.pixelFormat = kCVPixelFormatType_32BGRA
+        // window.frame is in points; capture at native pixels so OCR can read small
+        // initials/names (a half-res Retina capture badly hurts recognition).
+        let scale = NSScreen.main?.backingScaleFactor ?? 2
+        config.width = max(2, Int(window.frame.width * scale))
+        config.height = max(2, Int(window.frame.height * scale))
+
+        let stream = SCStream(filter: filter, configuration: config, delegate: self)
+        try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: queue)
+        try await stream.startCapture()
+        self.stream = stream
+    }
+
+    /// Stop capture, close a still-open gap and any open segments, and return the
+    /// accumulated timeline. A failure to stop the stream is ignored (best effort).
+    func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
+        if let stream { try? await stream.stopCapture() }
+        stream = nil
+        return queue.sync {
+            if let gs = gapStart {
+                gaps.append(.init(start: gs, end: CACurrentMediaTime() - t0Host, reason: "minimized"))
+                gapStart = nil
+            }
+            builder.finish()
+            return (builder.segments, gaps)
+        }
+    }
+
+    /// Merge mic-VAD self spans into the visual timeline (call before `stop`'s read,
+    /// or fold in afterwards in the packager).
+    func addSelfSpans(_ spans: [VADSpan], selfName: String) {
+        queue.sync { builder.mergeSelfSpans(spans, selfName: selfName) }
+    }
+
+    // MARK: - SCStreamOutput (on `queue`)
+
+    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
+                of type: SCStreamOutputType) {
+        guard type == .screen, CMSampleBufferDataIsReady(sampleBuffer) else { return }
+        let now = CACurrentMediaTime() - t0Host
+
+        switch frameKind(sampleBuffer) {
+        case .idle:
+            // Window is live but static (no pixel change) — no new info, not a gap.
+            return
+        case .gap:
+            // Minimized/blanked: the backing buffer is frozen. Open a gap once and
+            // close any open speaker segments so none is carried across it.
+            if gapStart == nil {
+                gapStart = now
+                builder.closeOpenSegments(at: now)
+            }
+            return
+        case .live:
+            if let gs = gapStart {
+                gaps.append(.init(start: gs, end: now, reason: "minimized"))
+                gapStart = nil
+            }
+            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
+            let observations = adapter.analyze(frame: pixelBuffer, at: now)   // frame released after this scope
+            builder.ingest(observations, at: now)
+            onObservations?(observations, now)
+        }
+    }
+
+    // NOTE(review): a stream that stops with an error is ignored here, so visual
+    // coverage ends silently without a gap being recorded — confirm this is intended.
+    func stream(_ stream: SCStream, didStopWithError error: Error) {}
+
+    private enum FrameKind { case live, idle, gap }
+
+    /// Classify a frame from its `SCStreamFrameInfo.status` attachment: `.complete`
+    /// is new content, `.idle` is a static (but visible) window, and
+    /// `.blank`/`.suspended`/`.stopped` mean the content is frozen. A frame with
+    /// no readable status is treated as live.
+    private func frameKind(_ sampleBuffer: CMSampleBuffer) -> FrameKind {
+        guard let attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, createIfNecessary: false)
+                as? [[SCStreamFrameInfo: Any]],
+              let raw = attachments.first?[.status] as? Int,
+              let status = SCFrameStatus(rawValue: raw) else { return .live }
+        switch status {
+        case .complete:
+            return .live
+        case .idle, .started:
+            // `.started` marks the first frame after the stream starts. It carries
+            // no speaker information and must not open a "minimized" gap at t≈0.
+            return .idle
+        case .blank, .suspended, .stopped:
+            return .gap
+        @unknown default:
+            // Conservative: a status this build doesn't know is not trusted as live.
+            return .gap
+        }
+    }
+}
diff --git a/Ten31Transcripts/Visual/VisualTimeline.swift b/Ten31Transcripts/Visual/VisualTimeline.swift
new file mode 100644
index 0000000..aa74988
--- /dev/null
+++ b/Ten31Transcripts/Visual/VisualTimeline.swift
@@ -0,0 +1,72 @@
+import Foundation
+
+/// `visual_timeline.json` (schema 1.1) — the app's primary visual output. Times
+/// are seconds relative to session t0. Segments may overlap (crosstalk).
+struct VisualTimeline: Codable {
+    var schemaVersion = "1.1"
+    let sessionId: String
+    let app: String
+    let adapterVersion: String
+    let t0Unix: Double
+    let durationSec: Double
+    let fpsSampled: Int
+    let selfName: String?
+    let participants: [Participant]
+    let segments: [Segment]
+    let visualGaps: [Gap]
+
+    struct Participant: Codable {
+        let name: String
+        let isSelf: Bool?
+        let aliases: [String]?
+
+        enum CodingKeys: String, CodingKey {
+            case name
+            case isSelf = "is_self"
+            case aliases
+        }
+    }
+
+    struct Segment: Codable, Equatable {
+        let start: Double
+        let end: Double
+        let name: String
+        let confidence: Double
+        let source: String   // vision | accessibility | fused | mic_vad
+    }
+
+    struct Gap: Codable, Equatable {
+        let start: Double
+        let end: Double
+        let reason: String   // minimized | tab_switched
+    }
+
+    enum CodingKeys: String, CodingKey {
+        case schemaVersion = "schema_version"
+        case sessionId = "session_id"
+        case app
+        case adapterVersion = "adapter_version"
+        case t0Unix = "t0_unix"
+        case durationSec = "duration_sec"
+        case fpsSampled = "fps_sampled"
+        case selfName = "self_name"
+        case participants
+        case segments
+        case visualGaps = "visual_gaps"
+    }
+
+    /// Write the rich `visual_timeline.json` (pretty-printed, keys sorted).
+    /// The write is atomic, so an interrupted write never leaves a truncated file.
+    /// - Throws: encoding or file-system errors.
+    func write(to url: URL) throws {
+        let encoder = JSONEncoder()
+        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
+        try encoder.encode(self).write(to: url, options: .atomic)
+    }
+
+    /// The flat array `label-merge` wants: `[{start,end,name,confidence}]`,
+    /// dropping `source`. Slice/rebase to chunk-local seconds happens in Phase 5.
+    func flatTimelineData() throws -> Data {
+        let flat = segments.map { seg -> [String: Any] in
+            ["start": seg.start, "end": seg.end, "name": seg.name, "confidence": seg.confidence]
+        }
+        return try JSONSerialization.data(withJSONObject: flat, options: [])
+    }
+}
diff --git a/Ten31TranscriptsTests/GridCallAnalyzerTests.swift b/Ten31TranscriptsTests/GridCallAnalyzerTests.swift
new file mode 100644
index 0000000..471cd88
--- /dev/null
+++ b/Ten31TranscriptsTests/GridCallAnalyzerTests.swift
@@ -0,0 +1,62 @@
+import XCTest
+import CoreGraphics
+import CoreText
+@testable import Ten31Transcripts
+
+/// Validates the visual adapter against synthetic call frames (no real
+/// screenshots needed): OCR anchors the tiles and the highlight is attributed to
+/// the correct speaker, tracking it as it moves.
+final class GridCallAnalyzerTests: XCTestCase {
+
+    /// Draws `s` in white bold Helvetica, centred on `center`.
+    private func drawText(_ s: String, _ ctx: CGContext, center: CGPoint, size: CGFloat) {
+        let font = CTFontCreateWithName("Helvetica-Bold" as CFString, size, nil)
+        let white = CGColor(red: 1, green: 1, blue: 1, alpha: 1)
+        let attrs = [kCTFontAttributeName: font,
+                     kCTForegroundColorAttributeName: white] as CFDictionary
+        let attributed = CFAttributedStringCreate(nil, s as CFString, attrs)!
+        let line = CTLineCreateWithAttributedString(attributed)
+        let bounds = CTLineGetBoundsWithOptions(line, [])
+        ctx.textPosition = CGPoint(x: center.x - bounds.width / 2, y: center.y - bounds.height / 2)
+        CTLineDraw(line, ctx)
+    }
+
+    /// Synthetic 800×600 call frame: four named tiles on a dark background, with
+    /// a green highlight stroked inside the tile at `speakingIndex`.
+    private func frame(speakingIndex: Int) -> CGImage {
+        let W = 800, H = 600
+        let ctx = CGContext(data: nil, width: W, height: H, bitsPerComponent: 8, bytesPerRow: 0,
+                            space: CGColorSpaceCreateDeviceRGB(),
+                            bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue)!
+        ctx.setFillColor(CGColor(red: 0.1, green: 0.1, blue: 0.12, alpha: 1))
+        ctx.fill(CGRect(x: 0, y: 0, width: W, height: H))
+
+        let tiles: [(String, CGRect)] = [
+            ("GRANT", CGRect(x: 40, y: 320, width: 340, height: 230)),
+            ("SARAH", CGRect(x: 420, y: 320, width: 340, height: 230)),
+            ("DMITRI", CGRect(x: 40, y: 50, width: 340, height: 230)),
+            ("ALEX", CGRect(x: 420, y: 50, width: 340, height: 230)),
+        ]
+        for (index, tile) in tiles.enumerated() {
+            let (label, tileRect) = tile
+            ctx.setFillColor(CGColor(red: 0.18, green: 0.18, blue: 0.2, alpha: 1))
+            ctx.fill(tileRect)
+            if index == speakingIndex {
+                ctx.setStrokeColor(CGColor(red: 0.1, green: 0.85, blue: 0.2, alpha: 1))
+                ctx.setLineWidth(14)
+                ctx.stroke(tileRect.insetBy(dx: 7, dy: 7))
+            }
+            drawText(label, ctx, center: CGPoint(x: tileRect.midX, y: tileRect.midY), size: 54)
+        }
+        return ctx.makeImage()!
+    }
+
+    func testReadsNamesAndPicksHighlightedSpeaker() {
+        let observations = SignalAdapter().analyze(cgImage: frame(speakingIndex: 1), at: 0)   // SARAH
+        XCTAssertGreaterThanOrEqual(observations.count, 2)
+        let active = observations.filter { $0.speaking }
+        XCTAssertEqual(active.count, 1)
+        // SARAH tile center in top-left pixels ≈ (590, 165)
+        XCTAssertEqual(active.first?.bbox.midX ?? 0, 590, accuracy: 160)
+        XCTAssertEqual(active.first?.bbox.midY ?? 0, 165, accuracy: 160)
+    }
+
+    func testHighlightTracksToAnotherTile() {
+        let observations = SignalAdapter().analyze(cgImage: frame(speakingIndex: 2), at: 1)   // DMITRI
+        let active = observations.filter { $0.speaking }
+        XCTAssertEqual(active.count, 1)
+        XCTAssertEqual(active.first?.bbox.midX ?? 0, 210, accuracy: 160)
+        XCTAssertEqual(active.first?.bbox.midY ?? 0, 435, accuracy: 160)
+    }
+}
diff --git a/Ten31TranscriptsTests/Phase5Tests.swift b/Ten31TranscriptsTests/Phase5Tests.swift
new file mode 100644
index 0000000..bc52674
--- /dev/null
+++ b/Ten31TranscriptsTests/Phase5Tests.swift
@@ -0,0 +1,47 @@
+import XCTest
+@testable import Ten31Transcripts
+
+/// Backend hand-off: chunk planning, timeline slice/rebase, transcript assembly.
+final class Phase5Tests: XCTestCase {
+    func testPlanChunksShort() {
+        let chunks = SessionPackager.planChunks(durationSec: 70)
+        XCTAssertEqual(chunks.count, 1)
+        XCTAssertEqual(chunks[0].end, 70, accuracy: 0.001)
+    }
+
+    func testPlanChunksLong() {
+        let chunks = SessionPackager.planChunks(durationSec: 400, chunkSeconds: 150)
+        XCTAssertEqual(chunks.count, 3)
+        XCTAssertEqual(chunks[0].start, 0)
+        XCTAssertEqual(chunks[0].end, 150)
+        XCTAssertEqual(chunks[1].start, 150)
+        XCTAssertEqual(chunks[2].end, 400)
+    }
+
+    func testRebaseClipsAndRebases() throws {
+        let input = [
+            VisualTimeline.Segment(start: 140, end: 160, name: "A", confidence: 0.9, source: "vision"),
+            VisualTimeline.Segment(start: 200, end: 260, name: "B", confidence: 0.8, source: "vision"),
+        ]
+        let data = try SessionPackager.rebasedTimelineData(input, start: 150, end: 300)
+        let rebased = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [[String: Any]])
+        XCTAssertEqual(rebased.count, 2)
+        // First segment is clipped at the chunk start, then both are shifted by -150.
+        XCTAssertEqual(rebased[0]["start"] as? Double, 0)
+        XCTAssertEqual(rebased[0]["end"] as? Double, 10)
+        XCTAssertEqual(rebased[1]["start"] as? Double, 50)
+        XCTAssertEqual(rebased[1]["end"] as? Double, 110)
+    }
+
+    func testAssembleOffsetsAndUnifies() throws {
+        let resp0 = #"{"duration":150,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1,0.2]}],"segments":[{"start_ms":1000,"end_ms":2000,"speaker":"Grant","text":"hi"}],"fingerprints":{"Grant":[0.1,0.2]},"models":{"diarization":"x"}}"#
+        let resp1 = #"{"duration":100,"speakers":[{"cluster":"Speaker_0","name":"Sarah","source":"voiceprint","match_similarity":0.7,"fingerprint":[0.3,0.4]},{"cluster":"Speaker_1","name":"Unknown_0","source":"unmatched"}],"segments":[{"start_ms":500,"end_ms":1500,"speaker":"Sarah","text":"hello"}],"fingerprints":{"Sarah":[0.3,0.4]},"models":{"diarization":"x"}}"#
+        let decoder = JSONDecoder()
+        let first = try decoder.decode(LabelMergeResponse.self, from: Data(resp0.utf8))
+        let second = try decoder.decode(LabelMergeResponse.self, from: Data(resp1.utf8))
+        let assembled = TranscriptAssembler.assemble(sessionId: "s", app: "meet",
+            chunks: [.init(chunkStart: 0, response: first), .init(chunkStart: 150, response: second)])
+        let file = assembled.speakersFile
+        XCTAssertEqual(file.segments.count, 2)
+        XCTAssertEqual(file.segments[0].start, 1, accuracy: 0.001)
+        XCTAssertEqual(file.segments[1].start, 150.5, accuracy: 0.001)
+        XCTAssertEqual(file.speakers.count, 3)
+        XCTAssertNotNil(assembled.fingerprints["Grant"])
+        XCTAssertNotNil(assembled.fingerprints["Sarah"])
+        XCTAssertNil(assembled.fingerprints["Unknown_0"])
+    }
+}
diff --git a/Ten31TranscriptsTests/TimelineBuilderTests.swift b/Ten31TranscriptsTests/TimelineBuilderTests.swift
new file mode 100644
index 0000000..c38fa8e
--- /dev/null
+++ b/Ten31TranscriptsTests/TimelineBuilderTests.swift
@@ -0,0 +1,60 @@
+import XCTest
+@testable import Ten31Transcripts
+
+/// Hysteresis, overlap, self-span merging and alias handling of `TimelineBuilder`.
+final class TimelineBuilderTests: XCTestCase {
+    /// Shorthand for an observation with an empty bbox.
+    private func obs(_ name: String, _ speaking: Bool, _ t: Double, _ conf: Double = 0.9) -> SpeakerObservation {
+        SpeakerObservation(name: name, speaking: speaking, bbox: .zero, confidence: conf, t: t)
+    }
+
+    func testOpensAfterKFramesAndClosesAfterMQuiet() {
+        let builder = TimelineBuilder(openFrames: 2, closeFrames: 2)
+        builder.ingest([obs("A", true, 0)], at: 0)
+        builder.ingest([obs("A", true, 1)], at: 1)
+        builder.ingest([obs("A", true, 2)], at: 2)
+        builder.ingest([], at: 3)
+        builder.ingest([], at: 4)
+        builder.finish()
+        XCTAssertEqual(builder.segments.count, 1)
+        let segment = builder.segments.first
+        XCTAssertEqual(segment?.name, "A")
+        XCTAssertEqual(segment?.start ?? -1, 0, accuracy: 0.001)
+        XCTAssertEqual(segment?.end ?? -1, 2, accuracy: 0.001)
+        XCTAssertEqual(segment?.source, "vision")
+    }
+
+    func testSingleFlickerDoesNotOpen() {
+        let builder = TimelineBuilder(openFrames: 2, closeFrames: 2)
+        builder.ingest([obs("A", true, 0)], at: 0)
+        builder.ingest([], at: 1)
+        builder.finish()
+        XCTAssertTrue(builder.segments.isEmpty)
+    }
+
+    func testAllowsOverlap() {
+        let builder = TimelineBuilder(openFrames: 1, closeFrames: 1)
+        builder.ingest([obs("A", true, 0), obs("B", true, 0)], at: 0)
+        builder.ingest([obs("A", true, 1), obs("B", true, 1)], at: 1)
+        builder.ingest([], at: 2)
+        builder.finish()
+        XCTAssertEqual(builder.segments.count, 2)
+        XCTAssertEqual(Set(builder.segments.map { $0.name }), ["A", "B"])
+    }
+
+    func testMergesSelfSpans() {
+        let builder = TimelineBuilder()
+        builder.mergeSelfSpans([VADSpan(start: 0, end: 4.5, confidence: 0.97)], selfName: "Grant")
+        builder.finish()
+        XCTAssertEqual(builder.segments.count, 1)
+        XCTAssertEqual(builder.segments.first?.name, "Grant")
+        XCTAssertEqual(builder.segments.first?.source, "mic_vad")
+    }
+
+    func testNormalizesAlias() {
+        let builder = TimelineBuilder(openFrames: 1, closeFrames: 1)
+        builder.addAlias("Sarah J", canonical: "Sarah Jones")
+        builder.ingest([obs("Sarah J", true, 0)], at: 0)
+        builder.ingest([obs("Sarah J", true, 1)], at: 1)
+        builder.ingest([], at: 2)
+        builder.finish()
+        XCTAssertEqual(builder.segments.first?.name, "Sarah Jones")
+    }
+}
diff --git a/Ten31TranscriptsTests/VoiceprintStoreTests.swift b/Ten31TranscriptsTests/VoiceprintStoreTests.swift
new file mode 100644
index 0000000..7833dc0
--- /dev/null
+++ b/Ten31TranscriptsTests/VoiceprintStoreTests.swift
@@ -0,0 +1,45 @@
+import XCTest
+@testable import Ten31Transcripts
+
+/// Persistence and filtering rules of `VoiceprintStore`, each test on its own temp file.
+final class VoiceprintStoreTests: XCTestCase {
+    /// A unique JSON path in the temporary directory.
+    private func tempURL() -> URL {
+        FileManager.default.temporaryDirectory.appendingPathComponent("vp_\(UUID().uuidString).json")
+    }
+
+    /// Fixture response with four speakers: Grant, Sarah, Bob and Unknown_0.
+    private func response() throws -> LabelMergeResponse {
+        let json = #"{"duration":10,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1,0.2,0.3]},{"cluster":"Speaker_1","name":"Sarah","source":"voiceprint","match_similarity":0.7,"fingerprint":[0.4,0.5,0.6]},{"cluster":"Speaker_2","name":"Bob","source":"visual","overlap_confidence":0.5,"fingerprint":[0.7,0.8,0.9]},{"cluster":"Speaker_3","name":"Unknown_0","source":"unmatched"}],"segments":[],"fingerprints":{"Grant":[0.1,0.2,0.3],"Sarah":[0.4,0.5,0.6]},"models":{}}"#
+        return try JSONDecoder().decode(LabelMergeResponse.self, from: Data(json.utf8))
+    }
+
+    func testStoresOnlyConfidentNamedSpeakers() throws {
+        let url = tempURL()
+        defer { try? FileManager.default.removeItem(at: url) }
+        let store = VoiceprintStore(fileURL: url)
+        store.update(with: try response())
+        XCTAssertNotNil(store.entries["Grant"])   // visual, high overlap
+        XCTAssertNotNil(store.entries["Sarah"])   // voiceprint match
+        XCTAssertNil(store.entries["Bob"])        // overlap 0.5 < 0.8
+        XCTAssertNil(store.entries["Unknown_0"])
+        XCTAssertEqual(store.knownVoiceprints()["Grant"], [0.1, 0.2, 0.3])
+        XCTAssertEqual(store.entries["Grant"]?.calls, 1)
+    }
+
+    func testPersistsAcrossInstancesAndIncrementsCalls() throws {
+        let url = tempURL()
+        defer { try? FileManager.default.removeItem(at: url) }
+        let store = VoiceprintStore(fileURL: url)
+        store.update(with: try response())
+        store.update(with: try response())
+        XCTAssertEqual(store.entries["Grant"]?.calls, 2)
+        let reopened = VoiceprintStore(fileURL: url)
+        XCTAssertEqual(reopened.knownVoiceprints().count, 2)
+    }
+
+    func testRenameRemoveReset() throws {
+        let url = tempURL()
+        defer { try? FileManager.default.removeItem(at: url) }
+        let store = VoiceprintStore(fileURL: url)
+        store.update(with: try response())
+        store.rename("Sarah", to: "Sarah Jones")
+        XCTAssertNotNil(store.entries["Sarah Jones"])
+        XCTAssertNil(store.entries["Sarah"])
+        store.remove("Grant")
+        XCTAssertNil(store.entries["Grant"])
+        store.reset()
+        XCTAssertTrue(store.entries.isEmpty)
+    }
+}