Phases 2-6: detection, visual timeline, backend hand-off, voiceprints

Phase 2 (call detection): CallDetector using CoreAudio per-process mic
attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet,
ignoring our own recording; auto-record toggle. Built; pending live multi-app
confirmation by the user.

Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation,
TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema
1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR +
saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver
(window capture; frames released, never saved; minimized->visual_gap, idle != gap).
Synthetic-frame tested; adapter geometry pending real Signal fixtures + live
VisualObserver validation.

Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential,
TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline
slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated
END-TO-END against the live backend (chunk -> label-merge -> speakers.json).

Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named
fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status,
auto-send toggle (default off) + self-name setting.

All adversarial-review findings fixed. App + XCTest suite build; tests pass.
This commit is contained in:
Grant Gilliam
2026-06-06 00:15:49 -05:00
parent fd7e1a5907
commit 863136aeec
27 changed files with 2108 additions and 22 deletions
@@ -0,0 +1,226 @@
import AppKit
import CoreGraphics
import Combine
/// Detects when the user joins/leaves a call and reports it via callbacks.
///
/// A call is considered active while the mic is running AND `detectApp()` finds a
/// known call app: Zoom/Teams/Signal by bundle ID, or Google Meet via a browser
/// window whose title looks like a Meet call (read via `CGWindowList`, using the
/// Screen Recording permission). On macOS 14+ the app must itself be one of the
/// processes using the mic; on older systems a per-app call-window heuristic is
/// used instead.
///
/// Both transitions are debounced: a call must be seen for `openDelay` before
/// `onCallStart` fires (so a quick unrelated mic use doesn't trigger), and must
/// be gone for `closeDelay` before `onCallEnd` fires.
///
/// Main-actor: all evaluation runs on the main thread.
@MainActor
final class CallDetector: ObservableObject {
    /// A call app this detector can recognize.
    enum DetectedApp: String, Equatable {
        case zoom, teams, signal, meet

        /// Short identifier for the app (the enum's raw value).
        var label: String { rawValue }

        /// Human-readable product name.
        var display: String {
            switch self {
            case .zoom: return "Zoom"
            case .teams: return "Microsoft Teams"
            case .signal: return "Signal"
            case .meet: return "Google Meet"
            }
        }
    }

    /// Observable detector state.
    enum Status: Equatable {
        /// `enable()` has not been called (or `disable()` was called).
        case disabled
        /// Enabled and watching, but no call is currently confirmed.
        case listening
        /// A call in the given app has been confirmed (after the open debounce).
        case inCall(DetectedApp)
    }

    @Published private(set) var status: Status = .disabled

    /// Invoked once when a call is confirmed, with the app resolved at that moment.
    var onCallStart: ((DetectedApp) -> Void)?
    /// Invoked once when a confirmed call is judged to have ended.
    var onCallEnd: (() -> Void)?

    private let mic = MicActivityMonitor()
    private var pollTimer: Timer?
    private var openTimer: Timer?
    private var closeTimer: Timer?
    private var inCall = false
    private var currentApp: DetectedApp?
    private var enabled = false

    /// How long a candidate call must persist before `onCallStart` fires.
    private let openDelay: TimeInterval = 2.0
    /// How long the call must be absent before `onCallEnd` fires.
    private let closeDelay: TimeInterval = 4.0
    /// Interval of the periodic re-evaluation (in addition to mic change events).
    private let pollInterval: TimeInterval = 3.0

    /// Bundle IDs of native call apps, mapped to the app they represent.
    private static let nativeApps: [String: DetectedApp] = [
        "us.zoom.xos": .zoom,
        "com.microsoft.teams2": .teams,
        "com.microsoft.teams": .teams,
        "org.whispersystems.signal-desktop": .signal,
    ]

    /// Bundle IDs of browsers that are checked for a Google Meet window.
    private static let browserIDs: Set<String> = [
        "org.mozilla.firefox", "com.google.Chrome", "com.apple.Safari",
        "company.thebrowser.Browser", "com.brave.Browser", "com.microsoft.edgemac",
    ]

    /// Starts detection: hooks mic change events, starts the poll timer and
    /// evaluates once immediately. No-op if already enabled.
    func enable() {
        guard !enabled else { return }
        enabled = true
        mic.onChange = { [weak self] _ in self?.evaluate() }
        mic.start()
        status = .listening
        pollTimer = Timer.scheduledTimer(withTimeInterval: pollInterval, repeats: true) { [weak self] _ in
            Task { @MainActor in self?.evaluate() }
        }
        evaluate()
    }

    /// Stops detection and resets all state. No-op if not enabled.
    ///
    /// Note: this does NOT invoke `onCallEnd`, even if a call was in progress.
    func disable() {
        guard enabled else { return }
        enabled = false
        mic.stop()
        pollTimer?.invalidate()
        pollTimer = nil
        cancelOpen()
        cancelClose()
        inCall = false
        currentApp = nil
        status = .disabled
    }

    // MARK: - Evaluation

    /// The call app that currently looks active, or `nil` when the mic is idle
    /// or no known call app is detected.
    private func currentCandidate() -> DetectedApp? {
        mic.isRunning ? detectApp() : nil
    }

    /// Re-reads the current state and arms/cancels the debounce timers.
    private func evaluate() {
        guard enabled else { return }
        if let candidate = currentCandidate() {
            cancelClose()
            if inCall {
                // Only publish when the app actually changed; assigning a
                // @Published property notifies observers even for equal values.
                if currentApp != candidate {
                    currentApp = candidate
                    status = .inCall(candidate)
                }
            } else if openTimer == nil {
                openTimer = Timer.scheduledTimer(withTimeInterval: openDelay, repeats: false) { [weak self] _ in
                    Task { @MainActor in self?.fireOpen() }
                }
            }
        } else {
            cancelOpen()
            if inCall && closeTimer == nil {
                closeTimer = Timer.scheduledTimer(withTimeInterval: closeDelay, repeats: false) { [weak self] _ in
                    Task { @MainActor in self?.fireClose() }
                }
            }
        }
    }

    /// Open-debounce expiry: confirms the call if it is still present.
    private func fireOpen() {
        // Invalidate rather than just drop the reference so no pending timer is orphaned.
        cancelOpen()
        // Re-resolve the app at fire time (the debounce window may have changed it).
        guard enabled, !inCall, let app = currentCandidate() else { return }
        inCall = true
        currentApp = app
        status = .inCall(app)
        onCallStart?(app)
    }

    /// Close-debounce expiry: ends the call if it is still absent.
    private func fireClose() {
        cancelClose()
        guard enabled, inCall else { return }
        // Re-verify at fire time, mirroring `fireOpen`: the call may have resumed
        // since the last evaluation, in which case ending it here would produce a
        // spurious stop immediately followed by a new start.
        if let app = currentCandidate() {
            if currentApp != app {
                currentApp = app
                status = .inCall(app)
            }
            return
        }
        inCall = false
        currentApp = nil
        status = .listening
        onCallEnd?()
    }

    private func cancelOpen() {
        openTimer?.invalidate()
        openTimer = nil
    }

    private func cancelClose() {
        closeTimer?.invalidate()
        closeTimer = nil
    }

    // MARK: - App detection

    /// A call is active when a known call app is actually using the mic.
    /// On macOS 14+ we attribute mic usage per-process (robust start AND stop,
    /// works for Signal/Zoom/Teams/Meet, ignores our own recording). On earlier
    /// systems we fall back to the per-app call-window heuristic.
    private func detectApp() -> DetectedApp? {
        if #available(macOS 14.0, *) {
            return detectViaMicAttribution()
        }
        return detectViaWindowTitle()
    }

    /// Returns the first running known call app (other than this process) whose
    /// PID is among the processes currently using the mic.
    @available(macOS 14.0, *)
    private func detectViaMicAttribution() -> DetectedApp? {
        let micPIDs = AudioInputProcesses.micUsingPIDs()
        guard !micPIDs.isEmpty else { return nil }
        let selfPID = NSRunningApplication.current.processIdentifier
        for running in NSWorkspace.shared.runningApplications {
            let pid = running.processIdentifier
            guard pid != selfPID, micPIDs.contains(pid), let bundleID = running.bundleIdentifier else { continue }
            if let native = Self.nativeApps[bundleID] {
                return native // Signal/Zoom/Teams using the mic = in a call
            }
            // A browser using the mic + a Meet window = a Meet call. The mic state
            // gives reliable start/stop; the window check keeps non-Meet browser
            // mic use (other web apps) from being mislabeled as a Meet recording.
            if Self.browserIDs.contains(bundleID), pidHasMeetWindow(pid) {
                return .meet
            }
        }
        return nil
    }

    /// Whether any window owned by `pid` has a title that looks like an active Meet call.
    private func pidHasMeetWindow(_ pid: pid_t) -> Bool {
        guard let windows = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]]
        else { return false }
        return windows.contains { window in
            guard let owner = window[kCGWindowOwnerPID as String] as? pid_t, owner == pid,
                  let title = window[kCGWindowName as String] as? String else { return false }
            return Self.looksLikeMeet(title)
        }
    }

    /// Pre-macOS 14 fallback: detect by the presence of a call WINDOW per app.
    private func detectViaWindowTitle() -> DetectedApp? {
        var pidToApp: [pid_t: DetectedApp] = [:]
        var browserPIDs = Set<pid_t>()
        for running in NSWorkspace.shared.runningApplications {
            guard let bundleID = running.bundleIdentifier else { continue }
            if let native = Self.nativeApps[bundleID] {
                pidToApp[running.processIdentifier] = native
            } else if Self.browserIDs.contains(bundleID) {
                browserPIDs.insert(running.processIdentifier)
            }
        }
        guard !pidToApp.isEmpty || !browserPIDs.isEmpty else { return nil }
        guard let windows = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]]
        else { return nil }
        for window in windows {
            guard let pid = window[kCGWindowOwnerPID as String] as? pid_t,
                  let title = window[kCGWindowName as String] as? String,
                  !title.isEmpty else { continue }
            if browserPIDs.contains(pid), Self.looksLikeMeet(title) { return .meet }
            if let app = pidToApp[pid], Self.isCallWindow(app, title) { return app }
        }
        return nil
    }

    /// Per-app in-call window-title signatures (window-title fallback only).
    /// Matching is case-insensitive.
    private static func isCallWindow(_ app: DetectedApp, _ title: String) -> Bool {
        let t = title.lowercased()
        switch app {
        case .zoom: return t.contains("meeting") // also covers "zoom meeting"
        case .teams: return t.contains("meeting")
        case .signal: return t.contains("signal call") || t.contains("group call")
        case .meet: return false // handled via the browser path above
        }
    }

    /// Match an ACTIVE Google Meet call. Verified against real Firefox titles:
    /// in a call the title is "Meet - <code>" (e.g. "Meet - tjh-pixe-ier"), while
    /// the home/lobby/"you left" pages are bare "Meet" or "Google Meet". Matching
    /// only the "Meet <dash> " form (hyphen, en dash or em dash) is what lets
    /// auto-STOP fire when you leave (and avoids false-starting on the home page
    /// or on unrelated titles that merely start with "Meet "). Also excludes
    /// "Zoom Meeting" etc.
    private static func looksLikeMeet(_ title: String) -> Bool {
        let t = title.lowercased()
        return t.hasPrefix("meet - ") || t.hasPrefix("meet – ") || t.hasPrefix("meet — ")
    }
}