Phases 2-6: detection, visual timeline, backend hand-off, voiceprints
Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
This commit is contained in:
@@ -0,0 +1,61 @@
|
||||
import CoreAudio
|
||||
import Foundation
|
||||
|
||||
/// Lists the PIDs of processes currently using an audio **input** (the mic), via
/// the CoreAudio process-object API (macOS 14+).
///
/// This is how we attribute mic usage to a *specific* app — e.g. "is Signal in a
/// call?" — which is far more robust than matching window titles, and it works
/// uniformly for Zoom/Teams/Signal and browser calls (Meet). It also lets us
/// ignore our own recording: we look at the *call app's* PID, not the global mic,
/// so a call's end is detected even while we keep the mic open.
///
/// Approach mirrors fastrepl/anarlog's `list_mic_using_apps`.
@available(macOS 14.0, *)
enum AudioInputProcesses {
    /// Returns the PIDs of every CoreAudio process object that reports a running
    /// input stream.
    ///
    /// Any CoreAudio failure yields an empty set (best effort: the caller treats
    /// "nobody is using the mic" and "could not tell" the same way).
    static func micUsingPIDs() -> Set<pid_t> {
        let system = AudioObjectID(kAudioObjectSystemObject)
        var listAddr = AudioObjectPropertyAddress(
            mSelector: kAudioHardwarePropertyProcessObjectList,
            mScope: kAudioObjectPropertyScopeGlobal,
            mElement: kAudioObjectPropertyElementMain)

        var dataSize: UInt32 = 0
        guard AudioObjectGetPropertyDataSize(system, &listAddr, 0, nil, &dataSize) == noErr,
              dataSize > 0 else { return [] }

        // Never hand CoreAudio a buffer smaller than the size we tell it about.
        let idSize = MemoryLayout<AudioObjectID>.size
        let capacity = Int(dataSize) / idSize
        guard capacity > 0 else { return [] }

        var processes = [AudioObjectID](repeating: AudioObjectID(kAudioObjectUnknown), count: capacity)
        dataSize = UInt32(capacity * idSize)
        guard AudioObjectGetPropertyData(system, &listAddr, 0, nil, &dataSize, &processes) == noErr
        else { return [] }

        // `dataSize` now holds the bytes actually written. The process list can
        // shrink between the size query and the fetch; only look at real entries
        // instead of probing the zero-filled tail.
        let filled = min(capacity, Int(dataSize) / idSize)

        return Set(
            processes.prefix(filled)
                .filter { isRunningInput($0) }
                .compactMap { pid(of: $0) })
    }

    /// Whether the given process object reports `kAudioProcessPropertyIsRunningInput`.
    /// A failed read counts as "not running".
    private static func isRunningInput(_ process: AudioObjectID) -> Bool {
        var addr = AudioObjectPropertyAddress(
            mSelector: kAudioProcessPropertyIsRunningInput,
            mScope: kAudioObjectPropertyScopeGlobal,
            mElement: kAudioObjectPropertyElementMain)
        var value: UInt32 = 0
        var size = UInt32(MemoryLayout<UInt32>.size)
        guard AudioObjectGetPropertyData(process, &addr, 0, nil, &size, &value) == noErr else { return false }
        return value != 0
    }

    /// The PID owning the given process object, or `nil` if it cannot be read.
    private static func pid(of process: AudioObjectID) -> pid_t? {
        var addr = AudioObjectPropertyAddress(
            mSelector: kAudioProcessPropertyPID,
            mScope: kAudioObjectPropertyScopeGlobal,
            mElement: kAudioObjectPropertyElementMain)
        var value: pid_t = 0
        var size = UInt32(MemoryLayout<pid_t>.size)
        guard AudioObjectGetPropertyData(process, &addr, 0, nil, &size, &value) == noErr else { return nil }
        return value
    }
}
|
||||
@@ -0,0 +1,226 @@
|
||||
import AppKit
|
||||
import CoreGraphics
|
||||
import Combine
|
||||
|
||||
/// Detects when the user joins/leaves a call and reports it via callbacks.
///
/// Heuristic: the default input device is live AND a known call app is in a
/// call. On macOS 14+ "in a call" means the app's own process is using the mic
/// (Zoom/Teams/Signal by bundle ID; Google Meet = a browser using the mic that
/// also owns a Meet-looking window). On older systems we fall back to window
/// titles read via `CGWindowList` (using the Screen Recording permission).
/// Start and end are both debounced so a quick unrelated mic use doesn't trigger.
///
/// NOTE(review): every evaluation is gated on `MicActivityMonitor.isRunning`,
/// which watches the *default* input device — a call app capturing from a
/// non-default device while the default one is idle would presumably be missed;
/// confirm with live testing.
///
/// Main-actor: all evaluation runs on the main thread.
@MainActor
final class CallDetector: ObservableObject {

    /// The call apps we know how to recognise.
    enum DetectedApp: String, Equatable {
        case zoom, teams, signal, meet

        /// Machine-readable label (the raw case name).
        var label: String { rawValue }

        /// Human-readable product name.
        var display: String {
            switch self {
            case .zoom: return "Zoom"
            case .teams: return "Microsoft Teams"
            case .signal: return "Signal"
            case .meet: return "Google Meet"
            }
        }
    }

    /// Published detector state.
    enum Status: Equatable {
        case disabled
        case listening
        case inCall(DetectedApp)
    }

    @Published private(set) var status: Status = .disabled

    /// Invoked once when a call is considered started (after `openDelay`).
    var onCallStart: ((DetectedApp) -> Void)?
    /// Invoked once when a call is considered ended (after `closeDelay`).
    var onCallEnd: (() -> Void)?

    private let mic = MicActivityMonitor()
    private var pollTimer: Timer?
    private var openTimer: Timer?
    private var closeTimer: Timer?
    private var inCall = false
    private var currentApp: DetectedApp?
    private var enabled = false

    /// Debounce before a detected call is reported as started.
    private let openDelay: TimeInterval = 2.0
    /// Debounce before a vanished call is reported as ended.
    private let closeDelay: TimeInterval = 4.0
    /// Periodic re-evaluation; needed because not every transition produces a
    /// mic on/off event (e.g. the mic stays open while we are recording).
    private let pollInterval: TimeInterval = 3.0

    private static let nativeApps: [(id: String, app: DetectedApp)] = [
        ("us.zoom.xos", .zoom),
        ("com.microsoft.teams2", .teams),
        ("com.microsoft.teams", .teams),
        ("org.whispersystems.signal-desktop", .signal),
    ]

    private static let browserIDs: Set<String> = [
        "org.mozilla.firefox", "com.google.Chrome", "com.apple.Safari",
        "company.thebrowser.Browser", "com.brave.Browser", "com.microsoft.edgemac",
    ]

    /// Starts mic monitoring and polling. No-op when already enabled.
    func enable() {
        guard !enabled else { return }
        enabled = true
        mic.onChange = { [weak self] _ in self?.evaluate() }
        mic.start()
        status = .listening
        pollTimer = Timer.scheduledTimer(withTimeInterval: pollInterval, repeats: true) { [weak self] _ in
            Task { @MainActor in self?.evaluate() }
        }
        evaluate()
    }

    /// Stops monitoring and resets all state. Does not invoke `onCallEnd`.
    /// No-op when already disabled.
    func disable() {
        guard enabled else { return }
        enabled = false
        mic.stop()
        pollTimer?.invalidate(); pollTimer = nil
        cancelOpen(); cancelClose()
        inCall = false
        currentApp = nil
        status = .disabled
    }

    // MARK: - Evaluation

    /// Re-reads the current situation and arms/cancels the debounce timers.
    private func evaluate() {
        guard enabled else { return }
        let candidate = mic.isRunning ? detectApp() : nil

        if let candidate {
            cancelClose()
            if inCall {
                // Already in a call: just track which app it is.
                currentApp = candidate
                status = .inCall(candidate)
            } else if openTimer == nil {
                openTimer = Timer.scheduledTimer(withTimeInterval: openDelay, repeats: false) { [weak self] _ in
                    Task { @MainActor in self?.fireOpen() }
                }
            }
        } else {
            cancelOpen()
            if inCall && closeTimer == nil {
                closeTimer = Timer.scheduledTimer(withTimeInterval: closeDelay, repeats: false) { [weak self] _ in
                    Task { @MainActor in self?.fireClose() }
                }
            }
        }
    }

    private func fireOpen() {
        openTimer = nil
        // Re-resolve the app at fire time (the debounce window may have changed it).
        guard enabled, mic.isRunning, let app = detectApp(), !inCall else { return }
        inCall = true
        currentApp = app
        status = .inCall(app)
        onCallStart?(app)
    }

    private func fireClose() {
        closeTimer = nil
        guard enabled, inCall else { return }
        // Re-check at fire time, mirroring fireOpen(). While we hold the mic open
        // ourselves there is no mic on/off event, so a call app that regained the
        // mic after the last poll (or a timer callback that was already in flight
        // when cancelClose() ran) would otherwise end a call that is still live.
        if mic.isRunning, let app = detectApp() {
            currentApp = app
            status = .inCall(app)
            return
        }
        inCall = false
        currentApp = nil
        status = .listening
        onCallEnd?()
    }

    private func cancelOpen() { openTimer?.invalidate(); openTimer = nil }
    private func cancelClose() { closeTimer?.invalidate(); closeTimer = nil }

    // MARK: - App detection

    /// A call is active when a known call app is actually using the mic.
    /// On macOS 14+ we attribute mic usage per-process (robust start AND stop,
    /// works for Signal/Zoom/Teams/Meet, ignores our own recording). On macOS 13
    /// we fall back to the per-app call-window heuristic.
    private func detectApp() -> DetectedApp? {
        if #available(macOS 14.0, *) {
            return detectViaMicAttribution()
        }
        return detectViaWindowTitle()
    }

    /// macOS 14+: match the PIDs that currently use the mic against running
    /// applications with a known bundle ID, skipping our own process.
    ///
    /// NOTE(review): this only matches PIDs listed in
    /// `NSWorkspace.runningApplications`; if a browser captures audio from a
    /// helper process with a different PID/bundle ID the Meet path may not
    /// match — confirm with live testing.
    @available(macOS 14.0, *)
    private func detectViaMicAttribution() -> DetectedApp? {
        let micPIDs = AudioInputProcesses.micUsingPIDs()
        guard !micPIDs.isEmpty else { return nil }
        let selfPID = NSRunningApplication.current.processIdentifier

        for app in NSWorkspace.shared.runningApplications {
            let pid = app.processIdentifier
            guard pid != selfPID, micPIDs.contains(pid), let id = app.bundleIdentifier else { continue }
            if let native = Self.nativeApps.first(where: { $0.id == id }) {
                return native.app   // Signal/Zoom/Teams using the mic = in a call
            }
            // A browser using the mic + a Meet window = a Meet call. The mic state
            // gives reliable start/stop; the window check keeps non-Meet browser
            // mic use (other web apps) from being mislabeled as a Meet recording.
            if Self.browserIDs.contains(id), pidHasMeetWindow(pid) {
                return .meet
            }
        }
        return nil
    }

    /// Whether any window owned by `pid` has a title that looks like an active
    /// Meet call. Windows without a readable title are skipped.
    private func pidHasMeetWindow(_ pid: pid_t) -> Bool {
        guard let info = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]]
        else { return false }
        for w in info {
            guard let wpid = w[kCGWindowOwnerPID as String] as? pid_t, wpid == pid,
                  let title = w[kCGWindowName as String] as? String else { continue }
            if Self.looksLikeMeet(title) { return true }
        }
        return false
    }

    /// macOS 13 fallback: detect by the presence of a call WINDOW per app.
    private func detectViaWindowTitle() -> DetectedApp? {
        var pidToApp: [pid_t: DetectedApp] = [:]
        var browserPIDs = Set<pid_t>()
        for app in NSWorkspace.shared.runningApplications {
            guard let id = app.bundleIdentifier else { continue }
            if let native = Self.nativeApps.first(where: { $0.id == id }) {
                pidToApp[app.processIdentifier] = native.app
            } else if Self.browserIDs.contains(id) {
                browserPIDs.insert(app.processIdentifier)
            }
        }
        // Nothing we know is running: skip the window scan entirely.
        guard !pidToApp.isEmpty || !browserPIDs.isEmpty else { return nil }
        guard let infoList = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]] else {
            return nil
        }
        for info in infoList {
            guard let pid = info[kCGWindowOwnerPID as String] as? pid_t,
                  let title = info[kCGWindowName as String] as? String,
                  !title.isEmpty else { continue }
            if browserPIDs.contains(pid), Self.looksLikeMeet(title) { return .meet }
            if let app = pidToApp[pid], Self.isCallWindow(app, title) { return app }
        }
        return nil
    }

    /// Per-app in-call window-title signatures (macOS 13 fallback only).
    private static func isCallWindow(_ app: DetectedApp, _ title: String) -> Bool {
        let t = title.lowercased()
        switch app {
        case .zoom: return t.contains("meeting")   // also covers "Zoom Meeting"
        case .teams: return t.contains("meeting")
        case .signal: return t.contains("signal call") || t.contains("group call")
        case .meet: return false   // handled via the browser path above
        }
    }

    /// Match an ACTIVE Google Meet call. Verified against real Firefox titles:
    /// in a call the title is "Meet - <code>" (e.g. "Meet - tjh-pixe-ier"), while
    /// the home/lobby/"you left" pages are bare "Meet" or "Google Meet". Matching
    /// only the "Meet - …" form is what lets auto-STOP fire when you leave (and
    /// avoids false-starting on the home page). Also excludes "Zoom Meeting" etc.
    private static func looksLikeMeet(_ title: String) -> Bool {
        let t = title.lowercased()
        return t.hasPrefix("meet - ") || t.hasPrefix("meet – ") || t.hasPrefix("meet — ")
    }
}
|
||||
@@ -0,0 +1,125 @@
|
||||
import CoreAudio
|
||||
import Foundation
|
||||
|
||||
/// Watches whether *any* app is using the default input device (the system-wide
/// "mic is live" signal), via CoreAudio property listeners. Re-binds when the
/// default input device changes (e.g. you plug in a headset mid-call).
///
/// Threading: ALL CoreAudio state (deviceID, listener blocks, `started`,
/// `session`) and all Add/Remove calls are confined to the serial `queue`.
/// `isRunning` and `liveSession` are written and read only on the main thread.
/// `onChange` fires on main. `start()` and `stop()` must be called on the main
/// thread.
final class MicActivityMonitor {
    private(set) var isRunning = false          // main-thread only
    var onChange: ((Bool) -> Void)?

    private let queue = DispatchQueue(label: "xyz.ten31.micmonitor")

    // main-thread only: bumped by stop() so readings that were queued for
    // delivery by an earlier start/stop session are dropped instead of applied.
    private var liveSession = 0

    // queue-confined:
    private var deviceID = AudioObjectID(kAudioObjectUnknown)
    private var runningBlock: AudioObjectPropertyListenerBlock?
    private var defaultDeviceBlock: AudioObjectPropertyListenerBlock?
    private var started = false
    private var session = 0

    private static let runningAddr = AudioObjectPropertyAddress(
        mSelector: kAudioDevicePropertyDeviceIsRunningSomewhere,
        mScope: kAudioObjectPropertyScopeGlobal,
        mElement: kAudioObjectPropertyElementMain)

    private static let defaultDeviceAddr = AudioObjectPropertyAddress(
        mSelector: kAudioHardwarePropertyDefaultInputDevice,
        mScope: kAudioObjectPropertyScopeGlobal,
        mElement: kAudioObjectPropertyElementMain)

    deinit {
        // Nothing else can reach the instance any more, so touching the
        // queue-confined state directly is safe. Without this the CoreAudio
        // listeners would stay registered after the monitor is released.
        end()
    }

    /// Called on the main thread. Installs the listeners asynchronously; the
    /// first reading arrives later through `onChange` / `isRunning`.
    func start() {
        let current = liveSession
        queue.async { self.begin(session: current) }
    }

    /// Called on the main thread (by the @MainActor CallDetector). Resets
    /// `isRunning` so a subsequent enable()'s synchronous evaluation can't read a
    /// stale `true` before the fresh reading arrives. Bumping `liveSession`
    /// extends that guarantee to readings already queued on main before this call.
    func stop() {
        queue.sync { self.end() }
        liveSession &+= 1
        isRunning = false
    }

    // MARK: - queue-confined

    private func begin(session current: Int) {
        guard !started else { return }
        started = true
        session = current
        var addr = Self.defaultDeviceAddr
        let block: AudioObjectPropertyListenerBlock = { [weak self] _, _ in
            self?.rebindRunning()               // delivered on `queue`
        }
        defaultDeviceBlock = block
        AudioObjectAddPropertyListenerBlock(AudioObjectID(kAudioObjectSystemObject), &addr, queue, block)
        bindRunning()
    }

    private func end() {
        started = false
        if let block = defaultDeviceBlock {
            var addr = Self.defaultDeviceAddr
            AudioObjectRemovePropertyListenerBlock(AudioObjectID(kAudioObjectSystemObject), &addr, queue, block)
            defaultDeviceBlock = nil
        }
        unbindRunning()
    }

    /// Attaches the "is running somewhere" listener to the current default input
    /// device and delivers its present state.
    private func bindRunning() {
        guard started else { return }
        deviceID = Self.defaultInputDevice()
        guard deviceID != AudioObjectID(kAudioObjectUnknown) else {
            // No input device (e.g. the only mic was unplugged): report "not
            // running" instead of leaving the previous device's state in place.
            deliver(false)
            return
        }
        var addr = Self.runningAddr
        let block: AudioObjectPropertyListenerBlock = { [weak self] _, _ in
            guard let self else { return }
            self.deliver(Self.isDeviceRunning(self.deviceID))   // on `queue`
        }
        runningBlock = block
        // Install the listener BEFORE the initial read so a flip during setup is
        // caught (either by the now-installed listener or the post-install read).
        AudioObjectAddPropertyListenerBlock(deviceID, &addr, queue, block)
        deliver(Self.isDeviceRunning(deviceID))
    }

    private func unbindRunning() {
        if deviceID != AudioObjectID(kAudioObjectUnknown), let block = runningBlock {
            var addr = Self.runningAddr
            AudioObjectRemovePropertyListenerBlock(deviceID, &addr, queue, block)
        }
        runningBlock = nil
        deviceID = AudioObjectID(kAudioObjectUnknown)
    }

    /// Default input device changed: move the running-listener to the new device.
    private func rebindRunning() {
        guard started else { return }
        unbindRunning()
        bindRunning()
    }

    /// Hops to main, updates `isRunning` and fires `onChange` on a transition.
    /// Called on `queue`; the session is captured here so main can tell whether
    /// the reading still belongs to the current start/stop session.
    private func deliver(_ running: Bool) {
        let origin = session
        DispatchQueue.main.async {
            guard origin == self.liveSession else { return }   // stop() happened since
            let changed = running != self.isRunning
            self.isRunning = running
            if changed { self.onChange?(running) }
        }
    }

    // MARK: - CoreAudio reads (use local address copies)

    /// The current default input device, or `kAudioObjectUnknown` on failure.
    private static func defaultInputDevice() -> AudioObjectID {
        var addr = defaultDeviceAddr
        var device = AudioObjectID(kAudioObjectUnknown)
        var size = UInt32(MemoryLayout<AudioObjectID>.size)
        let status = AudioObjectGetPropertyData(
            AudioObjectID(kAudioObjectSystemObject), &addr, 0, nil, &size, &device)
        return status == noErr ? device : AudioObjectID(kAudioObjectUnknown)
    }

    /// Whether `device` is running in any process; `false` for an unknown device
    /// or a failed read.
    private static func isDeviceRunning(_ device: AudioObjectID) -> Bool {
        guard device != AudioObjectID(kAudioObjectUnknown) else { return false }
        var addr = runningAddr
        var value: UInt32 = 0
        var size = UInt32(MemoryLayout<UInt32>.size)
        let status = AudioObjectGetPropertyData(device, &addr, 0, nil, &size, &value)
        return status == noErr && value != 0
    }
}
|
||||
Reference in New Issue
Block a user