Wire visual capture into the recording lifecycle (failure-isolated)

Visual capture now runs alongside audio: on call start the session picks the
app's adapter, captures the call window on the SAME monotonic clock as the audio
(AudioRecorder.sharedT0Host), and on stop writes visual_timeline.json and hands
the backend the visual segments with mic-VAD self-spans merged. Any visual
failure (no adapter, no window, Screen Recording denied) leaves the session
recording audio-only — the proven path is never blocked or broken.

- CallDetector now emits DetectedCall{app, bundleID, windowID}: the exact
  CGWindowID of the matched Meet browser window (native apps pass nil, so the
  observer falls back to the app's largest window).
- VisualCapture wraps VisualObserver + AdapterRegistry, writes visual_timeline.json.
- AudioRecorder.sharedT0Host() exposes the shared t0 for frame alignment.

Hardened per a 3-lens adversarial review (concurrency / failure-isolation /
data-flow), all 6 confirmed findings fixed:
- P0 (critical): startVisual could adopt a stale capture into a DIFFERENT session
  (cross-session SCStream leak + visual_timeline.json written to the wrong
  folder). Now gated on session identity — generation + recorder ===, still
  .recording — with fail-closed adoption; otherwise the stream is cancelled.
- P1: observer captured the browser's largest window, not the detected Meet
  window. Now targets the exact CGWindowID (pickWindowIndex, unit-tested),
  largest-area only as fallback.
- P2: a startVisual orphaned by a concurrent stop could leak a stream on quit.
  inFlightVisual is registered before the await and drained in prepareForTermination.
- P3: trailing visual gap/segment ends could exceed duration_sec. Clamped in
  VisualCapture (clampSegments/clampGaps, unit-tested).
- P4: capture pixel size used NSScreen.main scale; now uses the scale of the
  display actually hosting the window (OCR clarity on secondary displays).
- VisualObserver.stop() bounds stopCapture() with a 3s timeout (mirrors audio) so
  a wedged stream can't hang finalization.

25/25 XCTest tests pass. Live validation on real calls still pending.
This commit is contained in:
Grant Gilliam
2026-06-06 10:18:52 -05:00
parent c347acbd97
commit 880b56e426
6 changed files with 348 additions and 48 deletions
+45 -27
View File
@@ -26,6 +26,16 @@ final class CallDetector: ObservableObject {
}
}
/// A detected call plus what to capture for visuals: the bundle ID of the owner
/// (the native app for Zoom/Teams/Signal, or the *browser* hosting the Meet tab)
/// and for Meet the exact `CGWindowID` of the matched call window, so the
/// observer captures that window instead of guessing the browser's largest one.
struct DetectedCall: Equatable {
/// Which calling app was detected.
let app: DetectedApp
/// Bundle identifier of the process that owns the call window: the native
/// app itself, or the browser when the call is a Meet tab.
let bundleID: String
/// Window ID of the matched Meet window. Native-app detections pass `nil`.
let windowID: CGWindowID?
}
enum Status: Equatable {
case disabled
case listening
@@ -34,7 +44,7 @@ final class CallDetector: ObservableObject {
@Published private(set) var status: Status = .disabled
var onCallStart: ((DetectedApp) -> Void)?
var onCallStart: ((DetectedCall) -> Void)?
var onCallEnd: (() -> Void)?
private let mic = MicActivityMonitor()
@@ -42,7 +52,7 @@ final class CallDetector: ObservableObject {
private var openTimer: Timer?
private var closeTimer: Timer?
private var inCall = false
private var currentApp: DetectedApp?
private var currentCall: DetectedCall?
private var enabled = false
private let openDelay: TimeInterval = 2.0
@@ -79,7 +89,7 @@ final class CallDetector: ObservableObject {
pollTimer?.invalidate(); pollTimer = nil
cancelOpen(); cancelClose()
inCall = false
currentApp = nil
currentCall = nil
status = .disabled
}
@@ -92,8 +102,8 @@ final class CallDetector: ObservableObject {
if let candidate {
cancelClose()
if inCall {
currentApp = candidate
status = .inCall(candidate)
currentCall = candidate
status = .inCall(candidate.app)
} else if openTimer == nil {
openTimer = Timer.scheduledTimer(withTimeInterval: openDelay, repeats: false) { [weak self] _ in
Task { @MainActor in self?.fireOpen() }
@@ -112,18 +122,18 @@ final class CallDetector: ObservableObject {
/// Fired when the open-debounce timer elapses: promotes a pending candidate
/// to an active call.
///
/// Clears the timer reference, re-runs detection, and returns without side
/// effects if detection is disabled, the mic monitor is not running, nothing
/// is detected any more, or a call is already active. Otherwise marks the
/// call as active, records what was detected, publishes `.inCall`, and
/// invokes `onCallStart`.
private func fireOpen() {
openTimer = nil
// Re-resolve the app at fire time (the debounce window may have changed it).
guard enabled, mic.isRunning, let app = detectApp(), !inCall else { return }
guard enabled, mic.isRunning, let call = detectApp(), !inCall else { return }
inCall = true
currentApp = app
status = .inCall(app)
onCallStart?(app)
currentCall = call
status = .inCall(call.app)
onCallStart?(call)
}
/// Ends the active call, if any.
///
/// Clears the close-timer reference, then returns early unless detection is
/// enabled and a call is currently active. Otherwise resets the call state,
/// publishes `.listening`, and invokes `onCallEnd`.
private func fireClose() {
closeTimer = nil
guard enabled, inCall else { return }
inCall = false
currentApp = nil
currentCall = nil
status = .listening
onCallEnd?()
}
@@ -137,7 +147,7 @@ final class CallDetector: ObservableObject {
/// On macOS 14+ we attribute mic usage per-process (robust start AND stop,
/// works for Signal/Zoom/Teams/Meet, ignores our own recording). On macOS 13
/// we fall back to the per-app call-window heuristic.
private func detectApp() -> DetectedApp? {
private func detectApp() -> DetectedCall? {
if #available(macOS 14.0, *) {
return detectViaMicAttribution()
}
@@ -145,7 +155,7 @@ final class CallDetector: ObservableObject {
}
@available(macOS 14.0, *)
private func detectViaMicAttribution() -> DetectedApp? {
private func detectViaMicAttribution() -> DetectedCall? {
let micPIDs = AudioInputProcesses.micUsingPIDs()
guard !micPIDs.isEmpty else { return nil }
let selfPID = NSRunningApplication.current.processIdentifier
@@ -154,39 +164,42 @@ final class CallDetector: ObservableObject {
let pid = app.processIdentifier
guard pid != selfPID, micPIDs.contains(pid), let id = app.bundleIdentifier else { continue }
if let native = Self.nativeApps.first(where: { $0.id == id }) {
return native.app // Signal/Zoom/Teams using the mic = in a call
return DetectedCall(app: native.app, bundleID: id, windowID: nil) // native: capture largest owned window
}
// A browser using the mic + a Meet window = a Meet call. The mic state
// gives reliable start/stop; the window check keeps non-Meet browser
// mic use (other web apps) from being mislabeled as a Meet recording.
if Self.browserIDs.contains(id), pidHasMeetWindow(pid) {
return .meet
// Capture that exact browser window (by ID), not just the browser.
if Self.browserIDs.contains(id), let wid = meetWindowID(pid) {
return DetectedCall(app: .meet, bundleID: id, windowID: wid)
}
}
return nil
}
private func pidHasMeetWindow(_ pid: pid_t) -> Bool {
/// The `CGWindowID` of this PID's Google Meet call window (title "Meet - "),
/// or nil if there is none. Also serves as the "is this a Meet call?" check.
private func meetWindowID(_ pid: pid_t) -> CGWindowID? {
guard let info = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]]
else { return false }
else { return nil }
for w in info {
guard let wpid = w[kCGWindowOwnerPID as String] as? pid_t, wpid == pid,
let title = w[kCGWindowName as String] as? String else { continue }
if Self.looksLikeMeet(title) { return true }
let title = w[kCGWindowName as String] as? String, Self.looksLikeMeet(title) else { continue }
return w[kCGWindowNumber as String] as? CGWindowID
}
return false
return nil
}
/// macOS 13 fallback: detect by the presence of a call WINDOW per app.
private func detectViaWindowTitle() -> DetectedApp? {
var pidToApp: [pid_t: DetectedApp] = [:]
var browserPIDs = Set<pid_t>()
private func detectViaWindowTitle() -> DetectedCall? {
var pidToApp: [pid_t: (app: DetectedApp, id: String)] = [:]
var browserPIDs: [pid_t: String] = [:]
for app in NSWorkspace.shared.runningApplications {
guard let id = app.bundleIdentifier else { continue }
if let native = Self.nativeApps.first(where: { $0.id == id }) {
pidToApp[app.processIdentifier] = native.app
pidToApp[app.processIdentifier] = (native.app, id)
} else if Self.browserIDs.contains(id) {
browserPIDs.insert(app.processIdentifier)
browserPIDs[app.processIdentifier] = id
}
}
guard !pidToApp.isEmpty || !browserPIDs.isEmpty else { return nil }
@@ -197,8 +210,13 @@ final class CallDetector: ObservableObject {
guard let pid = info[kCGWindowOwnerPID as String] as? pid_t,
let title = info[kCGWindowName as String] as? String,
!title.isEmpty else { continue }
if browserPIDs.contains(pid), Self.looksLikeMeet(title) { return .meet }
if let app = pidToApp[pid], Self.isCallWindow(app, title) { return app }
if let id = browserPIDs[pid], Self.looksLikeMeet(title) {
return DetectedCall(app: .meet, bundleID: id,
windowID: info[kCGWindowNumber as String] as? CGWindowID)
}
if let native = pidToApp[pid], Self.isCallWindow(native.app, title) {
return DetectedCall(app: native.app, bundleID: native.id, windowID: nil)
}
}
return nil
}