Wire visual capture into the recording lifecycle (failure-isolated)
Visual capture now runs alongside audio: on call start the session picks the
app's adapter, captures the call window on the SAME monotonic clock as the audio
(AudioRecorder.sharedT0Host), and on stop writes visual_timeline.json and hands
the backend the visual segments with mic-VAD self-spans merged. Any visual
failure (no adapter, no window, Screen Recording denied) leaves the session
recording audio-only — the proven path is never blocked or broken.
- CallDetector now emits DetectedCall{app, bundleID, windowID}: windowID is the exact
CGWindowID of the matched Meet browser window (native apps pass nil, so the observer captures their largest owned window).
- VisualCapture wraps VisualObserver + AdapterRegistry, writes visual_timeline.json.
- AudioRecorder.sharedT0Host() exposes the shared t0 for frame alignment.
Hardened per a 3-lens adversarial review (concurrency / failure-isolation /
data-flow); all 6 confirmed findings are fixed:
- P0 (critical): startVisual could adopt a stale capture into a DIFFERENT session
(cross-session SCStream leak + visual_timeline.json written to the wrong
folder). Adoption is now gated on session identity — same generation, same recorder
instance (===), and state still .recording — and fails closed: otherwise the stream is cancelled.
- P1: observer captured the browser's largest window, not the detected Meet
window. Now targets the exact CGWindowID (pickWindowIndex, unit-tested),
largest-area only as fallback.
- P2: a startVisual orphaned by a concurrent stop could leak a stream on quit.
inFlightVisual is registered before the await and drained in prepareForTermination.
- P3: trailing visual gap/segment ends could exceed duration_sec. Clamped in
VisualCapture (clampSegments/clampGaps, unit-tested).
- P4: capture pixel size used NSScreen.main scale; now uses the scale of the
display actually hosting the window (OCR clarity on secondary displays).
- VisualObserver.stop() bounds stopCapture() with a 3s timeout (mirrors audio) so
a wedged stream can't hang finalization.
25/25 XCTest cases pass. Live validation on real calls is still pending.
This commit is contained in:
@@ -26,6 +26,16 @@ final class CallDetector: ObservableObject {
|
||||
}
|
||||
}
|
||||
|
||||
/// A detected call plus what to capture for visuals: the bundle ID of the owner
|
||||
/// (the native app for Zoom/Teams/Signal, or the *browser* hosting the Meet tab)
|
||||
/// and — for Meet — the exact `CGWindowID` of the matched call window, so the
|
||||
/// observer captures that window instead of guessing the browser's largest one.
|
||||
struct DetectedCall: Equatable {
|
||||
let app: DetectedApp
|
||||
let bundleID: String
|
||||
let windowID: CGWindowID?
|
||||
}
|
||||
|
||||
enum Status: Equatable {
|
||||
case disabled
|
||||
case listening
|
||||
@@ -34,7 +44,7 @@ final class CallDetector: ObservableObject {
|
||||
|
||||
@Published private(set) var status: Status = .disabled
|
||||
|
||||
var onCallStart: ((DetectedApp) -> Void)?
|
||||
var onCallStart: ((DetectedCall) -> Void)?
|
||||
var onCallEnd: (() -> Void)?
|
||||
|
||||
private let mic = MicActivityMonitor()
|
||||
@@ -42,7 +52,7 @@ final class CallDetector: ObservableObject {
|
||||
private var openTimer: Timer?
|
||||
private var closeTimer: Timer?
|
||||
private var inCall = false
|
||||
private var currentApp: DetectedApp?
|
||||
private var currentCall: DetectedCall?
|
||||
private var enabled = false
|
||||
|
||||
private let openDelay: TimeInterval = 2.0
|
||||
@@ -79,7 +89,7 @@ final class CallDetector: ObservableObject {
|
||||
pollTimer?.invalidate(); pollTimer = nil
|
||||
cancelOpen(); cancelClose()
|
||||
inCall = false
|
||||
currentApp = nil
|
||||
currentCall = nil
|
||||
status = .disabled
|
||||
}
|
||||
|
||||
@@ -92,8 +102,8 @@ final class CallDetector: ObservableObject {
|
||||
if let candidate {
|
||||
cancelClose()
|
||||
if inCall {
|
||||
currentApp = candidate
|
||||
status = .inCall(candidate)
|
||||
currentCall = candidate
|
||||
status = .inCall(candidate.app)
|
||||
} else if openTimer == nil {
|
||||
openTimer = Timer.scheduledTimer(withTimeInterval: openDelay, repeats: false) { [weak self] _ in
|
||||
Task { @MainActor in self?.fireOpen() }
|
||||
@@ -112,18 +122,18 @@ final class CallDetector: ObservableObject {
|
||||
private func fireOpen() {
|
||||
openTimer = nil
|
||||
// Re-resolve the app at fire time (the debounce window may have changed it).
|
||||
guard enabled, mic.isRunning, let app = detectApp(), !inCall else { return }
|
||||
guard enabled, mic.isRunning, let call = detectApp(), !inCall else { return }
|
||||
inCall = true
|
||||
currentApp = app
|
||||
status = .inCall(app)
|
||||
onCallStart?(app)
|
||||
currentCall = call
|
||||
status = .inCall(call.app)
|
||||
onCallStart?(call)
|
||||
}
|
||||
|
||||
private func fireClose() {
|
||||
closeTimer = nil
|
||||
guard enabled, inCall else { return }
|
||||
inCall = false
|
||||
currentApp = nil
|
||||
currentCall = nil
|
||||
status = .listening
|
||||
onCallEnd?()
|
||||
}
|
||||
@@ -137,7 +147,7 @@ final class CallDetector: ObservableObject {
|
||||
/// On macOS 14+ we attribute mic usage per-process (robust start AND stop,
|
||||
/// works for Signal/Zoom/Teams/Meet, ignores our own recording). On macOS 13
|
||||
/// we fall back to the per-app call-window heuristic.
|
||||
private func detectApp() -> DetectedApp? {
|
||||
private func detectApp() -> DetectedCall? {
|
||||
if #available(macOS 14.0, *) {
|
||||
return detectViaMicAttribution()
|
||||
}
|
||||
@@ -145,7 +155,7 @@ final class CallDetector: ObservableObject {
|
||||
}
|
||||
|
||||
@available(macOS 14.0, *)
|
||||
private func detectViaMicAttribution() -> DetectedApp? {
|
||||
private func detectViaMicAttribution() -> DetectedCall? {
|
||||
let micPIDs = AudioInputProcesses.micUsingPIDs()
|
||||
guard !micPIDs.isEmpty else { return nil }
|
||||
let selfPID = NSRunningApplication.current.processIdentifier
|
||||
@@ -154,39 +164,42 @@ final class CallDetector: ObservableObject {
|
||||
let pid = app.processIdentifier
|
||||
guard pid != selfPID, micPIDs.contains(pid), let id = app.bundleIdentifier else { continue }
|
||||
if let native = Self.nativeApps.first(where: { $0.id == id }) {
|
||||
return native.app // Signal/Zoom/Teams using the mic = in a call
|
||||
return DetectedCall(app: native.app, bundleID: id, windowID: nil) // native: capture largest owned window
|
||||
}
|
||||
// A browser using the mic + a Meet window = a Meet call. The mic state
|
||||
// gives reliable start/stop; the window check keeps non-Meet browser
|
||||
// mic use (other web apps) from being mislabeled as a Meet recording.
|
||||
if Self.browserIDs.contains(id), pidHasMeetWindow(pid) {
|
||||
return .meet
|
||||
// Capture that exact browser window (by ID), not just the browser.
|
||||
if Self.browserIDs.contains(id), let wid = meetWindowID(pid) {
|
||||
return DetectedCall(app: .meet, bundleID: id, windowID: wid)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
private func pidHasMeetWindow(_ pid: pid_t) -> Bool {
|
||||
/// The `CGWindowID` of this PID's Google Meet call window (title "Meet - …"),
|
||||
/// or nil if none — also serves as the "is this a Meet call?" check.
|
||||
private func meetWindowID(_ pid: pid_t) -> CGWindowID? {
|
||||
guard let info = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]]
|
||||
else { return false }
|
||||
else { return nil }
|
||||
for w in info {
|
||||
guard let wpid = w[kCGWindowOwnerPID as String] as? pid_t, wpid == pid,
|
||||
let title = w[kCGWindowName as String] as? String else { continue }
|
||||
if Self.looksLikeMeet(title) { return true }
|
||||
let title = w[kCGWindowName as String] as? String, Self.looksLikeMeet(title) else { continue }
|
||||
return w[kCGWindowNumber as String] as? CGWindowID
|
||||
}
|
||||
return false
|
||||
return nil
|
||||
}
|
||||
|
||||
/// macOS 13 fallback: detect by the presence of a call WINDOW per app.
|
||||
private func detectViaWindowTitle() -> DetectedApp? {
|
||||
var pidToApp: [pid_t: DetectedApp] = [:]
|
||||
var browserPIDs = Set<pid_t>()
|
||||
private func detectViaWindowTitle() -> DetectedCall? {
|
||||
var pidToApp: [pid_t: (app: DetectedApp, id: String)] = [:]
|
||||
var browserPIDs: [pid_t: String] = [:]
|
||||
for app in NSWorkspace.shared.runningApplications {
|
||||
guard let id = app.bundleIdentifier else { continue }
|
||||
if let native = Self.nativeApps.first(where: { $0.id == id }) {
|
||||
pidToApp[app.processIdentifier] = native.app
|
||||
pidToApp[app.processIdentifier] = (native.app, id)
|
||||
} else if Self.browserIDs.contains(id) {
|
||||
browserPIDs.insert(app.processIdentifier)
|
||||
browserPIDs[app.processIdentifier] = id
|
||||
}
|
||||
}
|
||||
guard !pidToApp.isEmpty || !browserPIDs.isEmpty else { return nil }
|
||||
@@ -197,8 +210,13 @@ final class CallDetector: ObservableObject {
|
||||
guard let pid = info[kCGWindowOwnerPID as String] as? pid_t,
|
||||
let title = info[kCGWindowName as String] as? String,
|
||||
!title.isEmpty else { continue }
|
||||
if browserPIDs.contains(pid), Self.looksLikeMeet(title) { return .meet }
|
||||
if let app = pidToApp[pid], Self.isCallWindow(app, title) { return app }
|
||||
if let id = browserPIDs[pid], Self.looksLikeMeet(title) {
|
||||
return DetectedCall(app: .meet, bundleID: id,
|
||||
windowID: info[kCGWindowNumber as String] as? CGWindowID)
|
||||
}
|
||||
if let native = pidToApp[pid], Self.isCallWindow(native.app, title) {
|
||||
return DetectedCall(app: native.app, bundleID: native.id, windowID: nil)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user