Files
ten31-transcripts/Ten31Transcripts/Detection/CallDetector.swift
T
Grant Gilliam 880b56e426 Wire visual capture into the recording lifecycle (failure-isolated)
Visual capture now runs alongside audio: on call start the session picks the
app's adapter, captures the call window on the SAME monotonic clock as the audio
(AudioRecorder.sharedT0Host), and on stop writes visual_timeline.json and hands
the backend the visual segments with mic-VAD self-spans merged. Any visual
failure (no adapter, no window, Screen Recording denied) leaves the session
recording audio-only — the proven path is never blocked or broken.

- CallDetector now emits DetectedCall{app, bundleID, windowID}: the exact
  CGWindowID of the matched Meet browser window (native apps → nil → largest).
- VisualCapture wraps VisualObserver + AdapterRegistry, writes visual_timeline.json.
- AudioRecorder.sharedT0Host() exposes the shared t0 for frame alignment.

Hardened per a 3-lens adversarial review (concurrency / failure-isolation /
data-flow), all 6 confirmed findings fixed:
- P0 (critical): startVisual could adopt a stale capture into a DIFFERENT session
  (cross-session SCStream leak + visual_timeline.json written to the wrong
  folder). Now gated on session identity — generation + recorder ===, still
  .recording — with fail-closed adoption; otherwise the stream is cancelled.
- P1: observer captured the browser's largest window, not the detected Meet
  window. Now targets the exact CGWindowID (pickWindowIndex, unit-tested),
  largest-area only as fallback.
- P2: a startVisual orphaned by a concurrent stop could leak a stream on quit.
  inFlightVisual is registered before the await and drained in prepareForTermination.
- P3: trailing visual gap/segment ends could exceed duration_sec. Clamped in
  VisualCapture (clampSegments/clampGaps, unit-tested).
- P4: capture pixel size used NSScreen.main scale; now uses the scale of the
  display actually hosting the window (OCR clarity on secondary displays).
- VisualObserver.stop() bounds stopCapture() with a 3s timeout (mirrors audio) so
  a wedged stream can't hang finalization.

25/25 XCTest pass. Live validation on real calls still pending.
2026-06-06 10:18:52 -05:00

245 lines
9.7 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import AppKit
import CoreGraphics
import Combine
/// Detects when the user joins/leaves a call and reports it via callbacks.
///
/// Heuristic: the mic is live system-wide AND a known call app is present:
/// Zoom/Teams/Signal by bundle ID, or Google Meet by a browser window whose
/// title looks like a Meet call (read via `CGWindowList`, using the Screen
/// Recording permission). Debounced so a quick unrelated mic use doesn't trigger.
///
/// Main-actor: all evaluation runs on the main thread.
@MainActor
final class CallDetector: ObservableObject {
/// The call applications this detector can recognise. The raw value is the
/// lowercase case name.
enum DetectedApp: String, Equatable {
    case zoom
    case teams
    case signal
    case meet

    /// Short identifier for the app; identical to the raw value.
    var label: String {
        return rawValue
    }

    /// Human-readable product name.
    var display: String {
        let name: String
        switch self {
        case .meet:
            name = "Google Meet"
        case .signal:
            name = "Signal"
        case .teams:
            name = "Microsoft Teams"
        case .zoom:
            name = "Zoom"
        }
        return name
    }
}
/// A detected call plus what to capture for visuals: the bundle ID of the owner
/// (the native app for Zoom/Teams/Signal, or the *browser* hosting the Meet tab)
/// and, for Meet, the exact `CGWindowID` of the matched call window, so the
/// observer captures that window instead of guessing the browser's largest one.
struct DetectedCall: Equatable {
/// Which call app was detected.
let app: DetectedApp
/// Bundle identifier of the owning application (the browser, for Meet).
let bundleID: String
/// Window to capture. The detection paths in this file pass `nil` for native
/// apps; for Meet it is the matched window's number, which may be `nil` on the
/// macOS 13 path if that number cannot be read.
let windowID: CGWindowID?
}
/// Detector state exposed through the published `status` property.
enum Status: Equatable {
/// Detection is off: the initial value, and the value set by `disable()`.
case disabled
/// Detection is on and no call is currently active.
case listening
/// A call is active in the associated app.
case inCall(DetectedApp)
}
/// Current detector state; published for observers, settable only inside this class.
@Published private(set) var status: Status = .disabled
/// Called from `fireOpen()` with the resolved call once the open debounce has elapsed.
var onCallStart: ((DetectedCall) -> Void)?
/// Called from `fireClose()` once the close debounce has elapsed.
var onCallEnd: (() -> Void)?
/// Mic activity source; its `onChange` callback and `isRunning` flag drive `evaluate()`.
private let mic = MicActivityMonitor()
/// Repeating timer that re-runs `evaluate()` every `pollInterval` while enabled.
private var pollTimer: Timer?
/// One-shot debounce timer pending a call start; `nil` when none is scheduled.
private var openTimer: Timer?
/// One-shot debounce timer pending a call end; `nil` when none is scheduled.
private var closeTimer: Timer?
/// True from `fireOpen()` until `fireClose()` or `disable()`.
private var inCall = false
/// Most recently detected call while in a call; cleared on close/disable.
/// NOTE(review): written but not read anywhere in this file — confirm it is still needed.
private var currentCall: DetectedCall?
/// Whether detection is switched on (`enable()` / `disable()`).
private var enabled = false
/// Seconds a candidate call must persist before `onCallStart` fires.
private let openDelay: TimeInterval = 2.0
/// Seconds without a candidate before `onCallEnd` fires.
private let closeDelay: TimeInterval = 4.0
/// Seconds between periodic re-evaluations.
private let pollInterval: TimeInterval = 3.0
/// Bundle identifiers of native call apps and the `DetectedApp` each maps to.
/// Two Teams identifiers map to the same case.
private static let nativeApps: [(id: String, app: DetectedApp)] = [
("us.zoom.xos", .zoom),
("com.microsoft.teams2", .teams),
("com.microsoft.teams", .teams),
("org.whispersystems.signal-desktop", .signal),
]
/// Bundle identifiers treated as browsers; only these are checked for a
/// Meet-titled window.
private static let browserIDs: Set<String> = [
"org.mozilla.firefox", "com.google.Chrome", "com.apple.Safari",
"company.thebrowser.Browser", "com.brave.Browser", "com.microsoft.edgemac",
]
/// Turns detection on: hooks the mic monitor, starts periodic polling and runs
/// an immediate evaluation. Does nothing if detection is already on.
func enable() {
    if enabled { return }
    enabled = true

    // Mic state changes trigger a re-evaluation.
    mic.onChange = { [weak self] _ in
        self?.evaluate()
    }
    mic.start()
    status = .listening

    // Poll as well, re-evaluating every `pollInterval` seconds.
    let poller = Timer.scheduledTimer(withTimeInterval: pollInterval, repeats: true) { [weak self] _ in
        Task { @MainActor in
            self?.evaluate()
        }
    }
    pollTimer = poller

    evaluate()
}
/// Turns detection off: stops the mic monitor, tears down every timer and
/// resets call state. Does nothing if detection is already off.
/// `onCallEnd` is not invoked from here.
func disable() {
    if !enabled { return }
    enabled = false

    mic.stop()

    if let poller = pollTimer {
        poller.invalidate()
    }
    pollTimer = nil

    cancelOpen()
    cancelClose()

    inCall = false
    currentCall = nil
    status = .disabled
}
// MARK: - Evaluation
/// Re-checks whether a call is live and advances the debounce state machine.
///
/// With a candidate call: cancels any pending close, refreshes the current
/// call if already in one, otherwise schedules the open debounce (once).
/// Without a candidate: cancels any pending open and, if in a call, schedules
/// the close debounce (once).
private func evaluate() {
    guard enabled else { return }

    guard mic.isRunning, let live = detectApp() else {
        // Nothing qualifies as a call right now.
        cancelOpen()
        guard inCall, closeTimer == nil else { return }
        closeTimer = Timer.scheduledTimer(withTimeInterval: closeDelay, repeats: false) { [weak self] _ in
            Task { @MainActor in
                self?.fireClose()
            }
        }
        return
    }

    // A call app is live, so a pending close no longer applies.
    cancelClose()

    if inCall {
        // Already in a call: track the latest detection.
        currentCall = live
        status = .inCall(live.app)
        return
    }

    guard openTimer == nil else { return }
    openTimer = Timer.scheduledTimer(withTimeInterval: openDelay, repeats: false) { [weak self] _ in
        Task { @MainActor in
            self?.fireOpen()
        }
    }
}
/// Open-debounce callback: if a call is still detectable and we are not
/// already in one, enters the in-call state and notifies `onCallStart`.
private func fireOpen() {
    openTimer = nil

    if !enabled || !mic.isRunning { return }
    // Resolve the call again now; it may differ from what started the debounce.
    guard let resolved = detectApp() else { return }
    if inCall { return }

    inCall = true
    currentCall = resolved
    status = .inCall(resolved.app)
    if let notify = onCallStart {
        notify(resolved)
    }
}
/// Close-debounce callback: if still enabled and in a call, leaves the
/// in-call state, returns to `.listening` and notifies `onCallEnd`.
private func fireClose() {
    closeTimer = nil

    if !(enabled && inCall) { return }

    inCall = false
    currentCall = nil
    status = .listening
    if let notify = onCallEnd {
        notify()
    }
}
/// Invalidates and forgets the pending open-debounce timer, if any.
private func cancelOpen() {
    if let pending = openTimer {
        pending.invalidate()
    }
    openTimer = nil
}

/// Invalidates and forgets the pending close-debounce timer, if any.
private func cancelClose() {
    if let pending = closeTimer {
        pending.invalidate()
    }
    closeTimer = nil
}
// MARK: - App detection
/// Resolves the currently active call, if any.
///
/// macOS 14 and later use per-process mic attribution
/// (`detectViaMicAttribution()`); earlier systems fall back to the
/// window-title heuristic (`detectViaWindowTitle()`).
private func detectApp() -> DetectedCall? {
    guard #available(macOS 14.0, *) else {
        return detectViaWindowTitle()
    }
    return detectViaMicAttribution()
}
/// Finds a call by looking at which running applications are using the mic.
///
/// Returns the first running app (other than this process) whose PID is in
/// the mic-using set and which is either a known native call app, or a known
/// browser that also owns a Meet-titled window.
@available(macOS 14.0, *)
private func detectViaMicAttribution() -> DetectedCall? {
    let activePIDs = AudioInputProcesses.micUsingPIDs()
    if activePIDs.isEmpty { return nil }

    let ownPID = NSRunningApplication.current.processIdentifier
    for running in NSWorkspace.shared.runningApplications {
        let pid = running.processIdentifier
        // Ignore this process and anything not using the mic.
        if pid == ownPID || !activePIDs.contains(pid) { continue }
        guard let bundleID = running.bundleIdentifier else { continue }

        if let known = Self.nativeApps.first(where: { $0.id == bundleID }) {
            // Native app: no specific window is requested.
            return DetectedCall(app: known.app, bundleID: bundleID, windowID: nil)
        }

        // A browser on the mic only counts as Meet when it also owns a
        // Meet-titled window; other web apps using the mic are skipped.
        // That exact window is what gets reported.
        guard Self.browserIDs.contains(bundleID), let meetWindow = meetWindowID(pid) else { continue }
        return DetectedCall(app: .meet, bundleID: bundleID, windowID: meetWindow)
    }
    return nil
}
/// The `CGWindowID` of the first window owned by `pid` whose title passes
/// `looksLikeMeet`, or `nil` if there is none. A non-nil result therefore also
/// answers "is this browser in a Meet call?".
private func meetWindowID(_ pid: pid_t) -> CGWindowID? {
    let snapshot = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID)
    guard let windows = snapshot as? [[String: Any]] else { return nil }

    let match = windows.first { entry in
        guard let owner = entry[kCGWindowOwnerPID as String] as? pid_t, owner == pid else { return false }
        guard let title = entry[kCGWindowName as String] as? String else { return false }
        return Self.looksLikeMeet(title)
    }
    return match?[kCGWindowNumber as String] as? CGWindowID
}
/// macOS 13 fallback: infers a call from window titles.
///
/// Indexes running native call apps and browsers by PID, then scans the
/// window list in order and returns the first window that is either a
/// Meet-titled browser window or a call-titled window of a native app.
private func detectViaWindowTitle() -> DetectedCall? {
    var nativeByPID: [pid_t: (app: DetectedApp, id: String)] = [:]
    var browserByPID: [pid_t: String] = [:]

    for running in NSWorkspace.shared.runningApplications {
        guard let bundleID = running.bundleIdentifier else { continue }
        let pid = running.processIdentifier
        if let known = Self.nativeApps.first(where: { $0.id == bundleID }) {
            nativeByPID[pid] = (known.app, bundleID)
        } else if Self.browserIDs.contains(bundleID) {
            browserByPID[pid] = bundleID
        }
    }
    // Nothing of interest is running, so skip the window query.
    if nativeByPID.isEmpty && browserByPID.isEmpty { return nil }

    let snapshot = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID)
    guard let windows = snapshot as? [[String: Any]] else {
        return nil
    }

    for window in windows {
        guard let owner = window[kCGWindowOwnerPID as String] as? pid_t,
              let title = window[kCGWindowName as String] as? String else { continue }
        if title.isEmpty { continue }

        if let browserID = browserByPID[owner], Self.looksLikeMeet(title) {
            let number = window[kCGWindowNumber as String] as? CGWindowID
            return DetectedCall(app: .meet, bundleID: browserID, windowID: number)
        }
        if let known = nativeByPID[owner], Self.isCallWindow(known.app, title) {
            return DetectedCall(app: known.app, bundleID: known.id, windowID: nil)
        }
    }
    return nil
}
/// Whether `title` looks like an in-call window for `app` (macOS 13 fallback
/// only). The match is a case-insensitive substring test against a per-app
/// list of signatures; `.meet` never matches here because Meet is detected
/// through the browser path.
private static func isCallWindow(_ app: DetectedApp, _ title: String) -> Bool {
    let signatures: [String]
    switch app {
    case .zoom:
        signatures = ["zoom meeting", "meeting"]
    case .teams:
        signatures = ["meeting"]
    case .signal:
        signatures = ["signal call", "group call"]
    case .meet:
        signatures = []
    }

    let lowered = title.lowercased()
    return signatures.contains { lowered.contains($0) }
}
/// Match an ACTIVE Google Meet call. Verified against real Firefox titles:
/// in a call the title is "Meet - <code>" (e.g. "Meet - tjh-pixe-ier"), while
/// the home/lobby/"you left" pages are bare "Meet" or "Google Meet". Matching
/// only the "Meet <dash> " form is what lets auto-STOP fire when you leave (and
/// avoids false-starting on the home page). Also excludes "Zoom Meeting" etc.
///
/// The comparison is case-insensitive and accepts three separators after
/// "Meet": hyphen-minus, en dash and em dash, each surrounded by single spaces.
/// A bare "Meet " prefix is deliberately NOT accepted, so unrelated titles such
/// as "Meet the Team - Google Docs" do not count as a call.
///
/// - Parameter title: The window title to test.
/// - Returns: `true` when the title starts with "Meet", a space, one of the
///   accepted dashes, and a space.
private static func looksLikeMeet(_ title: String) -> Bool {
    let lowered = title.lowercased()
    // Written as escapes so the look-alike dashes stay unambiguous in source:
    // U+002D hyphen-minus, U+2013 en dash, U+2014 em dash.
    let separators = ["\u{002D}", "\u{2013}", "\u{2014}"]
    return separators.contains { lowered.hasPrefix("meet \($0) ") }
}
}