Wire visual capture into the recording lifecycle (failure-isolated)

Visual capture now runs alongside audio: on call start the session picks the
app's adapter, captures the call window on the SAME monotonic clock as the audio
(AudioRecorder.sharedT0Host), and on stop writes visual_timeline.json and hands
the backend the visual segments with mic-VAD self-spans merged. Any visual
failure (no adapter, no window, Screen Recording denied) leaves the session
recording audio-only — the proven path is never blocked or broken.

- CallDetector now emits DetectedCall{app, bundleID, windowID}: windowID is the
  exact CGWindowID of the matched Meet browser window (nil for native apps, which
  fall back to the largest owned window).
- VisualCapture wraps VisualObserver + AdapterRegistry, writes visual_timeline.json.
- AudioRecorder.sharedT0Host() exposes the shared t0 for frame alignment.

Hardened per a 3-lens adversarial review (concurrency / failure-isolation /
data-flow), all 6 confirmed findings fixed:
- P0 (critical): startVisual could adopt a stale capture into a DIFFERENT session
  (cross-session SCStream leak + visual_timeline.json written to the wrong
  folder). Now gated on session identity — same generation, same recorder
  instance (===), and state still .recording — with fail-closed adoption;
  otherwise the stream is cancelled.
- P1: observer captured the browser's largest window, not the detected Meet
  window. Now targets the exact CGWindowID (pickWindowIndex, unit-tested),
  largest-area only as fallback.
- P2: a startVisual orphaned by a concurrent stop could leak a stream on quit.
  inFlightVisual is registered before the await and drained in prepareForTermination.
- P3: trailing visual gap/segment ends could exceed duration_sec. Clamped in
  VisualCapture (clampSegments/clampGaps, unit-tested).
- P4: capture pixel size used NSScreen.main scale; now uses the scale of the
  display actually hosting the window (OCR clarity on secondary displays).
- VisualObserver.stop() bounds stopCapture() with a 3s timeout (mirrors audio) so
  a wedged stream can't hang finalization.

25/25 XCTest pass. Live validation on real calls still pending.
This commit is contained in:
Grant Gilliam
2026-06-06 10:18:52 -05:00
parent c347acbd97
commit 880b56e426
6 changed files with 348 additions and 48 deletions
+51 -6
View File
@@ -14,6 +14,7 @@ import AppKit
@available(macOS 13.0, *)
final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
private let bundleID: String
private let windowID: CGWindowID?
private let adapter: any AppAdapter
private let t0Host: Double
private let fps: Int
@@ -27,8 +28,9 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
/// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?
init(bundleID: String, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
init(bundleID: String, windowID: CGWindowID? = nil, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
self.bundleID = bundleID
self.windowID = windowID
self.adapter = adapter
self.t0Host = t0Host
self.fps = max(1, fps)
@@ -36,12 +38,15 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
func start() async throws {
let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
// The call window: the largest window owned by the target app.
let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }
guard let window = candidates.max(by: { $0.frame.width * $0.frame.height < $1.frame.width * $1.frame.height }) else {
// Prefer the EXACT detected window (e.g. the Meet browser window) by ID; fall
// back to the largest owned window when no ID was supplied or it's gone.
guard let idx = Self.pickWindowIndex(candidates.map { ($0.windowID, $0.frame) }, preferredID: windowID),
candidates.indices.contains(idx) else {
throw NSError(domain: "Ten31", code: 2,
userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
}
let window = candidates[idx]
let filter = SCContentFilter(desktopIndependentWindow: window)
let config = SCStreamConfiguration()
@@ -50,8 +55,9 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
config.showsCursor = false
config.pixelFormat = kCVPixelFormatType_32BGRA
// window.frame is in points; capture at native pixels so OCR can read small
// initials/names (a half-res Retina capture badly hurts recognition).
let scale = NSScreen.main?.backingScaleFactor ?? 2
// initials/names (a half-res Retina capture badly hurts recognition). Use the
// scale of the display the window is actually on, not always the main screen.
let scale = Self.backingScale(forWindowFrame: window.frame)
config.width = max(2, Int(window.frame.width * scale))
config.height = max(2, Int(window.frame.height * scale))
@@ -61,8 +67,36 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
self.stream = stream
}
/// Selects which candidate window should be captured.
///
/// Pure and testable: takes plain `(id, frame)` pairs rather than ScreenCaptureKit types.
///
/// - Parameters:
///   - candidates: One `(id, frame)` pair per candidate window.
///   - preferredID: The window ID to prefer, if one is known.
/// - Returns: The index of the first candidate whose `id` equals `preferredID`;
///   otherwise the index of the candidate with the greatest `width * height`
///   (the earliest one wins a tie); `nil` when `candidates` is empty.
static func pickWindowIndex(_ candidates: [(id: CGWindowID, frame: CGRect)],
                            preferredID: CGWindowID?) -> Int? {
    if candidates.isEmpty { return nil }

    // An exact ID match always wins over size.
    if let wanted = preferredID {
        for (position, entry) in candidates.enumerated() where entry.id == wanted {
            return position
        }
    }

    // No preference, or the preferred window is gone: fall back to the largest area.
    var bestIndex = candidates.startIndex
    var bestArea = candidates[bestIndex].frame.width * candidates[bestIndex].frame.height
    for position in candidates.indices.dropFirst() {
        let area = candidates[position].frame.width * candidates[position].frame.height
        // Strict comparison keeps the earliest candidate on ties.
        if bestArea < area {
            bestIndex = position
            bestArea = area
        }
    }
    return bestIndex
}
/// Returns the backing scale factor of the screen that contains the center of `frame`.
///
/// `frame` is expected in global display points with a top-left origin (as
/// `SCWindow.frame` reports), while `NSScreen` frames use a bottom-left origin.
/// The center's y coordinate is therefore flipped through the first screen's
/// `maxY` before the containment test.
///
/// - Parameter frame: The window frame in top-left-origin global points.
/// - Returns: The containing screen's scale; if no screen contains the center,
///   the scale of `NSScreen.main`, or of the first screen if there is no main
///   screen. With no screens at all, `NSScreen.main`'s scale or `2`.
private static func backingScale(forWindowFrame frame: CGRect) -> CGFloat {
    let allScreens = NSScreen.screens
    guard let primary = allScreens.first else {
        return NSScreen.main?.backingScaleFactor ?? 2
    }

    // Convert the window center from top-left to AppKit's bottom-left coordinates.
    let flippedY = primary.frame.maxY - frame.midY
    let center = CGPoint(x: frame.midX, y: flippedY)

    if let host = allScreens.first(where: { $0.frame.contains(center) }) {
        return host.backingScaleFactor
    }
    return (NSScreen.main ?? primary).backingScaleFactor
}
func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
if let stream { try? await stream.stopCapture() }
// Bound stopCapture: an already-errored SCStream can block forever, which
// would wedge session finalization in `.finishing`. Mirror AudioRecorder.
if let stream { await Self.stopCaptureWithTimeout(stream, seconds: 3) }
stream = nil
return queue.sync {
if let gs = gapStart {
@@ -113,6 +147,17 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
/// Stream-stopped-with-error callback (signature matches `SCStreamDelegate`).
/// Currently a no-op: this method neither logs nor propagates the error.
func stream(_ stream: SCStream, didStopWithError error: Error) {}
/// Waits for `stream.stopCapture()` to finish, but never longer than `seconds`.
///
/// A task group cannot enforce this bound: `withTaskGroup` does not return until
/// every child task has completed, and `cancelAll()` only *requests* cancellation.
/// `stopCapture()` is not documented to observe cancellation, so a wedged stop
/// would keep the caller blocked even after the timer fired. The stop and the
/// timer therefore run as unstructured tasks that signal a shared `AsyncStream`,
/// and the caller resumes on whichever signals first. If the stop never completes,
/// its task is abandoned rather than awaited.
///
/// - Parameters:
///   - stream: The stream to stop. Errors thrown by `stopCapture()` are ignored.
///   - seconds: Maximum time to wait. Negative values are treated as zero so the
///     conversion to `UInt64` nanoseconds cannot trap.
private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async {
    let timeoutNanos = UInt64(max(0, seconds) * 1_000_000_000)

    // `finish()` may be called more than once and from any task; the first call ends the stream.
    let finished = AsyncStream<Void> { continuation in
        let timeout = Task {
            try? await Task.sleep(nanoseconds: timeoutNanos)
            continuation.finish()
        }
        Task {
            try? await stream.stopCapture()
            // Stop completed first: release the timer instead of letting it run out.
            timeout.cancel()
            continuation.finish()
        }
    }

    // Returns as soon as the stream finishes (stop done or timeout elapsed).
    var iterator = finished.makeAsyncIterator()
    _ = await iterator.next()
}
/// Classification assigned to a frame: `live`, `idle`, or `gap`.
private enum FrameKind {
    case live
    case idle
    case gap
}
/// SCK delivers `.complete` only when content changes, `.idle` for a static