Wire visual capture into the recording lifecycle (failure-isolated)

Visual capture now runs alongside audio: on call start the session picks the app's adapter, captures the call window on the SAME monotonic clock as the audio (AudioRecorder.sharedT0Host), and on stop writes visual_timeline.json and hands the backend the visual segments with mic-VAD self-spans merged. Any visual failure (no adapter, no window, Screen Recording denied) leaves the session recording audio-only — the proven path is never blocked or broken.

- CallDetector now emits DetectedCall{app, bundleID, windowID}: the exact CGWindowID of the matched Meet browser window (native apps → nil → largest window).
- VisualCapture wraps VisualObserver + AdapterRegistry and writes visual_timeline.json.
- AudioRecorder.sharedT0Host() exposes the shared t0 for frame alignment.

Hardened per a 3-lens adversarial review (concurrency / failure-isolation / data-flow); all 6 confirmed findings fixed:

- P0 (critical): startVisual could adopt a stale capture into a DIFFERENT session (cross-session SCStream leak + visual_timeline.json written to the wrong folder). Now gated on session identity — generation + recorder ===, still .recording — with fail-closed adoption; otherwise the stream is cancelled.
- P1: the observer captured the browser's largest window, not the detected Meet window. Now targets the exact CGWindowID (pickWindowIndex, unit-tested), with largest-area only as a fallback.
- P2: a startVisual orphaned by a concurrent stop could leak a stream on quit. inFlightVisual is registered before the await and drained in prepareForTermination.
- P3: trailing visual gap/segment ends could exceed duration_sec. Clamped in VisualCapture (clampSegments/clampGaps, unit-tested).
- P4: capture pixel size used the NSScreen.main scale; it now uses the scale of the display actually hosting the window (OCR clarity on secondary displays).
- VisualObserver.stop() bounds stopCapture() with a 3s timeout (mirrors audio) so a wedged stream can't hang finalization.

25/25 XCTests pass. Live validation on real calls is still pending.
2026-06-06 10:18:52 -05:00
parent c347acbd97
commit 880b56e426
6 changed files with 348 additions and 48 deletions
@@ -0,0 +1,83 @@
+import Foundation
+import CoreGraphics
+
+/// Owns the visual side of one recording session: picks the app's adapter, runs a
+/// `VisualObserver` over the call window, and on stop writes `visual_timeline.json`
+/// and returns the speaker segments for the backend hand-off.
+///
+/// Strictly best-effort: if there's no adapter for the app, or the window can't be
+/// captured, the session simply records audio-only — visuals never block or break
+/// the proven audio path. `init?` returns nil when the app has no visual adapter.
+@available(macOS 13.0, *)
+final class VisualCapture {
+    /// The detected call app this capture was created for.
+    let app: CallDetector.DetectedApp
+    private let adapter: any AppAdapter
+    private let observer: VisualObserver
+
+    /// Builds the observer for `app`, sampling at the adapter's preferred FPS.
+    ///
+    /// - Parameters:
+    ///   - app: The detected call app; used to look up its adapter.
+    ///   - bundleID: Bundle identifier forwarded to the observer.
+    ///   - windowID: Exact window to capture, or nil; forwarded to the observer.
+    ///   - t0Host: Time origin forwarded to the observer.
+    /// - Returns: nil when `AdapterRegistry` has no adapter for `app`.
+    init?(app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?, t0Host: Double) {
+        guard let adapter = AdapterRegistry.adapter(for: app) else { return nil }
+        self.app = app
+        self.adapter = adapter
+        self.observer = VisualObserver(bundleID: bundleID, windowID: windowID, adapter: adapter,
+                                       t0Host: t0Host, fps: adapter.preferredFPS)
+    }
+
+    /// Start window capture. Throws if the window isn't capturable (no window yet,
+    /// Screen Recording denied) — the caller catches and falls back to audio-only.
+    func start() async throws {
+        try await observer.start()
+    }
+
+    /// Stop and discard capture without writing anything (used when the session
+    /// ends before capture was fully adopted).
+    func cancel() async {
+        _ = await observer.stop()
+    }
+
+    /// Clamp segment ends to the audio duration; drop any that become empty. Keeps
+    /// `visual_timeline.json` internally consistent and never sends the backend a
+    /// segment longer than the audio. (`duration <= 0` → passthrough.)
+    static func clampSegments(_ segs: [VisualTimeline.Segment], to duration: Double) -> [VisualTimeline.Segment] {
+        guard duration > 0 else { return segs }
+        return segs.compactMap { s in
+            let end = min(s.end, duration)
+            // A segment that starts at or after the clamped end has no length left.
+            guard end > s.start else { return nil }
+            return .init(start: s.start, end: end, name: s.name, confidence: s.confidence, source: s.source)
+        }
+    }
+
+    /// Clamp gap ends to the audio duration; drop any that become empty. Same rule
+    /// as `clampSegments`, applied to gaps. (`duration <= 0` → passthrough.)
+    static func clampGaps(_ gaps: [VisualTimeline.Gap], to duration: Double) -> [VisualTimeline.Gap] {
+        guard duration > 0 else { return gaps }
+        return gaps.compactMap { g in
+            let end = min(g.end, duration)
+            guard end > g.start else { return nil }
+            return .init(start: g.start, end: end, reason: g.reason)
+        }
+    }
+
+    /// Stop capture, fold in the mic-VAD self spans, write `visual_timeline.json`
+    /// into the session folder, and return the merged segments for `label-merge`.
+    ///
+    /// Writing the file is best-effort: a failure is logged and the clamped segments
+    /// are still returned, so the hand-off does not depend on the file being written.
+    ///
+    /// - Parameters:
+    ///   - selfSpans: Mic-VAD spans handed to the observer before it stops.
+    ///   - selfName: Name used for those spans; also marks the matching participant as self.
+    ///   - sessionId: Session identifier recorded in the timeline.
+    ///   - t0Unix: Time origin recorded in the timeline.
+    ///   - durationSec: Audio duration; segment and gap ends are clamped to it.
+    ///   - folder: Session folder that receives `visual_timeline.json`.
+    /// - Returns: The clamped segments, exactly as written to the timeline.
+    func finish(selfSpans: [VADSpan], selfName: String,
+                sessionId: String, t0Unix: Double, durationSec: Double,
+                folder: URL) async -> [VisualTimeline.Segment] {
+        observer.addSelfSpans(selfSpans, selfName: selfName)
+        let (rawSegments, rawGaps) = await observer.stop()
+
+        // The observer stops slightly after audio fixes `durationSec`, so a trailing
+        // gap/segment can run past it. Clamp ends so the JSON is internally consistent
+        // (and we never hand the backend a segment longer than the audio).
+        let segments = Self.clampSegments(rawSegments, to: durationSec)
+        let gaps = Self.clampGaps(rawGaps, to: durationSec)
+
+        // One participant per distinct name, sorted so the output order is stable.
+        let names = Set(segments.map(\.name))
+        let participants = names.sorted().map {
+            VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil)
+        }
+        let timeline = VisualTimeline(
+            sessionId: sessionId, app: app.label, adapterVersion: adapter.adapterVersion,
+            t0Unix: t0Unix, durationSec: durationSec, fpsSampled: adapter.preferredFPS,
+            selfName: selfName, participants: participants, segments: segments, visualGaps: gaps)
+
+        // Still best-effort, but no longer silent: without this a missing
+        // visual_timeline.json leaves no trace of why it wasn't written.
+        let timelineURL = folder.appendingPathComponent("visual_timeline.json")
+        do {
+            try timeline.write(to: timelineURL)
+        } catch {
+            NSLog("VisualCapture: failed to write %@: %@", timelineURL.path, String(describing: error))
+        }
+        return segments
+    }
+}
@@ -14,6 +14,7 @@ import AppKit
@available(macOS 13.0, *)
 final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
    private let bundleID: String
+    private let windowID: CGWindowID?
    private let adapter: any AppAdapter
    private let t0Host: Double
    private let fps: Int
@@ -27,8 +28,9 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
    /// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
    var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?

-    init(bundleID: String, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
+    init(bundleID: String, windowID: CGWindowID? = nil, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
        self.bundleID = bundleID
+        self.windowID = windowID
        self.adapter = adapter
        self.t0Host = t0Host
        self.fps = max(1, fps)
@@ -36,12 +38,15 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {

    func start() async throws {
        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
-        // The call window: the largest window owned by the target app.
        let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }
-        guard let window = candidates.max(by: { $0.frame.width * $0.frame.height < $1.frame.width * $1.frame.height }) else {
+        // Prefer the EXACT detected window (e.g. the Meet browser window) by ID; fall
+        // back to the largest owned window when no ID was supplied or it's gone.
+        guard let idx = Self.pickWindowIndex(candidates.map { ($0.windowID, $0.frame) }, preferredID: windowID),
+              candidates.indices.contains(idx) else {
            throw NSError(domain: "Ten31", code: 2,
                          userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
        }
+        let window = candidates[idx]

        let filter = SCContentFilter(desktopIndependentWindow: window)
        let config = SCStreamConfiguration()
@@ -50,8 +55,9 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
        config.showsCursor = false
        config.pixelFormat = kCVPixelFormatType_32BGRA
        // window.frame is in points; capture at native pixels so OCR can read small
-        // initials/names (a half-res Retina capture badly hurts recognition).
-        let scale = NSScreen.main?.backingScaleFactor ?? 2
+        // initials/names (a half-res Retina capture badly hurts recognition). Use the
+        // scale of the display the window is actually on, not always the main screen.
+        let scale = Self.backingScale(forWindowFrame: window.frame)
        config.width = max(2, Int(window.frame.width * scale))
        config.height = max(2, Int(window.frame.height * scale))

@@ -61,8 +67,36 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
        self.stream = stream
    }

+    /// Decide which of `candidates` should be captured. A window whose ID equals
+    /// `preferredID` always wins; otherwise the window with the greatest area is
+    /// chosen. The result is an index into `candidates`, or nil when the list is
+    /// empty. Works on plain tuples only, so it can be unit-tested without
+    /// ScreenCaptureKit.
+    static func pickWindowIndex(_ candidates: [(id: CGWindowID, frame: CGRect)],
+                                preferredID: CGWindowID?) -> Int? {
+        if candidates.isEmpty { return nil }
+
+        // An exact ID match takes priority over size.
+        if let wanted = preferredID {
+            for (position, candidate) in candidates.enumerated() where candidate.id == wanted {
+                return position
+            }
+        }
+
+        // No ID supplied, or that window is gone: fall back to the largest by area.
+        func area(at position: Int) -> CGFloat {
+            let rect = candidates[position].frame
+            return rect.width * rect.height
+        }
+        return candidates.indices.max { area(at: $0) < area(at: $1) }
+    }
+
+    /// Returns the backing scale factor of the screen that holds the centre of
+    /// `frame`. Per the original note here, SCWindow.frame uses global display
+    /// points with a top-left origin while NSScreen uses a bottom-left origin, so
+    /// the centre's y is flipped through the primary screen's height before the
+    /// containment test. Falls back to the main screen, then the primary screen,
+    /// and to 2 when no screen is available at all.
+    private static func backingScale(forWindowFrame frame: CGRect) -> CGFloat {
+        let allScreens = NSScreen.screens
+        guard let primary = allScreens.first else {
+            // No screens reported: use the main screen's scale if there is one.
+            return NSScreen.main?.backingScaleFactor ?? 2
+        }
+
+        let flippedY = primary.frame.maxY - frame.midY
+        let center = CGPoint(x: frame.midX, y: flippedY)
+        if let host = allScreens.first(where: { $0.frame.contains(center) }) {
+            return host.backingScaleFactor
+        }
+        // Centre is on no known screen (e.g. window mostly off-screen).
+        return (NSScreen.main ?? primary).backingScaleFactor
+    }
+
    func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
-        if let stream { try? await stream.stopCapture() }
+        // Bound stopCapture: an already-errored SCStream can block forever, which
+        // would wedge session finalization in `.finishing`. Mirror AudioRecorder.
+        if let stream { await Self.stopCaptureWithTimeout(stream, seconds: 3) }
        stream = nil
        return queue.sync {
            if let gs = gapStart {
@@ -113,6 +147,17 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {

    /// `SCStreamDelegate` callback for a stream that stopped with an error.
    /// The body is empty: the error is neither recorded nor surfaced from here.
    /// NOTE(review): confirm nothing else needs to learn that the stream died mid-call.
    func stream(_ stream: SCStream, didStopWithError error: Error) {}

+    /// Proceed as soon as `stopCapture()` returns OR the timeout fires, so a wedged
+    /// stream can't block forever.
+    ///
+    /// Deliberately NOT a task group: `withTaskGroup` only returns once every child
+    /// task has finished, even after `cancelAll()`, and cancellation is cooperative.
+    /// A `stopCapture()` that never completes and doesn't react to cancellation would
+    /// therefore keep the caller suspended past the timeout. Here the stop and the
+    /// timer run as unstructured tasks and whichever finishes first resumes the
+    /// caller exactly once; a stop that never returns is simply abandoned.
+    ///
+    /// - Parameters:
+    ///   - stream: The stream to stop. Errors from `stopCapture()` are ignored.
+    ///   - seconds: Upper bound on how long to wait; negative values count as 0.
+    private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async {
+        /// Resumes the wrapped continuation at most once, from any thread.
+        final class ResumeOnce: @unchecked Sendable {
+            private let lock = NSLock()
+            private var continuation: CheckedContinuation<Void, Never>?
+
+            init(_ continuation: CheckedContinuation<Void, Never>) {
+                self.continuation = continuation
+            }
+
+            func fire() {
+                lock.lock()
+                let pending = continuation
+                continuation = nil
+                lock.unlock()
+                pending?.resume()
+            }
+        }
+
+        await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
+            let gate = ResumeOnce(continuation)
+            let timer = Task {
+                try? await Task.sleep(nanoseconds: UInt64(max(0, seconds) * 1_000_000_000))
+                gate.fire()
+            }
+            Task {
+                try? await stream.stopCapture()
+                // Stop finished first: end the timer early. Its later fire() is a no-op.
+                timer.cancel()
+                gate.fire()
+            }
+        }
+    }
+
    /// Classification with three cases: `live`, `idle` and `gap`.
    /// NOTE(review): the code that assigns these is outside this view; presumably
    /// `live` = frame with new content, `idle` = unchanged window, `gap` = no usable
    /// frame — confirm against the classifier.
    private enum FrameKind { case live, idle, gap }

    /// SCK delivers `.complete` only when content changes, `.idle` for a static