Wire visual capture into the recording lifecycle (failure-isolated)

Visual capture now runs alongside audio: on call start the session picks the app's adapter, captures the call window on the SAME monotonic clock as the audio (AudioRecorder.sharedT0Host), and on stop writes visual_timeline.json and hands the backend the visual segments with mic-VAD self-spans merged. Any visual failure (no adapter, no window, Screen Recording denied) leaves the session recording audio-only — the proven path is never blocked or broken.

- CallDetector now emits DetectedCall{app, bundleID, windowID}: the exact CGWindowID of the matched Meet browser window (native apps → nil → largest).
- VisualCapture wraps VisualObserver + AdapterRegistry, writes visual_timeline.json.
- AudioRecorder.sharedT0Host() exposes the shared t0 for frame alignment.

Hardened per a 3-lens adversarial review (concurrency / failure-isolation / data-flow), all 6 confirmed findings fixed:

- P0 (critical): startVisual could adopt a stale capture into a DIFFERENT session (cross-session SCStream leak + visual_timeline.json written to the wrong folder). Now gated on session identity — generation + recorder ===, still .recording — with fail-closed adoption; otherwise the stream is cancelled.
- P1: observer captured the browser's largest window, not the detected Meet window. Now targets the exact CGWindowID (pickWindowIndex, unit-tested), largest-area only as fallback.
- P2: a startVisual orphaned by a concurrent stop could leak a stream on quit. inFlightVisual is registered before the await and drained in prepareForTermination.
- P3: trailing visual gap/segment ends could exceed duration_sec. Clamped in VisualCapture (clampSegments/clampGaps, unit-tested).
- P4: capture pixel size used NSScreen.main scale; now uses the scale of the display actually hosting the window (OCR clarity on secondary displays).
- VisualObserver.stop() bounds stopCapture() with a 3s timeout (mirrors audio) so a wedged stream can't hang finalization.

25/25 XCTest pass. Live validation on real calls still pending.
2026-06-06 10:18:52 -05:00
parent c347acbd97
commit 880b56e426
6 changed files with 348 additions and 48 deletions
@@ -88,6 +88,11 @@ final class AudioRecorder: NSObject, SCStreamDelegate, SCStreamOutput {
        }
    }

+    /// Returns `t0Host`, the monotonic reference instant (`CACurrentMediaTime`
+    /// base) recorded by `start()`, so that visual frames can be stamped on the
+    /// very same clock as the audio. The value is read synchronously on
+    /// `ioQueue`; it is only meaningful once `start()` has returned.
+    func sharedT0Host() -> Double {
+        let t0: Double = ioQueue.sync { self.t0Host }
+        return t0
+    }
+
    func stop() async -> RecordingResult {
        // Stop the mic FIRST — always succeeds and halts mic capture immediately.
        engine?.inputNode.removeTap(onBus: 0)
@@ -26,6 +26,16 @@ final class CallDetector: ObservableObject {
        }
    }

+    /// A detected call plus what to capture for visuals: the bundle ID of the owner
+    /// (the native app for Zoom/Teams/Signal, or the *browser* hosting the Meet tab)
+    /// and — for Meet — the exact `CGWindowID` of the matched call window, so the
+    /// observer captures that window instead of guessing the browser's largest one.
+    ///
+    /// Member order is part of the contract: callers build values through the
+    /// memberwise initializer `DetectedCall(app:bundleID:windowID:)`.
+    struct DetectedCall: Equatable {
+        /// The call app that was recognised (used for the session label and status).
+        let app: DetectedApp
+        /// Bundle identifier of the process owning the call window: the native app
+        /// itself, or the browser when the call is a Meet tab.
+        let bundleID: String
+        /// Window to pin visual capture to. Detection sets this only for Meet (the
+        /// matched browser window); native apps pass nil, meaning "use the largest
+        /// window owned by `bundleID`".
+        let windowID: CGWindowID?
+    }
+
    enum Status: Equatable {
        case disabled
        case listening
@@ -34,7 +44,7 @@ final class CallDetector: ObservableObject {

    @Published private(set) var status: Status = .disabled

-    var onCallStart: ((DetectedApp) -> Void)?
+    var onCallStart: ((DetectedCall) -> Void)?
    var onCallEnd: (() -> Void)?

    private let mic = MicActivityMonitor()
@@ -42,7 +52,7 @@ final class CallDetector: ObservableObject {
    private var openTimer: Timer?
    private var closeTimer: Timer?
    private var inCall = false
-    private var currentApp: DetectedApp?
+    private var currentCall: DetectedCall?
    private var enabled = false

    private let openDelay: TimeInterval = 2.0
@@ -79,7 +89,7 @@ final class CallDetector: ObservableObject {
        pollTimer?.invalidate(); pollTimer = nil
        cancelOpen(); cancelClose()
        inCall = false
-        currentApp = nil
+        currentCall = nil
        status = .disabled
    }

@@ -92,8 +102,8 @@ final class CallDetector: ObservableObject {
        if let candidate {
            cancelClose()
            if inCall {
-                currentApp = candidate
-                status = .inCall(candidate)
+                currentCall = candidate
+                status = .inCall(candidate.app)
            } else if openTimer == nil {
                openTimer = Timer.scheduledTimer(withTimeInterval: openDelay, repeats: false) { [weak self] _ in
                    Task { @MainActor in self?.fireOpen() }
@@ -112,18 +122,18 @@ final class CallDetector: ObservableObject {
    private func fireOpen() {
        openTimer = nil
        // Re-resolve the app at fire time (the debounce window may have changed it).
-        guard enabled, mic.isRunning, let app = detectApp(), !inCall else { return }
+        guard enabled, mic.isRunning, let call = detectApp(), !inCall else { return }
        inCall = true
-        currentApp = app
-        status = .inCall(app)
-        onCallStart?(app)
+        currentCall = call
+        status = .inCall(call.app)
+        onCallStart?(call)
    }

    private func fireClose() {
        closeTimer = nil
        guard enabled, inCall else { return }
        inCall = false
-        currentApp = nil
+        currentCall = nil
        status = .listening
        onCallEnd?()
    }
@@ -137,7 +147,7 @@ final class CallDetector: ObservableObject {
    /// On macOS 14+ we attribute mic usage per-process (robust start AND stop,
    /// works for Signal/Zoom/Teams/Meet, ignores our own recording). On macOS 13
    /// we fall back to the per-app call-window heuristic.
-    private func detectApp() -> DetectedApp? {
+    private func detectApp() -> DetectedCall? {
        if #available(macOS 14.0, *) {
            return detectViaMicAttribution()
        }
@@ -145,7 +155,7 @@ final class CallDetector: ObservableObject {
    }

    @available(macOS 14.0, *)
-    private func detectViaMicAttribution() -> DetectedApp? {
+    private func detectViaMicAttribution() -> DetectedCall? {
        let micPIDs = AudioInputProcesses.micUsingPIDs()
        guard !micPIDs.isEmpty else { return nil }
        let selfPID = NSRunningApplication.current.processIdentifier
@@ -154,39 +164,42 @@ final class CallDetector: ObservableObject {
            let pid = app.processIdentifier
            guard pid != selfPID, micPIDs.contains(pid), let id = app.bundleIdentifier else { continue }
            if let native = Self.nativeApps.first(where: { $0.id == id }) {
-                return native.app          // Signal/Zoom/Teams using the mic = in a call
+                return DetectedCall(app: native.app, bundleID: id, windowID: nil)   // native: capture largest owned window
            }
            // A browser using the mic + a Meet window = a Meet call. The mic state
            // gives reliable start/stop; the window check keeps non-Meet browser
            // mic use (other web apps) from being mislabeled as a Meet recording.
-            if Self.browserIDs.contains(id), pidHasMeetWindow(pid) {
-                return .meet
+            // Capture that exact browser window (by ID), not just the browser.
+            if Self.browserIDs.contains(id), let wid = meetWindowID(pid) {
+                return DetectedCall(app: .meet, bundleID: id, windowID: wid)
            }
        }
        return nil
    }

-    private func pidHasMeetWindow(_ pid: pid_t) -> Bool {
+    /// The `CGWindowID` of this PID's Google Meet call window (title "Meet - …"),
+    /// or nil if none — also serves as the "is this a Meet call?" check.
+    private func meetWindowID(_ pid: pid_t) -> CGWindowID? {
        guard let info = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]]
-        else { return false }
+        else { return nil }
        for w in info {
            guard let wpid = w[kCGWindowOwnerPID as String] as? pid_t, wpid == pid,
-                  let title = w[kCGWindowName as String] as? String else { continue }
-            if Self.looksLikeMeet(title) { return true }
+                  let title = w[kCGWindowName as String] as? String, Self.looksLikeMeet(title) else { continue }
+            return w[kCGWindowNumber as String] as? CGWindowID
        }
-        return false
+        return nil
    }

    /// macOS 13 fallback: detect by the presence of a call WINDOW per app.
-    private func detectViaWindowTitle() -> DetectedApp? {
-        var pidToApp: [pid_t: DetectedApp] = [:]
-        var browserPIDs = Set<pid_t>()
+    private func detectViaWindowTitle() -> DetectedCall? {
+        var pidToApp: [pid_t: (app: DetectedApp, id: String)] = [:]
+        var browserPIDs: [pid_t: String] = [:]
        for app in NSWorkspace.shared.runningApplications {
            guard let id = app.bundleIdentifier else { continue }
            if let native = Self.nativeApps.first(where: { $0.id == id }) {
-                pidToApp[app.processIdentifier] = native.app
+                pidToApp[app.processIdentifier] = (native.app, id)
            } else if Self.browserIDs.contains(id) {
-                browserPIDs.insert(app.processIdentifier)
+                browserPIDs[app.processIdentifier] = id
            }
        }
        guard !pidToApp.isEmpty || !browserPIDs.isEmpty else { return nil }
@@ -197,8 +210,13 @@ final class CallDetector: ObservableObject {
            guard let pid = info[kCGWindowOwnerPID as String] as? pid_t,
                  let title = info[kCGWindowName as String] as? String,
                  !title.isEmpty else { continue }
-            if browserPIDs.contains(pid), Self.looksLikeMeet(title) { return .meet }
-            if let app = pidToApp[pid], Self.isCallWindow(app, title) { return app }
+            if let id = browserPIDs[pid], Self.looksLikeMeet(title) {
+                return DetectedCall(app: .meet, bundleID: id,
+                                    windowID: info[kCGWindowNumber as String] as? CGWindowID)
+            }
+            if let native = pidToApp[pid], Self.isCallWindow(native.app, title) {
+                return DetectedCall(app: native.app, bundleID: native.id, windowID: nil)
+            }
        }
        return nil
    }
@@ -62,11 +62,20 @@ final class SessionController: ObservableObject {
        let sessionId: String
        let app: String
        let mixedURL: URL
-        let selfSpans: [VADSpan]
+        let timeline: [VisualTimeline.Segment]
    }
    private var lastProcess: ProcessInputs?
    private var processTask: Task<Void, Never>?
    private var recorder: AudioRecorder?
+    /// Visual capture for the current session (nil for manual recordings, apps with
+    /// no adapter, or when the window can't be captured — those record audio-only).
+    private var visualCapture: VisualCapture?
+    /// A visual capture whose `start()` is in flight (registered before the await),
+    /// so `prepareForTermination` can tear it down if its start-Task is orphaned.
+    private var inFlightVisual: VisualCapture?
+    /// App + capture target to start visual capture for, set at `start()`. `windowID`
+    /// pins the exact detected window (e.g. the Meet browser window); nil → largest.
+    private var pendingCapture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)?
    private var currentFolder: URL?
    private var startTime: Date?
    private var timer: Timer?
@@ -86,7 +95,7 @@ final class SessionController: ObservableObject {
            fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json"))
        SessionController.shared = self

-        detector.onCallStart = { [weak self] app in self?.handleCallStart(app) }
+        detector.onCallStart = { [weak self] call in self?.handleCallStart(call) }
        detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
        detector.$status
            .sink { [weak self] status in self?.detectionStatus = status }
@@ -124,10 +133,11 @@ final class SessionController: ObservableObject {

    // MARK: - Auto-detection

-    private func handleCallStart(_ app: CallDetector.DetectedApp) {
+    private func handleCallStart(_ call: CallDetector.DetectedCall) {
        guard settings.autoRecordOnDetection else { return }
        switch state {
-        case .idle, .error: start(label: app.label, auto: true)
+        case .idle, .error:
+            start(label: call.app.label, auto: true, capture: (call.app, call.bundleID, call.windowID))
        case .starting, .recording, .finishing: break   // don't disturb an active session
        }
    }
@@ -156,7 +166,8 @@ final class SessionController: ObservableObject {

    // MARK: - Start / Stop

-    private func start(label: String = "manual", auto: Bool = false) {
+    private func start(label: String = "manual", auto: Bool = false,
+                       capture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)? = nil) {
        let folder: URL
        do {
            folder = try makeSessionFolder(label: label)
@@ -168,6 +179,7 @@ final class SessionController: ObservableObject {
        currentLabel = label
        autoStarted = auto
        pendingAutoStop = false
+        pendingCapture = capture
        let recorder = AudioRecorder(
            micURL: folder.appendingPathComponent("mic.wav"),
            systemURL: folder.appendingPathComponent("system.wav"),
@@ -177,6 +189,7 @@ final class SessionController: ObservableObject {
        state = .starting

        lifecycleGeneration += 1
+        let myGen = lifecycleGeneration
        lifecycleTask = Task {
            do {
                try await recorder.start()        // self-tears-down if it throws
@@ -187,7 +200,12 @@ final class SessionController: ObservableObject {
                if self.pendingAutoStop {
                    self.pendingAutoStop = false
                    self.stop()
+                    return
                }
+                // Attach visual capture on the SAME clock (best-effort, audio-only on failure).
+                // Pass this session's generation + recorder so a slow start can't
+                // adopt itself into a different session that began meanwhile.
+                await self.startVisual(t0Host: recorder.sharedT0Host(), generation: myGen, recorder: recorder)
            } catch {
                self.handleStartFailure(error)
            }
@@ -213,18 +231,71 @@ final class SessionController: ObservableObject {
        }
    }

+    // MARK: - Visual capture
+
+    /// Best-effort: start window capture for the detected app on the audio clock.
+    /// Any failure (no adapter, no window, Screen Recording denied) leaves
+    /// `visualCapture` nil and the session records audio-only.
+    ///
+    /// `generation`/`recorder` identify the session that launched this; because
+    /// `vc.start()` is a slow async call, a stop + a fresh start can complete during
+    /// it. We adopt the stream ONLY back into the same session — otherwise we cancel
+    /// it, so a stale capture can never attach to (or leak into) a different session.
+    ///
+    /// - Parameters:
+    ///   - t0Host: The audio recorder's shared t0, handed to the observer so frames
+    ///     are timestamped on the same clock as the audio.
+    ///   - generation: `lifecycleGeneration` as captured when the session started.
+    ///   - recorder: The recorder of the session that requested visual capture.
+    private func startVisual(t0Host: Double, generation: Int, recorder: AudioRecorder) async {
+        guard let capture = pendingCapture else { return }   // manual recording → audio-only
+        pendingCapture = nil
+        guard let vc = VisualCapture(app: capture.app, bundleID: capture.bundleID,
+                                     windowID: capture.windowID, t0Host: t0Host) else { return }
+        // Register the live capture before the await so a quit (prepareForTermination)
+        // can drain it even if this start-Task gets orphaned by a concurrent stop.
+        inFlightVisual = vc
+        defer { if inFlightVisual === vc { inFlightVisual = nil } }
+        do {
+            try await vc.start()
+        } catch {
+            await vc.cancel()     // tear down any partial stream; never break recording
+            return
+        }
+        // Adopt only if THIS session still owns the slot (same generation, same
+        // recorder, still recording); otherwise discard rather than leak/misattach.
+        guard generation == lifecycleGeneration, self.recorder === recorder,
+              case .recording = state else {
+            await vc.cancel()
+            return
+        }
+        // Store FIRST, cancel the displaced capture SECOND. The identity check above
+        // is only valid until the next suspension point; awaiting `cancel()` before
+        // the assignment would let a stop slip in between check and adoption and
+        // leave `vc` attached to a session that has already finished.
+        // NOTE(review): relies on this type's state being confined to one actor
+        // (presumably the main actor) — confirm against the class declaration.
+        let displaced = visualCapture
+        visualCapture = vc
+        // Ownership has moved to `visualCapture`; drop the in-flight registration so
+        // a quit during the await below cannot tear the adopted capture down twice.
+        if inFlightVisual === vc { inFlightVisual = nil }
+        if let displaced { await displaced.cancel() }   // fail-closed
+    }
+
+    /// Stop visual capture (if any), write `visual_timeline.json`, and return the
+    /// timeline for the backend: visual segments + merged self-spans when visual
+    /// ran, otherwise the mic-VAD self spans alone.
+    ///
+    /// - Parameters:
+    ///   - result: The finished audio recording (self spans, t0, duration).
+    ///   - folder: The session folder to write into; when nil the capture is
+    ///     cancelled and nothing is written.
+    /// - Returns: The timeline segments to hand to the backend.
+    private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?) async -> [VisualTimeline.Segment] {
+        let selfName = settings.selfName
+        // Claim the capture and clear the slot BEFORE any suspension point, on both
+        // paths. Clearing after an `await` could wipe out a capture that was stored
+        // in the slot while this task was suspended.
+        let claimed = visualCapture
+        visualCapture = nil
+        if let vc = claimed {
+            if let folder {
+                return await vc.finish(
+                    selfSpans: result.selfSpans, selfName: selfName,
+                    sessionId: folder.lastPathComponent, t0Unix: result.t0Unix,
+                    durationSec: result.duration, folder: folder)
+            }
+            // No folder to write into: tear the stream down and fall back to audio-only.
+            await vc.cancel()
+        }
+        return TranscriptPipeline.timeline(fromSelfSpans: result.selfSpans, selfName: selfName)
+    }
+
    private func stop() {
        guard let recorder else { return }
        state = .finishing
        stopTimer()
+        let folder = currentFolder
        lifecycleGeneration += 1
        lifecycleTask = Task {
            let result = await recorder.stop()
-            self.finish(result)
+            let timeline = await self.stopVisualAndTimeline(result, folder: folder)
+            self.finish(result, timeline: timeline)
        }
    }

-    private func finish(_ result: RecordingResult) {
+    private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment]) {
        recorder = nil
        micLevel = 0
        systemLevel = 0
@@ -237,7 +308,7 @@ final class SessionController: ObservableObject {
                duration: result.duration, selfSpanCount: result.selfSpans.count)
            lastProcess = ProcessInputs(
                folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
-                mixedURL: result.mixedURL, selfSpans: result.selfSpans)
+                mixedURL: result.mixedURL, timeline: timeline)
        }
        let autoSend = settings.autoSendOnStop
        currentFolder = nil
@@ -250,10 +321,10 @@ final class SessionController: ObservableObject {

    // MARK: - Backend transcription

-    /// Send the last finished session to the backend → `speakers.json`. Uses the
-    /// mic-VAD self spans as the timeline for now; visual segments (Phase 3–4) get
-    /// merged in once the adapters land. Safe to call manually ("Send to backend")
-    /// or automatically on stop.
+    /// Send the last finished session to the backend → `speakers.json`. The
+    /// timeline is the session's visual segments (with mic-VAD self spans merged)
+    /// when visual capture ran, or the self spans alone otherwise. Safe to call
+    /// manually ("Send to backend") or automatically on stop.
    func processLastSession() {
        guard let inputs = lastProcess else { return }
        if case .processing = transcriptStatus { return }
@@ -266,8 +337,7 @@ final class SessionController: ObservableObject {
                baseURL: settings.backendBaseURL,
                skipTLS: settings.skipTLSVerification,
                voiceprints: voiceprints)
-            let timeline = TranscriptPipeline.timeline(
-                fromSelfSpans: inputs.selfSpans, selfName: settings.selfName)
+            let timeline = inputs.timeline
            do {
                let speakers = try await pipeline.process(
                    sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
@@ -286,6 +356,9 @@ final class SessionController: ObservableObject {

    private func fail(_ message: String) {
        recorder = nil
+        visualCapture = nil   // recorder.start() failed before visual started; nothing running
+        inFlightVisual = nil
+        pendingCapture = nil
        currentFolder = nil
        autoStarted = false
        pendingAutoStop = false
@@ -312,11 +385,20 @@ final class SessionController: ObservableObject {
            if state == .recording, let recorder {
                state = .finishing
                stopTimer()
-                finish(await recorder.stop())
+                let folder = currentFolder
+                let result = await recorder.stop()
+                let timeline = await stopVisualAndTimeline(result, folder: folder)
+                finish(result, timeline: timeline)
            } else if lifecycleGeneration == gen {
                break   // settled: no new transition was spawned
            }
        }
+        // A visual start-Task orphaned by a concurrent stop may still hold a live
+        // stream that nothing else will tear down before exit — drain it here.
+        if let vc = inFlightVisual {
+            inFlightVisual = nil
+            await vc.cancel()
+        }
    }

    // MARK: - Timer
@@ -0,0 +1,83 @@
+import Foundation
+import CoreGraphics
+
+/// The visual half of a single recording session. It resolves the adapter for
+/// the detected app, drives a `VisualObserver` over the call window, and when
+/// the session ends writes `visual_timeline.json` and hands back the speaker
+/// segments destined for the backend.
+///
+/// Everything here is best-effort: an app without an adapter, or a window that
+/// cannot be captured, simply means the session stays audio-only. The failable
+/// initializer yields nil when no adapter is registered for the app.
+@available(macOS 13.0, *)
+final class VisualCapture {
+    let app: CallDetector.DetectedApp
+    private let adapter: any AppAdapter
+    private let observer: VisualObserver
+
+    init?(app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?, t0Host: Double) {
+        guard let resolved = AdapterRegistry.adapter(for: app) else { return nil }
+        self.app = app
+        adapter = resolved
+        observer = VisualObserver(bundleID: bundleID, windowID: windowID, adapter: resolved,
+                                  t0Host: t0Host, fps: resolved.preferredFPS)
+    }
+
+    /// Begin capturing the call window. Errors from the observer (no window yet,
+    /// Screen Recording denied) propagate so the caller can fall back to audio-only.
+    func start() async throws {
+        try await observer.start()
+    }
+
+    /// Tear capture down and throw the collected data away; nothing is written.
+    /// Used when the session ends before this capture was fully adopted.
+    func cancel() async {
+        let _ = await observer.stop()
+    }
+
+    /// Trim every segment so it ends no later than `duration`, discarding segments
+    /// left with no positive length. A non-positive `duration` means "unknown" and
+    /// returns the input untouched.
+    static func clampSegments(_ segs: [VisualTimeline.Segment], to duration: Double) -> [VisualTimeline.Segment] {
+        guard duration > 0 else { return segs }
+        var kept: [VisualTimeline.Segment] = []
+        for seg in segs {
+            let clampedEnd = min(seg.end, duration)
+            if clampedEnd > seg.start {
+                kept.append(VisualTimeline.Segment(start: seg.start, end: clampedEnd, name: seg.name,
+                                                   confidence: seg.confidence, source: seg.source))
+            }
+        }
+        return kept
+    }
+
+    /// Same trimming rule as `clampSegments`, applied to visual gaps.
+    static func clampGaps(_ gaps: [VisualTimeline.Gap], to duration: Double) -> [VisualTimeline.Gap] {
+        guard duration > 0 else { return gaps }
+        var kept: [VisualTimeline.Gap] = []
+        for gap in gaps {
+            let clampedEnd = min(gap.end, duration)
+            if clampedEnd > gap.start {
+                kept.append(VisualTimeline.Gap(start: gap.start, end: clampedEnd, reason: gap.reason))
+            }
+        }
+        return kept
+    }
+
+    /// End capture: merge in the mic-VAD self spans, persist `visual_timeline.json`
+    /// in `folder`, and return the merged, duration-clamped segments.
+    func finish(selfSpans: [VADSpan], selfName: String,
+                sessionId: String, t0Unix: Double, durationSec: Double,
+                folder: URL) async -> [VisualTimeline.Segment] {
+        observer.addSelfSpans(selfSpans, selfName: selfName)
+        let stopped = await observer.stop()
+
+        // The observer is stopped a little after the audio duration is fixed, so the
+        // last segment/gap may overshoot it; trim so the file agrees with itself and
+        // the backend never receives a segment longer than the audio.
+        let segments = Self.clampSegments(stopped.segments, to: durationSec)
+        let gaps = Self.clampGaps(stopped.gaps, to: durationSec)
+
+        // One participant per distinct speaker name, in sorted order.
+        var participants: [VisualTimeline.Participant] = []
+        for name in Set(segments.map(\.name)).sorted() {
+            participants.append(
+                VisualTimeline.Participant(name: name, isSelf: name == selfName ? true : nil, aliases: nil))
+        }
+
+        let timeline = VisualTimeline(
+            sessionId: sessionId, app: app.label, adapterVersion: adapter.adapterVersion,
+            t0Unix: t0Unix, durationSec: durationSec, fpsSampled: adapter.preferredFPS,
+            selfName: selfName, participants: participants, segments: segments, visualGaps: gaps)
+        let destination = folder.appendingPathComponent("visual_timeline.json")
+        try? timeline.write(to: destination)   // best-effort: a failed write must not break the session
+        return segments
+    }
+}
@@ -14,6 +14,7 @@ import AppKit
@available(macOS 13.0, *)
 final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
    private let bundleID: String
+    private let windowID: CGWindowID?
    private let adapter: any AppAdapter
    private let t0Host: Double
    private let fps: Int
@@ -27,8 +28,9 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
    /// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
    var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?

-    init(bundleID: String, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
+    init(bundleID: String, windowID: CGWindowID? = nil, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
        self.bundleID = bundleID
+        self.windowID = windowID
        self.adapter = adapter
        self.t0Host = t0Host
        self.fps = max(1, fps)
@@ -36,12 +38,15 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {

    func start() async throws {
        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
-        // The call window: the largest window owned by the target app.
        let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }
-        guard let window = candidates.max(by: { $0.frame.width * $0.frame.height < $1.frame.width * $1.frame.height }) else {
+        // Prefer the EXACT detected window (e.g. the Meet browser window) by ID; fall
+        // back to the largest owned window when no ID was supplied or it's gone.
+        guard let idx = Self.pickWindowIndex(candidates.map { ($0.windowID, $0.frame) }, preferredID: windowID),
+              candidates.indices.contains(idx) else {
            throw NSError(domain: "Ten31", code: 2,
                          userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
        }
+        let window = candidates[idx]

        let filter = SCContentFilter(desktopIndependentWindow: window)
        let config = SCStreamConfiguration()
@@ -50,8 +55,9 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
        config.showsCursor = false
        config.pixelFormat = kCVPixelFormatType_32BGRA
        // window.frame is in points; capture at native pixels so OCR can read small
-        // initials/names (a half-res Retina capture badly hurts recognition).
-        let scale = NSScreen.main?.backingScaleFactor ?? 2
+        // initials/names (a half-res Retina capture badly hurts recognition). Use the
+        // scale of the display the window is actually on, not always the main screen.
+        let scale = Self.backingScale(forWindowFrame: window.frame)
        config.width = max(2, Int(window.frame.width * scale))
        config.height = max(2, Int(window.frame.height * scale))

@@ -61,8 +67,36 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
        self.stream = stream
    }

+    /// Decide which of `candidates` should be captured. A window whose id equals
+    /// `preferredID` wins outright; failing that (no id given, or no match) the
+    /// window with the greatest area is chosen, the earliest one on a tie.
+    /// Returns an index into `candidates`, or nil when the list is empty. Works on
+    /// plain ids/rects so it can be unit-tested without ScreenCaptureKit.
+    static func pickWindowIndex(_ candidates: [(id: CGWindowID, frame: CGRect)],
+                                preferredID: CGWindowID?) -> Int? {
+        if candidates.isEmpty { return nil }
+        if let wanted = preferredID {
+            for (position, candidate) in candidates.enumerated() where candidate.id == wanted {
+                return position
+            }
+        }
+        var best = 0
+        var bestArea = candidates[0].frame.width * candidates[0].frame.height
+        for position in 1..<candidates.count {
+            let area = candidates[position].frame.width * candidates[position].frame.height
+            // Strict comparison keeps the first of several equally large windows.
+            if bestArea < area {
+                best = position
+                bestArea = area
+            }
+        }
+        return best
+    }
+
+    /// Backing scale factor of whichever display contains the centre of `frame`.
+    /// The incoming frame uses global top-left-origin points while `NSScreen`
+    /// frames are bottom-left-origin, so the centre's y is mirrored through the
+    /// first screen's `maxY` before the containment test. Falls back to the main
+    /// screen, then the first screen, and to 2 when no screen can be queried.
+    private static func backingScale(forWindowFrame frame: CGRect) -> CGFloat {
+        let displays = NSScreen.screens
+        guard let primary = displays.first else {
+            return NSScreen.main?.backingScaleFactor ?? 2
+        }
+        let flippedCenter = CGPoint(x: frame.midX, y: primary.frame.maxY - frame.midY)
+        for display in displays where display.frame.contains(flippedCenter) {
+            return display.backingScaleFactor
+        }
+        return (NSScreen.main ?? primary).backingScaleFactor
+    }
+
    func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
-        if let stream { try? await stream.stopCapture() }
+        // Bound stopCapture: an already-errored SCStream can block forever, which
+        // would wedge session finalization in `.finishing`. Mirror AudioRecorder.
+        if let stream { await Self.stopCaptureWithTimeout(stream, seconds: 3) }
        stream = nil
        return queue.sync {
            if let gs = gapStart {
@@ -113,6 +147,17 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {

    func stream(_ stream: SCStream, didStopWithError error: Error) {}

+    /// Proceed as soon as `stopCapture()` returns OR the timeout fires, so a wedged
+    /// stream can't block forever.
+    ///
+    /// This deliberately does NOT use a task group: a task group only returns once
+    /// every child task has finished, and cancellation is cooperative, so a
+    /// `stopCapture()` that never returns would keep the group (and the caller)
+    /// waiting no matter what the timeout child did. Instead two unstructured tasks
+    /// race to resume a single continuation; whichever loses becomes a no-op.
+    ///
+    /// - Parameters:
+    ///   - stream: The stream to stop.
+    ///   - seconds: Upper bound on how long to wait; negative values are treated as 0.
+    private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async {
+        /// Resumes the waiting caller exactly once, whichever racer gets there first.
+        final class OneShot: @unchecked Sendable {
+            private let lock = NSLock()
+            private var continuation: CheckedContinuation<Void, Never>?
+            init(_ continuation: CheckedContinuation<Void, Never>) { self.continuation = continuation }
+            func fire() {
+                lock.lock()
+                let pending = continuation
+                continuation = nil
+                lock.unlock()
+                pending?.resume()
+            }
+        }
+        let timeoutNanos = UInt64(max(0, seconds) * 1_000_000_000)
+        await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
+            let gate = OneShot(continuation)
+            let timeout = Task {
+                try? await Task.sleep(nanoseconds: timeoutNanos)
+                gate.fire()
+            }
+            Task {
+                // If this never returns the task is abandoned; the caller has
+                // already been released by the timeout.
+                try? await stream.stopCapture()
+                timeout.cancel()   // stop finished first: no need to keep sleeping
+                gate.fire()
+            }
+        }
+    }
+
    private enum FrameKind { case live, idle, gap }

    /// SCK delivers `.complete` only when content changes, `.idle` for a static
@@ -0,0 +1,67 @@
+import XCTest
+import CoreGraphics
+@testable import Ten31Transcripts
+
+/// Window-selection logic: prefer the exact detected window (e.g. the Meet browser
+/// window) by ID, else fall back to the largest owned window. This is the fix for
+/// the "captures the wrong browser window" data-flow bug. Also covers the clamping
+/// of visual segments/gaps to the audio duration.
+final class VisualObserverTests: XCTestCase {
+
+    /// Builds a candidate window tuple with the given id and size at the origin.
+    private func c(_ id: CGWindowID, _ w: CGFloat, _ h: CGFloat) -> (id: CGWindowID, frame: CGRect) {
+        (id, CGRect(x: 0, y: 0, width: w, height: h))
+    }
+
+    func testPrefersMatchingWindowIDOverLargest() {
+        // The Meet window (id 42) is NOT the largest — must still be chosen by ID.
+        let candidates = [c(7, 1600, 1000), c(42, 800, 600), c(9, 1200, 900)]
+        let idx = VisualObserver.pickWindowIndex(candidates, preferredID: 42)
+        XCTAssertEqual(idx, 1)
+    }
+
+    func testFallsBackToLargestWhenNoPreferredID() {
+        let candidates = [c(7, 800, 600), c(9, 1600, 1000), c(11, 1200, 900)]
+        let idx = VisualObserver.pickWindowIndex(candidates, preferredID: nil)
+        XCTAssertEqual(idx, 1)   // the 1600x1000 window
+    }
+
+    func testFallsBackToLargestWhenPreferredIDMissing() {
+        let candidates = [c(7, 800, 600), c(9, 1600, 1000)]
+        let idx = VisualObserver.pickWindowIndex(candidates, preferredID: 999)   // gone
+        XCTAssertEqual(idx, 1)
+    }
+
+    func testNilWhenNoCandidates() {
+        XCTAssertNil(VisualObserver.pickWindowIndex([], preferredID: 42))
+        XCTAssertNil(VisualObserver.pickWindowIndex([], preferredID: nil))
+    }
+
+    // MARK: - Duration clamping (visual_timeline.json internal consistency)
+
+    func testClampSegmentsToDuration() {
+        let segs = [
+            VisualTimeline.Segment(start: 1, end: 5, name: "A", confidence: 0.9, source: "vision"),
+            VisualTimeline.Segment(start: 8, end: 12, name: "B", confidence: 0.8, source: "vision"), // end past 10
+            VisualTimeline.Segment(start: 10.5, end: 11, name: "C", confidence: 0.7, source: "vision"), // fully past → dropped
+        ]
+        let out = VisualCapture.clampSegments(segs, to: 10)
+        XCTAssertEqual(out.count, 2)
+        XCTAssertEqual(out[0].end, 5, accuracy: 0.001)
+        XCTAssertEqual(out[1].end, 10, accuracy: 0.001)   // clamped
+        XCTAssertEqual(out[1].start, 8, accuracy: 0.001)  // only the end moves
+        XCTAssertFalse(out.contains { $0.name == "C" })   // dropped
+    }
+
+    func testClampGapsToDuration() {
+        let gaps = [
+            VisualTimeline.Gap(start: 2, end: 4, reason: "minimized"),
+            VisualTimeline.Gap(start: 9, end: 13, reason: "minimized"),   // clamped to 10
+            VisualTimeline.Gap(start: 11, end: 12, reason: "minimized"),  // fully past → dropped
+        ]
+        let out = VisualCapture.clampGaps(gaps, to: 10)
+        XCTAssertEqual(out.count, 2)
+        XCTAssertEqual(out[0].end, 4, accuracy: 0.001)    // inside the duration → untouched
+        XCTAssertEqual(out[1].end, 10, accuracy: 0.001)
+    }
+
+    func testClampPassthroughWhenDurationUnknown() {
+        let segs = [VisualTimeline.Segment(start: 1, end: 99, name: "A", confidence: 1, source: "vision")]
+        XCTAssertEqual(VisualCapture.clampSegments(segs, to: 0), segs)   // no duration → unchanged
+
+        let gaps = [VisualTimeline.Gap(start: 1, end: 99, reason: "minimized")]
+        let out = VisualCapture.clampGaps(gaps, to: 0)                   // same rule for gaps
+        XCTAssertEqual(out.count, 1)
+        XCTAssertEqual(out.first?.end ?? 0, 99, accuracy: 0.001)
+    }
+}