Wire visual capture into the recording lifecycle (failure-isolated)

Visual capture now runs alongside audio: on call start the session picks the
app's adapter, captures the call window on the SAME monotonic clock as the audio
(AudioRecorder.sharedT0Host), and on stop writes visual_timeline.json and hands
the backend the visual segments with mic-VAD self-spans merged. Any visual
failure (no adapter, no window, Screen Recording denied) leaves the session
recording audio-only — the proven path is never blocked or broken.

- CallDetector now emits DetectedCall{app, bundleID, windowID}: the exact
  CGWindowID of the matched Meet browser window (native apps → nil → largest).
- VisualCapture wraps VisualObserver + AdapterRegistry, writes visual_timeline.json.
- AudioRecorder.sharedT0Host() exposes the shared t0 for frame alignment.

Hardened per a 3-lens adversarial review (concurrency / failure-isolation /
data-flow), all 6 confirmed findings fixed:
- P0 (critical): startVisual could adopt a stale capture into a DIFFERENT session
  (cross-session SCStream leak + visual_timeline.json written to the wrong
  folder). Now gated on session identity — generation + recorder ===, still
  .recording — with fail-closed adoption; otherwise the stream is cancelled.
- P1: observer captured the browser's largest window, not the detected Meet
  window. Now targets the exact CGWindowID (pickWindowIndex, unit-tested),
  largest-area only as fallback.
- P2: a startVisual orphaned by a concurrent stop could leak a stream on quit.
  inFlightVisual is registered before the await and drained in prepareForTermination.
- P3: trailing visual gap/segment ends could exceed duration_sec. Clamped in
  VisualCapture (clampSegments/clampGaps, unit-tested).
- P4: capture pixel size used NSScreen.main scale; now uses the scale of the
  display actually hosting the window (OCR clarity on secondary displays).
- VisualObserver.stop() bounds stopCapture() with a 3s timeout (mirrors audio) so
  a wedged stream can't hang finalization.

25/25 XCTest pass. Live validation on real calls still pending.
This commit is contained in:
Grant Gilliam
2026-06-06 10:18:52 -05:00
parent c347acbd97
commit 880b56e426
6 changed files with 348 additions and 48 deletions
@@ -88,6 +88,11 @@ final class AudioRecorder: NSObject, SCStreamDelegate, SCStreamOutput {
}
}
/// Returns the shared monotonic t0 (`CACurrentMediaTime` base) that was captured
/// at `start()`, so visual capture can stamp its frames on the exact same clock
/// as the audio. The value is read under `ioQueue.sync`. Valid only after
/// `start()` has returned.
func sharedT0Host() -> Double {
    return ioQueue.sync { self.t0Host }
}
func stop() async -> RecordingResult {
// Stop the mic FIRST — always succeeds and halts mic capture immediately.
engine?.inputNode.removeTap(onBus: 0)
+45 -27
View File
@@ -26,6 +26,16 @@ final class CallDetector: ObservableObject {
}
}
/// A detected call plus what to capture for visuals: the bundle ID of the owner
/// (the native app for Zoom/Teams/Signal, or the *browser* hosting the Meet tab)
/// and — for Meet — the exact `CGWindowID` of the matched call window, so the
/// observer captures that window instead of guessing the browser's largest one.
struct DetectedCall: Equatable {
// Which calling app was detected.
let app: DetectedApp
// Bundle identifier of the process that owns the call window.
let bundleID: String
// Exact window to capture; nil for native apps (the largest owned window is used).
let windowID: CGWindowID?
}
enum Status: Equatable {
case disabled
case listening
@@ -34,7 +44,7 @@ final class CallDetector: ObservableObject {
@Published private(set) var status: Status = .disabled
var onCallStart: ((DetectedApp) -> Void)?
var onCallStart: ((DetectedCall) -> Void)?
var onCallEnd: (() -> Void)?
private let mic = MicActivityMonitor()
@@ -42,7 +52,7 @@ final class CallDetector: ObservableObject {
private var openTimer: Timer?
private var closeTimer: Timer?
private var inCall = false
private var currentApp: DetectedApp?
private var currentCall: DetectedCall?
private var enabled = false
private let openDelay: TimeInterval = 2.0
@@ -79,7 +89,7 @@ final class CallDetector: ObservableObject {
pollTimer?.invalidate(); pollTimer = nil
cancelOpen(); cancelClose()
inCall = false
currentApp = nil
currentCall = nil
status = .disabled
}
@@ -92,8 +102,8 @@ final class CallDetector: ObservableObject {
if let candidate {
cancelClose()
if inCall {
currentApp = candidate
status = .inCall(candidate)
currentCall = candidate
status = .inCall(candidate.app)
} else if openTimer == nil {
openTimer = Timer.scheduledTimer(withTimeInterval: openDelay, repeats: false) { [weak self] _ in
Task { @MainActor in self?.fireOpen() }
@@ -112,18 +122,18 @@ final class CallDetector: ObservableObject {
private func fireOpen() {
openTimer = nil
// Re-resolve the app at fire time (the debounce window may have changed it).
guard enabled, mic.isRunning, let app = detectApp(), !inCall else { return }
guard enabled, mic.isRunning, let call = detectApp(), !inCall else { return }
inCall = true
currentApp = app
status = .inCall(app)
onCallStart?(app)
currentCall = call
status = .inCall(call.app)
onCallStart?(call)
}
private func fireClose() {
closeTimer = nil
guard enabled, inCall else { return }
inCall = false
currentApp = nil
currentCall = nil
status = .listening
onCallEnd?()
}
@@ -137,7 +147,7 @@ final class CallDetector: ObservableObject {
/// On macOS 14+ we attribute mic usage per-process (robust start AND stop,
/// works for Signal/Zoom/Teams/Meet, ignores our own recording). On macOS 13
/// we fall back to the per-app call-window heuristic.
private func detectApp() -> DetectedApp? {
private func detectApp() -> DetectedCall? {
if #available(macOS 14.0, *) {
return detectViaMicAttribution()
}
@@ -145,7 +155,7 @@ final class CallDetector: ObservableObject {
}
@available(macOS 14.0, *)
private func detectViaMicAttribution() -> DetectedApp? {
private func detectViaMicAttribution() -> DetectedCall? {
let micPIDs = AudioInputProcesses.micUsingPIDs()
guard !micPIDs.isEmpty else { return nil }
let selfPID = NSRunningApplication.current.processIdentifier
@@ -154,39 +164,42 @@ final class CallDetector: ObservableObject {
let pid = app.processIdentifier
guard pid != selfPID, micPIDs.contains(pid), let id = app.bundleIdentifier else { continue }
if let native = Self.nativeApps.first(where: { $0.id == id }) {
return native.app // Signal/Zoom/Teams using the mic = in a call
return DetectedCall(app: native.app, bundleID: id, windowID: nil) // native: capture largest owned window
}
// A browser using the mic + a Meet window = a Meet call. The mic state
// gives reliable start/stop; the window check keeps non-Meet browser
// mic use (other web apps) from being mislabeled as a Meet recording.
if Self.browserIDs.contains(id), pidHasMeetWindow(pid) {
return .meet
// Capture that exact browser window (by ID), not just the browser.
if Self.browserIDs.contains(id), let wid = meetWindowID(pid) {
return DetectedCall(app: .meet, bundleID: id, windowID: wid)
}
}
return nil
}
private func pidHasMeetWindow(_ pid: pid_t) -> Bool {
/// The `CGWindowID` of this PID's Google Meet call window (title "Meet - "),
/// or nil if none — also serves as the "is this a Meet call?" check.
private func meetWindowID(_ pid: pid_t) -> CGWindowID? {
guard let info = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]]
else { return false }
else { return nil }
for w in info {
guard let wpid = w[kCGWindowOwnerPID as String] as? pid_t, wpid == pid,
let title = w[kCGWindowName as String] as? String else { continue }
if Self.looksLikeMeet(title) { return true }
let title = w[kCGWindowName as String] as? String, Self.looksLikeMeet(title) else { continue }
return w[kCGWindowNumber as String] as? CGWindowID
}
return false
return nil
}
/// macOS 13 fallback: detect by the presence of a call WINDOW per app.
private func detectViaWindowTitle() -> DetectedApp? {
var pidToApp: [pid_t: DetectedApp] = [:]
var browserPIDs = Set<pid_t>()
private func detectViaWindowTitle() -> DetectedCall? {
var pidToApp: [pid_t: (app: DetectedApp, id: String)] = [:]
var browserPIDs: [pid_t: String] = [:]
for app in NSWorkspace.shared.runningApplications {
guard let id = app.bundleIdentifier else { continue }
if let native = Self.nativeApps.first(where: { $0.id == id }) {
pidToApp[app.processIdentifier] = native.app
pidToApp[app.processIdentifier] = (native.app, id)
} else if Self.browserIDs.contains(id) {
browserPIDs.insert(app.processIdentifier)
browserPIDs[app.processIdentifier] = id
}
}
guard !pidToApp.isEmpty || !browserPIDs.isEmpty else { return nil }
@@ -197,8 +210,13 @@ final class CallDetector: ObservableObject {
guard let pid = info[kCGWindowOwnerPID as String] as? pid_t,
let title = info[kCGWindowName as String] as? String,
!title.isEmpty else { continue }
if browserPIDs.contains(pid), Self.looksLikeMeet(title) { return .meet }
if let app = pidToApp[pid], Self.isCallWindow(app, title) { return app }
if let id = browserPIDs[pid], Self.looksLikeMeet(title) {
return DetectedCall(app: .meet, bundleID: id,
windowID: info[kCGWindowNumber as String] as? CGWindowID)
}
if let native = pidToApp[pid], Self.isCallWindow(native.app, title) {
return DetectedCall(app: native.app, bundleID: native.id, windowID: nil)
}
}
return nil
}
@@ -62,11 +62,20 @@ final class SessionController: ObservableObject {
let sessionId: String
let app: String
let mixedURL: URL
let selfSpans: [VADSpan]
let timeline: [VisualTimeline.Segment]
}
private var lastProcess: ProcessInputs?
private var processTask: Task<Void, Never>?
private var recorder: AudioRecorder?
/// Visual capture for the current session (nil for manual recordings, apps with
/// no adapter, or when the window can't be captured — those record audio-only).
private var visualCapture: VisualCapture?
/// A visual capture whose `start()` is in flight (registered before the await),
/// so `prepareForTermination` can tear it down if its start-Task is orphaned.
private var inFlightVisual: VisualCapture?
/// App + capture target to start visual capture for, set at `start()`. `windowID`
/// pins the exact detected window (e.g. the Meet browser window); nil → largest.
private var pendingCapture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)?
private var currentFolder: URL?
private var startTime: Date?
private var timer: Timer?
@@ -86,7 +95,7 @@ final class SessionController: ObservableObject {
fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json"))
SessionController.shared = self
detector.onCallStart = { [weak self] app in self?.handleCallStart(app) }
detector.onCallStart = { [weak self] call in self?.handleCallStart(call) }
detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
detector.$status
.sink { [weak self] status in self?.detectionStatus = status }
@@ -124,10 +133,11 @@ final class SessionController: ObservableObject {
// MARK: - Auto-detection
private func handleCallStart(_ app: CallDetector.DetectedApp) {
private func handleCallStart(_ call: CallDetector.DetectedCall) {
guard settings.autoRecordOnDetection else { return }
switch state {
case .idle, .error: start(label: app.label, auto: true)
case .idle, .error:
start(label: call.app.label, auto: true, capture: (call.app, call.bundleID, call.windowID))
case .starting, .recording, .finishing: break // don't disturb an active session
}
}
@@ -156,7 +166,8 @@ final class SessionController: ObservableObject {
// MARK: - Start / Stop
private func start(label: String = "manual", auto: Bool = false) {
private func start(label: String = "manual", auto: Bool = false,
capture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)? = nil) {
let folder: URL
do {
folder = try makeSessionFolder(label: label)
@@ -168,6 +179,7 @@ final class SessionController: ObservableObject {
currentLabel = label
autoStarted = auto
pendingAutoStop = false
pendingCapture = capture
let recorder = AudioRecorder(
micURL: folder.appendingPathComponent("mic.wav"),
systemURL: folder.appendingPathComponent("system.wav"),
@@ -177,6 +189,7 @@ final class SessionController: ObservableObject {
state = .starting
lifecycleGeneration += 1
let myGen = lifecycleGeneration
lifecycleTask = Task {
do {
try await recorder.start() // self-tears-down if it throws
@@ -187,7 +200,12 @@ final class SessionController: ObservableObject {
if self.pendingAutoStop {
self.pendingAutoStop = false
self.stop()
return
}
// Attach visual capture on the SAME clock (best-effort, audio-only on failure).
// Pass this session's generation + recorder so a slow start can't
// adopt itself into a different session that began meanwhile.
await self.startVisual(t0Host: recorder.sharedT0Host(), generation: myGen, recorder: recorder)
} catch {
self.handleStartFailure(error)
}
@@ -213,18 +231,71 @@ final class SessionController: ObservableObject {
}
}
// MARK: - Visual capture
/// Best-effort: start window capture for the detected app on the audio clock.
/// Any failure (no adapter, no window, Screen Recording denied) leaves
/// `visualCapture` nil and the session records audio-only.
///
/// `generation`/`recorder` identify the session that launched this; because
/// `vc.start()` is a slow async call, a stop + a fresh start can complete during
/// it. We adopt the stream ONLY back into the same session — otherwise we cancel
/// it, so a stale capture can never attach to (or leak into) a different session.
///
/// - Parameters:
///   - t0Host: The audio recorder's shared monotonic t0, used to timestamp frames.
///   - generation: `lifecycleGeneration` as observed by the session that launched this.
///   - recorder: The launching session's recorder, compared by identity before adoption.
private func startVisual(t0Host: Double, generation: Int, recorder: AudioRecorder) async {
    guard let capture = pendingCapture else { return }   // manual recording → audio-only
    pendingCapture = nil
    guard let vc = VisualCapture(app: capture.app, bundleID: capture.bundleID,
                                 windowID: capture.windowID, t0Host: t0Host) else { return }
    // Register the live capture before the await so a quit (prepareForTermination)
    // can drain it even if this start-Task gets orphaned by a concurrent stop.
    inFlightVisual = vc
    defer { if inFlightVisual === vc { inFlightVisual = nil } }
    do {
        try await vc.start()
    } catch {
        await vc.cancel()   // tear down any partial stream; never break recording
        return
    }
    // Adopt only if THIS session still owns the slot (same generation, same
    // recorder, still recording); otherwise discard rather than leak/misattach.
    guard generation == lifecycleGeneration, self.recorder === recorder,
          case .recording = state else {
        await vc.cancel()
        return
    }
    // Adopt synchronously, with no suspension point between the identity check
    // above and the assignment. Cancelling a displaced capture first would
    // suspend here; a stop could then interleave and this capture would be
    // adopted into a session that has already ended.
    let displaced = visualCapture
    visualCapture = vc
    // Once adopted, the capture is owned by `visualCapture`; drop the in-flight
    // registration now so it is never torn down through both paths.
    if inFlightVisual === vc { inFlightVisual = nil }
    if let displaced { await displaced.cancel() }   // fail-closed
}
/// Tears down visual capture (when present) and produces the timeline handed to
/// the backend. If a capture is attached and the session folder is known, the
/// capture is finished (which writes `visual_timeline.json`) and its merged
/// segments are returned. Otherwise any attached capture is cancelled and the
/// mic-VAD self spans alone form the timeline.
private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?) async -> [VisualTimeline.Segment] {
    let selfName = settings.selfName
    guard let capture = visualCapture else {
        // No visual capture ran: self spans only.
        return TranscriptPipeline.timeline(fromSelfSpans: result.selfSpans, selfName: selfName)
    }
    if let folder {
        visualCapture = nil
        let merged = await capture.finish(
            selfSpans: result.selfSpans, selfName: selfName,
            sessionId: folder.lastPathComponent, t0Unix: result.t0Unix,
            durationSec: result.duration, folder: folder)
        return merged
    }
    // A capture exists but there is nowhere to write its timeline: discard it.
    await capture.cancel()
    visualCapture = nil
    return TranscriptPipeline.timeline(fromSelfSpans: result.selfSpans, selfName: selfName)
}
private func stop() {
guard let recorder else { return }
state = .finishing
stopTimer()
let folder = currentFolder
lifecycleGeneration += 1
lifecycleTask = Task {
let result = await recorder.stop()
self.finish(result)
let timeline = await self.stopVisualAndTimeline(result, folder: folder)
self.finish(result, timeline: timeline)
}
}
private func finish(_ result: RecordingResult) {
private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment]) {
recorder = nil
micLevel = 0
systemLevel = 0
@@ -237,7 +308,7 @@ final class SessionController: ObservableObject {
duration: result.duration, selfSpanCount: result.selfSpans.count)
lastProcess = ProcessInputs(
folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
mixedURL: result.mixedURL, selfSpans: result.selfSpans)
mixedURL: result.mixedURL, timeline: timeline)
}
let autoSend = settings.autoSendOnStop
currentFolder = nil
@@ -250,10 +321,10 @@ final class SessionController: ObservableObject {
// MARK: - Backend transcription
/// Send the last finished session to the backend `speakers.json`. Uses the
/// mic-VAD self spans as the timeline for now; visual segments (Phase 34) get
/// merged in once the adapters land. Safe to call manually ("Send to backend")
/// or automatically on stop.
/// Send the last finished session to the backend `speakers.json`. The
/// timeline is the session's visual segments (with mic-VAD self spans merged)
/// when visual capture ran, or the self spans alone otherwise. Safe to call
/// manually ("Send to backend") or automatically on stop.
func processLastSession() {
guard let inputs = lastProcess else { return }
if case .processing = transcriptStatus { return }
@@ -266,8 +337,7 @@ final class SessionController: ObservableObject {
baseURL: settings.backendBaseURL,
skipTLS: settings.skipTLSVerification,
voiceprints: voiceprints)
let timeline = TranscriptPipeline.timeline(
fromSelfSpans: inputs.selfSpans, selfName: settings.selfName)
let timeline = inputs.timeline
do {
let speakers = try await pipeline.process(
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
@@ -286,6 +356,9 @@ final class SessionController: ObservableObject {
private func fail(_ message: String) {
recorder = nil
visualCapture = nil // recorder.start() failed before visual started; nothing running
inFlightVisual = nil
pendingCapture = nil
currentFolder = nil
autoStarted = false
pendingAutoStop = false
@@ -312,11 +385,20 @@ final class SessionController: ObservableObject {
if state == .recording, let recorder {
state = .finishing
stopTimer()
finish(await recorder.stop())
let folder = currentFolder
let result = await recorder.stop()
let timeline = await stopVisualAndTimeline(result, folder: folder)
finish(result, timeline: timeline)
} else if lifecycleGeneration == gen {
break // settled: no new transition was spawned
}
}
// A visual start-Task orphaned by a concurrent stop may still hold a live
// stream that nothing else will tear down before exit — drain it here.
if let vc = inFlightVisual {
inFlightVisual = nil
await vc.cancel()
}
}
// MARK: - Timer
@@ -0,0 +1,83 @@
import Foundation
import CoreGraphics
/// The visual half of a single recording session. It resolves the adapter for
/// the detected app, drives a `VisualObserver` over the call window, and when the
/// session ends writes `visual_timeline.json` and hands back the speaker segments
/// for the backend.
///
/// Everything here is best-effort: with no adapter for the app, or a window that
/// can't be captured, the session simply records audio-only — visuals never block
/// or break the proven audio path. `init?` returns nil when the app has no visual
/// adapter.
@available(macOS 13.0, *)
final class VisualCapture {
    let app: CallDetector.DetectedApp
    private let adapter: any AppAdapter
    private let observer: VisualObserver

    /// Fails (returns nil) when `AdapterRegistry` has no adapter for `app`.
    init?(app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?, t0Host: Double) {
        guard let resolved = AdapterRegistry.adapter(for: app) else { return nil }
        self.app = app
        adapter = resolved
        observer = VisualObserver(bundleID: bundleID, windowID: windowID, adapter: resolved,
                                  t0Host: t0Host, fps: resolved.preferredFPS)
    }

    /// Begin capturing the window. Throws if the window isn't capturable (no window
    /// yet, Screen Recording denied) — the caller catches and falls back to audio-only.
    func start() async throws {
        try await observer.start()
    }

    /// Stop the observer and throw away whatever it collected; nothing is written
    /// (used when the session ends before capture was fully adopted).
    func cancel() async {
        _ = await observer.stop()
    }

    /// Trim each segment's end to the audio duration and drop segments that end
    /// up empty, so `visual_timeline.json` stays internally consistent and the
    /// backend never receives a segment longer than the audio.
    /// (`duration <= 0` → passthrough.)
    static func clampSegments(_ segs: [VisualTimeline.Segment], to duration: Double) -> [VisualTimeline.Segment] {
        guard duration > 0 else { return segs }
        var kept: [VisualTimeline.Segment] = []
        for segment in segs {
            let clampedEnd = min(segment.end, duration)
            if clampedEnd > segment.start {
                kept.append(VisualTimeline.Segment(start: segment.start, end: clampedEnd,
                                                   name: segment.name,
                                                   confidence: segment.confidence,
                                                   source: segment.source))
            }
        }
        return kept
    }

    /// Same trimming as `clampSegments`, applied to visual gaps: ends are capped
    /// at `duration` and gaps that become empty are dropped.
    /// (`duration <= 0` → passthrough.)
    static func clampGaps(_ gaps: [VisualTimeline.Gap], to duration: Double) -> [VisualTimeline.Gap] {
        guard duration > 0 else { return gaps }
        var kept: [VisualTimeline.Gap] = []
        for gap in gaps {
            let clampedEnd = min(gap.end, duration)
            if clampedEnd > gap.start {
                kept.append(VisualTimeline.Gap(start: gap.start, end: clampedEnd, reason: gap.reason))
            }
        }
        return kept
    }

    /// Stop capture, fold in the mic-VAD self spans, write `visual_timeline.json`
    /// into the session folder, and return the merged segments for `label-merge`.
    /// A failed write is ignored (best-effort); the segments are returned regardless.
    func finish(selfSpans: [VADSpan], selfName: String,
                sessionId: String, t0Unix: Double, durationSec: Double,
                folder: URL) async -> [VisualTimeline.Segment] {
        observer.addSelfSpans(selfSpans, selfName: selfName)
        let captured = await observer.stop()

        // The observer stops slightly after audio fixes `durationSec`, so a trailing
        // gap/segment can run past it. Clamp the ends so the JSON is internally
        // consistent (and the backend never gets a segment longer than the audio).
        let segments = Self.clampSegments(captured.segments, to: durationSec)
        let gaps = Self.clampGaps(captured.gaps, to: durationSec)

        // One participant per distinct speaker name, in sorted order.
        let participants = Set(segments.map { $0.name }).sorted().map { name in
            VisualTimeline.Participant(name: name, isSelf: name == selfName ? true : nil, aliases: nil)
        }

        let timeline = VisualTimeline(
            sessionId: sessionId, app: app.label, adapterVersion: adapter.adapterVersion,
            t0Unix: t0Unix, durationSec: durationSec, fpsSampled: adapter.preferredFPS,
            selfName: selfName, participants: participants, segments: segments, visualGaps: gaps)
        let destination = folder.appendingPathComponent("visual_timeline.json")
        try? timeline.write(to: destination)
        return segments
    }
}
+51 -6
View File
@@ -14,6 +14,7 @@ import AppKit
@available(macOS 13.0, *)
final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
private let bundleID: String
private let windowID: CGWindowID?
private let adapter: any AppAdapter
private let t0Host: Double
private let fps: Int
@@ -27,8 +28,9 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
/// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?
init(bundleID: String, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
init(bundleID: String, windowID: CGWindowID? = nil, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
self.bundleID = bundleID
self.windowID = windowID
self.adapter = adapter
self.t0Host = t0Host
self.fps = max(1, fps)
@@ -36,12 +38,15 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
func start() async throws {
let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
// The call window: the largest window owned by the target app.
let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }
guard let window = candidates.max(by: { $0.frame.width * $0.frame.height < $1.frame.width * $1.frame.height }) else {
// Prefer the EXACT detected window (e.g. the Meet browser window) by ID; fall
// back to the largest owned window when no ID was supplied or it's gone.
guard let idx = Self.pickWindowIndex(candidates.map { ($0.windowID, $0.frame) }, preferredID: windowID),
candidates.indices.contains(idx) else {
throw NSError(domain: "Ten31", code: 2,
userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
}
let window = candidates[idx]
let filter = SCContentFilter(desktopIndependentWindow: window)
let config = SCStreamConfiguration()
@@ -50,8 +55,9 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
config.showsCursor = false
config.pixelFormat = kCVPixelFormatType_32BGRA
// window.frame is in points; capture at native pixels so OCR can read small
// initials/names (a half-res Retina capture badly hurts recognition).
let scale = NSScreen.main?.backingScaleFactor ?? 2
// initials/names (a half-res Retina capture badly hurts recognition). Use the
// scale of the display the window is actually on, not always the main screen.
let scale = Self.backingScale(forWindowFrame: window.frame)
config.width = max(2, Int(window.frame.width * scale))
config.height = max(2, Int(window.frame.height * scale))
@@ -61,8 +67,36 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
self.stream = stream
}
/// Decides which candidate window gets captured. A candidate whose `id` equals
/// `preferredID` wins outright; with no preferred ID, or none matching, the
/// candidate with the largest area is chosen (the first one on ties). Returns
/// the index into `candidates`, or nil when the list is empty.
/// Pure/testable — no ScreenCaptureKit types.
static func pickWindowIndex(_ candidates: [(id: CGWindowID, frame: CGRect)],
                            preferredID: CGWindowID?) -> Int? {
    if candidates.isEmpty { return nil }

    if let preferredID {
        for (index, candidate) in candidates.enumerated() where candidate.id == preferredID {
            return index
        }
    }

    // Fallback: scan for the largest area, keeping the earliest on ties.
    var bestIndex = candidates.startIndex
    var bestArea = candidates[bestIndex].frame.width * candidates[bestIndex].frame.height
    for index in candidates.indices.dropFirst() {
        let area = candidates[index].frame.width * candidates[index].frame.height
        if bestArea < area {
            bestIndex = index
            bestArea = area
        }
    }
    return bestIndex
}
/// Backing scale factor of the display containing the window's center point.
/// `SCWindow.frame` is in global display (top-left origin) points while `NSScreen`
/// frames are bottom-left, so the center's y is flipped through the primary
/// screen's height before the containment test. Falls back to the main screen
/// (then the primary) when no screen contains the point, and to 2 when there
/// are no screens and no main screen.
private static func backingScale(forWindowFrame frame: CGRect) -> CGFloat {
    let displays = NSScreen.screens
    guard let primary = displays.first else {
        return NSScreen.main?.backingScaleFactor ?? 2
    }
    let flippedCenter = CGPoint(x: frame.midX, y: primary.frame.maxY - frame.midY)
    if let host = displays.first(where: { $0.frame.contains(flippedCenter) }) {
        return host.backingScaleFactor
    }
    return (NSScreen.main ?? primary).backingScaleFactor
}
func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
if let stream { try? await stream.stopCapture() }
// Bound stopCapture: an already-errored SCStream can block forever, which
// would wedge session finalization in `.finishing`. Mirror AudioRecorder.
if let stream { await Self.stopCaptureWithTimeout(stream, seconds: 3) }
stream = nil
return queue.sync {
if let gs = gapStart {
@@ -113,6 +147,17 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
func stream(_ stream: SCStream, didStopWithError error: Error) {}
/// Proceed as soon as `stopCapture()` completes OR the timeout fires, whichever
/// comes first, so a wedged stream can't block forever.
///
/// This deliberately does not use a task group: a task group does not return
/// until every child task has finished, and `stopCapture()` does not react to
/// cancellation, so a wedged stop would still hold the caller past the timeout.
/// Instead the stop's completion handler races a timer, and whichever fires
/// first resumes the caller exactly once. A stop that never completes is simply
/// abandoned.
///
/// - Parameters:
///   - stream: The stream to stop.
///   - seconds: Maximum time to wait for the stop to complete.
private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async {
    /// Resumes the wrapped continuation at most once, from any thread.
    final class ResumeOnce: @unchecked Sendable {
        private let lock = NSLock()
        private var continuation: CheckedContinuation<Void, Never>?

        init(_ continuation: CheckedContinuation<Void, Never>) {
            self.continuation = continuation
        }

        func fire() {
            lock.lock()
            let pending = continuation
            continuation = nil
            lock.unlock()
            pending?.resume()
        }
    }

    await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
        let once = ResumeOnce(continuation)
        // Errors from the stop are intentionally ignored (best-effort teardown).
        stream.stopCapture { _ in once.fire() }
        DispatchQueue.global().asyncAfter(deadline: .now() + seconds) { once.fire() }
    }
}
private enum FrameKind { case live, idle, gap }
/// SCK delivers `.complete` only when content changes, `.idle` for a static
@@ -0,0 +1,67 @@
import XCTest
import CoreGraphics
@testable import Ten31Transcripts
/// Covers two pure helpers. Window selection: the exact detected window (e.g. the
/// Meet browser window) is preferred by ID, with the largest owned window as the
/// fallback — the fix for the "captures the wrong browser window" data-flow bug.
/// Duration clamping: segment/gap ends never exceed the audio duration.
final class VisualObserverTests: XCTestCase {
    /// Builds a candidate tuple with the given window ID and size at the origin.
    private func window(_ id: CGWindowID, _ width: CGFloat, _ height: CGFloat) -> (id: CGWindowID, frame: CGRect) {
        return (id, CGRect(x: 0, y: 0, width: width, height: height))
    }

    func testPrefersMatchingWindowIDOverLargest() {
        // The Meet window (id 42) is NOT the largest — it must still be chosen by ID.
        let windows = [window(7, 1600, 1000), window(42, 800, 600), window(9, 1200, 900)]
        let picked = VisualObserver.pickWindowIndex(windows, preferredID: 42)
        XCTAssertEqual(picked, 1)
    }

    func testFallsBackToLargestWhenNoPreferredID() {
        let windows = [window(7, 800, 600), window(9, 1600, 1000), window(11, 1200, 900)]
        let picked = VisualObserver.pickWindowIndex(windows, preferredID: nil)
        XCTAssertEqual(picked, 1) // the 1600x1000 window
    }

    func testFallsBackToLargestWhenPreferredIDMissing() {
        let windows = [window(7, 800, 600), window(9, 1600, 1000)]
        let picked = VisualObserver.pickWindowIndex(windows, preferredID: 999) // gone
        XCTAssertEqual(picked, 1)
    }

    func testNilWhenNoCandidates() {
        XCTAssertNil(VisualObserver.pickWindowIndex([], preferredID: 42))
        XCTAssertNil(VisualObserver.pickWindowIndex([], preferredID: nil))
    }

    // MARK: - Duration clamping (visual_timeline.json internal consistency)

    func testClampSegmentsToDuration() {
        let input = [
            VisualTimeline.Segment(start: 1, end: 5, name: "A", confidence: 0.9, source: "vision"),
            VisualTimeline.Segment(start: 8, end: 12, name: "B", confidence: 0.8, source: "vision"), // end past 10
            VisualTimeline.Segment(start: 10.5, end: 11, name: "C", confidence: 0.7, source: "vision"), // fully past → dropped
        ]
        let clamped = VisualCapture.clampSegments(input, to: 10)
        XCTAssertEqual(clamped.count, 2)
        XCTAssertEqual(clamped[0].end, 5, accuracy: 0.001)
        XCTAssertEqual(clamped[1].end, 10, accuracy: 0.001) // clamped
        XCTAssertFalse(clamped.contains { $0.name == "C" }) // dropped
    }

    func testClampGapsToDuration() {
        let input = [
            VisualTimeline.Gap(start: 2, end: 4, reason: "minimized"),
            VisualTimeline.Gap(start: 9, end: 13, reason: "minimized"), // clamped to 10
        ]
        let clamped = VisualCapture.clampGaps(input, to: 10)
        XCTAssertEqual(clamped.count, 2)
        XCTAssertEqual(clamped[1].end, 10, accuracy: 0.001)
    }

    func testClampPassthroughWhenDurationUnknown() {
        let input = [VisualTimeline.Segment(start: 1, end: 99, name: "A", confidence: 1, source: "vision")]
        XCTAssertEqual(VisualCapture.clampSegments(input, to: 0), input) // no duration → unchanged
    }
}