import Foundation
import ScreenCaptureKit
import CoreMedia
import QuartzCore
import AppKit

/// Window-scoped visual capture: streams the call window's own rendered content
/// at ~`fps`, hands each frame to the app adapter, and **releases it immediately
/// — frames are never written to disk**. Builds the speaker timeline and records
/// `visual_gap`s when the window is minimized (SCK delivers non-live frames).
///
/// Window visibility/focus is NOT required — SCK captures a window even when it's
/// occluded or on another Space; only minimization freezes the backing buffer.
///
/// Threading: `builder`, `gaps` and `gapStart` are only touched on `queue`, which
/// is also the sample-handler queue handed to the stream.
@available(macOS 13.0, *)
final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
    private let bundleID: String
    private let windowID: CGWindowID?
    private let adapter: any AppAdapter
    /// Host-clock origin (`CACurrentMediaTime()` units); all timeline times are
    /// expressed relative to it.
    private let t0Host: Double
    private let fps: Int
    private let queue = DispatchQueue(label: "xyz.ten31.visual")
    private var stream: SCStream?
    private let builder = TimelineBuilder()
    private var gaps: [VisualTimeline.Gap] = []
    /// Start of the currently open gap, or nil when the window is live.
    private var gapStart: Double?

    /// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
    var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?

    /// - Parameters:
    ///   - bundleID: Bundle identifier of the app whose window is captured.
    ///   - windowID: Exact window to prefer; nil means "largest owned window".
    ///   - adapter: Per-app frame analyzer.
    ///   - t0Host: Session origin on the `CACurrentMediaTime()` clock.
    ///   - fps: Target frame rate; clamped to at least 1.
    init(bundleID: String,
         windowID: CGWindowID? = nil,
         adapter: any AppAdapter,
         t0Host: Double,
         fps: Int = 3) {
        self.bundleID = bundleID
        self.windowID = windowID
        self.adapter = adapter
        self.t0Host = t0Host
        self.fps = max(1, fps)
    }

    /// Locates the target window and starts streaming it to `queue`.
    ///
    /// - Throws: An `NSError` (domain `Ten31`, code 2) when the app owns no
    ///   capturable window, or whatever ScreenCaptureKit throws while enumerating
    ///   content, adding the output, or starting capture.
    func start() async throws {
        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
        let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }

        // Prefer the EXACT detected window (e.g. the Meet browser window) by ID; fall
        // back to the largest owned window when no ID was supplied or it's gone.
        guard let idx = Self.pickWindowIndex(candidates.map { ($0.windowID, $0.frame) }, preferredID: windowID),
              candidates.indices.contains(idx) else {
            throw NSError(domain: "Ten31", code: 2,
                          userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
        }
        let window = candidates[idx]

        let filter = SCContentFilter(desktopIndependentWindow: window)
        let config = SCStreamConfiguration()
        config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(fps))
        config.queueDepth = 3
        config.showsCursor = false
        config.pixelFormat = kCVPixelFormatType_32BGRA

        // window.frame is in points; capture at native pixels so OCR can read small
        // initials/names (a half-res Retina capture badly hurts recognition). Use the
        // scale of the display the window is actually on, not always the main screen.
        let scale = Self.backingScale(forWindowFrame: window.frame)
        config.width = max(2, Int(window.frame.width * scale))
        config.height = max(2, Int(window.frame.height * scale))

        let stream = SCStream(filter: filter, configuration: config, delegate: self)
        try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: queue)
        try await stream.startCapture()
        self.stream = stream
    }

    /// Choose which candidate window to capture: the one matching `preferredID` if
    /// present, else the largest by area. Returns the index into `candidates`, or
    /// nil if there are none. Pure/testable — no ScreenCaptureKit types.
    static func pickWindowIndex(_ candidates: [(id: CGWindowID, frame: CGRect)],
                                preferredID: CGWindowID?) -> Int? {
        guard !candidates.isEmpty else { return nil }
        if let preferredID, let i = candidates.firstIndex(where: { $0.id == preferredID }) {
            return i
        }
        return candidates.indices.max(by: {
            candidates[$0].frame.width * candidates[$0].frame.height
                < candidates[$1].frame.width * candidates[$1].frame.height
        })
    }

    /// Backing scale of the display that contains the window's center. SCWindow.frame
    /// is in global display (top-left origin) points; NSScreen is bottom-left, so we
    /// flip the center through the primary screen's height before testing containment.
    /// Falls back to the main screen's scale (or 2) when no screen matches.
    private static func backingScale(forWindowFrame frame: CGRect) -> CGFloat {
        let screens = NSScreen.screens
        guard let primary = screens.first else {
            return NSScreen.main?.backingScaleFactor ?? 2
        }
        let centerAppKit = CGPoint(x: frame.midX, y: primary.frame.maxY - frame.midY)
        let screen = screens.first(where: { $0.frame.contains(centerAppKit) }) ?? NSScreen.main ?? primary
        return screen.backingScaleFactor
    }

    /// Stops capture and returns the finished timeline.
    ///
    /// Any gap still open is closed at the current time, and the builder is
    /// finalized before its segments are read.
    func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
        // Bound stopCapture: an already-errored SCStream can block forever, which
        // would wedge session finalization in `.finishing`. Mirror AudioRecorder.
        if let stream {
            await Self.stopCaptureWithTimeout(stream, seconds: 3)
        }
        stream = nil
        return queue.sync {
            if let gs = gapStart {
                gaps.append(.init(start: gs, end: CACurrentMediaTime() - t0Host, reason: "minimized"))
                gapStart = nil
            }
            builder.finish()
            return (builder.segments, gaps)
        }
    }

    /// Merge mic-VAD self spans into the visual timeline (call before `stop`'s read,
    /// or fold in afterwards in the packager).
    func addSelfSpans(_ spans: [VADSpan], selfName: String) {
        queue.sync { builder.mergeSelfSpans(spans, selfName: selfName) }
    }

    /// Every distinct participant name OCR'd over the session (read on the builder's
    /// queue; safe to call after `stop`).
    func participantNames() -> [String] {
        queue.sync { builder.observedNames }
    }

    // MARK: - SCStreamOutput (on `queue`)

    /// Classifies each delivered frame and either ignores it, opens a gap, or
    /// analyzes it and feeds the observations to the timeline builder.
    func stream(_ stream: SCStream,
                didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
                of type: SCStreamOutputType) {
        guard type == .screen, CMSampleBufferDataIsReady(sampleBuffer) else { return }
        let now = CACurrentMediaTime() - t0Host

        switch frameKind(sampleBuffer) {
        case .idle:
            // Window is live but static (no pixel change) — no new info, not a gap.
            return
        case .gap:
            // Minimized/blanked: the backing buffer is frozen. Open a gap once and
            // close any open speaker segments so none is carried across it.
            if gapStart == nil {
                gapStart = now
                builder.closeOpenSegments(at: now)
            }
            return
        case .live:
            if let gs = gapStart {
                gaps.append(.init(start: gs, end: now, reason: "minimized"))
                gapStart = nil
            }
            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
            let observations = adapter.analyze(frame: pixelBuffer, at: now) // frame released after this scope
            builder.ingest(observations, at: now)
            onObservations?(observations, now)
        }
    }

    // MARK: - SCStreamDelegate

    /// Called when the stream stops on its own. The timeline is left as-is (it is
    /// finalized by `stop()`), but the error is logged rather than dropped so a
    /// truncated visual timeline can be diagnosed.
    func stream(_ stream: SCStream, didStopWithError error: Error) {
        NSLog("VisualObserver: capture stream stopped with error: %@", error.localizedDescription)
    }

    // MARK: - Bounded stop

    /// Thread-safe one-shot flag: `claim()` returns true for exactly one caller.
    private final class ResumeOnce: @unchecked Sendable {
        private let lock = NSLock()
        private var claimed = false

        func claim() -> Bool {
            lock.lock()
            defer { lock.unlock() }
            if claimed { return false }
            claimed = true
            return true
        }
    }

    /// Proceed as soon as `stopCapture()` returns OR the timeout fires, so a wedged
    /// stream can't block forever.
    ///
    /// This deliberately does NOT use a task group: a task group only returns once
    /// every child task has finished, and cancellation is cooperative, so a
    /// `stopCapture()` that never returns would still hold the caller. Instead two
    /// unstructured tasks race to resume a single continuation; the loser's call is
    /// ignored via `ResumeOnce`. If the timeout wins, the stop task is left running
    /// in the background and simply finishes (or not) on its own.
    private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async {
        let gate = ResumeOnce()
        let timeoutNanos = UInt64(max(0, seconds) * 1_000_000_000)

        await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
            let finish: @Sendable () -> Void = {
                if gate.claim() { continuation.resume() }
            }
            let timer = Task {
                try? await Task.sleep(nanoseconds: timeoutNanos)
                finish()
            }
            Task {
                try? await stream.stopCapture()
                timer.cancel() // don't keep the timer sleeping once stop has returned
                finish()
            }
        }
    }

    // MARK: - Frame classification

    private enum FrameKind { case live, idle, gap }

    /// SCK delivers `.complete` only when content changes, `.idle` for a static
    /// (but visible) window, and `.blank`/`.suspended`/`.stopped` when frozen.
    /// `.started` is the stream's start-up frame status and says nothing about
    /// the window being minimized, so it is ignored like an idle frame rather than
    /// opening a gap. A buffer with no readable status is treated as live.
    private func frameKind(_ sampleBuffer: CMSampleBuffer) -> FrameKind {
        guard let attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, createIfNecessary: false)
                as? [[SCStreamFrameInfo: Any]],
              let raw = attachments.first?[.status] as? Int,
              let status = SCFrameStatus(rawValue: raw) else {
            return .live
        }
        switch status {
        case .complete:
            return .live
        case .idle, .started:
            return .idle
        case .blank, .suspended, .stopped:
            return .gap
        @unknown default:
            // Unknown future status: keep the previous conservative behavior.
            return .gap
        }
    }
}