import Foundation
import CoreGraphics

/// Owns the visual side of one recording session: picks the app's adapter, runs a
/// `VisualObserver` over the call window, and on stop writes `visual_timeline.json`
/// and returns the speaker segments for the backend hand-off.
///
/// Strictly best-effort: if there's no adapter for the app, or the window can't be
/// captured, the session simply records audio-only — visuals never block or break
/// the proven audio path. `init?` returns nil when the app has no visual adapter.
@available(macOS 13.0, *)
final class VisualCapture {
    /// Name of the timeline file that `finish` writes into the session folder.
    private static let timelineFileName = "visual_timeline.json"

    /// The detected call app this capture was created for.
    let app: CallDetector.DetectedApp
    /// Adapter resolved for `app`; supplies the sampling rate and adapter version.
    private let adapter: any AppAdapter
    /// Observer that performs the actual window capture.
    private let observer: VisualObserver

    /// Creates a capture for `app`, or returns `nil` when `AdapterRegistry` has no
    /// adapter for it.
    ///
    /// - Parameters:
    ///   - app: The detected call app.
    ///   - bundleID: Forwarded to the `VisualObserver`.
    ///   - windowID: Forwarded to the `VisualObserver`; may be `nil`.
    ///   - t0Host: Forwarded to the `VisualObserver` as its time origin.
    init?(app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?, t0Host: Double) {
        guard let adapter = AdapterRegistry.adapter(for: app) else { return nil }
        self.app = app
        self.adapter = adapter
        self.observer = VisualObserver(bundleID: bundleID,
                                       windowID: windowID,
                                       adapter: adapter,
                                       t0Host: t0Host,
                                       fps: adapter.preferredFPS)
    }

    /// Start window capture. Throws if the window isn't capturable (no window yet,
    /// Screen Recording denied) — the caller catches and falls back to audio-only.
    func start() async throws {
        try await observer.start()
    }

    /// Stop and discard capture without writing anything (used when the session
    /// ends before capture was fully adopted).
    func cancel() async {
        _ = await observer.stop()
    }

    /// Shared clamping rule for segments and gaps: caps `end` at `duration` and
    /// returns the capped value, or `nil` when the span would no longer have a
    /// positive length (`end <= start` after capping).
    private static func clampedEnd(start: Double, end: Double, to duration: Double) -> Double? {
        let capped = min(end, duration)
        return capped > start ? capped : nil
    }

    /// Clamp segment ends to the audio duration; drop any that become empty. Keeps
    /// `visual_timeline.json` internally consistent and never sends the backend a
    /// segment longer than the audio. (`duration <= 0` → passthrough.)
    ///
    /// - Parameters:
    ///   - segs: Segments to clamp.
    ///   - duration: Audio duration to clamp against.
    /// - Returns: The clamped segments, or `segs` unchanged when `duration <= 0`.
    static func clampSegments(_ segs: [VisualTimeline.Segment],
                              to duration: Double) -> [VisualTimeline.Segment] {
        guard duration > 0 else { return segs }
        return segs.compactMap { seg -> VisualTimeline.Segment? in
            guard let end = Self.clampedEnd(start: seg.start, end: seg.end, to: duration) else {
                return nil
            }
            return VisualTimeline.Segment(start: seg.start,
                                          end: end,
                                          name: seg.name,
                                          confidence: seg.confidence,
                                          source: seg.source)
        }
    }

    /// Clamp gap ends to the audio duration; drop any that become empty. Same rule
    /// as `clampSegments`, applied to visual gaps. (`duration <= 0` → passthrough.)
    ///
    /// - Parameters:
    ///   - gaps: Gaps to clamp.
    ///   - duration: Audio duration to clamp against.
    /// - Returns: The clamped gaps, or `gaps` unchanged when `duration <= 0`.
    static func clampGaps(_ gaps: [VisualTimeline.Gap],
                          to duration: Double) -> [VisualTimeline.Gap] {
        guard duration > 0 else { return gaps }
        return gaps.compactMap { gap -> VisualTimeline.Gap? in
            guard let end = Self.clampedEnd(start: gap.start, end: gap.end, to: duration) else {
                return nil
            }
            return VisualTimeline.Gap(start: gap.start, end: end, reason: gap.reason)
        }
    }

    /// Stop capture, fold in the mic-VAD self spans, write `visual_timeline.json`
    /// into the session folder, and return the merged segments for `label-merge`.
    ///
    /// Writing the timeline is best-effort: a failed write is logged and the
    /// segments are still returned.
    ///
    /// - Parameters:
    ///   - selfSpans: Mic-VAD spans handed to the observer before it is stopped.
    ///   - selfName: Name used for the self spans; the matching participant is
    ///     marked `isSelf`.
    ///   - sessionId: Recorded in the timeline.
    ///   - t0Unix: Recorded in the timeline.
    ///   - durationSec: Audio duration; segment and gap ends are clamped to it.
    ///   - folder: Session folder that receives `visual_timeline.json`.
    /// - Returns: The clamped segments.
    func finish(selfSpans: [VADSpan],
                selfName: String,
                sessionId: String,
                t0Unix: Double,
                durationSec: Double,
                folder: URL) async -> [VisualTimeline.Segment] {
        observer.addSelfSpans(selfSpans, selfName: selfName)
        let (rawSegments, rawGaps) = await observer.stop()

        // The observer stops slightly after audio fixes `durationSec`, so a trailing
        // gap/segment can run past it. Clamp ends so the JSON is internally consistent
        // (and we never hand the backend a segment longer than the audio).
        let segments = Self.clampSegments(rawSegments, to: durationSec)
        let gaps = Self.clampGaps(rawGaps, to: durationSec)

        // One participant per distinct speaker name, in sorted order. `isSelf` is
        // `true` for the local user and `nil` (rather than `false`) for everyone else.
        let names = Set(segments.map { $0.name })
        let participants = names.sorted().map {
            VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil)
        }

        let timeline = VisualTimeline(
            sessionId: sessionId,
            app: app.label,
            adapterVersion: adapter.adapterVersion,
            t0Unix: t0Unix,
            durationSec: durationSec,
            fpsSampled: adapter.preferredFPS,
            selfName: selfName,
            participants: participants,
            segments: segments,
            visualGaps: gaps)

        // Best-effort write: visuals must never break the audio path, so a failure
        // is not propagated — but it is logged so a missing timeline can be diagnosed.
        let timelineURL = folder.appendingPathComponent(Self.timelineFileName)
        do {
            try timeline.write(to: timelineURL)
        } catch {
            NSLog("VisualCapture: failed to write %@: %@",
                  timelineURL.path, String(describing: error))
        }

        return segments
    }
}