diff --git a/Ten31Transcripts/Audio/AudioRecorder.swift b/Ten31Transcripts/Audio/AudioRecorder.swift index e1a7490..453c25e 100644 --- a/Ten31Transcripts/Audio/AudioRecorder.swift +++ b/Ten31Transcripts/Audio/AudioRecorder.swift @@ -88,6 +88,11 @@ final class AudioRecorder: NSObject, SCStreamDelegate, SCStreamOutput { } } + /// The shared monotonic t0 (`CACurrentMediaTime` base) captured at `start()`, + /// so visual capture can timestamp frames against the exact same clock as the + /// audio. Valid only after `start()` has returned. + func sharedT0Host() -> Double { ioQueue.sync { t0Host } } + func stop() async -> RecordingResult { // Stop the mic FIRST — always succeeds and halts mic capture immediately. engine?.inputNode.removeTap(onBus: 0) diff --git a/Ten31Transcripts/Detection/CallDetector.swift b/Ten31Transcripts/Detection/CallDetector.swift index 7cc8b34..abf10d0 100644 --- a/Ten31Transcripts/Detection/CallDetector.swift +++ b/Ten31Transcripts/Detection/CallDetector.swift @@ -26,6 +26,16 @@ final class CallDetector: ObservableObject { } } + /// A detected call plus what to capture for visuals: the bundle ID of the owner + /// (the native app for Zoom/Teams/Signal, or the *browser* hosting the Meet tab) + /// and — for Meet — the exact `CGWindowID` of the matched call window, so the + /// observer captures that window instead of guessing the browser's largest one. + struct DetectedCall: Equatable { + let app: DetectedApp + let bundleID: String + let windowID: CGWindowID? + } + enum Status: Equatable { case disabled case listening @@ -34,7 +44,7 @@ final class CallDetector: ObservableObject { @Published private(set) var status: Status = .disabled - var onCallStart: ((DetectedApp) -> Void)? + var onCallStart: ((DetectedCall) -> Void)? var onCallEnd: (() -> Void)? private let mic = MicActivityMonitor() @@ -42,7 +52,7 @@ final class CallDetector: ObservableObject { private var openTimer: Timer? private var closeTimer: Timer? 
private var inCall = false - private var currentApp: DetectedApp? + private var currentCall: DetectedCall? private var enabled = false private let openDelay: TimeInterval = 2.0 @@ -79,7 +89,7 @@ final class CallDetector: ObservableObject { pollTimer?.invalidate(); pollTimer = nil cancelOpen(); cancelClose() inCall = false - currentApp = nil + currentCall = nil status = .disabled } @@ -92,8 +102,8 @@ final class CallDetector: ObservableObject { if let candidate { cancelClose() if inCall { - currentApp = candidate - status = .inCall(candidate) + currentCall = candidate + status = .inCall(candidate.app) } else if openTimer == nil { openTimer = Timer.scheduledTimer(withTimeInterval: openDelay, repeats: false) { [weak self] _ in Task { @MainActor in self?.fireOpen() } @@ -112,18 +122,18 @@ final class CallDetector: ObservableObject { private func fireOpen() { openTimer = nil // Re-resolve the app at fire time (the debounce window may have changed it). - guard enabled, mic.isRunning, let app = detectApp(), !inCall else { return } + guard enabled, mic.isRunning, let call = detectApp(), !inCall else { return } inCall = true - currentApp = app - status = .inCall(app) - onCallStart?(app) + currentCall = call + status = .inCall(call.app) + onCallStart?(call) } private func fireClose() { closeTimer = nil guard enabled, inCall else { return } inCall = false - currentApp = nil + currentCall = nil status = .listening onCallEnd?() } @@ -137,7 +147,7 @@ final class CallDetector: ObservableObject { /// On macOS 14+ we attribute mic usage per-process (robust start AND stop, /// works for Signal/Zoom/Teams/Meet, ignores our own recording). On macOS 13 /// we fall back to the per-app call-window heuristic. - private func detectApp() -> DetectedApp? { + private func detectApp() -> DetectedCall? 
{ if #available(macOS 14.0, *) { return detectViaMicAttribution() } @@ -145,7 +155,7 @@ final class CallDetector: ObservableObject { } @available(macOS 14.0, *) - private func detectViaMicAttribution() -> DetectedApp? { + private func detectViaMicAttribution() -> DetectedCall? { let micPIDs = AudioInputProcesses.micUsingPIDs() guard !micPIDs.isEmpty else { return nil } let selfPID = NSRunningApplication.current.processIdentifier @@ -154,39 +164,42 @@ final class CallDetector: ObservableObject { let pid = app.processIdentifier guard pid != selfPID, micPIDs.contains(pid), let id = app.bundleIdentifier else { continue } if let native = Self.nativeApps.first(where: { $0.id == id }) { - return native.app // Signal/Zoom/Teams using the mic = in a call + return DetectedCall(app: native.app, bundleID: id, windowID: nil) // native: capture largest owned window } // A browser using the mic + a Meet window = a Meet call. The mic state // gives reliable start/stop; the window check keeps non-Meet browser // mic use (other web apps) from being mislabeled as a Meet recording. - if Self.browserIDs.contains(id), pidHasMeetWindow(pid) { - return .meet + // Capture that exact browser window (by ID), not just the browser. + if Self.browserIDs.contains(id), let wid = meetWindowID(pid) { + return DetectedCall(app: .meet, bundleID: id, windowID: wid) } } return nil } - private func pidHasMeetWindow(_ pid: pid_t) -> Bool { + /// The `CGWindowID` of this PID's Google Meet call window (title "Meet - …"), + /// or nil if none — also serves as the "is this a Meet call?" check. + private func meetWindowID(_ pid: pid_t) -> CGWindowID? { guard let info = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]] - else { return false } + else { return nil } for w in info { guard let wpid = w[kCGWindowOwnerPID as String] as? pid_t, wpid == pid, - let title = w[kCGWindowName as String] as? 
String else { continue } - if Self.looksLikeMeet(title) { return true } + let title = w[kCGWindowName as String] as? String, Self.looksLikeMeet(title) else { continue } + return w[kCGWindowNumber as String] as? CGWindowID } - return false + return nil } /// macOS 13 fallback: detect by the presence of a call WINDOW per app. - private func detectViaWindowTitle() -> DetectedApp? { - var pidToApp: [pid_t: DetectedApp] = [:] - var browserPIDs = Set<pid_t>() + private func detectViaWindowTitle() -> DetectedCall? { + var pidToApp: [pid_t: (app: DetectedApp, id: String)] = [:] + var browserPIDs: [pid_t: String] = [:] for app in NSWorkspace.shared.runningApplications { guard let id = app.bundleIdentifier else { continue } if let native = Self.nativeApps.first(where: { $0.id == id }) { - pidToApp[app.processIdentifier] = native.app + pidToApp[app.processIdentifier] = (native.app, id) } else if Self.browserIDs.contains(id) { - browserPIDs.insert(app.processIdentifier) + browserPIDs[app.processIdentifier] = id } } guard !pidToApp.isEmpty || !browserPIDs.isEmpty else { return nil } @@ -197,8 +210,13 @@ final class CallDetector: ObservableObject { guard let pid = info[kCGWindowOwnerPID as String] as? pid_t, let title = info[kCGWindowName as String] as? String, !title.isEmpty else { continue } - if browserPIDs.contains(pid), Self.looksLikeMeet(title) { return .meet } - if let app = pidToApp[pid], Self.isCallWindow(app, title) { return app } + if let id = browserPIDs[pid], Self.looksLikeMeet(title) { + return DetectedCall(app: .meet, bundleID: id, + windowID: info[kCGWindowNumber as String] as? 
CGWindowID) + } + if let native = pidToApp[pid], Self.isCallWindow(native.app, title) { + return DetectedCall(app: native.app, bundleID: native.id, windowID: nil) + } } return nil } diff --git a/Ten31Transcripts/Session/SessionController.swift b/Ten31Transcripts/Session/SessionController.swift index 6b355b9..1057b52 100644 --- a/Ten31Transcripts/Session/SessionController.swift +++ b/Ten31Transcripts/Session/SessionController.swift @@ -62,11 +62,20 @@ final class SessionController: ObservableObject { let sessionId: String let app: String let mixedURL: URL - let selfSpans: [VADSpan] + let timeline: [VisualTimeline.Segment] } private var lastProcess: ProcessInputs? private var processTask: Task<Void, Never>? private var recorder: AudioRecorder? + /// Visual capture for the current session (nil for manual recordings, apps with + /// no adapter, or when the window can't be captured — those record audio-only). + private var visualCapture: VisualCapture? + /// A visual capture whose `start()` is in flight (registered before the await), + /// so `prepareForTermination` can tear it down if its start-Task is orphaned. + private var inFlightVisual: VisualCapture? + /// App + capture target to start visual capture for, set at `start()`. `windowID` + /// pins the exact detected window (e.g. the Meet browser window); nil → largest. + private var pendingCapture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)? private var currentFolder: URL? private var startTime: Date? private var timer: Timer? 
@@ -86,7 +95,7 @@ final class SessionController: ObservableObject { fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json")) SessionController.shared = self - detector.onCallStart = { [weak self] app in self?.handleCallStart(app) } + detector.onCallStart = { [weak self] call in self?.handleCallStart(call) } detector.onCallEnd = { [weak self] in self?.handleCallEnd() } detector.$status .sink { [weak self] status in self?.detectionStatus = status } @@ -124,10 +133,11 @@ final class SessionController: ObservableObject { // MARK: - Auto-detection - private func handleCallStart(_ app: CallDetector.DetectedApp) { + private func handleCallStart(_ call: CallDetector.DetectedCall) { guard settings.autoRecordOnDetection else { return } switch state { - case .idle, .error: start(label: app.label, auto: true) + case .idle, .error: + start(label: call.app.label, auto: true, capture: (call.app, call.bundleID, call.windowID)) case .starting, .recording, .finishing: break // don't disturb an active session } } @@ -156,7 +166,8 @@ final class SessionController: ObservableObject { // MARK: - Start / Stop - private func start(label: String = "manual", auto: Bool = false) { + private func start(label: String = "manual", auto: Bool = false, + capture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)? 
= nil) { let folder: URL do { folder = try makeSessionFolder(label: label) @@ -168,6 +179,7 @@ final class SessionController: ObservableObject { currentLabel = label autoStarted = auto pendingAutoStop = false + pendingCapture = capture let recorder = AudioRecorder( micURL: folder.appendingPathComponent("mic.wav"), systemURL: folder.appendingPathComponent("system.wav"), @@ -177,6 +189,7 @@ final class SessionController: ObservableObject { state = .starting lifecycleGeneration += 1 + let myGen = lifecycleGeneration lifecycleTask = Task { do { try await recorder.start() // self-tears-down if it throws @@ -187,7 +200,12 @@ final class SessionController: ObservableObject { if self.pendingAutoStop { self.pendingAutoStop = false self.stop() + return } + // Attach visual capture on the SAME clock (best-effort, audio-only on failure). + // Pass this session's generation + recorder so a slow start can't + // adopt itself into a different session that began meanwhile. + await self.startVisual(t0Host: recorder.sharedT0Host(), generation: myGen, recorder: recorder) } catch { self.handleStartFailure(error) } @@ -213,18 +231,71 @@ final class SessionController: ObservableObject { } } + // MARK: - Visual capture + + /// Best-effort: start window capture for the detected app on the audio clock. + /// Any failure (no adapter, no window, Screen Recording denied) leaves + /// `visualCapture` nil and the session records audio-only. + /// + /// `generation`/`recorder` identify the session that launched this; because + /// `vc.start()` is a slow async call, a stop + a fresh start can complete during + /// it. We adopt the stream ONLY back into the same session — otherwise we cancel + /// it, so a stale capture can never attach to (or leak into) a different session. 
+ private func startVisual(t0Host: Double, generation: Int, recorder: AudioRecorder) async { + guard let capture = pendingCapture else { return } // manual recording → audio-only + pendingCapture = nil + guard let vc = VisualCapture(app: capture.app, bundleID: capture.bundleID, + windowID: capture.windowID, t0Host: t0Host) else { return } + // Register the live capture before the await so a quit (prepareForTermination) + // can drain it even if this start-Task gets orphaned by a concurrent stop. + inFlightVisual = vc + defer { if inFlightVisual === vc { inFlightVisual = nil } } + do { + try await vc.start() + // Adopt only if THIS session still owns the slot (same generation, same + // recorder, still recording); otherwise discard rather than leak/misattach. + guard generation == lifecycleGeneration, self.recorder === recorder, + case .recording = state else { + await vc.cancel() + return + } + if let existing = visualCapture { await existing.cancel() } // fail-closed + visualCapture = vc + } catch { + await vc.cancel() // tear down any partial stream; never break recording + } + } + + /// Stop visual capture (if any), write `visual_timeline.json`, and return the + /// timeline for the backend: visual segments + merged self-spans when visual + /// ran, otherwise the mic-VAD self spans alone. + private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?) 
async -> [VisualTimeline.Segment] { + let selfName = settings.selfName + if let vc = visualCapture, let folder { + visualCapture = nil + return await vc.finish( + selfSpans: result.selfSpans, selfName: selfName, + sessionId: folder.lastPathComponent, t0Unix: result.t0Unix, + durationSec: result.duration, folder: folder) + } + if let vc = visualCapture { await vc.cancel(); visualCapture = nil } + return TranscriptPipeline.timeline(fromSelfSpans: result.selfSpans, selfName: selfName) + } + private func stop() { guard let recorder else { return } state = .finishing stopTimer() + let folder = currentFolder lifecycleGeneration += 1 lifecycleTask = Task { let result = await recorder.stop() - self.finish(result) + let timeline = await self.stopVisualAndTimeline(result, folder: folder) + self.finish(result, timeline: timeline) } } - private func finish(_ result: RecordingResult) { + private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment]) { recorder = nil micLevel = 0 systemLevel = 0 @@ -237,7 +308,7 @@ final class SessionController: ObservableObject { duration: result.duration, selfSpanCount: result.selfSpans.count) lastProcess = ProcessInputs( folder: folder, sessionId: folder.lastPathComponent, app: currentLabel, - mixedURL: result.mixedURL, selfSpans: result.selfSpans) + mixedURL: result.mixedURL, timeline: timeline) } let autoSend = settings.autoSendOnStop currentFolder = nil @@ -250,10 +321,10 @@ final class SessionController: ObservableObject { // MARK: - Backend transcription - /// Send the last finished session to the backend → `speakers.json`. Uses the - /// mic-VAD self spans as the timeline for now; visual segments (Phase 3–4) get - /// merged in once the adapters land. Safe to call manually ("Send to backend") - /// or automatically on stop. + /// Send the last finished session to the backend → `speakers.json`. 
The + /// timeline is the session's visual segments (with mic-VAD self spans merged) + /// when visual capture ran, or the self spans alone otherwise. Safe to call + /// manually ("Send to backend") or automatically on stop. func processLastSession() { guard let inputs = lastProcess else { return } if case .processing = transcriptStatus { return } @@ -266,8 +337,7 @@ final class SessionController: ObservableObject { baseURL: settings.backendBaseURL, skipTLS: settings.skipTLSVerification, voiceprints: voiceprints) - let timeline = TranscriptPipeline.timeline( - fromSelfSpans: inputs.selfSpans, selfName: settings.selfName) + let timeline = inputs.timeline do { let speakers = try await pipeline.process( sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app, @@ -286,6 +356,9 @@ final class SessionController: ObservableObject { private func fail(_ message: String) { recorder = nil + visualCapture = nil // recorder.start() failed before visual started; nothing running + inFlightVisual = nil + pendingCapture = nil currentFolder = nil autoStarted = false pendingAutoStop = false @@ -312,11 +385,20 @@ final class SessionController: ObservableObject { if state == .recording, let recorder { state = .finishing stopTimer() - finish(await recorder.stop()) + let folder = currentFolder + let result = await recorder.stop() + let timeline = await stopVisualAndTimeline(result, folder: folder) + finish(result, timeline: timeline) } else if lifecycleGeneration == gen { break // settled: no new transition was spawned } } + // A visual start-Task orphaned by a concurrent stop may still hold a live + // stream that nothing else will tear down before exit — drain it here. 
+ if let vc = inFlightVisual { + inFlightVisual = nil + await vc.cancel() + } } // MARK: - Timer diff --git a/Ten31Transcripts/Visual/VisualCapture.swift b/Ten31Transcripts/Visual/VisualCapture.swift new file mode 100644 index 0000000..6fa7f7b --- /dev/null +++ b/Ten31Transcripts/Visual/VisualCapture.swift @@ -0,0 +1,83 @@ +import Foundation +import CoreGraphics + +/// Owns the visual side of one recording session: picks the app's adapter, runs a +/// `VisualObserver` over the call window, and on stop writes `visual_timeline.json` +/// and returns the speaker segments for the backend hand-off. +/// +/// Strictly best-effort: if there's no adapter for the app, or the window can't be +/// captured, the session simply records audio-only — visuals never block or break +/// the proven audio path. `init?` returns nil when the app has no visual adapter. +@available(macOS 13.0, *) +final class VisualCapture { + let app: CallDetector.DetectedApp + private let adapter: any AppAdapter + private let observer: VisualObserver + + init?(app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?, t0Host: Double) { + guard let adapter = AdapterRegistry.adapter(for: app) else { return nil } + self.app = app + self.adapter = adapter + self.observer = VisualObserver(bundleID: bundleID, windowID: windowID, adapter: adapter, + t0Host: t0Host, fps: adapter.preferredFPS) + } + + /// Start window capture. Throws if the window isn't capturable (no window yet, + /// Screen Recording denied) — the caller catches and falls back to audio-only. + func start() async throws { + try await observer.start() + } + + /// Stop and discard capture without writing anything (used when the session + /// ends before capture was fully adopted). + func cancel() async { + _ = await observer.stop() + } + + /// Clamp segment ends to the audio duration; drop any that become empty. Keeps + /// `visual_timeline.json` internally consistent and never sends the backend a + /// segment longer than the audio. 
(`duration <= 0` → passthrough.) + static func clampSegments(_ segs: [VisualTimeline.Segment], to duration: Double) -> [VisualTimeline.Segment] { + guard duration > 0 else { return segs } + return segs.compactMap { s in + let end = min(s.end, duration) + guard end > s.start else { return nil } + return .init(start: s.start, end: end, name: s.name, confidence: s.confidence, source: s.source) + } + } + + static func clampGaps(_ gaps: [VisualTimeline.Gap], to duration: Double) -> [VisualTimeline.Gap] { + guard duration > 0 else { return gaps } + return gaps.compactMap { g in + let end = min(g.end, duration) + guard end > g.start else { return nil } + return .init(start: g.start, end: end, reason: g.reason) + } + } + + /// Stop capture, fold in the mic-VAD self spans, write `visual_timeline.json` + /// into the session folder, and return the merged segments for `label-merge`. + func finish(selfSpans: [VADSpan], selfName: String, + sessionId: String, t0Unix: Double, durationSec: Double, + folder: URL) async -> [VisualTimeline.Segment] { + observer.addSelfSpans(selfSpans, selfName: selfName) + let (rawSegments, rawGaps) = await observer.stop() + + // The observer stops slightly after audio fixes `durationSec`, so a trailing + // gap/segment can run past it. Clamp ends so the JSON is internally consistent + // (and we never hand the backend a segment longer than the audio). + let segments = Self.clampSegments(rawSegments, to: durationSec) + let gaps = Self.clampGaps(rawGaps, to: durationSec) + + let names = Set(segments.map { $0.name }) + let participants = names.sorted().map { + VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil) + } + let timeline = VisualTimeline( + sessionId: sessionId, app: app.label, adapterVersion: adapter.adapterVersion, + t0Unix: t0Unix, durationSec: durationSec, fpsSampled: adapter.preferredFPS, + selfName: selfName, participants: participants, segments: segments, visualGaps: gaps) + try? 
timeline.write(to: folder.appendingPathComponent("visual_timeline.json")) + return segments + } +} diff --git a/Ten31Transcripts/Visual/VisualObserver.swift b/Ten31Transcripts/Visual/VisualObserver.swift index 4e54559..49ab1c0 100644 --- a/Ten31Transcripts/Visual/VisualObserver.swift +++ b/Ten31Transcripts/Visual/VisualObserver.swift @@ -14,6 +14,7 @@ import AppKit @available(macOS 13.0, *) final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput { private let bundleID: String + private let windowID: CGWindowID? private let adapter: any AppAdapter private let t0Host: Double private let fps: Int @@ -27,8 +28,9 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput { /// Optional live hook (e.g. for a debug HUD). Observations only; no frame. var onObservations: (([SpeakerObservation], TimeInterval) -> Void)? - init(bundleID: String, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) { + init(bundleID: String, windowID: CGWindowID? = nil, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) { self.bundleID = bundleID + self.windowID = windowID self.adapter = adapter self.t0Host = t0Host self.fps = max(1, fps) @@ -36,12 +38,15 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput { func start() async throws { let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false) - // The call window: the largest window owned by the target app. let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID } - guard let window = candidates.max(by: { $0.frame.width * $0.frame.height < $1.frame.width * $1.frame.height }) else { + // Prefer the EXACT detected window (e.g. the Meet browser window) by ID; fall + // back to the largest owned window when no ID was supplied or it's gone. 
+ guard let idx = Self.pickWindowIndex(candidates.map { ($0.windowID, $0.frame) }, preferredID: windowID), + candidates.indices.contains(idx) else { throw NSError(domain: "Ten31", code: 2, userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."]) } + let window = candidates[idx] let filter = SCContentFilter(desktopIndependentWindow: window) let config = SCStreamConfiguration() @@ -50,8 +55,9 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput { config.showsCursor = false config.pixelFormat = kCVPixelFormatType_32BGRA // window.frame is in points; capture at native pixels so OCR can read small - // initials/names (a half-res Retina capture badly hurts recognition). - let scale = NSScreen.main?.backingScaleFactor ?? 2 + // initials/names (a half-res Retina capture badly hurts recognition). Use the + // scale of the display the window is actually on, not always the main screen. + let scale = Self.backingScale(forWindowFrame: window.frame) config.width = max(2, Int(window.frame.width * scale)) config.height = max(2, Int(window.frame.height * scale)) @@ -61,8 +67,36 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput { self.stream = stream } + /// Choose which candidate window to capture: the one matching `preferredID` if + /// present, else the largest by area. Returns the index into `candidates`, or + /// nil if there are none. Pure/testable — no ScreenCaptureKit types. + static func pickWindowIndex(_ candidates: [(id: CGWindowID, frame: CGRect)], + preferredID: CGWindowID?) -> Int? { + guard !candidates.isEmpty else { return nil } + if let preferredID, let i = candidates.firstIndex(where: { $0.id == preferredID }) { + return i + } + return candidates.indices.max(by: { + candidates[$0].frame.width * candidates[$0].frame.height + < candidates[$1].frame.width * candidates[$1].frame.height + }) + } + + /// Backing scale of the display that contains the window's center. 
SCWindow.frame + /// is in global display (top-left origin) points; NSScreen is bottom-left, so we + /// flip the center through the primary screen's height before testing containment. + private static func backingScale(forWindowFrame frame: CGRect) -> CGFloat { + let screens = NSScreen.screens + guard let primary = screens.first else { return NSScreen.main?.backingScaleFactor ?? 2 } + let centerAppKit = CGPoint(x: frame.midX, y: primary.frame.maxY - frame.midY) + let screen = screens.first { $0.frame.contains(centerAppKit) } ?? NSScreen.main ?? primary + return screen.backingScaleFactor + } + func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) { - if let stream { try? await stream.stopCapture() } + // Bound stopCapture: an already-errored SCStream can block forever, which + // would wedge session finalization in `.finishing`. Mirror AudioRecorder. + if let stream { await Self.stopCaptureWithTimeout(stream, seconds: 3) } stream = nil return queue.sync { if let gs = gapStart { @@ -113,6 +147,17 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput { func stream(_ stream: SCStream, didStopWithError error: Error) {} + /// Race `stopCapture()` against a timeout so a wedged stream can't block forever. + /// NOTE(review): `withTaskGroup` awaits all children before returning — confirm a hung `stopCapture()` is really bounded. + private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async { + await withTaskGroup(of: Void.self) { group in + group.addTask { try? await stream.stopCapture() } + group.addTask { try? 
await Task.sleep(nanoseconds: UInt64(seconds * 1_000_000_000)) } + _ = await group.next() + group.cancelAll() + } + } + private enum FrameKind { case live, idle, gap } /// SCK delivers `.complete` only when content changes, `.idle` for a static diff --git a/Ten31TranscriptsTests/VisualObserverTests.swift b/Ten31TranscriptsTests/VisualObserverTests.swift new file mode 100644 index 0000000..bca8da2 --- /dev/null +++ b/Ten31TranscriptsTests/VisualObserverTests.swift @@ -0,0 +1,67 @@ +import XCTest +import CoreGraphics +@testable import Ten31Transcripts + +/// Window-selection logic: prefer the exact detected window (e.g. the Meet browser +/// window) by ID, else fall back to the largest owned window. This is the fix for +/// the "captures the wrong browser window" data-flow bug. +final class VisualObserverTests: XCTestCase { + + private func c(_ id: CGWindowID, _ w: CGFloat, _ h: CGFloat) -> (id: CGWindowID, frame: CGRect) { + (id, CGRect(x: 0, y: 0, width: w, height: h)) + } + + func testPrefersMatchingWindowIDOverLargest() { + // The Meet window (id 42) is NOT the largest — must still be chosen by ID. 
+ let candidates = [c(7, 1600, 1000), c(42, 800, 600), c(9, 1200, 900)] + let idx = VisualObserver.pickWindowIndex(candidates, preferredID: 42) + XCTAssertEqual(idx, 1) + } + + func testFallsBackToLargestWhenNoPreferredID() { + let candidates = [c(7, 800, 600), c(9, 1600, 1000), c(11, 1200, 900)] + let idx = VisualObserver.pickWindowIndex(candidates, preferredID: nil) + XCTAssertEqual(idx, 1) // the 1600x1000 window + } + + func testFallsBackToLargestWhenPreferredIDMissing() { + let candidates = [c(7, 800, 600), c(9, 1600, 1000)] + let idx = VisualObserver.pickWindowIndex(candidates, preferredID: 999) // gone + XCTAssertEqual(idx, 1) + } + + func testNilWhenNoCandidates() { + XCTAssertNil(VisualObserver.pickWindowIndex([], preferredID: 42)) + XCTAssertNil(VisualObserver.pickWindowIndex([], preferredID: nil)) + } + + // MARK: - Duration clamping (visual_timeline.json internal consistency) + + func testClampSegmentsToDuration() { + let segs = [ + VisualTimeline.Segment(start: 1, end: 5, name: "A", confidence: 0.9, source: "vision"), + VisualTimeline.Segment(start: 8, end: 12, name: "B", confidence: 0.8, source: "vision"), // end past 10 + VisualTimeline.Segment(start: 10.5, end: 11, name: "C", confidence: 0.7, source: "vision"), // fully past → dropped + ] + let out = VisualCapture.clampSegments(segs, to: 10) + XCTAssertEqual(out.count, 2) + XCTAssertEqual(out[0].end, 5, accuracy: 0.001) + XCTAssertEqual(out[1].end, 10, accuracy: 0.001) // clamped + XCTAssertFalse(out.contains { $0.name == "C" }) // dropped + } + + func testClampGapsToDuration() { + let gaps = [ + VisualTimeline.Gap(start: 2, end: 4, reason: "minimized"), + VisualTimeline.Gap(start: 9, end: 13, reason: "minimized"), // clamped to 10 + ] + let out = VisualCapture.clampGaps(gaps, to: 10) + XCTAssertEqual(out.count, 2) + XCTAssertEqual(out[1].end, 10, accuracy: 0.001) + } + + func testClampPassthroughWhenDurationUnknown() { + let segs = [VisualTimeline.Segment(start: 1, end: 99, name: "A", confidence: 
1, source: "vision")] + XCTAssertEqual(VisualCapture.clampSegments(segs, to: 0), segs) // no duration → unchanged + } +}