Wire visual capture into the recording lifecycle (failure-isolated)
Visual capture now runs alongside audio: on call start the session picks the
app's adapter, captures the call window on the SAME monotonic clock as the audio
(AudioRecorder.sharedT0Host), and on stop writes visual_timeline.json and hands
the backend the visual segments with mic-VAD self-spans merged. Any visual
failure (no adapter, no window, Screen Recording denied) leaves the session
recording audio-only — the proven path is never blocked or broken.
- CallDetector now emits DetectedCall{app, bundleID, windowID}: the exact
CGWindowID of the matched Meet browser window (native apps → nil → largest).
- VisualCapture wraps VisualObserver + AdapterRegistry, writes visual_timeline.json.
- AudioRecorder.sharedT0Host() exposes the shared t0 for frame alignment.
Hardened per a 3-lens adversarial review (concurrency / failure-isolation /
data-flow); all 6 confirmed findings are fixed:
- P0 (critical): startVisual could adopt a stale capture into a DIFFERENT session
(cross-session SCStream leak + visual_timeline.json written to the wrong
folder). Adoption is now gated on session identity — same generation, same
recorder instance (===), and state still .recording — and is fail-closed;
otherwise the stream is cancelled.
- P1: observer captured the browser's largest window, not the detected Meet
window. Now targets the exact CGWindowID (pickWindowIndex, unit-tested),
largest-area only as fallback.
- P2: a startVisual orphaned by a concurrent stop could leak a stream on quit.
inFlightVisual is registered before the await and drained in prepareForTermination.
- P3: trailing visual gap/segment ends could exceed duration_sec. Clamped in
VisualCapture (clampSegments/clampGaps, unit-tested).
- P4: capture pixel size used NSScreen.main scale; now uses the scale of the
display actually hosting the window (OCR clarity on secondary displays).
- VisualObserver.stop() bounds stopCapture() with a 3s timeout (mirrors audio) so
a wedged stream can't hang finalization.
25/25 XCTests pass. Live validation on real calls is still pending.
This commit is contained in:
@@ -88,6 +88,11 @@ final class AudioRecorder: NSObject, SCStreamDelegate, SCStreamOutput {
|
||||
}
|
||||
}
|
||||
|
||||
/// The shared monotonic t0 (`CACurrentMediaTime` base) captured at `start()`,
|
||||
/// so visual capture can timestamp frames against the exact same clock as the
|
||||
/// audio. Valid only after `start()` has returned.
|
||||
///
/// The value is read inside `ioQueue.sync`, so the caller blocks until the queue
/// has run the read.
/// NOTE(review): presumably `ioQueue` is the serial queue that also writes
/// `t0Host`; if so, calling this from work already running on `ioQueue` would
/// deadlock — confirm against the rest of `AudioRecorder`.
/// - Returns: The recorder's `t0Host` value.
func sharedT0Host() -> Double { ioQueue.sync { t0Host } }
|
||||
|
||||
func stop() async -> RecordingResult {
|
||||
// Stop the mic FIRST — always succeeds and halts mic capture immediately.
|
||||
engine?.inputNode.removeTap(onBus: 0)
|
||||
|
||||
@@ -26,6 +26,16 @@ final class CallDetector: ObservableObject {
|
||||
}
|
||||
}
|
||||
|
||||
/// A detected call plus what to capture for visuals: the bundle ID of the owner
|
||||
/// (the native app for Zoom/Teams/Signal, or the *browser* hosting the Meet tab)
|
||||
/// and — for Meet — the exact `CGWindowID` of the matched call window, so the
|
||||
/// observer captures that window instead of guessing the browser's largest one.
|
||||
struct DetectedCall: Equatable {
|
||||
/// Which call app was detected; also drives the `.inCall` status and the session label.
let app: DetectedApp
|
||||
/// Bundle identifier of the process that owns the call window (for Meet this is
/// the browser's bundle ID, not a Meet-specific one).
let bundleID: String
|
||||
/// The matched Meet window, when one could be identified. The detectors pass
/// `nil` for native apps, in which case the observer falls back to the largest
/// window owned by `bundleID`.
let windowID: CGWindowID?
|
||||
}
|
||||
|
||||
enum Status: Equatable {
|
||||
case disabled
|
||||
case listening
|
||||
@@ -34,7 +44,7 @@ final class CallDetector: ObservableObject {
|
||||
|
||||
@Published private(set) var status: Status = .disabled
|
||||
|
||||
var onCallStart: ((DetectedApp) -> Void)?
|
||||
var onCallStart: ((DetectedCall) -> Void)?
|
||||
var onCallEnd: (() -> Void)?
|
||||
|
||||
private let mic = MicActivityMonitor()
|
||||
@@ -42,7 +52,7 @@ final class CallDetector: ObservableObject {
|
||||
private var openTimer: Timer?
|
||||
private var closeTimer: Timer?
|
||||
private var inCall = false
|
||||
private var currentApp: DetectedApp?
|
||||
private var currentCall: DetectedCall?
|
||||
private var enabled = false
|
||||
|
||||
private let openDelay: TimeInterval = 2.0
|
||||
@@ -79,7 +89,7 @@ final class CallDetector: ObservableObject {
|
||||
pollTimer?.invalidate(); pollTimer = nil
|
||||
cancelOpen(); cancelClose()
|
||||
inCall = false
|
||||
currentApp = nil
|
||||
currentCall = nil
|
||||
status = .disabled
|
||||
}
|
||||
|
||||
@@ -92,8 +102,8 @@ final class CallDetector: ObservableObject {
|
||||
if let candidate {
|
||||
cancelClose()
|
||||
if inCall {
|
||||
currentApp = candidate
|
||||
status = .inCall(candidate)
|
||||
currentCall = candidate
|
||||
status = .inCall(candidate.app)
|
||||
} else if openTimer == nil {
|
||||
openTimer = Timer.scheduledTimer(withTimeInterval: openDelay, repeats: false) { [weak self] _ in
|
||||
Task { @MainActor in self?.fireOpen() }
|
||||
@@ -112,18 +122,18 @@ final class CallDetector: ObservableObject {
|
||||
private func fireOpen() {
|
||||
openTimer = nil
|
||||
// Re-resolve the app at fire time (the debounce window may have changed it).
|
||||
guard enabled, mic.isRunning, let app = detectApp(), !inCall else { return }
|
||||
guard enabled, mic.isRunning, let call = detectApp(), !inCall else { return }
|
||||
inCall = true
|
||||
currentApp = app
|
||||
status = .inCall(app)
|
||||
onCallStart?(app)
|
||||
currentCall = call
|
||||
status = .inCall(call.app)
|
||||
onCallStart?(call)
|
||||
}
|
||||
|
||||
private func fireClose() {
|
||||
closeTimer = nil
|
||||
guard enabled, inCall else { return }
|
||||
inCall = false
|
||||
currentApp = nil
|
||||
currentCall = nil
|
||||
status = .listening
|
||||
onCallEnd?()
|
||||
}
|
||||
@@ -137,7 +147,7 @@ final class CallDetector: ObservableObject {
|
||||
/// On macOS 14+ we attribute mic usage per-process (robust start AND stop,
|
||||
/// works for Signal/Zoom/Teams/Meet, ignores our own recording). On macOS 13
|
||||
/// we fall back to the per-app call-window heuristic.
|
||||
private func detectApp() -> DetectedApp? {
|
||||
private func detectApp() -> DetectedCall? {
|
||||
if #available(macOS 14.0, *) {
|
||||
return detectViaMicAttribution()
|
||||
}
|
||||
@@ -145,7 +155,7 @@ final class CallDetector: ObservableObject {
|
||||
}
|
||||
|
||||
@available(macOS 14.0, *)
|
||||
private func detectViaMicAttribution() -> DetectedApp? {
|
||||
private func detectViaMicAttribution() -> DetectedCall? {
|
||||
let micPIDs = AudioInputProcesses.micUsingPIDs()
|
||||
guard !micPIDs.isEmpty else { return nil }
|
||||
let selfPID = NSRunningApplication.current.processIdentifier
|
||||
@@ -154,39 +164,42 @@ final class CallDetector: ObservableObject {
|
||||
let pid = app.processIdentifier
|
||||
guard pid != selfPID, micPIDs.contains(pid), let id = app.bundleIdentifier else { continue }
|
||||
if let native = Self.nativeApps.first(where: { $0.id == id }) {
|
||||
return native.app // Signal/Zoom/Teams using the mic = in a call
|
||||
return DetectedCall(app: native.app, bundleID: id, windowID: nil) // native: capture largest owned window
|
||||
}
|
||||
// A browser using the mic + a Meet window = a Meet call. The mic state
|
||||
// gives reliable start/stop; the window check keeps non-Meet browser
|
||||
// mic use (other web apps) from being mislabeled as a Meet recording.
|
||||
if Self.browserIDs.contains(id), pidHasMeetWindow(pid) {
|
||||
return .meet
|
||||
// Capture that exact browser window (by ID), not just the browser.
|
||||
if Self.browserIDs.contains(id), let wid = meetWindowID(pid) {
|
||||
return DetectedCall(app: .meet, bundleID: id, windowID: wid)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
private func pidHasMeetWindow(_ pid: pid_t) -> Bool {
|
||||
/// The `CGWindowID` of this PID's Google Meet call window (title "Meet - …"),
|
||||
/// or nil if none — also serves as the "is this a Meet call?" check.
|
||||
private func meetWindowID(_ pid: pid_t) -> CGWindowID? {
|
||||
guard let info = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]]
|
||||
else { return false }
|
||||
else { return nil }
|
||||
for w in info {
|
||||
guard let wpid = w[kCGWindowOwnerPID as String] as? pid_t, wpid == pid,
|
||||
let title = w[kCGWindowName as String] as? String else { continue }
|
||||
if Self.looksLikeMeet(title) { return true }
|
||||
let title = w[kCGWindowName as String] as? String, Self.looksLikeMeet(title) else { continue }
|
||||
return w[kCGWindowNumber as String] as? CGWindowID
|
||||
}
|
||||
return false
|
||||
return nil
|
||||
}
|
||||
|
||||
/// macOS 13 fallback: detect by the presence of a call WINDOW per app.
|
||||
private func detectViaWindowTitle() -> DetectedApp? {
|
||||
var pidToApp: [pid_t: DetectedApp] = [:]
|
||||
var browserPIDs = Set<pid_t>()
|
||||
private func detectViaWindowTitle() -> DetectedCall? {
|
||||
var pidToApp: [pid_t: (app: DetectedApp, id: String)] = [:]
|
||||
var browserPIDs: [pid_t: String] = [:]
|
||||
for app in NSWorkspace.shared.runningApplications {
|
||||
guard let id = app.bundleIdentifier else { continue }
|
||||
if let native = Self.nativeApps.first(where: { $0.id == id }) {
|
||||
pidToApp[app.processIdentifier] = native.app
|
||||
pidToApp[app.processIdentifier] = (native.app, id)
|
||||
} else if Self.browserIDs.contains(id) {
|
||||
browserPIDs.insert(app.processIdentifier)
|
||||
browserPIDs[app.processIdentifier] = id
|
||||
}
|
||||
}
|
||||
guard !pidToApp.isEmpty || !browserPIDs.isEmpty else { return nil }
|
||||
@@ -197,8 +210,13 @@ final class CallDetector: ObservableObject {
|
||||
guard let pid = info[kCGWindowOwnerPID as String] as? pid_t,
|
||||
let title = info[kCGWindowName as String] as? String,
|
||||
!title.isEmpty else { continue }
|
||||
if browserPIDs.contains(pid), Self.looksLikeMeet(title) { return .meet }
|
||||
if let app = pidToApp[pid], Self.isCallWindow(app, title) { return app }
|
||||
if let id = browserPIDs[pid], Self.looksLikeMeet(title) {
|
||||
return DetectedCall(app: .meet, bundleID: id,
|
||||
windowID: info[kCGWindowNumber as String] as? CGWindowID)
|
||||
}
|
||||
if let native = pidToApp[pid], Self.isCallWindow(native.app, title) {
|
||||
return DetectedCall(app: native.app, bundleID: native.id, windowID: nil)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -62,11 +62,20 @@ final class SessionController: ObservableObject {
|
||||
let sessionId: String
|
||||
let app: String
|
||||
let mixedURL: URL
|
||||
let selfSpans: [VADSpan]
|
||||
let timeline: [VisualTimeline.Segment]
|
||||
}
|
||||
private var lastProcess: ProcessInputs?
|
||||
private var processTask: Task<Void, Never>?
|
||||
private var recorder: AudioRecorder?
|
||||
/// Visual capture for the current session (nil for manual recordings, apps with
|
||||
/// no adapter, or when the window can't be captured — those record audio-only).
|
||||
private var visualCapture: VisualCapture?
|
||||
/// A visual capture whose `start()` is in flight (registered before the await),
|
||||
/// so `prepareForTermination` can tear it down if its start-Task is orphaned.
|
||||
private var inFlightVisual: VisualCapture?
|
||||
/// App + capture target to start visual capture for, set at `start()`. `windowID`
|
||||
/// pins the exact detected window (e.g. the Meet browser window); nil → largest.
|
||||
private var pendingCapture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)?
|
||||
private var currentFolder: URL?
|
||||
private var startTime: Date?
|
||||
private var timer: Timer?
|
||||
@@ -86,7 +95,7 @@ final class SessionController: ObservableObject {
|
||||
fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json"))
|
||||
SessionController.shared = self
|
||||
|
||||
detector.onCallStart = { [weak self] app in self?.handleCallStart(app) }
|
||||
detector.onCallStart = { [weak self] call in self?.handleCallStart(call) }
|
||||
detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
|
||||
detector.$status
|
||||
.sink { [weak self] status in self?.detectionStatus = status }
|
||||
@@ -124,10 +133,11 @@ final class SessionController: ObservableObject {
|
||||
|
||||
// MARK: - Auto-detection
|
||||
|
||||
private func handleCallStart(_ app: CallDetector.DetectedApp) {
|
||||
private func handleCallStart(_ call: CallDetector.DetectedCall) {
|
||||
guard settings.autoRecordOnDetection else { return }
|
||||
switch state {
|
||||
case .idle, .error: start(label: app.label, auto: true)
|
||||
case .idle, .error:
|
||||
start(label: call.app.label, auto: true, capture: (call.app, call.bundleID, call.windowID))
|
||||
case .starting, .recording, .finishing: break // don't disturb an active session
|
||||
}
|
||||
}
|
||||
@@ -156,7 +166,8 @@ final class SessionController: ObservableObject {
|
||||
|
||||
// MARK: - Start / Stop
|
||||
|
||||
private func start(label: String = "manual", auto: Bool = false) {
|
||||
private func start(label: String = "manual", auto: Bool = false,
|
||||
capture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)? = nil) {
|
||||
let folder: URL
|
||||
do {
|
||||
folder = try makeSessionFolder(label: label)
|
||||
@@ -168,6 +179,7 @@ final class SessionController: ObservableObject {
|
||||
currentLabel = label
|
||||
autoStarted = auto
|
||||
pendingAutoStop = false
|
||||
pendingCapture = capture
|
||||
let recorder = AudioRecorder(
|
||||
micURL: folder.appendingPathComponent("mic.wav"),
|
||||
systemURL: folder.appendingPathComponent("system.wav"),
|
||||
@@ -177,6 +189,7 @@ final class SessionController: ObservableObject {
|
||||
state = .starting
|
||||
|
||||
lifecycleGeneration += 1
|
||||
let myGen = lifecycleGeneration
|
||||
lifecycleTask = Task {
|
||||
do {
|
||||
try await recorder.start() // self-tears-down if it throws
|
||||
@@ -187,7 +200,12 @@ final class SessionController: ObservableObject {
|
||||
if self.pendingAutoStop {
|
||||
self.pendingAutoStop = false
|
||||
self.stop()
|
||||
return
|
||||
}
|
||||
// Attach visual capture on the SAME clock (best-effort, audio-only on failure).
|
||||
// Pass this session's generation + recorder so a slow start can't
|
||||
// adopt itself into a different session that began meanwhile.
|
||||
await self.startVisual(t0Host: recorder.sharedT0Host(), generation: myGen, recorder: recorder)
|
||||
} catch {
|
||||
self.handleStartFailure(error)
|
||||
}
|
||||
@@ -213,18 +231,71 @@ final class SessionController: ObservableObject {
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Visual capture
|
||||
|
||||
/// Best-effort: start window capture for the detected app on the audio clock.
///
/// Any failure (manual recording, no adapter, no window, Screen Recording denied)
/// leaves `visualCapture` nil and the session records audio-only.
///
/// `generation`/`recorder` identify the session that launched this call. Because
/// `vc.start()` is a slow async call, a stop plus a fresh start can complete while
/// it is suspended. The stream is adopted ONLY back into the same session;
/// otherwise it is cancelled, so a stale capture can never attach to (or leak
/// into) a different session.
///
/// - Parameters:
///   - t0Host: The audio recorder's shared t0, so frames share the audio clock.
///   - generation: `lifecycleGeneration` as observed when the session started.
///   - recorder: The recorder instance that belongs to the launching session.
private func startVisual(t0Host: Double, generation: Int, recorder: AudioRecorder) async {
    guard let capture = pendingCapture else { return }   // manual recording → audio-only
    pendingCapture = nil
    guard let vc = VisualCapture(app: capture.app, bundleID: capture.bundleID,
                                 windowID: capture.windowID, t0Host: t0Host) else { return }

    // Register the live capture before the await so a quit (prepareForTermination)
    // can drain it even if this start-Task gets orphaned by a concurrent stop.
    inFlightVisual = vc
    defer { if inFlightVisual === vc { inFlightVisual = nil } }

    do {
        try await vc.start()
    } catch {
        await vc.cancel()   // tear down any partial stream; never break recording
        return
    }

    // Adopt only if THIS session still owns the slot (same generation, same
    // recorder, still recording); otherwise discard rather than leak/misattach.
    guard generation == lifecycleGeneration, self.recorder === recorder,
          case .recording = state else {
        await vc.cancel()
        return
    }

    // Claim the slot BEFORE awaiting anything else. The identity check above is
    // only valid until the next suspension point: if the displaced capture were
    // cancelled first, a stop that ran during that await would finalize the
    // session and `vc` would then be adopted into a dead session and leak.
    // With the slot claimed first, such a stop finds `vc` and finishes it.
    let displaced = visualCapture
    visualCapture = vc
    if let displaced { await displaced.cancel() }   // fail-closed: never keep two streams
}
|
||||
|
||||
/// Tear down the session's visual capture and produce the timeline handed to the
/// backend.
///
/// - When a capture is attached and a session folder exists, the capture is
///   finished: it writes `visual_timeline.json` and returns the visual segments
///   with the mic-VAD self spans merged in.
/// - When a capture is attached but there is no folder, the capture is cancelled
///   and the self spans alone are returned.
/// - When no capture is attached, the self spans alone are returned.
///
/// - Parameters:
///   - result: The finished audio recording (self spans, t0, duration).
///   - folder: The session folder, if one is known.
/// - Returns: The timeline segments for the backend hand-off.
private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?) async -> [VisualTimeline.Segment] {
    let selfName = settings.selfName

    guard let capture = visualCapture else {
        // Audio-only session: nothing to stop.
        return TranscriptPipeline.timeline(fromSelfSpans: result.selfSpans, selfName: selfName)
    }

    guard let sessionFolder = folder else {
        // Nowhere to write the timeline: discard the capture, keep the audio path.
        await capture.cancel()
        visualCapture = nil
        return TranscriptPipeline.timeline(fromSelfSpans: result.selfSpans, selfName: selfName)
    }

    // Release the slot before the slow finish so nothing else picks this capture up.
    visualCapture = nil
    let merged = await capture.finish(
        selfSpans: result.selfSpans, selfName: selfName,
        sessionId: sessionFolder.lastPathComponent, t0Unix: result.t0Unix,
        durationSec: result.duration, folder: sessionFolder)
    return merged
}
|
||||
|
||||
private func stop() {
|
||||
guard let recorder else { return }
|
||||
state = .finishing
|
||||
stopTimer()
|
||||
let folder = currentFolder
|
||||
lifecycleGeneration += 1
|
||||
lifecycleTask = Task {
|
||||
let result = await recorder.stop()
|
||||
self.finish(result)
|
||||
let timeline = await self.stopVisualAndTimeline(result, folder: folder)
|
||||
self.finish(result, timeline: timeline)
|
||||
}
|
||||
}
|
||||
|
||||
private func finish(_ result: RecordingResult) {
|
||||
private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment]) {
|
||||
recorder = nil
|
||||
micLevel = 0
|
||||
systemLevel = 0
|
||||
@@ -237,7 +308,7 @@ final class SessionController: ObservableObject {
|
||||
duration: result.duration, selfSpanCount: result.selfSpans.count)
|
||||
lastProcess = ProcessInputs(
|
||||
folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
|
||||
mixedURL: result.mixedURL, selfSpans: result.selfSpans)
|
||||
mixedURL: result.mixedURL, timeline: timeline)
|
||||
}
|
||||
let autoSend = settings.autoSendOnStop
|
||||
currentFolder = nil
|
||||
@@ -250,10 +321,10 @@ final class SessionController: ObservableObject {
|
||||
|
||||
// MARK: - Backend transcription
|
||||
|
||||
/// Send the last finished session to the backend → `speakers.json`. Uses the
|
||||
/// mic-VAD self spans as the timeline for now; visual segments (Phase 3–4) get
|
||||
/// merged in once the adapters land. Safe to call manually ("Send to backend")
|
||||
/// or automatically on stop.
|
||||
/// Send the last finished session to the backend → `speakers.json`. The
|
||||
/// timeline is the session's visual segments (with mic-VAD self spans merged)
|
||||
/// when visual capture ran, or the self spans alone otherwise. Safe to call
|
||||
/// manually ("Send to backend") or automatically on stop.
|
||||
func processLastSession() {
|
||||
guard let inputs = lastProcess else { return }
|
||||
if case .processing = transcriptStatus { return }
|
||||
@@ -266,8 +337,7 @@ final class SessionController: ObservableObject {
|
||||
baseURL: settings.backendBaseURL,
|
||||
skipTLS: settings.skipTLSVerification,
|
||||
voiceprints: voiceprints)
|
||||
let timeline = TranscriptPipeline.timeline(
|
||||
fromSelfSpans: inputs.selfSpans, selfName: settings.selfName)
|
||||
let timeline = inputs.timeline
|
||||
do {
|
||||
let speakers = try await pipeline.process(
|
||||
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
|
||||
@@ -286,6 +356,9 @@ final class SessionController: ObservableObject {
|
||||
|
||||
private func fail(_ message: String) {
|
||||
recorder = nil
|
||||
visualCapture = nil // recorder.start() failed before visual started; nothing running
|
||||
inFlightVisual = nil
|
||||
pendingCapture = nil
|
||||
currentFolder = nil
|
||||
autoStarted = false
|
||||
pendingAutoStop = false
|
||||
@@ -312,11 +385,20 @@ final class SessionController: ObservableObject {
|
||||
if state == .recording, let recorder {
|
||||
state = .finishing
|
||||
stopTimer()
|
||||
finish(await recorder.stop())
|
||||
let folder = currentFolder
|
||||
let result = await recorder.stop()
|
||||
let timeline = await stopVisualAndTimeline(result, folder: folder)
|
||||
finish(result, timeline: timeline)
|
||||
} else if lifecycleGeneration == gen {
|
||||
break // settled: no new transition was spawned
|
||||
}
|
||||
}
|
||||
// A visual start-Task orphaned by a concurrent stop may still hold a live
|
||||
// stream that nothing else will tear down before exit — drain it here.
|
||||
if let vc = inFlightVisual {
|
||||
inFlightVisual = nil
|
||||
await vc.cancel()
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Timer
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
import Foundation
|
||||
import CoreGraphics
|
||||
|
||||
/// Owns the visual side of one recording session: picks the app's adapter, runs a
/// `VisualObserver` over the call window, and on stop writes `visual_timeline.json`
/// and returns the speaker segments for the backend hand-off.
///
/// Strictly best-effort: if there's no adapter for the app, or the window can't be
/// captured, the session simply records audio-only — visuals never block or break
/// the proven audio path. `init?` returns nil when the app has no visual adapter.
@available(macOS 13.0, *)
final class VisualCapture {
    /// The call app this capture was created for.
    let app: CallDetector.DetectedApp
    private let adapter: any AppAdapter
    private let observer: VisualObserver

    /// Creates a capture for `app`, or returns nil when `AdapterRegistry` has no
    /// adapter for it.
    ///
    /// - Parameters:
    ///   - app: The detected call app.
    ///   - bundleID: Bundle identifier of the process that owns the call window.
    ///   - windowID: The exact window to capture, if known; forwarded to the observer.
    ///   - t0Host: The audio recorder's shared t0, so frames share the audio clock.
    init?(app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?, t0Host: Double) {
        guard let adapter = AdapterRegistry.adapter(for: app) else { return nil }
        self.app = app
        self.adapter = adapter
        self.observer = VisualObserver(bundleID: bundleID, windowID: windowID, adapter: adapter,
                                       t0Host: t0Host, fps: adapter.preferredFPS)
    }

    /// Start window capture. Throws if the window isn't capturable (no window yet,
    /// Screen Recording denied) — the caller catches and falls back to audio-only.
    func start() async throws {
        try await observer.start()
    }

    /// Stop and discard capture without writing anything (used when the session
    /// ends before capture was fully adopted).
    func cancel() async {
        _ = await observer.stop()
    }

    /// Clamp segment ends to the audio duration; drop any that become empty. Keeps
    /// `visual_timeline.json` internally consistent and never sends the backend a
    /// segment longer than the audio. (`duration <= 0` → passthrough.)
    ///
    /// - Parameters:
    ///   - segs: Segments as produced by the observer.
    ///   - duration: Audio duration in seconds.
    /// - Returns: Segments whose `end` is at most `duration`, in the original order.
    static func clampSegments(_ segs: [VisualTimeline.Segment], to duration: Double) -> [VisualTimeline.Segment] {
        guard duration > 0 else { return segs }
        return segs.compactMap { s in
            let end = min(s.end, duration)
            guard end > s.start else { return nil }   // starts at/after the audio end → drop
            return .init(start: s.start, end: end, name: s.name, confidence: s.confidence, source: s.source)
        }
    }

    /// Clamp gap ends to the audio duration; drop any that become empty.
    /// (`duration <= 0` → passthrough.)
    ///
    /// - Parameters:
    ///   - gaps: Visual gaps as produced by the observer.
    ///   - duration: Audio duration in seconds.
    /// - Returns: Gaps whose `end` is at most `duration`, in the original order.
    static func clampGaps(_ gaps: [VisualTimeline.Gap], to duration: Double) -> [VisualTimeline.Gap] {
        guard duration > 0 else { return gaps }
        return gaps.compactMap { g in
            let end = min(g.end, duration)
            guard end > g.start else { return nil }
            return .init(start: g.start, end: end, reason: g.reason)
        }
    }

    /// Stop capture, fold in the mic-VAD self spans, write `visual_timeline.json`
    /// into the session folder, and return the merged segments for `label-merge`.
    ///
    /// Writing the JSON is best-effort: a failed write is logged and the segments
    /// are still returned, so the backend hand-off is unaffected.
    ///
    /// - Parameters:
    ///   - selfSpans: Mic-VAD spans attributed to the local user.
    ///   - selfName: Display name of the local user.
    ///   - sessionId: Identifier recorded in the timeline file.
    ///   - t0Unix: Recording start time recorded in the timeline file.
    ///   - durationSec: Audio duration; segment and gap ends are clamped to it.
    ///   - folder: Session folder that receives `visual_timeline.json`.
    /// - Returns: The clamped segments.
    func finish(selfSpans: [VADSpan], selfName: String,
                sessionId: String, t0Unix: Double, durationSec: Double,
                folder: URL) async -> [VisualTimeline.Segment] {
        observer.addSelfSpans(selfSpans, selfName: selfName)
        let (rawSegments, rawGaps) = await observer.stop()

        // The observer stops slightly after audio fixes `durationSec`, so a trailing
        // gap/segment can run past it. Clamp ends so the JSON is internally consistent
        // (and we never hand the backend a segment longer than the audio).
        let segments = Self.clampSegments(rawSegments, to: durationSec)
        let gaps = Self.clampGaps(rawGaps, to: durationSec)

        let names = Set(segments.map { $0.name })
        let participants = names.sorted().map {
            VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil)
        }
        let timeline = VisualTimeline(
            sessionId: sessionId, app: app.label, adapterVersion: adapter.adapterVersion,
            t0Unix: t0Unix, durationSec: durationSec, fpsSampled: adapter.preferredFPS,
            selfName: selfName, participants: participants, segments: segments, visualGaps: gaps)

        // Best-effort write, but never silent: a missing visual_timeline.json with no
        // trace is hard to diagnose after the fact.
        let destination = folder.appendingPathComponent("visual_timeline.json")
        do {
            try timeline.write(to: destination)
        } catch {
            NSLog("VisualCapture: failed to write %@: %@", destination.path, String(describing: error))
        }
        return segments
    }
}
|
||||
@@ -14,6 +14,7 @@ import AppKit
|
||||
@available(macOS 13.0, *)
|
||||
final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
|
||||
private let bundleID: String
|
||||
private let windowID: CGWindowID?
|
||||
private let adapter: any AppAdapter
|
||||
private let t0Host: Double
|
||||
private let fps: Int
|
||||
@@ -27,8 +28,9 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
|
||||
/// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
|
||||
var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?
|
||||
|
||||
init(bundleID: String, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
|
||||
init(bundleID: String, windowID: CGWindowID? = nil, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
|
||||
self.bundleID = bundleID
|
||||
self.windowID = windowID
|
||||
self.adapter = adapter
|
||||
self.t0Host = t0Host
|
||||
self.fps = max(1, fps)
|
||||
@@ -36,12 +38,15 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
|
||||
|
||||
func start() async throws {
|
||||
let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
|
||||
// The call window: the largest window owned by the target app.
|
||||
let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }
|
||||
guard let window = candidates.max(by: { $0.frame.width * $0.frame.height < $1.frame.width * $1.frame.height }) else {
|
||||
// Prefer the EXACT detected window (e.g. the Meet browser window) by ID; fall
|
||||
// back to the largest owned window when no ID was supplied or it's gone.
|
||||
guard let idx = Self.pickWindowIndex(candidates.map { ($0.windowID, $0.frame) }, preferredID: windowID),
|
||||
candidates.indices.contains(idx) else {
|
||||
throw NSError(domain: "Ten31", code: 2,
|
||||
userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
|
||||
}
|
||||
let window = candidates[idx]
|
||||
|
||||
let filter = SCContentFilter(desktopIndependentWindow: window)
|
||||
let config = SCStreamConfiguration()
|
||||
@@ -50,8 +55,9 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
|
||||
config.showsCursor = false
|
||||
config.pixelFormat = kCVPixelFormatType_32BGRA
|
||||
// window.frame is in points; capture at native pixels so OCR can read small
|
||||
// initials/names (a half-res Retina capture badly hurts recognition).
|
||||
let scale = NSScreen.main?.backingScaleFactor ?? 2
|
||||
// initials/names (a half-res Retina capture badly hurts recognition). Use the
|
||||
// scale of the display the window is actually on, not always the main screen.
|
||||
let scale = Self.backingScale(forWindowFrame: window.frame)
|
||||
config.width = max(2, Int(window.frame.width * scale))
|
||||
config.height = max(2, Int(window.frame.height * scale))
|
||||
|
||||
@@ -61,8 +67,36 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
|
||||
self.stream = stream
|
||||
}
|
||||
|
||||
/// Decide which candidate window should be captured.
///
/// The window whose ID equals `preferredID` wins when it is present. Otherwise the
/// candidate with the greatest area (`width * height`) is chosen; on a tie the
/// earliest such candidate is kept. Pure/testable — no ScreenCaptureKit types.
///
/// - Parameters:
///   - candidates: `(id, frame)` pairs for the windows owned by the target app.
///   - preferredID: The exact window to prefer, if one was detected.
/// - Returns: An index into `candidates`, or nil when `candidates` is empty.
static func pickWindowIndex(_ candidates: [(id: CGWindowID, frame: CGRect)],
                            preferredID: CGWindowID?) -> Int? {
    if candidates.isEmpty { return nil }

    // Exact match first: the detected window may not be the biggest one.
    if let wanted = preferredID {
        for (position, candidate) in candidates.enumerated() where candidate.id == wanted {
            return position
        }
    }

    // Fallback: linear scan for the largest area, replacing only on a strict increase.
    var bestPosition = 0
    var bestArea = candidates[0].frame.width * candidates[0].frame.height
    for position in candidates.indices.dropFirst() {
        let area = candidates[position].frame.width * candidates[position].frame.height
        if bestArea < area {
            bestPosition = position
            bestArea = area
        }
    }
    return bestPosition
}
|
||||
|
||||
/// Backing scale factor of the display that contains the center of `frame`.
///
/// `SCWindow.frame` is in global display points with a top-left origin, while
/// `NSScreen` frames use a bottom-left origin, so the center's y is flipped through
/// the first screen's `maxY` before the containment test. If no screen contains
/// the point, the main screen (or else the first screen) is used; with no screens
/// at all the result is the main screen's scale, or 2.
///
/// - Parameter frame: The window frame as reported by ScreenCaptureKit.
/// - Returns: The chosen screen's `backingScaleFactor`.
private static func backingScale(forWindowFrame frame: CGRect) -> CGFloat {
    let displays = NSScreen.screens
    if displays.isEmpty {
        return NSScreen.main?.backingScaleFactor ?? 2
    }

    let primary = displays[0]
    let flippedCenter = CGPoint(x: frame.midX, y: primary.frame.maxY - frame.midY)

    for display in displays where display.frame.contains(flippedCenter) {
        return display.backingScaleFactor
    }
    let fallback = NSScreen.main ?? primary
    return fallback.backingScaleFactor
}
|
||||
|
||||
func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
|
||||
if let stream { try? await stream.stopCapture() }
|
||||
// Bound stopCapture: an already-errored SCStream can block forever, which
|
||||
// would wedge session finalization in `.finishing`. Mirror AudioRecorder.
|
||||
if let stream { await Self.stopCaptureWithTimeout(stream, seconds: 3) }
|
||||
stream = nil
|
||||
return queue.sync {
|
||||
if let gs = gapStart {
|
||||
@@ -113,6 +147,17 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
|
||||
|
||||
func stream(_ stream: SCStream, didStopWithError error: Error) {}
|
||||
|
||||
/// Proceed as soon as `stopCapture()` returns OR the timeout fires, so a wedged
/// stream can't block forever.
///
/// This deliberately does NOT use a task group: a task group waits for every child
/// task before it returns, and `stopCapture()` does not observe cancellation, so a
/// wedged stop would keep the group (and the caller) suspended past the timeout.
/// Instead, two unstructured tasks race to resume a single continuation; whichever
/// finishes first lets the caller continue. A wedged `stopCapture()` task is simply
/// abandoned.
///
/// - Parameters:
///   - stream: The stream to stop.
///   - seconds: Maximum time to wait; negative values are treated as 0.
private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async {
    /// Resumes the wrapped continuation exactly once, from whichever task gets there first.
    final class OneShot: @unchecked Sendable {
        private let lock = NSLock()
        private var continuation: CheckedContinuation<Void, Never>?

        init(_ continuation: CheckedContinuation<Void, Never>) {
            self.continuation = continuation
        }

        func fire() {
            lock.lock()
            let pending = continuation
            continuation = nil
            lock.unlock()
            pending?.resume()
        }
    }

    let timeoutNanos = UInt64(max(0, seconds) * 1_000_000_000)
    await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
        let gate = OneShot(continuation)
        let timer = Task.detached {
            try? await Task.sleep(nanoseconds: timeoutNanos)
            gate.fire()
        }
        Task.detached {
            try? await stream.stopCapture()
            timer.cancel()   // stop finished first: don't keep the sleeper around
            gate.fire()
        }
    }
}
|
||||
|
||||
private enum FrameKind { case live, idle, gap }
|
||||
|
||||
/// SCK delivers `.complete` only when content changes, `.idle` for a static
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
import XCTest
|
||||
import CoreGraphics
|
||||
@testable import Ten31Transcripts
|
||||
|
||||
/// Covers two pure helpers:
/// - `VisualObserver.pickWindowIndex`: the exact detected window (e.g. the Meet
///   browser window) wins by ID, with the largest owned window as the fallback.
///   This is the fix for the "captures the wrong browser window" data-flow bug.
/// - `VisualCapture.clampSegments` / `clampGaps`: ends are clamped to the audio
///   duration so `visual_timeline.json` stays internally consistent.
final class VisualObserverTests: XCTestCase {

    /// Builds a candidate with the given window ID and size, positioned at the origin.
    private func c(_ id: CGWindowID, _ w: CGFloat, _ h: CGFloat) -> (id: CGWindowID, frame: CGRect) {
        let frame = CGRect(x: 0, y: 0, width: w, height: h)
        return (id, frame)
    }

    // MARK: - Window selection

    func testPrefersMatchingWindowIDOverLargest() {
        // The Meet window (id 42) is NOT the largest — must still be chosen by ID.
        let windows = [c(7, 1600, 1000), c(42, 800, 600), c(9, 1200, 900)]

        let chosen = VisualObserver.pickWindowIndex(windows, preferredID: 42)

        XCTAssertEqual(chosen, 1)
    }

    func testFallsBackToLargestWhenNoPreferredID() {
        let windows = [c(7, 800, 600), c(9, 1600, 1000), c(11, 1200, 900)]

        let chosen = VisualObserver.pickWindowIndex(windows, preferredID: nil)

        XCTAssertEqual(chosen, 1)   // the 1600x1000 window
    }

    func testFallsBackToLargestWhenPreferredIDMissing() {
        let windows = [c(7, 800, 600), c(9, 1600, 1000)]

        let chosen = VisualObserver.pickWindowIndex(windows, preferredID: 999)   // gone

        XCTAssertEqual(chosen, 1)
    }

    func testNilWhenNoCandidates() {
        let none: [(id: CGWindowID, frame: CGRect)] = []

        XCTAssertNil(VisualObserver.pickWindowIndex(none, preferredID: 42))
        XCTAssertNil(VisualObserver.pickWindowIndex(none, preferredID: nil))
    }

    // MARK: - Duration clamping (visual_timeline.json internal consistency)

    func testClampSegmentsToDuration() {
        let inside = VisualTimeline.Segment(start: 1, end: 5, name: "A", confidence: 0.9, source: "vision")
        let overrunning = VisualTimeline.Segment(start: 8, end: 12, name: "B", confidence: 0.8, source: "vision")   // end past 10
        let beyond = VisualTimeline.Segment(start: 10.5, end: 11, name: "C", confidence: 0.7, source: "vision")     // fully past → dropped

        let clamped = VisualCapture.clampSegments([inside, overrunning, beyond], to: 10)

        XCTAssertEqual(clamped.count, 2)
        XCTAssertEqual(clamped[0].end, 5, accuracy: 0.001)
        XCTAssertEqual(clamped[1].end, 10, accuracy: 0.001)   // clamped
        XCTAssertFalse(clamped.contains { $0.name == "C" })   // dropped
    }

    func testClampGapsToDuration() {
        let inside = VisualTimeline.Gap(start: 2, end: 4, reason: "minimized")
        let overrunning = VisualTimeline.Gap(start: 9, end: 13, reason: "minimized")   // clamped to 10

        let clamped = VisualCapture.clampGaps([inside, overrunning], to: 10)

        XCTAssertEqual(clamped.count, 2)
        XCTAssertEqual(clamped[1].end, 10, accuracy: 0.001)
    }

    func testClampPassthroughWhenDurationUnknown() {
        let untouched = [VisualTimeline.Segment(start: 1, end: 99, name: "A", confidence: 1, source: "vision")]

        let result = VisualCapture.clampSegments(untouched, to: 0)

        XCTAssertEqual(result, untouched)   // no duration → unchanged
    }
}
|
||||
Reference in New Issue
Block a user