Wire visual capture into the recording lifecycle (failure-isolated)

Visual capture now runs alongside audio: on call start the session picks the
app's adapter, captures the call window on the SAME monotonic clock as the audio
(AudioRecorder.sharedT0Host), and on stop writes visual_timeline.json and hands
the backend the visual segments with mic-VAD self-spans merged. Any visual
failure (no adapter, no window, Screen Recording denied) leaves the session
recording audio-only — the proven path is never blocked or broken.

- CallDetector now emits DetectedCall{app, bundleID, windowID}: the exact
  CGWindowID of the matched Meet browser window (native apps → nil → largest).
- VisualCapture wraps VisualObserver + AdapterRegistry, writes visual_timeline.json.
- AudioRecorder.sharedT0Host() exposes the shared t0 for frame alignment.

Hardened per a 3-lens adversarial review (concurrency / failure-isolation /
data-flow), all 6 confirmed findings fixed:
- P0 (critical): startVisual could adopt a stale capture into a DIFFERENT session
  (cross-session SCStream leak + visual_timeline.json written to the wrong
  folder). Now gated on session identity — generation + recorder ===, still
  .recording — with fail-closed adoption; otherwise the stream is cancelled.
- P1: observer captured the browser's largest window, not the detected Meet
  window. Now targets the exact CGWindowID (pickWindowIndex, unit-tested),
  largest-area only as fallback.
- P2: a startVisual orphaned by a concurrent stop could leak a stream on quit.
  inFlightVisual is registered before the await and drained in prepareForTermination.
- P3: trailing visual gap/segment ends could exceed duration_sec. Clamped in
  VisualCapture (clampSegments/clampGaps, unit-tested).
- P4: capture pixel size used NSScreen.main scale; now uses the scale of the
  display actually hosting the window (OCR clarity on secondary displays).
- VisualObserver.stop() bounds stopCapture() with a 3s timeout (mirrors audio) so
  a wedged stream can't hang finalization.

25/25 XCTest pass. Live validation on real calls still pending.
This commit is contained in:
Grant Gilliam
2026-06-06 10:18:52 -05:00
parent c347acbd97
commit 880b56e426
6 changed files with 348 additions and 48 deletions
@@ -62,11 +62,20 @@ final class SessionController: ObservableObject {
let sessionId: String
let app: String
let mixedURL: URL
let selfSpans: [VADSpan]
let timeline: [VisualTimeline.Segment]
}
private var lastProcess: ProcessInputs?
private var processTask: Task<Void, Never>?
private var recorder: AudioRecorder?
/// Visual capture for the current session (nil for manual recordings, apps with
/// no adapter, or when the window can't be captured — those record audio-only).
private var visualCapture: VisualCapture?
/// A visual capture whose `start()` is in flight (registered before the await),
/// so `prepareForTermination` can tear it down if its start-Task is orphaned.
private var inFlightVisual: VisualCapture?
/// App + capture target to start visual capture for, set at `start()`. `windowID`
/// pins the exact detected window (e.g. the Meet browser window); nil → largest.
private var pendingCapture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)?
private var currentFolder: URL?
private var startTime: Date?
private var timer: Timer?
@@ -86,7 +95,7 @@ final class SessionController: ObservableObject {
fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json"))
SessionController.shared = self
detector.onCallStart = { [weak self] app in self?.handleCallStart(app) }
detector.onCallStart = { [weak self] call in self?.handleCallStart(call) }
detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
detector.$status
.sink { [weak self] status in self?.detectionStatus = status }
@@ -124,10 +133,11 @@ final class SessionController: ObservableObject {
// MARK: - Auto-detection
private func handleCallStart(_ app: CallDetector.DetectedApp) {
private func handleCallStart(_ call: CallDetector.DetectedCall) {
guard settings.autoRecordOnDetection else { return }
switch state {
case .idle, .error: start(label: app.label, auto: true)
case .idle, .error:
start(label: call.app.label, auto: true, capture: (call.app, call.bundleID, call.windowID))
case .starting, .recording, .finishing: break // don't disturb an active session
}
}
@@ -156,7 +166,8 @@ final class SessionController: ObservableObject {
// MARK: - Start / Stop
private func start(label: String = "manual", auto: Bool = false) {
private func start(label: String = "manual", auto: Bool = false,
capture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)? = nil) {
let folder: URL
do {
folder = try makeSessionFolder(label: label)
@@ -168,6 +179,7 @@ final class SessionController: ObservableObject {
currentLabel = label
autoStarted = auto
pendingAutoStop = false
pendingCapture = capture
let recorder = AudioRecorder(
micURL: folder.appendingPathComponent("mic.wav"),
systemURL: folder.appendingPathComponent("system.wav"),
@@ -177,6 +189,7 @@ final class SessionController: ObservableObject {
state = .starting
lifecycleGeneration += 1
let myGen = lifecycleGeneration
lifecycleTask = Task {
do {
try await recorder.start() // self-tears-down if it throws
@@ -187,7 +200,12 @@ final class SessionController: ObservableObject {
if self.pendingAutoStop {
self.pendingAutoStop = false
self.stop()
return
}
// Attach visual capture on the SAME clock (best-effort, audio-only on failure).
// Pass this session's generation + recorder so a slow start can't
// adopt itself into a different session that began meanwhile.
await self.startVisual(t0Host: recorder.sharedT0Host(), generation: myGen, recorder: recorder)
} catch {
self.handleStartFailure(error)
}
@@ -213,18 +231,71 @@ final class SessionController: ObservableObject {
}
}
// MARK: - Visual capture
/// Best-effort: start window capture for the detected app on the audio clock.
/// Any failure (no adapter, no window, Screen Recording denied) leaves
/// `visualCapture` nil and the session records audio-only.
///
/// `generation`/`recorder` identify the session that launched this; because
/// `vc.start()` is a slow async call, a stop + a fresh start can complete during
/// it. We adopt the stream ONLY back into the same session — otherwise we cancel
/// it, so a stale capture can never attach to (or leak into) a different session.
///
/// - Parameters:
///   - t0Host: Shared t0 forwarded to `VisualCapture` for frame alignment.
///   - generation: The `lifecycleGeneration` value of the session that launched this.
///   - recorder: That session's recorder, compared by identity at adoption time.
private func startVisual(t0Host: Double, generation: Int, recorder: AudioRecorder) async {
    guard let capture = pendingCapture else { return } // manual recording → audio-only
    pendingCapture = nil
    guard let vc = VisualCapture(app: capture.app, bundleID: capture.bundleID,
                                 windowID: capture.windowID, t0Host: t0Host) else { return }
    // Register the live capture before the await so a quit (prepareForTermination)
    // can drain it even if this start-Task gets orphaned by a concurrent stop.
    inFlightVisual = vc
    defer { if inFlightVisual === vc { inFlightVisual = nil } }
    do {
        try await vc.start()
        // Adopt only if THIS session still owns the slot (same generation, same
        // recorder, still recording); otherwise discard rather than leak/misattach.
        guard generation == lifecycleGeneration, self.recorder === recorder,
              case .recording = state else {
            await vc.cancel()
            return
        }
        // Hand ownership over with NO suspension point between the identity check
        // above and the assignment below: awaiting the displaced capture's
        // cancel() first would let a stop/start interleave and make the check
        // stale by the time `vc` is stored. The capture lives in exactly one slot
        // at a time (in-flight → adopted), so teardown paths never see it twice.
        if inFlightVisual === vc { inFlightVisual = nil }
        let displaced = visualCapture
        visualCapture = vc
        if let displaced { await displaced.cancel() } // fail-closed: never keep two
    } catch {
        await vc.cancel() // tear down any partial stream; never break recording
    }
}
/// Stop visual capture (if any), write `visual_timeline.json`, and return the
/// timeline for the backend: visual segments + merged self-spans when visual
/// ran, otherwise the mic-VAD self spans alone.
///
/// - Parameters:
///   - result: The finished recording; supplies the self spans, `t0Unix` and duration.
///   - folder: The session folder. When nil, a running capture is cancelled
///     instead of finished and only the self-span timeline is returned.
/// - Returns: The timeline segments to send to the backend.
private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?) async -> [VisualTimeline.Segment] {
    let selfName = settings.selfName
    if let vc = visualCapture {
        // Take ownership and clear the slot BEFORE any suspension point, in both
        // branches, so nothing can observe (or later clobber) the slot while the
        // capture is being torn down.
        visualCapture = nil
        if let folder {
            return await vc.finish(
                selfSpans: result.selfSpans, selfName: selfName,
                sessionId: folder.lastPathComponent, t0Unix: result.t0Unix,
                durationSec: result.duration, folder: folder)
        }
        // No folder to write into: discard the capture and fall back to audio-only.
        await vc.cancel()
    }
    return TranscriptPipeline.timeline(fromSelfSpans: result.selfSpans, selfName: selfName)
}
private func stop() {
guard let recorder else { return }
state = .finishing
stopTimer()
let folder = currentFolder
lifecycleGeneration += 1
lifecycleTask = Task {
let result = await recorder.stop()
self.finish(result)
let timeline = await self.stopVisualAndTimeline(result, folder: folder)
self.finish(result, timeline: timeline)
}
}
private func finish(_ result: RecordingResult) {
private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment]) {
recorder = nil
micLevel = 0
systemLevel = 0
@@ -237,7 +308,7 @@ final class SessionController: ObservableObject {
duration: result.duration, selfSpanCount: result.selfSpans.count)
lastProcess = ProcessInputs(
folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
mixedURL: result.mixedURL, selfSpans: result.selfSpans)
mixedURL: result.mixedURL, timeline: timeline)
}
let autoSend = settings.autoSendOnStop
currentFolder = nil
@@ -250,10 +321,10 @@ final class SessionController: ObservableObject {
// MARK: - Backend transcription
/// Send the last finished session to the backend → `speakers.json`. Uses the
/// mic-VAD self spans as the timeline for now; visual segments (Phase 3–4) get
/// merged in once the adapters land. Safe to call manually ("Send to backend")
/// or automatically on stop.
/// Send the last finished session to the backend → `speakers.json`. The
/// timeline is the session's visual segments (with mic-VAD self spans merged)
/// when visual capture ran, or the self spans alone otherwise. Safe to call
/// manually ("Send to backend") or automatically on stop.
func processLastSession() {
guard let inputs = lastProcess else { return }
if case .processing = transcriptStatus { return }
@@ -266,8 +337,7 @@ final class SessionController: ObservableObject {
baseURL: settings.backendBaseURL,
skipTLS: settings.skipTLSVerification,
voiceprints: voiceprints)
let timeline = TranscriptPipeline.timeline(
fromSelfSpans: inputs.selfSpans, selfName: settings.selfName)
let timeline = inputs.timeline
do {
let speakers = try await pipeline.process(
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
@@ -286,6 +356,9 @@ final class SessionController: ObservableObject {
private func fail(_ message: String) {
recorder = nil
visualCapture = nil // recorder.start() failed before visual started; nothing running
inFlightVisual = nil
pendingCapture = nil
currentFolder = nil
autoStarted = false
pendingAutoStop = false
@@ -312,11 +385,20 @@ final class SessionController: ObservableObject {
if state == .recording, let recorder {
state = .finishing
stopTimer()
finish(await recorder.stop())
let folder = currentFolder
let result = await recorder.stop()
let timeline = await stopVisualAndTimeline(result, folder: folder)
finish(result, timeline: timeline)
} else if lifecycleGeneration == gen {
break // settled: no new transition was spawned
}
}
// A visual start-Task orphaned by a concurrent stop may still hold a live
// stream that nothing else will tear down before exit — drain it here.
if let vc = inFlightVisual {
inFlightVisual = nil
await vc.cancel()
}
}
// MARK: - Timer