Wire visual capture into the recording lifecycle (failure-isolated)
Visual capture now runs alongside audio: on call start the session picks the
app's adapter, captures the call window on the SAME monotonic clock as the audio
(AudioRecorder.sharedT0Host), and on stop writes visual_timeline.json and hands
the backend the visual segments with mic-VAD self-spans merged. Any visual
failure (no adapter, no window, Screen Recording denied) leaves the session
recording audio-only — the proven path is never blocked or broken.
- CallDetector now emits DetectedCall{app, bundleID, windowID}, where windowID is
the exact CGWindowID of the matched Meet browser window (native apps pass nil,
which falls back to the largest window).
- VisualCapture wraps VisualObserver + AdapterRegistry, writes visual_timeline.json.
- AudioRecorder.sharedT0Host() exposes the shared t0 for frame alignment.
Hardened per a 3-lens adversarial review (concurrency / failure-isolation /
data-flow), all 6 confirmed findings fixed:
- P0 (critical): startVisual could adopt a stale capture into a DIFFERENT session
(cross-session SCStream leak + visual_timeline.json written to the wrong
folder). Adoption is now gated on session identity (same generation, same
recorder instance via ===, and state still .recording) and is fail-closed;
otherwise the stream is cancelled.
- P1: observer captured the browser's largest window, not the detected Meet
window. Now targets the exact CGWindowID (pickWindowIndex, unit-tested),
largest-area only as fallback.
- P2: a startVisual orphaned by a concurrent stop could leak a stream on quit.
inFlightVisual is registered before the await and drained in prepareForTermination.
- P3: trailing visual gap/segment ends could exceed duration_sec. Clamped in
VisualCapture (clampSegments/clampGaps, unit-tested).
- P4: capture pixel size used NSScreen.main scale; now uses the scale of the
display actually hosting the window (OCR clarity on secondary displays).
- VisualObserver.stop() bounds stopCapture() with a 3s timeout (mirrors audio) so
a wedged stream can't hang finalization.
All 25 XCTest cases pass (25/25). Live validation on real calls is still pending.
This commit is contained in:
@@ -62,11 +62,20 @@ final class SessionController: ObservableObject {
|
||||
let sessionId: String
|
||||
let app: String
|
||||
let mixedURL: URL
|
||||
let selfSpans: [VADSpan]
|
||||
let timeline: [VisualTimeline.Segment]
|
||||
}
|
||||
private var lastProcess: ProcessInputs?
|
||||
private var processTask: Task<Void, Never>?
|
||||
private var recorder: AudioRecorder?
|
||||
/// Visual capture for the current session (nil for manual recordings, apps with
|
||||
/// no adapter, or when the window can't be captured — those record audio-only).
|
||||
private var visualCapture: VisualCapture?
|
||||
/// A visual capture whose `start()` is in flight (registered before the await),
|
||||
/// so `prepareForTermination` can tear it down if its start-Task is orphaned.
|
||||
private var inFlightVisual: VisualCapture?
|
||||
/// App + capture target to start visual capture for, set at `start()`. `windowID`
|
||||
/// pins the exact detected window (e.g. the Meet browser window); nil → largest.
|
||||
private var pendingCapture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)?
|
||||
private var currentFolder: URL?
|
||||
private var startTime: Date?
|
||||
private var timer: Timer?
|
||||
@@ -86,7 +95,7 @@ final class SessionController: ObservableObject {
|
||||
fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json"))
|
||||
SessionController.shared = self
|
||||
|
||||
detector.onCallStart = { [weak self] app in self?.handleCallStart(app) }
|
||||
detector.onCallStart = { [weak self] call in self?.handleCallStart(call) }
|
||||
detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
|
||||
detector.$status
|
||||
.sink { [weak self] status in self?.detectionStatus = status }
|
||||
@@ -124,10 +133,11 @@ final class SessionController: ObservableObject {
|
||||
|
||||
// MARK: - Auto-detection
|
||||
|
||||
private func handleCallStart(_ app: CallDetector.DetectedApp) {
|
||||
private func handleCallStart(_ call: CallDetector.DetectedCall) {
|
||||
guard settings.autoRecordOnDetection else { return }
|
||||
switch state {
|
||||
case .idle, .error: start(label: app.label, auto: true)
|
||||
case .idle, .error:
|
||||
start(label: call.app.label, auto: true, capture: (call.app, call.bundleID, call.windowID))
|
||||
case .starting, .recording, .finishing: break // don't disturb an active session
|
||||
}
|
||||
}
|
||||
@@ -156,7 +166,8 @@ final class SessionController: ObservableObject {
|
||||
|
||||
// MARK: - Start / Stop
|
||||
|
||||
private func start(label: String = "manual", auto: Bool = false) {
|
||||
private func start(label: String = "manual", auto: Bool = false,
|
||||
capture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)? = nil) {
|
||||
let folder: URL
|
||||
do {
|
||||
folder = try makeSessionFolder(label: label)
|
||||
@@ -168,6 +179,7 @@ final class SessionController: ObservableObject {
|
||||
currentLabel = label
|
||||
autoStarted = auto
|
||||
pendingAutoStop = false
|
||||
pendingCapture = capture
|
||||
let recorder = AudioRecorder(
|
||||
micURL: folder.appendingPathComponent("mic.wav"),
|
||||
systemURL: folder.appendingPathComponent("system.wav"),
|
||||
@@ -177,6 +189,7 @@ final class SessionController: ObservableObject {
|
||||
state = .starting
|
||||
|
||||
lifecycleGeneration += 1
|
||||
let myGen = lifecycleGeneration
|
||||
lifecycleTask = Task {
|
||||
do {
|
||||
try await recorder.start() // self-tears-down if it throws
|
||||
@@ -187,7 +200,12 @@ final class SessionController: ObservableObject {
|
||||
if self.pendingAutoStop {
|
||||
self.pendingAutoStop = false
|
||||
self.stop()
|
||||
return
|
||||
}
|
||||
// Attach visual capture on the SAME clock (best-effort, audio-only on failure).
|
||||
// Pass this session's generation + recorder so a slow start can't
|
||||
// adopt itself into a different session that began meanwhile.
|
||||
await self.startVisual(t0Host: recorder.sharedT0Host(), generation: myGen, recorder: recorder)
|
||||
} catch {
|
||||
self.handleStartFailure(error)
|
||||
}
|
||||
@@ -213,18 +231,71 @@ final class SessionController: ObservableObject {
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Visual capture
|
||||
|
||||
/// Best-effort: starts window capture for the detected app on the audio clock.
///
/// Any failure (no pending capture, `VisualCapture` initialisation returning nil,
/// or `start()` throwing) leaves `visualCapture` nil, so the session records
/// audio-only.
///
/// `vc.start()` is a slow async call, so a stop plus a fresh start can complete
/// while it is suspended. The stream is adopted ONLY back into the session that
/// launched it; otherwise it is cancelled, so a stale capture can never attach to
/// (or leak into) a different session.
///
/// - Parameters:
///   - t0Host: Shared clock origin obtained from the audio recorder, forwarded to
///     `VisualCapture` so frames align with the audio.
///   - generation: The `lifecycleGeneration` value of the session that launched
///     this call.
///   - recorder: The recorder of that same session; compared by identity (`===`).
private func startVisual(t0Host: Double, generation: Int, recorder: AudioRecorder) async {
    guard let capture = pendingCapture else { return } // manual recording → audio-only
    pendingCapture = nil
    guard let vc = VisualCapture(app: capture.app, bundleID: capture.bundleID,
                                 windowID: capture.windowID, t0Host: t0Host) else { return }

    // Register the live capture before the await so a quit (prepareForTermination)
    // can drain it even if this start-Task gets orphaned by a concurrent stop.
    inFlightVisual = vc
    defer { if inFlightVisual === vc { inFlightVisual = nil } }

    do {
        try await vc.start()
    } catch {
        await vc.cancel() // tear down any partial stream; never break recording
        return
    }

    // Adopt only if THIS session still owns the slot (same generation, same
    // recorder, still recording); otherwise discard rather than leak/misattach.
    guard generation == lifecycleGeneration, self.recorder === recorder,
          case .recording = state else {
        await vc.cancel()
        return
    }

    // Claim the slot synchronously, while the identity check above is still known
    // to hold. Awaiting the displaced capture's teardown BEFORE assigning would
    // open a suspension window in which a stop could run, miss `vc`, and leave its
    // stream running in a session that has already finished.
    let displaced = visualCapture
    visualCapture = vc
    // Ownership has moved to the session; drop the in-flight registration so the
    // termination drain does not cancel a capture that the stop path will finish.
    if inFlightVisual === vc { inFlightVisual = nil }
    if let displaced { await displaced.cancel() } // fail-closed: never keep two captures
}
|
||||
|
||||
/// Stops visual capture (if any) and returns the timeline to hand to the backend.
///
/// When a capture is attached and a session folder is known, the capture is
/// finished via `VisualCapture.finish(...)` (which, per the original doc comment,
/// writes `visual_timeline.json`) and its result is returned: visual segments
/// with the self-spans merged. In every other case the timeline is built from
/// the mic-VAD self-spans alone; a capture that is attached without a folder is
/// cancelled first.
///
/// - Parameters:
///   - result: The finished audio recording; supplies the self-spans, `t0Unix`
///     and duration.
///   - folder: The session folder, or nil when none is known.
/// - Returns: The timeline segments for the backend.
private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?) async -> [VisualTimeline.Segment] {
    let selfName = settings.selfName

    // Take the capture out of the slot once, before any suspension point, so both
    // teardown paths release ownership the same way and nothing assigned to the
    // slot during an await can be overwritten with nil afterwards.
    let claimed = visualCapture
    visualCapture = nil

    guard let vc = claimed else {
        return TranscriptPipeline.timeline(fromSelfSpans: result.selfSpans, selfName: selfName)
    }

    guard let folder else {
        // Nowhere to write the timeline: tear the stream down and fall back to
        // the audio-only timeline.
        await vc.cancel()
        return TranscriptPipeline.timeline(fromSelfSpans: result.selfSpans, selfName: selfName)
    }

    return await vc.finish(
        selfSpans: result.selfSpans, selfName: selfName,
        sessionId: folder.lastPathComponent, t0Unix: result.t0Unix,
        durationSec: result.duration, folder: folder)
}
|
||||
|
||||
private func stop() {
|
||||
guard let recorder else { return }
|
||||
state = .finishing
|
||||
stopTimer()
|
||||
let folder = currentFolder
|
||||
lifecycleGeneration += 1
|
||||
lifecycleTask = Task {
|
||||
let result = await recorder.stop()
|
||||
self.finish(result)
|
||||
let timeline = await self.stopVisualAndTimeline(result, folder: folder)
|
||||
self.finish(result, timeline: timeline)
|
||||
}
|
||||
}
|
||||
|
||||
private func finish(_ result: RecordingResult) {
|
||||
private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment]) {
|
||||
recorder = nil
|
||||
micLevel = 0
|
||||
systemLevel = 0
|
||||
@@ -237,7 +308,7 @@ final class SessionController: ObservableObject {
|
||||
duration: result.duration, selfSpanCount: result.selfSpans.count)
|
||||
lastProcess = ProcessInputs(
|
||||
folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
|
||||
mixedURL: result.mixedURL, selfSpans: result.selfSpans)
|
||||
mixedURL: result.mixedURL, timeline: timeline)
|
||||
}
|
||||
let autoSend = settings.autoSendOnStop
|
||||
currentFolder = nil
|
||||
@@ -250,10 +321,10 @@ final class SessionController: ObservableObject {
|
||||
|
||||
// MARK: - Backend transcription
|
||||
|
||||
/// Send the last finished session to the backend → `speakers.json`. Uses the
|
||||
/// mic-VAD self spans as the timeline for now; visual segments (Phase 3–4) get
|
||||
/// merged in once the adapters land. Safe to call manually ("Send to backend")
|
||||
/// or automatically on stop.
|
||||
/// Send the last finished session to the backend → `speakers.json`. The
|
||||
/// timeline is the session's visual segments (with mic-VAD self spans merged)
|
||||
/// when visual capture ran, or the self spans alone otherwise. Safe to call
|
||||
/// manually ("Send to backend") or automatically on stop.
|
||||
func processLastSession() {
|
||||
guard let inputs = lastProcess else { return }
|
||||
if case .processing = transcriptStatus { return }
|
||||
@@ -266,8 +337,7 @@ final class SessionController: ObservableObject {
|
||||
baseURL: settings.backendBaseURL,
|
||||
skipTLS: settings.skipTLSVerification,
|
||||
voiceprints: voiceprints)
|
||||
let timeline = TranscriptPipeline.timeline(
|
||||
fromSelfSpans: inputs.selfSpans, selfName: settings.selfName)
|
||||
let timeline = inputs.timeline
|
||||
do {
|
||||
let speakers = try await pipeline.process(
|
||||
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
|
||||
@@ -286,6 +356,9 @@ final class SessionController: ObservableObject {
|
||||
|
||||
private func fail(_ message: String) {
|
||||
recorder = nil
|
||||
visualCapture = nil // recorder.start() failed before visual started; nothing running
|
||||
inFlightVisual = nil
|
||||
pendingCapture = nil
|
||||
currentFolder = nil
|
||||
autoStarted = false
|
||||
pendingAutoStop = false
|
||||
@@ -312,11 +385,20 @@ final class SessionController: ObservableObject {
|
||||
if state == .recording, let recorder {
|
||||
state = .finishing
|
||||
stopTimer()
|
||||
finish(await recorder.stop())
|
||||
let folder = currentFolder
|
||||
let result = await recorder.stop()
|
||||
let timeline = await stopVisualAndTimeline(result, folder: folder)
|
||||
finish(result, timeline: timeline)
|
||||
} else if lifecycleGeneration == gen {
|
||||
break // settled: no new transition was spawned
|
||||
}
|
||||
}
|
||||
// A visual start-Task orphaned by a concurrent stop may still hold a live
|
||||
// stream that nothing else will tear down before exit — drain it here.
|
||||
if let vc = inFlightVisual {
|
||||
inFlightVisual = nil
|
||||
await vc.cancel()
|
||||
}
|
||||
}
|
||||
|
||||
// MARK: - Timer
|
||||
|
||||
Reference in New Issue
Block a user