Wire visual capture into the recording lifecycle (failure-isolated)
Visual capture now runs alongside audio: on call start the session picks the
app's adapter, captures the call window on the SAME monotonic clock as the audio
(AudioRecorder.sharedT0Host), and on stop writes visual_timeline.json and hands
the backend the visual segments with mic-VAD self-spans merged. Any visual
failure (no adapter, no window, Screen Recording denied) leaves the session
recording audio-only — the proven path is never blocked or broken.
- CallDetector now emits DetectedCall{app, bundleID, windowID}: windowID is the
exact CGWindowID of the matched Meet browser window; for native apps it is nil,
so capture falls back to the largest window.
- VisualCapture wraps VisualObserver + AdapterRegistry, writes visual_timeline.json.
- AudioRecorder.sharedT0Host() exposes the shared t0 for frame alignment.
Hardened per a 3-lens adversarial review (concurrency / failure-isolation /
data-flow), all 6 confirmed findings fixed:
- P0 (critical): startVisual could adopt a stale capture into a DIFFERENT session
(cross-session SCStream leak + visual_timeline.json written to the wrong
folder). Now gated on session identity — generation + recorder ===, still
.recording — with fail-closed adoption; otherwise the stream is cancelled.
- P1: observer captured the browser's largest window, not the detected Meet
window. Now targets the exact CGWindowID (pickWindowIndex, unit-tested),
largest-area only as fallback.
- P2: a startVisual orphaned by a concurrent stop could leak a stream on quit.
inFlightVisual is registered before the await and drained in prepareForTermination.
- P3: trailing visual gap/segment ends could exceed duration_sec. Clamped in
VisualCapture (clampSegments/clampGaps, unit-tested).
- P4: capture pixel size used NSScreen.main scale; now uses the scale of the
display actually hosting the window (OCR clarity on secondary displays).
- VisualObserver.stop() bounds stopCapture() with a 3s timeout (mirrors audio) so
a wedged stream can't hang finalization.
All 25 XCTest cases pass. Live validation on real calls is still pending.
This commit is contained in:
@@ -0,0 +1,83 @@
|
||||
import Foundation
|
||||
import CoreGraphics
|
||||
|
||||
/// Owns the visual side of one recording session: picks the app's adapter, runs a
/// `VisualObserver` over the call window, and on stop writes `visual_timeline.json`
/// and returns the speaker segments for the backend hand-off.
///
/// Strictly best-effort: if there's no adapter for the app, or the window can't be
/// captured, the session simply records audio-only — visuals never block or break
/// the proven audio path. `init?` returns nil when the app has no visual adapter.
@available(macOS 13.0, *)
final class VisualCapture {
    /// Name of the timeline file that `finish` writes into the session folder.
    private static let timelineFileName = "visual_timeline.json"

    let app: CallDetector.DetectedApp
    private let adapter: any AppAdapter
    private let observer: VisualObserver

    /// Builds the capture for `app`, or returns nil when `AdapterRegistry` has no
    /// adapter for it.
    ///
    /// - Parameters:
    ///   - app: The detected call app; used to look up the adapter.
    ///   - bundleID: Forwarded to the `VisualObserver`.
    ///   - windowID: Forwarded to the `VisualObserver`; may be nil.
    ///   - t0Host: Forwarded to the `VisualObserver` as its time origin.
    init?(app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?, t0Host: Double) {
        guard let adapter = AdapterRegistry.adapter(for: app) else { return nil }
        self.app = app
        self.adapter = adapter
        self.observer = VisualObserver(bundleID: bundleID, windowID: windowID, adapter: adapter,
                                       t0Host: t0Host, fps: adapter.preferredFPS)
    }

    /// Start window capture. Throws if the window isn't capturable (no window yet,
    /// Screen Recording denied) — the caller catches and falls back to audio-only.
    func start() async throws {
        try await observer.start()
    }

    /// Stop and discard capture without writing anything (used when the session
    /// ends before capture was fully adopted).
    func cancel() async {
        _ = await observer.stop()
    }

    /// Shared clamp rule for segments and gaps: cap `end` at `duration` and report
    /// nil when the clamped interval would be empty (clamped end <= `start`).
    private static func clampedEnd(start: Double, end: Double, to duration: Double) -> Double? {
        let clamped = min(end, duration)
        return clamped > start ? clamped : nil
    }

    /// Clamp segment ends to the audio duration; drop any that become empty. Keeps
    /// `visual_timeline.json` internally consistent and never sends the backend a
    /// segment longer than the audio. (`duration <= 0` → passthrough.)
    static func clampSegments(_ segs: [VisualTimeline.Segment], to duration: Double) -> [VisualTimeline.Segment] {
        guard duration > 0 else { return segs }
        return segs.compactMap { s in
            guard let end = clampedEnd(start: s.start, end: s.end, to: duration) else { return nil }
            return VisualTimeline.Segment(start: s.start, end: end, name: s.name,
                                          confidence: s.confidence, source: s.source)
        }
    }

    /// Clamp gap ends to the audio duration; drop any that become empty. Same rule
    /// as `clampSegments`. (`duration <= 0` → passthrough.)
    static func clampGaps(_ gaps: [VisualTimeline.Gap], to duration: Double) -> [VisualTimeline.Gap] {
        guard duration > 0 else { return gaps }
        return gaps.compactMap { g in
            guard let end = clampedEnd(start: g.start, end: g.end, to: duration) else { return nil }
            return VisualTimeline.Gap(start: g.start, end: end, reason: g.reason)
        }
    }

    /// Stop capture, fold in the mic-VAD self spans, write `visual_timeline.json`
    /// into the session folder, and return the merged segments for `label-merge`.
    ///
    /// - Parameters:
    ///   - selfSpans: Mic-VAD spans handed to the observer before it is stopped.
    ///   - selfName: Name attributed to the self spans; the matching participant is
    ///     flagged `isSelf`.
    ///   - sessionId: Recorded in the timeline.
    ///   - t0Unix: Recorded in the timeline.
    ///   - durationSec: Audio duration; segment and gap ends are clamped to it.
    ///   - folder: Session folder that receives `visual_timeline.json`.
    /// - Returns: The clamped segments. They are returned even when writing the
    ///   timeline file fails, so the hand-off is never blocked by a disk error.
    func finish(selfSpans: [VADSpan], selfName: String,
                sessionId: String, t0Unix: Double, durationSec: Double,
                folder: URL) async -> [VisualTimeline.Segment] {
        observer.addSelfSpans(selfSpans, selfName: selfName)
        let (rawSegments, rawGaps) = await observer.stop()

        // The observer stops slightly after audio fixes `durationSec`, so a trailing
        // gap/segment can run past it. Clamp ends so the JSON is internally consistent
        // (and we never hand the backend a segment longer than the audio).
        let segments = Self.clampSegments(rawSegments, to: durationSec)
        let gaps = Self.clampGaps(rawGaps, to: durationSec)

        // Participants are the distinct names that survived clamping, sorted by name.
        let names = Set(segments.map(\.name))
        let participants = names.sorted().map {
            VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil)
        }
        let timeline = VisualTimeline(
            sessionId: sessionId, app: app.label, adapterVersion: adapter.adapterVersion,
            t0Unix: t0Unix, durationSec: durationSec, fpsSampled: adapter.preferredFPS,
            selfName: selfName, participants: participants, segments: segments, visualGaps: gaps)

        // Still best-effort: a failed write must not break the session, but it should
        // leave a trace instead of vanishing silently.
        do {
            try timeline.write(to: folder.appendingPathComponent(Self.timelineFileName))
        } catch {
            NSLog("%@", "VisualCapture: failed to write \(Self.timelineFileName): \(error)")
        }
        return segments
    }
}
|
||||
Reference in New Issue
Block a user