880b56e426
Visual capture now runs alongside audio: on call start the session picks the
app's adapter, captures the call window on the SAME monotonic clock as the audio
(AudioRecorder.sharedT0Host), and on stop writes visual_timeline.json and hands
the backend the visual segments with mic-VAD self-spans merged. Any visual
failure (no adapter, no window, Screen Recording denied) leaves the session
recording audio-only — the proven path is never blocked or broken.
- CallDetector now emits DetectedCall{app, bundleID, windowID}: the exact
CGWindowID of the matched Meet browser window (native apps → nil → largest).
- VisualCapture wraps VisualObserver + AdapterRegistry, writes visual_timeline.json.
- AudioRecorder.sharedT0Host() exposes the shared t0 for frame alignment.
Hardened per a 3-lens adversarial review (concurrency / failure-isolation /
data-flow), all 6 confirmed findings fixed:
- P0 (critical): startVisual could adopt a stale capture into a DIFFERENT session
(cross-session SCStream leak + visual_timeline.json written to the wrong
folder). Now gated on session identity — generation + recorder ===, still
.recording — with fail-closed adoption; otherwise the stream is cancelled.
- P1: observer captured the browser's largest window, not the detected Meet
window. Now targets the exact CGWindowID (pickWindowIndex, unit-tested),
largest-area only as fallback.
- P2: a startVisual orphaned by a concurrent stop could leak a stream on quit.
inFlightVisual is registered before the await and drained in prepareForTermination.
- P3: trailing visual gap/segment ends could exceed duration_sec. Clamped in
VisualCapture (clampSegments/clampGaps, unit-tested).
- P4: capture pixel size used NSScreen.main scale; now uses the scale of the
display actually hosting the window (OCR clarity on secondary displays).
- VisualObserver.stop() bounds stopCapture() with a 3s timeout (mirrors audio) so
a wedged stream can't hang finalization.
25/25 XCTest pass. Live validation on real calls still pending.
177 lines
8.2 KiB
Swift
177 lines
8.2 KiB
Swift
import Foundation
|
|
import ScreenCaptureKit
|
|
import CoreMedia
|
|
import QuartzCore
|
|
import AppKit
|
|
|
|
/// Window-scoped visual capture: streams the call window's own rendered content
/// at ~`fps`, hands each frame to the app adapter, and **releases it immediately
/// — frames are never written to disk**. Builds the speaker timeline and records
/// `visual_gap`s when the window is minimized (SCK delivers non-live frames).
///
/// Window visibility/focus is NOT required — SCK captures a window even when it's
/// occluded or on another Space; only minimization freezes the backing buffer.
///
/// Threading: frames arrive on the private serial `queue` (it is the stream's
/// sample-handler queue). `builder`, `gaps`, `gapStart` and `isFinished` are only
/// touched on that queue; `stop()` and `addSelfSpans` hop onto it with `queue.sync`.
@available(macOS 13.0, *)
final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
    private let bundleID: String
    private let windowID: CGWindowID?
    private let adapter: any AppAdapter
    private let t0Host: Double
    private let fps: Int
    private let queue = DispatchQueue(label: "xyz.ten31.visual")

    private var stream: SCStream?

    // Queue-confined state (see the threading note on the class).
    private let builder = TimelineBuilder()
    private var gaps: [VisualTimeline.Gap] = []
    private var gapStart: Double?
    /// Set once `stop()` has finalised the timeline. A stream that missed the stop
    /// timeout may still deliver frames; they must not touch the finished timeline.
    private var isFinished = false

    /// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
    /// Invoked on the capture queue, once per analyzed (live) frame.
    var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?

    /// - Parameters:
    ///   - bundleID: Bundle identifier of the app whose window is captured.
    ///   - windowID: Exact window to capture; `nil` (or a window that no longer
    ///     exists) falls back to the app's largest window.
    ///   - adapter: Analyzer that turns each frame into speaker observations.
    ///   - t0Host: Origin on the `CACurrentMediaTime()` clock; every timestamp
    ///     produced here is `CACurrentMediaTime() - t0Host`.
    ///   - fps: Target frame rate; values below 1 are raised to 1.
    init(bundleID: String, windowID: CGWindowID? = nil, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
        self.bundleID = bundleID
        self.windowID = windowID
        self.adapter = adapter
        self.t0Host = t0Host
        self.fps = max(1, fps)
    }

    /// Locates the target window and starts streaming it.
    ///
    /// - Throws: `NSError` (domain `"Ten31"`, code 2) when the app owns no
    ///   capturable window; otherwise whatever ScreenCaptureKit throws while
    ///   enumerating content, attaching the output, or starting capture.
    func start() async throws {
        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
        let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }
        // Prefer the EXACT detected window (e.g. the Meet browser window) by ID; fall
        // back to the largest owned window when no ID was supplied or it's gone.
        guard let idx = Self.pickWindowIndex(candidates.map { ($0.windowID, $0.frame) }, preferredID: windowID),
              candidates.indices.contains(idx) else {
            throw NSError(domain: "Ten31", code: 2,
                          userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
        }
        let window = candidates[idx]

        let filter = SCContentFilter(desktopIndependentWindow: window)
        let config = SCStreamConfiguration()
        config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(fps))
        config.queueDepth = 3
        config.showsCursor = false
        config.pixelFormat = kCVPixelFormatType_32BGRA
        // window.frame is in points; capture at native pixels so OCR can read small
        // initials/names (a half-res Retina capture badly hurts recognition). Use the
        // scale of the display the window is actually on, not always the main screen.
        let scale = Self.backingScale(forWindowFrame: window.frame)
        config.width = max(2, Int(window.frame.width * scale))
        config.height = max(2, Int(window.frame.height * scale))

        // Re-arm frame handling in case this observer was stopped earlier.
        queue.sync { isFinished = false }

        let stream = SCStream(filter: filter, configuration: config, delegate: self)
        try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: queue)
        try await stream.startCapture()
        self.stream = stream
    }

    /// Choose which candidate window to capture: the one matching `preferredID` if
    /// present, else the largest by area. Returns the index into `candidates`, or
    /// nil if there are none. Pure/testable — no ScreenCaptureKit types.
    static func pickWindowIndex(_ candidates: [(id: CGWindowID, frame: CGRect)],
                                preferredID: CGWindowID?) -> Int? {
        guard !candidates.isEmpty else { return nil }
        if let preferredID, let i = candidates.firstIndex(where: { $0.id == preferredID }) {
            return i
        }
        return candidates.indices.max(by: {
            candidates[$0].frame.width * candidates[$0].frame.height
                < candidates[$1].frame.width * candidates[$1].frame.height
        })
    }

    /// Backing scale of the display that contains the window's center. SCWindow.frame
    /// is in global display (top-left origin) points; NSScreen is bottom-left, so we
    /// flip the center through the primary screen's height before testing containment.
    /// Falls back to the main screen, then the primary screen, then 2 if no screen
    /// information is available.
    private static func backingScale(forWindowFrame frame: CGRect) -> CGFloat {
        let screens = NSScreen.screens
        guard let primary = screens.first else { return NSScreen.main?.backingScaleFactor ?? 2 }
        let centerAppKit = CGPoint(x: frame.midX, y: primary.frame.maxY - frame.midY)
        let screen = screens.first { $0.frame.contains(centerAppKit) } ?? NSScreen.main ?? primary
        return screen.backingScaleFactor
    }

    /// Stops the stream (waiting at most 3 s for it) and returns the finished timeline.
    ///
    /// A gap that is still open is closed at the current time, then the builder is
    /// finalised. Frames delivered after this point are ignored.
    ///
    /// - Returns: The speaker segments and the recorded visual gaps.
    func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
        // Bound stopCapture: an already-errored SCStream can block forever, which
        // would wedge session finalization in `.finishing`. Mirror AudioRecorder.
        if let stream { await Self.stopCaptureWithTimeout(stream, seconds: 3) }
        stream = nil
        return queue.sync {
            // From here on, late frames from a stream that missed the timeout are dropped.
            isFinished = true
            if let gs = gapStart {
                gaps.append(.init(start: gs, end: CACurrentMediaTime() - t0Host, reason: "minimized"))
                gapStart = nil
            }
            builder.finish()
            return (builder.segments, gaps)
        }
    }

    /// Merge mic-VAD self spans into the visual timeline (call before `stop`'s read,
    /// or fold in afterwards in the packager).
    func addSelfSpans(_ spans: [VADSpan], selfName: String) {
        queue.sync { builder.mergeSelfSpans(spans, selfName: selfName) }
    }

    // MARK: - SCStreamOutput (on `queue`)

    /// Per-frame entry point: classifies the frame, maintains gap bookkeeping, and
    /// feeds live frames to the adapter and timeline builder.
    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
                of type: SCStreamOutputType) {
        guard type == .screen, !isFinished, CMSampleBufferDataIsReady(sampleBuffer) else { return }
        let now = CACurrentMediaTime() - t0Host

        switch frameKind(sampleBuffer) {
        case .idle:
            // Window is live but static (no pixel change) — no new info, not a gap.
            return
        case .gap:
            // Minimized/blanked: the backing buffer is frozen. Open a gap once and
            // close any open speaker segments so none is carried across it.
            if gapStart == nil {
                gapStart = now
                builder.closeOpenSegments(at: now)
            }
            return
        case .live:
            if let gs = gapStart {
                gaps.append(.init(start: gs, end: now, reason: "minimized"))
                gapStart = nil
            }
            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
            let observations = adapter.analyze(frame: pixelBuffer, at: now) // frame released after this scope
            builder.ingest(observations, at: now)
            onObservations?(observations, now)
        }
    }

    // MARK: - SCStreamDelegate

    /// The system stopped the stream. Visual capture is best-effort, so this only
    /// logs; the timeline gathered so far is still returned by `stop()`.
    func stream(_ stream: SCStream, didStopWithError error: Error) {
        NSLog("VisualObserver: stream for %@ stopped with error: %@", bundleID, error.localizedDescription)
    }

    // MARK: - Helpers

    /// Proceed as soon as `stopCapture` completes OR the timeout fires, so a wedged
    /// stream can't block forever.
    ///
    /// This deliberately does not use a task group: a group cannot return until
    /// every child has finished, and `stopCapture()` does not react to cancellation,
    /// so a group-based "timeout" would still wait on the wedged call. Instead, the
    /// completion handler and a timer race to resume a single continuation.
    private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async {
        /// Resumes the wrapped continuation at most once, from whichever side wins.
        final class OneShot: @unchecked Sendable {
            private let lock = NSLock()
            private var continuation: CheckedContinuation<Void, Never>?

            init(_ continuation: CheckedContinuation<Void, Never>) {
                self.continuation = continuation
            }

            func fire() {
                lock.lock()
                let pending = continuation
                continuation = nil
                lock.unlock()
                pending?.resume()
            }
        }

        await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
            let gate = OneShot(continuation)
            // Errors from stopCapture are intentionally ignored (best-effort teardown).
            stream.stopCapture { _ in gate.fire() }
            DispatchQueue.global(qos: .utility).asyncAfter(deadline: .now() + max(0, seconds)) {
                gate.fire()
            }
        }
    }

    private enum FrameKind { case live, idle, gap }

    /// SCK delivers `.complete` only when content changes, `.idle` for a static
    /// (but visible) window, and `.blank`/`.suspended`/`.stopped` when frozen.
    /// `.started` marks the first frame after the stream starts; it is not a sign of
    /// a frozen window, so it is treated like a live frame (the caller skips it if it
    /// carries no image). A frame with no readable status is assumed live.
    private func frameKind(_ sampleBuffer: CMSampleBuffer) -> FrameKind {
        guard let attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, createIfNecessary: false)
                as? [[SCStreamFrameInfo: Any]],
              let raw = attachments.first?[.status] as? Int,
              let status = SCFrameStatus(rawValue: raw) else { return .live }
        switch status {
        case .complete, .started:
            return .live
        case .idle:
            return .idle
        case .blank, .suspended, .stopped:
            return .gap
        @unknown default:
            // Statuses added by future SDKs keep the previous catch-all behaviour.
            return .gap
        }
    }
}
|