Files
ten31-transcripts/Ten31Transcripts/Visual/VisualObserver.swift
T
Grant Gilliam a3e3406b28 Make diarization chunk length configurable (Auto + presets)
Chunk size was hardcoded at 2.5-min bodies. Add a Settings control:
Auto / Standard 2.5min / Large group 60s / Fine 90s. Shorter chunks keep fewer
simultaneous speakers per window (Sortformer resolves ~4/chunk), useful for large
calls, at some cost to speed and cross-chunk voice matching.

- ChunkMode (new, pure/testable): mode → body seconds; Auto picks 60s when >4
  participants were detected, else 150s; overlap + single-chunk threshold scale
  with the body length.
- AppSettings.chunkMode (+ typed `chunk`); SettingsView picker with explanation.
- TranscriptPipeline.process gains chunkSeconds; derives overlap/threshold from it.
- SessionController resolves the body from the setting + the session's detected
  participant count (visual_timeline participants) for both send + re-process.
- Participant roster now counts EVERY tile OCR'd, not just who spoke
  (TimelineBuilder.observedNames → VisualObserver → VisualCapture), so the Auto
  call-size signal is meaningful even though speaking-detection is sparse.

Tests: ChunkMode resolution, overlap scaling, short-body re-chunking. 69 pass.
2026-06-09 10:15:16 -05:00

181 lines
8.4 KiB
Swift

import Foundation
import ScreenCaptureKit
import CoreMedia
import QuartzCore
import AppKit
/// Window-scoped visual capture: streams the call window's own rendered content
/// at ~`fps`, hands each frame to the app adapter, and **releases it immediately;
/// frames are never written to disk**. Builds the speaker timeline and records
/// `visual_gap`s when the window is minimized (SCK delivers non-live frames).
///
/// Window visibility/focus is NOT required: SCK captures a window even when it's
/// occluded or on another Space; only minimization freezes the backing buffer.
///
/// Threading: `builder`, `gaps` and `gapStart` are only touched on `queue`
/// (the sample-handler queue, or via `queue.sync` from the public accessors).
@available(macOS 13.0, *)
final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
    /// Bundle identifier of the app whose window is captured.
    private let bundleID: String
    /// Exact window to prefer, when the caller already detected one.
    private let windowID: CGWindowID?
    /// Turns a frame into speaker observations.
    private let adapter: any AppAdapter
    /// Clock origin: every timestamp is `CACurrentMediaTime() - t0Host`.
    private let t0Host: Double
    /// Target capture rate in frames per second (always >= 1).
    private let fps: Int
    /// Serial queue that receives frames and guards the timeline state.
    private let queue = DispatchQueue(label: "xyz.ten31.visual")
    private var stream: SCStream?
    private let builder = TimelineBuilder()
    /// Closed gaps recorded so far.
    private var gaps: [VisualTimeline.Gap] = []
    /// Start of the currently open gap, or nil when the window is live.
    private var gapStart: Double?

    /// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
    var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?

    /// - Parameters:
    ///   - bundleID: Bundle identifier of the app that owns the call window.
    ///   - windowID: Preferred window; when nil or no longer present, the largest
    ///     window owned by `bundleID` is captured instead.
    ///   - adapter: Analyzer invoked once per live frame.
    ///   - t0Host: Clock origin subtracted from `CACurrentMediaTime()`.
    ///   - fps: Requested frame rate; values below 1 are raised to 1.
    init(bundleID: String, windowID: CGWindowID? = nil, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
        self.bundleID = bundleID
        self.windowID = windowID
        self.adapter = adapter
        self.t0Host = t0Host
        self.fps = max(1, fps)
    }

    /// Finds the target window and starts streaming its frames to `queue`.
    /// - Throws: An `NSError` (domain "Ten31", code 2) when the app owns no window,
    ///   or whatever ScreenCaptureKit throws while enumerating content, adding the
    ///   output, or starting the capture.
    func start() async throws {
        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
        let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }
        // Prefer the EXACT detected window (e.g. the Meet browser window) by ID; fall
        // back to the largest owned window when no ID was supplied or it's gone.
        guard let idx = Self.pickWindowIndex(candidates.map { ($0.windowID, $0.frame) }, preferredID: windowID),
              candidates.indices.contains(idx) else {
            throw NSError(domain: "Ten31", code: 2,
                          userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
        }
        let window = candidates[idx]
        let filter = SCContentFilter(desktopIndependentWindow: window)
        let config = SCStreamConfiguration()
        config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(fps))
        config.queueDepth = 3
        config.showsCursor = false
        config.pixelFormat = kCVPixelFormatType_32BGRA
        // window.frame is in points; capture at native pixels so OCR can read small
        // initials/names (a half-res Retina capture badly hurts recognition). Use the
        // scale of the display the window is actually on, not always the main screen.
        let scale = Self.backingScale(forWindowFrame: window.frame)
        config.width = max(2, Int(window.frame.width * scale))
        config.height = max(2, Int(window.frame.height * scale))
        let stream = SCStream(filter: filter, configuration: config, delegate: self)
        try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: queue)
        try await stream.startCapture()
        // Only keep the stream once capture has actually started.
        self.stream = stream
    }

    /// Choose which candidate window to capture: the one matching `preferredID` if
    /// present, else the largest by area. Returns the index into `candidates`, or
    /// nil if there are none. Pure/testable: no ScreenCaptureKit types.
    static func pickWindowIndex(_ candidates: [(id: CGWindowID, frame: CGRect)],
                                preferredID: CGWindowID?) -> Int? {
        guard !candidates.isEmpty else { return nil }
        if let preferredID, let i = candidates.firstIndex(where: { $0.id == preferredID }) {
            return i
        }
        return candidates.indices.max(by: {
            candidates[$0].frame.width * candidates[$0].frame.height
                < candidates[$1].frame.width * candidates[$1].frame.height
        })
    }

    /// Backing scale of the display that contains the window's center. SCWindow.frame
    /// is in global display (top-left origin) points; NSScreen is bottom-left, so we
    /// flip the center through the primary screen's height before testing containment.
    /// Falls back to the main screen, then the primary screen, then 2 if no screen
    /// is available at all.
    private static func backingScale(forWindowFrame frame: CGRect) -> CGFloat {
        let screens = NSScreen.screens
        guard let primary = screens.first else { return NSScreen.main?.backingScaleFactor ?? 2 }
        let centerAppKit = CGPoint(x: frame.midX, y: primary.frame.maxY - frame.midY)
        let screen = screens.first { $0.frame.contains(centerAppKit) } ?? NSScreen.main ?? primary
        return screen.backingScaleFactor
    }

    /// Stops the capture, closes any gap still open, finishes the timeline and
    /// returns the accumulated segments and gaps.
    func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
        // Bound stopCapture: an already-errored SCStream can block forever, which
        // would wedge session finalization in `.finishing`. Mirror AudioRecorder.
        if let stream { await Self.stopCaptureWithTimeout(stream, seconds: 3) }
        stream = nil
        return queue.sync {
            if let gs = gapStart {
                gaps.append(.init(start: gs, end: CACurrentMediaTime() - t0Host, reason: "minimized"))
                gapStart = nil
            }
            builder.finish()
            return (builder.segments, gaps)
        }
    }

    /// Merge mic-VAD self spans into the visual timeline (call before `stop`'s read,
    /// or fold in afterwards in the packager).
    func addSelfSpans(_ spans: [VADSpan], selfName: String) {
        queue.sync { builder.mergeSelfSpans(spans, selfName: selfName) }
    }

    /// Every distinct participant name OCR'd over the session (read on the builder's
    /// queue; safe to call after `stop`).
    func participantNames() -> [String] { queue.sync { builder.observedNames } }

    // MARK: - SCStreamOutput (on `queue`)

    /// Handles one delivered frame: ignores idle frames, opens/closes gaps around
    /// frozen frames, and feeds live frames to the adapter and the timeline builder.
    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
                of type: SCStreamOutputType) {
        guard type == .screen, CMSampleBufferDataIsReady(sampleBuffer) else { return }
        let now = CACurrentMediaTime() - t0Host
        switch frameKind(sampleBuffer) {
        case .idle:
            // Window is live but static (no pixel change): no new info, not a gap.
            return
        case .gap:
            // Minimized/blanked: the backing buffer is frozen. Open a gap once and
            // close any open speaker segments so none is carried across it.
            if gapStart == nil {
                gapStart = now
                builder.closeOpenSegments(at: now)
            }
            return
        case .live:
            if let gs = gapStart {
                gaps.append(.init(start: gs, end: now, reason: "minimized"))
                gapStart = nil
            }
            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
            let observations = adapter.analyze(frame: pixelBuffer, at: now) // frame released after this scope
            builder.ingest(observations, at: now)
            onObservations?(observations, now)
        }
    }

    // MARK: - SCStreamDelegate

    func stream(_ stream: SCStream, didStopWithError error: Error) {
        // NOTE(review): the error is dropped here and no gap is opened, so a stream
        // that dies mid-session is indistinguishable from a static window in the
        // output. Confirm this is intended; `stop()` is bounded by a timeout either way.
    }

    // MARK: - Helpers

    /// Proceed as soon as `stopCapture()` returns OR the timeout fires, so a wedged
    /// stream can't block forever.
    ///
    /// This deliberately does NOT use a task group: a task group only returns after
    /// *all* of its children have finished, and cancellation is cooperative, so a
    /// `stopCapture()` that never completes would keep the group (and the caller)
    /// waiting no matter what the timeout child did. Instead two unstructured tasks
    /// race to resume a single continuation; the loser's resume is a no-op.
    ///
    /// - Parameters:
    ///   - stream: The stream to stop. It stays retained by the stop task until
    ///     `stopCapture()` eventually returns.
    ///   - seconds: Upper bound on the wait; negative values are treated as 0.
    private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async {
        let timeoutNanos = UInt64(max(0, seconds) * 1_000_000_000)
        await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
            let gate = OneShot(continuation)
            // Detached so neither task inherits the caller's actor or cancellation.
            Task.detached {
                try? await stream.stopCapture()
                gate.fire()
            }
            Task.detached {
                try? await Task.sleep(nanoseconds: timeoutNanos)
                gate.fire()
            }
        }
    }

    /// Resumes a continuation exactly once, no matter how many callers race to fire it.
    private final class OneShot: @unchecked Sendable {
        private let lock = NSLock()
        private var continuation: CheckedContinuation<Void, Never>?

        init(_ continuation: CheckedContinuation<Void, Never>) {
            self.continuation = continuation
        }

        /// Resumes the continuation on the first call; later calls do nothing.
        func fire() {
            lock.lock()
            let pending = continuation
            continuation = nil
            lock.unlock()
            pending?.resume()
        }
    }

    private enum FrameKind { case live, idle, gap }

    /// SCK delivers `.complete` only when content changes, `.idle` for a static
    /// (but visible) window, and `.blank`/`.suspended`/`.stopped` when frozen.
    /// `.started` marks the first frame after the stream starts; it is ignored
    /// (treated like `.idle`) so that stream start-up is never recorded as a
    /// "minimized" gap. A buffer with no readable status is treated as live.
    private func frameKind(_ sampleBuffer: CMSampleBuffer) -> FrameKind {
        guard let attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, createIfNecessary: false)
                as? [[SCStreamFrameInfo: Any]],
              let raw = attachments.first?[.status] as? Int,
              let status = SCFrameStatus(rawValue: raw) else { return .live }
        switch status {
        case .complete: return .live
        case .idle, .started: return .idle
        case .blank, .suspended, .stopped: return .gap
        @unknown default: return .gap // unknown future statuses stay conservative
        }
    }
}