import Foundation
import CoreGraphics
import CoreVideo

// MARK: - SpeakerObservation

/// A single per-frame reading produced by an app adapter for one participant tile.
///
/// It records who the tile belongs to, whether that tile's active-speaker cue is
/// currently showing, and where the tile is.
struct SpeakerObservation {
    /// Participant label: may be a full name, initials (Signal), or `""` when unknown.
    let name: String
    /// `true` when the tile's active-speaker cue is showing.
    let speaking: Bool
    /// Location of the tile.
    // NOTE(review): coordinate space/units are not specified here — confirm with adapters.
    let bbox: CGRect
    /// Confidence in this observation, in the range 0…1.
    let confidence: Double
    /// Timestamp in seconds, relative to the session t0.
    let t: TimeInterval
}

// MARK: - Equatable

/// Member-wise equality, synthesized by the compiler.
extension SpeakerObservation: Equatable {}

// MARK: - AppAdapter

/// Screen-reading strategy for one conferencing app.
///
/// Every supported app gets its own implementation, which knows that app's tile
/// layout, where names are placed, and what its active-speaker cue looks like.
/// Implementations must be testable offline against still-image fixtures.
protocol AppAdapter {
    /// Bundle identifiers associated with this adapter.
    static var bundleIDs: [String] { get }

    /// Version string of this adapter.
    var adapterVersion: String { get }

    /// Frame rate this adapter prefers; a default is supplied by the protocol extension.
    var preferredFPS: Int { get }

    /// Analyzes one frame and reports the speakers visible in it, along with whether
    /// each one is speaking.
    ///
    /// The frame must be processed in memory and never persisted.
    ///
    /// - Parameters:
    ///   - frame: The frame to inspect.
    ///   - t: Time of the frame.
    ///     NOTE(review): presumably seconds relative to the session t0, like
    ///     `SpeakerObservation.t` — confirm against callers.
    /// - Returns: One observation per speaker visible in the frame.
    func analyze(frame: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation]

    /// Optional hook: participant names read from the app's Accessibility tree.
    ///
    /// Electron apps like Signal expose these; when available they are preferred
    /// over OCR.
    ///
    /// - Returns: The participant names, or `nil` when not available.
    func namesFromAccessibility() -> [String]?
}

// MARK: - AppAdapter defaults

extension AppAdapter {
    /// Default preferred frame rate: 3.
    var preferredFPS: Int {
        return 3
    }

    /// Default: no Accessibility-derived names.
    func namesFromAccessibility() -> [String]? {
        return nil
    }
}