// ten31-transcripts/Ten31Transcripts/Visual/SpeakerObservation.swift

import Foundation
import CoreGraphics
import CoreVideo

/// One per-frame observation from an app adapter: a participant tile, whether its
/// active-speaker cue is showing, and where it is. `name` may be a full name,
/// initials (Signal), or "" if unknown. `t` is seconds relative to the session t0.
///
/// Declares `Equatable` without a hand-written `==`, so equality is the
/// compiler-synthesized member-wise comparison over all stored properties.
struct SpeakerObservation: Equatable {
    /// Participant label: a full name, initials (Signal), or "" if unknown.
    let name: String
    /// Whether the tile's active-speaker cue is showing in this frame.
    let speaking: Bool
    /// Where the participant tile is.
    /// NOTE(review): the coordinate space (pixels vs. normalized, origin corner)
    /// is not specified in this file — confirm against the adapters.
    let bbox: CGRect
    /// Confidence score in the range 0…1.
    /// NOTE(review): which aspect it scores (name, speaking cue, or both) is not
    /// stated here — confirm against the adapters.
    let confidence: Double   // 0…1
    /// Time of the observation, in seconds relative to the session t0.
    let t: TimeInterval
}

/// Per-app screen-reading strategy. Each conferencing app gets one implementation
/// that knows that app's tile layout, name placement, and active-speaker cue.
/// Adapters must be testable offline against still-image fixtures.
protocol AppAdapter {
    /// Bundle identifiers associated with this adapter type.
    /// NOTE(review): presumably the app(s) this adapter knows how to read —
    /// confirm against the code that selects an adapter.
    static var bundleIDs: [String] { get }
    /// Version string for this adapter.
    /// NOTE(review): its format and consumers are not visible in this file.
    var adapterVersion: String { get }
    /// Frame rate this adapter prefers. The protocol extension in this file
    /// supplies a default of 3 for conformers that do not provide their own.
    /// NOTE(review): presumably frames per second delivered to
    /// `analyze(frame:at:)` — confirm with the capture pipeline.
    var preferredFPS: Int { get }

    /// Analyze one frame; return the speakers visible and whether each is speaking.
    /// Must process in-memory and never persist the frame.
    ///
    /// - Parameters:
    ///   - frame: The pixel buffer holding the frame to analyze.
    ///   - t: Timestamp for the frame. NOTE(review): presumably seconds relative
    ///     to the session t0, matching `SpeakerObservation.t` — confirm with callers.
    /// - Returns: One `SpeakerObservation` per speaker visible in the frame.
    func analyze(frame: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation]

    /// Optional: participant names from the app's Accessibility tree (Electron
    /// apps like Signal expose these), preferred over OCR when available.
    ///
    /// - Returns: The participant names, or `nil`. The protocol extension in this
    ///   file provides a default implementation that returns `nil`, so conformers
    ///   only implement this when they can read names from Accessibility.
    func namesFromAccessibility() -> [String]?
}

// MARK: - Default implementations

extension AppAdapter {
    /// Default preferred frame rate (3) for adapters that do not declare their own.
    var preferredFPS: Int {
        return 3
    }

    /// Default for adapters that do not read names from the Accessibility tree:
    /// reports `nil`.
    func namesFromAccessibility() -> [String]? {
        return nil
    }
}