// ten31-transcripts/Ten31Transcripts/Visual/GridCallAnalyzer.swift

import Foundation
import CoreGraphics
import CoreVideo
import CoreImage

/// Shared engine for tile-grid conferencing UIs: OCR the name/initials on each
/// tile, then mark the active speaker(s) by the speaking-highlight around their
/// tile. Handles BOTH highlight kinds:
/// - **white border** (Signal: 3px #ffffff ring — detected via thin near-white edges)
/// - **coloured border** (Zoom/Teams — detected via saturated edges)
///
/// The white name text is excluded so it can't be mistaken for the white border.
/// Geometry (`Config`) is a first pass; tile expansion calibrates per app against
/// real screenshot fixtures. Detection *logic* is validated on synthetic frames.
struct GridCallAnalyzer {
    /// Where the name label sits relative to its participant tile — drives how the
    /// tile rect is estimated from the OCR'd name box.
    enum NameAnchor {
        case bottomCenter   // Signal: centered footer; tile extends UP, centered on the name
        case bottomLeft     // Meet/Zoom: name in the bottom-left corner; tile extends UP and RIGHT
        case center         // name centered inside the tile
    }

    /// Tunable geometry and detection thresholds.
    struct Config {
        var tileExpandX = 2.4         // tile width  ≈ name width  × this
        var tileExpandY = 4.8         // tile height ≈ name height × this
        var nameAnchor: NameAnchor = .bottomCenter
        var detectColoredBorder = true
        var detectWhiteBorder = true
        // Coloured-border sensitivity. Default 0.5 suits vivid rings (Zoom green/
        // yellow); lower it for muted accent rings (Teams violet ≈ 0.41, Meet's
        // light-blue glow ≈ 0.44). `colorHueRange` (degrees) optionally pins the
        // ring's hue so a low threshold doesn't catch warm video — set per platform
        // once calibrated against real screenshots.
        var colorSaturation: Double = 0.5
        var colorMinBrightness: Double = 60
        var colorHueRange: ClosedRange<Double>? = nil
        var minTextConfidence: Float = 0.3
        // NOTE(review): not read anywhere in this struct — `isLikelyName` applies its
        // own fixed 2...30 character bound. Confirm whether this knob should feed it.
        var maxNameLength = 40
        var minHighlightPoints = 6
        // NOTE(review): not read anywhere in this struct — confirm whether it is
        // still needed now that rings are attributed by containment.
        var highlightShareOfMax = 0.35
        var minRingSpan: Double = 60   // a speaking border spans a sizable box, not a speck
    }

    var config = Config()
    var recognizer = TextRecognizer()

    /// Convenience overload: renders `pixelBuffer` to a `CGImage` and analyzes it.
    /// Returns an empty array when the buffer cannot be rendered.
    func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
        guard let cg = Self.cgImage(from: pixelBuffer) else { return [] }
        return analyze(cgImage: cg, at: t)
    }

    /// Analyzes one frame and returns one observation per accepted name label.
    ///
    /// - Parameters:
    ///   - cgImage: The frame to analyze.
    ///   - t: Timestamp copied verbatim into every returned observation.
    /// - Returns: One `SpeakerObservation` per OCR result that passes the confidence
    ///   and `isLikelyName` filters. `speaking` is true when a highlight ring encloses
    ///   the centre of the name's text box; `bbox` is then that ring's bounding box,
    ///   otherwise the tile rect estimated from the name. Empty when no name passes
    ///   the filters or the frame sampler cannot be created.
    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
        let w = cgImage.width, h = cgImage.height

        struct Tile { let name: String; let textRect: CGRect; let tile: CGRect; let conf: Double }
        // Filter and build in a single pass so each text is cleaned only once.
        let tiles = recognizer.recognize(in: cgImage).compactMap { r -> Tile? in
            guard r.confidence >= config.minTextConfidence else { return nil }
            let name = cleaned(r.text)
            guard Self.isLikelyName(name) else { return nil }
            return Tile(name: name,
                        textRect: pixelRect(r.boundingBox, w, h),
                        tile: tileRect(r.boundingBox, w, h),
                        conf: Double(r.confidence))
        }
        // The sampler is only created when there is at least one name to attribute to.
        guard !tiles.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }

        // Highlight pixels: coloured (saturated) and/or white (thin near-white).
        var highlight: [CGPoint] = []
        if config.detectColoredBorder {
            highlight += sampler.saturatedPoints(threshold: config.colorSaturation,
                                                 minBrightness: config.colorMinBrightness,
                                                 hueRange: config.colorHueRange)
        }
        if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }

        // Drop points inside any name-text region (padded 15% horizontally and 35%
        // vertically) so the name's own text isn't mistaken for the border highlight.
        let exclusions = tiles.map {
            $0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35)
        }
        let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } }

        // Cluster the highlight pixels into connected RINGS (one per bordered tile),
        // then attribute each ring to the OCR name sitting inside it. The ring *is*
        // the tile, so this is independent of tile-size estimation (which fails on big
        // real tiles) and handles multiple simultaneous borders (crosstalk) naturally.
        let rings = Self.connectedComponents(points, maxGap: 18)
        var speakingBBox: [Int: CGRect] = [:]      // tile index -> the ring bbox marking it speaking
        for ring in rings where ring.count >= config.minHighlightPoints {
            let bb = Self.boundingBox(ring)
            guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue }   // a ring, not a blob
            let area = bb.width * bb.height
            for (i, tile) in tiles.enumerated() where bb.contains(CGPoint(x: tile.textRect.midX, y: tile.textRect.midY)) {
                // When several rings enclose the same name (e.g. an outer frame plus the
                // tile's own border) keep the tightest one, so the reported bbox does not
                // depend on the order in which rings happen to be visited.
                if let current = speakingBBox[i], current.width * current.height <= area { continue }
                speakingBBox[i] = bb
            }
        }

        return tiles.enumerated().map { idx, tile in
            SpeakerObservation(name: tile.name, speaking: speakingBBox[idx] != nil,
                               bbox: speakingBBox[idx] ?? tile.tile, confidence: tile.conf, t: t)
        }
    }

    /// Connected components of grid-sampled points: two points join if within `maxGap`
    /// on both axes. Spatial-hashed so it stays cheap on dense frames.
    ///
    /// - Parameters:
    ///   - points: Points to cluster.
    ///   - maxGap: Largest per-axis distance at which two points are linked directly.
    /// - Returns: One array per component. Components are ordered by the position of
    ///   their first point in `points`, and points inside a component keep their
    ///   input order, so the result is deterministic for a given input.
    static func connectedComponents(_ points: [CGPoint], maxGap: Double) -> [[CGPoint]] {
        guard !points.isEmpty else { return [] }

        // Union-find over point indices (path halving in `find`).
        var parent = Array(0..<points.count)
        func find(_ x: Int) -> Int { var r = x; while parent[r] != r { parent[r] = parent[parent[r]]; r = parent[r] }; return r }
        func union(_ a: Int, _ b: Int) { let ra = find(a), rb = find(b); if ra != rb { parent[ra] = rb } }

        // Bucket points into square cells at least `maxGap` wide: any two points
        // within `maxGap` on both axes then fall in the same or an adjacent cell.
        struct Cell: Hashable { let x: Int; let y: Int }
        let cellSize = max(1.0, maxGap)
        let cells = points.map { p in
            Cell(x: Int((Double(p.x) / cellSize).rounded(.down)),
                 y: Int((Double(p.y) / cellSize).rounded(.down)))
        }
        var buckets: [Cell: [Int]] = [:]
        for (i, c) in cells.enumerated() { buckets[c, default: []].append(i) }

        for (i, p) in points.enumerated() {
            let c = cells[i]
            for dx in -1...1 { for dy in -1...1 {
                guard let bucket = buckets[Cell(x: c.x + dx, y: c.y + dy)] else { continue }
                // `j > i` visits each unordered pair exactly once.
                for j in bucket where j > i {
                    if abs(points[j].x - p.x) <= maxGap, abs(points[j].y - p.y) <= maxGap { union(i, j) }
                }
            } }
        }

        // Collect components in first-seen order rather than dictionary order.
        var slotForRoot: [Int: Int] = [:]
        var components: [[CGPoint]] = []
        for i in points.indices {
            let root = find(i)
            if let slot = slotForRoot[root] {
                components[slot].append(points[i])
            } else {
                slotForRoot[root] = components.count
                components.append([points[i]])
            }
        }
        return components
    }

    /// Smallest axis-aligned rect covering `pts`. Returns `CGRect.null` for an empty
    /// input (its width and height read as 0).
    static func boundingBox(_ pts: [CGPoint]) -> CGRect {
        guard let first = pts.first else { return .null }
        var minX = first.x, minY = first.y, maxX = first.x, maxY = first.y
        for p in pts.dropFirst() {
            minX = min(minX, p.x); minY = min(minY, p.y)
            maxX = max(maxX, p.x); maxY = max(maxY, p.y)
        }
        return CGRect(x: minX, y: minY, width: maxX - minX, height: maxY - minY)
    }

    /// Vision normalized bbox (bottom-left origin) → pixel rect (top-left origin).
    private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
        let W = Double(w), H = Double(h)
        return CGRect(x: box.minX * W, y: (1 - box.maxY) * H, width: box.width * W, height: box.height * H)
    }

    /// Estimate the participant tile from the name label, per the app's `nameAnchor`:
    /// - `.bottomCenter` (Signal): tile extends UP from a centered footer.
    /// - `.bottomLeft` (Meet/Zoom): name hugs the tile's bottom-left corner; the
    ///   tile extends UP and to the RIGHT of it.
    /// - `.center`: tile centered on the name.
    ///
    /// The estimate is clipped to the image bounds before it is returned.
    private func tileRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
        let W = Double(w), H = Double(h)
        let name = pixelRect(box, w, h)
        let nw = name.width * config.tileExpandX
        let nh = name.height * config.tileExpandY
        let rect: CGRect
        switch config.nameAnchor {
        case .bottomCenter:
            let bottom = name.maxY + name.height * 0.3   // small margin below the label
            rect = CGRect(x: name.midX - nw / 2, y: bottom - nh, width: nw, height: nh)
        case .bottomLeft:
            let bottom = name.maxY + name.height * 0.3
            let left = name.minX - name.height * 0.4   // small left padding ≈ the corner gutter
            rect = CGRect(x: left, y: bottom - nh, width: nw, height: nh)
        case .center:
            rect = CGRect(x: name.midX - nw / 2, y: name.midY - nh / 2, width: nw, height: nh)
        }
        return rect.intersection(CGRect(x: 0, y: 0, width: W, height: H))
    }

    /// Strips leading/trailing whitespace and newlines from an OCR string.
    private func cleaned(_ s: String) -> String {
        s.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    /// Characters whose presence marks a string as UI chrome rather than a name
    /// (URLs, emails, meeting codes, times, button glyphs).
    private static let chromeCharacters = CharacterSet(charactersIn: "@:/\\|+*=<>#0123456789")

    /// Characters permitted inside a name word: letters plus `'`, `.` and `-`.
    private static let nameCharacters = CharacterSet.letters.union(CharacterSet(charactersIn: "'.-"))

    /// True if `s` looks like a participant name label rather than UI chrome. Call
    /// UIs are full of text (meeting URLs, the clock, "Add others", lobby dialogs)
    /// that OCR would otherwise treat as speakers. Participant labels are short,
    /// Title-Cased, 1–3 alphabetic words with no digits/URL/email punctuation.
    /// Derived from real Meet/Zoom captures; errs toward dropping (a missed name
    /// just means no visual hint — the backend still diarizes from audio).
    ///
    /// Concretely: 2...30 characters, none from the chrome set, 1...3 space-separated
    /// words, each starting with an uppercase character and made only of letters,
    /// `'`, `.` and `-`. Words in scripts without letter case never satisfy the
    /// uppercase test and are therefore rejected.
    static func isLikelyName(_ s: String) -> Bool {
        guard s.count >= 2, s.count <= 30 else { return false }
        // Reject URLs, emails, meeting codes, times, button glyphs.
        if s.rangeOfCharacter(from: chromeCharacters) != nil {
            return false
        }
        let words = s.split(separator: " ")
        guard (1...3).contains(words.count) else { return false }
        for w in words {
            guard let first = w.first, first.isUppercase else { return false }   // Title Case
            if w.unicodeScalars.contains(where: { !nameCharacters.contains($0) }) { return false }
        }
        return true
    }

    private static let ciContext = CIContext()

    /// Renders a pixel buffer to a `CGImage` using the shared Core Image context.
    static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
        let ci = CIImage(cvPixelBuffer: pixelBuffer)
        return ciContext.createCGImage(ci, from: ci.extent)   // reuse; allocating per frame is costly
    }
}