import Foundation
import CoreGraphics
import CoreVideo
import CoreImage

/// Shared engine for tile-grid conferencing UIs: OCR the name/initials on each
/// tile, then mark the active speaker(s) by the speaking-highlight around their
/// tile. Handles BOTH highlight kinds:
///  - **white border** (Signal: 3px #ffffff ring — detected via thin near-white edges)
///  - **coloured border** (Zoom/Teams — detected via saturated edges)
///
/// The white name text is excluded so it can't be mistaken for the white border.
/// Geometry (`Config`) is a first pass; tile expansion calibrates per app against
/// real screenshot fixtures. Detection *logic* is validated on synthetic frames.
struct GridCallAnalyzer {
    /// Where the name label sits relative to its participant tile — drives how the
    /// tile rect is estimated from the OCR'd name box.
    enum NameAnchor {
        case bottomCenter   // Signal: centered footer; tile extends UP, centered on the name
        case bottomLeft     // Meet/Zoom: name in the bottom-left corner; tile extends UP and RIGHT
        case center         // name centered inside the tile
    }

    /// Tunable geometry and detection thresholds.
    struct Config {
        var tileExpandX = 2.4          // tile width  ≈ name width  × this
        var tileExpandY = 4.8          // tile height ≈ name height × this
        var nameAnchor: NameAnchor = .bottomCenter
        var detectColoredBorder = true
        var detectWhiteBorder = true
        // Coloured-border sensitivity. Default 0.5 suits vivid rings (Zoom green/
        // yellow); lower it for muted accent rings (Teams violet ≈ 0.41, Meet's
        // light-blue glow ≈ 0.44). `colorHueRange` (degrees) optionally pins the
        // ring's hue so a low threshold doesn't catch warm video — set per platform
        // once calibrated against real screenshots.
        var colorSaturation: Double = 0.5
        var colorMinBrightness: Double = 60
        // NOTE(review): the generic argument was missing in the source (`ClosedRange?`
        // does not compile). `Double` is assumed because the sibling colour thresholds
        // are `Double` — confirm against `FrameSampler.saturatedPoints(hueRange:)`.
        var colorHueRange: ClosedRange<Double>? = nil
        var minTextConfidence: Float = 0.3
        // Upper bound on the cleaned label length accepted by `analyze`. `isLikelyName`
        // independently rejects anything longer than 30 characters, so only values
        // below 30 tighten the filter.
        var maxNameLength = 40
        var minHighlightPoints = 6
        var highlightShareOfMax = 0.35
    }

    var config = Config()
    var recognizer = TextRecognizer()

    /// Converts `pixelBuffer` to a `CGImage` and analyzes it.
    ///
    /// - Returns: The observations from `analyze(cgImage:at:)`, or an empty array
    ///   when the buffer cannot be converted.
    func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
        guard let cg = Self.cgImage(from: pixelBuffer) else { return [] }
        return analyze(cgImage: cg, at: t)
    }

    /// OCRs participant names in `cgImage`, estimates a tile per name and flags the
    /// tiles whose estimated rect holds enough highlight pixels as speaking.
    ///
    /// - Parameters:
    ///   - cgImage: The frame to analyze.
    ///   - t: Timestamp copied into every returned observation.
    /// - Returns: One observation per accepted name label; empty when no label
    ///   passes the filters or the frame sampler cannot be created.
    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
        struct Tile {
            let name: String
            let textRect: CGRect
            let tile: CGRect
            let conf: Double
        }

        let w = cgImage.width, h = cgImage.height

        // Filter and convert in one pass so each OCR string is trimmed only once.
        let tiles = recognizer.recognize(in: cgImage).compactMap { r -> Tile? in
            guard r.confidence >= config.minTextConfidence else { return nil }
            let name = cleaned(r.text)
            guard name.count <= config.maxNameLength, Self.isLikelyName(name) else { return nil }
            return Tile(name: name,
                        textRect: pixelRect(r.boundingBox, w, h),
                        tile: tileRect(r.boundingBox, w, h),
                        conf: Double(r.confidence))
        }
        guard !tiles.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }

        // Highlight pixels: coloured (saturated) and/or white (thin near-white).
        var highlight: [CGPoint] = []
        if config.detectColoredBorder {
            highlight += sampler.saturatedPoints(threshold: config.colorSaturation,
                                                 minBrightness: config.colorMinBrightness,
                                                 hueRange: config.colorHueRange)
        }
        if config.detectWhiteBorder {
            highlight += sampler.thinWhitePoints()
        }

        // Drop points inside any (slightly padded) name-text region so the white
        // name itself doesn't count.
        let exclusions = tiles.map {
            $0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35)
        }
        let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } }

        // Attribute each highlight pixel to EXACTLY ONE tile — the (no-margin)
        // estimated rect that contains it, nearest centre as tiebreak. Containment
        // (not a radius) keeps a border from bleeding into adjacent tiles even when
        // the tile-size estimate is rough; an under-sized estimate merely drops the
        // far edge rather than misattributing it.
        var counts = [Int](repeating: 0, count: tiles.count)
        for p in points {
            var best = -1
            var bestDistSq = Double.greatestFiniteMagnitude
            for (i, tile) in tiles.enumerated() where tile.tile.contains(p) {
                let dx = p.x - tile.tile.midX, dy = p.y - tile.tile.midY
                let dd = dx * dx + dy * dy
                if dd < bestDistSq { bestDistSq = dd; best = i }
            }
            if best >= 0 { counts[best] += 1 }
        }

        // A tile speaks when it holds at least `minHighlightPoints` pixels AND at
        // least `highlightShareOfMax` of the busiest tile's count. Because
        // `need >= minHighlightPoints` and `counts[idx] <= maxCount`, passing this
        // test already implies the busiest tile reached the minimum.
        let maxCount = counts.max() ?? 0
        let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax))
        return tiles.enumerated().map { idx, tile in
            SpeakerObservation(name: tile.name,
                               speaking: counts[idx] >= need,
                               bbox: tile.tile,
                               confidence: tile.conf,
                               t: t)
        }
    }

    /// Vision normalized bbox (bottom-left origin) → pixel rect (top-left origin).
    private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
        let W = Double(w), H = Double(h)
        return CGRect(x: box.minX * W, y: (1 - box.maxY) * H, width: box.width * W, height: box.height * H)
    }

    /// Estimate the participant tile from the name label, per the app's `nameAnchor`:
    ///  - `.bottomCenter` (Signal): tile extends UP from a centered footer.
    ///  - `.bottomLeft` (Meet/Zoom): name hugs the tile's bottom-left corner; the
    ///    tile extends UP and to the RIGHT of it.
    ///  - `.center`: tile centered on the name.
    ///
    /// The estimate is clipped to the frame bounds before it is returned.
    private func tileRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
        let W = Double(w), H = Double(h)
        let name = pixelRect(box, w, h)
        let nw = name.width * config.tileExpandX
        let nh = name.height * config.tileExpandY
        let rect: CGRect
        switch config.nameAnchor {
        case .bottomCenter:
            let bottom = name.maxY + name.height * 0.3
            rect = CGRect(x: name.midX - nw / 2, y: bottom - nh, width: nw, height: nh)
        case .bottomLeft:
            let bottom = name.maxY + name.height * 0.3
            let left = name.minX - name.height * 0.4   // small left padding ≈ the corner gutter
            rect = CGRect(x: left, y: bottom - nh, width: nw, height: nh)
        case .center:
            rect = CGRect(x: name.midX - nw / 2, y: name.midY - nh / 2, width: nw, height: nh)
        }
        return rect.intersection(CGRect(x: 0, y: 0, width: W, height: H))
    }

    /// Trims surrounding whitespace and newlines from an OCR string.
    private func cleaned(_ s: String) -> String {
        s.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    // Character sets for `isLikelyName`, built once instead of on every call.
    private static let rejectedNameCharacters = CharacterSet(charactersIn: "@:/\\|+*=<>#0123456789")
    private static let allowedNameScalars = CharacterSet.letters.union(CharacterSet(charactersIn: "'.-"))

    /// True if `s` looks like a participant name label rather than UI chrome. Call
    /// UIs are full of text (meeting URLs, the clock, "Add others", lobby dialogs)
    /// that OCR would otherwise treat as speakers. Participant labels are short,
    /// Title-Cased, 1–3 alphabetic words with no digits/URL/email punctuation.
    /// Derived from real Meet/Zoom captures; errs toward dropping (a missed name
    /// just means no visual hint — the backend still diarizes from audio).
    ///
    /// - Parameter s: The candidate label; `analyze` passes it already trimmed.
    /// - Returns: `true` when `s` is 2–30 characters, contains none of the rejected
    ///   punctuation/digits, and consists of 1–3 space-separated words that each
    ///   start with an uppercase character and contain only letters, `'`, `.` or `-`.
    static func isLikelyName(_ s: String) -> Bool {
        guard (2...30).contains(s.count) else { return false }
        // Reject URLs, emails, meeting codes, times, button glyphs.
        guard s.rangeOfCharacter(from: Self.rejectedNameCharacters) == nil else { return false }
        let words = s.split(separator: " ")
        guard (1...3).contains(words.count) else { return false }
        // NOTE(review): the Title Case check also rejects labels in scripts without
        // letter case, since `isUppercase` is false for them — confirm that is intended.
        return words.allSatisfy { word in
            guard let first = word.first, first.isUppercase else { return false }   // Title Case
            return word.unicodeScalars.allSatisfy { Self.allowedNameScalars.contains($0) }
        }
    }

    // Shared context: reuse; allocating per frame is costly.
    private static let ciContext = CIContext()

    /// Renders `pixelBuffer` into a `CGImage` covering its full extent.
    ///
    /// - Returns: The rendered image, or `nil` if Core Image cannot create it.
    static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
        let ci = CIImage(cvPixelBuffer: pixelBuffer)
        return ciContext.createCGImage(ci, from: ci.extent)
    }
}