import Foundation
import CoreGraphics
import CoreVideo
import CoreImage

/// Shared engine for tile-grid conferencing UIs (Signal/Zoom/Teams): OCR the
/// name/initials on each tile, then mark the active speaker(s) by the saturated
/// coloured highlight around their tile.
///
/// Geometry (`Config`) is a first pass; the exact tile expansion and saturation
/// threshold get calibrated per app against real screenshot fixtures. The
/// detection *logic* (read names; pick the highlighted tile) is validated with
/// synthetic frames.
struct GridCallAnalyzer {
    struct Config {
        /// Horizontal growth factor applied to a text bbox to approximate the
        /// tile it sits in (only affects the reported bbox).
        var tileExpandX = 1.8
        /// Vertical growth factor applied to a text bbox (reported bbox only).
        var tileExpandY = 2.6
        /// OCR results below this confidence are discarded.
        var minTextConfidence: Float = 0.3
        /// Trimmed OCR strings longer than this are not treated as names.
        var maxNameLength = 40

        /// Highlight detection: a name is "speaking" if enough strongly-saturated
        /// highlight pixels sit within `highlightRadiusFraction` of its label.
        /// The radius is this fraction of max(frame width, frame height).
        var highlightRadiusFraction = 0.22
        /// Absolute floor on the number of highlight points a label needs.
        var minHighlightPoints = 6
        /// A label must also reach this fraction of the busiest label's count.
        var highlightShareOfMax = 0.35
    }

    /// One accepted OCR label, in top-left-origin pixel coordinates.
    private struct Tile {
        let name: String
        let center: CGPoint
        let rect: CGRect
        let confidence: Double
    }

    var config = Config()
    var recognizer = TextRecognizer()

    /// Converts `pixelBuffer` to a `CGImage` and analyzes it.
    ///
    /// - Returns: One observation per accepted name label, or `[]` when the
    ///   buffer cannot be converted to a `CGImage`.
    func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
        guard let cg = Self.cgImage(from: pixelBuffer) else { return [] }
        return analyze(cgImage: cg, at: t)
    }

    /// Recognizes name labels in `cgImage` and flags the ones whose
    /// surroundings contain enough highlight points.
    ///
    /// - Parameters:
    ///   - cgImage: The frame to analyze.
    ///   - t: Timestamp copied onto every returned observation.
    /// - Returns: One observation per accepted label (confidence at or above
    ///   `minTextConfidence`, non-empty after `cleaned`), in recognizer order.
    ///   Empty when no label is accepted or the `FrameSampler` cannot be built.
    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
        let w = cgImage.width, h = cgImage.height

        // Filter and convert in one pass so each string is cleaned only once.
        let tiles = recognizer.recognize(in: cgImage).compactMap { r -> Tile? in
            guard r.confidence >= config.minTextConfidence else { return nil }
            let name = cleaned(r.text)
            guard !name.isEmpty else { return nil }
            return Tile(
                name: name,
                center: Self.pixelCenter(of: r.boundingBox, imageW: w, imageH: h),
                rect: tileRect(r.boundingBox, imageW: w, imageH: h),
                confidence: Double(r.confidence)
            )
        }
        // The sampler is only constructed when there is at least one label.
        guard !tiles.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }

        // Find highlight pixels once, then count, per label, how many fall
        // within the highlight radius of that label's centre. Counts are
        // independent: a point close to several labels counts toward each of
        // them; the share-of-max threshold below separates the real speaker
        // from neighbours that only pick up spill-over.
        let points = sampler.saturatedPoints()
        let radius = Double(max(w, h)) * config.highlightRadiusFraction
        let radiusSquared = radius * radius
        let counts = tiles.map { tile -> Int in
            let cx = Double(tile.center.x), cy = Double(tile.center.y)
            return points.reduce(0) { acc, p in
                let dx = Double(p.x) - cx, dy = Double(p.y) - cy
                return acc + (dx * dx + dy * dy <= radiusSquared ? 1 : 0)
            }
        }

        let maxCount = counts.max() ?? 0
        // `need` is never below `minHighlightPoints`, so `count >= need` already
        // implies that the busiest label reached the absolute floor as well.
        let need = max(config.minHighlightPoints,
                       Int(Double(maxCount) * config.highlightShareOfMax))

        return zip(tiles, counts).map { tile, count in
            SpeakerObservation(name: tile.name,
                               speaking: count >= need,
                               bbox: tile.rect,
                               confidence: tile.confidence,
                               t: t)
        }
    }

    /// Centre of a normalized, bottom-left-origin box, expressed in
    /// top-left-origin pixel coordinates.
    private static func pixelCenter(of box: CGRect, imageW: Int, imageH: Int) -> CGPoint {
        let x = Double(box.midX) * Double(imageW)
        let y = (1 - Double(box.midY)) * Double(imageH) // flip Y to top-left origin
        return CGPoint(x: x, y: y)
    }

    /// Vision normalized bbox (bottom-left origin) → pixel tile rect (top-left),
    /// expanded around the text centre to approximate the whole tile and
    /// clipped to the frame.
    ///
    /// - Returns: The clipped rect, or `.zero` when the expanded rect does not
    ///   intersect the frame at all.
    private func tileRect(_ box: CGRect, imageW: Int, imageH: Int) -> CGRect {
        let frameW = Double(imageW), frameH = Double(imageH)
        let center = Self.pixelCenter(of: box, imageW: imageW, imageH: imageH)
        let tileW = Double(box.width) * frameW * config.tileExpandX
        let tileH = Double(box.height) * frameH * config.tileExpandY
        let expanded = CGRect(x: Double(center.x) - tileW / 2,
                              y: Double(center.y) - tileH / 2,
                              width: tileW,
                              height: tileH)
        let clipped = expanded.intersection(CGRect(x: 0, y: 0, width: frameW, height: frameH))
        // `intersection` yields `CGRect.null` (infinite origin) for disjoint
        // rects; never report that as a bounding box.
        return clipped.isNull ? .zero : clipped
    }

    /// Trims surrounding whitespace/newlines; returns "" when the trimmed text
    /// is longer than `maxNameLength`, so callers can reject it as a name.
    private func cleaned(_ s: String) -> String {
        let trimmed = s.trimmingCharacters(in: .whitespacesAndNewlines)
        return trimmed.count <= config.maxNameLength ? trimmed : ""
    }

    // Shared across frames; allocating a CIContext per frame is costly.
    private static let ciContext = CIContext()

    /// Renders the full extent of `pixelBuffer` into a `CGImage`.
    ///
    /// - Returns: The rendered image, or `nil` if Core Image cannot create it.
    static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
        let ci = CIImage(cvPixelBuffer: pixelBuffer)
        return ciContext.createCGImage(ci, from: ci.extent)
    }
}