Files
ten31-transcripts/Ten31Transcripts/Visual/GridCallAnalyzer.swift
T
Grant Gilliam 3bc169533a Ring-based speaker attribution (fixes real large-tile detection)
Real Teams/Signal frames exposed a geometry bug: estimating a tile's SIZE from its
name width (×3) produces a tiny box on big real tiles, so the speaking border ring
fell entirely outside it → zero points → 'not speaking' (Joe Payne's clear blue
border went undetected). Pure nearest-name fails too (the top edge of a lower tile
is closer to the upper tile's bottom-anchored name).

Fix: cluster the highlight pixels into connected RINGS (GridCallAnalyzer.connectedComponents,
spatial-hashed union-find), then attribute each ring to the OCR'd name inside its
bounding box. The ring *is* the tile, so detection is independent of tile-size
estimation, and multiple simultaneous borders (lag/persist/crosstalk) become separate
rings naturally — exactly the multi-ring case Grant flagged. minRingSpan rejects specks.

Validated on real frames: Teams now detects 'Joe Payne' (was empty); Signal detects
'JA' in the group grid. (Signal _002 has a border but no rendered name that frame —
inherent Signal intermittency; voice + reconciliation cover it.) 59/59 synthetic
XCTest still green (white + coloured, single + crosstalk).
2026-06-08 12:03:36 -05:00

201 lines
10 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import Foundation
import CoreGraphics
import CoreVideo
import CoreImage
/// Shared engine for tile-grid conferencing UIs: OCR the name/initials on each
/// tile, then mark the active speaker(s) by the speaking-highlight around their
/// tile. Handles BOTH highlight kinds:
/// - **white border** (Signal: 3px #ffffff ring, detected via thin near-white edges)
/// - **coloured border** (Zoom/Teams, detected via saturated edges)
///
/// The white name text is excluded so it can't be mistaken for the white border.
/// Geometry (`Config`) is a first pass; tile expansion calibrates per app against
/// real screenshot fixtures. Detection *logic* is validated on synthetic frames.
struct GridCallAnalyzer {
/// Where the name label sits relative to its participant tile. The anchor selects
/// how `tileRect` estimates the tile rect from the OCR'd name box.
enum NameAnchor {
/// Signal: centered footer; the tile extends UP, centered on the name.
case bottomCenter
/// Meet/Zoom: name in the bottom-left corner; the tile extends UP and RIGHT.
case bottomLeft
/// Name centered inside the tile.
case center
}
/// Tunable geometry and detection thresholds. Every field has a default value.
struct Config {
var tileExpandX = 2.4 // estimated tile width = name width × this (before clipping to the frame)
var tileExpandY = 4.8 // estimated tile height = name height × this (before clipping to the frame)
var nameAnchor: NameAnchor = .bottomCenter // label placement used when estimating the tile rect
var detectColoredBorder = true // include saturated (coloured) highlight points
var detectWhiteBorder = true // include thin near-white highlight points
// Coloured-border sensitivity. Default 0.5 suits vivid rings (Zoom green/
// yellow); lower it for muted accent rings (Teams violet 0.41, Meet's
// light-blue glow 0.44). `colorHueRange` (degrees) optionally pins the
// ring's hue so a low threshold doesn't catch warm video. Set per platform
// once calibrated against real screenshots.
var colorSaturation: Double = 0.5
// Passed to the sampler as `minBrightness`.
// NOTE(review): presumably on a 0-255 scale; confirm against FrameSampler.
var colorMinBrightness: Double = 60
var colorHueRange: ClosedRange<Double>? = nil
var minTextConfidence: Float = 0.3 // OCR results below this confidence are discarded
var maxNameLength = 40 // NOTE(review): not read anywhere in the visible code; confirm it is still needed
var minHighlightPoints = 6 // a ring needs at least this many highlight points to be considered
var highlightShareOfMax = 0.35 // NOTE(review): not read anywhere in the visible code; confirm it is still needed
var minRingSpan: Double = 60 // a speaking border spans a sizable box, not a speck
}
/// Active tuning parameters; starts from the defaults declared in `Config`.
var config = Config()
/// Text recognizer that `analyze(cgImage:at:)` uses to read the name labels.
var recognizer = TextRecognizer()
/// Convenience entry point for video frames: converts `pixelBuffer` to a `CGImage`
/// and forwards to `analyze(cgImage:at:)`.
///
/// - Parameters:
///   - pixelBuffer: The frame to inspect.
///   - t: Timestamp stamped onto every returned observation.
/// - Returns: The observations for the frame, or an empty array when the buffer
///   cannot be converted to a `CGImage`.
func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
    Self.cgImage(from: pixelBuffer).map { frame in
        analyze(cgImage: frame, at: t)
    } ?? []
}
/// OCRs the participant names in `cgImage`, finds the speaking-highlight rings, and
/// reports one observation per accepted name.
///
/// A name is marked as speaking when the centre of its text rect lies inside the
/// bounding box of a qualifying ring. When several rings enclose the same name, the
/// tightest one (smallest bounding-box area) is kept, so the reported `bbox` does not
/// depend on the order in which rings happen to be enumerated.
///
/// - Parameters:
///   - cgImage: The frame to inspect.
///   - t: Timestamp stamped onto every returned observation.
/// - Returns: One observation per OCR result that passes the confidence and
///   `isLikelyName` filters. `bbox` is the ring's bounding box when speaking, otherwise
///   the tile rect estimated from the name. Empty when no name passes the filters or
///   the frame sampler cannot be created.
func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
    let texts = recognizer.recognize(in: cgImage).filter {
        $0.confidence >= config.minTextConfidence && Self.isLikelyName(cleaned($0.text))
    }
    guard !texts.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }
    let w = cgImage.width, h = cgImage.height

    struct Tile { let name: String; let textRect: CGRect; let tile: CGRect; let conf: Double }
    let tiles = texts.map { r in
        Tile(name: cleaned(r.text),
             textRect: pixelRect(r.boundingBox, w, h),
             tile: tileRect(r.boundingBox, w, h),
             conf: Double(r.confidence))
    }

    // Highlight pixels: coloured (saturated) and/or white (thin near-white).
    var highlight: [CGPoint] = []
    if config.detectColoredBorder {
        highlight += sampler.saturatedPoints(threshold: config.colorSaturation,
                                             minBrightness: config.colorMinBrightness,
                                             hueRange: config.colorHueRange)
    }
    if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }

    // Drop points inside any (slightly padded) name-text region so the name's own
    // text isn't mistaken for the border highlight.
    let exclusions = tiles.map {
        $0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35)
    }
    let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } }

    // Cluster the highlight pixels into connected RINGS (one per bordered tile), then
    // attribute each ring to the OCR name sitting inside it. The ring *is* the tile, so
    // this is independent of tile-size estimation and handles multiple simultaneous
    // borders (crosstalk) naturally.
    let rings = Self.connectedComponents(points, maxGap: 18)

    // Ordering key for competing rings: smallest area first, then position, then width.
    // Two rects with an identical key are the same rect, so the choice is unambiguous.
    func rank(_ r: CGRect) -> (CGFloat, CGFloat, CGFloat, CGFloat) {
        (r.width * r.height, r.minY, r.minX, r.width)
    }

    var speakingBBox: [Int: CGRect] = [:] // tile index -> the ring bbox marking it speaking
    for ring in rings where ring.count >= config.minHighlightPoints {
        let bb = Self.boundingBox(ring)
        // A ring must span a sizable box on both axes; anything smaller is a blob.
        guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue }
        for (i, tile) in tiles.enumerated()
        where bb.contains(CGPoint(x: tile.textRect.midX, y: tile.textRect.midY)) {
            // Keep the tightest enclosing ring instead of whichever ring comes last.
            if let current = speakingBBox[i], rank(current) <= rank(bb) { continue }
            speakingBBox[i] = bb
        }
    }

    return tiles.enumerated().map { idx, tile in
        SpeakerObservation(name: tile.name, speaking: speakingBBox[idx] != nil,
                           bbox: speakingBBox[idx] ?? tile.tile, confidence: tile.conf, t: t)
    }
}
/// Groups points into connected components: two points are linked when they are within
/// `maxGap` of each other on BOTH axes, and links chain transitively.
///
/// Points are bucketed into a uniform grid (cell size `max(1, maxGap)`) so each point is
/// only compared against the 3×3 block of cells around it.
///
/// - Parameters:
///   - points: Points to cluster. Coordinates must be finite (they are converted to `Int`
///     cell indices).
///   - maxGap: Largest per-axis distance at which two points are linked.
/// - Returns: One array per component. Components are ordered by their earliest point in
///   `points`, and each component lists its points in input order, so the result is
///   deterministic for a given input. Empty when `points` is empty.
static func connectedComponents(_ points: [CGPoint], maxGap: Double) -> [[CGPoint]] {
    guard !points.isEmpty else { return [] }

    // Union-find over point indices, with path halving in `find`.
    var parent = Array(points.indices)
    func find(_ x: Int) -> Int {
        var root = x
        while parent[root] != root {
            parent[root] = parent[parent[root]]
            root = parent[root]
        }
        return root
    }
    func union(_ a: Int, _ b: Int) {
        let ra = find(a), rb = find(b)
        if ra != rb { parent[ra] = rb }
    }

    // Spatial hash. A key collision between distant cells only costs extra distance
    // checks; the per-axis test below still decides whether two points are linked.
    let cell = max(1.0, maxGap)
    func key(_ cx: Int, _ cy: Int) -> Int { cx &* 100_003 &+ cy }
    var buckets: [Int: [Int]] = [:]
    for (i, p) in points.enumerated() {
        buckets[key(Int(p.x / cell), Int(p.y / cell)), default: []].append(i)
    }

    for (i, p) in points.enumerated() {
        let cx = Int(p.x / cell), cy = Int(p.y / cell)
        for dx in -1...1 {
            for dy in -1...1 {
                // `j > i` visits each unordered pair once.
                for j in buckets[key(cx + dx, cy + dy)] ?? [] where j > i {
                    if abs(points[j].x - p.x) <= maxGap, abs(points[j].y - p.y) <= maxGap {
                        union(i, j)
                    }
                }
            }
        }
    }

    // Collect components in order of first appearance rather than Dictionary iteration
    // order, which is unspecified and can differ between runs.
    var slotForRoot: [Int: Int] = [:]
    var components: [[CGPoint]] = []
    for i in points.indices {
        let root = find(i)
        if let slot = slotForRoot[root] {
            components[slot].append(points[i])
        } else {
            slotForRoot[root] = components.count
            components.append([points[i]])
        }
    }
    return components
}
/// Smallest axis-aligned rect that contains every point in `pts`.
///
/// - Parameter pts: Points to enclose.
/// - Returns: The bounding rect (zero width/height for a single point), or
///   `CGRect.null` when `pts` is empty. An empty input previously produced a rect with
///   an enormous origin and infinite negative size.
static func boundingBox(_ pts: [CGPoint]) -> CGRect {
    guard let first = pts.first else { return .null }
    var minX = first.x, maxX = first.x
    var minY = first.y, maxY = first.y
    for p in pts.dropFirst() {
        minX = min(minX, p.x)
        maxX = max(maxX, p.x)
        minY = min(minY, p.y)
        maxY = max(maxY, p.y)
    }
    return CGRect(x: minX, y: minY, width: maxX - minX, height: maxY - minY)
}
/// Converts a Vision normalized bbox (bottom-left origin) into a pixel rect
/// (top-left origin).
///
/// - Parameters:
///   - box: Normalized bounding box.
///   - w: Frame width in pixels.
///   - h: Frame height in pixels.
/// - Returns: The same box scaled to pixels, with the y axis flipped.
private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
    let frameWidth = Double(w)
    let frameHeight = Double(h)
    // The box's top edge (maxY, measured from the bottom) becomes the pixel rect's y.
    let left = box.minX * frameWidth
    let top = (1 - box.maxY) * frameHeight
    return CGRect(x: left, y: top, width: box.width * frameWidth, height: box.height * frameHeight)
}
/// Estimates the participant tile from its name label, according to `config.nameAnchor`:
/// - `.bottomCenter` (Signal): the tile extends UP from a centered footer.
/// - `.bottomLeft` (Meet/Zoom): the name hugs the tile's bottom-left corner; the tile
///   extends UP and to the RIGHT of it.
/// - `.center`: the tile is centered on the name.
///
/// - Parameters:
///   - box: Vision normalized bounding box of the name text.
///   - w: Frame width in pixels.
///   - h: Frame height in pixels.
/// - Returns: The estimated tile rect in pixels, clipped to the frame.
private func tileRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
    let label = pixelRect(box, w, h)
    let tileWidth = label.width * config.tileExpandX
    let tileHeight = label.height * config.tileExpandY
    // Bottom-anchored layouts put the tile's bottom edge slightly below the label.
    let footerBottom = label.maxY + label.height * 0.3

    let origin: CGPoint
    switch config.nameAnchor {
    case .bottomCenter:
        origin = CGPoint(x: label.midX - tileWidth / 2, y: footerBottom - tileHeight)
    case .bottomLeft:
        // Small left padding for the corner gutter.
        origin = CGPoint(x: label.minX - label.height * 0.4, y: footerBottom - tileHeight)
    case .center:
        origin = CGPoint(x: label.midX - tileWidth / 2, y: label.midY - tileHeight / 2)
    }

    let estimate = CGRect(origin: origin, size: CGSize(width: tileWidth, height: tileHeight))
    let frame = CGRect(x: 0, y: 0, width: Double(w), height: Double(h))
    return estimate.intersection(frame)
}
/// Returns `s` with leading and trailing whitespace and newlines removed.
private func cleaned(_ s: String) -> String {
    let padding = CharacterSet.whitespacesAndNewlines
    return s.trimmingCharacters(in: padding)
}
/// Returns `true` when `s` looks like a participant name label rather than UI chrome.
///
/// Call UIs are full of text (meeting URLs, the clock, "Add others", lobby dialogs) that
/// OCR would otherwise treat as speakers. Accepted labels are 2-30 characters long and
/// consist of 1-3 space-separated words; each word starts with an uppercase character
/// and contains only letters, `'`, `.` or `-`. Digits and URL/email punctuation are
/// rejected outright.
///
/// Derived from real Meet/Zoom captures; errs toward dropping (a missed name just means
/// no visual hint, and the backend still diarizes from audio).
static func isLikelyName(_ s: String) -> Bool {
    guard (2...30).contains(s.count) else { return false }

    // URLs, emails, meeting codes, times and button glyphs all contain one of these.
    let chrome = CharacterSet(charactersIn: "@:/\\|+*=<>#0123456789")
    guard s.rangeOfCharacter(from: chrome) == nil else { return false }

    let tokens = s.split(separator: " ")
    guard !tokens.isEmpty, tokens.count <= 3 else { return false }

    let permitted = CharacterSet.letters.union(CharacterSet(charactersIn: "'.-"))
    return tokens.allSatisfy { token in
        // Title Case: every word must open with an uppercase character.
        guard let initial = token.first, initial.isUppercase else { return false }
        return token.unicodeScalars.allSatisfy { permitted.contains($0) }
    }
}
/// Core Image context shared by every conversion; allocating one per frame is costly.
private static let ciContext = CIContext()

/// Renders `pixelBuffer` into a `CGImage` covering the buffer's full extent.
///
/// - Parameter pixelBuffer: The frame to convert.
/// - Returns: The rendered image, or `nil` when Core Image cannot create it.
static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
    let image = CIImage(cvPixelBuffer: pixelBuffer)
    let fullExtent = image.extent
    return ciContext.createCGImage(image, from: fullExtent)
}
}