Ring-based speaker attribution (fixes real large-tile detection)

Real Teams/Signal frames exposed a geometry bug: estimating a tile's SIZE from its
name width (×3) produces a tiny box on big real tiles, so the speaking border ring
fell entirely outside it → zero points → 'not speaking' (Joe Payne's clear blue
border went undetected). Pure nearest-name fails too (the top edge of a lower tile
is closer to the upper tile's bottom-anchored name).

Fix: cluster the highlight pixels into connected RINGS (GridCallAnalyzer.connectedComponents,
spatial-hashed union-find), then attribute each ring to the OCR'd name inside its
bounding box. The ring *is* the tile, so detection is independent of tile-size
estimation, and multiple simultaneous borders (lag/persist/crosstalk) become separate
rings naturally — exactly the multi-ring case Grant flagged. minRingSpan rejects specks.

Validated on real frames: Teams now detects 'Joe Payne' (was empty); Signal detects
'JA' in the group grid. (Signal _002 has a border but no rendered name in that frame —
an inherent Signal intermittency; voice + reconciliation cover it.) 59/59 synthetic
XCTest still green (white + coloured, single + crosstalk).
This commit is contained in:
Grant Gilliam
2026-06-08 12:03:36 -05:00
parent 6d0c8be8c9
commit 3bc169533a
+49 -20
View File
@@ -39,6 +39,7 @@ struct GridCallAnalyzer {
var maxNameLength = 40
var minHighlightPoints = 6
var highlightShareOfMax = 0.35
var minRingSpan: Double = 60 // a speaking border spans a sizable box, not a speck
}
var config = Config()
@@ -73,38 +74,66 @@ struct GridCallAnalyzer {
}
if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }
// Drop points inside any name-text region so the white name itself doesn't count.
// Drop points inside any name-text region so the name's own text isn't mistaken
// for the border highlight.
let exclusions = tiles.map {
$0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35)
}
let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } }
// Attribute each highlight pixel to EXACTLY ONE tile: the (no-margin)
// estimated rect that contains it, nearest centre as tiebreak. Containment
// (not a radius) keeps a border from bleeding into adjacent tiles even when
// the tile-size estimate is rough; an under-sized estimate merely drops the
// far edge rather than misattributing it.
var counts = [Int](repeating: 0, count: tiles.count)
for p in points {
var best = -1
var bestDistSq = Double.greatestFiniteMagnitude
for (i, tile) in tiles.enumerated() where tile.tile.contains(p) {
let dx = p.x - tile.tile.midX, dy = p.y - tile.tile.midY
let dd = dx * dx + dy * dy
if dd < bestDistSq { bestDistSq = dd; best = i }
// Cluster the highlight pixels into connected RINGS (one per bordered tile),
// then attribute each ring to the OCR name sitting inside it. The ring *is*
// the tile, so this is independent of tile-size estimation (which fails on big
// real tiles) and handles multiple simultaneous borders (crosstalk) naturally.
let rings = Self.connectedComponents(points, maxGap: 18)
var speakingBBox: [Int: CGRect] = [:] // tile index -> the ring bbox marking it speaking
for ring in rings where ring.count >= config.minHighlightPoints {
let bb = Self.boundingBox(ring)
guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue } // a ring, not a blob
for (i, tile) in tiles.enumerated() where bb.contains(CGPoint(x: tile.textRect.midX, y: tile.textRect.midY)) {
speakingBBox[i] = bb
}
if best >= 0 { counts[best] += 1 }
}
let maxCount = counts.max() ?? 0
let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax))
return tiles.enumerated().map { idx, tile in
let speaking = maxCount >= config.minHighlightPoints && counts[idx] >= need
return SpeakerObservation(name: tile.name, speaking: speaking,
bbox: tile.tile, confidence: tile.conf, t: t)
SpeakerObservation(name: tile.name, speaking: speakingBBox[idx] != nil,
bbox: speakingBBox[idx] ?? tile.tile, confidence: tile.conf, t: t)
}
}
/// Groups `points` into connected components: two points are linked when they
/// differ by at most `maxGap` on BOTH axes, and linkage is transitive.
///
/// Points are bucketed into a uniform grid of `max(1, maxGap)`-sized cells, so each
/// point is only compared against points in its own cell and the eight around it.
///
/// - Parameters:
///   - points: The points to cluster. Coordinates must be finite; the cell-index
///     conversion (`Int(_:)` on a floating-point value) traps on NaN/infinity.
///   - maxGap: Largest per-axis distance at which two points are joined.
/// - Returns: One array per component. Components are ordered by the position of
///   their earliest point in `points`, and points inside a component keep their
///   input order, so the result is identical from run to run (it no longer depends
///   on `Dictionary` iteration order).
static func connectedComponents(_ points: [CGPoint], maxGap: Double) -> [[CGPoint]] {
    guard !points.isEmpty else { return [] }

    // Union-find over point indices.
    var parent = Array(points.indices)
    func find(_ x: Int) -> Int {
        var root = x
        while parent[root] != root {
            parent[root] = parent[parent[root]]  // shorten the chain as we walk it
            root = parent[root]
        }
        return root
    }
    func union(_ a: Int, _ b: Int) {
        let ra = find(a), rb = find(b)
        if ra != rb { parent[ra] = rb }
    }

    // Spatial hash: cell side >= maxGap, so any joinable pair lies in adjacent cells.
    let cell = max(1.0, maxGap)
    // Two cells may share a key; that only adds candidates, because the explicit
    // per-axis distance test below still decides whether a pair is joined.
    func key(_ cx: Int, _ cy: Int) -> Int { cx &* 100_003 &+ cy }

    var buckets: [Int: [Int]] = [:]
    for (i, p) in points.enumerated() {
        buckets[key(Int(p.x / cell), Int(p.y / cell)), default: []].append(i)
    }

    for (i, p) in points.enumerated() {
        let cx = Int(p.x / cell), cy = Int(p.y / cell)
        for dx in -1...1 {
            for dy in -1...1 {
                // `j > i` visits each unordered pair once.
                for j in buckets[key(cx + dx, cy + dy)] ?? [] where j > i {
                    if abs(points[j].x - p.x) <= maxGap, abs(points[j].y - p.y) <= maxGap {
                        union(i, j)
                    }
                }
            }
        }
    }

    // Collect components in first-appearance order so callers that let a later
    // ring overwrite an earlier one get a reproducible result.
    var slotOfRoot: [Int: Int] = [:]
    var components: [[CGPoint]] = []
    for i in points.indices {
        let root = find(i)
        if let slot = slotOfRoot[root] {
            components[slot].append(points[i])
        } else {
            slotOfRoot[root] = components.count
            components.append([points[i]])
        }
    }
    return components
}
/// Smallest axis-aligned rectangle that contains every point in `pts`.
///
/// - Parameter pts: Points to enclose.
/// - Returns: The rect spanning the minimum and maximum x and y of `pts` (zero
///   width/height for a single point), or `CGRect.null` when `pts` is empty.
static func boundingBox(_ pts: [CGPoint]) -> CGRect {
    // With no points the sentinel extremes below would never be replaced, giving
    // an origin at `greatestFiniteMagnitude` and a size of -infinity. Report the
    // absence of any extent explicitly instead.
    guard !pts.isEmpty else { return .null }

    var minX = Double.greatestFiniteMagnitude, minY = minX
    var maxX = -minX, maxY = -minX
    for p in pts {
        minX = min(minX, p.x)
        minY = min(minY, p.y)
        maxX = max(maxX, p.x)
        maxY = max(maxY, p.y)
    }
    return CGRect(x: minX, y: minY, width: maxX - minX, height: maxY - minY)
}
/// Vision normalized bbox (bottom-left origin) → pixel rect (top-left origin).
private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
let W = Double(w), H = Double(h)