Ring-based speaker attribution (fixes real large-tile detection)

Real Teams/Signal frames exposed a geometry bug: estimating a tile's SIZE from its name width (×3) produces a tiny box on big real tiles, so the speaking border ring fell entirely outside it → zero points → 'not speaking' (Joe Payne's clear blue border went undetected). Pure nearest-name fails too (the top edge of a lower tile is closer to the upper tile's bottom-anchored name). Fix: cluster the highlight pixels into connected RINGS (GridCallAnalyzer.connectedComponents, spatial-hashed union-find), then attribute each ring to the OCR'd name inside its bounding box. The ring *is* the tile, so detection is independent of tile-size estimation, and multiple simultaneous borders (lag/persist/crosstalk) become separate rings naturally — exactly the multi-ring case Grant flagged. minRingSpan rejects specks. Validated on real frames: Teams now detects 'Joe Payne' (was empty); Signal detects 'JA' in the group grid. (Signal _002 has a border but no rendered name that frame — inherent Signal intermittency; voice + reconciliation cover it.) 59/59 synthetic XCTest still green (white + coloured, single + crosstalk).
2026-06-08 12:03:36 -05:00
parent 6d0c8be8c9
commit 3bc169533a
1 changed file with 49 additions and 20 deletions
@@ -39,6 +39,7 @@ struct GridCallAnalyzer {
        var maxNameLength = 40
        var minHighlightPoints = 6
        var highlightShareOfMax = 0.35
+        var minRingSpan: Double = 60   // a speaking border spans a sizable box, not a speck
    }

    var config = Config()
@@ -73,38 +74,66 @@ struct GridCallAnalyzer {
        }
        if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }

-        // Drop points inside any name-text region so the white name itself doesn't count.
+        // Drop points inside any name-text region so the name's own text isn't mistaken
+        // for the border highlight.
        let exclusions = tiles.map {
            $0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35)
        }
        let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } }

-        // Attribute each highlight pixel to EXACTLY ONE tile — the (no-margin)
-        // estimated rect that contains it, nearest centre as tiebreak. Containment
-        // (not a radius) keeps a border from bleeding into adjacent tiles even when
-        // the tile-size estimate is rough; an under-sized estimate merely drops the
-        // far edge rather than misattributing it.
-        var counts = [Int](repeating: 0, count: tiles.count)
-        for p in points {
-            var best = -1
-            var bestDistSq = Double.greatestFiniteMagnitude
-            for (i, tile) in tiles.enumerated() where tile.tile.contains(p) {
-                let dx = p.x - tile.tile.midX, dy = p.y - tile.tile.midY
-                let dd = dx * dx + dy * dy
-                if dd < bestDistSq { bestDistSq = dd; best = i }
+        // Cluster the highlight pixels into connected RINGS (one per bordered tile),
+        // then attribute each ring to the OCR name sitting inside it. The ring *is*
+        // the tile, so this is independent of tile-size estimation (which fails on big
+        // real tiles) and handles multiple simultaneous borders (crosstalk) naturally.
+        let rings = Self.connectedComponents(points, maxGap: 18)
+        var speakingBBox: [Int: CGRect] = [:]      // tile index -> the ring bbox marking it speaking
+        for ring in rings where ring.count >= config.minHighlightPoints {
+            let bb = Self.boundingBox(ring)
+            guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue }   // a ring, not a blob
+            for (i, tile) in tiles.enumerated() where bb.contains(CGPoint(x: tile.textRect.midX, y: tile.textRect.midY)) {
+                speakingBBox[i] = bb
            }
-            if best >= 0 { counts[best] += 1 }
        }
-        let maxCount = counts.max() ?? 0
-        let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax))

        return tiles.enumerated().map { idx, tile in
-            let speaking = maxCount >= config.minHighlightPoints && counts[idx] >= need
-            return SpeakerObservation(name: tile.name, speaking: speaking,
-                                      bbox: tile.tile, confidence: tile.conf, t: t)
+            SpeakerObservation(name: tile.name, speaking: speakingBBox[idx] != nil,
+                               bbox: speakingBBox[idx] ?? tile.tile, confidence: tile.conf, t: t)
        }
    }

+    /// Groups `points` into connected components: two points are linked when they differ by at most `maxGap`
+    /// on both axes, and links are transitive. Returns [] for no points. Bucketed by cell so only nearby points are compared.
+    static func connectedComponents(_ points: [CGPoint], maxGap: Double) -> [[CGPoint]] {
+        guard !points.isEmpty else { return [] }   // nothing to cluster
+        var parent = Array(0..<points.count)   // union-find forest: every point starts as its own root
+        func find(_ x: Int) -> Int { var r = x; while parent[r] != r { parent[r] = parent[parent[r]]; r = parent[r] }; return r }   // root lookup; re-points each visited node at its grandparent as it walks
+        func union(_ a: Int, _ b: Int) { let ra = find(a), rb = find(b); if ra != rb { parent[ra] = rb } }   // merge two sets by attaching one root to the other
+
+        let cell = max(1.0, maxGap)   // bucket side length, floored at 1 so the divisions below never divide by zero
+        func key(_ cx: Int, _ cy: Int) -> Int { cx &* 100_003 &+ cy }   // folds a cell's (cx, cy) into one Int key using wrapping arithmetic
+        var buckets: [Int: [Int]] = [:]   // cell key -> indices of the points that fall in that cell
+        for (i, p) in points.enumerated() {
+            buckets[key(Int(p.x / cell), Int(p.y / cell)), default: []].append(i)   // Int(_:) truncates toward zero; NOTE(review): presumably coordinates are finite and non-negative — confirm
+        }
+        for (i, p) in points.enumerated() {   // link each point to close-enough points in its own and the 8 surrounding cells
+            let cx = Int(p.x / cell), cy = Int(p.y / cell)
+            for dx in -1...1 { for dy in -1...1 {
+                for j in buckets[key(cx + dx, cy + dy)] ?? [] where j > i {   // j > i skips the point itself and handles each pair from one side only
+                    if abs(points[j].x - p.x) <= maxGap, abs(points[j].y - p.y) <= maxGap { union(i, j) }   // exact per-axis test; the buckets only narrow the candidates
+                }
+            } }
+        }
+        var groups: [Int: [CGPoint]] = [:]   // root index -> points of that component
+        for i in points.indices { groups[find(i), default: []].append(points[i]) }   // points keep their input order within a group
+        return Array(groups.values)   // group order comes from dictionary iteration, so it is not a defined order
+    }
+
+    static func boundingBox(_ pts: [CGPoint]) -> CGRect {   // axis-aligned rect spanning the min/max x and y of pts
+        var minX = Double.greatestFiniteMagnitude, minY = minX, maxX = -minX, maxY = -minX   // sentinels: mins start at the largest finite Double, maxes at its negation
+        for p in pts { minX = min(minX, p.x); minY = min(minY, p.y); maxX = max(maxX, p.x); maxY = max(maxY, p.y) }   // widen the extents to include each point
+        return CGRect(x: minX, y: minY, width: maxX - minX, height: maxY - minY)   // NOTE(review): with empty pts the sentinels are never replaced, so the rect is not meaningful — callers presumably pass at least one point; confirm
+    }
+
    /// Vision normalized bbox (bottom-left origin) → pixel rect (top-left origin).
    private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
        let W = Double(w), H = Double(h)