Signal: detect the white speaking border (not a coloured one)

Signal's active-speaker cue is a 3px #ffffff rounded border (saturation ≈ 0), which the saturation-based highlight detector could never see. Per the Signal-Desktop source review:

- FrameSampler.thinWhitePoints: grid-sample near-white pixels that sit on a THIN structure (a non-white pixel within edgeGap on some axis) so a border/ring counts but a solid white blob (face, bright video) does not.
- GridCallAnalyzer: combine coloured (saturated) + white (thin) highlight pixels; exclude name-text regions so the white footer name can't be mistaken for the border; estimate the tile UP from the name footer (nameAtBottom); attribute each highlight pixel to exactly one tile by containment (nearest centre as tiebreak) so a border can't bleed into an adjacent tile.
- SignalAdapter: white border on, coloured off, name-at-bottom geometry.

Synthetic 4-tile harness now isolates each speaker with no adjacent-tile bleed; all 15 XCTest cases pass. Real-screenshot geometry calibration still pending.
2026-06-06 09:52:10 -05:00
parent 863136aeec
commit a56b47143c
4 changed files with 116 additions and 46 deletions
@@ -3,25 +3,26 @@ import CoreGraphics
 import CoreVideo
 import CoreImage

-/// Shared engine for tile-grid conferencing UIs (Signal/Zoom/Teams): OCR the
-/// name/initials on each tile, then mark the active speaker(s) by the saturated
-/// coloured highlight around their tile.
+/// Shared engine for tile-grid conferencing UIs: OCR the name/initials on each
+/// tile, then mark the active speaker(s) by the speaking-highlight around their
+/// tile. Handles BOTH highlight kinds:
+/// - **white border** (Signal: 3px #ffffff ring — detected via thin near-white edges)
+/// - **coloured border** (Zoom/Teams — detected via saturated edges)
 ///
-/// Geometry (`Config`) is a first pass; the exact tile expansion and saturation
-/// threshold get calibrated per app against real screenshot fixtures. The
-/// detection *logic* (read names; pick the highlighted tile) is validated with
-/// synthetic frames.
+/// The white name text is excluded so it can't be mistaken for the white border.
+/// Geometry (`Config`) is a first pass; tile expansion calibrates per app against
+/// real screenshot fixtures. Detection *logic* is validated on synthetic frames.
 struct GridCallAnalyzer {
    struct Config {
-        var tileExpandX = 1.8        // grow text bbox → approx tile (for the reported bbox)
-        var tileExpandY = 2.6
+        var tileExpandX = 2.4         // tile width  ≈ name width  × this
+        var tileExpandY = 4.8         // tile height ≈ name height × this
+        var nameAtBottom = true       // Signal/most: name footer sits at the tile bottom
+        var detectColoredBorder = true
+        var detectWhiteBorder = true
        var minTextConfidence: Float = 0.3
        var maxNameLength = 40
-        /// Highlight detection: a name is "speaking" if enough strongly-saturated
-        /// highlight pixels sit within `highlightRadiusFraction` of its label.
-        var highlightRadiusFraction = 0.22   // of max(frame W,H)
        var minHighlightPoints = 6
-        var highlightShareOfMax = 0.35       // must be ≥ this fraction of the busiest tile
+        var highlightShareOfMax = 0.35
    }

    var config = Config()
@@ -37,24 +38,42 @@ struct GridCallAnalyzer {
            $0.confidence >= config.minTextConfidence && !cleaned($0.text).isEmpty
        }
        guard !texts.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }
-
        let w = cgImage.width, h = cgImage.height
-        let tiles = texts.map { r -> (name: String, center: CGPoint, rect: CGRect, conf: Double) in
-            let rect = tileRect(r.boundingBox, imageW: w, imageH: h)
-            let cx = r.boundingBox.midX * Double(w)
-            let cy = (1 - r.boundingBox.midY) * Double(h)     // flip Y to top-left origin
-            return (cleaned(r.text), CGPoint(x: cx, y: cy), rect, Double(r.confidence))
+
+        struct Tile { let name: String; let textRect: CGRect; let tile: CGRect; let conf: Double }
+        let tiles = texts.map { r in
+            Tile(name: cleaned(r.text),
+                 textRect: pixelRect(r.boundingBox, w, h),
+                 tile: tileRect(r.boundingBox, w, h),
+                 conf: Double(r.confidence))
        }

-        // Find highlight pixels once, attribute each to the nearest name label.
-        let points = sampler.saturatedPoints()
-        let radius = Double(max(w, h)) * config.highlightRadiusFraction
-        let r2 = radius * radius
-        let counts = tiles.map { tile -> Int in
-            points.reduce(0) { acc, p in
-                let dx = Double(p.x) - tile.center.x, dy = Double(p.y) - tile.center.y
-                return acc + (dx * dx + dy * dy <= r2 ? 1 : 0)
+        // Highlight pixels: coloured (saturated) and/or white (thin near-white).
+        var highlight: [CGPoint] = []
+        if config.detectColoredBorder { highlight += sampler.saturatedPoints() }
+        if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }
+
+        // Drop points inside any name-text region so the white name itself doesn't count.
+        let exclusions = tiles.map {
+            $0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35)
+        }
+        let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } }
+
+        // Attribute each highlight pixel to EXACTLY ONE tile — the (no-margin)
+        // estimated rect that contains it, nearest centre as tiebreak. Containment
+        // (not a radius) keeps a border from bleeding into adjacent tiles even when
+        // the tile-size estimate is rough; an under-sized estimate merely drops the
+        // far edge rather than misattributing it.
+        var counts = [Int](repeating: 0, count: tiles.count)
+        for p in points {
+            var best = -1
+            var bestDistSq = Double.greatestFiniteMagnitude
+            for (i, tile) in tiles.enumerated() where tile.tile.contains(p) {
+                let dx = p.x - tile.tile.midX, dy = p.y - tile.tile.midY
+                let dd = dx * dx + dy * dy
+                if dd < bestDistSq { bestDistSq = dd; best = i }
            }
+            if best >= 0 { counts[best] += 1 }
        }
        let maxCount = counts.max() ?? 0
        let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax))
@@ -62,21 +81,31 @@ struct GridCallAnalyzer {
        return tiles.enumerated().map { idx, tile in
            let speaking = maxCount >= config.minHighlightPoints && counts[idx] >= need
            return SpeakerObservation(name: tile.name, speaking: speaking,
-                                      bbox: tile.rect, confidence: tile.conf, t: t)
+                                      bbox: tile.tile, confidence: tile.conf, t: t)
        }
    }

-    /// Vision normalized bbox (bottom-left origin) → pixel tile rect (top-left),
-    /// expanded around the text centre to approximate the whole tile.
-    private func tileRect(_ box: CGRect, imageW: Int, imageH: Int) -> CGRect {
-        let W = Double(imageW), H = Double(imageH)
-        let pw = box.width * W
-        let ph = box.height * H
-        let cx = (box.midX) * W
-        let cy = (1 - box.midY) * H          // flip Y to top-left origin
-        let nw = pw * config.tileExpandX
-        let nh = ph * config.tileExpandY
-        let rect = CGRect(x: cx - nw / 2, y: cy - nh / 2, width: nw, height: nh)
+    /// Vision normalized bbox (bottom-left origin) → pixel rect (top-left origin).
+    private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
+        let W = Double(w), H = Double(h)
+        return CGRect(x: box.minX * W, y: (1 - box.maxY) * H, width: box.width * W, height: box.height * H)
+    }
+
+    /// Estimate the participant tile from the name label. With `nameAtBottom`, the
+    /// tile extends UP from the footer (Signal); otherwise it's centred on the name.
+    private func tileRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
+        let W = Double(w), H = Double(h)
+        let name = pixelRect(box, w, h)
+        let nw = name.width * config.tileExpandX
+        let nh = name.height * config.tileExpandY
+        let cx = name.midX
+        let rect: CGRect
+        if config.nameAtBottom {
+            let bottom = name.maxY + name.height * 0.3
+            rect = CGRect(x: cx - nw / 2, y: bottom - nh, width: nw, height: nh)
+        } else {
+            rect = CGRect(x: cx - nw / 2, y: name.midY - nh / 2, width: nw, height: nh)
+        }
        return rect.intersection(CGRect(x: 0, y: 0, width: W, height: H))
    }