Signal: detect the white speaking border (not a coloured one)

Signal's active-speaker cue is a 3px #ffffff rounded border (saturation ≈ 0),
which the saturation-based highlight detector could never see. Per the
Signal-Desktop source review:

- FrameSampler.thinWhitePoints: grid-sample near-white pixels that sit on a
  THIN structure (a non-white pixel within edgeGap on some axis) so a border/
  ring counts but a solid white blob (face, bright video) does not.
- GridCallAnalyzer: combine coloured (saturated) + white (thin) highlight
  pixels; exclude name-text regions so the white footer name can't be mistaken
  for the border; estimate the tile UP from the name footer (nameAtBottom);
  attribute each highlight pixel to exactly one tile by containment (nearest
  centre as tiebreak) so a border can't bleed into an adjacent tile.
- SignalAdapter: white border on, coloured off, name-at-bottom geometry.

Synthetic 4-tile harness now isolates each speaker with no adjacent-tile bleed;
all 15 XCTest cases pass. Real-screenshot geometry calibration still pending.
This commit is contained in:
Grant Gilliam
2026-06-06 09:52:10 -05:00
parent 863136aeec
commit a56b47143c
4 changed files with 116 additions and 46 deletions
+68 -39
View File
@@ -3,25 +3,26 @@ import CoreGraphics
import CoreVideo
import CoreImage
/// Shared engine for tile-grid conferencing UIs (Signal/Zoom/Teams): OCR the
/// name/initials on each tile, then mark the active speaker(s) by the saturated
/// coloured highlight around their tile.
/// Shared engine for tile-grid conferencing UIs: OCR the name/initials on each
/// tile, then mark the active speaker(s) by the speaking-highlight around their
/// tile. Handles BOTH highlight kinds:
/// - **white border** (Signal: 3px #ffffff ring, detected via thin near-white edges)
/// - **coloured border** (Zoom/Teams, detected via saturated edges)
///
/// Geometry (`Config`) is a first pass; the exact tile expansion and saturation
/// threshold get calibrated per app against real screenshot fixtures. The
/// detection *logic* (read names; pick the highlighted tile) is validated with
/// synthetic frames.
/// The white name text is excluded so it can't be mistaken for the white border.
/// Geometry (`Config`) is a first pass; tile expansion calibrates per app against
/// real screenshot fixtures. Detection *logic* is validated on synthetic frames.
struct GridCallAnalyzer {
/// Tunable geometry and thresholds used when estimating each tile from its
/// name label and deciding which tile(s) carry the speaking highlight.
struct Config {
var tileExpandX = 1.8 // grow text bbox to approximate the tile (for the reported bbox)
var tileExpandY = 2.6
var tileExpandX = 2.4 // estimated tile width = name width × this
var tileExpandY = 4.8 // estimated tile height = name height × this
var nameAtBottom = true // Signal/most: name footer sits at the tile bottom
// When true, saturated (coloured) pixels are added to the highlight set.
var detectColoredBorder = true
// When true, thin near-white pixels are added to the highlight set.
var detectWhiteBorder = true
// Recognized text below this confidence is discarded before tiles are built.
var minTextConfidence: Float = 0.3
// NOTE(review): usage is not visible in this view; presumably caps the
// cleaned name length. Confirm against `cleaned(_:)`.
var maxNameLength = 40
/// Highlight detection: a name is "speaking" if enough strongly-saturated
/// highlight pixels sit within `highlightRadiusFraction` of its label.
var highlightRadiusFraction = 0.22 // of max(frame W,H)
// Floor on the highlight-pixel count: the busiest tile must reach it before
// any tile is reported as speaking, and each tile needs at least this many.
var minHighlightPoints = 6
var highlightShareOfMax = 0.35 // must be this fraction of the busiest tile
var highlightShareOfMax = 0.35
}
var config = Config()
@@ -37,24 +38,42 @@ struct GridCallAnalyzer {
$0.confidence >= config.minTextConfidence && !cleaned($0.text).isEmpty
}
guard !texts.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }
let w = cgImage.width, h = cgImage.height
let tiles = texts.map { r -> (name: String, center: CGPoint, rect: CGRect, conf: Double) in
let rect = tileRect(r.boundingBox, imageW: w, imageH: h)
let cx = r.boundingBox.midX * Double(w)
let cy = (1 - r.boundingBox.midY) * Double(h) // flip Y to top-left origin
return (cleaned(r.text), CGPoint(x: cx, y: cy), rect, Double(r.confidence))
struct Tile { let name: String; let textRect: CGRect; let tile: CGRect; let conf: Double }
let tiles = texts.map { r in
Tile(name: cleaned(r.text),
textRect: pixelRect(r.boundingBox, w, h),
tile: tileRect(r.boundingBox, w, h),
conf: Double(r.confidence))
}
// Find highlight pixels once, attribute each to the nearest name label.
let points = sampler.saturatedPoints()
let radius = Double(max(w, h)) * config.highlightRadiusFraction
let r2 = radius * radius
let counts = tiles.map { tile -> Int in
points.reduce(0) { acc, p in
let dx = Double(p.x) - tile.center.x, dy = Double(p.y) - tile.center.y
return acc + (dx * dx + dy * dy <= r2 ? 1 : 0)
// Highlight pixels: coloured (saturated) and/or white (thin near-white).
var highlight: [CGPoint] = []
if config.detectColoredBorder { highlight += sampler.saturatedPoints() }
if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }
// Drop points inside any name-text region so the white name itself doesn't count.
let exclusions = tiles.map {
$0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35)
}
let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } }
// Attribute each highlight pixel to EXACTLY ONE tile: the (no-margin)
// estimated rect that contains it, nearest centre as tiebreak. Containment
// (not a radius) keeps a border from bleeding into adjacent tiles even when
// the tile-size estimate is rough; an under-sized estimate merely drops the
// far edge rather than misattributing it.
var counts = [Int](repeating: 0, count: tiles.count)
for p in points {
var best = -1
var bestDistSq = Double.greatestFiniteMagnitude
for (i, tile) in tiles.enumerated() where tile.tile.contains(p) {
let dx = p.x - tile.tile.midX, dy = p.y - tile.tile.midY
let dd = dx * dx + dy * dy
if dd < bestDistSq { bestDistSq = dd; best = i }
}
if best >= 0 { counts[best] += 1 }
}
let maxCount = counts.max() ?? 0
let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax))
@@ -62,21 +81,31 @@ struct GridCallAnalyzer {
return tiles.enumerated().map { idx, tile in
let speaking = maxCount >= config.minHighlightPoints && counts[idx] >= need
return SpeakerObservation(name: tile.name, speaking: speaking,
bbox: tile.rect, confidence: tile.conf, t: t)
bbox: tile.tile, confidence: tile.conf, t: t)
}
}
/// Vision normalized bbox (bottom-left origin) → pixel tile rect (top-left),
/// expanded around the text centre to approximate the whole tile.
private func tileRect(_ box: CGRect, imageW: Int, imageH: Int) -> CGRect {
let W = Double(imageW), H = Double(imageH)
let pw = box.width * W
let ph = box.height * H
let cx = (box.midX) * W
let cy = (1 - box.midY) * H // flip Y to top-left origin
let nw = pw * config.tileExpandX
let nh = ph * config.tileExpandY
let rect = CGRect(x: cx - nw / 2, y: cy - nh / 2, width: nw, height: nh)
/// Converts a Vision-normalized bounding box (bottom-left origin) into a pixel
/// rect with a top-left origin.
///
/// - Parameters:
///   - box: Bounding box in normalized coordinates, origin at the bottom-left.
///   - w: Image width in pixels.
///   - h: Image height in pixels.
/// - Returns: The same region scaled to pixels, with Y measured from the top.
private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
    let imageWidth = Double(w)
    let imageHeight = Double(h)
    let left = box.minX * imageWidth
    // Flip Y: the box's upper edge (maxY in bottom-left space) becomes the
    // rect's top in top-left space.
    let top = (1 - box.maxY) * imageHeight
    let pixelWidth = box.width * imageWidth
    let pixelHeight = box.height * imageHeight
    return CGRect(x: left, y: top, width: pixelWidth, height: pixelHeight)
}
/// Estimates the participant tile, in pixels, from a name label's box.
///
/// The tile is the label's pixel rect scaled by `config.tileExpandX` /
/// `config.tileExpandY`, horizontally centred on the label. When
/// `config.nameAtBottom` is set, the tile grows UP from just below the label
/// (name footer at the tile bottom, as in Signal); otherwise it is centred
/// vertically on the label. The result is clipped to the image bounds.
///
/// - Parameters:
///   - box: Vision-normalized label box (bottom-left origin).
///   - w: Image width in pixels.
///   - h: Image height in pixels.
/// - Returns: The estimated tile rect in top-left pixel coordinates.
private func tileRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
    let label = pixelRect(box, w, h)
    let tileWidth = label.width * config.tileExpandX
    let tileHeight = label.height * config.tileExpandY
    let left = label.midX - tileWidth / 2
    let frame = CGRect(x: 0, y: 0, width: Double(w), height: Double(h))

    guard config.nameAtBottom else {
        // No footer assumption: centre the tile on the label.
        return CGRect(x: left, y: label.midY - tileHeight / 2, width: tileWidth, height: tileHeight)
            .intersection(frame)
    }

    // Footer layout: place the tile's bottom edge 30% of the label height
    // below the label, then extend the full tile height upward from there.
    let footerBottom = label.maxY + label.height * 0.3
    return CGRect(x: left, y: footerBottom - tileHeight, width: tileWidth, height: tileHeight)
        .intersection(frame)
}