Signal: detect the white speaking border (not a coloured one)
Signal's active-speaker cue is a 3px #ffffff rounded border (saturation ≈ 0), which the saturation-based highlight detector could never see. Per the Signal-Desktop source review:
- FrameSampler.thinWhitePoints: grid-sample near-white pixels that sit on a THIN structure (a non-white pixel within edgeGap on some axis), so a border/ring counts but a solid white blob (face, bright video) does not.
- GridCallAnalyzer: combine coloured (saturated) + white (thin) highlight pixels; exclude name-text regions so the white footer name can't be mistaken for the border; estimate the tile UP from the name footer (nameAtBottom); attribute each highlight pixel to exactly one tile by containment (nearest centre as tiebreak) so a border can't bleed into an adjacent tile.
- SignalAdapter: white border on, coloured off, name-at-bottom geometry.
The synthetic 4-tile harness now isolates each speaker with no adjacent-tile bleed; all 15 XCTest cases pass. Real-screenshot geometry calibration is still pending.
This commit is contained in:
@@ -14,9 +14,16 @@ struct SignalAdapter: AppAdapter {
|
|||||||
|
|
||||||
init() {
|
init() {
|
||||||
var config = GridCallAnalyzer.Config()
|
var config = GridCallAnalyzer.Config()
|
||||||
// Signal tiles are squarish with initials centred; tune with fixtures.
|
// Signal's speaking cue is a 3px WHITE rounded border (not coloured); the
|
||||||
config.tileExpandX = 1.6
|
// name is a bottom footer, so the tile extends up from it. Geometry tuned
|
||||||
config.tileExpandY = 1.8
|
// with real fixtures. (Gotchas, per Signal source: NO border in 1:1 calls —
|
||||||
|
// fall back to mic-VAD/audio pill — and in Speaker view the large tile is
|
||||||
|
// the speaker; both handled at a higher level later.)
|
||||||
|
config.nameAtBottom = true
|
||||||
|
config.detectWhiteBorder = true
|
||||||
|
config.detectColoredBorder = false
|
||||||
|
config.tileExpandX = 2.4
|
||||||
|
config.tileExpandY = 4.8
|
||||||
self.analyzer = GridCallAnalyzer(config: config)
|
self.analyzer = GridCallAnalyzer(config: config)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -60,8 +60,40 @@ struct FrameSampler {
|
|||||||
return [top, bottom, left, right].map { meanSaturation(inPixelRect: $0) }.max() ?? 0
|
return [top, bottom, left, right].map { meanSaturation(inPixelRect: $0) }.max() ?? 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reports whether the pixel at (`x`, `y`) is "near white": each of the first
/// three bytes of its 4-byte entry in `pixels` is at least `minChannel`.
///
/// Coordinates outside the `width` × `height` frame are treated as not white,
/// so callers may probe past the frame edge without bounds checks of their own.
///
/// NOTE(review): the index math assumes tightly packed rows of 4 bytes per
/// pixel (row stride == `width * 4`), with the three colour channels first and
/// the fourth byte (presumably alpha/padding) ignored — confirm against how
/// `pixels` is filled.
private func isNearWhite(_ x: Int, _ y: Int, minChannel: Double) -> Bool {
    if x < 0 || y < 0 || x >= width || y >= height {
        return false
    }
    let base = (y * width + x) * 4
    // Test the three leading channel bytes in order, stopping at the first
    // one that falls below the threshold.
    for channel in 0..<3 where Double(pixels[base + channel]) < minChannel {
        return false
    }
    return true
}
|
||||||
|
|
||||||
|
/// Grid-sampled near-white pixels that lie on a THIN structure — i.e. a
/// border/ring/audio-bar, not a solid white blob (face, bright video). This is
/// Signal's white speaking border (saturation ≈ 0, so `saturatedPoints` can't
/// see it).
///
/// A sampled pixel counts as "thin" when at least one of its four axis-aligned
/// neighbours at distance `edgeGap` (left, right, above, below) is NOT
/// near-white.
///
/// - Parameters:
///   - minChannel: Threshold each tested colour channel must reach for a pixel
///     to be considered near-white.
///   - edgeGap: Distance, in pixels, from a sample to its four probe
///     neighbours. The scan also starts `edgeGap` pixels in from every frame
///     edge, so a structure closer to the frame edge than that is not sampled.
///   - gridStep: Sampling stride, in pixels, along both axes.
/// - Returns: The sampled positions (pixel coordinates) in row-major scan
///   order; empty when `gridStep` is not positive.
func thinWhitePoints(minChannel: Double = 200, edgeGap: Int = 6, gridStep: Int = 4) -> [CGPoint] {
    // A non-positive step can never advance the scan (and `stride` traps on a
    // zero step), so there is nothing meaningful to sample.
    guard gridStep > 0 else { return [] }

    var points: [CGPoint] = []
    for y in stride(from: edgeGap, to: height - edgeGap, by: gridStep) {
        for x in stride(from: edgeGap, to: width - edgeGap, by: gridStep)
        where isNearWhite(x, y, minChannel: minChannel) {
            // Thin == some neighbour `edgeGap` away is not near-white.
            let thin = !isNearWhite(x - edgeGap, y, minChannel: minChannel)
                || !isNearWhite(x + edgeGap, y, minChannel: minChannel)
                || !isNearWhite(x, y - edgeGap, minChannel: minChannel)
                || !isNearWhite(x, y + edgeGap, minChannel: minChannel)
            if thin { points.append(CGPoint(x: x, y: y)) }
        }
    }
    return points
}
|
||||||
|
|
||||||
/// Grid-sampled pixel positions (top-left origin) that are strongly saturated
|
/// Grid-sampled pixel positions (top-left origin) that are strongly saturated
|
||||||
/// AND bright enough to be a UI highlight — i.e. the speaking ring/border.
|
/// AND bright enough to be a UI highlight — i.e. a coloured speaking ring/border.
|
||||||
func saturatedPoints(threshold: Double = 0.5, minBrightness: Double = 60, gridStep: Int = 6) -> [CGPoint] {
|
func saturatedPoints(threshold: Double = 0.5, minBrightness: Double = 60, gridStep: Int = 6) -> [CGPoint] {
|
||||||
var points: [CGPoint] = []
|
var points: [CGPoint] = []
|
||||||
var y = 0
|
var y = 0
|
||||||
|
|||||||
@@ -3,25 +3,26 @@ import CoreGraphics
|
|||||||
import CoreVideo
|
import CoreVideo
|
||||||
import CoreImage
|
import CoreImage
|
||||||
|
|
||||||
/// Shared engine for tile-grid conferencing UIs (Signal/Zoom/Teams): OCR the
|
/// Shared engine for tile-grid conferencing UIs: OCR the name/initials on each
|
||||||
/// name/initials on each tile, then mark the active speaker(s) by the saturated
|
/// tile, then mark the active speaker(s) by the speaking-highlight around their
|
||||||
/// coloured highlight around their tile.
|
/// tile. Handles BOTH highlight kinds:
|
||||||
|
/// - **white border** (Signal: 3px #ffffff ring — detected via thin near-white edges)
|
||||||
|
/// - **coloured border** (Zoom/Teams — detected via saturated edges)
|
||||||
///
|
///
|
||||||
/// Geometry (`Config`) is a first pass; the exact tile expansion and saturation
|
/// The white name text is excluded so it can't be mistaken for the white border.
|
||||||
/// threshold get calibrated per app against real screenshot fixtures. The
|
/// Geometry (`Config`) is a first pass; tile expansion calibrates per app against
|
||||||
/// detection *logic* (read names; pick the highlighted tile) is validated with
|
/// real screenshot fixtures. Detection *logic* is validated on synthetic frames.
|
||||||
/// synthetic frames.
|
|
||||||
struct GridCallAnalyzer {
|
struct GridCallAnalyzer {
|
||||||
struct Config {
|
struct Config {
|
||||||
var tileExpandX = 1.8 // grow text bbox → approx tile (for the reported bbox)
|
var tileExpandX = 2.4 // tile width ≈ name width × this
|
||||||
var tileExpandY = 2.6
|
var tileExpandY = 4.8 // tile height ≈ name height × this
|
||||||
|
var nameAtBottom = true // Signal/most: name footer sits at the tile bottom
|
||||||
|
var detectColoredBorder = true
|
||||||
|
var detectWhiteBorder = true
|
||||||
var minTextConfidence: Float = 0.3
|
var minTextConfidence: Float = 0.3
|
||||||
var maxNameLength = 40
|
var maxNameLength = 40
|
||||||
/// Highlight detection: a name is "speaking" if enough strongly-saturated
|
|
||||||
/// highlight pixels sit within `highlightRadiusFraction` of its label.
|
|
||||||
var highlightRadiusFraction = 0.22 // of max(frame W,H)
|
|
||||||
var minHighlightPoints = 6
|
var minHighlightPoints = 6
|
||||||
var highlightShareOfMax = 0.35 // must be ≥ this fraction of the busiest tile
|
var highlightShareOfMax = 0.35
|
||||||
}
|
}
|
||||||
|
|
||||||
var config = Config()
|
var config = Config()
|
||||||
@@ -37,24 +38,42 @@ struct GridCallAnalyzer {
|
|||||||
$0.confidence >= config.minTextConfidence && !cleaned($0.text).isEmpty
|
$0.confidence >= config.minTextConfidence && !cleaned($0.text).isEmpty
|
||||||
}
|
}
|
||||||
guard !texts.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }
|
guard !texts.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }
|
||||||
|
|
||||||
let w = cgImage.width, h = cgImage.height
|
let w = cgImage.width, h = cgImage.height
|
||||||
let tiles = texts.map { r -> (name: String, center: CGPoint, rect: CGRect, conf: Double) in
|
|
||||||
let rect = tileRect(r.boundingBox, imageW: w, imageH: h)
|
struct Tile { let name: String; let textRect: CGRect; let tile: CGRect; let conf: Double }
|
||||||
let cx = r.boundingBox.midX * Double(w)
|
let tiles = texts.map { r in
|
||||||
let cy = (1 - r.boundingBox.midY) * Double(h) // flip Y to top-left origin
|
Tile(name: cleaned(r.text),
|
||||||
return (cleaned(r.text), CGPoint(x: cx, y: cy), rect, Double(r.confidence))
|
textRect: pixelRect(r.boundingBox, w, h),
|
||||||
|
tile: tileRect(r.boundingBox, w, h),
|
||||||
|
conf: Double(r.confidence))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find highlight pixels once, attribute each to the nearest name label.
|
// Highlight pixels: coloured (saturated) and/or white (thin near-white).
|
||||||
let points = sampler.saturatedPoints()
|
var highlight: [CGPoint] = []
|
||||||
let radius = Double(max(w, h)) * config.highlightRadiusFraction
|
if config.detectColoredBorder { highlight += sampler.saturatedPoints() }
|
||||||
let r2 = radius * radius
|
if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }
|
||||||
let counts = tiles.map { tile -> Int in
|
|
||||||
points.reduce(0) { acc, p in
|
// Drop points inside any name-text region so the white name itself doesn't count.
|
||||||
let dx = Double(p.x) - tile.center.x, dy = Double(p.y) - tile.center.y
|
let exclusions = tiles.map {
|
||||||
return acc + (dx * dx + dy * dy <= r2 ? 1 : 0)
|
$0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35)
|
||||||
|
}
|
||||||
|
let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } }
|
||||||
|
|
||||||
|
// Attribute each highlight pixel to EXACTLY ONE tile — the (no-margin)
|
||||||
|
// estimated rect that contains it, nearest centre as tiebreak. Containment
|
||||||
|
// (not a radius) keeps a border from bleeding into adjacent tiles even when
|
||||||
|
// the tile-size estimate is rough; an under-sized estimate merely drops the
|
||||||
|
// far edge rather than misattributing it.
|
||||||
|
var counts = [Int](repeating: 0, count: tiles.count)
|
||||||
|
for p in points {
|
||||||
|
var best = -1
|
||||||
|
var bestDistSq = Double.greatestFiniteMagnitude
|
||||||
|
for (i, tile) in tiles.enumerated() where tile.tile.contains(p) {
|
||||||
|
let dx = p.x - tile.tile.midX, dy = p.y - tile.tile.midY
|
||||||
|
let dd = dx * dx + dy * dy
|
||||||
|
if dd < bestDistSq { bestDistSq = dd; best = i }
|
||||||
}
|
}
|
||||||
|
if best >= 0 { counts[best] += 1 }
|
||||||
}
|
}
|
||||||
let maxCount = counts.max() ?? 0
|
let maxCount = counts.max() ?? 0
|
||||||
let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax))
|
let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax))
|
||||||
@@ -62,21 +81,31 @@ struct GridCallAnalyzer {
|
|||||||
return tiles.enumerated().map { idx, tile in
|
return tiles.enumerated().map { idx, tile in
|
||||||
let speaking = maxCount >= config.minHighlightPoints && counts[idx] >= need
|
let speaking = maxCount >= config.minHighlightPoints && counts[idx] >= need
|
||||||
return SpeakerObservation(name: tile.name, speaking: speaking,
|
return SpeakerObservation(name: tile.name, speaking: speaking,
|
||||||
bbox: tile.rect, confidence: tile.conf, t: t)
|
bbox: tile.tile, confidence: tile.conf, t: t)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Vision normalized bbox (bottom-left origin) → pixel tile rect (top-left),
|
/// Vision normalized bbox (bottom-left origin) → pixel rect (top-left origin).
|
||||||
/// expanded around the text centre to approximate the whole tile.
|
private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
|
||||||
private func tileRect(_ box: CGRect, imageW: Int, imageH: Int) -> CGRect {
|
let W = Double(w), H = Double(h)
|
||||||
let W = Double(imageW), H = Double(imageH)
|
return CGRect(x: box.minX * W, y: (1 - box.maxY) * H, width: box.width * W, height: box.height * H)
|
||||||
let pw = box.width * W
|
}
|
||||||
let ph = box.height * H
|
|
||||||
let cx = (box.midX) * W
|
/// Estimate the participant tile from the name label. With `nameAtBottom`, the
|
||||||
let cy = (1 - box.midY) * H // flip Y to top-left origin
|
/// tile extends UP from the footer (Signal); otherwise it's centred on the name.
|
||||||
let nw = pw * config.tileExpandX
|
private func tileRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
|
||||||
let nh = ph * config.tileExpandY
|
let W = Double(w), H = Double(h)
|
||||||
let rect = CGRect(x: cx - nw / 2, y: cy - nh / 2, width: nw, height: nh)
|
let name = pixelRect(box, w, h)
|
||||||
|
let nw = name.width * config.tileExpandX
|
||||||
|
let nh = name.height * config.tileExpandY
|
||||||
|
let cx = name.midX
|
||||||
|
let rect: CGRect
|
||||||
|
if config.nameAtBottom {
|
||||||
|
let bottom = name.maxY + name.height * 0.3
|
||||||
|
rect = CGRect(x: cx - nw / 2, y: bottom - nh, width: nw, height: nh)
|
||||||
|
} else {
|
||||||
|
rect = CGRect(x: cx - nw / 2, y: name.midY - nh / 2, width: nw, height: nh)
|
||||||
|
}
|
||||||
return rect.intersection(CGRect(x: 0, y: 0, width: W, height: H))
|
return rect.intersection(CGRect(x: 0, y: 0, width: W, height: H))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -34,10 +34,12 @@ final class GridCallAnalyzerTests: XCTestCase {
|
|||||||
for (i, (name, rect)) in rects.enumerated() {
|
for (i, (name, rect)) in rects.enumerated() {
|
||||||
ctx.setFillColor(CGColor(red: 0.18, green: 0.18, blue: 0.2, alpha: 1)); ctx.fill(rect)
|
ctx.setFillColor(CGColor(red: 0.18, green: 0.18, blue: 0.2, alpha: 1)); ctx.fill(rect)
|
||||||
if i == speakingIndex {
|
if i == speakingIndex {
|
||||||
ctx.setStrokeColor(CGColor(red: 0.1, green: 0.85, blue: 0.2, alpha: 1)); ctx.setLineWidth(14)
|
// Signal's cue: a WHITE rounded border (not coloured).
|
||||||
ctx.stroke(rect.insetBy(dx: 7, dy: 7))
|
ctx.setStrokeColor(CGColor(red: 1, green: 1, blue: 1, alpha: 1)); ctx.setLineWidth(6)
|
||||||
|
ctx.stroke(rect.insetBy(dx: 3, dy: 3))
|
||||||
}
|
}
|
||||||
drawText(name, ctx, center: CGPoint(x: rect.midX, y: rect.midY), size: 54)
|
// Name footer at the BOTTOM of the tile (bottom-left origin: rect.minY).
|
||||||
|
drawText(name, ctx, center: CGPoint(x: rect.midX, y: rect.minY + 28), size: 46)
|
||||||
}
|
}
|
||||||
return ctx.makeImage()!
|
return ctx.makeImage()!
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user