a56b47143c
Signal's active-speaker cue is a 3px #ffffff rounded border (saturation ≈ 0), which the saturation-based highlight detector could never see. Per the Signal-Desktop source review: - FrameSampler.thinWhitePoints: grid-sample near-white pixels that sit on a THIN structure (a non-white pixel within edgeGap on some axis) so a border/ring counts but a solid white blob (face, bright video) does not. - GridCallAnalyzer: combine coloured (saturated) + white (thin) highlight pixels; exclude name-text regions so the white footer name can't be mistaken for the border; estimate the tile UP from the name footer (nameAtBottom); attribute each highlight pixel to exactly one tile by containment (nearest centre as tiebreak) so a border can't bleed into an adjacent tile. - SignalAdapter: white border on, coloured off, name-at-bottom geometry. Synthetic 4-tile harness now isolates each speaker with no adjacent-tile bleed; all 15 XCTest cases pass. Real-screenshot geometry calibration still pending.
124 lines
5.6 KiB
Swift
124 lines
5.6 KiB
Swift
import Foundation
|
||
import CoreGraphics
|
||
import CoreVideo
|
||
import CoreImage
|
||
|
||
/// Speaker detector shared by conferencing UIs that show participants as a
/// grid of tiles. Each tile's name/initials label is read via OCR, and a tile
/// is flagged as speaking when enough "speaking highlight" pixels fall inside
/// its estimated frame. Two highlight styles are supported:
///  - **white border** (Signal: 3px #ffffff ring — found as thin near-white edges)
///  - **coloured border** (Zoom/Teams — found as saturated edges)
///
/// Pixels belonging to the (white) name label are discarded so the label is
/// never counted as part of a white border. The geometry in `Config` is a
/// first pass; tile expansion is meant to be calibrated per app against real
/// screenshot fixtures. The detection *logic* is validated on synthetic frames.
struct GridCallAnalyzer {
    /// Tunable geometry and thresholds for tile estimation and highlight scoring.
    struct Config {
        /// Estimated tile width as a multiple of the name label's width.
        var tileExpandX = 2.4
        /// Estimated tile height as a multiple of the name label's height.
        var tileExpandY = 4.8
        /// `true` when the name footer sits at the bottom of the tile (Signal/most apps).
        var nameAtBottom = true
        /// Include saturated (coloured) pixels as highlight candidates.
        var detectColoredBorder = true
        /// Include thin near-white pixels as highlight candidates.
        var detectWhiteBorder = true
        /// OCR results below this confidence are ignored.
        var minTextConfidence: Float = 0.3
        /// OCR strings longer than this (after trimming) are not treated as names.
        var maxNameLength = 40
        /// Absolute floor on highlight pixels for a tile to count as speaking.
        var minHighlightPoints = 6
        /// Relative floor: fraction of the strongest tile's highlight count.
        var highlightShareOfMax = 0.35
    }

    var config = Config()
    var recognizer = TextRecognizer()

    /// Converts `pixelBuffer` to a `CGImage` and analyses it.
    /// Returns an empty array when the conversion fails.
    func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
        if let image = Self.cgImage(from: pixelBuffer) {
            return analyze(cgImage: image, at: t)
        }
        return []
    }

    /// Produces one observation per accepted name label found in `cgImage`,
    /// each stamped with time `t` and flagged `speaking` when its estimated
    /// tile collected enough highlight pixels.
    /// Returns an empty array when no label passes the filters or the frame
    /// sampler cannot be created.
    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
        // Keep only confident OCR hits that still look like a name after cleaning.
        let labels = recognizer.recognize(in: cgImage).filter { item in
            guard item.confidence >= config.minTextConfidence else { return false }
            return !cleaned(item.text).isEmpty
        }
        if labels.isEmpty { return [] }
        guard let sampler = FrameSampler(cgImage: cgImage) else { return [] }

        let imageWidth = cgImage.width
        let imageHeight = cgImage.height

        struct TileEstimate {
            let label: String
            let labelRect: CGRect
            let frame: CGRect
            let confidence: Double
        }
        let estimates = labels.map { item in
            TileEstimate(label: cleaned(item.text),
                         labelRect: pixelRect(item.boundingBox, imageWidth, imageHeight),
                         frame: tileRect(item.boundingBox, imageWidth, imageHeight),
                         confidence: Double(item.confidence))
        }

        // Gather highlight candidates: coloured (saturated) first, then white (thin near-white).
        var candidates: [CGPoint] = []
        if config.detectColoredBorder {
            candidates.append(contentsOf: sampler.saturatedPoints())
        }
        if config.detectWhiteBorder {
            candidates.append(contentsOf: sampler.thinWhitePoints())
        }

        // Padded label rects: anything inside is name text, not a border.
        let labelZones = estimates.map { estimate -> CGRect in
            let padX = estimate.labelRect.width * 0.15
            let padY = estimate.labelRect.height * 0.35
            return estimate.labelRect.insetBy(dx: -padX, dy: -padY)
        }
        let borderPoints = candidates.filter { point in
            labelZones.allSatisfy { !$0.contains(point) }
        }

        // Every surviving point is credited to at most ONE tile: the estimated
        // frame that contains it, choosing the nearest centre when several do.
        // Using containment instead of a radius stops a border leaking into a
        // neighbouring tile even with a rough size estimate; an estimate that is
        // too small just loses the far edge instead of misattributing it.
        var hits = [Int](repeating: 0, count: estimates.count)
        for point in borderPoints {
            var owner: Int? = nil
            var ownerDistSq = Double.greatestFiniteMagnitude
            for index in estimates.indices {
                let frame = estimates[index].frame
                guard frame.contains(point) else { continue }
                let offX = point.x - frame.midX
                let offY = point.y - frame.midY
                let distSq = offX * offX + offY * offY
                if distSq < ownerDistSq {
                    ownerDistSq = distSq
                    owner = index
                }
            }
            if let owner = owner {
                hits[owner] += 1
            }
        }

        // A tile speaks when it reaches both the absolute floor and the
        // required share of the strongest tile's count.
        let peak = hits.max() ?? 0
        let threshold = max(config.minHighlightPoints,
                            Int(Double(peak) * config.highlightShareOfMax))
        let peakIsStrongEnough = peak >= config.minHighlightPoints

        return zip(estimates, hits).map { estimate, hitCount in
            SpeakerObservation(name: estimate.label,
                               speaking: peakIsStrongEnough && hitCount >= threshold,
                               bbox: estimate.frame,
                               confidence: estimate.confidence,
                               t: t)
        }
    }

    /// Maps a Vision normalized bounding box (origin bottom-left) into a pixel
    /// rect with a top-left origin for an image of `w` × `h` pixels.
    private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
        let imageW = Double(w)
        let imageH = Double(h)
        let origin = CGPoint(x: box.minX * imageW, y: (1 - box.maxY) * imageH)
        let size = CGSize(width: box.width * imageW, height: box.height * imageH)
        return CGRect(origin: origin, size: size)
    }

    /// Guesses the participant tile from its name label. When
    /// `config.nameAtBottom` is set the tile grows UP from the footer (Signal);
    /// otherwise it is centred on the label. The result is clipped to the image.
    private func tileRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
        let label = pixelRect(box, w, h)
        let tileW = label.width * config.tileExpandX
        let tileH = label.height * config.tileExpandY
        let left = label.midX - tileW / 2

        // Footer layout: the tile bottom sits slightly below the label's lower edge.
        let top = config.nameAtBottom
            ? (label.maxY + label.height * 0.3) - tileH
            : label.midY - tileH / 2

        let estimate = CGRect(x: left, y: top, width: tileW, height: tileH)
        let imageBounds = CGRect(x: 0, y: 0, width: Double(w), height: Double(h))
        return estimate.intersection(imageBounds)
    }

    /// Trims surrounding whitespace/newlines from `s`; yields `""` when the
    /// trimmed text is longer than `config.maxNameLength`.
    private func cleaned(_ s: String) -> String {
        let trimmed = s.trimmingCharacters(in: .whitespacesAndNewlines)
        guard trimmed.count <= config.maxNameLength else { return "" }
        return trimmed
    }

    /// Shared Core Image context; reused because allocating one per frame is costly.
    private static let ciContext = CIContext()

    /// Renders `pixelBuffer` into a `CGImage` using the shared context.
    static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
        let source = CIImage(cvPixelBuffer: pixelBuffer)
        return ciContext.createCGImage(source, from: source.extent)
    }
}
|