Ring-based speaker attribution (fixes real large-tile detection)
Real Teams/Signal frames exposed a geometry bug: estimating a tile's SIZE from its name width (×3) produces a tiny box on big real tiles, so the speaking border ring fell entirely outside it → zero points → 'not speaking' (Joe Payne's clear blue border went undetected). Pure nearest-name fails too (the top edge of a lower tile is closer to the upper tile's bottom-anchored name). Fix: cluster the highlight pixels into connected RINGS (GridCallAnalyzer.connectedComponents, spatial-hashed union-find), then attribute each ring to the OCR'd name inside its bounding box. The ring *is* the tile, so detection is independent of tile-size estimation, and multiple simultaneous borders (lag/persist/crosstalk) become separate rings naturally — exactly the multi-ring case Grant flagged. minRingSpan rejects specks. Validated on real frames: Teams now detects 'Joe Payne' (was empty); Signal detects 'JA' in the group grid. (Signal _002 has a border but no rendered name that frame — inherent Signal intermittency; voice + reconciliation cover it.) 59/59 synthetic XCTest still green (white + coloured, single + crosstalk).
This commit is contained in:
@@ -39,6 +39,7 @@ struct GridCallAnalyzer {
|
||||
var maxNameLength = 40
|
||||
var minHighlightPoints = 6
|
||||
var highlightShareOfMax = 0.35
|
||||
var minRingSpan: Double = 60 // a speaking border spans a sizable box, not a speck
|
||||
}
|
||||
|
||||
var config = Config()
|
||||
@@ -73,38 +74,66 @@ struct GridCallAnalyzer {
|
||||
}
|
||||
if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }
|
||||
|
||||
// Drop points inside any name-text region so the white name itself doesn't count.
|
||||
// Drop points inside any name-text region so the name's own text isn't mistaken
|
||||
// for the border highlight.
|
||||
let exclusions = tiles.map {
|
||||
$0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35)
|
||||
}
|
||||
let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } }
|
||||
|
||||
// Attribute each highlight pixel to EXACTLY ONE tile — the (no-margin)
|
||||
// estimated rect that contains it, nearest centre as tiebreak. Containment
|
||||
// (not a radius) keeps a border from bleeding into adjacent tiles even when
|
||||
// the tile-size estimate is rough; an under-sized estimate merely drops the
|
||||
// far edge rather than misattributing it.
|
||||
var counts = [Int](repeating: 0, count: tiles.count)
|
||||
for p in points {
|
||||
var best = -1
|
||||
var bestDistSq = Double.greatestFiniteMagnitude
|
||||
for (i, tile) in tiles.enumerated() where tile.tile.contains(p) {
|
||||
let dx = p.x - tile.tile.midX, dy = p.y - tile.tile.midY
|
||||
let dd = dx * dx + dy * dy
|
||||
if dd < bestDistSq { bestDistSq = dd; best = i }
|
||||
// Cluster the highlight pixels into connected RINGS (one per bordered tile),
|
||||
// then attribute each ring to the OCR name sitting inside it. The ring *is*
|
||||
// the tile, so this is independent of tile-size estimation (which fails on big
|
||||
// real tiles) and handles multiple simultaneous borders (crosstalk) naturally.
|
||||
let rings = Self.connectedComponents(points, maxGap: 18)
|
||||
var speakingBBox: [Int: CGRect] = [:] // tile index -> the ring bbox marking it speaking
|
||||
for ring in rings where ring.count >= config.minHighlightPoints {
|
||||
let bb = Self.boundingBox(ring)
|
||||
guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue } // a ring, not a blob
|
||||
for (i, tile) in tiles.enumerated() where bb.contains(CGPoint(x: tile.textRect.midX, y: tile.textRect.midY)) {
|
||||
speakingBBox[i] = bb
|
||||
}
|
||||
if best >= 0 { counts[best] += 1 }
|
||||
}
|
||||
let maxCount = counts.max() ?? 0
|
||||
let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax))
|
||||
|
||||
return tiles.enumerated().map { idx, tile in
|
||||
let speaking = maxCount >= config.minHighlightPoints && counts[idx] >= need
|
||||
return SpeakerObservation(name: tile.name, speaking: speaking,
|
||||
bbox: tile.tile, confidence: tile.conf, t: t)
|
||||
SpeakerObservation(name: tile.name, speaking: speakingBBox[idx] != nil,
|
||||
bbox: speakingBBox[idx] ?? tile.tile, confidence: tile.conf, t: t)
|
||||
}
|
||||
}
|
||||
|
||||
/// Groups grid-sampled points into connected components.
///
/// Two points are linked when they differ by at most `maxGap` on BOTH axes (a
/// per-axis test, not a Euclidean radius), and linkage is transitive. Points are
/// bucketed into square cells of side `max(1, maxGap)` so each point is compared
/// only against the 3×3 block of cells around it.
///
/// - Parameters:
///   - points: The points to cluster. Coordinates must be finite and small enough
///     that `coordinate / cellSide` fits in `Int` (the conversion traps otherwise).
///   - maxGap: Largest per-axis separation at which two points still join.
/// - Returns: One array per component. Components are ordered by the position of
///   their first point in `points`, and points inside a component keep their input
///   order, so the result is deterministic for a given input.
static func connectedComponents(_ points: [CGPoint], maxGap: Double) -> [[CGPoint]] {
    guard !points.isEmpty else { return [] }

    // Union-find over point indices; `find` uses path halving.
    var parent = Array(points.indices)
    func find(_ x: Int) -> Int {
        var root = x
        while parent[root] != root {
            parent[root] = parent[parent[root]]
            root = parent[root]
        }
        return root
    }
    func union(_ a: Int, _ b: Int) {
        let ra = find(a), rb = find(b)
        if ra != rb { parent[ra] = rb }
    }

    // A real 2-D key: distinct cells can never share a bucket, unlike a folded
    // single-integer key.
    struct Cell: Hashable {
        let x: Int
        let y: Int
    }

    // The cell side is never smaller than maxGap, so any two points within maxGap
    // of each other sit in the same cell or in adjacent cells.
    let side = max(1.0, maxGap)
    func cell(of p: CGPoint) -> Cell {
        // Floor (not truncation) keeps cell width uniform across the origin.
        Cell(x: Int((Double(p.x) / side).rounded(.down)),
             y: Int((Double(p.y) / side).rounded(.down)))
    }

    var buckets: [Cell: [Int]] = [:]
    for (i, p) in points.enumerated() {
        buckets[cell(of: p), default: []].append(i)
    }

    for (i, p) in points.enumerated() {
        let home = cell(of: p)
        for dx in -1...1 {
            for dy in -1...1 {
                guard let candidates = buckets[Cell(x: home.x + dx, y: home.y + dy)] else { continue }
                // `j > i` visits each unordered pair once.
                for j in candidates where j > i {
                    let q = points[j]
                    if abs(Double(q.x) - Double(p.x)) <= maxGap,
                       abs(Double(q.y) - Double(p.y)) <= maxGap {
                        union(i, j)
                    }
                }
            }
        }
    }

    // Collect components in first-appearance order. Returning a dictionary's
    // values here would make the component order vary from run to run.
    var slotOfRoot: [Int: Int] = [:]
    var components: [[CGPoint]] = []
    for i in points.indices {
        let root = find(i)
        if let slot = slotOfRoot[root] {
            components[slot].append(points[i])
        } else {
            slotOfRoot[root] = components.count
            components.append([points[i]])
        }
    }
    return components
}
|
||||
|
||||
/// Returns the smallest axis-aligned rectangle that contains every point in `pts`.
///
/// - Parameter pts: The points to enclose.
/// - Returns: The tight bounding rectangle (zero width and height for a single
///   point), or `CGRect.null` when `pts` is empty.
static func boundingBox(_ pts: [CGPoint]) -> CGRect {
    // With no points there is no box. Sentinel-seeded extremes would instead yield
    // a rect with infinite negative size, which callers could mistake for a huge one.
    guard let first = pts.first else { return .null }

    var minX = first.x, maxX = first.x
    var minY = first.y, maxY = first.y
    for p in pts.dropFirst() {
        minX = min(minX, p.x)
        maxX = max(maxX, p.x)
        minY = min(minY, p.y)
        maxY = max(maxY, p.y)
    }
    return CGRect(x: minX, y: minY, width: maxX - minX, height: maxY - minY)
}
|
||||
|
||||
/// Vision normalized bbox (bottom-left origin) → pixel rect (top-left origin).
|
||||
private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
|
||||
let W = Double(w), H = Double(h)
|
||||
|
||||
Reference in New Issue
Block a user