Ring-based speaker attribution (fixes real large-tile detection)
Real Teams/Signal frames exposed a geometry bug: estimating a tile's SIZE from its name width (×3) produces a tiny box on big real tiles, so the speaking border ring fell entirely outside it → zero points → 'not speaking' (Joe Payne's clear blue border went undetected). Pure nearest-name fails too (the top edge of a lower tile is closer to the upper tile's bottom-anchored name). Fix: cluster the highlight pixels into connected RINGS (GridCallAnalyzer.connectedComponents, spatial-hashed union-find), then attribute each ring to the OCR'd name inside its bounding box. The ring *is* the tile, so detection is independent of tile-size estimation, and multiple simultaneous borders (lag/persist/crosstalk) become separate rings naturally — exactly the multi-ring case Grant flagged. minRingSpan rejects specks. Validated on real frames: Teams now detects 'Joe Payne' (was empty); Signal detects 'JA' in the group grid. (Signal _002 has a border but no rendered name that frame — inherent Signal intermittency; voice + reconciliation cover it.) 59/59 synthetic XCTest still green (white + coloured, single + crosstalk).
This commit is contained in:
@@ -39,6 +39,7 @@ struct GridCallAnalyzer {
|
|||||||
var maxNameLength = 40
|
var maxNameLength = 40
|
||||||
var minHighlightPoints = 6
|
var minHighlightPoints = 6
|
||||||
var highlightShareOfMax = 0.35
|
var highlightShareOfMax = 0.35
|
||||||
|
var minRingSpan: Double = 60 // a speaking border spans a sizable box, not a speck
|
||||||
}
|
}
|
||||||
|
|
||||||
var config = Config()
|
var config = Config()
|
||||||
@@ -73,38 +74,66 @@ struct GridCallAnalyzer {
|
|||||||
}
|
}
|
||||||
if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }
|
if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }
|
||||||
|
|
||||||
// Drop points inside any name-text region so the white name itself doesn't count.
|
// Drop points inside any name-text region so the name's own text isn't mistaken
|
||||||
|
// for the border highlight.
|
||||||
let exclusions = tiles.map {
|
let exclusions = tiles.map {
|
||||||
$0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35)
|
$0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35)
|
||||||
}
|
}
|
||||||
let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } }
|
let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } }
|
||||||
|
|
||||||
// Attribute each highlight pixel to EXACTLY ONE tile — the (no-margin)
|
// Cluster the highlight pixels into connected RINGS (one per bordered tile),
|
||||||
// estimated rect that contains it, nearest centre as tiebreak. Containment
|
// then attribute each ring to the OCR name sitting inside it. The ring *is*
|
||||||
// (not a radius) keeps a border from bleeding into adjacent tiles even when
|
// the tile, so this is independent of tile-size estimation (which fails on big
|
||||||
// the tile-size estimate is rough; an under-sized estimate merely drops the
|
// real tiles) and handles multiple simultaneous borders (crosstalk) naturally.
|
||||||
// far edge rather than misattributing it.
|
let rings = Self.connectedComponents(points, maxGap: 18)
|
||||||
var counts = [Int](repeating: 0, count: tiles.count)
|
var speakingBBox: [Int: CGRect] = [:] // tile index -> the ring bbox marking it speaking
|
||||||
for p in points {
|
for ring in rings where ring.count >= config.minHighlightPoints {
|
||||||
var best = -1
|
let bb = Self.boundingBox(ring)
|
||||||
var bestDistSq = Double.greatestFiniteMagnitude
|
guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue } // a ring, not a blob
|
||||||
for (i, tile) in tiles.enumerated() where tile.tile.contains(p) {
|
for (i, tile) in tiles.enumerated() where bb.contains(CGPoint(x: tile.textRect.midX, y: tile.textRect.midY)) {
|
||||||
let dx = p.x - tile.tile.midX, dy = p.y - tile.tile.midY
|
speakingBBox[i] = bb
|
||||||
let dd = dx * dx + dy * dy
|
|
||||||
if dd < bestDistSq { bestDistSq = dd; best = i }
|
|
||||||
}
|
}
|
||||||
if best >= 0 { counts[best] += 1 }
|
|
||||||
}
|
}
|
||||||
let maxCount = counts.max() ?? 0
|
|
||||||
let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax))
|
|
||||||
|
|
||||||
return tiles.enumerated().map { idx, tile in
|
return tiles.enumerated().map { idx, tile in
|
||||||
let speaking = maxCount >= config.minHighlightPoints && counts[idx] >= need
|
SpeakerObservation(name: tile.name, speaking: speakingBBox[idx] != nil,
|
||||||
return SpeakerObservation(name: tile.name, speaking: speaking,
|
bbox: speakingBBox[idx] ?? tile.tile, confidence: tile.conf, t: t)
|
||||||
bbox: tile.tile, confidence: tile.conf, t: t)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Groups grid-sampled points into connected components.
///
/// Two points are joined when they differ by at most `maxGap` on BOTH axes
/// (a square neighbourhood, not a radius); components are the transitive
/// closure of that relation. A spatial hash keeps neighbour lookup cheap on
/// dense frames.
///
/// - Parameters:
///   - points: Points to cluster. Duplicates are allowed.
///   - maxGap: Largest per-axis distance at which two points still join.
/// - Returns: One array per component. Components are ordered by the input
///   index of their first point, and points inside a component keep input
///   order, so the result is deterministic for a given input.
static func connectedComponents(_ points: [CGPoint], maxGap: Double) -> [[CGPoint]] {
    guard !points.isEmpty else { return [] }

    // Union-find over point indices, with path halving in `find`.
    var parent = Array(points.indices)
    func find(_ x: Int) -> Int {
        var root = x
        while parent[root] != root {
            parent[root] = parent[parent[root]]
            root = parent[root]
        }
        return root
    }
    func union(_ a: Int, _ b: Int) {
        let ra = find(a), rb = find(b)
        if ra != rb { parent[ra] = rb }
    }

    // Cell edge >= maxGap, so every joinable neighbour lies in the 3x3 block of
    // cells around a point. Key collisions only add candidates; the explicit
    // distance test below still decides whether two points join.
    let cell = max(1.0, maxGap)
    func key(_ cx: Int, _ cy: Int) -> Int { cx &* 100_003 &+ cy }
    func cellOf(_ p: CGPoint) -> (cx: Int, cy: Int) {
        (Int(Double(p.x) / cell), Int(Double(p.y) / cell))
    }

    var buckets: [Int: [Int]] = [:]
    for (i, p) in points.enumerated() {
        let c = cellOf(p)
        buckets[key(c.cx, c.cy), default: []].append(i)
    }

    for (i, p) in points.enumerated() {
        let c = cellOf(p)
        for dx in -1...1 {
            for dy in -1...1 {
                // `j > i` visits each unordered pair once.
                for j in buckets[key(c.cx + dx, c.cy + dy)] ?? [] where j > i {
                    let q = points[j]
                    if abs(Double(q.x) - Double(p.x)) <= maxGap,
                       abs(Double(q.y) - Double(p.y)) <= maxGap {
                        union(i, j)
                    }
                }
            }
        }
    }

    // Emit components in order of first appearance rather than Dictionary
    // iteration order, which differs between processes and would make the
    // ring order (and any "last ring wins" logic in callers) non-deterministic.
    var slotForRoot: [Int: Int] = [:]
    var components: [[CGPoint]] = []
    for i in points.indices {
        let root = find(i)
        if let slot = slotForRoot[root] {
            components[slot].append(points[i])
        } else {
            slotForRoot[root] = components.count
            components.append([points[i]])
        }
    }
    return components
}
|
||||||
|
|
||||||
|
/// Smallest axis-aligned rectangle that contains every point in `pts`.
///
/// - Parameter pts: Points to enclose.
/// - Returns: The enclosing rect (zero width/height for a single point), or
///   `CGRect.null` when `pts` is empty. Returning `.null` avoids a rect built
///   from sentinel extremes, whose standardized width/height would be huge
///   and could pass a minimum-span check.
static func boundingBox(_ pts: [CGPoint]) -> CGRect {
    guard let first = pts.first else { return .null }

    // Seed the extents from the first point instead of +/- sentinel values.
    var minX = first.x, maxX = first.x
    var minY = first.y, maxY = first.y
    for p in pts.dropFirst() {
        minX = min(minX, p.x)
        maxX = max(maxX, p.x)
        minY = min(minY, p.y)
        maxY = max(maxY, p.y)
    }
    return CGRect(x: minX, y: minY, width: maxX - minX, height: maxY - minY)
}
|
||||||
|
|
||||||
/// Vision normalized bbox (bottom-left origin) → pixel rect (top-left origin).
|
/// Vision normalized bbox (bottom-left origin) → pixel rect (top-left origin).
|
||||||
private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
|
private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
|
||||||
let W = Double(w), H = Double(h)
|
let W = Double(w), H = Double(h)
|
||||||
|
|||||||
Reference in New Issue
Block a user