diff --git a/Ten31Transcripts/Visual/GridCallAnalyzer.swift b/Ten31Transcripts/Visual/GridCallAnalyzer.swift index 56b17d2..d515082 100644 --- a/Ten31Transcripts/Visual/GridCallAnalyzer.swift +++ b/Ten31Transcripts/Visual/GridCallAnalyzer.swift @@ -39,6 +39,7 @@ struct GridCallAnalyzer { var maxNameLength = 40 var minHighlightPoints = 6 var highlightShareOfMax = 0.35 + var minRingSpan: Double = 60 // a speaking border spans a sizable box, not a speck } var config = Config() @@ -73,38 +74,66 @@ struct GridCallAnalyzer { } if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() } - // Drop points inside any name-text region so the white name itself doesn't count. + // Drop points inside any name-text region so the name's own text isn't mistaken + // for the border highlight. let exclusions = tiles.map { $0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35) } let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } } - // Attribute each highlight pixel to EXACTLY ONE tile — the (no-margin) - // estimated rect that contains it, nearest centre as tiebreak. Containment - // (not a radius) keeps a border from bleeding into adjacent tiles even when - // the tile-size estimate is rough; an under-sized estimate merely drops the - // far edge rather than misattributing it. - var counts = [Int](repeating: 0, count: tiles.count) - for p in points { - var best = -1 - var bestDistSq = Double.greatestFiniteMagnitude - for (i, tile) in tiles.enumerated() where tile.tile.contains(p) { - let dx = p.x - tile.tile.midX, dy = p.y - tile.tile.midY - let dd = dx * dx + dy * dy - if dd < bestDistSq { bestDistSq = dd; best = i } + // Cluster the highlight pixels into connected RINGS (one per bordered tile), + // then attribute each ring to the OCR name sitting inside it. The ring *is* + // the tile, so this is independent of tile-size estimation (which fails on big + // real tiles) and handles multiple simultaneous borders (crosstalk) naturally. + let rings = Self.connectedComponents(points, maxGap: 18) + var speakingBBox: [Int: CGRect] = [:] // tile index -> the ring bbox marking it speaking + for ring in rings where ring.count >= config.minHighlightPoints { + let bb = Self.boundingBox(ring) + guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue } // a ring, not a blob + for (i, tile) in tiles.enumerated() where bb.contains(CGPoint(x: tile.textRect.midX, y: tile.textRect.midY)) { + speakingBBox[i] = bb } - if best >= 0 { counts[best] += 1 } } - let maxCount = counts.max() ?? 0 - let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax)) return tiles.enumerated().map { idx, tile in - let speaking = maxCount >= config.minHighlightPoints && counts[idx] >= need - return SpeakerObservation(name: tile.name, speaking: speaking, - bbox: tile.tile, confidence: tile.conf, t: t) + SpeakerObservation(name: tile.name, speaking: speakingBBox[idx] != nil, + bbox: speakingBBox[idx] ?? tile.tile, confidence: tile.conf, t: t) } } + /// Connected components of grid-sampled points: two points join if within `maxGap` + /// on both axes. Spatial-hashed so it stays cheap on dense frames. + static func connectedComponents(_ points: [CGPoint], maxGap: Double) -> [[CGPoint]] { + guard !points.isEmpty else { return [] } + var parent = Array(0.. Int { var r = x; while parent[r] != r { parent[r] = parent[parent[r]]; r = parent[r] }; return r } + func union(_ a: Int, _ b: Int) { let ra = find(a), rb = find(b); if ra != rb { parent[ra] = rb } } + + let cell = max(1.0, maxGap) + func key(_ cx: Int, _ cy: Int) -> Int { cx &* 100_003 &+ cy } + var buckets: [Int: [Int]] = [:] + for (i, p) in points.enumerated() { + buckets[key(Int(p.x / cell), Int(p.y / cell)), default: []].append(i) + } + for (i, p) in points.enumerated() { + let cx = Int(p.x / cell), cy = Int(p.y / cell) + for dx in -1...1 { for dy in -1...1 { + for j in buckets[key(cx + dx, cy + dy)] ?? [] where j > i { + if abs(points[j].x - p.x) <= maxGap, abs(points[j].y - p.y) <= maxGap { union(i, j) } + } + } } + } + var groups: [Int: [CGPoint]] = [:] + for i in points.indices { groups[find(i), default: []].append(points[i]) } + return Array(groups.values) + } + + static func boundingBox(_ pts: [CGPoint]) -> CGRect { + var minX = Double.greatestFiniteMagnitude, minY = minX, maxX = -minX, maxY = -minX + for p in pts { minX = min(minX, p.x); minY = min(minY, p.y); maxX = max(maxX, p.x); maxY = max(maxY, p.y) } + return CGRect(x: minX, y: minY, width: maxX - minX, height: maxY - minY) + } + /// Vision normalized bbox (bottom-left origin) → pixel rect (top-left origin). private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect { let W = Double(w), H = Double(h)