Fix Meet visual: reject solid avatar tiles + screen-share OCR

Root cause of the "4 people → 2 speakers" Meet call: the colored-border detector
read solid camera-off avatar tiles (orange "J", magenta "G") as active speakers
for the ENTIRE call. Those whole-call phantom spans dominated backend name
attribution, collapsing every remote voice onto one name — and the giant filled
bbox also swallowed screen-share text (WERUNBTC.COM ×49) as a speaker.

Validated against 9 real fixtures (harness over the real MeetAdapter):

Detection:
- FrameSampler.thinColoredPoints: coloured counterpart of thinWhitePoints — keeps
  thin border/ring/pill edges, drops solid colour fills.
- GridCallAnalyzer.isHollow: reject a highlight component whose interior is filled
  (a solid tile) vs a hollow ring (a real border). Config.maxInteriorFill (0.2 default).
- MeetAdapter: detect thin BLUE edges only (hue 180–240°, measured from the
  fixtures), maxInteriorFill 0.3 (real Meet rings ≈0.2–0.3, solid tiles ≈0.36).
- Result on fixtures: John Arnold/Grant Gilliam (solid tiles) now NEVER detected;
  Matt Odell/Mark detected when their blue cue is present. Sparse but never wrong —
  correct for a naming hint over audio diarization.

OCR name hygiene:
- isLikelyName rejects domain-like screen-share text ("WERUNBTC.COM", OCR'd ".GOM").
- cleaned() strips trailing punctuation ("Mark." → "Mark").
- TimelineBuilder.canonicalizeByFrequency folds rare OCR misspellings into a
  dominant near-twin name ("Matt Odel"/"MattOdell" → "Matt Odell", "Mare" → "Mark").

Tests: hollow-ring, extended OCR filter, fuzzy-merge. 65 pass.
This commit is contained in:
Grant Gilliam
2026-06-08 16:18:52 -05:00
parent 5c80e827a1
commit 39beccf7f4
6 changed files with 182 additions and 6 deletions
+43 -4
View File
@@ -35,11 +35,21 @@ struct GridCallAnalyzer {
var colorSaturation: Double = 0.5
var colorMinBrightness: Double = 60
var colorHueRange: ClosedRange<Double>? = nil
// When true, the coloured highlight is detected from THIN edges only (drops
// solid colour fills like Meet's camera-off avatar tiles). Pair with a tight
// `colorHueRange` so a solid tile's thin background boundary is rejected too.
var coloredBorderThinOnly = false
var minTextConfidence: Float = 0.3
var maxNameLength = 40
var minHighlightPoints = 6
var highlightShareOfMax = 0.35
var minRingSpan: Double = 60 // a speaking border spans a sizable box, not a speck
// A real active-speaker cue is a thin RING (border) with an EMPTY interior.
// A solid camera-off avatar tile (Meet's orange/magenta fill) or a screen-share
// fill is a filled BLOB — its highlight points spread through the interior. Reject
// a component when more than this fraction of its points fall in the central
// 60%×60% of its bbox (a hollow ring ≈ 0; a solid fill ≈ 0.36). Set 1 to disable.
var maxInteriorFill: Double = 0.2
}
var config = Config()
@@ -68,9 +78,13 @@ struct GridCallAnalyzer {
// Highlight pixels: coloured (saturated) and/or white (thin near-white).
var highlight: [CGPoint] = []
if config.detectColoredBorder {
highlight += sampler.saturatedPoints(threshold: config.colorSaturation,
minBrightness: config.colorMinBrightness,
hueRange: config.colorHueRange)
highlight += config.coloredBorderThinOnly
? sampler.thinColoredPoints(threshold: config.colorSaturation,
minBrightness: config.colorMinBrightness,
hueRange: config.colorHueRange)
: sampler.saturatedPoints(threshold: config.colorSaturation,
minBrightness: config.colorMinBrightness,
hueRange: config.colorHueRange)
}
if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }
@@ -89,7 +103,8 @@ struct GridCallAnalyzer {
var speakingBBox: [Int: CGRect] = [:] // tile index -> the ring bbox marking it speaking
for ring in rings where ring.count >= config.minHighlightPoints {
let bb = Self.boundingBox(ring)
guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue } // a ring, not a blob
guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue } // a ring, not a speck
guard Self.isHollow(ring, bbox: bb, maxInteriorFill: config.maxInteriorFill) else { continue } // a ring, not a filled tile
for (i, tile) in tiles.enumerated() where bb.contains(CGPoint(x: tile.textRect.midX, y: tile.textRect.midY)) {
speakingBBox[i] = bb
}
@@ -128,6 +143,18 @@ struct GridCallAnalyzer {
return Array(groups.values)
}
/// Decides whether a highlight component is a hollow ring (a border) or a
/// filled blob, by measuring how many of its points land in the middle of
/// its bounding box.
///
/// The "middle" is `bbox` inset by 20% of its width and height on every side,
/// i.e. the central 60%×60% region. A thin border leaves that region empty
/// (fraction ≈ 0); a uniformly filled tile puts roughly 0.36 of its points there.
///
/// - Parameters:
///   - pts: The points of one highlight component.
///   - bbox: The bounding box of `pts`.
///   - maxInteriorFill: Largest fraction of points allowed inside the central
///     region. The check is skipped unless this value is strictly below 1.
/// - Returns: `true` when the interior fraction is at most `maxInteriorFill`,
///   when the check is skipped, or when `pts` is empty.
static func isHollow(_ pts: [CGPoint], bbox: CGRect, maxInteriorFill: Double) -> Bool {
    // Anything that is not strictly below 1 switches the check off.
    guard maxInteriorFill < 1 else { return true }
    // No points means nothing to measure (and avoids dividing by zero).
    guard !pts.isEmpty else { return true }

    let core = bbox.insetBy(dx: 0.2 * bbox.width, dy: 0.2 * bbox.height)
    var coreHits = 0
    for point in pts where core.contains(point) {
        coreHits += 1
    }

    let interiorShare = Double(coreHits) / Double(pts.count)
    return interiorShare <= maxInteriorFill
}
static func boundingBox(_ pts: [CGPoint]) -> CGRect {
var minX = Double.greatestFiniteMagnitude, minY = minX, maxX = -minX, maxY = -minX
for p in pts { minX = min(minX, p.x); minY = min(minY, p.y); maxX = max(maxX, p.x); maxY = max(maxY, p.y) }
@@ -166,7 +193,11 @@ struct GridCallAnalyzer {
}
/// Normalises an OCR'd label so that stray punctuation does not create a
/// separate phantom speaker (e.g. "Mark." folds into "Mark").
///
/// Three passes, in order: trim surrounding whitespace/newlines, trim the
/// punctuation characters `.,;:·•-` from both ends, then trim whitespace once
/// more to remove any that the punctuation pass exposed.
///
/// - Parameter s: The raw label text.
/// - Returns: The trimmed label; interior characters are left untouched.
private func cleaned(_ s: String) -> String {
    let strayPunctuation = CharacterSet(charactersIn: ".,;:·•-")
    let withoutOuterSpace = s.trimmingCharacters(in: .whitespacesAndNewlines)
    let withoutPunctuation = withoutOuterSpace.trimmingCharacters(in: strayPunctuation)
    return withoutPunctuation.trimmingCharacters(in: .whitespacesAndNewlines)
}
/// True if `s` looks like a participant name label rather than UI chrome. Call
@@ -181,6 +212,14 @@ struct GridCallAnalyzer {
if s.rangeOfCharacter(from: CharacterSet(charactersIn: "@:/\\|+*=<>#0123456789")) != nil {
return false
}
// Reject domain-like screen-share text (e.g. "WERUNBTC.COM", OCR'd "WERUNBTC.GOM"):
// a token whose final dotted segment is a 2–4 letter suffix. Real names don't end
// in a TLD; this keeps "Cait's Phone" and initials like "MO".
let lower = s.lowercased()
if let dot = lower.lastIndex(of: "."), lower.index(after: dot) < lower.endIndex {
let suffix = lower[lower.index(after: dot)...]
if (2...4).contains(suffix.count) && suffix.allSatisfy({ $0.isLetter }) { return false }
}
let words = s.split(separator: " ")
guard (1...3).contains(words.count) else { return false }
let allowed = CharacterSet.letters.union(CharacterSet(charactersIn: "'.-"))