Fix Meet visual: reject solid avatar tiles + screen-share OCR

Root cause of the "4 people → 2 speakers" Meet call: the colored-border detector
read solid camera-off avatar tiles (orange "J", magenta "G") as active speakers
for the ENTIRE call. Those whole-call phantom spans dominated backend name
attribution, collapsing every remote voice onto one name — and the giant filled
bbox also swallowed screen-share text (WERUNBTC.COM ×49) as a speaker.

Validated against 9 real fixtures (harness over the real MeetAdapter):

Detection:
- FrameSampler.thinColoredPoints: coloured counterpart of thinWhitePoints — keeps
  thin border/ring/pill edges, drops solid colour fills.
- GridCallAnalyzer.isHollow: reject a highlight component whose interior is filled
  (a solid tile) vs a hollow ring (a real border). Config.maxInteriorFill (0.2 default).
- MeetAdapter: detect thin BLUE edges only (hue 180–240°, measured from the
  fixtures), maxInteriorFill 0.3 (real Meet rings ≈0.2–0.3, solid tiles ≈0.36).
- Result on fixtures: John Arnold/Grant Gilliam (solid tiles) now NEVER detected;
  Matt Odell/Mark detected when their blue cue is present. Sparse but never wrong —
  correct for a naming hint over audio diarization.

OCR name hygiene:
- isLikelyName rejects domain-like screen-share text ("WERUNBTC.COM", OCR'd ".GOM").
- cleaned() strips trailing punctuation ("Mark." → "Mark").
- TimelineBuilder.canonicalizeByFrequency folds rare OCR misspellings into a
  dominant near-twin name ("Matt Odel"/"MattOdell" → "Matt Odell", "Mare" → "Mark").

Tests: hollow-ring, extended OCR filter, fuzzy-merge. 65 pass.
This commit is contained in:
Grant Gilliam
2026-06-08 16:18:52 -05:00
parent 5c80e827a1
commit 39beccf7f4
6 changed files with 182 additions and 6 deletions
@@ -32,6 +32,16 @@ struct MeetAdapter: AppAdapter {
// The bright ring (#1a73e8) is ~0.89 sat but the lighter glow (#8ab4f8) is
// ~0.44, below the 0.5 default — lower the threshold so the glow registers.
config.colorSaturation = 0.35
// Meet's active cue is a thin BLUE (210°) ring + audio pill. Detect thin blue
// EDGES only, gated to blue: this rejects solid camera-off avatar tiles (orange
// 30°, magenta 340°), which otherwise read as "speaking" for the whole call
// and collapse every remote voice onto one name. Validated on real fixtures.
config.coloredBorderThinOnly = true
config.colorHueRange = 180...240
// Meet's blue border is faint; real rings measure 0.20–0.30 interior fill while
// solid tiles measure ≈0.36, so allow a higher fill here than the 0.2 default to
// recover real borders without readmitting the solid-tile false positives.
config.maxInteriorFill = 0.3
config.tileExpandX = 3.0
config.tileExpandY = 5.0
self.analyzer = GridCallAnalyzer(config: config)
@@ -120,6 +120,43 @@ struct FrameSampler {
return points
}
/// Grid-sampled saturated pixels that lie on a THIN structure (a non-cue pixel
/// exactly `edgeGap` away along some axis) — the coloured counterpart of
/// `thinWhitePoints`. This keeps a thin speaking BORDER/ring/pill but drops the
/// solid interior of a colour FILL (e.g. Meet's orange/magenta camera-off avatar
/// tiles), whose pixels are surrounded by the same colour. Pair with `hueRange`
/// to keep only the cue's colour (Meet's blue ring) and reject the thin edges a
/// solid tile still has against the background (orange/magenta boundaries).
///
/// - Parameters:
///   - threshold: Saturation `(max - min) / max` a pixel must exceed to count as a cue.
///   - minBrightness: Value the brightest channel must exceed to count as a cue.
///   - hueRange: Optional hue window in degrees; when set, a cue pixel's hue must
///     fall inside it.
///   - edgeGap: Offset in pixels at which the four axis neighbours are probed;
///     it is also the margin left unscanned on every side of the frame.
///   - gridStep: Sampling stride in pixels. Values below 1 are treated as 1 so the
///     scan always advances (a zero or negative step used to loop forever).
/// - Returns: Coordinates of the sampled cue pixels that have at least one
///   non-cue neighbour at `edgeGap`.
func thinColoredPoints(threshold: Double = 0.35, minBrightness: Double = 60,
                       hueRange: ClosedRange<Double>? = nil,
                       edgeGap: Int = 6, gridStep: Int = 4) -> [CGPoint] {
    // A "cue" pixel is in-bounds, saturated and bright enough, and (optionally) in the hue window.
    func isCue(_ x: Int, _ y: Int) -> Bool {
        guard x >= 0, x < width, y >= 0, y < height else { return false }
        // Pixels are read at a 4-byte stride, first three bytes taken as r, g, b.
        let i = (y * width + x) * 4
        let r = Double(pixels[i]), g = Double(pixels[i + 1]), b = Double(pixels[i + 2])
        let mx = max(r, g, b), mn = min(r, g, b)
        let sat = mx > 0 ? (mx - mn) / mx : 0
        guard sat > threshold, mx > minBrightness else { return false }
        if let hr = hueRange { return hr.contains(Self.hueDegrees(r, g, b, mx, mn)) }
        return true
    }

    // Never let the scan stall: a non-positive stride would otherwise spin forever.
    let step = max(1, gridStep)
    var points: [CGPoint] = []
    // `stride(from:to:by:)` is empty when the frame is smaller than twice the margin.
    for y in stride(from: edgeGap, to: height - edgeGap, by: step) {
        for x in stride(from: edgeGap, to: width - edgeGap, by: step) {
            guard isCue(x, y) else { continue }
            // Thin = the colour stops within `edgeGap` in at least one direction.
            let thin = !isCue(x - edgeGap, y) || !isCue(x + edgeGap, y)
                || !isCue(x, y - edgeGap) || !isCue(x, y + edgeGap)
            if thin { points.append(CGPoint(x: x, y: y)) }
        }
    }
    return points
}
/// HSV hue in degrees (0–360) from RGB and its precomputed max/min channels.
private static func hueDegrees(_ r: Double, _ g: Double, _ b: Double, _ mx: Double, _ mn: Double) -> Double {
let d = mx - mn
+43 -4
View File
@@ -35,11 +35,21 @@ struct GridCallAnalyzer {
var colorSaturation: Double = 0.5
var colorMinBrightness: Double = 60
var colorHueRange: ClosedRange<Double>? = nil
// When true, the coloured highlight is detected from THIN edges only (drops
// solid colour fills like Meet's camera-off avatar tiles). Pair with a tight
// `colorHueRange` so a solid tile's thin background boundary is rejected too.
var coloredBorderThinOnly = false
var minTextConfidence: Float = 0.3
var maxNameLength = 40
var minHighlightPoints = 6
var highlightShareOfMax = 0.35
var minRingSpan: Double = 60 // a speaking border spans a sizable box, not a speck
// A real active-speaker cue is a thin RING (border) with an EMPTY interior.
// A solid camera-off avatar tile (Meet's orange/magenta fill) or a screen-share
// fill is a filled BLOB — its highlight points spread through the interior. Reject
// a component when more than this fraction of its points fall in the central
// 60%×60% of its bbox (a hollow ring ≈ 0; a solid fill ≈ 0.36). Set ≥ 1 to disable.
var maxInteriorFill: Double = 0.2
}
var config = Config()
@@ -68,9 +78,13 @@ struct GridCallAnalyzer {
// Highlight pixels: coloured (saturated) and/or white (thin near-white).
var highlight: [CGPoint] = []
if config.detectColoredBorder {
highlight += sampler.saturatedPoints(threshold: config.colorSaturation,
minBrightness: config.colorMinBrightness,
hueRange: config.colorHueRange)
highlight += config.coloredBorderThinOnly
? sampler.thinColoredPoints(threshold: config.colorSaturation,
minBrightness: config.colorMinBrightness,
hueRange: config.colorHueRange)
: sampler.saturatedPoints(threshold: config.colorSaturation,
minBrightness: config.colorMinBrightness,
hueRange: config.colorHueRange)
}
if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }
@@ -89,7 +103,8 @@ struct GridCallAnalyzer {
var speakingBBox: [Int: CGRect] = [:] // tile index -> the ring bbox marking it speaking
for ring in rings where ring.count >= config.minHighlightPoints {
let bb = Self.boundingBox(ring)
guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue } // a ring, not a blob
guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue } // a ring, not a speck
guard Self.isHollow(ring, bbox: bb, maxInteriorFill: config.maxInteriorFill) else { continue } // a ring, not a filled tile
for (i, tile) in tiles.enumerated() where bb.contains(CGPoint(x: tile.textRect.midX, y: tile.textRect.midY)) {
speakingBBox[i] = bb
}
@@ -128,6 +143,18 @@ struct GridCallAnalyzer {
return Array(groups.values)
}
/// Decides whether `pts` outline a hollow ring (a border) instead of a filled blob.
///
/// The central 60%×60% of `bbox` is treated as the interior. The component counts
/// as hollow when the share of its points landing there does not exceed
/// `maxInteriorFill`: a thin border leaves the interior empty (≈ 0), whereas a
/// solid camera-off avatar tile or a screen-share fill scatters points through it
/// (≈ 0.36).
///
/// - Parameters:
///   - pts: Highlight points of one connected component.
///   - bbox: Bounding box of `pts`.
///   - maxInteriorFill: Largest accepted interior share. The check is skipped
///     (always hollow) unless this is below 1.
/// - Returns: `true` for a hollow component, for an empty `pts`, or when the check
///   is disabled.
static func isHollow(_ pts: [CGPoint], bbox: CGRect, maxInteriorFill: Double) -> Bool {
    // Check disabled: anything not strictly below 1 accepts every component.
    guard maxInteriorFill < 1 else { return true }
    // Nothing to measure.
    guard !pts.isEmpty else { return true }

    // Shrink by 20% on each side to get the central 60%×60% region.
    let core = bbox.insetBy(dx: bbox.width * 0.2, dy: bbox.height * 0.2)
    let interiorPoints = pts.filter { core.contains($0) }.count
    let interiorShare = Double(interiorPoints) / Double(pts.count)
    return interiorShare <= maxInteriorFill
}
static func boundingBox(_ pts: [CGPoint]) -> CGRect {
var minX = Double.greatestFiniteMagnitude, minY = minX, maxX = -minX, maxY = -minX
for p in pts { minX = min(minX, p.x); minY = min(minY, p.y); maxX = max(maxX, p.x); maxY = max(maxY, p.y) }
@@ -166,7 +193,11 @@ struct GridCallAnalyzer {
}
/// Normalises an OCR'd label by stripping surrounding whitespace and the stray
/// punctuation OCR tacks onto either end, so "Mark." folds into "Mark" rather
/// than becoming a separate phantom speaker. Interior characters are untouched
/// ("J. R. Smith" stays as is).
///
/// - Parameter s: Raw recognised text.
/// - Returns: `s` with leading/trailing whitespace, newlines and `.,;:·•-` removed.
private func cleaned(_ s: String) -> String {
    // Trim whitespace and punctuation in ONE pass over their union. Trimming them
    // in separate passes leaves residue when they interleave at an edge
    // (e.g. "Mark. ." would come out as "Mark.").
    let edgeNoise = CharacterSet.whitespacesAndNewlines
        .union(CharacterSet(charactersIn: ".,;:·•-"))
    return s.trimmingCharacters(in: edgeNoise)
}
/// True if `s` looks like a participant name label rather than UI chrome. Call
@@ -181,6 +212,14 @@ struct GridCallAnalyzer {
if s.rangeOfCharacter(from: CharacterSet(charactersIn: "@:/\\|+*=<>#0123456789")) != nil {
return false
}
// Reject domain-like screen-share text (e.g. "WERUNBTC.COM", OCR'd "WERUNBTC.GOM"):
// a token whose final dotted segment is a 2–4 letter suffix. Real names don't end
// in a TLD; this keeps "Cait's Phone" and initials like "MO".
let lower = s.lowercased()
if let dot = lower.lastIndex(of: "."), lower.index(after: dot) < lower.endIndex {
let suffix = lower[lower.index(after: dot)...]
if (2...4).contains(suffix.count) && suffix.allSatisfy({ $0.isLetter }) { return false }
}
let words = s.split(separator: " ")
guard (1...3).contains(words.count) else { return false }
let allowed = CharacterSet.letters.union(CharacterSet(charactersIn: "'.-"))
@@ -93,9 +93,57 @@ final class TimelineBuilder {
closeSegment(name: name, state: st)
states[name]?.open = false
}
segments = Self.canonicalizeByFrequency(segments)
segments.sort { $0.start < $1.start }
}
/// Fold rare OCR misspellings into the dominant name they're a typo of: a name with
/// little total time is remapped to a much longer-running name with the same initial
/// within a small edit distance (e.g. "Matt Odel"/"MattOdell" → "Matt Odell",
/// "Mare" → "Mark"). Conservative by design — it won't merge two well-attested
/// speakers, only a transient variant into its clearly-dominant canonical.
/// Pure/testable.
///
/// Remaps are resolved to the end of the chain: if "Matt Od" folds into "Matt Odel"
/// and "Matt Odel" itself folds into "Matt Odell", both end up as "Matt Odell".
/// Candidate names are visited in sorted order so that ties between equally long
/// candidates resolve the same way on every run.
///
/// - Parameters:
///   - segs: Timeline segments; a name's total time is the sum of `end - start`.
///   - minorMaxSec: A name is a fold candidate only if its total time is at most this.
///   - dominanceRatio: The target must have at least this many times the candidate's
///     total time.
///   - maxEdits: Largest case-insensitive Levenshtein distance accepted between the
///     candidate and its target.
/// - Returns: `segs` in the same order, with folded names replaced; all other fields
///   are copied unchanged. Returned as-is when nothing folds.
static func canonicalizeByFrequency(_ segs: [VisualTimeline.Segment],
                                    minorMaxSec: Double = 5, dominanceRatio: Double = 8,
                                    maxEdits: Int = 2) -> [VisualTimeline.Segment] {
    // Total time attributed to each name.
    let dur = segs.reduce(into: [String: Double]()) { $0[$1.name, default: 0] += $1.end - $1.start }
    // Dictionary order is unspecified; sort so tie-breaking is reproducible.
    let names = dur.keys.sorted()

    var remap: [String: String] = [:]
    for minor in names {
        guard let minorDur = dur[minor], minorDur <= minorMaxSec,
              let initial = minor.first else { continue }
        let minorKey = minor.lowercased()
        var best: String?
        var bestDur = 0.0
        for major in names where major != minor {
            // Cheap gates first; the edit distance is only computed for survivors.
            guard let majorDur = dur[major], majorDur >= minorDur * dominanceRatio,
                  majorDur > bestDur, major.first == initial else { continue }
            if levenshtein(minorKey, major.lowercased()) <= maxEdits {
                best = major
                bestDur = majorDur
            }
        }
        if let canonical = best { remap[minor] = canonical }
    }
    guard !remap.isEmpty else { return segs }

    // Follow a remap chain to its end so a variant never lands on a name that was
    // itself folded away. Each hop moves to a strictly longer-running name; the hop
    // cap is only a belt-and-braces guard against a cycle.
    func canonicalName(for name: String) -> String {
        var current = name
        var hops = 0
        while let next = remap[current], hops < remap.count {
            current = next
            hops += 1
        }
        return current
    }

    return segs.map { s in
        guard remap[s.name] != nil else { return s }
        return VisualTimeline.Segment(start: s.start, end: s.end, name: canonicalName(for: s.name),
                                      confidence: s.confidence, source: s.source)
    }
}
/// Levenshtein edit distance between `a` and `b`, compared character by character.
/// Intended for short strings (names); keeps only one previous row of the table.
///
/// - Returns: The minimum number of single-character insertions, deletions and
///   substitutions turning `a` into `b`.
static func levenshtein(_ a: String, _ b: String) -> Int {
    let source = Array(a), target = Array(b)
    // Against an empty string the distance is the other string's length.
    guard !source.isEmpty else { return target.count }
    guard !target.isEmpty else { return source.count }

    // Row 0: distance from the empty prefix of `source` to each prefix of `target`.
    var above = Array(0...target.count)
    for (row, sourceChar) in source.enumerated() {
        // Column 0: deleting the first `row + 1` characters of `source`.
        var current = [row + 1]
        current.reserveCapacity(target.count + 1)
        for (col, targetChar) in target.enumerated() {
            let cell: Int
            if sourceChar == targetChar {
                // Match: carry the diagonal value over unchanged.
                cell = above[col]
            } else {
                // Cheapest of substitute (diagonal), delete (above), insert (left), plus one edit.
                cell = Swift.min(above[col], above[col + 1], current[col]) + 1
            }
            current.append(cell)
        }
        above = current
    }
    return above[target.count]
}
// MARK: - Internal
private struct NameState {