Fix Meet visual: reject solid avatar tiles + screen-share OCR
Root cause of the "4 people → 2 speakers" Meet call: the colored-border detector
read solid camera-off avatar tiles (orange "J", magenta "G") as active speakers
for the ENTIRE call. Those whole-call phantom spans dominated backend name
attribution, collapsing every remote voice onto one name — and the giant filled
bbox also swallowed screen-share text (WERUNBTC.COM ×49) as a speaker.
Validated against 9 real fixtures (harness over the real MeetAdapter):
Detection:
- FrameSampler.thinColoredPoints: coloured counterpart of thinWhitePoints — keeps
thin border/ring/pill edges, drops solid colour fills.
- GridCallAnalyzer.isHollow: reject a highlight component whose interior is filled
(a solid tile) vs a hollow ring (a real border). Config.maxInteriorFill (0.2 default).
- MeetAdapter: detect thin BLUE edges only (hue 180–240°, measured from the
fixtures), maxInteriorFill 0.3 (real Meet rings ≈0.2–0.3, solid tiles ≈0.36).
- Result on fixtures: John Arnold/Grant Gilliam (solid tiles) now NEVER detected;
Matt Odell/Mark detected when their blue cue is present. Sparse but never wrong —
correct for a naming hint over audio diarization.
OCR name hygiene:
- isLikelyName rejects domain-like screen-share text ("WERUNBTC.COM", OCR'd ".GOM").
- cleaned() strips trailing punctuation ("Mark." → "Mark").
- TimelineBuilder.canonicalizeByFrequency folds rare OCR misspellings into a
dominant near-twin name ("Matt Odel"/"MattOdell" → "Matt Odell", "Mare" → "Mark").
Tests: hollow-ring, extended OCR filter, fuzzy-merge. 65 pass.
This commit is contained in:
@@ -35,11 +35,21 @@ struct GridCallAnalyzer {
|
||||
var colorSaturation: Double = 0.5
|
||||
var colorMinBrightness: Double = 60
|
||||
var colorHueRange: ClosedRange<Double>? = nil
|
||||
// When true, the coloured highlight is detected from THIN edges only (drops
|
||||
// solid colour fills like Meet's camera-off avatar tiles). Pair with a tight
|
||||
// `colorHueRange` so a solid tile's thin background boundary is rejected too.
|
||||
var coloredBorderThinOnly = false
|
||||
var minTextConfidence: Float = 0.3
|
||||
var maxNameLength = 40
|
||||
var minHighlightPoints = 6
|
||||
var highlightShareOfMax = 0.35
|
||||
var minRingSpan: Double = 60 // a speaking border spans a sizable box, not a speck
|
||||
// A real active-speaker cue is a thin RING (border) with an EMPTY interior.
|
||||
// A solid camera-off avatar tile (Meet's orange/magenta fill) or a screen-share
|
||||
// fill is a filled BLOB — its highlight points spread through the interior. Reject
|
||||
// a component when more than this fraction of its points fall in the central
|
||||
// 60%×60% of its bbox (a hollow ring ≈ 0; a solid fill ≈ 0.36). Set ≥ 1 to disable.
|
||||
var maxInteriorFill: Double = 0.2
|
||||
}
|
||||
|
||||
var config = Config()
|
||||
@@ -68,9 +78,13 @@ struct GridCallAnalyzer {
|
||||
// Highlight pixels: coloured (saturated) and/or white (thin near-white).
|
||||
var highlight: [CGPoint] = []
|
||||
if config.detectColoredBorder {
|
||||
highlight += sampler.saturatedPoints(threshold: config.colorSaturation,
|
||||
minBrightness: config.colorMinBrightness,
|
||||
hueRange: config.colorHueRange)
|
||||
highlight += config.coloredBorderThinOnly
|
||||
? sampler.thinColoredPoints(threshold: config.colorSaturation,
|
||||
minBrightness: config.colorMinBrightness,
|
||||
hueRange: config.colorHueRange)
|
||||
: sampler.saturatedPoints(threshold: config.colorSaturation,
|
||||
minBrightness: config.colorMinBrightness,
|
||||
hueRange: config.colorHueRange)
|
||||
}
|
||||
if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }
|
||||
|
||||
@@ -89,7 +103,8 @@ struct GridCallAnalyzer {
|
||||
var speakingBBox: [Int: CGRect] = [:] // tile index -> the ring bbox marking it speaking
|
||||
for ring in rings where ring.count >= config.minHighlightPoints {
|
||||
let bb = Self.boundingBox(ring)
|
||||
guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue } // a ring, not a blob
|
||||
guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue } // a ring, not a speck
|
||||
guard Self.isHollow(ring, bbox: bb, maxInteriorFill: config.maxInteriorFill) else { continue } // a ring, not a filled tile
|
||||
for (i, tile) in tiles.enumerated() where bb.contains(CGPoint(x: tile.textRect.midX, y: tile.textRect.midY)) {
|
||||
speakingBBox[i] = bb
|
||||
}
|
||||
@@ -128,6 +143,18 @@ struct GridCallAnalyzer {
|
||||
return Array(groups.values)
|
||||
}
|
||||
|
||||
/// Reports whether `pts` outline a hollow ring (a border) instead of a filled blob.
///
/// The test counts how many points land in the central 60%×60% of `bbox`
/// (the box shrunk by 20% of its width/height on every side) and accepts the
/// component when that share does not exceed `maxInteriorFill`. A thin border
/// leaves the centre almost empty (share ≈ 0), whereas points spread evenly
/// over the whole box put roughly 0.6 × 0.6 ≈ 0.36 of themselves there.
///
/// - Parameters:
///   - pts: Highlight points belonging to one component.
///   - bbox: Bounding box of `pts`.
///   - maxInteriorFill: Largest tolerated central share. Any value that is not
///     below 1 switches the check off.
/// - Returns: `true` when the check is off, when `pts` is empty, or when the
///   central share is at most `maxInteriorFill`; `false` otherwise.
static func isHollow(_ pts: [CGPoint], bbox: CGRect, maxInteriorFill: Double) -> Bool {
    // Nothing to measure, or the check is disabled: treat the component as hollow.
    if pts.isEmpty || !(maxInteriorFill < 1) { return true }

    // Central region: trim 20% off each side, leaving the middle 60% per axis.
    let core = bbox.insetBy(dx: bbox.width * 0.2, dy: bbox.height * 0.2)

    var interiorCount = 0
    for point in pts where core.contains(point) {
        interiorCount += 1
    }

    let interiorShare = Double(interiorCount) / Double(pts.count)
    return interiorShare <= maxInteriorFill
}
|
||||
|
||||
static func boundingBox(_ pts: [CGPoint]) -> CGRect {
|
||||
var minX = Double.greatestFiniteMagnitude, minY = minX, maxX = -minX, maxY = -minX
|
||||
for p in pts { minX = min(minX, p.x); minY = min(minY, p.y); maxX = max(maxX, p.x); maxY = max(maxY, p.y) }
|
||||
@@ -166,7 +193,11 @@ struct GridCallAnalyzer {
|
||||
}
|
||||
|
||||
/// Normalises an OCR'd label by stripping surrounding whitespace and stray
/// punctuation, so that variants such as "Mark." fold into "Mark" rather than
/// becoming a separate phantom speaker.
///
/// Characters are removed from BOTH ends of the string; anything in the interior
/// (for example the dots and spaces in "J. R. Smith") is left untouched.
///
/// - Parameter s: Raw recognised text.
/// - Returns: `s` without leading/trailing whitespace, newlines, or any of the
///   characters `.,;:·•-`.
private func cleaned(_ s: String) -> String {
    // Trim whitespace and punctuation in ONE pass over their union. Trimming them
    // in separate sequential passes removes only one layer of each, so an
    // interleaved tail like "Mark. ." would come out as "Mark." instead of "Mark".
    let noise = CharacterSet.whitespacesAndNewlines
        .union(CharacterSet(charactersIn: ".,;:·•-"))
    return s.trimmingCharacters(in: noise)
}
|
||||
|
||||
/// True if `s` looks like a participant name label rather than UI chrome. Call
|
||||
@@ -181,6 +212,14 @@ struct GridCallAnalyzer {
|
||||
if s.rangeOfCharacter(from: CharacterSet(charactersIn: "@:/\\|+*=<>#0123456789")) != nil {
|
||||
return false
|
||||
}
|
||||
// Reject domain-like screen-share text (e.g. "WERUNBTC.COM", OCR'd "WERUNBTC.GOM"):
|
||||
// a token whose final dotted segment is a 2–4 letter suffix. Real names don't end
|
||||
// in a TLD; this keeps "Cait's Phone" and initials like "MO".
|
||||
let lower = s.lowercased()
|
||||
if let dot = lower.lastIndex(of: "."), lower.index(after: dot) < lower.endIndex {
|
||||
let suffix = lower[lower.index(after: dot)...]
|
||||
if (2...4).contains(suffix.count) && suffix.allSatisfy({ $0.isLetter }) { return false }
|
||||
}
|
||||
let words = s.split(separator: " ")
|
||||
guard (1...3).contains(words.count) else { return false }
|
||||
let allowed = CharacterSet.letters.union(CharacterSet(charactersIn: "'.-"))
|
||||
|
||||
Reference in New Issue
Block a user