39beccf7f4
Root cause of the "4 people → 2 speakers" Meet call: the colored-border detector
read solid camera-off avatar tiles (orange "J", magenta "G") as active speakers
for the ENTIRE call. Those whole-call phantom spans dominated backend name
attribution, collapsing every remote voice onto one name — and the giant filled
bbox also swallowed screen-share text (WERUNBTC.COM ×49) as a speaker.
Validated against 9 real fixtures (harness over the real MeetAdapter):
Detection:
- FrameSampler.thinColoredPoints: coloured counterpart of thinWhitePoints — keeps
thin border/ring/pill edges, drops solid colour fills.
- GridCallAnalyzer.isHollow: reject a highlight component whose interior is filled
(a solid tile) vs a hollow ring (a real border). Config.maxInteriorFill (0.2 default).
- MeetAdapter: detect thin BLUE edges only (hue 180–240°, measured from the
fixtures), maxInteriorFill 0.3 (real Meet rings ≈0.2–0.3, solid tiles ≈0.36).
- Result on fixtures: John Arnold/Grant Gilliam (solid tiles) now NEVER detected;
Matt Odell/Mark detected when their blue cue is present. Sparse but never wrong —
correct for a naming hint over audio diarization.
OCR name hygiene:
- isLikelyName rejects domain-like screen-share text ("WERUNBTC.COM", OCR'd ".GOM").
- cleaned() strips trailing punctuation ("Mark." → "Mark").
- TimelineBuilder.canonicalizeByFrequency folds rare OCR misspellings into a
dominant near-twin name ("Matt Odel"/"MattOdell" → "Matt Odell", "Mare" → "Mark").
Tests: hollow-ring, extended OCR filter, fuzzy-merge. 65 pass.
240 lines
13 KiB
Swift
240 lines
13 KiB
Swift
import Foundation
|
||
import CoreGraphics
|
||
import CoreVideo
|
||
import CoreImage
|
||
|
||
/// Shared engine for tile-grid conferencing UIs: OCR the name/initials on each
|
||
/// tile, then mark the active speaker(s) by the speaking-highlight around their
|
||
/// tile. Handles BOTH highlight kinds:
|
||
/// - **white border** (Signal: 3px #ffffff ring — detected via thin near-white edges)
|
||
/// - **coloured border** (Zoom/Teams/Meet — detected via saturated edges, or via thin saturated edges only when `Config.coloredBorderThinOnly` is set)
|
||
///
|
||
/// The white name text is excluded so it can't be mistaken for the white border.
|
||
/// Geometry (`Config`) is a first pass; tile expansion calibrates per app against
|
||
/// real screenshot fixtures. Detection *logic* is validated on synthetic frames.
|
||
struct GridCallAnalyzer {
|
||
/// Placement of the OCR'd name label within its participant tile. `tileRect`
/// switches on this to decide in which direction the tile estimate grows from
/// the label box.
enum NameAnchor {
    /// Signal: centered footer label; the tile extends UP, centered on the name.
    case bottomCenter
    /// Meet/Zoom: label in the bottom-left corner; the tile extends UP and RIGHT.
    case bottomLeft
    /// Label centered inside the tile.
    case center
}
|
||
|
||
/// Tunable geometry and detection thresholds for `GridCallAnalyzer`.
/// Field order is kept stable because it defines the memberwise initializer.
struct Config {
    // MARK: Tile estimation

    /// Estimated tile width ≈ name width × this factor.
    var tileExpandX: Double = 2.4
    /// Estimated tile height ≈ name height × this factor.
    var tileExpandY: Double = 4.8
    /// Where the name label sits inside its tile; selects the branch in `tileRect`.
    var nameAnchor: NameAnchor = .bottomCenter

    // MARK: Highlight sources

    /// Collect coloured (saturated) highlight points.
    var detectColoredBorder: Bool = true
    /// Collect thin near-white highlight points.
    var detectWhiteBorder: Bool = true

    // MARK: Coloured-border sensitivity

    /// Saturation threshold handed to the sampler. The 0.5 default suits vivid
    /// rings (Zoom green/yellow); lower it for muted accent rings (Teams violet
    /// ≈ 0.41, Meet's light-blue glow ≈ 0.44).
    var colorSaturation: Double = 0.5
    /// Minimum brightness handed to the sampler alongside `colorSaturation`.
    var colorMinBrightness: Double = 60
    /// Optional hue window in degrees. Pins the ring's hue so a low saturation
    /// threshold doesn't catch warm video — set per platform once calibrated
    /// against real screenshots.
    var colorHueRange: ClosedRange<Double>?
    /// When true, the coloured highlight is taken from THIN edges only (drops
    /// solid colour fills like Meet's camera-off avatar tiles). Pair with a tight
    /// `colorHueRange` so a solid tile's thin background boundary is rejected too.
    var coloredBorderThinOnly: Bool = false

    // MARK: OCR filtering

    /// OCR results below this confidence are discarded before name filtering.
    var minTextConfidence: Float = 0.3
    /// NOTE(review): not read by any code visible in this struct (`isLikelyName`
    /// applies its own fixed length cap) — confirm whether it is still needed.
    var maxNameLength: Int = 40

    // MARK: Ring acceptance

    /// A highlight component needs at least this many points to be considered.
    var minHighlightPoints: Int = 6
    /// NOTE(review): not read by any code visible in this struct — confirm
    /// whether it is still needed.
    var highlightShareOfMax: Double = 0.35
    /// Minimum bbox width AND height of a component: a speaking border spans a
    /// sizable box, not a speck.
    var minRingSpan: Double = 60
    /// A real active-speaker cue is a thin RING (border) with an EMPTY interior.
    /// A solid camera-off avatar tile (Meet's orange/magenta fill) or a
    /// screen-share fill is a filled BLOB — its highlight points spread through
    /// the interior. A component is rejected when more than this fraction of its
    /// points fall in the central 60%×60% of its bbox (a hollow ring ≈ 0; a solid
    /// fill ≈ 0.36). Set ≥ 1 to disable.
    var maxInteriorFill: Double = 0.2
}
|
||
|
||
/// Geometry and detection thresholds used by every `analyze` call.
var config: Config = Config()
/// OCR engine whose `recognize(in:)` results supply the candidate name labels.
var recognizer: TextRecognizer = TextRecognizer()
|
||
|
||
/// Convenience entry point for video frames: converts `pixelBuffer` to a
/// `CGImage` and forwards to `analyze(cgImage:at:)`.
///
/// - Parameters:
///   - pixelBuffer: The frame to analyze.
///   - t: Timestamp passed through to each returned observation.
/// - Returns: The observations for the frame, or an empty array when the
///   buffer cannot be converted to a `CGImage`.
func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
    if let image = Self.cgImage(from: pixelBuffer) {
        return analyze(cgImage: image, at: t)
    }
    return []
}
|
||
|
||
/// Finds the participant labels in `cgImage` and flags which of them are
/// currently marked as speaking.
///
/// Pipeline:
/// 1. OCR the frame; keep results at or above `config.minTextConfidence` whose
///    cleaned text passes `isLikelyName`.
/// 2. Collect highlight points: coloured (saturated, or thin-edge only when
///    `config.coloredBorderThinOnly`) and/or thin near-white, per `config`.
/// 3. Drop highlight points that fall inside a padded name-label rect, so the
///    label's own text is not mistaken for a border.
/// 4. Cluster the remaining points into connected components, keep those that
///    are large enough and hollow, and attribute each to every label whose
///    centre lies inside the component's bounding box.
///
/// - Parameters:
///   - cgImage: The frame to analyze.
///   - t: Timestamp copied into each observation.
/// - Returns: One observation per accepted label, in OCR result order.
///   `speaking` is true when a qualifying ring encloses the label centre, and
///   `bbox` is then that ring's bounding box; otherwise `bbox` is the tile
///   estimated from the label. Empty when no label is accepted or the frame
///   sampler cannot be created.
func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
    struct Tile { let name: String; let textRect: CGRect; let tile: CGRect; let conf: Double }

    let w = cgImage.width, h = cgImage.height

    // Clean each OCR string once and reuse it for both the filter and the tile.
    let tiles: [Tile] = recognizer.recognize(in: cgImage).compactMap { r -> Tile? in
        guard r.confidence >= config.minTextConfidence else { return nil }
        let name = cleaned(r.text)
        guard Self.isLikelyName(name) else { return nil }
        return Tile(name: name,
                    textRect: pixelRect(r.boundingBox, w, h),
                    tile: tileRect(r.boundingBox, w, h),
                    conf: Double(r.confidence))
    }
    guard !tiles.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }

    // Highlight pixels: coloured (saturated) and/or white (thin near-white).
    var highlight: [CGPoint] = []
    if config.detectColoredBorder {
        if config.coloredBorderThinOnly {
            highlight += sampler.thinColoredPoints(threshold: config.colorSaturation,
                                                   minBrightness: config.colorMinBrightness,
                                                   hueRange: config.colorHueRange)
        } else {
            highlight += sampler.saturatedPoints(threshold: config.colorSaturation,
                                                 minBrightness: config.colorMinBrightness,
                                                 hueRange: config.colorHueRange)
        }
    }
    if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }

    // Drop points inside any (padded) name-text region so the name's own text
    // isn't mistaken for the border highlight.
    let exclusions = tiles.map {
        $0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35)
    }
    let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } }

    // Total order on candidate rings: smaller area first, then position/size as
    // tie-breakers, so the choice never depends on the order rings are visited.
    func isTighter(_ a: CGRect, than b: CGRect) -> Bool {
        let areaA = a.width * a.height, areaB = b.width * b.height
        if areaA != areaB { return areaA < areaB }
        if a.minY != b.minY { return a.minY < b.minY }
        if a.minX != b.minX { return a.minX < b.minX }
        return a.width < b.width
    }

    // Cluster the highlight pixels into connected RINGS (one per bordered tile),
    // then attribute each ring to the OCR name sitting inside it. The ring *is*
    // the tile, so this is independent of tile-size estimation and handles
    // several simultaneous borders naturally.
    let rings = Self.connectedComponents(points, maxGap: 18)
    var speakingBBox: [Int: CGRect] = [:] // tile index -> the ring bbox marking it speaking
    for ring in rings where ring.count >= config.minHighlightPoints {
        let bb = Self.boundingBox(ring)
        // A ring, not a speck.
        guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue }
        // A ring, not a filled tile.
        guard Self.isHollow(ring, bbox: bb, maxInteriorFill: config.maxInteriorFill) else { continue }
        for (i, tile) in tiles.enumerated()
        where bb.contains(CGPoint(x: tile.textRect.midX, y: tile.textRect.midY)) {
            // When several rings enclose the same label, keep the tightest one:
            // it is the border hugging this tile rather than a larger shape that
            // merely happens to surround it.
            if let current = speakingBBox[i], !isTighter(bb, than: current) { continue }
            speakingBBox[i] = bb
        }
    }

    return tiles.enumerated().map { idx, tile in
        SpeakerObservation(name: tile.name, speaking: speakingBBox[idx] != nil,
                           bbox: speakingBBox[idx] ?? tile.tile, confidence: tile.conf, t: t)
    }
}
|
||
|
||
/// Groups points into connected components: two points are directly linked
/// when they are within `maxGap` of each other on BOTH axes, and links are
/// transitive. Points are bucketed into a grid of `max(1, maxGap)`-sized
/// cells so each point is only compared against its 3×3 cell neighbourhood.
///
/// - Parameters:
///   - points: The points to cluster.
///   - maxGap: Largest per-axis distance that still links two points.
/// - Returns: The components, ordered by the input index of their first
///   point; points inside a component keep their input order. Empty input
///   yields an empty array. The order is deterministic so callers that
///   iterate the result behave the same on every run.
static func connectedComponents(_ points: [CGPoint], maxGap: Double) -> [[CGPoint]] {
    guard !points.isEmpty else { return [] }

    // Union-find over point indices; `find` shortens chains as it walks them.
    var parent = Array(points.indices)
    func find(_ x: Int) -> Int {
        var root = x
        while parent[root] != root {
            parent[root] = parent[parent[root]]
            root = parent[root]
        }
        return root
    }
    func union(_ a: Int, _ b: Int) {
        let ra = find(a), rb = find(b)
        if ra != rb { parent[ra] = rb }
    }

    // Spatial hash. Key collisions only add candidates; the explicit distance
    // test below decides whether two points are actually linked.
    let cell = max(1.0, maxGap)
    func key(_ cx: Int, _ cy: Int) -> Int { cx &* 100_003 &+ cy }
    var buckets: [Int: [Int]] = [:]
    for (i, p) in points.enumerated() {
        buckets[key(Int(p.x / cell), Int(p.y / cell)), default: []].append(i)
    }

    for (i, p) in points.enumerated() {
        let cx = Int(p.x / cell), cy = Int(p.y / cell)
        for dx in -1...1 {
            for dy in -1...1 {
                // `j > i` visits each unordered pair once.
                for j in buckets[key(cx + dx, cy + dy)] ?? [] where j > i {
                    if abs(points[j].x - p.x) <= maxGap, abs(points[j].y - p.y) <= maxGap {
                        union(i, j)
                    }
                }
            }
        }
    }

    // Emit components in first-appearance order instead of dictionary order.
    var slotForRoot: [Int: Int] = [:]
    var components: [[CGPoint]] = []
    for i in points.indices {
        let root = find(i)
        if let slot = slotForRoot[root] {
            components[slot].append(points[i])
        } else {
            slotForRoot[root] = components.count
            components.append([points[i]])
        }
    }
    return components
}
|
||
|
||
/// Decides whether a highlight component is a hollow ring (a border) rather
/// than a filled blob, by measuring the share of its points that land in the
/// central 60%×60% of `bbox`.
///
/// - Parameters:
///   - pts: The component's points.
///   - bbox: The component's bounding box.
///   - maxInteriorFill: Largest tolerated share of points in the central
///     region. A thin border scores ≈ 0; a solid camera-off avatar tile or a
///     screen-share fill scores ≈ 0.36. The check is skipped unless this
///     value is below 1.
/// - Returns: `true` when the check is skipped, when `pts` is empty, or when
///   the interior share does not exceed `maxInteriorFill`.
static func isHollow(_ pts: [CGPoint], bbox: CGRect, maxInteriorFill: Double) -> Bool {
    if !(maxInteriorFill < 1) || pts.isEmpty { return true }
    // Shrinking by 20% per side leaves the middle 60% on each axis.
    let core = bbox.insetBy(dx: bbox.width * 0.2, dy: bbox.height * 0.2)
    var inside = 0
    for point in pts where core.contains(point) {
        inside += 1
    }
    let share = Double(inside) / Double(pts.count)
    return share <= maxInteriorFill
}
|
||
|
||
/// Smallest axis-aligned rectangle containing every point in `pts`.
///
/// - Parameter pts: The points to enclose.
/// - Returns: The enclosing rect (zero-sized for a single point), or
///   `CGRect.null` when `pts` is empty. The null rect reports zero width and
///   height, so size-based guards on the result reject it.
static func boundingBox(_ pts: [CGPoint]) -> CGRect {
    // No points means no rectangle; seeding from sentinels would otherwise
    // yield a rect with a huge origin and a negative, non-finite size.
    guard let first = pts.first else { return .null }
    var minX = first.x, maxX = first.x
    var minY = first.y, maxY = first.y
    for p in pts.dropFirst() {
        minX = min(minX, p.x)
        maxX = max(maxX, p.x)
        minY = min(minY, p.y)
        maxY = max(maxY, p.y)
    }
    return CGRect(x: minX, y: minY, width: maxX - minX, height: maxY - minY)
}
|
||
|
||
/// Converts a Vision-style normalized box (unit square, bottom-left origin)
/// into a pixel rect with a top-left origin.
///
/// - Parameters:
///   - box: Normalized bounding box.
///   - w: Image width in pixels.
///   - h: Image height in pixels.
/// - Returns: The same region in pixel coordinates, with the y axis flipped.
private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
    let imageWidth = Double(w)
    let imageHeight = Double(h)
    // Flip vertically: the box's top edge (maxY) becomes the distance from the top.
    let left = box.minX * imageWidth
    let top = (1 - box.maxY) * imageHeight
    return CGRect(x: left, y: top, width: box.width * imageWidth, height: box.height * imageHeight)
}
|
||
|
||
/// Estimates the participant tile from its name label. The tile is
/// `config.tileExpandX` × the label width and `config.tileExpandY` × the label
/// height, positioned per `config.nameAnchor`:
/// - `.bottomCenter` (Signal): tile extends UP from a centered footer label.
/// - `.bottomLeft` (Meet/Zoom): label hugs the tile's bottom-left corner; the
///   tile extends UP and to the RIGHT of it.
/// - `.center`: tile centered on the label.
///
/// - Parameters:
///   - box: Normalized label box (converted via `pixelRect`).
///   - w: Image width in pixels.
///   - h: Image height in pixels.
/// - Returns: The estimated tile in pixel coordinates, clipped to the image.
private func tileRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
    let label = pixelRect(box, w, h)
    let tileWidth = label.width * config.tileExpandX
    let tileHeight = label.height * config.tileExpandY

    // Shared pieces: horizontally centered x, and the top edge used by both
    // bottom-anchored layouts (tile bottom sits 0.3 label heights below the label).
    let centeredX = label.midX - tileWidth / 2
    let raisedY = label.maxY + label.height * 0.3 - tileHeight

    let estimate: CGRect
    switch config.nameAnchor {
    case .bottomCenter:
        estimate = CGRect(x: centeredX, y: raisedY, width: tileWidth, height: tileHeight)
    case .bottomLeft:
        // Small left padding ≈ the corner gutter.
        estimate = CGRect(x: label.minX - label.height * 0.4, y: raisedY,
                          width: tileWidth, height: tileHeight)
    case .center:
        estimate = CGRect(x: centeredX, y: label.midY - tileHeight / 2,
                          width: tileWidth, height: tileHeight)
    }

    let frame = CGRect(x: 0, y: 0, width: Double(w), height: Double(h))
    return estimate.intersection(frame)
}
|
||
|
||
/// Normalizes an OCR'd label by stripping whitespace and stray punctuation
/// (`. , ; : · • -`) from BOTH ends, so "Mark." folds into "Mark" rather than
/// becoming a separate phantom speaker. Interior characters are untouched.
///
/// Whitespace and punctuation are trimmed together in one pass, so interleaved
/// edge noise such as "Mark. ." is removed completely instead of leaving a
/// trailing "." behind.
///
/// - Parameter s: Raw recognized text.
/// - Returns: The trimmed text; empty when `s` consists only of trimmed characters.
private func cleaned(_ s: String) -> String {
    let edgeNoise = CharacterSet.whitespacesAndNewlines
        .union(CharacterSet(charactersIn: ".,;:·•-"))
    return s.trimmingCharacters(in: edgeNoise)
}
|
||
|
||
/// True if `s` looks like a participant name label rather than UI chrome. Call
/// UIs are full of text (meeting URLs, the clock, "Add others", lobby dialogs)
/// that OCR would otherwise treat as speakers. Participant labels are short,
/// Title-Cased, 1–3 alphabetic words with no digits/URL/email punctuation.
/// Errs toward dropping: a missed name only means no visual hint.
///
/// Rules, all of which must hold:
/// - 2–30 characters;
/// - none of `@ : / \ | + * = < > #` and no ASCII digits;
/// - 1–3 space-separated words;
/// - every word starts with an uppercase character and contains only letters,
///   `'`, `.` or `-`;
/// - no word is domain-like, i.e. ends in a dot followed by 2–4 letters
///   ("WERUNBTC.COM", OCR'd "WERUNBTC.GOM"). The check runs on EVERY word, so
///   "WERUNBTC.COM Live" is rejected too, while "Cait's Phone", initials like
///   "MO", and dotted initials like "J.R. Smith" are kept.
///
/// - Parameter s: Candidate label text, expected to be already trimmed.
/// - Returns: Whether `s` should be treated as a participant name.
static func isLikelyName(_ s: String) -> Bool {
    guard (2...30).contains(s.count) else { return false }

    // Reject URLs, emails, meeting codes, times, button glyphs.
    let chrome = CharacterSet(charactersIn: "@:/\\|+*=<>#0123456789")
    guard s.rangeOfCharacter(from: chrome) == nil else { return false }

    let words = s.split(separator: " ")
    guard (1...3).contains(words.count) else { return false }

    let allowed = CharacterSet.letters.union(CharacterSet(charactersIn: "'.-"))
    for word in words {
        // Title Case.
        guard let first = word.first, first.isUppercase else { return false }
        guard word.unicodeScalars.allSatisfy({ allowed.contains($0) }) else { return false }

        // Domain-like token: the segment after the word's last dot is a 2–4
        // letter suffix. A trailing dot ("Mark.", "J.R.") leaves an empty
        // suffix and is fine.
        if let dot = word.lastIndex(of: ".") {
            let suffix = word[word.index(after: dot)...]
            if (2...4).contains(suffix.count), suffix.allSatisfy({ $0.isLetter }) {
                return false
            }
        }
    }
    return true
}
|
||
|
||
/// Core Image context shared by every `cgImage(from:)` call; allocating one
/// per frame is costly.
private static let ciContext = CIContext()

/// Renders a pixel buffer into a `CGImage` covering the buffer's full extent.
///
/// - Parameter pixelBuffer: The frame to convert.
/// - Returns: The rendered image, or `nil` when Core Image cannot create it.
static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
    let source = CIImage(cvPixelBuffer: pixelBuffer)
    let fullFrame = source.extent
    return ciContext.createCGImage(source, from: fullFrame)
}
|
||
}
|