Files
ten31-transcripts/Ten31Transcripts/Visual/GridCallAnalyzer.swift
T
Grant Gilliam 7f16b29f56 Filter OCR to participant-name labels (kill visual-timeline noise)
Real Meet capture revealed the visual pipeline was treating ALL on-screen text as
participant names: meeting URL, clock, 'Add others' button, lobby 'Your meeting's
ready' dialog, 'Joined as …@gmail.com', etc. 46 of 52 'visual segments' in a real
session were phantom speakers. (The backend was unaffected — it diarizes from audio
and ignores names that match no voice cluster — but the visual_timeline.json and the
segment count were junk.)

GridCallAnalyzer.isLikelyName now gates OCR strings to things shaped like a name:
2–30 chars, 1–3 Title-Cased alphabetic words, no digits/URL/email/glyph punctuation.
Errs toward dropping (a missed name just loses a hint; audio diarization still runs).
Unit-tested against the EXACT 19 OCR strings from the real session: keeps the 5
real names, drops all 14 chrome strings. 28/28 XCTest.
2026-06-06 12:01:57 -05:00

172 lines
8.4 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import Foundation
import CoreGraphics
import CoreVideo
import CoreImage
/// Shared engine for tile-grid conferencing UIs: OCR the name/initials on each
/// tile, then mark the active speaker(s) by the speaking-highlight around their
/// tile. Handles BOTH highlight kinds:
/// - **white border** (Signal: 3px #ffffff ring, detected via thin near-white edges)
/// - **coloured border** (Zoom/Teams, detected via saturated edges)
///
/// The white name text is excluded so it can't be mistaken for the white border.
/// Geometry (`Config`) is a first pass: the tile-expansion factors are meant to be
/// calibrated per app against real screenshot fixtures. Detection *logic* is
/// validated on synthetic frames.
struct GridCallAnalyzer {
    /// Where the name label sits relative to its participant tile; drives how the
    /// tile rect is estimated from the OCR'd name box.
    enum NameAnchor {
        case bottomCenter  // Signal: centered footer; tile extends UP, centered on the name
        case bottomLeft    // Meet/Zoom: name in the bottom-left corner; tile extends UP and RIGHT
        case center        // name centered inside the tile
    }

    /// Tunable geometry and detection thresholds.
    struct Config {
        var tileExpandX = 2.4  // estimated tile width  = name width  * this
        var tileExpandY = 4.8  // estimated tile height = name height * this
        var nameAnchor: NameAnchor = .bottomCenter
        var detectColoredBorder = true
        var detectWhiteBorder = true
        // Coloured-border sensitivity. Default 0.5 suits vivid rings (Zoom green/
        // yellow); lower it for muted accent rings (Teams violet about 0.41, Meet's
        // light-blue glow about 0.44). `colorHueRange` (degrees) optionally pins the
        // ring's hue so a low threshold doesn't catch warm video; set it per
        // platform once calibrated against real screenshots.
        var colorSaturation: Double = 0.5
        var colorMinBrightness: Double = 60
        var colorHueRange: ClosedRange<Double>? = nil
        // OCR results with a confidence below this are ignored.
        var minTextConfidence: Float = 0.3
        // Longest accepted (trimmed) label, in characters. `isLikelyName` already
        // rejects anything over 30, so this only has an effect when lowered below 30.
        var maxNameLength = 40
        // A tile needs at least this many attributed highlight pixels to count as speaking...
        var minHighlightPoints = 6
        // ...and at least this fraction of the busiest tile's highlight-pixel count.
        var highlightShareOfMax = 0.35
    }

    var config = Config()
    var recognizer = TextRecognizer()

    /// Convenience overload: converts `pixelBuffer` to a `CGImage` and analyzes it.
    /// Returns an empty array when the conversion fails.
    func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
        guard let cg = Self.cgImage(from: pixelBuffer) else { return [] }
        return analyze(cgImage: cg, at: t)
    }

    /// OCRs `cgImage`, keeps the strings that look like participant names, estimates
    /// a tile rect for each, and flags the tiles whose rect collects enough highlight
    /// pixels as speaking.
    ///
    /// - Parameters:
    ///   - cgImage: The frame to analyze.
    ///   - t: Timestamp copied verbatim into every returned observation.
    /// - Returns: One observation per accepted name label (speaking or not); empty
    ///   when no label survives filtering or the frame sampler cannot be created.
    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
        let w = cgImage.width, h = cgImage.height
        struct Tile { let name: String; let textRect: CGRect; let tile: CGRect; let conf: Double }

        // Filter and build in one pass so each OCR string is trimmed only once.
        let tiles = recognizer.recognize(in: cgImage).compactMap { r -> Tile? in
            guard r.confidence >= config.minTextConfidence else { return nil }
            let name = cleaned(r.text)
            guard name.count <= config.maxNameLength, Self.isLikelyName(name) else { return nil }
            return Tile(name: name,
                        textRect: pixelRect(r.boundingBox, w, h),
                        tile: tileRect(r.boundingBox, w, h),
                        conf: Double(r.confidence))
        }
        guard !tiles.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }

        // Highlight pixels: coloured (saturated) and/or white (thin near-white).
        var highlight: [CGPoint] = []
        if config.detectColoredBorder {
            highlight += sampler.saturatedPoints(threshold: config.colorSaturation,
                                                 minBrightness: config.colorMinBrightness,
                                                 hueRange: config.colorHueRange)
        }
        if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }

        // Drop points inside any (slightly padded) name-text region so the white
        // name itself doesn't count as a white border.
        let exclusions = tiles.map {
            $0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35)
        }
        let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } }

        // Attribute each highlight pixel to EXACTLY ONE tile: the (no-margin)
        // estimated rect that contains it, nearest centre as tiebreak. Containment
        // (not a radius) keeps a border from bleeding into adjacent tiles even when
        // the tile-size estimate is rough; an under-sized estimate merely drops the
        // far edge rather than misattributing it.
        var counts = [Int](repeating: 0, count: tiles.count)
        for p in points {
            var best = -1
            var bestDistSq = Double.greatestFiniteMagnitude
            for (i, tile) in tiles.enumerated() where tile.tile.contains(p) {
                let dx = p.x - tile.tile.midX, dy = p.y - tile.tile.midY
                let dd = dx * dx + dy * dy
                if dd < bestDistSq { bestDistSq = dd; best = i }
            }
            if best >= 0 { counts[best] += 1 }
        }

        // `need` is never below `minHighlightPoints`, so `count >= need` also implies
        // that the busiest tile reached the minimum; no separate check is required.
        let maxCount = counts.max() ?? 0
        let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax))
        return zip(tiles, counts).map { tile, count in
            SpeakerObservation(name: tile.name, speaking: count >= need,
                               bbox: tile.tile, confidence: tile.conf, t: t)
        }
    }

    /// Vision normalized bbox (bottom-left origin) -> pixel rect (top-left origin).
    private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
        let W = Double(w), H = Double(h)
        return CGRect(x: box.minX * W, y: (1 - box.maxY) * H, width: box.width * W, height: box.height * H)
    }

    /// Estimate the participant tile from the name label, per the app's `nameAnchor`:
    /// - `.bottomCenter` (Signal): tile extends UP from a centered footer.
    /// - `.bottomLeft` (Meet/Zoom): name hugs the tile's bottom-left corner; the
    ///   tile extends UP and to the RIGHT of it.
    /// - `.center`: tile centered on the name.
    ///
    /// The result is clipped to the frame. If the estimate falls entirely outside
    /// the frame, `.zero` is returned instead of `CGRect.null`, so callers never
    /// receive a rect with infinite coordinates.
    private func tileRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
        let W = Double(w), H = Double(h)
        let name = pixelRect(box, w, h)
        let nw = name.width * config.tileExpandX
        let nh = name.height * config.tileExpandY
        let rect: CGRect
        switch config.nameAnchor {
        case .bottomCenter:
            let bottom = name.maxY + name.height * 0.3
            rect = CGRect(x: name.midX - nw / 2, y: bottom - nh, width: nw, height: nh)
        case .bottomLeft:
            let bottom = name.maxY + name.height * 0.3
            let left = name.minX - name.height * 0.4  // small left padding: the corner gutter
            rect = CGRect(x: left, y: bottom - nh, width: nw, height: nh)
        case .center:
            rect = CGRect(x: name.midX - nw / 2, y: name.midY - nh / 2, width: nw, height: nh)
        }
        let clipped = rect.intersection(CGRect(x: 0, y: 0, width: W, height: H))
        return clipped.isNull ? .zero : clipped
    }

    /// Trims leading/trailing whitespace and newlines from an OCR string.
    private func cleaned(_ s: String) -> String {
        s.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    // Built once: `isLikelyName` runs for every OCR string of every frame.
    /// Characters that mark URLs, emails, meeting codes, times, and button glyphs.
    private static let chromeCharacters = CharacterSet(charactersIn: "@:/\\|+*=<>#0123456789")
    /// Characters allowed inside a name word: letters plus apostrophe, period, hyphen.
    private static let nameCharacters = CharacterSet.letters.union(CharacterSet(charactersIn: "'.-"))

    /// True if `s` looks like a participant name label rather than UI chrome. Call
    /// UIs are full of text (meeting URLs, the clock, "Add others", lobby dialogs)
    /// that OCR would otherwise treat as speakers. A string is accepted when it:
    /// - is 2-30 characters long,
    /// - contains no digit and none of `@ : / \ | + * = < > #`,
    /// - splits on spaces into 1-3 words,
    /// - has every word start with an uppercase character and consist only of
    ///   letters, apostrophes, periods, and hyphens.
    ///
    /// Derived from real Meet/Zoom captures; errs toward dropping (a missed name
    /// just means no visual hint; the backend still diarizes from audio).
    static func isLikelyName(_ s: String) -> Bool {
        guard s.count >= 2, s.count <= 30 else { return false }
        // Reject URLs, emails, meeting codes, times, button glyphs.
        if s.rangeOfCharacter(from: chromeCharacters) != nil {
            return false
        }
        let words = s.split(separator: " ")
        guard (1...3).contains(words.count) else { return false }
        for word in words {
            guard let first = word.first, first.isUppercase else { return false }  // leading capital
            if word.unicodeScalars.contains(where: { !nameCharacters.contains($0) }) { return false }
        }
        return true
    }

    private static let ciContext = CIContext()

    /// Renders `pixelBuffer` into a `CGImage` covering its full extent, or `nil`
    /// if Core Image cannot create one.
    static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
        let ci = CIImage(cvPixelBuffer: pixelBuffer)
        return ciContext.createCGImage(ci, from: ci.extent)  // reuse; allocating a context per frame is costly
    }
}