Files
ten31-transcripts/Ten31Transcripts/Visual/GridCallAnalyzer.swift
T
Grant Gilliam c347acbd97 Adapters: add Meet, Zoom, Teams (coloured border) + adapter registry
Front-loads the remaining visual adapters per the Signal→Meet→Zoom priority.
All three reuse GridCallAnalyzer's coloured-border (saturated) detection path
and share the new bottom-left name anchor:

- GridCallAnalyzer: generalise nameAtBottom:Bool into a NameAnchor enum
  (.bottomCenter for Signal's centered footer, .bottomLeft for Meet/Zoom/Teams
  where the name hugs the tile's bottom-left corner, .center for completeness).
  tileRect estimates the tile up-and-right of a bottom-left name.
- MeetAdapter (Google-blue ring, browser-hosted), ZoomAdapter (green/yellow
  border), TeamsAdapter (violet ring): coloured-border on, white-border off,
  bottom-left names. Geometry constants are first-pass pending real fixtures.
- AdapterRegistry.adapter(for:) maps CallDetector.DetectedApp → AppAdapter so
  VisualObserver can be constructed when live visual capture is wired in;
  unmapped apps degrade to audio-only.

Synthetic 4-tile tests: Meet picks each blue-bordered speaker with no
adjacent-tile bleed, Zoom picks the green-bordered speaker, and Signal's
white-only detector correctly ignores a coloured border. 18/18 XCTest pass.
2026-06-06 09:57:53 -05:00

139 lines
6.5 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import Foundation
import CoreGraphics
import CoreVideo
import CoreImage
/// Shared engine for tile-grid conferencing UIs: OCR the name/initials on each
/// tile, then mark the active speaker(s) by the speaking-highlight around their
/// tile. Handles BOTH highlight kinds:
/// - **white border** (Signal: 3px #ffffff ring, detected via thin near-white edges)
/// - **coloured border** (Meet/Zoom/Teams, detected via saturated edges)
///
/// The white name text is excluded so it can't be mistaken for the white border.
/// Geometry (`Config`) is a first pass; tile expansion calibrates per app against
/// real screenshot fixtures. Detection *logic* is validated on synthetic frames.
struct GridCallAnalyzer {
/// Where the name label sits relative to its participant tile. The anchor
/// decides how `tileRect` grows the OCR'd name box into an estimated tile rect.
enum NameAnchor {
/// Signal: centered footer; the tile extends UP, centered on the name.
case bottomCenter
/// Meet/Zoom: name in the bottom-left corner; the tile extends UP and RIGHT.
case bottomLeft
/// Name centered inside the tile; the tile is centered on the name.
case center
}
/// Tunable geometry and detection thresholds for `GridCallAnalyzer`.
struct Config {
    /// Horizontal expansion: tile width ≈ name width × this factor.
    var tileExpandX: Double = 2.4
    /// Vertical expansion: tile height ≈ name height × this factor.
    var tileExpandY: Double = 4.8
    /// Where the name label sits inside its tile (selects the tile-rect estimate).
    var nameAnchor: NameAnchor = .bottomCenter
    /// When true, saturated (coloured) pixels count as highlight points.
    var detectColoredBorder: Bool = true
    /// When true, thin near-white pixels count as highlight points.
    var detectWhiteBorder: Bool = true
    /// Recognized text below this confidence is ignored.
    var minTextConfidence: Float = 0.3
    /// Trimmed text longer than this many characters is not treated as a name.
    var maxNameLength: Int = 40
    /// Minimum highlight points a tile needs before it can be marked speaking.
    var minHighlightPoints: Int = 6
    /// A tile must also reach this share of the busiest tile's highlight count.
    var highlightShareOfMax: Double = 0.35
}
/// Geometry and detection settings; starts with the `Config` defaults.
var config: Config = Config()
/// Text recognizer used to read the name labels in each frame.
var recognizer: TextRecognizer = TextRecognizer()
/// Analyzes a captured frame delivered as a pixel buffer.
///
/// The buffer is converted to a `CGImage` and handed to `analyze(cgImage:at:)`.
/// - Parameters:
///   - pixelBuffer: The frame to inspect.
///   - t: Timestamp stamped onto every returned observation.
/// - Returns: One observation per recognized name, or an empty array when the
///   buffer cannot be converted to an image.
func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
    Self.cgImage(from: pixelBuffer).map { analyze(cgImage: $0, at: t) } ?? []
}
/// Finds participant names in `cgImage` and flags which tiles carry a
/// speaking highlight.
///
/// Steps:
/// 1. Recognize text, keeping results at or above `config.minTextConfidence`
///    whose cleaned text is non-empty.
/// 2. Estimate a tile rect for every kept name.
/// 3. Collect highlight points (saturated and/or thin-white, per `config`),
///    dropping any that fall inside a padded name-text box.
/// 4. Credit each remaining point to exactly one tile and threshold the counts.
///
/// - Parameters:
///   - cgImage: The frame to inspect.
///   - t: Timestamp stamped onto every returned observation.
/// - Returns: One observation per kept name, in recognition order; empty when
///   no usable text is found or the frame sampler cannot be created.
func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
    struct Candidate {
        let label: String
        let nameBox: CGRect
        let tileBox: CGRect
        let confidence: Double
    }

    let frameWidth = cgImage.width
    let frameHeight = cgImage.height

    // Filter and convert in a single pass over the recognizer output.
    let candidates: [Candidate] = recognizer.recognize(in: cgImage).compactMap { hit in
        guard hit.confidence >= config.minTextConfidence else { return nil }
        let label = cleaned(hit.text)
        guard !label.isEmpty else { return nil }
        return Candidate(label: label,
                         nameBox: pixelRect(hit.boundingBox, frameWidth, frameHeight),
                         tileBox: tileRect(hit.boundingBox, frameWidth, frameHeight),
                         confidence: Double(hit.confidence))
    }
    // The sampler is only built once there is at least one name to attribute to.
    guard !candidates.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }

    // Highlight pixels: coloured (saturated) first, then white (thin near-white).
    var highlightPoints: [CGPoint] = []
    if config.detectColoredBorder {
        highlightPoints.append(contentsOf: sampler.saturatedPoints())
    }
    if config.detectWhiteBorder {
        highlightPoints.append(contentsOf: sampler.thinWhitePoints())
    }

    // Padded name-text boxes: points inside them are ignored so the white name
    // itself is never counted as a white border.
    let textZones = candidates.map { candidate -> CGRect in
        let box = candidate.nameBox
        return box.insetBy(dx: -box.width * 0.15, dy: -box.height * 0.35)
    }

    // Each point is credited to EXACTLY ONE tile: an estimated rect (no margin)
    // that contains it, with the nearest centre winning when several do. Using
    // containment rather than a radius keeps a border from bleeding into adjacent
    // tiles; an under-sized estimate merely drops the far edge.
    var hitCounts = [Int](repeating: 0, count: candidates.count)
    for point in highlightPoints {
        if textZones.contains(where: { $0.contains(point) }) { continue }

        var owner: Int? = nil
        var ownerDistSq = Double.greatestFiniteMagnitude
        for index in candidates.indices {
            let box = candidates[index].tileBox
            guard box.contains(point) else { continue }
            let dx = point.x - box.midX
            let dy = point.y - box.midY
            let distSq = Double(dx * dx + dy * dy)
            // Strict `<` means the earliest tile wins an exact tie.
            if distSq < ownerDistSq {
                ownerDistSq = distSq
                owner = index
            }
        }
        if let owner = owner {
            hitCounts[owner] += 1
        }
    }

    // A tile is speaking when it reaches both the absolute floor and the
    // configured share of the busiest tile's count.
    let peak = hitCounts.max() ?? 0
    let floorCount = config.minHighlightPoints
    let required = max(floorCount, Int(Double(peak) * config.highlightShareOfMax))
    let peakIsSignificant = peak >= floorCount

    return zip(candidates, hitCounts).map { candidate, hits in
        SpeakerObservation(name: candidate.label,
                           speaking: peakIsSignificant && hits >= required,
                           bbox: candidate.tileBox,
                           confidence: candidate.confidence,
                           t: t)
    }
}
/// Converts a Vision normalized bbox (bottom-left origin) → pixel rect (top-left origin).
/// - Parameters:
///   - box: Normalized rectangle to convert.
///   - w: Image width in pixels.
///   - h: Image height in pixels.
/// - Returns: The rectangle scaled to pixels with its vertical axis flipped.
private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
    let imageWidth = Double(w)
    let imageHeight = Double(h)
    // The box's top edge (maxY, measured from the bottom) becomes the pixel
    // rect's y, measured from the top.
    let origin = CGPoint(x: Double(box.minX) * imageWidth,
                         y: (1 - Double(box.maxY)) * imageHeight)
    let size = CGSize(width: Double(box.width) * imageWidth,
                      height: Double(box.height) * imageHeight)
    return CGRect(origin: origin, size: size)
}
/// Estimates the participant tile from its name label, per `config.nameAnchor`:
/// - `.bottomCenter` (Signal): the tile grows UP from a centered footer.
/// - `.bottomLeft` (Meet/Zoom): the name hugs the tile's bottom-left corner, so
///   the tile grows UP and to the RIGHT of it.
/// - `.center`: the tile is centered on the name.
///
/// - Parameters:
///   - box: Normalized name bbox (bottom-left origin), as accepted by `pixelRect`.
///   - w: Image width in pixels.
///   - h: Image height in pixels.
/// - Returns: The estimated tile in pixel coordinates, intersected with the image bounds.
private func tileRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect {
    let label = pixelRect(box, w, h)
    let tileWidth = label.width * config.tileExpandX
    let tileHeight = label.height * config.tileExpandY
    // Both bottom-anchored layouts place the tile's bottom edge slightly below
    // the label (0.3 × label height).
    let footerBottom = label.maxY + label.height * 0.3

    let origin: CGPoint
    switch config.nameAnchor {
    case .bottomCenter:
        origin = CGPoint(x: label.midX - tileWidth / 2, y: footerBottom - tileHeight)
    case .bottomLeft:
        // Small left padding ≈ the corner gutter.
        origin = CGPoint(x: label.minX - label.height * 0.4, y: footerBottom - tileHeight)
    case .center:
        origin = CGPoint(x: label.midX - tileWidth / 2, y: label.midY - tileHeight / 2)
    }

    let estimate = CGRect(origin: origin, size: CGSize(width: tileWidth, height: tileHeight))
    let imageBounds = CGRect(x: 0, y: 0, width: Double(w), height: Double(h))
    return estimate.intersection(imageBounds)
}
/// Normalizes recognized text into a candidate name.
/// - Parameter s: Raw recognized text.
/// - Returns: `s` with surrounding whitespace and newlines removed, or an empty
///   string when the trimmed text is longer than `config.maxNameLength`.
private func cleaned(_ s: String) -> String {
    let trimmed = s.trimmingCharacters(in: .whitespacesAndNewlines)
    guard trimmed.count <= config.maxNameLength else { return "" }
    return trimmed
}
/// Shared Core Image context, created once and reused for every conversion.
private static let ciContext = CIContext()

/// Renders a pixel buffer into a `CGImage` covering the buffer's full extent.
/// - Parameter pixelBuffer: The frame to convert.
/// - Returns: The rendered image, or `nil` if Core Image could not create it.
static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
    let source = CIImage(cvPixelBuffer: pixelBuffer)
    let fullFrame = source.extent
    // Reuse the shared context; allocating one per frame is costly.
    return ciContext.createCGImage(source, from: fullFrame)
}
}