Files
Grant Gilliam 39beccf7f4 Fix Meet visual: reject solid avatar tiles + screen-share OCR
Root cause of the "4 people → 2 speakers" Meet call: the colored-border detector
read solid camera-off avatar tiles (orange "J", magenta "G") as active speakers
for the ENTIRE call. Those whole-call phantom spans dominated backend name
attribution, collapsing every remote voice onto one name — and the giant filled
bbox also swallowed screen-share text (WERUNBTC.COM ×49) as a speaker.

Validated against 9 real fixtures (harness over the real MeetAdapter):

Detection:
- FrameSampler.thinColoredPoints: coloured counterpart of thinWhitePoints — keeps
  thin border/ring/pill edges, drops solid colour fills.
- GridCallAnalyzer.isHollow: reject a highlight component whose interior is filled
  (a solid tile) vs a hollow ring (a real border). Config.maxInteriorFill (0.2 default).
- MeetAdapter: detect thin BLUE edges only (hue 180–240°, measured from the
  fixtures), maxInteriorFill 0.3 (real Meet rings ≈0.2–0.3, solid tiles ≈0.36).
- Result on fixtures: John Arnold/Grant Gilliam (solid tiles) now NEVER detected;
  Matt Odell/Mark detected when their blue cue is present. Sparse but never wrong —
  correct for a naming hint over audio diarization.

OCR name hygiene:
- isLikelyName rejects domain-like screen-share text ("WERUNBTC.COM", OCR'd ".GOM").
- cleaned() strips trailing punctuation ("Mark." → "Mark").
- TimelineBuilder.canonicalizeByFrequency folds rare OCR misspellings into a
  dominant near-twin name ("Matt Odel"/"MattOdell" → "Matt Odell", "Mare" → "Mark").

Tests: hollow-ring, extended OCR filter, fuzzy-merge. 65 pass.
2026-06-08 16:18:52 -05:00

180 lines
9.7 KiB
Swift
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import XCTest
import CoreGraphics
import CoreText
@testable import Ten31Transcripts
/// Validates the visual adapters against synthetic call frames (no real
/// screenshots needed): OCR anchors the tiles and the highlight is attributed to
/// the correct speaker, tracking it as it moves.
///
/// Every synthetic frame is a 2x2 grid of dark tiles on a darker background,
/// rendered into a Core Graphics bitmap context (bottom-left origin). Each tile
/// carries a participant name, and at most one tile gets a stroked border that
/// stands in for the app's "speaking" cue.
final class GridCallAnalyzerTests: XCTestCase {

    // MARK: - Synthetic frame rendering

    /// One participant tile: the name printed on it and its rectangle in
    /// Core Graphics (bottom-left origin) coordinates.
    private typealias Tile = (name: String, rect: CGRect)

    /// Builds a single Core Text line for `s` in white Helvetica-Bold.
    ///
    /// - Parameters:
    ///   - s: Text to lay out.
    ///   - size: Font size in points.
    /// - Throws: An XCTest unwrap failure if the attributed string cannot be
    ///   created, so the calling test fails instead of crashing the process.
    private func makeLabel(_ s: String, size: CGFloat) throws -> CTLine {
        let font = CTFontCreateWithName("Helvetica-Bold" as CFString, size, nil)
        let attrs = [kCTFontAttributeName: font,
                     kCTForegroundColorAttributeName: CGColor(red: 1, green: 1, blue: 1, alpha: 1)] as CFDictionary
        let created: CFAttributedString? = CFAttributedStringCreate(nil, s as CFString, attrs)
        let text = try XCTUnwrap(created, "could not build attributed string for \(s)")
        return CTLineCreateWithAttributedString(text)
    }

    /// Draws `s` around `center`: the line origin is placed half the line's
    /// bounds width to the left of, and half its bounds height below, `center`.
    private func drawText(_ s: String, _ ctx: CGContext, center: CGPoint, size: CGFloat) throws {
        let line = try makeLabel(s, size: size)
        let bounds = CTLineGetBoundsWithOptions(line, [])
        ctx.textPosition = CGPoint(x: center.x - bounds.width / 2, y: center.y - bounds.height / 2)
        CTLineDraw(line, ctx)
    }

    /// Draws `s` with its line origin exactly at `leftBaseline` (left-aligned).
    private func leftText(_ s: String, _ ctx: CGContext, leftBaseline: CGPoint, size: CGFloat) throws {
        let line = try makeLabel(s, size: size)
        ctx.textPosition = leftBaseline
        CTLineDraw(line, ctx)
    }

    /// Renders a grid of participant tiles into a new RGBA bitmap.
    ///
    /// For each tile, in order: fill the tile, stroke the border if it is the
    /// speaking tile, then draw its label via `drawLabel`.
    ///
    /// - Parameters:
    ///   - width: Bitmap width in pixels.
    ///   - height: Bitmap height in pixels.
    ///   - tiles: Tiles to draw, in index order.
    ///   - speakingIndex: Index of the tile that gets the border; any index
    ///     outside `tiles` (e.g. `-1`) draws no border at all.
    ///   - border: Stroke colour of the speaking border.
    ///   - lineWidth: Stroke width. The stroked rect is inset by half of it so
    ///     the whole stroke stays inside the tile.
    ///   - drawLabel: Draws the tile's name into the context.
    /// - Returns: The rendered image.
    /// - Throws: An XCTest unwrap failure if the context or image cannot be created.
    private func renderGrid(width: Int,
                            height: Int,
                            tiles: [Tile],
                            speakingIndex: Int,
                            border: CGColor,
                            lineWidth: CGFloat,
                            drawLabel: (String, CGRect, CGContext) throws -> Void) throws -> CGImage {
        let ctx = try XCTUnwrap(
            CGContext(data: nil, width: width, height: height, bitsPerComponent: 8, bytesPerRow: 0,
                      space: CGColorSpaceCreateDeviceRGB(),
                      bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue),
            "could not create \(width)x\(height) bitmap context")

        // Window background.
        ctx.setFillColor(CGColor(red: 0.1, green: 0.1, blue: 0.12, alpha: 1))
        ctx.fill(CGRect(x: 0, y: 0, width: width, height: height))

        for (index, tile) in tiles.enumerated() {
            // Tile body, slightly lighter than the background.
            ctx.setFillColor(CGColor(red: 0.18, green: 0.18, blue: 0.2, alpha: 1))
            ctx.fill(tile.rect)
            if index == speakingIndex {
                ctx.setStrokeColor(border)
                ctx.setLineWidth(lineWidth)
                ctx.stroke(tile.rect.insetBy(dx: lineWidth / 2, dy: lineWidth / 2))
            }
            try drawLabel(tile.name, tile.rect, ctx)
        }
        return try XCTUnwrap(ctx.makeImage(), "could not snapshot bitmap context")
    }

    /// Signal-style 800x600 frame: the speaking cue is a WHITE (not coloured)
    /// 6 px border, drawn here as a plain rectangle, and each name is centred
    /// horizontally near the bottom of its tile.
    private func frame(speakingIndex: Int) throws -> CGImage {
        let tiles: [Tile] = [
            ("GRANT", CGRect(x: 40, y: 320, width: 340, height: 230)),
            ("SARAH", CGRect(x: 420, y: 320, width: 340, height: 230)),
            ("DMITRI", CGRect(x: 40, y: 50, width: 340, height: 230)),
            ("ALEX", CGRect(x: 420, y: 50, width: 340, height: 230)),
        ]
        return try renderGrid(width: 800, height: 600, tiles: tiles, speakingIndex: speakingIndex,
                              border: CGColor(red: 1, green: 1, blue: 1, alpha: 1),
                              lineWidth: 6) { name, rect, ctx in
            // Name footer at the BOTTOM of the tile (bottom-left origin: rect.minY).
            try drawText(name, ctx, center: CGPoint(x: rect.midX, y: rect.minY + 28), size: 46)
        }
    }

    /// Coloured-border 900x640 frame (Meet / Zoom / Teams style): the speaking
    /// cue is an 8 px border in `border`, and each name sits in the bottom-LEFT
    /// corner of its tile.
    private func coloredFrame(speakingIndex: Int, border: CGColor) throws -> CGImage {
        let tiles: [Tile] = [
            ("GRANT", CGRect(x: 40, y: 340, width: 380, height: 250)),
            ("SARAH", CGRect(x: 480, y: 340, width: 380, height: 250)),
            ("DMITRI", CGRect(x: 40, y: 50, width: 380, height: 250)),
            ("ALEX", CGRect(x: 480, y: 50, width: 380, height: 250)),
        ]
        return try renderGrid(width: 900, height: 640, tiles: tiles, speakingIndex: speakingIndex,
                              border: border, lineWidth: 8) { name, rect, ctx in
            // Name in the bottom-LEFT corner (bottom-left origin: near minX/minY).
            try leftText(name, ctx, leftBaseline: CGPoint(x: rect.minX + 16, y: rect.minY + 16), size: 40)
        }
    }

    // MARK: - White-border app (Signal)

    func testReadsNamesAndPicksHighlightedSpeaker() throws {
        let obs = try SignalAdapter().analyze(cgImage: frame(speakingIndex: 1), at: 0) // SARAH
        XCTAssertGreaterThanOrEqual(obs.count, 2)
        let speaking = obs.filter { $0.speaking }
        XCTAssertEqual(speaking.count, 1)
        // Unwrap instead of defaulting to 0 so a missing speaker is reported as
        // such rather than as a coordinate mismatch.
        let speaker = try XCTUnwrap(speaking.first, "no speaking observation found")
        // SARAH tile centre in top-left pixels: x = 420 + 170 = 590,
        // y = 600 - (320 + 115) = 165. The 160 px tolerance is smaller than the
        // distance between neighbouring tile centres (380 px horizontally,
        // 270 px vertically), so a bbox on the wrong tile still fails.
        XCTAssertEqual(speaker.bbox.midX, 590, accuracy: 160)
        XCTAssertEqual(speaker.bbox.midY, 165, accuracy: 160)
    }

    func testHighlightTracksToAnotherTile() throws {
        let obs = try SignalAdapter().analyze(cgImage: frame(speakingIndex: 2), at: 1) // DMITRI
        let speaking = obs.filter { $0.speaking }
        XCTAssertEqual(speaking.count, 1)
        let speaker = try XCTUnwrap(speaking.first, "no speaking observation found")
        // DMITRI tile centre in top-left pixels: x = 40 + 170 = 210,
        // y = 600 - (50 + 115) = 435.
        XCTAssertEqual(speaker.bbox.midX, 210, accuracy: 160)
        XCTAssertEqual(speaker.bbox.midY, 435, accuracy: 160)
    }

    func testWhiteBorderDetectorIgnoresColouredBorder() throws {
        // Signal looks only for the white border, so a coloured (Meet) border must
        // not register as a Signal speaker.
        let blue = CGColor(red: 0.16, green: 0.45, blue: 0.95, alpha: 1)
        let obs = try SignalAdapter().analyze(cgImage: coloredFrame(speakingIndex: 0, border: blue), at: 0)
        XCTAssertFalse(obs.contains(where: { $0.speaking }),
                       "Signal: a blue border must not mark anyone as speaking")
    }

    // MARK: - Coloured-border apps (Meet / Zoom / Teams): name in bottom-LEFT corner.

    func testMeetPicksBlueBorderedSpeaker() throws {
        let blue = CGColor(red: 0.16, green: 0.45, blue: 0.95, alpha: 1)
        // Tile order matches the table in coloredFrame(speakingIndex:border:).
        for (idx, name) in ["GRANT", "SARAH", "DMITRI", "ALEX"].enumerated() {
            let obs = try MeetAdapter().analyze(cgImage: coloredFrame(speakingIndex: idx, border: blue), at: 0)
            let speaking = Set(obs.filter { $0.speaking }.map { $0.name })
            XCTAssertEqual(speaking, [name], "Meet: only \(name) should be speaking")
        }
        // No border means no speaker (index -1 matches no tile).
        let none = try MeetAdapter().analyze(cgImage: coloredFrame(speakingIndex: -1, border: blue), at: 0)
        XCTAssertFalse(none.contains(where: { $0.speaking }),
                       "Meet: nobody should be speaking when no tile has a border")
    }

    func testZoomPicksGreenBorderedSpeaker() throws {
        let green = CGColor(red: 0.2, green: 0.85, blue: 0.3, alpha: 1)
        let obs = try ZoomAdapter().analyze(cgImage: coloredFrame(speakingIndex: 3, border: green), at: 0) // ALEX
        let speaking = Set(obs.filter { $0.speaking }.map { $0.name })
        XCTAssertEqual(speaking, ["ALEX"])
    }

    func testTeamsDetectsFaintVioletRing() throws {
        // Teams' brand violet #6264A7 is only ~0.41 saturation, which is below
        // the 0.5 default that Meet/Zoom inherited. The Teams adapter lowers the
        // threshold, so it must still pick the ring.
        let violet = CGColor(red: 0.384, green: 0.392, blue: 0.655, alpha: 1)
        let obs = try TeamsAdapter().analyze(cgImage: coloredFrame(speakingIndex: 2, border: violet), at: 0) // DMITRI
        let speaking = Set(obs.filter { $0.speaking }.map { $0.name })
        XCTAssertEqual(speaking, ["DMITRI"])
    }

    func testTeamsHueGateRejectsWrongColourBorder() throws {
        // A green border (wrong hue for Teams) must NOT register: the violet hue
        // gate is what keeps the lowered threshold from catching warm/other content.
        let green = CGColor(red: 0.2, green: 0.85, blue: 0.3, alpha: 1)
        let obs = try TeamsAdapter().analyze(cgImage: coloredFrame(speakingIndex: 0, border: green), at: 0)
        XCTAssertFalse(obs.contains(where: { $0.speaking }),
                       "Teams: a green border must not mark anyone as speaking")
    }

    // MARK: - OCR name hygiene

    func testNameFilterAgainstRealMeetOCR() {
        // The exact strings OCR pulled from a real Meet session. Only the first
        // group are participants; the rest are UI chrome that must NOT become speakers.
        let names = ["Grant Gilliam", "Caitlyn Viggiano", "Cait's Phone", "Grant", "Me", "Matt Odell"]
        let junk = ["11:43 AM | rvo-rmjg-rdq", "@ Embassy Er", "Admit 1 guest",
                    "Joined as grant.gilliam@gmail.com", "Others may see your video differently",
                    "Others might still see your full video.", "Your meeting's ready", "efforot",
                    "g* Add others", "g+ Add others", "meet.google.com/rvo-rmjg-rdq",
                    "permission before they can join.", "the meeting", "G",
                    // Screen-share domain text OCR'd as a name (incl. OCR'd TLDs).
                    "WERUNBTC.COM", "WERUNBTG.COM", "WERUNBTC.GOM"]
        for n in names { XCTAssertTrue(GridCallAnalyzer.isLikelyName(n), "should keep name: \(n)") }
        for j in junk { XCTAssertFalse(GridCallAnalyzer.isLikelyName(j), "should drop junk: \(j)") }
    }

    // MARK: - Hollow-ring detection

    func testHollowRingKeptFilledTileRejected() {
        // A thin ring (border): points only on the perimeter of a 120x120 box,
        // sampled every 4 units along all four edges.
        var ring: [CGPoint] = []
        for t in stride(from: 0.0, through: 120, by: 4) {
            ring.append(.init(x: t, y: 0)); ring.append(.init(x: t, y: 120))
            ring.append(.init(x: 0, y: t)); ring.append(.init(x: 120, y: t))
        }
        let ringBox = GridCallAnalyzer.boundingBox(ring)
        XCTAssertTrue(GridCallAnalyzer.isHollow(ring, bbox: ringBox, maxInteriorFill: 0.2),
                      "a perimeter-only ring should count as hollow")

        // A solid fill (camera-off avatar tile): points across the whole box,
        // on the same 4-unit lattice.
        var blob: [CGPoint] = []
        for x in stride(from: 0.0, through: 120, by: 4) {
            for y in stride(from: 0.0, through: 120, by: 4) { blob.append(.init(x: x, y: y)) }
        }
        let blobBox = GridCallAnalyzer.boundingBox(blob)
        XCTAssertFalse(GridCallAnalyzer.isHollow(blob, bbox: blobBox, maxInteriorFill: 0.2),
                       "a fully filled tile must not count as hollow")
    }
}