diff --git a/Ten31Transcripts/Adapters/SignalAdapter.swift b/Ten31Transcripts/Adapters/SignalAdapter.swift index 5024b62..f3d428d 100644 --- a/Ten31Transcripts/Adapters/SignalAdapter.swift +++ b/Ten31Transcripts/Adapters/SignalAdapter.swift @@ -14,9 +14,16 @@ struct SignalAdapter: AppAdapter { init() { var config = GridCallAnalyzer.Config() - // Signal tiles are squarish with initials centred; tune with fixtures. - config.tileExpandX = 1.6 - config.tileExpandY = 1.8 + // Signal's speaking cue is a 3px WHITE rounded border (not coloured); the + // name is a bottom footer, so the tile extends up from it. Geometry tuned + // with real fixtures. (Gotchas, per Signal source: NO border in 1:1 calls — + // fall back to mic-VAD/audio pill — and in Speaker view the large tile is + // the speaker; both handled at a higher level later.) + config.nameAtBottom = true + config.detectWhiteBorder = true + config.detectColoredBorder = false + config.tileExpandX = 2.4 + config.tileExpandY = 4.8 self.analyzer = GridCallAnalyzer(config: config) } diff --git a/Ten31Transcripts/Visual/FrameSampler.swift b/Ten31Transcripts/Visual/FrameSampler.swift index bd943c3..30092ac 100644 --- a/Ten31Transcripts/Visual/FrameSampler.swift +++ b/Ten31Transcripts/Visual/FrameSampler.swift @@ -60,8 +60,40 @@ struct FrameSampler { return [top, bottom, left, right].map { meanSaturation(inPixelRect: $0) }.max() ?? 0 } + private func isNearWhite(_ x: Int, _ y: Int, minChannel: Double) -> Bool { + guard x >= 0, x < width, y >= 0, y < height else { return false } + let i = (y * width + x) * 4 + return Double(pixels[i]) >= minChannel + && Double(pixels[i + 1]) >= minChannel + && Double(pixels[i + 2]) >= minChannel + } + + /// Grid-sampled near-white pixels that lie on a THIN structure (the pixel exactly + /// `edgeGap` away in some axis direction is not near-white) — i.e. a border/ring/audio-bar, not a + /// solid white blob (face, bright video). 
This is Signal's white speaking + /// border (saturation ≈ 0, so `saturatedPoints` can't see it). + func thinWhitePoints(minChannel: Double = 200, edgeGap: Int = 6, gridStep: Int = 4) -> [CGPoint] { + var points: [CGPoint] = [] + var y = edgeGap + while y < height - edgeGap { + var x = edgeGap + while x < width - edgeGap { + if isNearWhite(x, y, minChannel: minChannel) { + let thin = !isNearWhite(x - edgeGap, y, minChannel: minChannel) + || !isNearWhite(x + edgeGap, y, minChannel: minChannel) + || !isNearWhite(x, y - edgeGap, minChannel: minChannel) + || !isNearWhite(x, y + edgeGap, minChannel: minChannel) + if thin { points.append(CGPoint(x: x, y: y)) } + } + x += gridStep + } + y += gridStep + } + return points + } + /// Grid-sampled pixel positions (top-left origin) that are strongly saturated - /// AND bright enough to be a UI highlight — i.e. the speaking ring/border. + /// AND bright enough to be a UI highlight — i.e. a coloured speaking ring/border. func saturatedPoints(threshold: Double = 0.5, minBrightness: Double = 60, gridStep: Int = 6) -> [CGPoint] { var points: [CGPoint] = [] var y = 0 diff --git a/Ten31Transcripts/Visual/GridCallAnalyzer.swift b/Ten31Transcripts/Visual/GridCallAnalyzer.swift index bf841e1..eed54b7 100644 --- a/Ten31Transcripts/Visual/GridCallAnalyzer.swift +++ b/Ten31Transcripts/Visual/GridCallAnalyzer.swift @@ -3,25 +3,26 @@ import CoreGraphics import CoreVideo import CoreImage -/// Shared engine for tile-grid conferencing UIs (Signal/Zoom/Teams): OCR the -/// name/initials on each tile, then mark the active speaker(s) by the saturated -/// coloured highlight around their tile. +/// Shared engine for tile-grid conferencing UIs: OCR the name/initials on each +/// tile, then mark the active speaker(s) by the speaking-highlight around their +/// tile. 
Handles BOTH highlight kinds: +/// - **white border** (Signal: 3px #ffffff ring — detected via thin near-white edges) +/// - **coloured border** (Zoom/Teams — detected via saturated edges) /// -/// Geometry (`Config`) is a first pass; the exact tile expansion and saturation -/// threshold get calibrated per app against real screenshot fixtures. The -/// detection *logic* (read names; pick the highlighted tile) is validated with -/// synthetic frames. +/// The white name text is excluded so it can't be mistaken for the white border. +/// Geometry (`Config`) is a first pass; tile expansion is calibrated per app against +/// real screenshot fixtures. Detection *logic* is validated on synthetic frames. struct GridCallAnalyzer { struct Config { - var tileExpandX = 1.8 // grow text bbox → approx tile (for the reported bbox) - var tileExpandY = 2.6 + var tileExpandX = 2.4 // tile width ≈ name width × this + var tileExpandY = 4.8 // tile height ≈ name height × this + var nameAtBottom = true // Signal/most: name footer sits at the tile bottom + var detectColoredBorder = true + var detectWhiteBorder = true var minTextConfidence: Float = 0.3 var maxNameLength = 40 - /// Highlight detection: a name is "speaking" if enough strongly-saturated - /// highlight pixels sit within `highlightRadiusFraction` of its label. 
- var highlightRadiusFraction = 0.22 // of max(frame W,H) var minHighlightPoints = 6 - var highlightShareOfMax = 0.35 // must be ≥ this fraction of the busiest tile + var highlightShareOfMax = 0.35 } var config = Config() @@ -37,24 +38,42 @@ struct GridCallAnalyzer { $0.confidence >= config.minTextConfidence && !cleaned($0.text).isEmpty } guard !texts.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] } - let w = cgImage.width, h = cgImage.height - let tiles = texts.map { r -> (name: String, center: CGPoint, rect: CGRect, conf: Double) in - let rect = tileRect(r.boundingBox, imageW: w, imageH: h) - let cx = r.boundingBox.midX * Double(w) - let cy = (1 - r.boundingBox.midY) * Double(h) // flip Y to top-left origin - return (cleaned(r.text), CGPoint(x: cx, y: cy), rect, Double(r.confidence)) + + struct Tile { let name: String; let textRect: CGRect; let tile: CGRect; let conf: Double } + let tiles = texts.map { r in + Tile(name: cleaned(r.text), + textRect: pixelRect(r.boundingBox, w, h), + tile: tileRect(r.boundingBox, w, h), + conf: Double(r.confidence)) } - // Find highlight pixels once, attribute each to the nearest name label. - let points = sampler.saturatedPoints() - let radius = Double(max(w, h)) * config.highlightRadiusFraction - let r2 = radius * radius - let counts = tiles.map { tile -> Int in - points.reduce(0) { acc, p in - let dx = Double(p.x) - tile.center.x, dy = Double(p.y) - tile.center.y - return acc + (dx * dx + dy * dy <= r2 ? 1 : 0) + // Highlight pixels: coloured (saturated) and/or white (thin near-white). + var highlight: [CGPoint] = [] + if config.detectColoredBorder { highlight += sampler.saturatedPoints() } + if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() } + + // Drop points inside any name-text region so the white name itself doesn't count. 
+ let exclusions = tiles.map { + $0.textRect.insetBy(dx: -$0.textRect.width * 0.15, dy: -$0.textRect.height * 0.35) + } + let points = highlight.filter { p in !exclusions.contains { $0.contains(p) } } + + // Attribute each highlight pixel to EXACTLY ONE tile — the (no-margin) + // estimated rect that contains it, nearest centre as tiebreak. Containment + // (not a radius) keeps a border from bleeding into adjacent tiles even when + // the tile-size estimate is rough; an under-sized estimate merely drops the + // far edge rather than misattributing it. + var counts = [Int](repeating: 0, count: tiles.count) + for p in points { + var best = -1 + var bestDistSq = Double.greatestFiniteMagnitude + for (i, tile) in tiles.enumerated() where tile.tile.contains(p) { + let dx = p.x - tile.tile.midX, dy = p.y - tile.tile.midY + let dd = dx * dx + dy * dy + if dd < bestDistSq { bestDistSq = dd; best = i } } + if best >= 0 { counts[best] += 1 } } let maxCount = counts.max() ?? 0 let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax)) @@ -62,21 +81,31 @@ struct GridCallAnalyzer { return tiles.enumerated().map { idx, tile in let speaking = maxCount >= config.minHighlightPoints && counts[idx] >= need return SpeakerObservation(name: tile.name, speaking: speaking, - bbox: tile.rect, confidence: tile.conf, t: t) + bbox: tile.tile, confidence: tile.conf, t: t) } } - /// Vision normalized bbox (bottom-left origin) → pixel tile rect (top-left), - /// expanded around the text centre to approximate the whole tile. 
- private func tileRect(_ box: CGRect, imageW: Int, imageH: Int) -> CGRect { - let W = Double(imageW), H = Double(imageH) - let pw = box.width * W - let ph = box.height * H - let cx = (box.midX) * W - let cy = (1 - box.midY) * H // flip Y to top-left origin - let nw = pw * config.tileExpandX - let nh = ph * config.tileExpandY - let rect = CGRect(x: cx - nw / 2, y: cy - nh / 2, width: nw, height: nh) + /// Vision normalized bbox (bottom-left origin) → pixel rect (top-left origin). + private func pixelRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect { + let W = Double(w), H = Double(h) + return CGRect(x: box.minX * W, y: (1 - box.maxY) * H, width: box.width * W, height: box.height * H) + } + + /// Estimate the participant tile from the name label. With `nameAtBottom`, the + /// tile extends UP from the footer (Signal); otherwise it's centred on the name. + private func tileRect(_ box: CGRect, _ w: Int, _ h: Int) -> CGRect { + let W = Double(w), H = Double(h) + let name = pixelRect(box, w, h) + let nw = name.width * config.tileExpandX + let nh = name.height * config.tileExpandY + let cx = name.midX + let rect: CGRect + if config.nameAtBottom { + let bottom = name.maxY + name.height * 0.3 + rect = CGRect(x: cx - nw / 2, y: bottom - nh, width: nw, height: nh) + } else { + rect = CGRect(x: cx - nw / 2, y: name.midY - nh / 2, width: nw, height: nh) + } return rect.intersection(CGRect(x: 0, y: 0, width: W, height: H)) } diff --git a/Ten31TranscriptsTests/GridCallAnalyzerTests.swift b/Ten31TranscriptsTests/GridCallAnalyzerTests.swift index 471cd88..4d708bf 100644 --- a/Ten31TranscriptsTests/GridCallAnalyzerTests.swift +++ b/Ten31TranscriptsTests/GridCallAnalyzerTests.swift @@ -34,10 +34,12 @@ final class GridCallAnalyzerTests: XCTestCase { for (i, (name, rect)) in rects.enumerated() { ctx.setFillColor(CGColor(red: 0.18, green: 0.18, blue: 0.2, alpha: 1)); ctx.fill(rect) if i == speakingIndex { - ctx.setStrokeColor(CGColor(red: 0.1, green: 0.85, blue: 0.2, alpha: 1)); 
ctx.setLineWidth(14) - ctx.stroke(rect.insetBy(dx: 7, dy: 7)) + // Signal's cue: a WHITE rounded border (not coloured). + ctx.setStrokeColor(CGColor(red: 1, green: 1, blue: 1, alpha: 1)); ctx.setLineWidth(6) + ctx.stroke(rect.insetBy(dx: 3, dy: 3)) } - drawText(name, ctx, center: CGPoint(x: rect.midX, y: rect.midY), size: 54) + // Name footer at the BOTTOM of the tile (bottom-left origin: rect.minY). + drawText(name, ctx, center: CGPoint(x: rect.midX, y: rect.minY + 28), size: 46) } return ctx.makeImage()! }