diff --git a/Ten31Transcripts/Adapters/MeetAdapter.swift b/Ten31Transcripts/Adapters/MeetAdapter.swift
index 959dbaf..ca483c7 100644
--- a/Ten31Transcripts/Adapters/MeetAdapter.swift
+++ b/Ten31Transcripts/Adapters/MeetAdapter.swift
@@ -32,6 +32,16 @@ struct MeetAdapter: AppAdapter {
         // The bright ring (#1a73e8) is ~0.89 sat but the lighter glow (#8ab4f8) is
         // ~0.44, below the 0.5 default — lower the threshold so the glow registers.
         config.colorSaturation = 0.35
+        // Meet's active cue is a thin BLUE (≈210°) ring + audio pill. Detect thin blue
+        // EDGES only, gated to blue: this rejects solid camera-off avatar tiles (orange
+        // ≈30°, magenta ≈340°), which otherwise read as "speaking" for the whole call
+        // and collapse every remote voice onto one name. Validated on real fixtures.
+        config.coloredBorderThinOnly = true
+        config.colorHueRange = 180...240
+        // Meet's blue border is faint; real rings measure ≈0.20–0.30 interior fill while
+        // solid tiles measure ≈0.36, so allow a higher fill here than the 0.2 default to
+        // recover real borders without readmitting the solid-tile false positives.
+        config.maxInteriorFill = 0.3
         config.tileExpandX = 3.0
         config.tileExpandY = 5.0
         self.analyzer = GridCallAnalyzer(config: config)
diff --git a/Ten31Transcripts/Visual/FrameSampler.swift b/Ten31Transcripts/Visual/FrameSampler.swift
index 7419604..71b9074 100644
--- a/Ten31Transcripts/Visual/FrameSampler.swift
+++ b/Ten31Transcripts/Visual/FrameSampler.swift
@@ -120,6 +120,43 @@ struct FrameSampler {
         return points
     }
 
+    /// Grid-sampled saturated pixels that lie on a THIN structure (a non-saturated
+    /// pixel within `edgeGap` on some axis) — the coloured counterpart of
+    /// `thinWhitePoints`. This keeps a thin speaking BORDER/ring/pill but drops the
+    /// solid interior of a colour FILL (e.g. Meet's orange/magenta camera-off avatar
+    /// tiles), whose pixels are surrounded by the same colour. Pair with `hueRange`
+    /// to keep only the cue's colour (Meet's blue ring) and reject the thin edges a
+    /// solid tile still has against the background (orange/magenta boundaries).
+    func thinColoredPoints(threshold: Double = 0.35, minBrightness: Double = 60,
+                           hueRange: ClosedRange<Double>? = nil,
+                           edgeGap: Int = 6, gridStep: Int = 4) -> [CGPoint] {
+        func isCue(_ x: Int, _ y: Int) -> Bool {
+            guard x >= 0, x < width, y >= 0, y < height else { return false }
+            let i = (y * width + x) * 4
+            let r = Double(pixels[i]), g = Double(pixels[i + 1]), b = Double(pixels[i + 2])
+            let mx = max(r, g, b), mn = min(r, g, b)
+            let sat = mx > 0 ? (mx - mn) / mx : 0
+            guard sat > threshold, mx > minBrightness else { return false }
+            if let hr = hueRange { return hr.contains(Self.hueDegrees(r, g, b, mx, mn)) }
+            return true
+        }
+        var points: [CGPoint] = []
+        var y = edgeGap
+        while y < height - edgeGap {
+            var x = edgeGap
+            while x < width - edgeGap {
+                if isCue(x, y) {
+                    let thin = !isCue(x - edgeGap, y) || !isCue(x + edgeGap, y)
+                        || !isCue(x, y - edgeGap) || !isCue(x, y + edgeGap)
+                    if thin { points.append(CGPoint(x: x, y: y)) }
+                }
+                x += gridStep
+            }
+            y += gridStep
+        }
+        return points
+    }
+
     /// HSV hue in degrees (0…360) from RGB and its precomputed max/min channels.
     private static func hueDegrees(_ r: Double, _ g: Double, _ b: Double, _ mx: Double, _ mn: Double) -> Double {
         let d = mx - mn
diff --git a/Ten31Transcripts/Visual/GridCallAnalyzer.swift b/Ten31Transcripts/Visual/GridCallAnalyzer.swift
index d515082..9a70e64 100644
--- a/Ten31Transcripts/Visual/GridCallAnalyzer.swift
+++ b/Ten31Transcripts/Visual/GridCallAnalyzer.swift
@@ -35,11 +35,21 @@ struct GridCallAnalyzer {
         var colorSaturation: Double = 0.5
         var colorMinBrightness: Double = 60
         var colorHueRange: ClosedRange<Double>? = nil
+        // When true, the coloured highlight is detected from THIN edges only (drops
+        // solid colour fills like Meet's camera-off avatar tiles). Pair with a tight
+        // `colorHueRange` so a solid tile's thin background boundary is rejected too.
+        var coloredBorderThinOnly = false
         var minTextConfidence: Float = 0.3
         var maxNameLength = 40
         var minHighlightPoints = 6
         var highlightShareOfMax = 0.35
         var minRingSpan: Double = 60 // a speaking border spans a sizable box, not a speck
+        // A real active-speaker cue is a thin RING (border) with an EMPTY interior.
+        // A solid camera-off avatar tile (Meet's orange/magenta fill) or a screen-share
+        // fill is a filled BLOB — its highlight points spread through the interior. Reject
+        // a component when more than this fraction of its points fall in the central
+        // 60%×60% of its bbox (a hollow ring ≈ 0; a solid fill ≈ 0.36). Set ≥ 1 to disable.
+        var maxInteriorFill: Double = 0.2
     }
 
     var config = Config()
@@ -68,9 +78,13 @@ struct GridCallAnalyzer {
         // Highlight pixels: coloured (saturated) and/or white (thin near-white).
         var highlight: [CGPoint] = []
         if config.detectColoredBorder {
-            highlight += sampler.saturatedPoints(threshold: config.colorSaturation,
-                                                 minBrightness: config.colorMinBrightness,
-                                                 hueRange: config.colorHueRange)
+            highlight += config.coloredBorderThinOnly
+                ? sampler.thinColoredPoints(threshold: config.colorSaturation,
+                                            minBrightness: config.colorMinBrightness,
+                                            hueRange: config.colorHueRange)
+                : sampler.saturatedPoints(threshold: config.colorSaturation,
+                                          minBrightness: config.colorMinBrightness,
+                                          hueRange: config.colorHueRange)
         }
         if config.detectWhiteBorder { highlight += sampler.thinWhitePoints() }
 
@@ -89,7 +103,8 @@ struct GridCallAnalyzer {
         var speakingBBox: [Int: CGRect] = [:] // tile index -> the ring bbox marking it speaking
         for ring in rings where ring.count >= config.minHighlightPoints {
             let bb = Self.boundingBox(ring)
-            guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue } // a ring, not a blob
+            guard bb.width >= config.minRingSpan, bb.height >= config.minRingSpan else { continue } // a ring, not a speck
+            guard Self.isHollow(ring, bbox: bb, maxInteriorFill: config.maxInteriorFill) else { continue } // a ring, not a filled tile
             for (i, tile) in tiles.enumerated() where bb.contains(CGPoint(x: tile.textRect.midX, y: tile.textRect.midY)) {
                 speakingBBox[i] = bb
             }
@@ -128,6 +143,18 @@ struct GridCallAnalyzer {
         return Array(groups.values)
     }
 
+    /// True if `pts` form a hollow ring (border) rather than a filled blob: at most
+    /// `maxInteriorFill` of the points fall in the central 60%×60% of `bbox`. A thin
+    /// border has an empty interior (≈ 0); a solid camera-off avatar tile or a
+    /// screen-share fill spreads points through the interior (≈ 0.36). Disabled when
+    /// `maxInteriorFill >= 1`.
+    static func isHollow(_ pts: [CGPoint], bbox: CGRect, maxInteriorFill: Double) -> Bool {
+        guard maxInteriorFill < 1, !pts.isEmpty else { return true }
+        let inner = bbox.insetBy(dx: bbox.width * 0.2, dy: bbox.height * 0.2)
+        let innerCount = pts.reduce(into: 0) { if inner.contains($1) { $0 += 1 } }
+        return Double(innerCount) / Double(pts.count) <= maxInteriorFill
+    }
+
     static func boundingBox(_ pts: [CGPoint]) -> CGRect {
         var minX = Double.greatestFiniteMagnitude, minY = minX, maxX = -minX, maxY = -minX
         for p in pts { minX = min(minX, p.x); minY = min(minY, p.y); maxX = max(maxX, p.x); maxY = max(maxY, p.y) }
@@ -166,7 +193,11 @@ struct GridCallAnalyzer {
     }
 
     private func cleaned(_ s: String) -> String {
+        // Trim whitespace and any stray leading/trailing punctuation OCR tacks on, so
+        // "Mark." folds into "Mark" rather than becoming a separate phantom speaker.
         s.trimmingCharacters(in: .whitespacesAndNewlines)
+            .trimmingCharacters(in: CharacterSet(charactersIn: ".,;:·•-"))
+            .trimmingCharacters(in: .whitespacesAndNewlines)
     }
 
     /// True if `s` looks like a participant name label rather than UI chrome. Call
@@ -181,6 +212,14 @@ struct GridCallAnalyzer {
         if s.rangeOfCharacter(from: CharacterSet(charactersIn: "@:/\\|+*=<>#0123456789")) != nil {
             return false
         }
+        // Reject domain-like screen-share text (e.g. "WERUNBTC.COM", OCR'd "WERUNBTC.GOM"):
+        // a token whose final dotted segment is a 2–4 letter suffix. Real names don't end
+        // in a TLD; this keeps "Cait's Phone" and initials like "MO".
+        let lower = s.lowercased()
+        if let dot = lower.lastIndex(of: "."), lower.index(after: dot) < lower.endIndex {
+            let suffix = lower[lower.index(after: dot)...]
+            if (2...4).contains(suffix.count) && suffix.allSatisfy({ $0.isLetter }) { return false }
+        }
         let words = s.split(separator: " ")
         guard (1...3).contains(words.count) else { return false }
         let allowed = CharacterSet.letters.union(CharacterSet(charactersIn: "'.-"))
diff --git a/Ten31Transcripts/Visual/TimelineBuilder.swift b/Ten31Transcripts/Visual/TimelineBuilder.swift
index 162eb2b..a34554d 100644
--- a/Ten31Transcripts/Visual/TimelineBuilder.swift
+++ b/Ten31Transcripts/Visual/TimelineBuilder.swift
@@ -93,9 +93,64 @@ final class TimelineBuilder {
             closeSegment(name: name, state: st)
             states[name]?.open = false
         }
+        segments = Self.canonicalizeByFrequency(segments)
         segments.sort { $0.start < $1.start }
     }
 
+    /// Fold rare OCR misspellings into the dominant name they're a typo of: a name with
+    /// little total time is remapped to a much longer-running name with the same initial
+    /// within a small edit distance (e.g. "Matt Odel"/"MattOdell"/"Mare" → "Matt Odell"/
+    /// "Mark"). Conservative by design — it won't merge two well-attested speakers, only
+    /// a transient variant into its clearly-dominant canonical. Chains (a typo of a typo)
+    /// are followed to the final name so no intermediate variant survives. Pure/testable.
+    static func canonicalizeByFrequency(_ segs: [VisualTimeline.Segment],
+                                        minorMaxSec: Double = 5, dominanceRatio: Double = 8,
+                                        maxEdits: Int = 2) -> [VisualTimeline.Segment] {
+        var dur: [String: Double] = [:]
+        for s in segs { dur[s.name, default: 0] += s.end - s.start }
+        // Sorted: Dictionary order varies per process, and equal-duration candidates must
+        // tie-break the same way on every run.
+        let names = dur.keys.sorted()
+        var remap: [String: String] = [:]
+        for minor in names {
+            guard let minorDur = dur[minor], minorDur <= minorMaxSec, let mInit = minor.first else { continue }
+            var best: String?, bestDur = 0.0
+            for major in names where major != minor {
+                guard let majorDur = dur[major], majorDur >= minorDur * dominanceRatio, majorDur > bestDur,
+                      major.first == mInit else { continue }
+                if levenshtein(minor.lowercased(), major.lowercased()) <= maxEdits { best = major; bestDur = majorDur }
+            }
+            if let b = best { remap[minor] = b }
+        }
+        guard !remap.isEmpty else { return segs }
+        return segs.map { s in
+            guard var canonical = remap[s.name] else { return s }
+            // Follow typo-of-a-typo chains; the hop cap guarantees termination even if a
+            // caller passes dominanceRatio <= 1, which could otherwise create a cycle.
+            var hops = 0
+            while hops < remap.count, let next = remap[canonical] { canonical = next; hops += 1 }
+            return VisualTimeline.Segment(start: s.start, end: s.end, name: canonical,
+                                          confidence: s.confidence, source: s.source)
+        }
+    }
+
+    /// Levenshtein edit distance (small strings — names).
+    static func levenshtein(_ a: String, _ b: String) -> Int {
+        let x = Array(a), y = Array(b)
+        if x.isEmpty { return y.count }; if y.isEmpty { return x.count }
+        var prev = Array(0...y.count)
+        var cur = [Int](repeating: 0, count: y.count + 1)
+        for i in 1...x.count {
+            cur[0] = i
+            for j in 1...y.count {
+                cur[j] = x[i-1] == y[j-1] ? prev[j-1]
+                    : Swift.min(prev[j-1], prev[j], cur[j-1]) + 1
+            }
+            swap(&prev, &cur)
+        }
+        return prev[y.count]
+    }
+
     // MARK: - Internal
 
     private struct NameState {
diff --git a/Ten31TranscriptsTests/GridCallAnalyzerTests.swift b/Ten31TranscriptsTests/GridCallAnalyzerTests.swift
index 2f192ec..fb0ba1a 100644
--- a/Ten31TranscriptsTests/GridCallAnalyzerTests.swift
+++ b/Ten31TranscriptsTests/GridCallAnalyzerTests.swift
@@ -138,16 +138,37 @@ final class GridCallAnalyzerTests: XCTestCase {
     func testNameFilterAgainstRealMeetOCR() {
         // The exact strings OCR pulled from a real Meet session — only the first
         // group are participants; the rest are UI chrome that must NOT become speakers.
-        let names = ["Grant Gilliam", "Caitlyn Viggiano", "Cait's Phone", "Grant", "Me"]
+        let names = ["Grant Gilliam", "Caitlyn Viggiano", "Cait's Phone", "Grant", "Me", "Matt Odell"]
         let junk = ["11:43 AM | rvo-rmjg-rdq", "@ Embassy Er", "Admit 1 guest",
                     "Joined as grant.gilliam@gmail.com", "Others may see your video differently",
                     "Others might still see your full video.", "Your meeting's ready", "efforot",
                     "g* Add others", "g+ Add others", "meet.google.com/rvo-rmjg-rdq",
-                    "permission before they can join.", "the meeting", "G"]
+                    "permission before they can join.", "the meeting", "G",
+                    // Screen-share domain text OCR'd as a name (incl. OCR'd TLDs).
+                    "WERUNBTC.COM", "WERUNBTG.COM", "WERUNBTC.GOM"]
         for n in names { XCTAssertTrue(GridCallAnalyzer.isLikelyName(n), "should keep name: \(n)") }
         for j in junk { XCTAssertFalse(GridCallAnalyzer.isLikelyName(j), "should drop junk: \(j)") }
     }
 
+    func testHollowRingKeptFilledTileRejected() {
+        // A thin ring (border): points only on the perimeter of a 120×120 box.
+        var ring: [CGPoint] = []
+        for t in stride(from: 0.0, through: 120, by: 4) {
+            ring.append(.init(x: t, y: 0)); ring.append(.init(x: t, y: 120))
+            ring.append(.init(x: 0, y: t)); ring.append(.init(x: 120, y: t))
+        }
+        let rbb = GridCallAnalyzer.boundingBox(ring)
+        XCTAssertTrue(GridCallAnalyzer.isHollow(ring, bbox: rbb, maxInteriorFill: 0.2))
+
+        // A solid fill (camera-off avatar tile): points across the whole box.
+        var blob: [CGPoint] = []
+        for x in stride(from: 0.0, through: 120, by: 4) {
+            for y in stride(from: 0.0, through: 120, by: 4) { blob.append(.init(x: x, y: y)) }
+        }
+        let bbb = GridCallAnalyzer.boundingBox(blob)
+        XCTAssertFalse(GridCallAnalyzer.isHollow(blob, bbox: bbb, maxInteriorFill: 0.2))
+    }
+
     func testWhiteBorderDetectorIgnoresColouredBorder() {
         // Signal looks only for the white border, so a coloured (Meet) border must
         // not register as a Signal speaker.
diff --git a/Ten31TranscriptsTests/VisualObserverTests.swift b/Ten31TranscriptsTests/VisualObserverTests.swift
index bca8da2..be70dde 100644
--- a/Ten31TranscriptsTests/VisualObserverTests.swift
+++ b/Ten31TranscriptsTests/VisualObserverTests.swift
@@ -11,6 +11,38 @@ final class VisualObserverTests: XCTestCase {
         (id, CGRect(x: 0, y: 0, width: w, height: h))
     }
 
+    func testCanonicalizeFoldsOcrMisspellingsIntoDominantName() {
+        func seg(_ s: Double, _ e: Double, _ n: String) -> VisualTimeline.Segment {
+            .init(start: s, end: e, name: n, confidence: 0.9, source: "vision")
+        }
+        let segs = [
+            seg(0, 1689, "Matt Odell"), // dominant
+            seg(1700, 1702, "Matt Odel"), // OCR typo → fold
+            seg(1702, 1702.3, "MattOdell"), // dropped-space typo → fold
+            seg(0, 1155, "Mark"), // dominant
+            seg(1200, 1201, "Mare"), // OCR typo → fold into Mark
+            seg(0, 4, "Sidisel"), // screen junk, no near-twin → kept (dropped later, no voice match)
+        ]
+        let names = Set(TimelineBuilder.canonicalizeByFrequency(segs).map { $0.name })
+        XCTAssertTrue(names.contains("Matt Odell"))
+        XCTAssertTrue(names.contains("Mark"))
+        XCTAssertFalse(names.contains("Matt Odel"))
+        XCTAssertFalse(names.contains("MattOdell"))
+        XCTAssertFalse(names.contains("Mare"))
+        XCTAssertTrue(names.contains("Sidisel"))
+    }
+
+    func testCanonicalizeFollowsTypoChainsToDominantName() {
+        func seg(_ s: Double, _ e: Double, _ n: String) -> VisualTimeline.Segment {
+            .init(start: s, end: e, name: n, confidence: 0.9, source: "vision")
+        }
+        // "Mat Od" is within 2 edits of "Matt Ode" only, and "Matt Ode" folds into
+        // "Matt Odell" — the chain must resolve so no phantom "Matt Ode" survives.
+        let segs = [seg(0, 1000, "Matt Odell"), seg(1000, 1004, "Matt Ode"), seg(1004, 1004.2, "Mat Od")]
+        let names = Set(TimelineBuilder.canonicalizeByFrequency(segs).map { $0.name })
+        XCTAssertEqual(names, ["Matt Odell"])
+    }
+
     func testPrefersMatchingWindowIDOverLargest() {
         // The Meet window (id 42) is NOT the largest — must still be chosen by ID.
         let candidates = [c(7, 1600, 1000), c(42, 800, 600), c(9, 1200, 900)]