diff --git a/Ten31Transcripts/Visual/GridCallAnalyzer.swift b/Ten31Transcripts/Visual/GridCallAnalyzer.swift
index c62023a..56b17d2 100644
--- a/Ten31Transcripts/Visual/GridCallAnalyzer.swift
+++ b/Ten31Transcripts/Visual/GridCallAnalyzer.swift
@@ -51,7 +51,8 @@ struct GridCallAnalyzer {
 
     func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
         let texts = recognizer.recognize(in: cgImage).filter {
-            $0.confidence >= config.minTextConfidence && !cleaned($0.text).isEmpty
+            $0.confidence >= config.minTextConfidence
+                && Self.isLikelyName(cleaned($0.text), maxLength: config.maxNameLength)
         }
         guard !texts.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }
         let w = cgImage.width, h = cgImage.height
@@ -137,8 +138,36 @@ struct GridCallAnalyzer {
     }
 
     private func cleaned(_ s: String) -> String {
-        let t = s.trimmingCharacters(in: .whitespacesAndNewlines)
-        return t.count <= config.maxNameLength ? t : ""
+        s.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Returns `true` when `s` looks like a participant name label rather than UI chrome.
+    ///
+    /// Call UIs are full of text (meeting URLs, the clock, "Add others", lobby dialogs)
+    /// that OCR would otherwise treat as speakers. Participant labels are short,
+    /// Title-Cased, 1–3 alphabetic words with no digits/URL/email punctuation.
+    /// Derived from real Meet/Zoom captures; errs toward dropping (a missed name
+    /// just means no visual hint — the backend still diarizes from audio).
+    ///
+    /// - Parameters:
+    ///   - s: Candidate label; `analyze` passes it already trimmed of surrounding whitespace.
+    ///   - maxLength: Longest label, in characters, still accepted as a name (default 30).
+    /// - Returns: `true` only for 2...`maxLength` characters forming 1–3 capitalised words.
+    static func isLikelyName(_ s: String, maxLength: Int = 30) -> Bool {
+        guard s.count >= 2, s.count <= maxLength else { return false }
+        // Reject URLs, emails, meeting codes, times, button glyphs.
+        if s.rangeOfCharacter(from: CharacterSet(charactersIn: "@:/\\|+*=<>#0123456789")) != nil {
+            return false
+        }
+        let words = s.split(separator: " ")
+        guard (1...3).contains(words.count) else { return false }
+        // U+2019 is the typographic apostrophe; labels such as "Cait’s Phone" may carry it.
+        let allowed = CharacterSet.letters.union(CharacterSet(charactersIn: "'\u{2019}.-"))
+        for w in words {
+            guard let first = w.first, first.isUppercase else { return false } // Title Case
+            if String(w).unicodeScalars.contains(where: { !allowed.contains($0) }) { return false }
+        }
+        return true
     }
 
     private static let ciContext = CIContext()
diff --git a/Ten31TranscriptsTests/GridCallAnalyzerTests.swift b/Ten31TranscriptsTests/GridCallAnalyzerTests.swift
index a6337d9..2f192ec 100644
--- a/Ten31TranscriptsTests/GridCallAnalyzerTests.swift
+++ b/Ten31TranscriptsTests/GridCallAnalyzerTests.swift
@@ -135,6 +135,30 @@ final class GridCallAnalyzerTests: XCTestCase {
         XCTAssertTrue(obs.filter { $0.speaking }.isEmpty)
     }
 
+    func testNameFilterAgainstRealMeetOCR() {
+        // The exact strings OCR pulled from a real Meet session — only the first
+        // group are participants; the rest are UI chrome that must NOT become speakers.
+        let names = ["Grant Gilliam", "Caitlyn Viggiano", "Cait's Phone", "Grant", "Me"]
+        let junk = ["11:43 AM | rvo-rmjg-rdq", "@ Embassy Er", "Admit 1 guest",
+                    "Joined as grant.gilliam@gmail.com", "Others may see your video differently",
+                    "Others might still see your full video.", "Your meeting's ready", "efforot",
+                    "g* Add others", "g+ Add others", "meet.google.com/rvo-rmjg-rdq",
+                    "permission before they can join.", "the meeting", "G"]
+        for n in names { XCTAssertTrue(GridCallAnalyzer.isLikelyName(n), "should keep name: \(n)") }
+        for j in junk { XCTAssertFalse(GridCallAnalyzer.isLikelyName(j), "should drop junk: \(j)") }
+    }
+
+    func testNameFilterAcceptsTypographicApostrophe() {
+        // Same label as "Cait's Phone", spelled with U+2019 instead of the ASCII quote.
+        XCTAssertTrue(GridCallAnalyzer.isLikelyName("Cait\u{2019}s Phone"))
+    }
+
+    func testNameFilterHonoursMaxLength() {
+        // "Grant Gilliam" is 13 characters: kept at the limit, dropped just under it.
+        XCTAssertTrue(GridCallAnalyzer.isLikelyName("Grant Gilliam", maxLength: 13))
+        XCTAssertFalse(GridCallAnalyzer.isLikelyName("Grant Gilliam", maxLength: 12))
+    }
+
     func testWhiteBorderDetectorIgnoresColouredBorder() {
         // Signal looks only for the white border, so a coloured (Meet) border must
         // not register as a Signal speaker.