Filter OCR to participant-name labels (kill visual-timeline noise)
Real Meet capture revealed the visual pipeline was treating ALL on-screen text as participant names: meeting URL, clock, 'Add others' button, lobby 'Your meeting's ready' dialog, 'Joined as …@gmail.com', etc. 46 of 52 'visual segments' in a real session were phantom speakers. (The backend was unaffected — it diarizes from audio and ignores names that match no voice cluster — but the visual_timeline.json and the segment count were junk.) GridCallAnalyzer.isLikelyName now gates OCR strings to things shaped like a name: 2–30 chars, 1–3 Title-Cased alphabetic words, no digits/URL/email/glyph punctuation. Errs toward dropping (a missed name just loses a hint; audio diarization still runs). Unit-tested against the EXACT 19 OCR strings from the real session: keeps the 5 real names, drops all 14 chrome strings. 28/28 XCTest.
This commit is contained in:
@@ -51,7 +51,7 @@ struct GridCallAnalyzer {
|
||||
|
||||
func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
|
||||
let texts = recognizer.recognize(in: cgImage).filter {
|
||||
$0.confidence >= config.minTextConfidence && !cleaned($0.text).isEmpty
|
||||
$0.confidence >= config.minTextConfidence && Self.isLikelyName(cleaned($0.text))
|
||||
}
|
||||
guard !texts.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }
|
||||
let w = cgImage.width, h = cgImage.height
|
||||
@@ -137,8 +137,29 @@ struct GridCallAnalyzer {
|
||||
}
|
||||
|
||||
/// Returns `s` with leading and trailing whitespace and newlines removed.
///
/// No length limit is applied here. The previous body, which returned "" for
/// text longer than `config.maxNameLength`, had been left in front of this
/// expression and made it unreachable. Length bounds for accepted labels are
/// enforced by `isLikelyName` (2–30 characters).
private func cleaned(_ s: String) -> String {
    s.trimmingCharacters(in: .whitespacesAndNewlines)
}
|
||||
|
||||
/// Decides whether an OCR string is shaped like a participant-name label
/// rather than call-UI chrome (meeting URLs, the clock, "Add others", lobby
/// dialogs), which would otherwise be reported as speakers.
///
/// A string is accepted only when all of the following hold:
/// - it is 2–30 characters long;
/// - it contains none of `@ : / \ | + * = < > #` and no ASCII digit;
/// - splitting on spaces yields 1–3 words;
/// - every word starts with an uppercase character and consists solely of
///   letters plus `'`, `.` and `-`.
///
/// The heuristic deliberately errs toward rejecting: a missed name only costs
/// a visual hint, since the backend still diarizes from audio.
///
/// - Parameter s: The candidate label text.
/// - Returns: `true` if `s` looks like a participant name.
static func isLikelyName(_ s: String) -> Bool {
    // Length gate, counted in user-perceived characters.
    guard (2...30).contains(s.count) else { return false }

    // Characters typical of URLs, emails, meeting codes, times and button glyphs.
    let chromeMarkers = CharacterSet(charactersIn: "@:/\\|+*=<>#0123456789")
    guard s.rangeOfCharacter(from: chromeMarkers) == nil else { return false }

    // `split` drops empty pieces, so runs of spaces do not inflate the word count.
    let tokens = s.split(separator: " ")
    guard tokens.count >= 1, tokens.count <= 3 else { return false }

    let nameScalars = CharacterSet.letters.union(CharacterSet(charactersIn: "'.-"))
    return tokens.allSatisfy { token in
        // Title Case: each word must open with an uppercase character.
        guard let initial = token.first, initial.isUppercase else { return false }
        return token.unicodeScalars.allSatisfy { nameScalars.contains($0) }
    }
}
|
||||
|
||||
private static let ciContext = CIContext()
|
||||
|
||||
@@ -135,6 +135,19 @@ final class GridCallAnalyzerTests: XCTestCase {
|
||||
XCTAssertTrue(obs.filter { $0.speaking }.isEmpty)
|
||||
}
|
||||
|
||||
/// Regression test for the name filter, using the exact strings OCR produced
/// in a real Meet session: participant labels must be kept and UI chrome
/// must be rejected so it never becomes a speaker.
func testNameFilterAgainstRealMeetOCR() {
    // Labels that belong to actual participants.
    let participantLabels = [
        "Grant Gilliam", "Caitlyn Viggiano", "Cait's Phone", "Grant", "Me",
    ]
    // Everything else on screen: clock, meeting code, buttons, lobby dialog, OCR noise.
    let chromeStrings = [
        "11:43 AM | rvo-rmjg-rdq",
        "@ Embassy Er",
        "Admit 1 guest",
        "Joined as grant.gilliam@gmail.com",
        "Others may see your video differently",
        "Others might still see your full video.",
        "Your meeting's ready",
        "efforot",
        "g* Add others",
        "g+ Add others",
        "meet.google.com/rvo-rmjg-rdq",
        "permission before they can join.",
        "the meeting",
        "G",
    ]

    participantLabels.forEach { label in
        XCTAssertTrue(GridCallAnalyzer.isLikelyName(label), "should keep name: \(label)")
    }
    chromeStrings.forEach { text in
        XCTAssertFalse(GridCallAnalyzer.isLikelyName(text), "should drop junk: \(text)")
    }
}
|
||||
|
||||
func testWhiteBorderDetectorIgnoresColouredBorder() {
|
||||
// Signal looks only for the white border, so a coloured (Meet) border must
|
||||
// not register as a Signal speaker.
|
||||
|
||||
Reference in New Issue
Block a user