Files
ten31-transcripts/Ten31Transcripts/Visual/TextRecognizer.swift
T
Grant Gilliam 863136aeec Phases 2-6: detection, visual timeline, backend hand-off, voiceprints
Phase 2 (call detection): CallDetector using CoreAudio per-process mic
attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet,
ignoring our own recording; auto-record toggle. Built; pending live multi-app
confirmation by the user.

Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation,
TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema
1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR +
saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver
(window capture; frames released, never saved; minimized->visual_gap, idle != gap).
Synthetic-frame tested; adapter geometry pending real Signal fixtures + live
VisualObserver validation.

Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential,
TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline
slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated
END-TO-END against the live backend (chunk -> label-merge -> speakers.json).

Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named
fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status,
auto-send toggle (default off) + self-name setting.

All adversarial-review findings fixed. App + XCTest suite build; tests pass.
2026-06-06 00:15:49 -05:00

60 lines
2.7 KiB
Swift

import Foundation
import Vision
import CoreVideo
import CoreGraphics
/// Thin wrapper over Vision's text recognition, used by adapters to read names /
/// initials off participant tiles. Runs on the Neural Engine; no permission
/// needed. Works on any frame, so adapters can be developed against still images.
struct TextRecognizer {
    /// One recognized piece of text: the best candidate string for a Vision
    /// observation, the confidence reported for that candidate, and where the
    /// observation was found.
    struct Result {
        /// Top candidate string for the observation.
        let text: String
        /// Confidence Vision reported for `text`.
        let confidence: Float
        /// Normalized Vision bounding box (origin bottom-left, 0–1).
        let boundingBox: CGRect
    }

    var recognitionLevel: VNRequestTextRecognitionLevel = .accurate
    var minimumTextHeight: Float = 0 // 0 = Vision default
    var usesLanguageCorrection = false // names/initials aren't dictionary words

    /// The full normalized image area; a region of interest must lie inside it.
    private static let unitRect = CGRect(x: 0, y: 0, width: 1, height: 1)

    /// Recognize text in `pixelBuffer`, optionally limited to a normalized region
    /// of interest (origin bottom-left, matching Vision's coordinate space).
    ///
    /// - Parameters:
    ///   - pixelBuffer: The frame to run text recognition on.
    ///   - regionOfInterest: Optional normalized sub-rectangle to search. It is
    ///     clamped to the unit rectangle; if nothing remains after clamping, no
    ///     recognition is attempted.
    /// - Returns: One `Result` per observation that produced a candidate, or an
    ///   empty array when recognition fails or the region is empty.
    func recognize(in pixelBuffer: CVPixelBuffer, regionOfInterest: CGRect? = nil) -> [Result] {
        guard let request = makeRequest(regionOfInterest: regionOfInterest) else { return [] }
        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
        return results(of: request, performedBy: handler)
    }

    /// Convenience for fixtures/tests: recognize text in a CGImage.
    ///
    /// - Parameters:
    ///   - cgImage: The still image to run text recognition on.
    ///   - regionOfInterest: Optional normalized sub-rectangle to search
    ///     (origin bottom-left). Clamped to the unit rectangle exactly as in the
    ///     pixel-buffer overload.
    /// - Returns: One `Result` per observation that produced a candidate, or an
    ///   empty array when recognition fails or the region is empty.
    func recognize(in cgImage: CGImage, regionOfInterest: CGRect? = nil) -> [Result] {
        guard let request = makeRequest(regionOfInterest: regionOfInterest) else { return [] }
        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        return results(of: request, performedBy: handler)
    }

    /// Builds a text-recognition request configured from this recognizer's
    /// settings. Returns `nil` when a region of interest was supplied but has no
    /// area inside the unit rectangle, i.e. there is nothing to search.
    private func makeRequest(regionOfInterest: CGRect?) -> VNRecognizeTextRequest? {
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = recognitionLevel
        request.usesLanguageCorrection = usesLanguageCorrection
        // Only override Vision's own default when a positive height was requested.
        if minimumTextHeight > 0 {
            request.minimumTextHeight = minimumTextHeight
        }
        if let roi = regionOfInterest {
            // Regions computed from tile geometry can overshoot 0...1 by rounding.
            // NOTE(review): Vision is understood to reject out-of-range regions at
            // assignment with an Objective-C exception that Swift cannot catch —
            // confirm against the SDK in use. Clamping sidesteps that either way.
            let clamped = roi.intersection(Self.unitRect)
            guard !clamped.isNull, !clamped.isEmpty else { return nil }
            request.regionOfInterest = clamped
        }
        return request
    }

    /// Performs `request` with `handler` and maps each observation to a `Result`
    /// built from its top candidate. Recognition is best-effort: a thrown error
    /// or a missing result list yields an empty array rather than propagating.
    private func results(of request: VNRecognizeTextRequest,
                         performedBy handler: VNImageRequestHandler) -> [Result] {
        do {
            try handler.perform([request])
        } catch {
            return []
        }
        guard let observations = request.results else { return [] }
        return observations.compactMap { observation in
            // Observations without any candidate string are dropped.
            guard let best = observation.topCandidates(1).first else { return nil }
            return Result(text: best.string,
                          confidence: best.confidence,
                          boundingBox: observation.boundingBox)
        }
    }
}