863136aeec
Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
60 lines
2.7 KiB
Swift
60 lines
2.7 KiB
Swift
import Foundation
|
|
import Vision
|
|
import CoreVideo
|
|
import CoreGraphics
|
|
|
|
/// Thin wrapper over Vision's text recognition, used by adapters to read names /
/// initials off participant tiles. Runs on the Neural Engine; no permission
/// needed. Works on any frame, so adapters can be developed against still images.
///
/// Recognition is best-effort: any Vision failure yields an empty result array
/// rather than an error, so callers can treat "no text" and "could not read"
/// alike.
struct TextRecognizer {
    /// One recognized piece of text and where Vision located it.
    struct Result {
        /// The top-ranked candidate string for the observation.
        let text: String
        /// Confidence Vision reported for `text`.
        let confidence: Float
        /// Normalized Vision bounding box (origin bottom-left, 0…1).
        ///
        /// NOTE(review): when a region of interest is supplied, Vision is
        /// understood to report boxes relative to that region rather than to
        /// the full image — confirm callers convert accordingly.
        let boundingBox: CGRect
    }

    /// Speed/accuracy trade-off forwarded to the Vision request.
    var recognitionLevel: VNRequestTextRecognitionLevel = .accurate
    /// Minimum text height forwarded to Vision; only applied when > 0.
    var minimumTextHeight: Float = 0          // 0 = Vision default
    /// Whether Vision applies language correction to candidates.
    var usesLanguageCorrection = false        // names/initials aren't dictionary words

    /// The full normalized image area; the only range Vision accepts for a
    /// region of interest.
    private static let unitRegion = CGRect(x: 0, y: 0, width: 1, height: 1)

    /// Recognize text in `pixelBuffer`, optionally limited to a normalized region
    /// of interest (origin bottom-left, matching Vision's coordinate space).
    ///
    /// - Parameters:
    ///   - pixelBuffer: The frame to read.
    ///   - regionOfInterest: Normalized sub-rectangle to search, or `nil` for
    ///     the whole frame. Parts outside 0…1 are clipped away.
    /// - Returns: One `Result` per observation that has at least one candidate;
    ///   empty if nothing was found, the region is empty, or Vision failed.
    func recognize(in pixelBuffer: CVPixelBuffer, regionOfInterest: CGRect? = nil) -> [Result] {
        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
        return recognize(using: handler, regionOfInterest: regionOfInterest)
    }

    /// Convenience for fixtures/tests: recognize text in a CGImage.
    ///
    /// - Parameters:
    ///   - cgImage: The still image to read.
    ///   - regionOfInterest: Normalized sub-rectangle to search, or `nil` for
    ///     the whole image. Parts outside 0…1 are clipped away.
    /// - Returns: One `Result` per observation that has at least one candidate;
    ///   empty if nothing was found, the region is empty, or Vision failed.
    func recognize(in cgImage: CGImage, regionOfInterest: CGRect? = nil) -> [Result] {
        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        return recognize(using: handler, regionOfInterest: regionOfInterest)
    }

    /// Shared implementation: configures a text request from this recognizer's
    /// settings, runs it on `handler`, and maps the observations to `Result`s.
    private func recognize(using handler: VNImageRequestHandler, regionOfInterest: CGRect?) -> [Result] {
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = recognitionLevel
        request.usesLanguageCorrection = usesLanguageCorrection
        if minimumTextHeight > 0 { request.minimumTextHeight = minimumTextHeight }

        if let roi = regionOfInterest {
            // Vision raises an Objective-C exception (not a Swift error) for a
            // region outside the unit rect, so clip before assigning. A region
            // with nothing left inside the image cannot contain text.
            let clipped = roi.intersection(TextRecognizer.unitRegion)
            guard !clipped.isEmpty else { return [] }
            request.regionOfInterest = clipped
        }

        do {
            try handler.perform([request])
        } catch {
            // Deliberate best-effort: a failed pass is reported as "no text".
            return []
        }

        guard let observations = request.results else { return [] }
        return observations.compactMap { observation in
            guard let best = observation.topCandidates(1).first else { return nil }
            return Result(text: best.string,
                          confidence: best.confidence,
                          boundingBox: observation.boundingBox)
        }
    }
}
|