863136aeec
Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
95 lines
4.2 KiB
Swift
95 lines
4.2 KiB
Swift
import Foundation
|
|
import CoreGraphics
|
|
import CoreVideo
|
|
import CoreImage
|
|
|
|
/// Shared engine for tile-grid conferencing UIs (Signal/Zoom/Teams): OCR the
|
|
/// name/initials on each tile, then mark the active speaker(s) by the saturated
|
|
/// coloured highlight around their tile.
|
|
///
|
|
/// Geometry (`Config`) is a first pass; the exact tile expansion and saturation
|
|
/// threshold get calibrated per app against real screenshot fixtures. The
|
|
/// detection *logic* (read names; pick the highlighted tile) is validated with
|
|
/// synthetic frames.
|
|
struct GridCallAnalyzer {
    /// Tunable geometry and thresholds used by `analyze`.
    struct Config {
        /// Horizontal growth factor applied to a label's text box to approximate
        /// its tile. Only affects the reported bbox.
        var tileExpandX = 1.8
        /// Vertical growth factor applied to a label's text box to approximate
        /// its tile. Only affects the reported bbox.
        var tileExpandY = 2.6
        /// Recognized text with a confidence below this value is ignored.
        var minTextConfidence: Float = 0.3
        /// Trimmed text longer than this many characters is not treated as a name.
        var maxNameLength = 40
        /// Highlight detection: a name is "speaking" if enough strongly-saturated
        /// highlight pixels sit within this distance of its label, expressed as a
        /// fraction of max(frame W, H).
        var highlightRadiusFraction = 0.22
        /// Absolute minimum number of highlight points near a label for it to be
        /// reported as speaking.
        var minHighlightPoints = 6
        /// A label's count must also be ≥ this fraction of the busiest label's count.
        var highlightShareOfMax = 0.35
    }

    var config = Config()
    var recognizer = TextRecognizer()

    /// One accepted name label, in top-left-origin pixel coordinates.
    private struct Tile {
        let name: String
        let center: CGPoint
        let rect: CGRect
        let confidence: Double
    }

    /// Converts `pixelBuffer` to a `CGImage` and analyzes it.
    ///
    /// - Parameters:
    ///   - pixelBuffer: The captured frame.
    ///   - t: Timestamp stamped onto every returned observation.
    /// - Returns: One observation per accepted name label, or `[]` when the
    ///   buffer cannot be converted to an image.
    func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
        guard let image = Self.cgImage(from: pixelBuffer) else { return [] }
        return analyze(cgImage: image, at: t)
    }

    /// Reads the name labels in `cgImage` and flags the ones that have enough
    /// highlight points close to them.
    ///
    /// - Parameters:
    ///   - cgImage: The frame to analyze.
    ///   - t: Timestamp stamped onto every returned observation.
    /// - Returns: One observation per accepted label, in recognizer order.
    ///   Empty when no label passes the confidence/length filters or when a
    ///   `FrameSampler` cannot be created for the image.
    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
        let width = cgImage.width, height = cgImage.height

        // Filter and convert in a single pass so each label is cleaned only once.
        let tiles = recognizer.recognize(in: cgImage).compactMap { text -> Tile? in
            guard text.confidence >= config.minTextConfidence else { return nil }
            let name = cleaned(text.text)
            guard !name.isEmpty else { return nil }
            return Tile(name: name,
                        center: labelCenter(text.boundingBox, imageW: width, imageH: height),
                        rect: tileRect(text.boundingBox, imageW: width, imageH: height),
                        confidence: Double(text.confidence))
        }
        guard !tiles.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }

        // Collect the highlight points once, then count, per label, how many lie
        // within `radius` of the label centre.
        // NOTE(review): a point is counted for *every* label whose radius
        // contains it, not only the nearest one, so neighbouring labels can share
        // points; `highlightShareOfMax` is what separates them. Assumes the
        // sampler reports points in the same top-left pixel space as the label
        // centres — confirm against FrameSampler.
        let points = sampler.saturatedPoints()
        let radius = Double(max(width, height)) * config.highlightRadiusFraction
        let radiusSquared = radius * radius
        let counts = tiles.map { tile -> Int in
            let cx = Double(tile.center.x), cy = Double(tile.center.y)
            var hits = 0
            for point in points {
                let dx = Double(point.x) - cx
                let dy = Double(point.y) - cy
                if dx * dx + dy * dy <= radiusSquared { hits += 1 }
            }
            return hits
        }

        // `required` is never below `minHighlightPoints`, and no count exceeds
        // `busiest`, so `hits >= required` already implies that the busiest label
        // reached the absolute minimum; no separate check is needed.
        let busiest = counts.max() ?? 0
        let required = max(config.minHighlightPoints,
                           Int(Double(busiest) * config.highlightShareOfMax))

        return zip(tiles, counts).map { tile, hits in
            SpeakerObservation(name: tile.name, speaking: hits >= required,
                               bbox: tile.rect, confidence: tile.confidence, t: t)
        }
    }

    /// Centre of a normalized, bottom-left-origin text box, converted to pixel
    /// coordinates with a top-left origin.
    private func labelCenter(_ box: CGRect, imageW: Int, imageH: Int) -> CGPoint {
        CGPoint(x: box.midX * CGFloat(imageW),
                y: (1 - box.midY) * CGFloat(imageH)) // flip Y to top-left origin
    }

    /// Vision normalized bbox (bottom-left origin) → pixel tile rect (top-left),
    /// expanded around the text centre by `tileExpandX` / `tileExpandY` to
    /// approximate the whole tile, then clipped to the image bounds.
    private func tileRect(_ box: CGRect, imageW: Int, imageH: Int) -> CGRect {
        let imageWidth = CGFloat(imageW), imageHeight = CGFloat(imageH)
        let center = labelCenter(box, imageW: imageW, imageH: imageH)
        let tileWidth = box.width * imageWidth * CGFloat(config.tileExpandX)
        let tileHeight = box.height * imageHeight * CGFloat(config.tileExpandY)
        let grown = CGRect(x: center.x - tileWidth / 2, y: center.y - tileHeight / 2,
                           width: tileWidth, height: tileHeight)
        return grown.intersection(CGRect(x: 0, y: 0, width: imageWidth, height: imageHeight))
    }

    /// Trims surrounding whitespace/newlines from `s`; returns `""` when the
    /// result is longer than `config.maxNameLength`.
    private func cleaned(_ s: String) -> String {
        let trimmed = s.trimmingCharacters(in: .whitespacesAndNewlines)
        guard trimmed.count <= config.maxNameLength else { return "" }
        return trimmed
    }

    /// Shared Core Image context; reused because allocating one per frame is costly.
    private static let ciContext = CIContext()

    /// Renders the full extent of `pixelBuffer` into a `CGImage`.
    ///
    /// - Returns: The rendered image, or `nil` if Core Image cannot create it.
    static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
        let image = CIImage(cvPixelBuffer: pixelBuffer)
        return ciContext.createCGImage(image, from: image.extent)
    }
}
|