Phases 2-6: detection, visual timeline, backend hand-off, voiceprints

Phase 2 (call detection): CallDetector using CoreAudio per-process mic
attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet,
ignoring our own recording; auto-record toggle. Built; pending live multi-app
confirmation by the user.

Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation,
TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema
1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR +
saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver
(window capture; frames released, never saved; minimized->visual_gap, idle != gap).
Synthetic-frame tested; adapter geometry pending real Signal fixtures + live
VisualObserver validation.

Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential,
TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline
slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated
END-TO-END against the live backend (chunk -> label-merge -> speakers.json).

Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named
fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status,
auto-send toggle (default off) + self-name setting.

All adversarial-review findings fixed. App + XCTest suite build; tests pass.
This commit is contained in:
Grant Gilliam
2026-06-06 00:15:49 -05:00
parent fd7e1a5907
commit 863136aeec
27 changed files with 2108 additions and 22 deletions
@@ -0,0 +1,82 @@
import Foundation
import CoreGraphics
/// Renders a CGImage to an RGBA8 buffer once, then answers cheap colour queries
/// over pixel regions. Used to score the active-speaker highlight (a saturated
/// coloured border/ring) around participant tiles.
struct FrameSampler {
    let width: Int
    let height: Int
    private let pixels: [UInt8] // RGBA8 (premultiplied alpha), row-major, top-left origin

    /// Draws `cgImage` into a private RGBA8 buffer.
    ///
    /// Returns `nil` when the image has a zero dimension or the bitmap context
    /// cannot be created.
    init?(cgImage: CGImage) {
        let w = cgImage.width, h = cgImage.height
        guard w > 0, h > 0 else { return nil }
        var buffer = [UInt8](repeating: 0, count: w * h * 4)
        let colorSpace = CGColorSpaceCreateDeviceRGB()
        let info = CGImageAlphaInfo.premultipliedLast.rawValue
        // The buffer pointer is only valid inside the closure, so the context is
        // created, drawn into and discarded there; it must never escape.
        let drawn = buffer.withUnsafeMutableBytes { raw -> Bool in
            guard let ctx = CGContext(data: raw.baseAddress, width: w, height: h, bitsPerComponent: 8,
                                      bytesPerRow: w * 4, space: colorSpace, bitmapInfo: info) else {
                return false
            }
            ctx.draw(cgImage, in: CGRect(x: 0, y: 0, width: w, height: h))
            return true
        }
        guard drawn else { return nil }
        self.width = w
        self.height = h
        self.pixels = buffer
    }

    /// Mean HSV saturation (0–1) over a pixel rect (top-left origin), sampled on a grid.
    ///
    /// - Parameters:
    ///   - rect: Region in pixels; it is clipped to the image bounds.
    ///   - samples: Approximate number of sample columns/rows. Values below 1
    ///     are treated as 1.
    /// - Returns: The mean saturation, or 0 when the clipped rect is empty
    ///   (including null / non-finite rects).
    func meanSaturation(inPixelRect rect: CGRect, samples: Int = 24) -> Double {
        let x0 = Self.clampedIndex(rect.minX, upper: width)
        let x1 = Self.clampedIndex(rect.maxX, upper: width)
        let y0 = Self.clampedIndex(rect.minY, upper: height)
        let y1 = Self.clampedIndex(rect.maxY, upper: height)
        guard x1 > x0, y1 > y0 else { return 0 }
        let grid = max(1, samples) // guard against division by zero
        let stepX = max(1, (x1 - x0) / grid)
        let stepY = max(1, (y1 - y0) / grid)
        var sum = 0.0, count = 0
        for y in stride(from: y0, to: y1, by: stepY) {
            for x in stride(from: x0, to: x1, by: stepX) {
                sum += saturation(atX: x, y: y).saturation
                count += 1
            }
        }
        return count > 0 ? sum / Double(count) : 0
    }

    /// Highest mean saturation among the four edge strips just inside `rect`
    /// (the tile border), excluding the interior — the border is where the
    /// speaking highlight lives.
    ///
    /// - Parameter thicknessFraction: Strip thickness as a fraction of the
    ///   rect's shorter side (minimum 2 px).
    func borderSaturation(inPixelRect rect: CGRect, thicknessFraction: Double = 0.12) -> Double {
        let t = max(2.0, min(rect.width, rect.height) * thicknessFraction)
        let top = CGRect(x: rect.minX, y: rect.minY, width: rect.width, height: t)
        let bottom = CGRect(x: rect.minX, y: rect.maxY - t, width: rect.width, height: t)
        let left = CGRect(x: rect.minX, y: rect.minY, width: t, height: rect.height)
        let right = CGRect(x: rect.maxX - t, y: rect.minY, width: t, height: rect.height)
        return [top, bottom, left, right].map { meanSaturation(inPixelRect: $0) }.max() ?? 0
    }

    /// Grid-sampled pixel positions (top-left origin) that are strongly saturated
    /// AND bright enough to be a UI highlight, i.e. the speaking ring/border.
    ///
    /// - Parameters:
    ///   - threshold: Minimum saturation (exclusive), 0–1.
    ///   - minBrightness: Minimum value of the brightest channel (exclusive), 0–255.
    ///   - gridStep: Sampling stride in pixels. Values below 1 are treated as 1.
    func saturatedPoints(threshold: Double = 0.5, minBrightness: Double = 60, gridStep: Int = 6) -> [CGPoint] {
        let step = max(1, gridStep) // a non-positive stride would never terminate
        var points: [CGPoint] = []
        for y in stride(from: 0, to: height, by: step) {
            for x in stride(from: 0, to: width, by: step) {
                let sample = saturation(atX: x, y: y)
                if sample.saturation > threshold && sample.brightest > minBrightness {
                    points.append(CGPoint(x: x, y: y))
                }
            }
        }
        return points
    }

    // MARK: - Internal

    /// HSV saturation and brightest channel value (0–255) of the pixel at (x, y).
    private func saturation(atX x: Int, y: Int) -> (saturation: Double, brightest: Double) {
        let i = (y * width + x) * 4
        let r = Double(pixels[i]), g = Double(pixels[i + 1]), b = Double(pixels[i + 2])
        let mx = max(r, g, b), mn = min(r, g, b)
        return (mx > 0 ? (mx - mn) / mx : 0, mx)
    }

    /// Converts a coordinate to an index clamped to `0...upper` without trapping
    /// on infinite, NaN or out-of-`Int`-range values.
    private static func clampedIndex(_ value: CGFloat, upper: Int) -> Int {
        guard value.isFinite else { return value > 0 ? upper : 0 }
        return Int(max(0, min(CGFloat(upper), value)))
    }
}
@@ -0,0 +1,94 @@
import Foundation
import CoreGraphics
import CoreVideo
import CoreImage
/// Shared engine for tile-grid conferencing UIs (Signal/Zoom/Teams): OCR the
/// name/initials on each tile, then flag the active speaker(s) by the saturated
/// coloured highlight near their label.
///
/// The geometry in `Config` is a first pass; tile expansion and the saturation
/// threshold are meant to be calibrated per app against real screenshot
/// fixtures. The detection *logic* (read names; pick the highlighted tile) is
/// validated with synthetic frames.
struct GridCallAnalyzer {
    struct Config {
        /// Horizontal growth factor: text bbox → approximate tile (used for the reported bbox).
        var tileExpandX = 1.8
        /// Vertical growth factor: text bbox → approximate tile.
        var tileExpandY = 2.6
        /// OCR results below this confidence are ignored.
        var minTextConfidence: Float = 0.3
        /// OCR strings longer than this are not treated as names.
        var maxNameLength = 40
        /// Highlight detection: a name is "speaking" if enough strongly-saturated
        /// highlight pixels sit within this fraction of max(frame W, H) of its label.
        var highlightRadiusFraction = 0.22
        /// Minimum number of highlight points for any tile to count as speaking.
        var minHighlightPoints = 6
        /// A tile must reach this fraction of the busiest tile's highlight count.
        var highlightShareOfMax = 0.35
    }

    var config = Config()
    var recognizer = TextRecognizer()

    /// Converts the pixel buffer to a `CGImage` and analyzes it; returns no
    /// observations when the conversion fails.
    func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
        if let image = Self.cgImage(from: pixelBuffer) {
            return analyze(cgImage: image, at: t)
        }
        return []
    }

    /// Reads the name labels in `cgImage` and returns one observation per
    /// label, marking as speaking those with enough highlight pixels nearby.
    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
        let imageW = cgImage.width, imageH = cgImage.height

        // One candidate tile per usable OCR hit.
        var tiles: [(name: String, center: CGPoint, rect: CGRect, conf: Double)] = []
        for hit in recognizer.recognize(in: cgImage) {
            guard hit.confidence >= config.minTextConfidence else { continue }
            let name = cleaned(hit.text)
            if name.isEmpty { continue }
            let cx = hit.boundingBox.midX * Double(imageW)
            let cy = (1 - hit.boundingBox.midY) * Double(imageH) // flip Y to top-left origin
            let rect = tileRect(hit.boundingBox, imageW: imageW, imageH: imageH)
            tiles.append((name, CGPoint(x: cx, y: cy), rect, Double(hit.confidence)))
        }
        guard !tiles.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }

        // Find highlight pixels once, then count those within reach of each label.
        // A point may count toward more than one label.
        let highlights = sampler.saturatedPoints()
        let reach = Double(max(imageW, imageH)) * config.highlightRadiusFraction
        let reachSquared = reach * reach
        var hitsPerTile: [Int] = []
        for tile in tiles {
            var hits = 0
            for p in highlights {
                let dx = Double(p.x) - tile.center.x, dy = Double(p.y) - tile.center.y
                if dx * dx + dy * dy <= reachSquared { hits += 1 }
            }
            hitsPerTile.append(hits)
        }

        let busiest = hitsPerTile.max() ?? 0
        let required = max(config.minHighlightPoints, Int(Double(busiest) * config.highlightShareOfMax))
        let anyHighlight = busiest >= config.minHighlightPoints
        return zip(tiles, hitsPerTile).map { tile, hits in
            SpeakerObservation(name: tile.name, speaking: anyHighlight && hits >= required,
                               bbox: tile.rect, confidence: tile.conf, t: t)
        }
    }

    /// Vision normalized bbox (bottom-left origin) → pixel tile rect (top-left
    /// origin), grown around the text centre to approximate the whole tile and
    /// clipped to the image.
    private func tileRect(_ box: CGRect, imageW: Int, imageH: Int) -> CGRect {
        let W = Double(imageW), H = Double(imageH)
        let grownW = box.width * W * config.tileExpandX
        let grownH = box.height * H * config.tileExpandY
        let centerX = box.midX * W
        let centerY = (1 - box.midY) * H // flip Y to top-left origin
        let grown = CGRect(x: centerX - grownW / 2, y: centerY - grownH / 2, width: grownW, height: grownH)
        let frame = CGRect(x: 0, y: 0, width: W, height: H)
        return grown.intersection(frame)
    }

    /// Trims whitespace; strings longer than `maxNameLength` become "".
    private func cleaned(_ s: String) -> String {
        let trimmed = s.trimmingCharacters(in: .whitespacesAndNewlines)
        guard trimmed.count <= config.maxNameLength else { return "" }
        return trimmed
    }

    // Shared across frames; allocating a CIContext per frame is costly.
    private static let ciContext = CIContext()

    /// Renders a pixel buffer to a `CGImage` using the shared `CIContext`.
    static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
        let source = CIImage(cvPixelBuffer: pixelBuffer)
        return ciContext.createCGImage(source, from: source.extent)
    }
}
@@ -0,0 +1,36 @@
import Foundation
import CoreGraphics
import CoreVideo
/// One per-frame observation from an app adapter: a participant tile, whether its
/// active-speaker cue is showing, and where it is. `name` may be a full name,
/// initials (Signal), or "" if unknown. `t` is seconds relative to the session t0.
struct SpeakerObservation: Equatable {
/// Participant label as read from the tile; "" when unknown.
let name: String
/// Whether the active-speaker cue was detected for this tile in this frame.
let speaking: Bool
/// Tile rectangle reported by the adapter.
let bbox: CGRect
/// Confidence in the observation, 0–1.
let confidence: Double // 0–1
/// Frame time in seconds relative to the session t0.
let t: TimeInterval
}
/// Per-app screen-reading strategy. Each conferencing app gets one implementation
/// that knows that app's tile layout, name placement, and active-speaker cue.
/// Adapters must be testable offline against still-image fixtures.
protocol AppAdapter {
/// Bundle identifiers of the app(s) this adapter is written for.
static var bundleIDs: [String] { get }
/// Version string identifying this adapter's implementation.
var adapterVersion: String { get }
/// Frame rate the adapter would like to be sampled at; a default of 3 is
/// supplied by the protocol extension.
var preferredFPS: Int { get }
/// Analyze one frame; return the speakers visible and whether each is speaking.
/// Must process in-memory and never persist the frame.
func analyze(frame: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation]
/// Optional: participant names from the app's Accessibility tree (Electron
/// apps like Signal expose these), preferred over OCR when available.
/// The protocol extension's default returns `nil`.
func namesFromAccessibility() -> [String]?
}
extension AppAdapter {
    /// Default sampling rate: 3 frames per second.
    var preferredFPS: Int {
        return 3
    }

    /// Default: the adapter offers no Accessibility-derived names.
    func namesFromAccessibility() -> [String]? {
        return nil
    }
}
@@ -0,0 +1,59 @@
import Foundation
import Vision
import CoreVideo
import CoreGraphics
/// Thin wrapper over Vision's text recognition, used by adapters to read names /
/// initials off participant tiles. Works on any frame or still image, so
/// adapters can be developed against fixtures.
struct TextRecognizer {
    struct Result {
        let text: String
        let confidence: Float
        /// Normalized Vision bounding box (origin bottom-left, 0–1).
        /// NOTE(review): when a region of interest is supplied, Vision may report
        /// boxes relative to that region rather than the full image — confirm
        /// before combining ROI with full-frame geometry.
        let boundingBox: CGRect
    }

    var recognitionLevel: VNRequestTextRecognitionLevel = .accurate
    var minimumTextHeight: Float = 0 // 0 = Vision default
    var usesLanguageCorrection = false // names/initials aren't dictionary words

    /// Recognize text in `pixelBuffer`, optionally limited to a normalized region
    /// of interest (origin bottom-left, matching Vision's coordinate space).
    ///
    /// - Returns: One result per recognized text observation; empty when
    ///   recognition fails or the region of interest lies outside the image.
    func recognize(in pixelBuffer: CVPixelBuffer, regionOfInterest: CGRect? = nil) -> [Result] {
        run(VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:]),
            regionOfInterest: regionOfInterest)
    }

    /// Convenience for fixtures/tests: recognize text in a CGImage. Same
    /// contract as the pixel-buffer overload.
    func recognize(in cgImage: CGImage, regionOfInterest: CGRect? = nil) -> [Result] {
        run(VNImageRequestHandler(cgImage: cgImage, options: [:]),
            regionOfInterest: regionOfInterest)
    }

    // MARK: - Internal

    /// Builds a request from the current settings. Returns `nil` when a region
    /// of interest is given but does not overlap the unit rect.
    private func makeRequest(regionOfInterest: CGRect?) -> VNRecognizeTextRequest? {
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = recognitionLevel
        request.usesLanguageCorrection = usesLanguageCorrection
        if minimumTextHeight > 0 { request.minimumTextHeight = minimumTextHeight }
        if let roi = regionOfInterest {
            // Vision rejects a region outside the normalized 0–1 bounds with an
            // exception, so clip (e.g. floating-point overshoot) before assigning.
            let clipped = roi.intersection(CGRect(x: 0, y: 0, width: 1, height: 1))
            guard !clipped.isNull, !clipped.isEmpty else { return nil }
            request.regionOfInterest = clipped
        }
        return request
    }

    /// Performs text recognition with `handler` and maps each observation's
    /// top candidate to a `Result`. Any Vision error yields an empty array.
    private func run(_ handler: VNImageRequestHandler, regionOfInterest: CGRect?) -> [Result] {
        guard let request = makeRequest(regionOfInterest: regionOfInterest) else { return [] }
        do {
            try handler.perform([request])
        } catch {
            return []
        }
        guard let observations = request.results else { return [] }
        return observations.compactMap { obs in
            guard let top = obs.topCandidates(1).first else { return nil }
            return Result(text: top.string, confidence: top.confidence, boundingBox: obs.boundingBox)
        }
    }
}
@@ -0,0 +1,127 @@
import Foundation
/// Turns noisy per-frame `SpeakerObservation`s into clean
/// `(start, end, name, confidence)` segments.
///
/// - Hysteresis: open a segment after `openFrames` consecutive speaking frames,
///   close after `closeFrames` quiet frames — rides out UI-cue lag/flicker.
/// - Overlaps allowed: each name is tracked independently (crosstalk).
/// - mic-VAD "self" spans are merged in as additional segments.
/// - OCR name variants are normalized via an alias table.
///
/// Pure logic, no UI/capture deps — fully unit-testable offline.
final class TimelineBuilder {
    private let openFrames: Int
    private let closeFrames: Int
    private var aliases: [String: String] = [:] // normalized variant -> canonical
    private var states: [String: NameState] = [:]
    private var lastFrameT: Double = 0 // time of the most recent ingested frame
    private(set) var segments: [VisualTimeline.Segment] = []

    /// - Parameters:
    ///   - openFrames: Consecutive speaking frames needed to open a segment (minimum 1).
    ///   - closeFrames: Consecutive quiet frames needed to close one (minimum 1).
    init(openFrames: Int = 2, closeFrames: Int = 2) {
        self.openFrames = max(1, openFrames)
        self.closeFrames = max(1, closeFrames)
    }

    /// Register that `variant` (e.g. "Sarah J") should map to `canonical`
    /// (e.g. "Sarah Jones"). Matching is case- and surrounding-whitespace-insensitive.
    func addAlias(_ variant: String, canonical: String) {
        aliases[Self.normalize(variant)] = canonical
    }

    /// Ingest one frame's observations (all sharing time `t`). Names not present
    /// (or present but not speaking) count as a quiet frame for every tracked name.
    func ingest(_ observations: [SpeakerObservation], at t: TimeInterval) {
        lastFrameT = t

        // Best confidence per canonical name that is speaking this frame.
        var speaking: [String: Double] = [:]
        for obs in observations where obs.speaking && !obs.name.isEmpty {
            let name = canonical(obs.name)
            speaking[name] = max(speaking[name] ?? 0, obs.confidence)
        }

        let names = Set(states.keys).union(speaking.keys)
        for name in names {
            var st = states[name] ?? NameState()
            if let conf = speaking[name] {
                // First frame of a new speaking run: remember where it began so
                // the segment starts there once the run is long enough to open.
                if st.voiced == 0 { st.runStart = t }
                st.voiced += 1
                st.silent = 0
                st.lastVoicedT = t
                if !st.open && st.voiced >= openFrames {
                    st.open = true
                    st.segStart = st.runStart
                    st.confSum = 0
                    st.confN = 0
                }
                if st.open { st.confSum += conf; st.confN += 1 }
            } else {
                st.silent += 1
                st.voiced = 0
                if st.open && st.silent >= closeFrames {
                    closeSegment(name: name, state: st)
                    st.open = false
                }
            }
            states[name] = st
        }
    }

    /// Merge mic-VAD self spans (the user) as segments with source "mic_vad".
    /// Spans whose end is not after their start are skipped.
    func mergeSelfSpans(_ spans: [VADSpan], selfName: String) {
        for span in spans where span.end > span.start {
            segments.append(.init(start: span.start, end: span.end,
                                  name: selfName, confidence: span.confidence, source: "mic_vad"))
        }
    }

    /// Force-close any open segments (used when a visual gap begins, so a
    /// segment isn't carried across the gap).
    ///
    /// Closed segments end at their last speaking frame, not at `t`. Pending
    /// speaking runs that have not yet opened a segment are discarded as well;
    /// otherwise a run begun just before the gap could open a segment after it
    /// whose start lies before the gap.
    func closeOpenSegments(at t: TimeInterval) {
        closeAndResetAllRuns()
    }

    /// Close any still-open segments at end of capture and sort all segments by
    /// start time.
    func finish() {
        closeAndResetAllRuns()
        segments.sort { $0.start < $1.start }
    }

    // MARK: - Internal

    private struct NameState {
        var voiced = 0               // consecutive speaking frames in the current run
        var silent = 0               // consecutive quiet frames
        var open = false             // whether a segment is currently open
        var runStart: Double = 0     // time of the first frame of the current speaking run
        var segStart: Double = 0     // start time of the open segment
        var lastVoicedT: Double = 0  // time of the most recent speaking frame
        var confSum: Double = 0      // confidence accumulated since the segment opened
        var confN = 0                // number of confidences accumulated
    }

    /// Emits every open segment and clears all hysteresis counters, so the next
    /// speaking frame for any name starts a fresh run.
    private func closeAndResetAllRuns() {
        // Iterates a snapshot of `states`; writing back inside the loop is safe.
        for (name, st) in states {
            if st.open { closeSegment(name: name, state: st) }
            var cleared = st
            cleared.open = false
            cleared.voiced = 0
            cleared.silent = 0
            states[name] = cleared
        }
    }

    /// Appends the segment described by `st`; zero-length segments are dropped.
    private func closeSegment(name: String, state st: NameState) {
        guard st.lastVoicedT > st.segStart else { return }
        let confidence = st.confN > 0 ? st.confSum / Double(st.confN) : 0.8
        segments.append(.init(start: st.segStart, end: st.lastVoicedT,
                              name: name, confidence: confidence, source: "vision"))
    }

    /// Maps a raw OCR name to its canonical form via the alias table, falling
    /// back to the whitespace-trimmed raw name.
    private func canonical(_ raw: String) -> String {
        let key = Self.normalize(raw)
        return aliases[key] ?? raw.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    /// Alias-table key: lowercased with surrounding whitespace removed.
    private static func normalize(_ s: String) -> String {
        s.lowercased().trimmingCharacters(in: .whitespacesAndNewlines)
    }
}
@@ -0,0 +1,131 @@
import Foundation
import ScreenCaptureKit
import CoreMedia
import QuartzCore
import AppKit
/// Window-scoped visual capture: streams the call window's own rendered content
/// at ~`fps`, hands each frame to the app adapter, and **releases it immediately
/// — frames are never written to disk**. Builds the speaker timeline and records
/// `visual_gap`s when the window is minimized (SCK delivers non-live frames).
///
/// Window visibility/focus is NOT required — SCK captures a window even when it's
/// occluded or on another Space; only minimization freezes the backing buffer.
@available(macOS 13.0, *)
final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
    private let bundleID: String
    private let adapter: any AppAdapter
    private let t0Host: Double
    private let fps: Int
    private let queue = DispatchQueue(label: "xyz.ten31.visual")
    private var stream: SCStream?
    private let builder = TimelineBuilder()
    private var gaps: [VisualTimeline.Gap] = []
    private var gapStart: Double?

    /// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
    /// Invoked on the observer's capture queue.
    var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?

    /// - Parameters:
    ///   - bundleID: Bundle identifier of the app whose window is captured.
    ///   - adapter: Per-app frame analyzer.
    ///   - t0Host: Session start on the `CACurrentMediaTime()` clock; all
    ///     timeline times are relative to it.
    ///   - fps: Target sampling rate (minimum 1).
    init(bundleID: String, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
        self.bundleID = bundleID
        self.adapter = adapter
        self.t0Host = t0Host
        self.fps = max(1, fps)
    }

    /// Finds the target app's largest window and starts streaming it.
    ///
    /// - Throws: An `NSError` (domain "Ten31", code 2) when the app has no
    ///   window, or any error from ScreenCaptureKit.
    func start() async throws {
        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
        // The call window: the largest window owned by the target app.
        let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }
        guard let window = candidates.max(by: { $0.frame.width * $0.frame.height < $1.frame.width * $1.frame.height }) else {
            throw NSError(domain: "Ten31", code: 2,
                          userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
        }

        let filter = SCContentFilter(desktopIndependentWindow: window)
        let config = SCStreamConfiguration()
        config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(fps))
        config.queueDepth = 3
        config.showsCursor = false
        config.pixelFormat = kCVPixelFormatType_32BGRA
        // window.frame is in points; capture at native pixels so OCR can read small
        // initials/names (a half-res Retina capture badly hurts recognition).
        let scale = NSScreen.main?.backingScaleFactor ?? 2
        config.width = max(2, Int(window.frame.width * scale))
        config.height = max(2, Int(window.frame.height * scale))

        let stream = SCStream(filter: filter, configuration: config, delegate: self)
        try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: queue)
        try await stream.startCapture()
        self.stream = stream
    }

    /// Stops capture, closes any gap or segment still open, and returns the
    /// accumulated segments and gaps.
    func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
        if let stream { try? await stream.stopCapture() }
        stream = nil
        // Read the results on the capture queue so this cannot interleave with a
        // frame callback.
        return queue.sync {
            if let gs = gapStart {
                gaps.append(.init(start: gs, end: CACurrentMediaTime() - t0Host, reason: "minimized"))
                gapStart = nil
            }
            builder.finish()
            return (builder.segments, gaps)
        }
    }

    /// Merge mic-VAD self spans into the visual timeline (call before `stop`'s read,
    /// or fold in afterwards in the packager).
    func addSelfSpans(_ spans: [VADSpan], selfName: String) {
        queue.sync { builder.mergeSelfSpans(spans, selfName: selfName) }
    }

    // MARK: - SCStreamOutput (on `queue`)

    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
                of type: SCStreamOutputType) {
        guard type == .screen, CMSampleBufferDataIsReady(sampleBuffer) else { return }
        let now = CACurrentMediaTime() - t0Host

        switch frameKind(sampleBuffer) {
        case .idle:
            // Window is live but static (no pixel change) — no new info, not a gap.
            return
        case .gap:
            // Minimized/blanked: the backing buffer is frozen. Open a gap once and
            // close any open speaker segments so none is carried across it.
            if gapStart == nil {
                gapStart = now
                builder.closeOpenSegments(at: now)
            }
            return
        case .live:
            if let gs = gapStart {
                gaps.append(.init(start: gs, end: now, reason: "minimized"))
                gapStart = nil
            }
            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
            let observations = adapter.analyze(frame: pixelBuffer, at: now) // frame released after this scope
            builder.ingest(observations, at: now)
            onObservations?(observations, now)
        }
    }

    // MARK: - SCStreamDelegate

    func stream(_ stream: SCStream, didStopWithError error: Error) {
        // Surface the failure instead of dropping it silently; results gathered so
        // far remain available through `stop()`.
        NSLog("VisualObserver: capture stream stopped with error: %@", String(describing: error))
    }

    // MARK: - Internal

    private enum FrameKind { case live, idle, gap }

    /// Classifies a sample buffer by its `SCFrameStatus` attachment.
    ///
    /// `.complete` carries new content; `.idle` is a static (but visible) window;
    /// `.started` is the first frame after the stream starts and is ignored like
    /// an idle frame (treating it as a gap would record a spurious "minimized"
    /// gap at the start of every session); `.blank`/`.suspended`/`.stopped` mean
    /// the content is frozen. Buffers without a readable status are treated as live.
    private func frameKind(_ sampleBuffer: CMSampleBuffer) -> FrameKind {
        guard let attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, createIfNecessary: false)
                as? [[SCStreamFrameInfo: Any]],
              let raw = attachments.first?[.status] as? Int,
              let status = SCFrameStatus(rawValue: raw) else { return .live }
        switch status {
        case .complete:
            return .live
        case .idle, .started:
            return .idle
        case .blank, .suspended, .stopped:
            return .gap
        @unknown default:
            // Unrecognised future status: be conservative and do not analyze it.
            return .gap
        }
    }
}
@@ -0,0 +1,72 @@
import Foundation
/// `visual_timeline.json` (schema 1.1) — the app's primary visual output. Times
/// are seconds relative to session t0. Segments may overlap (crosstalk).
struct VisualTimeline: Codable {
    var schemaVersion = "1.1"
    let sessionId: String
    let app: String
    let adapterVersion: String
    let t0Unix: Double
    let durationSec: Double
    let fpsSampled: Int
    let selfName: String?
    let participants: [Participant]
    let segments: [Segment]
    let visualGaps: [Gap]

    struct Participant: Codable {
        let name: String
        let isSelf: Bool?
        let aliases: [String]?

        enum CodingKeys: String, CodingKey {
            case name
            case isSelf = "is_self"
            case aliases
        }
    }

    struct Segment: Codable, Equatable {
        let start: Double
        let end: Double
        let name: String
        let confidence: Double
        let source: String // vision | accessibility | fused | mic_vad
    }

    struct Gap: Codable, Equatable {
        let start: Double
        let end: Double
        let reason: String // minimized | tab_switched
    }

    enum CodingKeys: String, CodingKey {
        case schemaVersion = "schema_version"
        case sessionId = "session_id"
        case app
        case adapterVersion = "adapter_version"
        case t0Unix = "t0_unix"
        case durationSec = "duration_sec"
        case fpsSampled = "fps_sampled"
        case selfName = "self_name"
        case participants
        case segments
        case visualGaps = "visual_gaps"
    }

    /// Write the rich `visual_timeline.json` (pretty-printed, keys sorted).
    ///
    /// The file is written atomically so an interrupted write cannot leave a
    /// truncated timeline behind.
    /// - Throws: Encoding errors (e.g. non-finite numbers) or file-system errors.
    func write(to url: URL) throws {
        let encoder = JSONEncoder()
        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
        try encoder.encode(self).write(to: url, options: .atomic)
    }

    /// The flat array `label-merge` wants: `[{start,end,name,confidence}]`,
    /// dropping `source`. Slice/rebase to chunk-local seconds happens in Phase 5.
    ///
    /// - Throws: `EncodingError.invalidValue` when a segment holds a value JSON
    ///   cannot represent (NaN or infinity).
    func flatTimelineData() throws -> Data {
        let flat = segments.map { seg -> [String: Any] in
            ["start": seg.start, "end": seg.end, "name": seg.name, "confidence": seg.confidence]
        }
        // JSONSerialization raises an Objective-C exception (not a Swift error) for
        // objects it cannot encode, so validate first and throw instead.
        guard JSONSerialization.isValidJSONObject(flat) else {
            throw EncodingError.invalidValue(flat, EncodingError.Context(
                codingPath: [CodingKeys.segments],
                debugDescription: "Segments contain a value that cannot be represented in JSON (NaN or infinity)."))
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }
}