Phases 2-6: detection, visual timeline, backend hand-off, voiceprints
Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
This commit is contained in:
@@ -0,0 +1,131 @@
|
||||
import Foundation
|
||||
import ScreenCaptureKit
|
||||
import CoreMedia
|
||||
import QuartzCore
|
||||
import AppKit
|
||||
|
||||
/// Window-scoped visual capture: streams the call window's own rendered content
/// at ~`fps`, hands each frame to the app adapter, and **releases it immediately
/// — frames are never written to disk**. Builds the speaker timeline and records
/// `visual_gap`s when the window is minimized (SCK delivers non-live frames).
///
/// Window visibility/focus is NOT required — SCK captures a window even when it's
/// occluded or on another Space; only minimization freezes the backing buffer.
///
/// Threading: sample buffers are delivered on the private serial `queue`; all
/// mutable timeline state (`builder`, `gaps`, `gapStart`) is only touched there
/// (directly in the sample handler, or via `queue.sync` from `stop`/`addSelfSpans`).
@available(macOS 13.0, *)
final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
    private let bundleID: String
    private let adapter: any AppAdapter
    /// Host-clock origin (same timebase as `CACurrentMediaTime()`); every
    /// timestamp handed to the builder is `CACurrentMediaTime() - t0Host`.
    private let t0Host: Double
    private let fps: Int
    private let queue = DispatchQueue(label: "xyz.ten31.visual")

    private var stream: SCStream?
    private let builder = TimelineBuilder()
    private var gaps: [VisualTimeline.Gap] = []
    /// Start of the currently open gap, or `nil` while the window is live.
    private var gapStart: Double?

    /// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
    /// Invoked on the capture queue, once per analyzed frame.
    var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?

    /// - Parameters:
    ///   - bundleID: Bundle identifier of the app whose window is captured.
    ///   - adapter: App-specific analyzer that turns a frame into observations.
    ///   - t0Host: Session origin on the `CACurrentMediaTime()` clock.
    ///   - fps: Target capture rate; values below 1 are clamped to 1.
    init(bundleID: String, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
        self.bundleID = bundleID
        self.adapter = adapter
        self.t0Host = t0Host
        self.fps = max(1, fps)
    }

    /// Finds the target app's largest window and starts streaming it.
    ///
    /// - Throws: An `NSError` (domain `Ten31`, code 2) when the app owns no
    ///   capturable window, or whatever ScreenCaptureKit throws while
    ///   enumerating content, attaching the output, or starting capture.
    func start() async throws {
        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
        // The call window: the largest window owned by the target app.
        let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }
        guard let window = candidates.max(by: { $0.frame.width * $0.frame.height < $1.frame.width * $1.frame.height }) else {
            throw NSError(domain: "Ten31", code: 2,
                          userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
        }

        let filter = SCContentFilter(desktopIndependentWindow: window)
        let config = SCStreamConfiguration()
        config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(fps))
        config.queueDepth = 3
        config.showsCursor = false
        config.pixelFormat = kCVPixelFormatType_32BGRA
        // window.frame is in points; capture at native pixels so OCR can read small
        // initials/names (a half-res Retina capture badly hurts recognition).
        // The scale is taken from the screen(s) the window actually sits on, not
        // from `NSScreen.main` (the key-window screen), which may be a different
        // display with a different backing scale.
        let scale = Self.backingScale(forWindowFrame: window.frame)
        config.width = max(2, Int(window.frame.width * scale))
        config.height = max(2, Int(window.frame.height * scale))

        let stream = SCStream(filter: filter, configuration: config, delegate: self)
        try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: queue)
        try await stream.startCapture()
        self.stream = stream
    }

    /// Stops capture and returns the finished timeline.
    ///
    /// Errors from `stopCapture()` are deliberately ignored (best effort: the
    /// stream may already have stopped). The final read runs on the capture
    /// queue, so it is serialized after any in-flight frame handling; a gap that
    /// is still open is closed at the current time.
    func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
        if let stream { try? await stream.stopCapture() }
        stream = nil
        return queue.sync {
            if let gs = gapStart {
                gaps.append(.init(start: gs, end: CACurrentMediaTime() - t0Host, reason: "minimized"))
                gapStart = nil
            }
            builder.finish()
            return (builder.segments, gaps)
        }
    }

    /// Merge mic-VAD self spans into the visual timeline (call before `stop`'s read,
    /// or fold in afterwards in the packager).
    func addSelfSpans(_ spans: [VADSpan], selfName: String) {
        queue.sync { builder.mergeSelfSpans(spans, selfName: selfName) }
    }

    // MARK: - SCStreamOutput (on `queue`)

    /// Per-frame entry point: classifies the frame, maintains gap bookkeeping,
    /// and feeds live frames to the adapter and the timeline builder.
    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
                of type: SCStreamOutputType) {
        guard type == .screen, CMSampleBufferDataIsReady(sampleBuffer) else { return }
        let now = CACurrentMediaTime() - t0Host

        switch frameKind(sampleBuffer) {
        case .idle:
            // Window is live but static (no pixel change) — no new info, not a gap.
            return
        case .gap:
            // Minimized/blanked: the backing buffer is frozen. Open a gap once and
            // close any open speaker segments so none is carried across it.
            if gapStart == nil {
                gapStart = now
                builder.closeOpenSegments(at: now)
            }
            return
        case .live:
            // First live frame after a gap closes it.
            if let gs = gapStart {
                gaps.append(.init(start: gs, end: now, reason: "minimized"))
                gapStart = nil
            }
            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
            let observations = adapter.analyze(frame: pixelBuffer, at: now) // frame released after this scope
            builder.ingest(observations, at: now)
            onObservations?(observations, now)
        }
    }

    // MARK: - SCStreamDelegate

    /// The stream stopped on its own (e.g. the window went away). Surface the
    /// reason instead of dropping it silently; timeline state is left untouched
    /// and is still collected by `stop()`.
    func stream(_ stream: SCStream, didStopWithError error: Error) {
        NSLog("VisualObserver: capture stream for %@ stopped with error: %@",
              bundleID, error.localizedDescription)
    }

    // MARK: - Helpers

    private enum FrameKind { case live, idle, gap }

    /// SCK delivers `.complete` only when content changes, `.idle` for a static
    /// (but visible) window, and `.blank`/`.suspended`/`.stopped` when frozen.
    /// `.started` marks the first frame after the stream starts; it is treated
    /// as live so that stream start-up is not recorded as a "minimized" gap
    /// (the live path still skips it if it carries no image buffer).
    /// A frame without a readable status attachment is assumed live.
    private func frameKind(_ sampleBuffer: CMSampleBuffer) -> FrameKind {
        guard let attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, createIfNecessary: false)
                as? [[SCStreamFrameInfo: Any]],
              let raw = attachments.first?[.status] as? Int,
              let status = SCFrameStatus(rawValue: raw) else { return .live }
        switch status {
        case .complete, .started:
            return .live
        case .idle:
            return .idle
        case .blank, .suspended, .stopped:
            return .gap
        @unknown default:
            // Unknown future status: be conservative and do not trust the pixels.
            return .gap
        }
    }

    /// Backing scale factor to use for a window whose frame is given in
    /// ScreenCaptureKit's global, top-left-origin coordinate space.
    ///
    /// Picks the highest scale among the screens the window overlaps, so a
    /// window straddling displays is never under-sampled. Falls back to the
    /// highest scale of any attached screen when the window overlaps none
    /// (e.g. off-screen), and to 2 when no screen is attached.
    private static func backingScale(forWindowFrame frame: CGRect) -> CGFloat {
        let screens = NSScreen.screens
        guard let primary = screens.first else { return 2 }
        // NSScreen frames are bottom-left-origin, anchored on the primary screen
        // (`screens[0]`); flip the window rect's Y axis into that space.
        let flipped = CGRect(x: frame.minX,
                             y: primary.frame.maxY - frame.maxY,
                             width: frame.width,
                             height: frame.height)
        let overlapping = screens.filter { $0.frame.intersects(flipped) }
        let pool = overlapping.isEmpty ? screens : overlapping
        return pool.map(\.backingScaleFactor).max() ?? 2
    }
}
|
||||
Reference in New Issue
Block a user