Files
ten31-transcripts/Ten31Transcripts/Detection/CallDetector.swift
T
Grant Gilliam 863136aeec Phases 2-6: detection, visual timeline, backend hand-off, voiceprints
Phase 2 (call detection): CallDetector using CoreAudio per-process mic
attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet,
ignoring our own recording; auto-record toggle. Built; pending live multi-app
confirmation by the user.

Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation,
TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema
1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR +
saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver
(window capture; frames released, never saved; minimized->visual_gap, idle != gap).
Synthetic-frame tested; adapter geometry pending real Signal fixtures + live
VisualObserver validation.

Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential,
TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline
slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated
END-TO-END against the live backend (chunk -> label-merge -> speakers.json).

Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named
fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status,
auto-send toggle (default off) + self-name setting.

All adversarial-review findings fixed. App + XCTest suite build; tests pass.
2026-06-06 00:15:49 -05:00

227 lines
8.6 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import AppKit
import CoreGraphics
import Combine
/// Detects when the user joins/leaves a call and reports it via callbacks.
///
/// Heuristic: the mic is live system-wide AND a known call app is present —
/// Zoom/Teams/Signal by bundle ID, or Google Meet by a browser window whose
/// title looks like a Meet call (read via `CGWindowList`, using the Screen
/// Recording permission). Debounced so a quick unrelated mic use doesn't trigger.
///
/// Main-actor: all evaluation runs on the main thread.
@MainActor
final class CallDetector: ObservableObject {
    /// A call app this detector can recognize.
    enum DetectedApp: String, Equatable {
        case zoom, teams, signal, meet

        /// Short identifier; same as the case name.
        var label: String { rawValue }

        /// Human-readable product name.
        var display: String {
            switch self {
            case .zoom: return "Zoom"
            case .teams: return "Microsoft Teams"
            case .signal: return "Signal"
            case .meet: return "Google Meet"
            }
        }
    }

    /// Detector state published to observers.
    enum Status: Equatable {
        /// Not enabled (initial state, and the state after `disable()`).
        case disabled
        /// Enabled and watching; no confirmed call.
        case listening
        /// A call has been confirmed for the given app.
        case inCall(DetectedApp)
    }

    @Published private(set) var status: Status = .disabled

    /// Called when a call has been confirmed (after `openDelay` of continuous detection).
    var onCallStart: ((DetectedApp) -> Void)?
    /// Called when a confirmed call has been absent for `closeDelay`.
    var onCallEnd: (() -> Void)?

    private let mic = MicActivityMonitor()
    private var pollTimer: Timer?
    private var openTimer: Timer?
    private var closeTimer: Timer?
    private var inCall = false
    private var currentApp: DetectedApp?
    private var enabled = false

    /// Debounce before a detected call is reported as started.
    private let openDelay: TimeInterval = 2.0
    /// Debounce before a vanished call is reported as ended.
    private let closeDelay: TimeInterval = 4.0
    /// Period of the safety-net re-evaluation (in addition to mic change events).
    private let pollInterval: TimeInterval = 3.0

    /// Bundle IDs of native call apps and the app each maps to.
    private static let nativeApps: [(id: String, app: DetectedApp)] = [
        ("us.zoom.xos", .zoom),
        ("com.microsoft.teams2", .teams),
        ("com.microsoft.teams", .teams),
        ("org.whispersystems.signal-desktop", .signal),
    ]

    /// Bundle IDs of browsers whose windows are inspected for a Meet call.
    private static let browserIDs: Set<String> = [
        "org.mozilla.firefox", "com.google.Chrome", "com.apple.Safari",
        "company.thebrowser.Browser", "com.brave.Browser", "com.microsoft.edgemac",
    ]

    /// Lowercased window-title prefixes that identify an ACTIVE Meet call:
    /// "meet", a space, a dash (hyphen-minus, en dash or em dash), a space.
    private static let meetCallTitlePrefixes: [String] = [
        "meet - ",
        "meet \u{2013} ",
        "meet \u{2014} ",
    ]

    /// Starts monitoring. No-op when already enabled.
    func enable() {
        guard !enabled else { return }
        enabled = true
        mic.onChange = { [weak self] _ in self?.evaluate() }
        mic.start()
        status = .listening
        pollTimer = Timer.scheduledTimer(withTimeInterval: pollInterval, repeats: true) { [weak self] _ in
            Task { @MainActor in self?.evaluate() }
        }
        evaluate()
    }

    /// Stops monitoring and resets all state. No-op when already disabled.
    /// Does not invoke `onCallEnd`, even if a call was in progress.
    func disable() {
        guard enabled else { return }
        enabled = false
        mic.stop()
        pollTimer?.invalidate(); pollTimer = nil
        cancelOpen(); cancelClose()
        inCall = false
        currentApp = nil
        status = .disabled
    }

    // MARK: - Evaluation

    /// Samples the current mic/app state and arms or cancels the debounce timers.
    private func evaluate() {
        guard enabled else { return }
        let candidate = mic.isRunning ? detectApp() : nil

        guard let candidate else {
            // Nothing detected: abandon any pending start, and arm the end
            // debounce if a call is currently considered active.
            cancelOpen()
            if inCall && closeTimer == nil {
                closeTimer = Timer.scheduledTimer(withTimeInterval: closeDelay, repeats: false) { [weak self] _ in
                    Task { @MainActor in self?.fireClose() }
                }
            }
            return
        }

        cancelClose()
        if inCall {
            currentApp = candidate
            // Assign only on change: @Published notifies observers on every
            // assignment, and this path runs on every poll during a call.
            let updated = Status.inCall(candidate)
            if status != updated { status = updated }
        } else if openTimer == nil {
            openTimer = Timer.scheduledTimer(withTimeInterval: openDelay, repeats: false) { [weak self] _ in
                Task { @MainActor in self?.fireOpen() }
            }
        }
    }

    /// Start-debounce expiry: confirms the call if it is still detected.
    private func fireOpen() {
        openTimer = nil
        // Re-resolve the app at fire time (the debounce window may have changed it).
        guard enabled, mic.isRunning, let app = detectApp(), !inCall else { return }
        inCall = true
        currentApp = app
        status = .inCall(app)
        onCallStart?(app)
    }

    /// End-debounce expiry: ends the call if it is still absent.
    private func fireClose() {
        closeTimer = nil
        guard enabled, inCall else { return }
        // Re-check at fire time, mirroring `fireOpen`: the call may have been
        // re-detected after this timer was armed (or during the main-actor hop
        // from the timer callback). In that case keep the call open; the next
        // `evaluate()` refreshes the state.
        if mic.isRunning, detectApp() != nil { return }
        inCall = false
        currentApp = nil
        status = .listening
        onCallEnd?()
    }

    private func cancelOpen() { openTimer?.invalidate(); openTimer = nil }
    private func cancelClose() { closeTimer?.invalidate(); closeTimer = nil }

    // MARK: - App detection

    /// A call is active when a known call app is actually using the mic.
    /// On macOS 14+ we attribute mic usage per-process (robust start AND stop,
    /// works for Signal/Zoom/Teams/Meet, ignores our own recording). On macOS 13
    /// we fall back to the per-app call-window heuristic.
    private func detectApp() -> DetectedApp? {
        if #available(macOS 14.0, *) {
            return detectViaMicAttribution()
        }
        return detectViaWindowTitle()
    }

    /// Returns the first running app (other than this process) that is in the
    /// mic-using PID set and is either a known native call app or a browser
    /// showing a Meet call window; `nil` if there is none.
    @available(macOS 14.0, *)
    private func detectViaMicAttribution() -> DetectedApp? {
        let micPIDs = AudioInputProcesses.micUsingPIDs()
        guard !micPIDs.isEmpty else { return nil }
        let selfPID = NSRunningApplication.current.processIdentifier
        for app in NSWorkspace.shared.runningApplications {
            let pid = app.processIdentifier
            guard pid != selfPID, micPIDs.contains(pid), let id = app.bundleIdentifier else { continue }
            if let native = Self.nativeApps.first(where: { $0.id == id }) {
                return native.app // Signal/Zoom/Teams using the mic = in a call
            }
            // A browser using the mic + a Meet window = a Meet call. The mic state
            // gives reliable start/stop; the window check keeps non-Meet browser
            // mic use (other web apps) from being mislabeled as a Meet recording.
            if Self.browserIDs.contains(id), pidHasMeetWindow(pid) {
                return .meet
            }
        }
        return nil
    }

    /// Whether any window owned by `pid` has a title that looks like an active Meet call.
    private func pidHasMeetWindow(_ pid: pid_t) -> Bool {
        guard let info = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]]
        else { return false }
        for w in info {
            guard let wpid = w[kCGWindowOwnerPID as String] as? pid_t, wpid == pid,
                  let title = w[kCGWindowName as String] as? String else { continue }
            if Self.looksLikeMeet(title) { return true }
        }
        return false
    }

    /// macOS 13 fallback: detect by the presence of a call WINDOW per app.
    private func detectViaWindowTitle() -> DetectedApp? {
        var pidToApp: [pid_t: DetectedApp] = [:]
        var browserPIDs = Set<pid_t>()
        for app in NSWorkspace.shared.runningApplications {
            guard let id = app.bundleIdentifier else { continue }
            if let native = Self.nativeApps.first(where: { $0.id == id }) {
                pidToApp[app.processIdentifier] = native.app
            } else if Self.browserIDs.contains(id) {
                browserPIDs.insert(app.processIdentifier)
            }
        }
        // No candidate process running: skip the window-list query entirely.
        guard !pidToApp.isEmpty || !browserPIDs.isEmpty else { return nil }
        guard let infoList = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]] else {
            return nil
        }
        for info in infoList {
            guard let pid = info[kCGWindowOwnerPID as String] as? pid_t,
                  let title = info[kCGWindowName as String] as? String,
                  !title.isEmpty else { continue }
            if browserPIDs.contains(pid), Self.looksLikeMeet(title) { return .meet }
            if let app = pidToApp[pid], Self.isCallWindow(app, title) { return app }
        }
        return nil
    }

    /// Per-app in-call window-title signatures (macOS 13 fallback only).
    /// Matching is case-insensitive substring search.
    private static func isCallWindow(_ app: DetectedApp, _ title: String) -> Bool {
        let t = title.lowercased()
        switch app {
        case .zoom, .teams:
            // Any title containing "meeting" counts (this already covers "Zoom Meeting").
            return t.contains("meeting")
        case .signal:
            return t.contains("signal call") || t.contains("group call")
        case .meet:
            return false // handled via the browser path, not here
        }
    }

    /// Match an ACTIVE Google Meet call. Verified against real Firefox titles:
    /// in a call the title is "Meet - <code>" (e.g. "Meet - tjh-pixe-ier"), while
    /// the home/lobby/"you left" pages are bare "Meet" or "Google Meet". Matching
    /// only the "Meet <dash> " form is what lets auto-STOP fire when you leave (and
    /// avoids false-starting on the home page). Also excludes "Zoom Meeting" etc.
    ///
    /// The dash may be a hyphen-minus, en dash or em dash. A bare "Meet " prefix
    /// is deliberately NOT accepted: it would also match unrelated pages whose
    /// title merely begins with the word "Meet".
    private static func looksLikeMeet(_ title: String) -> Bool {
        let t = title.lowercased()
        return meetCallTitlePrefixes.contains { t.hasPrefix($0) }
    }
}