Phases 2-6: detection, visual timeline, backend hand-off, voiceprints

Phase 2 (call detection): CallDetector using CoreAudio per-process mic
attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet,
ignoring our own recording; auto-record toggle. Built; pending live multi-app
confirmation by the user.

Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation,
TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema
1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR +
saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver
(window capture; frames released, never saved; minimized->visual_gap, idle != gap).
Synthetic-frame tested; adapter geometry pending real Signal fixtures + live
VisualObserver validation.

Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential,
TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline
slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated
END-TO-END against the live backend (chunk -> label-merge -> speakers.json).

Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named
fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status,
auto-send toggle (default off) + self-name setting.

All adversarial-review findings fixed. App + XCTest suite build; tests pass.
This commit is contained in:
Grant Gilliam
2026-06-06 00:15:49 -05:00
parent fd7e1a5907
commit 863136aeec
27 changed files with 2108 additions and 22 deletions
@@ -1,6 +1,7 @@
import Foundation
import Combine
import AppKit
import CoreGraphics
struct SessionInfo: Equatable {
let folder: URL
@@ -25,6 +26,14 @@ final class SessionController: ObservableObject {
case error(String)
}
/// Backend transcription status for the most recent session.
enum TranscriptStatus: Equatable {
case idle
case processing(Int, Int) // chunk done, total
case done(speakers: Int, segments: Int)
case failed(String)
}
/// Set in init so `AppDelegate.applicationShouldTerminate` can finalize a
/// recording in progress before the app quits.
static weak var shared: SessionController?
@@ -37,12 +46,34 @@ final class SessionController: ObservableObject {
@Published private(set) var systemLevel: Float = 0
/// Surfaced after a session if system audio stopped early.
@Published private(set) var warning: String?
/// Mirrored from `CallDetector` for the UI.
@Published private(set) var detectionStatus: CallDetector.Status = .disabled
/// Backend transcription status for the last session.
@Published private(set) var transcriptStatus: TranscriptStatus = .idle
private let settings: AppSettings
private var voiceprints: VoiceprintStore
private let detector = CallDetector()
private var cancellables = Set<AnyCancellable>()
private var currentLabel = "manual"
/// Inputs needed to (re)process the last finished session through the backend.
private struct ProcessInputs {
let folder: URL
let sessionId: String
let app: String
let mixedURL: URL
let selfSpans: [VADSpan]
}
private var lastProcess: ProcessInputs?
private var processTask: Task<Void, Never>?
private var recorder: AudioRecorder?
private var currentFolder: URL?
private var startTime: Date?
private var timer: Timer?
/// True when the current session was started by call detection (not the user).
private var autoStarted = false
/// Set if a detected call ends while we're still in `.starting`.
private var pendingAutoStop = false
/// The in-flight start or stop Task, so `prepareForTermination` can await it.
private var lifecycleTask: Task<Void, Never>?
/// Bumped each time a start/stop Task is spawned (Task is a value type, so this
@@ -51,7 +82,64 @@ final class SessionController: ObservableObject {
init(settings: AppSettings) {
self.settings = settings
self.voiceprints = VoiceprintStore(
fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json"))
SessionController.shared = self
detector.onCallStart = { [weak self] app in self?.handleCallStart(app) }
detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
detector.$status
.sink { [weak self] status in self?.detectionStatus = status }
.store(in: &cancellables)
// Re-point the voiceprint DB if the output folder changes. The in-flight
// pipeline keeps its own captured reference, so this can't disrupt a run.
settings.$outputFolderPath
.dropFirst()
.sink { [weak self] path in
guard let self else { return }
let dir = URL(fileURLWithPath: (path as NSString).expandingTildeInPath, isDirectory: true)
self.voiceprints = VoiceprintStore(fileURL: dir.appendingPathComponent("voiceprints.json"))
}
.store(in: &cancellables)
settings.$autoRecordOnDetection
.sink { [weak self] on in
guard let self else { return }
if on {
self.detector.enable()
} else {
self.detector.disable()
// Don't leave an auto-started session running with no detector
// handle both .recording and the in-flight .starting case.
if self.autoStarted {
switch self.state {
case .recording: self.stop()
case .starting: self.pendingAutoStop = true
default: break
}
}
}
}
.store(in: &cancellables)
}
// MARK: - Auto-detection
private func handleCallStart(_ app: CallDetector.DetectedApp) {
guard settings.autoRecordOnDetection else { return }
switch state {
case .idle, .error: start(label: app.label, auto: true)
case .starting, .recording, .finishing: break // don't disturb an active session
}
}
private func handleCallEnd() {
// Only auto-stop a session we auto-started; never a manual recording.
guard autoStarted else { return }
switch state {
case .recording: stop()
case .starting: pendingAutoStop = true // resolved when start() completes
case .idle, .error, .finishing: break
}
}
var isBusy: Bool {
@@ -68,15 +156,18 @@ final class SessionController: ObservableObject {
// MARK: - Start / Stop
private func start() {
private func start(label: String = "manual", auto: Bool = false) {
let folder: URL
do {
folder = try makeSessionFolder()
folder = try makeSessionFolder(label: label)
} catch {
fail("Couldn't create session folder: \(error.localizedDescription)")
return
}
currentFolder = folder
currentLabel = label
autoStarted = auto
pendingAutoStop = false
let recorder = AudioRecorder(
micURL: folder.appendingPathComponent("mic.wav"),
systemURL: folder.appendingPathComponent("system.wav"),
@@ -92,12 +183,36 @@ final class SessionController: ObservableObject {
self.state = .recording
self.startTime = Date()
self.startTimer()
// A detected call may have ended while we were still starting.
if self.pendingAutoStop {
self.pendingAutoStop = false
self.stop()
}
} catch {
self.fail("Couldn't start recording: \(error.localizedDescription)")
self.handleStartFailure(error)
}
}
}
/// Map a recorder start failure to an actionable message. The common case is
/// Screen Recording getting re-checked after a rebuild (the SCStream auth
/// check fails even though CGPreflight reports granted), so re-prompt and open
/// the right Settings pane rather than show a cryptic TCC error.
private func handleStartFailure(_ error: Error) {
let msg = error.localizedDescription.lowercased()
let screenIssue = msg.contains("declined") || msg.contains("tcc")
|| msg.contains("screen") || msg.contains("permission")
if screenIssue {
_ = CGRequestScreenCaptureAccess()
if let url = URL(string: "x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture") {
NSWorkspace.shared.open(url)
}
fail("Screen Recording needs re-approval for this build. Toggle Ten31Transcripts off then on in System Settings ▸ Screen Recording, then restart the app.")
} else {
fail("Couldn't start recording: \(error.localizedDescription)")
}
}
private func stop() {
guard let recorder else { return }
state = .finishing
@@ -114,20 +229,66 @@ final class SessionController: ObservableObject {
micLevel = 0
systemLevel = 0
warning = result.systemNote.map { "System audio stopped early: \($0)" }
transcriptStatus = .idle
if let folder = currentFolder {
writeSelfSpans(result, to: folder)
lastSession = SessionInfo(
folder: folder, mixedURL: result.mixedURL,
duration: result.duration, selfSpanCount: result.selfSpans.count)
lastProcess = ProcessInputs(
folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
mixedURL: result.mixedURL, selfSpans: result.selfSpans)
}
let autoSend = settings.autoSendOnStop
currentFolder = nil
autoStarted = false
pendingAutoStop = false
elapsed = 0
state = .idle
if autoSend { processLastSession() }
}
// MARK: - Backend transcription
/// Send the last finished session to the backend `speakers.json`. Uses the
/// mic-VAD self spans as the timeline for now; visual segments (Phase 34) get
/// merged in once the adapters land. Safe to call manually ("Send to backend")
/// or automatically on stop.
func processLastSession() {
guard let inputs = lastProcess else { return }
if case .processing = transcriptStatus { return }
transcriptStatus = .processing(0, 1)
let settings = self.settings
let voiceprints = self.voiceprints
processTask = Task {
let pipeline = TranscriptPipeline(
baseURL: settings.backendBaseURL,
skipTLS: settings.skipTLSVerification,
voiceprints: voiceprints)
let timeline = TranscriptPipeline.timeline(
fromSelfSpans: inputs.selfSpans, selfName: settings.selfName)
do {
let speakers = try await pipeline.process(
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
mixedURL: inputs.mixedURL, timeline: timeline,
progress: { done, total in
await MainActor.run { self.transcriptStatus = .processing(done, total) }
})
self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count)
} catch is CancellationError {
self.transcriptStatus = .idle
} catch {
self.transcriptStatus = .failed(error.localizedDescription)
}
}
}
private func fail(_ message: String) {
recorder = nil
currentFolder = nil
autoStarted = false
pendingAutoStop = false
stopTimer()
micLevel = 0
systemLevel = 0
@@ -139,6 +300,9 @@ final class SessionController: ObservableObject {
/// its WAV headers are finalized before the process exits. Handles quit while
/// `.starting` and `.finishing`, not just `.recording`.
func prepareForTermination() async {
// Cancel any in-flight backend transcription (audio is already saved; the
// user can resend). The pipeline's checkCancellation + defer clean up chunks.
processTask?.cancel()
// Drain whatever lifecycle Task is in flight until nothing is busy. A Stop
// click landing in an await window can spawn a new stop Task, so loop
// rather than awaiting a single captured task.
@@ -178,9 +342,9 @@ final class SessionController: ObservableObject {
// MARK: - Files
private func makeSessionFolder() throws -> URL {
private func makeSessionFolder(label: String) throws -> URL {
let base = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true)
let folder = base.appendingPathComponent("\(Self.timestamp())_manual", isDirectory: true)
let folder = base.appendingPathComponent("\(Self.timestamp())_\(label)", isDirectory: true)
try FileManager.default.createDirectory(at: folder, withIntermediateDirectories: true)
return folder
}