Files
ten31-transcripts/Ten31Transcripts/Session/SessionController.swift
T
Grant Gilliam 863136aeec Phases 2-6: detection, visual timeline, backend hand-off, voiceprints
Phase 2 (call detection): CallDetector using CoreAudio per-process mic
attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet,
ignoring our own recording; auto-record toggle. Built; pending live multi-app
confirmation by the user.

Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation,
TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema
1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR +
saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver
(window capture; frames released, never saved; minimized->visual_gap, idle != gap).
Synthetic-frame tested; adapter geometry pending real Signal fixtures + live
VisualObserver validation.

Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential,
TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline
slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated
END-TO-END against the live backend (chunk -> label-merge -> speakers.json).

Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named
fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status,
auto-send toggle (default off) + self-name setting.

All adversarial-review findings fixed. App + XCTest suite build; tests pass.
2026-06-06 00:15:49 -05:00

378 lines
15 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import Foundation
import Combine
import AppKit
import CoreGraphics
/// Summary of a finished recording session, published for the UI.
struct SessionInfo {
    /// Session folder the recording was written into.
    let folder: URL
    /// Location of the mixed-down WAV reported by the recorder.
    let mixedURL: URL
    /// Recording length in seconds.
    let duration: Double
    /// Number of mic-VAD "self" spans found in the recording.
    let selfSpanCount: Int
}

// Field-wise equality is synthesized by the compiler.
extension SessionInfo: Equatable {}
/// Owns a single recording session: creates the session folder, drives
/// `AudioRecorder` start/stop, tracks elapsed time, and writes the Phase-1
/// preview of mic-VAD self spans. It also reacts to `CallDetector` events
/// (auto-record) and hands the last finished session to `TranscriptPipeline`.
///
/// The lifecycle is serialized through an explicit state machine so start and
/// stop can never interleave (`.starting` -> `.recording` -> `.finishing`).
@MainActor
final class SessionController: ObservableObject {
    enum State: Equatable {
        case idle
        case starting
        case recording
        case finishing
        case error(String)
    }

    /// Backend transcription status for the most recent session.
    enum TranscriptStatus: Equatable {
        case idle
        case processing(Int, Int) // chunk done, total
        case done(speakers: Int, segments: Int)
        case failed(String)
    }

    /// Set in init so `AppDelegate.applicationShouldTerminate` can finalize a
    /// recording in progress before the app quits.
    static weak var shared: SessionController?

    @Published private(set) var state: State = .idle
    @Published private(set) var elapsed: TimeInterval = 0
    @Published private(set) var lastSession: SessionInfo?
    /// Live input peak levels (0-1) while recording, for the UI meters.
    @Published private(set) var micLevel: Float = 0
    @Published private(set) var systemLevel: Float = 0
    /// Surfaced after a session if system audio stopped early.
    @Published private(set) var warning: String?
    /// Mirrored from `CallDetector` for the UI.
    @Published private(set) var detectionStatus: CallDetector.Status = .disabled
    /// Backend transcription status for the last session.
    @Published private(set) var transcriptStatus: TranscriptStatus = .idle

    private let settings: AppSettings
    private var voiceprints: VoiceprintStore
    private let detector = CallDetector()
    private var cancellables = Set<AnyCancellable>()
    private var currentLabel = "manual"

    /// Inputs needed to (re)process the last finished session through the backend.
    private struct ProcessInputs {
        let folder: URL
        let sessionId: String
        let app: String
        let mixedURL: URL
        let selfSpans: [VADSpan]
    }

    private var lastProcess: ProcessInputs?
    /// The most recently spawned backend run. A newer run waits for the one it
    /// replaces here, so cancelling this task reaches every run still in flight.
    private var processTask: Task<Void, Never>?
    /// Bumped whenever `transcriptStatus` is re-targeted (a session finishes or
    /// a new backend run starts). A run may only publish status while the
    /// generation it captured is still current, so a superseded run can't
    /// overwrite the status shown for a newer session.
    private var processGeneration = 0

    private var recorder: AudioRecorder?
    private var currentFolder: URL?
    private var startTime: Date?
    private var timer: Timer?
    /// True when the current session was started by call detection (not the user).
    private var autoStarted = false
    /// Set if a detected call ends while we're still in `.starting`.
    private var pendingAutoStop = false
    /// The in-flight start or stop Task, so `prepareForTermination` can await it.
    private var lifecycleTask: Task<Void, Never>?
    /// Bumped each time a start/stop Task is spawned (Task is a value type, so this
    /// is how `prepareForTermination` detects a newly-spawned transition).
    private var lifecycleGeneration = 0

    /// Wires the controller to `settings` and the call detector.
    ///
    /// - Parameter settings: App settings; read for the output folder, the
    ///   auto-record / auto-send toggles and the backend configuration.
    init(settings: AppSettings) {
        self.settings = settings
        self.voiceprints = VoiceprintStore(
            fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json"))
        SessionController.shared = self

        detector.onCallStart = { [weak self] app in self?.handleCallStart(app) }
        detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
        detector.$status
            .sink { [weak self] status in self?.detectionStatus = status }
            .store(in: &cancellables)

        // Re-point the voiceprint DB if the output folder changes. The in-flight
        // pipeline keeps its own captured reference, so this can't disrupt a run.
        settings.$outputFolderPath
            .dropFirst()
            .sink { [weak self] path in
                guard let self else { return }
                let dir = URL(fileURLWithPath: (path as NSString).expandingTildeInPath, isDirectory: true)
                self.voiceprints = VoiceprintStore(fileURL: dir.appendingPathComponent("voiceprints.json"))
            }
            .store(in: &cancellables)

        settings.$autoRecordOnDetection
            .sink { [weak self] on in
                guard let self else { return }
                if on {
                    self.detector.enable()
                } else {
                    self.detector.disable()
                    // Don't leave an auto-started session running with no detector:
                    // handle both .recording and the in-flight .starting case.
                    if self.autoStarted {
                        switch self.state {
                        case .recording: self.stop()
                        case .starting: self.pendingAutoStop = true
                        case .idle, .finishing, .error: break
                        }
                    }
                }
            }
            .store(in: &cancellables)
    }

    // MARK: - Auto-detection

    /// Detector callback: begins an auto-started session unless one is already active.
    private func handleCallStart(_ app: CallDetector.DetectedApp) {
        guard settings.autoRecordOnDetection else { return }
        switch state {
        case .idle, .error: start(label: app.label, auto: true)
        case .starting, .recording, .finishing: break // don't disturb an active session
        }
    }

    /// Detector callback: ends the session, but only if detection started it.
    private func handleCallEnd() {
        // Only auto-stop a session we auto-started; never a manual recording.
        guard autoStarted else { return }
        switch state {
        case .recording: stop()
        case .starting: pendingAutoStop = true // resolved when start() completes
        case .idle, .error, .finishing: break
        }
    }

    /// True while a session is starting, recording or finishing.
    var isBusy: Bool {
        state == .starting || state == .recording || state == .finishing
    }

    /// Record-button action: start when idle or in error, stop when recording.
    func toggle() {
        switch state {
        case .idle, .error: start()
        case .recording: stop()
        case .starting, .finishing: break // ignore taps mid-transition
        }
    }

    // MARK: - Start / Stop

    /// Creates the session folder and spawns the lifecycle Task that starts the recorder.
    ///
    /// - Parameters:
    ///   - label: Suffix for the session folder name; also stored as the `app`
    ///     value handed to the backend.
    ///   - auto: `true` when call detection (not the user) started the session.
    private func start(label: String = "manual", auto: Bool = false) {
        let folder: URL
        do {
            folder = try makeSessionFolder(label: label)
        } catch {
            fail("Couldn't create session folder: \(error.localizedDescription)")
            return
        }
        currentFolder = folder
        currentLabel = label
        autoStarted = auto
        pendingAutoStop = false

        let recorder = AudioRecorder(
            micURL: folder.appendingPathComponent("mic.wav"),
            systemURL: folder.appendingPathComponent("system.wav"),
            mixedURL: folder.appendingPathComponent("mixed_mono_16k.wav"))
        self.recorder = recorder
        warning = nil
        state = .starting

        lifecycleGeneration += 1
        lifecycleTask = Task {
            do {
                try await recorder.start() // self-tears-down if it throws
                self.state = .recording
                self.startTime = Date()
                self.startTimer()
                // A detected call may have ended while we were still starting.
                if self.pendingAutoStop {
                    self.pendingAutoStop = false
                    self.stop()
                }
            } catch {
                self.handleStartFailure(error)
            }
        }
    }

    /// Map a recorder start failure to an actionable message. The common case is
    /// Screen Recording getting re-checked after a rebuild (the SCStream auth
    /// check fails even though CGPreflight reports granted), so re-prompt and open
    /// the right Settings pane rather than show a cryptic TCC error.
    private func handleStartFailure(_ error: Error) {
        // NOTE(review): this is a substring heuristic on the localized message;
        // any failure whose text contains "permission" is routed to the
        // Screen Recording pane. Confirm against the errors AudioRecorder throws.
        let msg = error.localizedDescription.lowercased()
        let screenIssue = msg.contains("declined") || msg.contains("tcc")
            || msg.contains("screen") || msg.contains("permission")
        if screenIssue {
            _ = CGRequestScreenCaptureAccess()
            if let url = URL(string: "x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture") {
                NSWorkspace.shared.open(url)
            }
            fail("Screen Recording needs re-approval for this build. Toggle Ten31Transcripts off then on in System Settings ▸ Screen Recording, then restart the app.")
        } else {
            fail("Couldn't start recording: \(error.localizedDescription)")
        }
    }

    /// Moves to `.finishing` and spawns the lifecycle Task that stops the recorder.
    private func stop() {
        guard let recorder else { return }
        state = .finishing
        stopTimer()
        lifecycleGeneration += 1
        lifecycleTask = Task {
            let result = await recorder.stop()
            self.finish(result)
        }
    }

    /// Publishes the finished session, resets per-session state, and optionally
    /// kicks off the backend hand-off.
    private func finish(_ result: RecordingResult) {
        recorder = nil
        // Clear the clock origin so a timer tick that was already queued can't
        // repaint a non-zero `elapsed` after we go idle.
        startTime = nil
        micLevel = 0
        systemLevel = 0
        warning = result.systemNote.map { "System audio stopped early: \($0)" }

        // The status now describes the session that just finished. Invalidate any
        // backend run still working on an older session so its progress/result
        // can't overwrite it.
        processGeneration += 1
        transcriptStatus = .idle

        if let folder = currentFolder {
            writeSelfSpans(result, to: folder)
            lastSession = SessionInfo(
                folder: folder, mixedURL: result.mixedURL,
                duration: result.duration, selfSpanCount: result.selfSpans.count)
            lastProcess = ProcessInputs(
                folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
                mixedURL: result.mixedURL, selfSpans: result.selfSpans)
        }

        let autoSend = settings.autoSendOnStop
        currentFolder = nil
        autoStarted = false
        pendingAutoStop = false
        elapsed = 0
        state = .idle
        if autoSend { processLastSession() }
    }

    // MARK: - Backend transcription

    /// Send the last finished session to the backend -> `speakers.json`. Uses the
    /// mic-VAD self spans as the timeline for now; visual segments (Phase 3-4) get
    /// merged in once the adapters land. Safe to call manually ("Send to backend")
    /// or automatically on stop.
    ///
    /// Does nothing when there is no finished session or a run for it is already
    /// in progress. If a run for an *older* session is still in flight, the new
    /// run waits for it instead of running beside it, and only the newest run
    /// updates `transcriptStatus`.
    func processLastSession() {
        guard let inputs = lastProcess else { return }
        if case .processing = transcriptStatus { return }

        processGeneration += 1
        let generation = processGeneration
        transcriptStatus = .processing(0, 1)

        let settings = self.settings
        let voiceprints = self.voiceprints
        let previous = processTask
        processTask = Task {
            let pipeline = TranscriptPipeline(
                baseURL: settings.backendBaseURL,
                skipTLS: settings.skipTLSVerification,
                voiceprints: voiceprints)
            let timeline = TranscriptPipeline.timeline(
                fromSelfSpans: inputs.selfSpans, selfName: settings.selfName)

            // Queue behind a run that is still working on an earlier session.
            // Forward cancellation so `prepareForTermination` reaches it too.
            if let previous {
                await withTaskCancellationHandler {
                    await previous.value
                } onCancel: {
                    previous.cancel()
                }
            }

            do {
                // We may have been cancelled while waiting in the queue.
                try Task.checkCancellation()
                let speakers = try await pipeline.process(
                    sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
                    mixedURL: inputs.mixedURL, timeline: timeline,
                    progress: { done, total in
                        await MainActor.run {
                            self.publishTranscriptStatus(.processing(done, total), generation: generation)
                        }
                    })
                self.publishTranscriptStatus(
                    .done(speakers: speakers.speakers.count, segments: speakers.segments.count),
                    generation: generation)
            } catch is CancellationError {
                self.publishTranscriptStatus(.idle, generation: generation)
            } catch {
                self.publishTranscriptStatus(.failed(error.localizedDescription), generation: generation)
            }
        }
    }

    /// Updates `transcriptStatus` only if `generation` is still the current
    /// backend run; updates from superseded runs are dropped.
    private func publishTranscriptStatus(_ status: TranscriptStatus, generation: Int) {
        guard generation == processGeneration else { return }
        transcriptStatus = status
    }

    /// Abandons the current session and surfaces `message` as the error state.
    private func fail(_ message: String) {
        recorder = nil
        currentFolder = nil
        autoStarted = false
        pendingAutoStop = false
        stopTimer()
        startTime = nil
        micLevel = 0
        systemLevel = 0
        elapsed = 0
        state = .error(message)
    }

    /// Called from `applicationShouldTerminate`: flush any in-progress session so
    /// its WAV headers are finalized before the process exits. Handles quit while
    /// `.starting` and `.finishing`, not just `.recording`.
    func prepareForTermination() async {
        // Cancel any in-flight backend transcription (audio is already saved; the
        // user can resend). The pipeline's checkCancellation + defer clean up chunks.
        processTask?.cancel()

        // Drain whatever lifecycle Task is in flight until nothing is busy. A Stop
        // click landing in an await window can spawn a new stop Task, so loop
        // rather than awaiting a single captured task.
        while isBusy {
            let gen = lifecycleGeneration
            await lifecycleTask?.value
            if state == .recording, let recorder {
                state = .finishing
                stopTimer()
                finish(await recorder.stop())
            } else if lifecycleGeneration == gen {
                break // settled: no new transition was spawned
            }
        }
    }

    // MARK: - Timer

    /// Starts the 0.1 s tick that refreshes `elapsed` and the level meters.
    private func startTimer() {
        timer = Timer.scheduledTimer(withTimeInterval: 0.1, repeats: true) { [weak self] _ in
            Task { @MainActor in
                guard let self else { return }
                if let start = self.startTime { self.elapsed = Date().timeIntervalSince(start) }
                if let recorder = self.recorder {
                    let levels = recorder.currentLevels()
                    self.micLevel = levels.mic
                    self.systemLevel = levels.system
                }
            }
        }
    }

    private func stopTimer() {
        timer?.invalidate()
        timer = nil
    }

    // MARK: - Files

    /// Creates `<output>/sessions/<timestamp>_<label>` and returns its URL.
    ///
    /// - Throws: Whatever `FileManager.createDirectory` throws.
    private func makeSessionFolder(label: String) throws -> URL {
        let base = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true)
        // A "/" in the label would nest directories and make the session id
        // (`lastPathComponent`) differ from the intended folder name.
        let safeLabel = label.replacingOccurrences(of: "/", with: "-")
        let folder = base.appendingPathComponent("\(Self.timestamp())_\(safeLabel)", isDirectory: true)
        try FileManager.default.createDirectory(at: folder, withIntermediateDirectories: true)
        return folder
    }

    /// Current date formatted as `yyyy-MM-dd'T'HH-mm-ss` for use in folder names.
    private static func timestamp() -> String {
        let f = DateFormatter()
        f.locale = Locale(identifier: "en_US_POSIX")
        f.dateFormat = "yyyy-MM-dd'T'HH-mm-ss"
        return f.string(from: Date())
    }

    /// Phase-1 preview of the mic-VAD "self" spans (the eventual
    /// `visual_timeline.json` `mic_vad` segments). Lets us eyeball VAD quality.
    ///
    /// Best effort: a failure is logged and never affects the session itself.
    private func writeSelfSpans(_ result: RecordingResult, to folder: URL) {
        let segments = result.selfSpans.map { span -> [String: Any] in
            ["start": span.start, "end": span.end, "name": "self",
             "confidence": span.confidence, "source": "mic_vad"]
        }
        let object: [String: Any] = [
            "note": "Phase 1 mic-VAD self spans (preview of visual_timeline segments)",
            "t0_unix": result.t0Unix,
            "duration_sec": result.duration,
            "self_spans": segments,
        ]

        // `data(withJSONObject:)` raises an Objective-C exception (not a Swift
        // error) for payloads that aren't valid JSON, e.g. NaN or infinite
        // numbers, so validate first instead of relying on `try`.
        guard JSONSerialization.isValidJSONObject(object) else {
            NSLog("%@", "SessionController: skipped self_vad.json (payload is not valid JSON)" as NSString)
            return
        }
        do {
            let data = try JSONSerialization.data(withJSONObject: object,
                                                  options: [.prettyPrinted, .sortedKeys])
            try data.write(to: folder.appendingPathComponent("self_vad.json"))
        } catch {
            NSLog("%@", "SessionController: couldn't write self_vad.json: \(error.localizedDescription)" as NSString)
        }
    }
}