Phase 1: dual-track audio capture → mixed-mono 16 kHz WAV + mic VAD

AudioRecorder captures system audio (ScreenCaptureKit) + mic (AVAudioEngine) on a
single serial ioQueue, one shared monotonic t0, time-driven writers (pad gaps /
trim overlaps) so tracks stay aligned, and an energy mic-VAD for 'self' spans.
AudioMixer sums the aligned tracks into mixed_mono_16k.wav. SessionController
drives a serialized start/stop state machine, writes the session folder +
self_vad.json, exposes live level meters, and finalizes on quit.

Hardening from review: ioQueue single-domain (no races), stop() never hangs
(mic-first teardown + bounded stopCapture), layout-agnostic mic deep-copy,
discard-only video output to keep SCStream alive, VAD lockstep on committed
frames, stable signing team in project.yml, single-instance enforcement.
This commit is contained in:
Grant Gilliam
2026-06-05 21:30:11 -05:00
parent b2ae3a62b9
commit fd7e1a5907
12 changed files with 1018 additions and 10 deletions
@@ -0,0 +1,213 @@
import Foundation
import Combine
import AppKit
/// Summary of a finished recording session, published by `SessionController`
/// as `lastSession` once a stop has completed.
struct SessionInfo: Equatable {
    /// Folder that holds every artifact written for the session.
    let folder: URL

    /// Location of the mixed recording as reported by the recorder's result.
    let mixedURL: URL

    /// Length of the recording, in seconds.
    let duration: Double

    /// Number of mic-VAD "self" spans detected during the session.
    let selfSpanCount: Int
}
/// Owns a single recording session: creates the session folder, drives
/// `AudioRecorder` start/stop, tracks elapsed time, and writes the Phase-1
/// preview of mic-VAD self spans. Detection/visual/backend wiring come later.
///
/// The lifecycle is serialized through an explicit state machine so start and
/// stop can never interleave (`.starting` → `.recording` → `.finishing`).
@MainActor
final class SessionController: ObservableObject {
    /// Lifecycle state of the controller.
    enum State: Equatable {
        /// Nothing in flight; a new session may be started.
        case idle
        /// The recorder's `start()` is in flight.
        case starting
        /// The recorder is running and the elapsed/level timer is ticking.
        case recording
        /// The recorder's `stop()` is in flight.
        case finishing
        /// The last start attempt failed; the payload is a user-facing message.
        case error(String)
    }

    /// Set in init so `AppDelegate.applicationShouldTerminate` can finalize a
    /// recording in progress before the app quits.
    static weak var shared: SessionController?

    @Published private(set) var state: State = .idle
    @Published private(set) var elapsed: TimeInterval = 0
    @Published private(set) var lastSession: SessionInfo?
    /// Live input peak levels (0–1) while recording, for the UI meters.
    @Published private(set) var micLevel: Float = 0
    @Published private(set) var systemLevel: Float = 0
    /// Surfaced after a session if system audio stopped early.
    @Published private(set) var warning: String?

    private let settings: AppSettings
    private var recorder: AudioRecorder?
    private var currentFolder: URL?
    private var startTime: Date?
    private var timer: Timer?
    /// The in-flight start or stop Task, so `prepareForTermination` can await it.
    private var lifecycleTask: Task<Void, Never>?
    /// Bumped each time a start/stop Task is spawned (Task is a value type, so this
    /// is how `prepareForTermination` detects a newly-spawned transition).
    private var lifecycleGeneration = 0

    /// Creates the controller and registers it as `SessionController.shared`.
    /// - Parameter settings: Supplies the output folder under which session
    ///   folders are created.
    init(settings: AppSettings) {
        self.settings = settings
        SessionController.shared = self
    }

    /// `true` while a session is starting, recording, or finishing.
    var isBusy: Bool {
        switch state {
        case .starting, .recording, .finishing: return true
        case .idle, .error: return false
        }
    }

    /// Starts a session when idle (or after an error), stops it when recording,
    /// and does nothing while a transition is already in flight.
    func toggle() {
        switch state {
        case .idle, .error: start()
        case .recording: stop()
        case .starting, .finishing: break // ignore taps mid-transition
        }
    }

    // MARK: - Start / Stop

    /// Creates the session folder and recorder, then starts the recorder in a
    /// lifecycle Task. Ends in `.recording` on success or `.error` on failure.
    private func start() {
        let folder: URL
        do {
            folder = try makeSessionFolder()
        } catch {
            fail("Couldn't create session folder: \(error.localizedDescription)")
            return
        }
        currentFolder = folder

        let recorder = AudioRecorder(
            micURL: folder.appendingPathComponent("mic.wav"),
            systemURL: folder.appendingPathComponent("system.wav"),
            mixedURL: folder.appendingPathComponent("mixed_mono_16k.wav"))
        self.recorder = recorder
        warning = nil
        state = .starting

        lifecycleGeneration += 1
        lifecycleTask = Task {
            do {
                try await recorder.start() // self-tears-down if it throws
                self.state = .recording
                self.startTime = Date()
                self.startTimer()
            } catch {
                self.fail("Couldn't start recording: \(error.localizedDescription)")
            }
        }
    }

    /// Moves to `.finishing` and stops the recorder in a lifecycle Task, which
    /// hands the result to `finish(_:)`.
    private func stop() {
        guard let recorder else { return }
        state = .finishing
        stopTimer()

        lifecycleGeneration += 1
        lifecycleTask = Task {
            let result = await recorder.stop()
            self.finish(result)
        }
    }

    /// Publishes the outcome of a stopped recording and returns to `.idle`.
    /// - Parameter result: What the recorder reported from `stop()`.
    private func finish(_ result: RecordingResult) {
        recorder = nil
        resetLiveState()
        warning = result.systemNote.map { "System audio stopped early: \($0)" }
        if let folder = currentFolder {
            writeSelfSpans(result, to: folder)
            lastSession = SessionInfo(
                folder: folder, mixedURL: result.mixedURL,
                duration: result.duration, selfSpanCount: result.selfSpans.count)
        }
        currentFolder = nil
        state = .idle
    }

    /// Drops the current session and moves to `.error(message)`.
    /// - Parameter message: User-facing description of what went wrong.
    private func fail(_ message: String) {
        recorder = nil
        currentFolder = nil
        stopTimer()
        resetLiveState()
        state = .error(message)
    }

    /// Zeroes the published meters and elapsed time and forgets the start time,
    /// so a late timer tick has nothing left to republish.
    private func resetLiveState() {
        startTime = nil
        micLevel = 0
        systemLevel = 0
        elapsed = 0
    }

    /// Called from `applicationShouldTerminate`: flush any in-progress session so
    /// its WAV headers are finalized before the process exits. Handles quit while
    /// `.starting` and `.finishing`, not just `.recording`.
    func prepareForTermination() async {
        // Drain whatever lifecycle Task is in flight until nothing is busy. A Stop
        // click landing in an await window can spawn a new stop Task, so loop
        // rather than awaiting a single captured task.
        while isBusy {
            let generationBeforeAwait = lifecycleGeneration
            await lifecycleTask?.value
            if state == .recording, let recorder {
                // Still recording after the start Task settled: stop inline.
                state = .finishing
                stopTimer()
                finish(await recorder.stop())
            } else if lifecycleGeneration == generationBeforeAwait {
                break // settled: no new transition was spawned
            }
        }
    }

    // MARK: - Timer

    /// Schedules the repeating 0.1 s timer that drives `tick()`.
    private func startTimer() {
        stopTimer() // never leave an earlier timer running
        timer = Timer.scheduledTimer(withTimeInterval: 0.1, repeats: true) { [weak self] _ in
            Task { @MainActor in
                self?.tick()
            }
        }
    }

    /// Refreshes `elapsed` and the level meters from the running recorder.
    private func tick() {
        // A tick Task queued just before `invalidate()` can still run; ignore it
        // once the session has left `.recording`.
        guard state == .recording else { return }
        if let startTime {
            elapsed = Date().timeIntervalSince(startTime)
        }
        if let recorder {
            let levels = recorder.currentLevels()
            micLevel = levels.mic
            systemLevel = levels.system
        }
    }

    /// Invalidates and releases the timer, if any.
    private func stopTimer() {
        timer?.invalidate()
        timer = nil
    }

    // MARK: - Files

    /// Creates `<output folder>/sessions/<timestamp>_manual`, including any
    /// missing intermediate directories.
    /// - Returns: The URL of the session folder.
    /// - Throws: Whatever `FileManager.createDirectory` throws.
    private func makeSessionFolder() throws -> URL {
        let base = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true)
        let folder = base.appendingPathComponent("\(Self.timestamp())_manual", isDirectory: true)
        try FileManager.default.createDirectory(at: folder, withIntermediateDirectories: true)
        return folder
    }

    /// Current date formatted as `yyyy-MM-dd'T'HH-mm-ss` (hyphens instead of
    /// colons in the time part) for use in the session folder name.
    private static func timestamp() -> String {
        let formatter = DateFormatter()
        formatter.locale = Locale(identifier: "en_US_POSIX")
        formatter.dateFormat = "yyyy-MM-dd'T'HH-mm-ss"
        return formatter.string(from: Date())
    }

    /// Phase-1 preview of the mic-VAD "self" spans (the eventual
    /// `visual_timeline.json` `mic_vad` segments). Lets us eyeball VAD quality.
    ///
    /// Best-effort: a failure is logged and never interrupts finalization.
    /// - Parameters:
    ///   - result: Recorder result whose `selfSpans` are serialized.
    ///   - folder: Session folder that receives `self_vad.json`.
    private func writeSelfSpans(_ result: RecordingResult, to folder: URL) {
        let segments = result.selfSpans.map { span -> [String: Any] in
            ["start": span.start, "end": span.end, "name": "self",
             "confidence": span.confidence, "source": "mic_vad"]
        }
        let object: [String: Any] = [
            "note": "Phase 1 mic-VAD self spans (preview of visual_timeline segments)",
            "t0_unix": result.t0Unix,
            "duration_sec": result.duration,
            "self_spans": segments,
        ]
        let url = folder.appendingPathComponent("self_vad.json")

        // `data(withJSONObject:)` raises an Objective-C exception (not a Swift
        // error) for objects it cannot encode, e.g. NaN or infinite numbers, and
        // `try` cannot catch that. Validate first.
        guard JSONSerialization.isValidJSONObject(object) else {
            NSLog("SessionController: self spans are not JSON-serializable; skipping %@", url.path)
            return
        }
        do {
            let data = try JSONSerialization.data(
                withJSONObject: object, options: [.prettyPrinted, .sortedKeys])
            // Atomic so a quit mid-write can't leave a truncated file behind.
            try data.write(to: url, options: .atomic)
        } catch {
            NSLog("SessionController: couldn't write %@: %@", url.path, error.localizedDescription)
        }
    }
}