Phase 1: dual-track audio capture → mixed-mono 16 kHz WAV + mic VAD

AudioRecorder captures system audio (ScreenCaptureKit) + mic (AVAudioEngine) on a
single serial ioQueue, one shared monotonic t0, time-driven writers (pad gaps /
trim overlaps) so tracks stay aligned, and an energy mic-VAD for 'self' spans.
AudioMixer sums the aligned tracks into mixed_mono_16k.wav. SessionController
drives a serialized start/stop state machine, writes the session folder +
self_vad.json, exposes live level meters, and finalizes on quit.

Hardening from review: ioQueue single-domain (no races), stop() never hangs
(mic-first teardown + bounded stopCapture), layout-agnostic mic deep-copy,
discard-only video output to keep SCStream alive, VAD lockstep on committed
frames, stable signing team in project.yml, single-instance enforcement.
This commit is contained in:
Grant Gilliam
2026-06-05 21:30:11 -05:00
parent b2ae3a62b9
commit fd7e1a5907
12 changed files with 1018 additions and 10 deletions
+333
View File
@@ -0,0 +1,333 @@
import AVFoundation
import ScreenCaptureKit
import CoreMedia
import QuartzCore
struct RecordingResult {
let micURL: URL
let systemURL: URL
let mixedURL: URL
let duration: Double
let selfSpans: [VADSpan]
let t0Unix: Double
/// Non-nil if system-audio capture stopped early (e.g. SCStream error).
let systemNote: String?
}
/// Dual-track local audio capture for Phase 1.
///
/// - System audio via `SCStream` (`capturesAudio`); its audio handler runs on
/// `ioQueue`. A discard-only video output runs on `screenQueue` purely to keep
/// SCStream's frame pipeline drained (an unconsumed video queue can stall the
/// whole stream) frames are dropped instantly, never stored.
/// - Mic via `AVAudioEngine` input tap: the tap deep-copies the raw buffer and
/// hands it to `ioQueue`, where it is resampled and written.
/// - **`ioQueue` is the single isolation domain** for the writers, VAD, both
/// resamplers, and lifecycle flags.
/// - One shared monotonic `t0` (`CACurrentMediaTime`). Each buffer is placed at
/// its true `(startHost t0)` frame (gaps padded, overlaps trimmed), so mic
/// and system stay aligned and the mix is a straight sum.
/// - Live peak levels are exposed via `currentLevels()` for the UI meter.
/// - `stop()` tears the mic down first and bounds `stopCapture()` with a timeout,
/// so a wedged stream can never block finalization. No video is written.
final class AudioRecorder: NSObject, SCStreamDelegate, SCStreamOutput {
private let micURL: URL
private let systemURL: URL
private let mixedURL: URL
private let ioQueue = DispatchQueue(label: "xyz.ten31.audio.io")
private let screenQueue = DispatchQueue(label: "xyz.ten31.audio.screen")
// ioQueue-only state:
private var t0Host: Double = 0
private var t0Unix: Double = 0
private var micWriter: MonoTrackWriter?
private var systemWriter: MonoTrackWriter?
private var vad: MicVAD?
private var tornDown = true
private let micResampler = Resampler()
private let systemResampler = Resampler()
// Cross-thread, guarded by levelLock:
private let levelLock = NSLock()
private var micPeak: Float = 0
private var sysPeak: Float = 0
private var streamStopped = false
private var systemErrorMessage: String?
private var engine: AVAudioEngine?
private var stream: SCStream?
init(micURL: URL, systemURL: URL, mixedURL: URL) {
self.micURL = micURL
self.systemURL = systemURL
self.mixedURL = mixedURL
}
// MARK: - Lifecycle
func start() async throws {
let t0 = CACurrentMediaTime()
let t0u = Date().timeIntervalSince1970
try ioQueue.sync {
let mic = try MonoTrackWriter(url: self.micURL)
let sys = try MonoTrackWriter(url: self.systemURL)
self.t0Host = t0
self.t0Unix = t0u
self.micWriter = mic
self.systemWriter = sys
self.vad = MicVAD()
self.tornDown = false
}
do {
try startMic()
try await startSystem() // throws if Screen Recording is denied
} catch {
await abortStart()
throw error
}
}
func stop() async -> RecordingResult {
// Stop the mic FIRST always succeeds and halts mic capture immediately.
engine?.inputNode.removeTap(onBus: 0)
engine?.stop()
engine = nil
// Stop system capture WITHOUT hanging: an already-errored stream can make
// stopCapture() block forever, so skip it if it already stopped and bound
// it with a timeout otherwise.
if let stream, !flag({ self.streamStopped }) {
await Self.stopCaptureWithTimeout(stream, seconds: 3)
}
stream = nil
var micFrames: Int64 = 0
var sysFrames: Int64 = 0
var spans: [VADSpan] = []
var t0u: Double = 0
ioQueue.sync {
if let tail = micResampler.drain() {
if (micWriter?.write(tail) ?? 0) > 0 { vad?.feed(tail) }
}
if let tail = systemResampler.drain() { systemWriter?.write(tail) }
vad?.finish()
micFrames = micWriter?.framesWritten ?? 0
sysFrames = systemWriter?.framesWritten ?? 0
spans = vad?.spans ?? []
t0u = t0Unix
tornDown = true
micWriter = nil
systemWriter = nil
vad = nil
}
try? AudioMixer.mix(mic: micURL, system: systemURL, into: mixedURL)
let duration = Double(max(micFrames, sysFrames)) / 16_000.0
let note = flag { self.systemErrorMessage } as String?
return RecordingResult(
micURL: micURL, systemURL: systemURL, mixedURL: mixedURL,
duration: duration, selfSpans: spans, t0Unix: t0u, systemNote: note)
}
private func abortStart() async {
engine?.inputNode.removeTap(onBus: 0)
engine?.stop()
engine = nil
if let stream { await Self.stopCaptureWithTimeout(stream, seconds: 3) }
stream = nil
ioQueue.sync {
tornDown = true
micWriter = nil
systemWriter = nil
vad = nil
}
}
/// Latest peak levels (01) for each source; decays so a stalled source fades.
func currentLevels() -> (mic: Float, system: Float) {
levelLock.lock(); defer { levelLock.unlock() }
let m = micPeak, s = sysPeak
micPeak *= 0.55; sysPeak *= 0.55
return (m, s)
}
// MARK: - Ingest (ioQueue only)
private func ingestMic(_ buffer: AVAudioPCMBuffer, startHost: Double) {
guard !tornDown, let writer = micWriter, let vad else { return }
let expected = max(0, Int64(((startHost - t0Host) * 16_000).rounded()))
if expected > writer.framesWritten {
let padded = writer.padSilence(expected - writer.framesWritten)
if padded > 0 { vad.feedSilence(padded) }
}
let startIdx = max(0, Int(writer.framesWritten - expected))
if startIdx >= Int(buffer.frameLength) { return }
guard let chunk = Self.trimFront(buffer, by: startIdx) else { return }
updateLevel(chunk, isMic: true)
if writer.write(chunk) > 0 { vad.feed(chunk) }
}
private func ingestSystem(_ buffer: AVAudioPCMBuffer, startHost: Double) {
guard !tornDown, let writer = systemWriter else { return }
let expected = max(0, Int64(((startHost - t0Host) * 16_000).rounded()))
if expected > writer.framesWritten {
writer.padSilence(expected - writer.framesWritten)
}
let startIdx = max(0, Int(writer.framesWritten - expected))
if startIdx >= Int(buffer.frameLength) { return }
guard let chunk = Self.trimFront(buffer, by: startIdx) else { return }
updateLevel(chunk, isMic: false)
writer.write(chunk)
}
// MARK: - Mic (AVAudioEngine)
private func startMic() throws {
let engine = AVAudioEngine()
let input = engine.inputNode
let format = input.inputFormat(forBus: 0)
input.installTap(onBus: 0, bufferSize: 4096, format: format) { [weak self] buffer, when in
guard let self else { return }
let entry = CACurrentMediaTime()
let stamped = when.isHostTimeValid ? AudioRecorder.hostSeconds(when.hostTime) : entry
let startHost = abs(stamped - entry) < 5 ? stamped : entry
guard let raw = AudioRecorder.copy(buffer) else { return }
self.ioQueue.async {
guard !self.tornDown, let resampled = self.micResampler.resample(raw) else { return }
self.ingestMic(resampled, startHost: startHost)
}
}
engine.prepare()
try engine.start()
self.engine = engine
}
// MARK: - System (ScreenCaptureKit)
private func startSystem() async throws {
let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
guard let display = content.displays.first else {
throw NSError(domain: "Ten31", code: 1,
userInfo: [NSLocalizedDescriptionKey: "No display available for system-audio capture."])
}
let filter = SCContentFilter(display: display, excludingWindows: [])
let config = SCStreamConfiguration()
config.capturesAudio = true
config.excludesCurrentProcessAudio = true
config.sampleRate = 48_000
config.channelCount = 2
config.width = 2
config.height = 2
config.minimumFrameInterval = CMTime(value: 1, timescale: 2) // ~2 fps tiny video
config.queueDepth = 6
let stream = SCStream(filter: filter, configuration: config, delegate: self)
try stream.addStreamOutput(self, type: .audio, sampleHandlerQueue: ioQueue)
// Discard-only video consumer keeps SCStream's frame queue drained so the
// stream stays alive; frames are dropped immediately and never stored.
try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: screenQueue)
try await stream.startCapture()
self.stream = stream
}
func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
of type: SCStreamOutputType) {
guard type == .audio else { return } // .screen frames discarded here
guard CMSampleBufferDataIsReady(sampleBuffer),
let pcm = Self.pcmBuffer(from: sampleBuffer),
let resampled = systemResampler.resample(pcm) else { return }
let entry = CACurrentMediaTime()
let pts = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
let stamped = pts.isValid ? pts.seconds : entry
let startHost = abs(stamped - entry) < 5 ? stamped : entry
ingestSystem(resampled, startHost: startHost)
}
func stream(_ stream: SCStream, didStopWithError error: Error) {
levelLock.lock()
streamStopped = true
systemErrorMessage = error.localizedDescription
levelLock.unlock()
}
// MARK: - Helpers
private func updateLevel(_ buffer: AVAudioPCMBuffer, isMic: Bool) {
guard let ch = buffer.floatChannelData?[0] else { return }
var peak: Float = 0
let n = Int(buffer.frameLength)
var i = 0
while i < n { let a = abs(ch[i]); if a > peak { peak = a }; i += 1 }
levelLock.lock()
if isMic { if peak > micPeak { micPeak = peak } }
else { if peak > sysPeak { sysPeak = peak } }
levelLock.unlock()
}
/// Read a levelLock-guarded value.
private func flag<T>(_ body: () -> T) -> T {
levelLock.lock(); defer { levelLock.unlock() }
return body()
}
private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async {
await withTaskGroup(of: Void.self) { group in
group.addTask { try? await stream.stopCapture() }
group.addTask { try? await Task.sleep(nanoseconds: UInt64(seconds * 1_000_000_000)) }
_ = await group.next() // proceed as soon as either finishes
group.cancelAll()
}
}
/// Deep-copy a PCM buffer (the engine reuses the tap buffer). Layout-agnostic.
private static func copy(_ buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
guard buffer.frameLength > 0,
let out = AVAudioPCMBuffer(pcmFormat: buffer.format, frameCapacity: buffer.frameLength)
else { return nil }
out.frameLength = buffer.frameLength
let src = UnsafeMutableAudioBufferListPointer(UnsafeMutablePointer(mutating: buffer.audioBufferList))
let dst = UnsafeMutableAudioBufferListPointer(out.mutableAudioBufferList)
guard src.count == dst.count else { return nil }
for i in 0..<src.count {
guard let s = src[i].mData, let d = dst[i].mData else { return nil }
memcpy(d, s, min(Int(src[i].mDataByteSize), Int(dst[i].mDataByteSize)))
}
return out
}
private static func trimFront(_ buffer: AVAudioPCMBuffer, by frames: Int) -> AVAudioPCMBuffer? {
if frames <= 0 { return buffer }
let total = Int(buffer.frameLength)
guard frames < total, let src = buffer.floatChannelData?[0] else { return nil }
let n = AVAudioFrameCount(total - frames)
guard let out = AVAudioPCMBuffer(pcmFormat: buffer.format, frameCapacity: n),
let dst = out.floatChannelData?[0] else { return nil }
out.frameLength = n
memcpy(dst, src + frames, Int(n) * MemoryLayout<Float>.size)
return out
}
private static func hostSeconds(_ hostTime: UInt64) -> Double {
var info = mach_timebase_info_data_t()
mach_timebase_info(&info)
return Double(hostTime) * Double(info.numer) / Double(info.denom) / 1_000_000_000.0
}
private static func pcmBuffer(from sampleBuffer: CMSampleBuffer) -> AVAudioPCMBuffer? {
guard let fmtDesc = CMSampleBufferGetFormatDescription(sampleBuffer),
let asbdPtr = CMAudioFormatDescriptionGetStreamBasicDescription(fmtDesc) else { return nil }
var asbd = asbdPtr.pointee
guard let format = AVAudioFormat(streamDescription: &asbd) else { return nil }
let frames = AVAudioFrameCount(CMSampleBufferGetNumSamples(sampleBuffer))
guard frames > 0,
let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frames) else { return nil }
buffer.frameLength = frames
let status = CMSampleBufferCopyPCMDataIntoAudioBufferList(
sampleBuffer, at: 0, frameCount: Int32(frames), into: buffer.mutableAudioBufferList)
return status == noErr ? buffer : nil
}
}