Files
ten31-transcripts/Ten31Transcripts/Audio/AudioRecorder.swift
T
Grant Gilliam fd7e1a5907 Phase 1: dual-track audio capture → mixed-mono 16 kHz WAV + mic VAD
AudioRecorder captures system audio (ScreenCaptureKit) + mic (AVAudioEngine) on a
single serial ioQueue, one shared monotonic t0, time-driven writers (pad gaps /
trim overlaps) so tracks stay aligned, and an energy mic-VAD for 'self' spans.
AudioMixer sums the aligned tracks into mixed_mono_16k.wav. SessionController
drives a serialized start/stop state machine, writes the session folder +
self_vad.json, exposes live level meters, and finalizes on quit.

Hardening from review: ioQueue single-domain (no races), stop() never hangs
(mic-first teardown + bounded stopCapture), layout-agnostic mic deep-copy,
discard-only video output to keep SCStream alive, VAD lockstep on committed
frames, stable signing team in project.yml, single-instance enforcement.
2026-06-05 21:30:11 -05:00

334 lines
14 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import AVFoundation
import ScreenCaptureKit
import CoreMedia
import QuartzCore
struct RecordingResult {
let micURL: URL
let systemURL: URL
let mixedURL: URL
let duration: Double
let selfSpans: [VADSpan]
let t0Unix: Double
/// Non-nil if system-audio capture stopped early (e.g. SCStream error).
let systemNote: String?
}
/// Dual-track local audio capture for Phase 1.
///
/// - System audio via `SCStream` (`capturesAudio`); its audio handler runs on
/// `ioQueue`. A discard-only video output runs on `screenQueue` purely to keep
/// SCStream's frame pipeline drained (an unconsumed video queue can stall the
/// whole stream) frames are dropped instantly, never stored.
/// - Mic via `AVAudioEngine` input tap: the tap deep-copies the raw buffer and
/// hands it to `ioQueue`, where it is resampled and written.
/// - **`ioQueue` is the single isolation domain** for the writers, VAD, both
/// resamplers, and lifecycle flags.
/// - One shared monotonic `t0` (`CACurrentMediaTime`). Each buffer is placed at
/// its true `(startHost t0)` frame (gaps padded, overlaps trimmed), so mic
/// and system stay aligned and the mix is a straight sum.
/// - Live peak levels are exposed via `currentLevels()` for the UI meter.
/// - `stop()` tears the mic down first and bounds `stopCapture()` with a timeout,
/// so a wedged stream can never block finalization. No video is written.
final class AudioRecorder: NSObject, SCStreamDelegate, SCStreamOutput {
private let micURL: URL
private let systemURL: URL
private let mixedURL: URL
private let ioQueue = DispatchQueue(label: "xyz.ten31.audio.io")
private let screenQueue = DispatchQueue(label: "xyz.ten31.audio.screen")
// ioQueue-only state:
private var t0Host: Double = 0
private var t0Unix: Double = 0
private var micWriter: MonoTrackWriter?
private var systemWriter: MonoTrackWriter?
private var vad: MicVAD?
private var tornDown = true
private let micResampler = Resampler()
private let systemResampler = Resampler()
// Cross-thread, guarded by levelLock:
private let levelLock = NSLock()
private var micPeak: Float = 0
private var sysPeak: Float = 0
private var streamStopped = false
private var systemErrorMessage: String?
private var engine: AVAudioEngine?
private var stream: SCStream?
init(micURL: URL, systemURL: URL, mixedURL: URL) {
self.micURL = micURL
self.systemURL = systemURL
self.mixedURL = mixedURL
}
// MARK: - Lifecycle
func start() async throws {
let t0 = CACurrentMediaTime()
let t0u = Date().timeIntervalSince1970
try ioQueue.sync {
let mic = try MonoTrackWriter(url: self.micURL)
let sys = try MonoTrackWriter(url: self.systemURL)
self.t0Host = t0
self.t0Unix = t0u
self.micWriter = mic
self.systemWriter = sys
self.vad = MicVAD()
self.tornDown = false
}
do {
try startMic()
try await startSystem() // throws if Screen Recording is denied
} catch {
await abortStart()
throw error
}
}
func stop() async -> RecordingResult {
// Stop the mic FIRST always succeeds and halts mic capture immediately.
engine?.inputNode.removeTap(onBus: 0)
engine?.stop()
engine = nil
// Stop system capture WITHOUT hanging: an already-errored stream can make
// stopCapture() block forever, so skip it if it already stopped and bound
// it with a timeout otherwise.
if let stream, !flag({ self.streamStopped }) {
await Self.stopCaptureWithTimeout(stream, seconds: 3)
}
stream = nil
var micFrames: Int64 = 0
var sysFrames: Int64 = 0
var spans: [VADSpan] = []
var t0u: Double = 0
ioQueue.sync {
if let tail = micResampler.drain() {
if (micWriter?.write(tail) ?? 0) > 0 { vad?.feed(tail) }
}
if let tail = systemResampler.drain() { systemWriter?.write(tail) }
vad?.finish()
micFrames = micWriter?.framesWritten ?? 0
sysFrames = systemWriter?.framesWritten ?? 0
spans = vad?.spans ?? []
t0u = t0Unix
tornDown = true
micWriter = nil
systemWriter = nil
vad = nil
}
try? AudioMixer.mix(mic: micURL, system: systemURL, into: mixedURL)
let duration = Double(max(micFrames, sysFrames)) / 16_000.0
let note = flag { self.systemErrorMessage } as String?
return RecordingResult(
micURL: micURL, systemURL: systemURL, mixedURL: mixedURL,
duration: duration, selfSpans: spans, t0Unix: t0u, systemNote: note)
}
private func abortStart() async {
engine?.inputNode.removeTap(onBus: 0)
engine?.stop()
engine = nil
if let stream { await Self.stopCaptureWithTimeout(stream, seconds: 3) }
stream = nil
ioQueue.sync {
tornDown = true
micWriter = nil
systemWriter = nil
vad = nil
}
}
/// Latest peak levels (01) for each source; decays so a stalled source fades.
func currentLevels() -> (mic: Float, system: Float) {
levelLock.lock(); defer { levelLock.unlock() }
let m = micPeak, s = sysPeak
micPeak *= 0.55; sysPeak *= 0.55
return (m, s)
}
// MARK: - Ingest (ioQueue only)
private func ingestMic(_ buffer: AVAudioPCMBuffer, startHost: Double) {
guard !tornDown, let writer = micWriter, let vad else { return }
let expected = max(0, Int64(((startHost - t0Host) * 16_000).rounded()))
if expected > writer.framesWritten {
let padded = writer.padSilence(expected - writer.framesWritten)
if padded > 0 { vad.feedSilence(padded) }
}
let startIdx = max(0, Int(writer.framesWritten - expected))
if startIdx >= Int(buffer.frameLength) { return }
guard let chunk = Self.trimFront(buffer, by: startIdx) else { return }
updateLevel(chunk, isMic: true)
if writer.write(chunk) > 0 { vad.feed(chunk) }
}
private func ingestSystem(_ buffer: AVAudioPCMBuffer, startHost: Double) {
guard !tornDown, let writer = systemWriter else { return }
let expected = max(0, Int64(((startHost - t0Host) * 16_000).rounded()))
if expected > writer.framesWritten {
writer.padSilence(expected - writer.framesWritten)
}
let startIdx = max(0, Int(writer.framesWritten - expected))
if startIdx >= Int(buffer.frameLength) { return }
guard let chunk = Self.trimFront(buffer, by: startIdx) else { return }
updateLevel(chunk, isMic: false)
writer.write(chunk)
}
// MARK: - Mic (AVAudioEngine)
private func startMic() throws {
let engine = AVAudioEngine()
let input = engine.inputNode
let format = input.inputFormat(forBus: 0)
input.installTap(onBus: 0, bufferSize: 4096, format: format) { [weak self] buffer, when in
guard let self else { return }
let entry = CACurrentMediaTime()
let stamped = when.isHostTimeValid ? AudioRecorder.hostSeconds(when.hostTime) : entry
let startHost = abs(stamped - entry) < 5 ? stamped : entry
guard let raw = AudioRecorder.copy(buffer) else { return }
self.ioQueue.async {
guard !self.tornDown, let resampled = self.micResampler.resample(raw) else { return }
self.ingestMic(resampled, startHost: startHost)
}
}
engine.prepare()
try engine.start()
self.engine = engine
}
// MARK: - System (ScreenCaptureKit)
private func startSystem() async throws {
let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
guard let display = content.displays.first else {
throw NSError(domain: "Ten31", code: 1,
userInfo: [NSLocalizedDescriptionKey: "No display available for system-audio capture."])
}
let filter = SCContentFilter(display: display, excludingWindows: [])
let config = SCStreamConfiguration()
config.capturesAudio = true
config.excludesCurrentProcessAudio = true
config.sampleRate = 48_000
config.channelCount = 2
config.width = 2
config.height = 2
config.minimumFrameInterval = CMTime(value: 1, timescale: 2) // ~2 fps tiny video
config.queueDepth = 6
let stream = SCStream(filter: filter, configuration: config, delegate: self)
try stream.addStreamOutput(self, type: .audio, sampleHandlerQueue: ioQueue)
// Discard-only video consumer keeps SCStream's frame queue drained so the
// stream stays alive; frames are dropped immediately and never stored.
try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: screenQueue)
try await stream.startCapture()
self.stream = stream
}
func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
of type: SCStreamOutputType) {
guard type == .audio else { return } // .screen frames discarded here
guard CMSampleBufferDataIsReady(sampleBuffer),
let pcm = Self.pcmBuffer(from: sampleBuffer),
let resampled = systemResampler.resample(pcm) else { return }
let entry = CACurrentMediaTime()
let pts = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
let stamped = pts.isValid ? pts.seconds : entry
let startHost = abs(stamped - entry) < 5 ? stamped : entry
ingestSystem(resampled, startHost: startHost)
}
func stream(_ stream: SCStream, didStopWithError error: Error) {
levelLock.lock()
streamStopped = true
systemErrorMessage = error.localizedDescription
levelLock.unlock()
}
// MARK: - Helpers
private func updateLevel(_ buffer: AVAudioPCMBuffer, isMic: Bool) {
guard let ch = buffer.floatChannelData?[0] else { return }
var peak: Float = 0
let n = Int(buffer.frameLength)
var i = 0
while i < n { let a = abs(ch[i]); if a > peak { peak = a }; i += 1 }
levelLock.lock()
if isMic { if peak > micPeak { micPeak = peak } }
else { if peak > sysPeak { sysPeak = peak } }
levelLock.unlock()
}
/// Read a levelLock-guarded value.
private func flag<T>(_ body: () -> T) -> T {
levelLock.lock(); defer { levelLock.unlock() }
return body()
}
private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async {
await withTaskGroup(of: Void.self) { group in
group.addTask { try? await stream.stopCapture() }
group.addTask { try? await Task.sleep(nanoseconds: UInt64(seconds * 1_000_000_000)) }
_ = await group.next() // proceed as soon as either finishes
group.cancelAll()
}
}
/// Deep-copy a PCM buffer (the engine reuses the tap buffer). Layout-agnostic.
private static func copy(_ buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
guard buffer.frameLength > 0,
let out = AVAudioPCMBuffer(pcmFormat: buffer.format, frameCapacity: buffer.frameLength)
else { return nil }
out.frameLength = buffer.frameLength
let src = UnsafeMutableAudioBufferListPointer(UnsafeMutablePointer(mutating: buffer.audioBufferList))
let dst = UnsafeMutableAudioBufferListPointer(out.mutableAudioBufferList)
guard src.count == dst.count else { return nil }
for i in 0..<src.count {
guard let s = src[i].mData, let d = dst[i].mData else { return nil }
memcpy(d, s, min(Int(src[i].mDataByteSize), Int(dst[i].mDataByteSize)))
}
return out
}
private static func trimFront(_ buffer: AVAudioPCMBuffer, by frames: Int) -> AVAudioPCMBuffer? {
if frames <= 0 { return buffer }
let total = Int(buffer.frameLength)
guard frames < total, let src = buffer.floatChannelData?[0] else { return nil }
let n = AVAudioFrameCount(total - frames)
guard let out = AVAudioPCMBuffer(pcmFormat: buffer.format, frameCapacity: n),
let dst = out.floatChannelData?[0] else { return nil }
out.frameLength = n
memcpy(dst, src + frames, Int(n) * MemoryLayout<Float>.size)
return out
}
private static func hostSeconds(_ hostTime: UInt64) -> Double {
var info = mach_timebase_info_data_t()
mach_timebase_info(&info)
return Double(hostTime) * Double(info.numer) / Double(info.denom) / 1_000_000_000.0
}
private static func pcmBuffer(from sampleBuffer: CMSampleBuffer) -> AVAudioPCMBuffer? {
guard let fmtDesc = CMSampleBufferGetFormatDescription(sampleBuffer),
let asbdPtr = CMAudioFormatDescriptionGetStreamBasicDescription(fmtDesc) else { return nil }
var asbd = asbdPtr.pointee
guard let format = AVAudioFormat(streamDescription: &asbd) else { return nil }
let frames = AVAudioFrameCount(CMSampleBufferGetNumSamples(sampleBuffer))
guard frames > 0,
let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frames) else { return nil }
buffer.frameLength = frames
let status = CMSampleBufferCopyPCMDataIntoAudioBufferList(
sampleBuffer, at: 0, frameCount: Int32(frames), into: buffer.mutableAudioBufferList)
return status == noErr ? buffer : nil
}
}