Phase 1: dual-track audio capture → mixed-mono 16 kHz WAV + mic VAD

AudioRecorder captures system audio (ScreenCaptureKit) + mic (AVAudioEngine) on a single serial ioQueue, one shared monotonic t0, time-driven writers (pad gaps / trim overlaps) so tracks stay aligned, and an energy mic-VAD for 'self' spans. AudioMixer sums the aligned tracks into mixed_mono_16k.wav. SessionController drives a serialized start/stop state machine, writes the session folder + self_vad.json, exposes live level meters, and finalizes on quit. Hardening from review: ioQueue single-domain (no races), stop() never hangs (mic-first teardown + bounded stopCapture), layout-agnostic mic deep-copy, discard-only video output to keep SCStream alive, VAD lockstep on committed frames, stable signing team in project.yml, single-instance enforcement.
2026-06-05 21:30:11 -05:00
parent b2ae3a62b9
commit fd7e1a5907
12 changed files with 1018 additions and 10 deletions
@@ -0,0 +1,67 @@
+import AVFoundation
+
+/// Sums the two aligned 16 kHz mono tracks (mic + system) into the single
+/// **mixed-mono 16 kHz WAV** that the backend receives. Both inputs are already
+/// front-padded to the shared t0, so frame N of each file is the same instant.
+/// Streamed in 1-second chunks to keep memory flat for long calls.
+enum AudioMixer {
+    /// Writes the sample-wise sum of the two input tracks to `outURL` as a
+    /// 16 kHz mono PCM-16 file, processing at most 16 000 frames per pass.
+    ///
+    /// An input that cannot be opened is treated as absent and contributes
+    /// nothing. The output is as long as the longer input, and each summed
+    /// sample is limited to the range -1...1 before it is written.
+    ///
+    /// - Parameters:
+    ///   - micURL: The mic track to read.
+    ///   - systemURL: The system-audio track to read.
+    ///   - outURL: Where the mixed file is created.
+    /// - Throws: If the output file cannot be created or a chunk cannot be written.
+    static func mix(mic micURL: URL, system systemURL: URL, into outURL: URL) throws {
+        let micFile = try? AVAudioFile(forReading: micURL)
+        let systemFile = try? AVAudioFile(forReading: systemURL)
+
+        let wavSettings: [String: Any] = [
+            AVFormatIDKey: kAudioFormatLinearPCM,
+            AVSampleRateKey: 16_000,
+            AVNumberOfChannelsKey: 1,
+            AVLinearPCMBitDepthKey: 16,
+            AVLinearPCMIsFloatKey: false,
+            AVLinearPCMIsBigEndianKey: false,
+        ]
+        let output = try AVAudioFile(
+            forWriting: outURL,
+            settings: wavSettings,
+            commonFormat: .pcmFormatFloat32,
+            interleaved: false)
+
+        let framesPerPass: AVAudioFramePosition = 16_000
+        let lastFrame = max(micFile?.length ?? 0, systemFile?.length ?? 0)
+        var cursor: AVAudioFramePosition = 0
+
+        while cursor < lastFrame {
+            let count = AVAudioFrameCount(min(framesPerPass, lastFrame - cursor))
+            guard let scratch = AVAudioPCMBuffer(pcmFormat: Resampler.targetFormat, frameCapacity: count),
+                  let samples = scratch.floatChannelData?[0] else { break }
+            scratch.frameLength = count
+            // Start each pass from silence so a missing or shorter track adds nothing.
+            memset(samples, 0, Int(count) * MemoryLayout<Float>.size)
+
+            for source in [micFile, systemFile] {
+                add(file: source, at: cursor, maxFrames: count, into: samples)
+            }
+
+            // Limit the sum to -1...1 before it is converted to 16-bit on write.
+            for index in 0..<Int(count) {
+                let value = samples[index]
+                if value > 1 {
+                    samples[index] = 1
+                } else if value < -1 {
+                    samples[index] = -1
+                }
+            }
+            try output.write(from: scratch)
+            cursor += AVAudioFramePosition(count)
+        }
+    }
+
+    /// Adds up to `maxFrames` frames of `file`, starting at frame `pos`, onto `dst`.
+    ///
+    /// Does nothing when `file` is nil, when `pos` is at or past its end, or
+    /// when the read fails. Only the first channel of the file is used.
+    private static func add(file: AVAudioFile?, at pos: AVAudioFramePosition,
+                            maxFrames: AVAudioFrameCount, into dst: UnsafeMutablePointer<Float>) {
+        guard let file else { return }
+        let available = file.length - pos
+        guard available > 0 else { return }
+        file.framePosition = pos
+        let wanted = AVAudioFrameCount(min(AVAudioFramePosition(maxFrames), available))
+        guard wanted > 0,
+              let staging = AVAudioPCMBuffer(pcmFormat: file.processingFormat, frameCapacity: wanted)
+        else { return }
+        guard (try? file.read(into: staging, frameCount: wanted)) != nil,
+              let source = staging.floatChannelData?[0] else { return }
+        // The read may deliver fewer frames than requested; sum only what arrived.
+        for index in 0..<Int(staging.frameLength) {
+            dst[index] += source[index]
+        }
+    }
+}
@@ -0,0 +1,333 @@
+import AVFoundation
+import ScreenCaptureKit
+import CoreMedia
+import QuartzCore
+
+/// Outcome of one recording session, as returned by `AudioRecorder.stop()`.
+struct RecordingResult {
+    /// Location of the mic track file.
+    let micURL: URL
+    /// Location of the system-audio track file.
+    let systemURL: URL
+    /// Location the mic + system mix is written to. Mixing is best-effort
+    /// (`try?`), so the file may be missing if mixing failed.
+    let mixedURL: URL
+    /// Length in seconds of the longer track (frames written / 16 000).
+    let duration: Double
+    /// Speaking spans reported by the mic VAD for this session.
+    let selfSpans: [VADSpan]
+    /// Wall-clock time (seconds since the Unix epoch) sampled when recording started.
+    let t0Unix: Double
+    /// Non-nil if system-audio capture stopped early (e.g. SCStream error).
+    let systemNote: String?
+}
+
+/// Dual-track local audio capture for Phase 1.
+///
+/// - System audio via `SCStream` (`capturesAudio`); its audio handler runs on
+///   `ioQueue`. A discard-only video output runs on `screenQueue` purely to keep
+///   SCStream's frame pipeline drained (an unconsumed video queue can stall the
+///   whole stream) — frames are dropped instantly, never stored.
+/// - Mic via `AVAudioEngine` input tap: the tap deep-copies the raw buffer and
+///   hands it to `ioQueue`, where it is resampled and written.
+/// - **`ioQueue` is the single isolation domain** for the writers, VAD, both
+///   resamplers, and lifecycle flags.
+/// - One shared monotonic `t0` (`CACurrentMediaTime`). Each buffer is placed at
+///   its true `(startHost − t0)` frame (gaps padded, overlaps trimmed), so mic
+///   and system stay aligned and the mix is a straight sum.
+/// - Live peak levels are exposed via `currentLevels()` for the UI meter.
+/// - `stop()` tears the mic down first and bounds `stopCapture()` with a timeout,
+///   so a wedged stream can never block finalization. No video is written.
+final class AudioRecorder: NSObject, SCStreamDelegate, SCStreamOutput {
+    private let micURL: URL
+    private let systemURL: URL
+    private let mixedURL: URL
+
+    private let ioQueue = DispatchQueue(label: "xyz.ten31.audio.io")
+    private let screenQueue = DispatchQueue(label: "xyz.ten31.audio.screen")
+
+    // ioQueue-only state:
+    private var t0Host: Double = 0
+    private var t0Unix: Double = 0
+    private var micWriter: MonoTrackWriter?
+    private var systemWriter: MonoTrackWriter?
+    private var vad: MicVAD?
+    private var tornDown = true
+    private let micResampler = Resampler()
+    private let systemResampler = Resampler()
+
+    // Cross-thread, guarded by levelLock:
+    private let levelLock = NSLock()
+    private var micPeak: Float = 0
+    private var sysPeak: Float = 0
+    private var streamStopped = false
+    private var systemErrorMessage: String?
+
+    private var engine: AVAudioEngine?
+    private var stream: SCStream?
+
+    /// Stores the three output locations. The track writers are only created
+    /// in `start()`, and the mixed file is only written by `stop()`.
+    ///
+    /// - Parameters:
+    ///   - micURL: Destination of the mic track.
+    ///   - systemURL: Destination of the system-audio track.
+    ///   - mixedURL: Destination of the mic + system mix.
+    init(micURL: URL, systemURL: URL, mixedURL: URL) {
+        self.micURL = micURL
+        self.systemURL = systemURL
+        self.mixedURL = mixedURL
+    }
+
+    // MARK: - Lifecycle
+
+    /// Begins a recording: fixes the shared `t0`, creates both track writers
+    /// and the VAD on `ioQueue`, then starts the mic followed by system capture.
+    ///
+    /// - Throws: Whatever writer creation, `startMic()` or `startSystem()`
+    ///   throws. If either capture source fails to start, `abortStart()` runs
+    ///   before the error is rethrown.
+    func start() async throws {
+        // Sample both clocks before anything else so every later buffer is
+        // measured against the same origin.
+        let t0 = CACurrentMediaTime()
+        let t0u = Date().timeIntervalSince1970
+        try ioQueue.sync {
+            // Create both writers before touching any state, so a failure on
+            // either one leaves the recorder exactly as it was.
+            let mic = try MonoTrackWriter(url: self.micURL)
+            let sys = try MonoTrackWriter(url: self.systemURL)
+            self.t0Host = t0
+            self.t0Unix = t0u
+            self.micWriter = mic
+            self.systemWriter = sys
+            self.vad = MicVAD()
+            self.tornDown = false
+        }
+        do {
+            try startMic()
+            try await startSystem()   // throws if Screen Recording is denied
+        } catch {
+            // Undo whatever did start, then surface the original error.
+            await abortStart()
+            throw error
+        }
+    }
+
+    /// Ends the recording and returns its result.
+    ///
+    /// Order: stop the mic, stop system capture, flush both resamplers and
+    /// close the VAD on `ioQueue`, release the writers, then mix the two
+    /// tracks into `mixedURL`.
+    ///
+    /// - Returns: File locations, the duration of the longer track, the VAD
+    ///   spans, the start wall-clock time, and the system-capture error
+    ///   message if the stream reported one.
+    func stop() async -> RecordingResult {
+        // Stop the mic FIRST — always succeeds and halts mic capture immediately.
+        engine?.inputNode.removeTap(onBus: 0)
+        engine?.stop()
+        engine = nil
+
+        // Stop system capture WITHOUT hanging: an already-errored stream can make
+        // stopCapture() block forever, so skip it if it already stopped and bound
+        // it with a timeout otherwise.
+        if let stream, !flag({ self.streamStopped }) {
+            await Self.stopCaptureWithTimeout(stream, seconds: 3)
+        }
+        stream = nil
+
+        var micFrames: Int64 = 0
+        var sysFrames: Int64 = 0
+        var spans: [VADSpan] = []
+        var t0u: Double = 0
+
+        ioQueue.sync {
+            // Flush each converter's tail. The VAD is fed the mic tail only
+            // if the writer actually committed it.
+            if let tail = micResampler.drain() {
+                if (micWriter?.write(tail) ?? 0) > 0 { vad?.feed(tail) }
+            }
+            if let tail = systemResampler.drain() { systemWriter?.write(tail) }
+            vad?.finish()
+            micFrames = micWriter?.framesWritten ?? 0
+            sysFrames = systemWriter?.framesWritten ?? 0
+            spans = vad?.spans ?? []
+            t0u = t0Unix
+            // From here on, any ingest block still queued behind this one is ignored.
+            tornDown = true
+            micWriter = nil
+            systemWriter = nil
+            vad = nil
+        }
+
+        // Best-effort: a mixing failure is ignored and does not fail stop().
+        try? AudioMixer.mix(mic: micURL, system: systemURL, into: mixedURL)
+
+        let duration = Double(max(micFrames, sysFrames)) / 16_000.0
+        let note = flag { self.systemErrorMessage } as String?
+        return RecordingResult(
+            micURL: micURL, systemURL: systemURL, mixedURL: mixedURL,
+            duration: duration, selfSpans: spans, t0Unix: t0u, systemNote: note)
+    }
+
+    /// Rolls back a failed `start()`: stops the mic engine, stops the stream
+    /// if one was stored, and releases the writers and VAD on `ioQueue`.
+    /// Nothing is drained or mixed.
+    private func abortStart() async {
+        engine?.inputNode.removeTap(onBus: 0)
+        engine?.stop()
+        engine = nil
+        if let stream { await Self.stopCaptureWithTimeout(stream, seconds: 3) }
+        stream = nil
+        ioQueue.sync {
+            // Mark torn down first so ingest blocks queued after this one bail out.
+            tornDown = true
+            micWriter = nil
+            systemWriter = nil
+            vad = nil
+        }
+    }
+
+    /// Returns the peak level recorded for each source since the last poll,
+    /// then scales both stored peaks by 0.55 so a source that stops
+    /// delivering audio fades out over successive polls.
+    func currentLevels() -> (mic: Float, system: Float) {
+        levelLock.lock()
+        let snapshot = (mic: micPeak, system: sysPeak)
+        micPeak *= 0.55
+        sysPeak *= 0.55
+        levelLock.unlock()
+        return snapshot
+    }
+
+    // MARK: - Ingest (ioQueue only)
+
+    /// Places one resampled mic buffer on the `t0` timeline and mirrors every
+    /// committed frame into the VAD.
+    ///
+    /// If the buffer starts after the writer's position, the gap is padded
+    /// with silence. If it starts before, the overlapping leading frames are
+    /// dropped. A buffer stamped before `t0` loses only its pre-`t0` frames
+    /// and is discarded entirely if it ends before `t0`.
+    ///
+    /// - Parameters:
+    ///   - buffer: 16 kHz mono Float32 audio.
+    ///   - startHost: Host-clock time, in seconds, of the buffer's first frame.
+    private func ingestMic(_ buffer: AVAudioPCMBuffer, startHost: Double) {
+        guard !tornDown, let writer = micWriter, let vad else { return }
+        // Signed on purpose. Clamping to 0 would write pre-t0 samples at the
+        // start of the track and push the following audio out of alignment.
+        // A non-finite or unrepresentable offset drops the buffer.
+        guard let expected = Int64(exactly: ((startHost - t0Host) * 16_000).rounded()) else { return }
+        if expected > writer.framesWritten {
+            let padded = writer.padSilence(expected - writer.framesWritten)
+            if padded > 0 { vad.feedSilence(padded) }
+        }
+        // Leading frames that fall before the writer's current position.
+        let overlap = max(0, writer.framesWritten - expected)
+        guard overlap < Int64(buffer.frameLength),
+              let chunk = Self.trimFront(buffer, by: Int(overlap)) else { return }
+        updateLevel(chunk, isMic: true)
+        // Feed the VAD only what reached the file so its cursor matches the track.
+        if writer.write(chunk) > 0 { vad.feed(chunk) }
+    }
+
+    /// Places one resampled system-audio buffer on the `t0` timeline.
+    ///
+    /// If the buffer starts after the writer's position, the gap is padded
+    /// with silence. If it starts before, the overlapping leading frames are
+    /// dropped. A buffer stamped before `t0` loses only its pre-`t0` frames
+    /// and is discarded entirely if it ends before `t0`.
+    ///
+    /// - Parameters:
+    ///   - buffer: 16 kHz mono Float32 audio.
+    ///   - startHost: Host-clock time, in seconds, of the buffer's first frame.
+    private func ingestSystem(_ buffer: AVAudioPCMBuffer, startHost: Double) {
+        guard !tornDown, let writer = systemWriter else { return }
+        // Signed on purpose. Clamping to 0 would write pre-t0 samples at the
+        // start of the track and push the following audio out of alignment.
+        // A non-finite or unrepresentable offset drops the buffer.
+        guard let expected = Int64(exactly: ((startHost - t0Host) * 16_000).rounded()) else { return }
+        if expected > writer.framesWritten {
+            writer.padSilence(expected - writer.framesWritten)
+        }
+        // Leading frames that fall before the writer's current position.
+        let overlap = max(0, writer.framesWritten - expected)
+        guard overlap < Int64(buffer.frameLength),
+              let chunk = Self.trimFront(buffer, by: Int(overlap)) else { return }
+        updateLevel(chunk, isMic: false)
+        writer.write(chunk)
+    }
+
+    // MARK: - Mic (AVAudioEngine)
+
+    /// Starts mic capture with an `AVAudioEngine` input tap.
+    ///
+    /// Each tap callback timestamps the buffer, deep-copies it, and hands the
+    /// copy to `ioQueue`, where it is resampled and ingested.
+    ///
+    /// - Throws: An `NSError` in the `"Ten31"` domain when the input node
+    ///   reports no usable format, or whatever `AVAudioEngine.start()` throws.
+    private func startMic() throws {
+        let engine = AVAudioEngine()
+        let input = engine.inputNode
+        let format = input.inputFormat(forBus: 0)
+        // With no input device the node reports 0 Hz / 0 channels, and
+        // installing a tap with that format raises an Objective-C exception.
+        guard format.sampleRate > 0, format.channelCount > 0 else {
+            throw NSError(domain: "Ten31", code: 2,
+                          userInfo: [NSLocalizedDescriptionKey: "No microphone input available for capture."])
+        }
+
+        input.installTap(onBus: 0, bufferSize: 4096, format: format) { [weak self] buffer, when in
+            guard let self else { return }
+            let entry = CACurrentMediaTime()
+            let stamped = when.isHostTimeValid ? AudioRecorder.hostSeconds(when.hostTime) : entry
+            // Use the buffer's own timestamp only if it is within 5 s of now.
+            let startHost = abs(stamped - entry) < 5 ? stamped : entry
+            // Copy before leaving the callback; the engine reuses the tap buffer.
+            guard let raw = AudioRecorder.copy(buffer) else { return }
+            self.ioQueue.async {
+                guard !self.tornDown, let resampled = self.micResampler.resample(raw) else { return }
+                self.ingestMic(resampled, startHost: startHost)
+            }
+        }
+        engine.prepare()
+        try engine.start()
+        // Store the engine only after it started, so a failed start leaves nothing to tear down.
+        self.engine = engine
+    }
+
+    // MARK: - System (ScreenCaptureKit)
+
+    /// Starts system-audio capture on the first available display.
+    ///
+    /// Audio samples are delivered on `ioQueue`; a second output of type
+    /// `.screen` is registered on `screenQueue`.
+    ///
+    /// - Throws: An `NSError` in the `"Ten31"` domain when no display is
+    ///   available, or whatever the shareable-content query,
+    ///   `addStreamOutput`, or `startCapture()` throws.
+    private func startSystem() async throws {
+        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
+        guard let display = content.displays.first else {
+            throw NSError(domain: "Ten31", code: 1,
+                          userInfo: [NSLocalizedDescriptionKey: "No display available for system-audio capture."])
+        }
+        let filter = SCContentFilter(display: display, excludingWindows: [])
+        let config = SCStreamConfiguration()
+        // Audio: 48 kHz stereo, with this process's own audio left out.
+        config.capturesAudio = true
+        config.excludesCurrentProcessAudio = true
+        config.sampleRate = 48_000
+        config.channelCount = 2
+        // Video: a 2x2 frame at a low rate, since the frames are never used.
+        config.width = 2
+        config.height = 2
+        config.minimumFrameInterval = CMTime(value: 1, timescale: 2)   // ~2 fps tiny video
+        config.queueDepth = 6
+
+        let stream = SCStream(filter: filter, configuration: config, delegate: self)
+        try stream.addStreamOutput(self, type: .audio, sampleHandlerQueue: ioQueue)
+        // Discard-only video consumer keeps SCStream's frame queue drained so the
+        // stream stays alive; frames are dropped immediately and never stored.
+        try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: screenQueue)
+        try await stream.startCapture()
+        // Store the stream only after capture has started.
+        self.stream = stream
+    }
+
+    /// `SCStreamOutput` callback. Non-audio samples are ignored. An audio
+    /// sample is converted to a PCM buffer, resampled, timestamped, and ingested.
+    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
+                of type: SCStreamOutputType) {
+        if type != .audio { return }   // .screen frames discarded here
+        guard CMSampleBufferDataIsReady(sampleBuffer) else { return }
+        guard let pcm = Self.pcmBuffer(from: sampleBuffer) else { return }
+        guard let converted = systemResampler.resample(pcm) else { return }
+
+        let now = CACurrentMediaTime()
+        let presentation = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
+        let candidate = presentation.isValid ? presentation.seconds : now
+        // Use the sample's own timestamp only if it is within 5 s of now.
+        let startHost = abs(candidate - now) < 5 ? candidate : now
+        ingestSystem(converted, startHost: startHost)
+    }
+
+    /// `SCStreamDelegate` callback. Records, under `levelLock`, that the
+    /// stream has stopped and keeps the error's description for the result.
+    func stream(_ stream: SCStream, didStopWithError error: Error) {
+        let message = error.localizedDescription
+        levelLock.lock()
+        defer { levelLock.unlock() }
+        systemErrorMessage = message
+        streamStopped = true
+    }
+
+    // MARK: - Helpers
+
+    /// Raises the stored peak for one source to the largest absolute sample
+    /// in `buffer`'s first channel, if that exceeds the current value.
+    ///
+    /// - Parameters:
+    ///   - buffer: Float32 audio; ignored if it exposes no float channel data.
+    ///   - isMic: `true` updates the mic peak, `false` the system peak.
+    private func updateLevel(_ buffer: AVAudioPCMBuffer, isMic: Bool) {
+        guard let samples = buffer.floatChannelData?[0] else { return }
+        var loudest: Float = 0
+        for index in 0..<Int(buffer.frameLength) {
+            let magnitude = abs(samples[index])
+            if magnitude > loudest { loudest = magnitude }
+        }
+        levelLock.lock()
+        defer { levelLock.unlock() }
+        if isMic {
+            if loudest > micPeak { micPeak = loudest }
+        } else {
+            if loudest > sysPeak { sysPeak = loudest }
+        }
+    }
+
+    /// Runs `body` while holding `levelLock` and returns its result.
+    private func flag<T>(_ body: () -> T) -> T {
+        levelLock.lock()
+        let value = body()
+        levelLock.unlock()
+        return value
+    }
+
+    /// Asks `stream` to stop and returns once it has stopped or `seconds`
+    /// have passed, whichever comes first.
+    ///
+    /// The two racers are unstructured tasks on purpose. A task group does
+    /// not return until all of its children have finished, and cancellation
+    /// is only cooperative, so a `stopCapture()` that never completes would
+    /// keep the caller waiting past the timeout. Here a wedged
+    /// `stopCapture()` is simply left behind.
+    ///
+    /// - Parameters:
+    ///   - stream: The stream to stop.
+    ///   - seconds: Longest time to wait for the stop to complete.
+    private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async {
+        /// Resumes the waiting caller exactly once, whichever racer arrives first.
+        final class Gate: @unchecked Sendable {
+            private let lock = NSLock()
+            private var continuation: CheckedContinuation<Void, Never>?
+
+            init(_ continuation: CheckedContinuation<Void, Never>) {
+                self.continuation = continuation
+            }
+
+            func open() {
+                lock.lock()
+                let pending = continuation
+                continuation = nil
+                lock.unlock()
+                pending?.resume()
+            }
+        }
+
+        let timeout = UInt64(max(0, seconds) * 1_000_000_000)
+        await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
+            let gate = Gate(continuation)
+            let timer = Task {
+                try? await Task.sleep(nanoseconds: timeout)
+                gate.open()
+            }
+            Task {
+                try? await stream.stopCapture()
+                // Stopped in time: release the timer early instead of letting it run out.
+                timer.cancel()
+                gate.open()
+            }
+        }
+    }
+
+    /// Returns an independent copy of `buffer` (the engine reuses the tap
+    /// buffer). Every entry of the audio buffer list is copied byte-for-byte,
+    /// so the sample layout does not matter.
+    ///
+    /// - Returns: The copy, or nil if `buffer` is empty, allocation fails, or
+    ///   the two buffer lists do not line up.
+    private static func copy(_ buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
+        let frames = buffer.frameLength
+        if frames == 0 { return nil }
+        guard let clone = AVAudioPCMBuffer(pcmFormat: buffer.format, frameCapacity: frames) else {
+            return nil
+        }
+        clone.frameLength = frames
+
+        let sources = UnsafeMutableAudioBufferListPointer(UnsafeMutablePointer(mutating: buffer.audioBufferList))
+        let targets = UnsafeMutableAudioBufferListPointer(clone.mutableAudioBufferList)
+        guard sources.count == targets.count else { return nil }
+
+        for (from, to) in zip(sources, targets) {
+            guard let input = from.mData, let output = to.mData else { return nil }
+            // Never copy past the end of either side.
+            memcpy(output, input, Int(min(from.mDataByteSize, to.mDataByteSize)))
+        }
+        return clone
+    }
+
+    /// Returns `buffer` without its first `frames` frames.
+    ///
+    /// Only the first float channel is carried over.
+    ///
+    /// - Returns: `buffer` itself when `frames` is zero or negative, a new
+    ///   shorter buffer otherwise, or nil when nothing would remain or
+    ///   allocation fails.
+    private static func trimFront(_ buffer: AVAudioPCMBuffer, by frames: Int) -> AVAudioPCMBuffer? {
+        guard frames > 0 else { return buffer }
+        let kept = Int(buffer.frameLength) - frames
+        guard kept > 0, let source = buffer.floatChannelData?[0] else { return nil }
+        let keptFrames = AVAudioFrameCount(kept)
+        guard let tail = AVAudioPCMBuffer(pcmFormat: buffer.format, frameCapacity: keptFrames),
+              let destination = tail.floatChannelData?[0] else { return nil }
+        tail.frameLength = keptFrames
+        memcpy(destination, source + frames, kept * MemoryLayout<Float>.size)
+        return tail
+    }
+
+    /// Converts a host-clock tick count to seconds.
+    ///
+    /// Uses the framework conversion rather than re-deriving the timebase
+    /// ratio on every tap callback.
+    ///
+    /// - Parameter hostTime: Host time in ticks, as carried by `AVAudioTime.hostTime`.
+    /// - Returns: The same instant in seconds.
+    private static func hostSeconds(_ hostTime: UInt64) -> Double {
+        AVAudioTime.seconds(forHostTime: hostTime)
+    }
+
+    /// Copies the audio in `sampleBuffer` into a new `AVAudioPCMBuffer` that
+    /// uses the sample's own stream format.
+    ///
+    /// - Returns: The buffer, or nil if the format cannot be read, the sample
+    ///   holds no frames, allocation fails, or the copy reports an error.
+    private static func pcmBuffer(from sampleBuffer: CMSampleBuffer) -> AVAudioPCMBuffer? {
+        guard let description = CMSampleBufferGetFormatDescription(sampleBuffer),
+              let streamDescription = CMAudioFormatDescriptionGetStreamBasicDescription(description),
+              let format = AVAudioFormat(streamDescription: streamDescription) else { return nil }
+
+        let frameCount = AVAudioFrameCount(CMSampleBufferGetNumSamples(sampleBuffer))
+        guard frameCount > 0,
+              let pcm = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else { return nil }
+        pcm.frameLength = frameCount
+
+        let status = CMSampleBufferCopyPCMDataIntoAudioBufferList(
+            sampleBuffer, at: 0, frameCount: Int32(frameCount), into: pcm.mutableAudioBufferList)
+        guard status == noErr else { return nil }
+        return pcm
+    }
+}
@@ -0,0 +1,98 @@
+import AVFoundation
+
+/// A speaking span on the session `t0` timeline (seconds).
+struct VADSpan: Equatable {
+    let start: Double
+    let end: Double
+    let confidence: Double
+}
+
+/// Lightweight energy-based voice-activity detector for the **mic** track (the
+/// user). It is fed the *exact same* 16 kHz mono stream the mic WAV receives —
+/// real samples via `feed` and timeline-gap silence via `feedSilence` — so its
+/// internal sample cursor always equals the mic file position, and span times
+/// land on the same instants as `mixed_mono_16k.wav`.
+///
+/// Phase 3's `TimelineBuilder` will fold these in as high-confidence pre-seeded
+/// "self" segments. Thresholds are intentionally simple and will be tuned later.
+///
+/// Single-threaded: all calls happen on `AudioRecorder.ioQueue`.
+final class MicVAD {
+    private let frameSize = 320            // 20 ms @ 16 kHz
+    private let openFrames = 2             // ~40 ms above threshold to open
+    private let closeFrames = 10           // ~200 ms hangover to close
+    private let absoluteFloor: Float = 0.006
+    private let floorMultiplier: Float = 2.5
+    private let silenceChunk = 16_000      // largest run of zeros buffered at once (1 s)
+
+    private var cursorSamples = 0           // total samples evaluated so far, in whole frames
+    private var noiseFloor: Float = 0.01
+    private var voicedRun = 0
+    private var silentRun = 0
+    private var inSpeech = false
+    private var spanStartSample = 0
+    private var acc: [Float] = []           // samples waiting to fill the next frame
+    private(set) var spans: [VADSpan] = []
+
+    /// Appends the first channel of `buffer` and evaluates every complete
+    /// frame now available. Empty buffers and buffers without float channel
+    /// data are ignored.
+    func feed(_ buffer: AVAudioPCMBuffer) {
+        guard let ch = buffer.floatChannelData, buffer.frameLength > 0 else { return }
+        acc.append(contentsOf: UnsafeBufferPointer(start: ch[0], count: Int(buffer.frameLength)))
+        drainFrames()
+    }
+
+    /// Advances the detector over `count` samples of silence.
+    ///
+    /// The zeros are fed in bounded chunks so a long timeline gap never needs
+    /// one gap-sized array. The frames evaluated are the same as if the
+    /// whole run had been appended at once.
+    func feedSilence(_ count: Int64) {
+        guard count > 0 else { return }
+        var remaining = count
+        while remaining > 0 {
+            let n = Int(min(remaining, Int64(silenceChunk)))
+            acc.append(contentsOf: repeatElement(0, count: n))
+            drainFrames()
+            remaining -= Int64(n)
+        }
+    }
+
+    /// Close any span still open at end of capture.
+    ///
+    /// The span ends where the last voiced frame ended: trailing silent
+    /// frames that had not yet reached the close threshold are left out,
+    /// matching how a span ends when it closes during capture.
+    func finish() {
+        guard inSpeech else { return }
+        inSpeech = false
+        appendSpan(startSample: spanStartSample,
+                   endSample: cursorSamples - silentRun * frameSize)
+    }
+
+    /// Evaluates every complete frame in `acc`, leaving any remainder
+    /// shorter than one frame for the next call.
+    private func drainFrames() {
+        var i = 0
+        while i + frameSize <= acc.count {
+            var sum: Float = 0
+            var j = i
+            while j < i + frameSize { sum += acc[j] * acc[j]; j += 1 }
+            // step() runs before the cursor moves, so it sees the frame's start sample.
+            step(rms: (sum / Float(frameSize)).squareRoot())
+            cursorSamples += frameSize
+            i += frameSize
+        }
+        if i > 0 { acc.removeFirst(i) }
+    }
+
+    /// Updates the noise floor and the open/close state for one frame.
+    /// `cursorSamples` is the start sample of the frame being evaluated.
+    private func step(rms: Float) {
+        // The floor follows quieter frames quickly and louder frames slowly.
+        if rms < noiseFloor { noiseFloor = 0.9 * noiseFloor + 0.1 * rms }
+        else { noiseFloor = 0.995 * noiseFloor + 0.005 * rms }
+
+        let threshold = max(absoluteFloor, noiseFloor * floorMultiplier)
+        let voiced = rms > threshold
+
+        if voiced {
+            voicedRun += 1; silentRun = 0
+            if !inSpeech && voicedRun >= openFrames {
+                inSpeech = true
+                // Back-date the start to the first frame of this voiced run.
+                spanStartSample = cursorSamples - (voicedRun - 1) * frameSize
+            }
+        } else {
+            silentRun += 1; voicedRun = 0
+            if inSpeech && silentRun >= closeFrames {
+                inSpeech = false
+                // End at the first silent frame of the run, not at the hangover's end.
+                appendSpan(startSample: spanStartSample,
+                           endSample: cursorSamples - (closeFrames - 1) * frameSize)
+            }
+        }
+    }
+
+    /// Records a span, converting sample positions to seconds at 16 kHz.
+    /// Spans that would be empty or negative are dropped.
+    private func appendSpan(startSample: Int, endSample: Int) {
+        let start = Double(max(0, startSample)) / 16_000.0
+        let end = Double(endSample) / 16_000.0
+        if end > start { spans.append(VADSpan(start: start, end: end, confidence: 0.9)) }
+    }
+}
@@ -0,0 +1,67 @@
+import AVFoundation
+
+/// Sequential **16 kHz mono PCM-16 WAV** writer. Deliberately "dumb": it only
+/// appends buffers and silence and tracks `framesWritten`. Time alignment to the
+/// shared `t0` is done by the caller (`AudioRecorder`), which pads/trims using
+/// each buffer's true host time so the mic and system tracks stay anchored to
+/// the same timeline even if buffers are dropped or the hardware clocks drift.
+///
+/// Single-threaded: all calls happen on `AudioRecorder.ioQueue`.
+final class MonoTrackWriter {
+    private let file: AVAudioFile
+    /// Total frames committed to the file so far (audio and silence).
+    private(set) var framesWritten: Int64 = 0
+
+    /// Creates the track file at `url`.
+    ///
+    /// - Throws: If the file cannot be created.
+    init(url: URL) throws {
+        // Stored as Int16 PCM; buffers handed to `write` are Float32 (matches Resampler).
+        let wavSettings: [String: Any] = [
+            AVFormatIDKey: kAudioFormatLinearPCM,
+            AVSampleRateKey: 16_000,
+            AVNumberOfChannelsKey: 1,
+            AVLinearPCMBitDepthKey: 16,
+            AVLinearPCMIsFloatKey: false,
+            AVLinearPCMIsBigEndianKey: false,
+        ]
+        self.file = try AVAudioFile(
+            forWriting: url,
+            settings: wavSettings,
+            commonFormat: .pcmFormatFloat32,
+            interleaved: false)
+    }
+
+    /// Appends `buffer` to the file.
+    ///
+    /// - Returns: The number of frames committed, or 0 if the buffer is empty
+    ///   or the write failed. A failed write drops the buffer rather than
+    ///   ending the recording.
+    @discardableResult
+    func write(_ buffer: AVAudioPCMBuffer) -> Int64 {
+        let frames = Int64(buffer.frameLength)
+        guard frames > 0, (try? file.write(from: buffer)) != nil else { return 0 }
+        framesWritten += frames
+        return frames
+    }
+
+    /// Appends `count` frames of silence, at most one second per write.
+    ///
+    /// - Returns: The number of frames committed, which is less than `count`
+    ///   if an allocation or a write fails part-way.
+    @discardableResult
+    func padSilence(_ count: Int64) -> Int64 {
+        if count <= 0 { return 0 }
+        let framesPerWrite: Int64 = 16_000
+        var outstanding = count
+        var done: Int64 = 0
+        while outstanding > 0 {
+            let slice = AVAudioFrameCount(min(framesPerWrite, outstanding))
+            guard let silence = AVAudioPCMBuffer(pcmFormat: Resampler.targetFormat, frameCapacity: slice) else {
+                break
+            }
+            silence.frameLength = slice
+            if let channels = silence.floatChannelData {
+                memset(channels[0], 0, Int(slice) * MemoryLayout<Float>.size)
+            }
+            guard write(silence) != 0 else { break }
+            done += Int64(slice)
+            outstanding -= Int64(slice)
+        }
+        return done
+    }
+}
@@ -0,0 +1,65 @@
+import AVFoundation
+
+/// Converts arbitrary input PCM buffers to **16 kHz mono Float32**, maintaining
+/// resampler state across calls. Reuse one instance per source stream so the
+/// internal sample-rate converter stays continuous across buffers.
+///
+/// Not thread-safe: use one instance from a single thread. Both the mic and
+/// system instances are driven exclusively from `AudioRecorder.ioQueue` (one per
+/// source stream), kept continuous across buffers.
+final class Resampler {
+    /// The canonical Phase-1 audio format: 16 kHz, mono, Float32, deinterleaved.
+    static let targetFormat = AVAudioFormat(
+        commonFormat: .pcmFormatFloat32,
+        sampleRate: 16_000,
+        channels: 1,
+        interleaved: false)!
+
+    private var converter: AVAudioConverter?   // created on first use, rebuilt if the input format changes
+    private var sourceFormat: AVAudioFormat?   // format the current converter was built for
+    private var ended = false                  // set by drain(); nothing is converted afterwards
+
+    /// Converts `input` to the target format.
+    ///
+    /// - Returns: The converted buffer, or nil if the resampler has been
+    ///   drained, `input` is empty, the converter cannot be created, or the
+    ///   conversion failed or produced no frames.
+    func resample(_ input: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
+        if ended || input.frameLength == 0 { return nil }
+
+        let inputFormat = input.format
+        if converter == nil || sourceFormat != inputFormat {
+            converter = AVAudioConverter(from: inputFormat, to: Self.targetFormat)
+            sourceFormat = inputFormat
+        }
+        guard let activeConverter = converter else { return nil }
+
+        // Room for the rate-scaled frame count plus a 64-frame margin.
+        let scale = Self.targetFormat.sampleRate / inputFormat.sampleRate
+        let estimated = (Double(input.frameLength) * scale).rounded(.up)
+        guard let result = AVAudioPCMBuffer(pcmFormat: Self.targetFormat,
+                                            frameCapacity: AVAudioFrameCount(estimated) + 64) else {
+            return nil
+        }
+
+        // Hand `input` over once, then report that no more data is available for now.
+        var handedOver = false
+        var conversionError: NSError?
+        let outcome = activeConverter.convert(to: result, error: &conversionError) { _, inputStatus in
+            guard !handedOver else {
+                inputStatus.pointee = .noDataNow
+                return nil
+            }
+            handedOver = true
+            inputStatus.pointee = .haveData
+            return input
+        }
+        guard outcome != .error, result.frameLength > 0 else { return nil }
+        return result
+    }
+
+    /// Flush the converter's internal tail at end of stream (call once on stop).
+    ///
+    /// - Returns: The remaining frames, or nil if there are none, no
+    ///   converter was ever created, or this has already been called.
+    func drain() -> AVAudioPCMBuffer? {
+        let alreadyEnded = ended
+        ended = true
+        guard !alreadyEnded, let activeConverter = converter else { return nil }
+        guard let tail = AVAudioPCMBuffer(pcmFormat: Self.targetFormat, frameCapacity: 8192) else {
+            return nil
+        }
+        var conversionError: NSError?
+        let outcome = activeConverter.convert(to: tail, error: &conversionError) { _, inputStatus in
+            inputStatus.pointee = .endOfStream
+            return nil
+        }
+        guard outcome != .error, tail.frameLength > 0 else { return nil }
+        return tail
+    }
+}