Phase 1: dual-track audio capture → mixed-mono 16 kHz WAV + mic VAD
AudioRecorder captures system audio (ScreenCaptureKit) + mic (AVAudioEngine) on a single serial ioQueue, one shared monotonic t0, time-driven writers (pad gaps / trim overlaps) so tracks stay aligned, and an energy mic-VAD for 'self' spans. AudioMixer sums the aligned tracks into mixed_mono_16k.wav. SessionController drives a serialized start/stop state machine, writes the session folder + self_vad.json, exposes live level meters, and finalizes on quit. Hardening from review: ioQueue single-domain (no races), stop() never hangs (mic-first teardown + bounded stopCapture), layout-agnostic mic deep-copy, discard-only video output to keep SCStream alive, VAD lockstep on committed frames, stable signing team in project.yml, single-instance enforcement.
This commit is contained in:
@@ -0,0 +1,67 @@
|
||||
import AVFoundation
|
||||
|
||||
/// Sums the two aligned 16 kHz mono tracks (mic + system) into the single
/// **mixed-mono 16 kHz WAV** that the backend receives. Both inputs are already
/// front-padded to the shared t0, so frame N of each file is the same instant.
/// Streamed in 1-second chunks to keep memory flat for long calls.
enum AudioMixer {
    /// Failures raised by the mixer itself. Errors from creating or writing the
    /// output file are rethrown unchanged from `AVAudioFile`.
    enum MixError: Error {
        /// The reusable float mix buffer could not be created in the target format.
        case bufferAllocationFailed
    }

    /// Frames mixed per iteration: one second at the 16 kHz output rate.
    private static let chunkFrames: AVAudioFrameCount = 16_000

    /// Mixes the mic and system tracks sample-by-sample into `outURL`.
    ///
    /// A track that cannot be opened is treated as silence, so a single
    /// surviving track still produces a mix. If neither track can be opened the
    /// output file is still created, but no frames are written to it.
    /// The summed signal is hard-clipped to `[-1, 1]` before being written.
    ///
    /// - Parameters:
    ///   - micURL: Microphone track to read.
    ///   - systemURL: System-audio track to read.
    ///   - outURL: Destination of the 16-bit, 16 kHz, mono linear-PCM file.
    /// - Throws: `MixError.bufferAllocationFailed` if the mix buffer cannot be
    ///   created, or any error thrown while creating or writing the output file.
    static func mix(mic micURL: URL, system systemURL: URL, into outURL: URL) throws {
        // Deliberately optional: a missing/unreadable track contributes silence.
        let mic = try? AVAudioFile(forReading: micURL)
        let sys = try? AVAudioFile(forReading: systemURL)

        // Allocate the mix buffer once and reuse it for every chunk. Failing here
        // throws instead of silently ending the loop with a truncated file, and it
        // happens before the output file is created so no stub file is left behind.
        guard let mixBuf = AVAudioPCMBuffer(pcmFormat: Resampler.targetFormat,
                                            frameCapacity: chunkFrames),
              let dst = mixBuf.floatChannelData?[0] else {
            throw MixError.bufferAllocationFailed
        }

        let settings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: 16_000,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsBigEndianKey: false,
        ]
        // On disk: 16-bit integer PCM. In memory: non-interleaved Float32, which
        // is the representation the mix buffer is filled in.
        let out = try AVAudioFile(
            forWriting: outURL,
            settings: settings,
            commonFormat: .pcmFormatFloat32,
            interleaved: false)

        // The mix is as long as the longer track; the shorter one simply stops
        // contributing once it runs out of frames.
        let total = max(mic?.length ?? 0, sys?.length ?? 0)
        var pos: AVAudioFramePosition = 0

        while pos < total {
            let frames = AVAudioFrameCount(min(AVAudioFramePosition(chunkFrames), total - pos))
            let count = Int(frames)

            // Start each chunk from silence; `add` accumulates into it.
            mixBuf.frameLength = frames
            memset(dst, 0, count * MemoryLayout<Float>.size)

            add(file: mic, at: pos, maxFrames: frames, into: dst)
            add(file: sys, at: pos, maxFrames: frames, into: dst)

            // Hard-clip the sum so it stays inside the representable range.
            for i in 0..<count {
                if dst[i] > 1 { dst[i] = 1 } else if dst[i] < -1 { dst[i] = -1 }
            }

            try out.write(from: mixBuf)
            pos += AVAudioFramePosition(frames)
        }
    }

    /// Accumulates up to `maxFrames` frames of `file`, starting at frame `pos`,
    /// into `dst`.
    ///
    /// Does nothing when `file` is `nil` or `pos` is at/after its end. Only
    /// channel 0 is read and no sample-rate conversion is performed.
    /// NOTE(review): this relies on the inputs already being 16 kHz mono, as the
    /// type-level comment states — confirm against the writers.
    ///
    /// Read failures are best-effort: the chunk is left as silence for this track
    /// and the failure is logged rather than aborting the whole mix.
    ///
    /// - Parameters:
    ///   - file: Track to read from, or `nil` if it could not be opened.
    ///   - pos: Frame position to start reading at.
    ///   - maxFrames: Upper bound on frames to add; `dst` must hold at least this many.
    ///   - dst: Destination samples that the track is summed into.
    private static func add(file: AVAudioFile?, at pos: AVAudioFramePosition,
                            maxFrames: AVAudioFrameCount, into dst: UnsafeMutablePointer<Float>) {
        guard let file, pos < file.length else { return }

        let toRead = AVAudioFrameCount(min(AVAudioFramePosition(maxFrames), file.length - pos))
        guard toRead > 0,
              let buf = AVAudioPCMBuffer(pcmFormat: file.processingFormat, frameCapacity: toRead)
        else { return }

        file.framePosition = pos
        do {
            try file.read(into: buf, frameCount: toRead)
        } catch {
            // Keep mixing with whatever is readable, but leave a trace of the gap.
            NSLog("AudioMixer: read failed at frame %lld, treating chunk as silence: %@",
                  pos, error as NSError)
            return
        }

        guard let src = buf.floatChannelData?[0] else { return }
        // `frameLength` is what was actually read, which may be less than `toRead`.
        for i in 0..<Int(buf.frameLength) {
            dst[i] += src[i]
        }
    }
}
|
||||
Reference in New Issue
Block a user