ten31-transcripts/Ten31Transcripts/Audio/AudioMixer.swift

import AVFoundation

/// Sums the two aligned 16 kHz mono tracks (mic + system) into the single
/// **mixed-mono 16 kHz WAV** that the backend receives. Both inputs are already
/// front-padded to the shared t0, so frame N of each file is the same instant.
/// Streamed in 1-second chunks to keep memory flat for long calls.
enum AudioMixer {
    /// Errors raised by the mixer itself. Errors from opening the inputs or
    /// creating/writing the output are rethrown unchanged.
    enum MixError: Error {
        /// The scratch buffer holding one output chunk could not be created.
        case bufferAllocationFailed
    }

    /// Sample rate of the output file in Hz; also the number of frames in one
    /// 1-second streaming chunk.
    private static let sampleRate: Double = 16_000

    /// Mixes the mic and system tracks frame-by-frame into a 16-bit PCM file.
    ///
    /// A track that cannot be opened is treated as silence, so a recording with
    /// only one usable track still produces output. The result is as long as the
    /// longer input, and every summed sample is clamped to `[-1, 1]`.
    ///
    /// - Parameters:
    ///   - micURL: Location of the microphone track.
    ///   - systemURL: Location of the system-audio track.
    ///   - outURL: Destination of the mixed file.
    /// - Throws: The mic track's open error when *neither* input can be opened
    ///   (raised before the output file is created), any error from creating or
    ///   writing the output file, or `MixError.bufferAllocationFailed`.
    static func mix(mic micURL: URL, system systemURL: URL, into outURL: URL) throws {
        // One missing track is tolerated; two missing tracks means there is
        // nothing to mix, so fail instead of emitting an empty file.
        let sys = try? AVAudioFile(forReading: systemURL)
        let mic: AVAudioFile?
        do {
            mic = try AVAudioFile(forReading: micURL)
        } catch {
            guard sys != nil else { throw error }
            mic = nil
        }

        let settings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: sampleRate,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsBigEndianKey: false,
        ]
        let out = try AVAudioFile(
            forWriting: outURL,
            settings: settings,
            commonFormat: .pcmFormatFloat32,
            interleaved: false)

        // Buffers handed to `write(from:)` must be in the file's processing
        // format, so take the format from the output file itself.
        let chunk = AVAudioFrameCount(sampleRate)
        guard let mixBuf = AVAudioPCMBuffer(pcmFormat: out.processingFormat, frameCapacity: chunk),
              let channel = mixBuf.floatChannelData?[0]
        else { throw MixError.bufferAllocationFailed }

        let total = max(mic?.length ?? 0, sys?.length ?? 0)
        var pos: AVAudioFramePosition = 0

        while pos < total {
            let frames = AVAudioFrameCount(min(AVAudioFramePosition(chunk), total - pos))
            mixBuf.frameLength = frames

            // The buffer is reused across chunks; clear the part in use.
            let samples = UnsafeMutableBufferPointer(start: channel, count: Int(frames))
            samples.initialize(repeating: 0)

            add(file: mic, at: pos, into: samples)
            add(file: sys, at: pos, into: samples)

            // Summing two tracks can leave the [-1, 1] range; hard-clip it.
            for i in samples.indices {
                samples[i] = min(max(samples[i], -1), 1)
            }
            try out.write(from: mixBuf)
            pos += AVAudioFramePosition(frames)
        }
    }

    /// Adds up to `dst.count` frames of `file`, starting at frame `pos`, onto `dst`.
    ///
    /// Only the first channel of the file is used. Best-effort by design: a
    /// missing file, a position at or past the end of the file, or a failed
    /// read leaves `dst` untouched, i.e. the track is silent for this chunk.
    private static func add(file: AVAudioFile?, at pos: AVAudioFramePosition,
                            into dst: UnsafeMutableBufferPointer<Float>) {
        guard let file, pos < file.length else { return }
        let toRead = AVAudioFrameCount(min(AVAudioFramePosition(dst.count), file.length - pos))
        guard toRead > 0,
              let buf = AVAudioPCMBuffer(pcmFormat: file.processingFormat, frameCapacity: toRead)
        else { return }

        file.framePosition = pos
        do {
            try file.read(into: buf, frameCount: toRead)
        } catch {
            // Deliberately non-fatal: keep mixing the other track.
            return
        }

        guard let src = buf.floatChannelData?[0] else { return }
        // A read may deliver fewer frames than requested; add only what arrived.
        let count = min(Int(buf.frameLength), dst.count)
        for i in 0..<count {
            dst[i] += src[i]
        }
    }
}