// ten31-transcripts/Ten31Transcripts/Audio/MicVAD.swift

import AVFoundation

/// One detected stretch of speech, expressed in seconds on the session `t0`
/// timeline.
struct VADSpan {
    /// Time at which the span begins, in seconds.
    let start: Double
    /// Time at which the span ends, in seconds.
    let end: Double
    /// Confidence score attached to the span by whoever produced it.
    let confidence: Double
}

// MARK: - Equatable

/// Two spans are equal when `start`, `end` and `confidence` all match
/// (compiler-synthesized member-wise comparison).
extension VADSpan: Equatable {}

/// Lightweight energy-based voice-activity detector for the **mic** track (the
/// user). It is fed the *exact same* 16 kHz mono stream the mic WAV receives —
/// real samples via `feed` and timeline-gap silence via `feedSilence` — so the
/// number of samples it has received always equals the mic file position, and
/// span times land on the same instants as `mixed_mono_16k.wav`.
///
/// Detection runs on fixed 20 ms frames. Each frame's RMS is compared against
/// an adaptive threshold (`max(absoluteFloor, noiseFloor * floorMultiplier)`).
/// A span opens after `openFrames` consecutive voiced frames and closes after
/// `closeFrames` consecutive silent ones; the reported span runs from the
/// first voiced frame to the start of the first silent frame.
///
/// Phase 3's `TimelineBuilder` will fold these in as high-confidence pre-seeded
/// "self" segments. Thresholds are intentionally simple and will be tuned later.
///
/// Single-threaded: all calls happen on `AudioRecorder.ioQueue`.
final class MicVAD {
    // MARK: - Tuning constants

    private let sampleRate: Double = 16_000 // Hz; converts sample offsets to seconds
    private let frameSize = 320             // 20 ms @ 16 kHz
    private let openFrames = 2              // ~40 ms above threshold to open
    private let closeFrames = 10            // ~200 ms hangover to close
    private let absoluteFloor: Float = 0.006
    private let floorMultiplier: Float = 2.5

    // MARK: - Detector state

    /// Samples consumed so far as whole frames. Samples still waiting in `acc`
    /// (always fewer than `frameSize`) are *not* counted, so the total number
    /// of samples received is `cursorSamples + acc.count`.
    private var cursorSamples = 0
    private var noiseFloor: Float = 0.01    // running estimate of background RMS
    private var voicedRun = 0               // consecutive voiced frames
    private var silentRun = 0               // consecutive silent frames
    private var inSpeech = false
    private var spanStartSample = 0         // start of the currently open span
    private var acc: [Float] = []           // partial frame carried between calls

    /// Spans closed so far, in the order they were detected.
    private(set) var spans: [VADSpan] = []

    // MARK: - Input

    /// Feeds real mic samples (channel 0 of `buffer`) into the detector.
    ///
    /// Buffers that expose no float channel data, or that are empty, are
    /// ignored and do not advance the detector.
    func feed(_ buffer: AVAudioPCMBuffer) {
        guard let ch = buffer.floatChannelData, buffer.frameLength > 0 else { return }
        acc.append(contentsOf: UnsafeBufferPointer(start: ch[0], count: Int(buffer.frameLength)))
        drainFrames()
    }

    /// Feeds `count` samples of digital silence (zeros) into the detector.
    ///
    /// Equivalent to feeding a zero-filled buffer of that length, but whole
    /// silent frames are evaluated directly instead of being materialised, so
    /// memory use stays bounded by one frame however long the gap is.
    /// Non-positive counts are ignored.
    func feedSilence(_ count: Int64) {
        guard count > 0 else { return }
        var remaining = count

        // Complete the partially filled frame first so frame boundaries stay
        // exactly where they would be had the zeros been appended wholesale.
        if !acc.isEmpty {
            let fill = Int(min(remaining, Int64(frameSize - acc.count)))
            acc.append(contentsOf: repeatElement(0, count: fill))
            remaining -= Int64(fill)
            drainFrames()
        }

        // `acc` is now empty (or `remaining` is 0). An all-zero frame has an
        // RMS of exactly 0, so step the detector without allocating samples.
        let wholeFrame = Int64(frameSize)
        while remaining >= wholeFrame {
            step(rms: 0)
            cursorSamples += frameSize
            remaining -= wholeFrame
        }

        // Keep the sub-frame tail so the next feed continues from it.
        if remaining > 0 {
            acc.append(contentsOf: repeatElement(0, count: Int(remaining)))
        }
    }

    /// Close any span still open at end of capture.
    ///
    /// The span ends after the last voiced frame: silent frames that were
    /// still inside the hangover window are trimmed, matching where the span
    /// would have ended had the hangover run to completion. Samples left in a
    /// trailing partial frame (< 20 ms) are never evaluated.
    func finish() {
        guard inSpeech else { return }
        inSpeech = false
        // Here `cursorSamples` is the end of the last processed frame, and the
        // last `silentRun` frames were all silent.
        appendSpan(startSample: spanStartSample,
                   endSample: cursorSamples - silentRun * frameSize)
    }

    // MARK: - Frame processing

    /// Evaluates every complete frame in `acc`, advancing `cursorSamples` by
    /// one frame each time, then drops the consumed samples so only a partial
    /// frame (if any) remains buffered.
    private func drainFrames() {
        var consumed = 0
        while acc.count - consumed >= frameSize {
            let energy = acc[consumed ..< consumed + frameSize]
                .reduce(Float(0)) { $0 + $1 * $1 }
            step(rms: (energy / Float(frameSize)).squareRoot())
            cursorSamples += frameSize
            consumed += frameSize
        }
        if consumed > 0 { acc.removeFirst(consumed) }
    }

    /// Advances the detector by one frame with the given RMS level.
    ///
    /// `cursorSamples` is the start sample of the frame being evaluated; the
    /// caller advances it afterwards.
    private func step(rms: Float) {
        // Asymmetric smoothing: follow drops in level quickly, rises slowly.
        if rms < noiseFloor { noiseFloor = 0.9 * noiseFloor + 0.1 * rms }
        else { noiseFloor = 0.995 * noiseFloor + 0.005 * rms }

        let threshold = max(absoluteFloor, noiseFloor * floorMultiplier)
        let voiced = rms > threshold

        if voiced {
            voicedRun += 1; silentRun = 0
            if !inSpeech && voicedRun >= openFrames {
                inSpeech = true
                // Back-date the start to the first frame of this voiced run.
                spanStartSample = cursorSamples - (voicedRun - 1) * frameSize
            }
        } else {
            silentRun += 1; voicedRun = 0
            if inSpeech && silentRun >= closeFrames {
                inSpeech = false
                // End at the start of the first silent frame of the hangover.
                appendSpan(startSample: spanStartSample,
                           endSample: cursorSamples - (closeFrames - 1) * frameSize)
            }
        }
    }

    /// Converts a sample range to seconds and records it as a span with a
    /// fixed confidence of 0.9. Empty or inverted ranges are dropped, and a
    /// negative start is clamped to 0.
    private func appendSpan(startSample: Int, endSample: Int) {
        let start = Double(max(0, startSample)) / sampleRate
        let end = Double(endSample) / sampleRate
        if end > start { spans.append(VADSpan(start: start, end: end, confidence: 0.9)) }
    }
}