import AVFoundation

/// A speaking span on the session `t0` timeline (seconds).
struct VADSpan: Equatable {
    /// Span start, in seconds.
    let start: Double
    /// Span end, in seconds. Always greater than `start` for spans produced by `MicVAD`.
    let end: Double
    /// Detector confidence for this span.
    let confidence: Double
}

/// Lightweight energy-based voice-activity detector for the **mic** track (the
/// user). It is fed the *exact same* 16 kHz mono stream the mic WAV receives —
/// real samples via `feed` and timeline-gap silence via `feedSilence` — so the
/// number of samples it has received always equals the mic file position, and
/// span times land on the same instants as `mixed_mono_16k.wav`.
///
/// Audio is scored in fixed 20 ms frames. A span opens after `openFrames`
/// consecutive voiced frames (back-dated to the first of them) and closes after
/// `closeFrames` consecutive silent frames (back-dated to the first of them), so
/// the hangover itself is never part of a span.
///
/// Phase 3's `TimelineBuilder` will fold these in as high-confidence pre-seeded
/// "self" segments. Thresholds are intentionally simple and will be tuned later.
///
/// Single-threaded: all calls happen on `AudioRecorder.ioQueue`.
final class MicVAD {
    /// Sample rate of the incoming stream; converts sample offsets to seconds.
    private let sampleRate: Double = 16_000
    private let frameSize = 320           // 20 ms @ 16 kHz
    private let openFrames = 2            // ~40 ms above threshold to open
    private let closeFrames = 10          // ~200 ms hangover to close
    private let absoluteFloor: Float = 0.006
    private let floorMultiplier: Float = 2.5

    /// Samples consumed in fully evaluated frames. Samples still waiting in
    /// `acc` (always fewer than `frameSize`) are not counted yet, so the total
    /// fed so far is `cursorSamples + acc.count`.
    private var cursorSamples = 0
    /// Running estimate of the background RMS level.
    private var noiseFloor: Float = 0.01
    /// Consecutive voiced frames ending at the most recent frame.
    private var voicedRun = 0
    /// Consecutive silent frames ending at the most recent frame.
    private var silentRun = 0
    private var inSpeech = false
    private var spanStartSample = 0
    /// Samples received but not yet grouped into a full frame.
    private var acc: [Float] = []

    /// Spans detected so far, in chronological order.
    private(set) var spans: [VADSpan] = []

    /// Feeds real mic samples. Only channel 0 is read.
    ///
    /// Buffers without float channel data, or with zero frames, are ignored.
    /// NOTE(review): a non-float buffer would be skipped silently and the
    /// detector would fall behind the mic file — confirm callers always pass
    /// float PCM.
    func feed(_ buffer: AVAudioPCMBuffer) {
        guard let channels = buffer.floatChannelData, buffer.frameLength > 0 else { return }
        let samples = UnsafeBufferPointer(start: channels[0], count: Int(buffer.frameLength))
        acc.append(contentsOf: samples)
        drainFrames()
    }

    /// Feeds `count` samples of digital silence (a timeline gap).
    ///
    /// Produces exactly the same frame decisions as appending `count` zeros to
    /// the stream, but never buffers more than one frame, so a long gap does not
    /// cause a large transient allocation. Non-positive counts are ignored.
    func feedSilence(_ count: Int64) {
        guard count > 0 else { return }
        var remaining = Int(count)

        // A partially filled frame still holds real samples: pad it with zeros
        // and score it normally so its energy is not lost.
        if !acc.isEmpty {
            let padding = min(remaining, frameSize - acc.count)
            acc.append(contentsOf: repeatElement(0, count: padding))
            remaining -= padding
            drainFrames()
        }

        // Whole frames of zeros have an RMS of exactly 0; score them directly
        // instead of materialising the samples.
        let silentFrames = remaining / frameSize
        for _ in 0..<silentFrames {
            step(rms: 0)
            cursorSamples += frameSize
        }
        remaining -= silentFrames * frameSize

        // Keep the sub-frame tail so the next feed stays sample-aligned.
        if remaining > 0 {
            acc.append(contentsOf: repeatElement(0, count: remaining))
        }
    }

    /// Closes any span still open at end of capture.
    ///
    /// If capture stops during the hangover, the trailing silent frames are
    /// trimmed so the span ends where a normal close would have ended it: at the
    /// end of the last voiced frame. Samples still buffered in a partial frame
    /// were never evaluated and are not included.
    func finish() {
        guard inSpeech else { return }
        inSpeech = false
        let endSample = cursorSamples - silentRun * frameSize
        appendSpan(startSample: spanStartSample, endSample: endSample)
    }

    /// Scores every complete frame currently buffered and drops it from `acc`,
    /// leaving fewer than `frameSize` samples behind.
    private func drainFrames() {
        var consumed = 0
        while consumed + frameSize <= acc.count {
            let frame = acc[consumed..<(consumed + frameSize)]
            let energy = frame.reduce(Float(0)) { $0 + $1 * $1 }
            // `step` must run before the cursor advances: it treats
            // `cursorSamples` as the start of the frame being scored.
            step(rms: (energy / Float(frameSize)).squareRoot())
            cursorSamples += frameSize
            consumed += frameSize
        }
        if consumed > 0 { acc.removeFirst(consumed) }
    }

    /// Updates the noise floor and the open/close state machine for one frame.
    ///
    /// `cursorSamples` is the start sample of the frame being evaluated.
    ///
    /// - Parameter rms: Root-mean-square level of the frame.
    private func step(rms: Float) {
        // Track the floor quickly downward and only slowly upward, so speech
        // raises it far more slowly than quiet passages lower it.
        if rms < noiseFloor {
            noiseFloor = 0.9 * noiseFloor + 0.1 * rms
        } else {
            noiseFloor = 0.995 * noiseFloor + 0.005 * rms
        }

        let threshold = max(absoluteFloor, noiseFloor * floorMultiplier)
        if rms > threshold {
            voicedRun += 1
            silentRun = 0
            if !inSpeech && voicedRun >= openFrames {
                inSpeech = true
                // Back-date the start to the first frame of the voiced run.
                spanStartSample = cursorSamples - (voicedRun - 1) * frameSize
            }
        } else {
            silentRun += 1
            voicedRun = 0
            if inSpeech && silentRun >= closeFrames {
                inSpeech = false
                // Back-date the end to the start of the first silent frame.
                let endSample = cursorSamples - (closeFrames - 1) * frameSize
                appendSpan(startSample: spanStartSample, endSample: endSample)
            }
        }
    }

    /// Converts a sample range to seconds and records it; empty or inverted
    /// ranges are dropped. A negative start is clamped to zero.
    private func appendSpan(startSample: Int, endSample: Int) {
        let start = Double(max(0, startSample)) / sampleRate
        let end = Double(endSample) / sampleRate
        guard end > start else { return }
        spans.append(VADSpan(start: start, end: end, confidence: 0.9))
    }
}