Files
Grant Gilliam 35ba6ecf05 Drop unused AppleEvents usage string; de-stale Phase-N comments
The NSAppleEventsUsageDescription usage string was dead — the app has no AppleEvents/AppleScript code path (Meet detection reads window titles), so the permission prompt never fired; remove it. Rephrase the leftover "Phase N" build-plan references in source comments (one of which falsely claimed "no audio, capture, or call detection yet"), and complete the AGENTS.md Audio/Detection layout listings.
2026-06-16 22:15:44 -05:00

99 lines
3.6 KiB
Swift

import AVFoundation
/// A speaking span on the session `t0` timeline (seconds).
struct VADSpan: Equatable {
    /// Span boundaries, in seconds on the session `t0` timeline.
    /// `start` is where speech begins; `end` is where it stops.
    let start, end: Double

    /// Confidence score attached by whoever produced the span.
    let confidence: Double
}
/// Lightweight energy-based voice-activity detector for the **mic** track (the
/// user). It is fed the *exact same* 16 kHz mono stream the mic WAV receives
/// (real samples via `feed`, and timeline-gap silence via `feedSilence`), so its
/// internal sample cursor always equals the mic file position, and span times
/// land on the same instants as `mixed_mono_16k.wav`.
///
/// `TimelineBuilder` folds these in as high-confidence pre-seeded "self"
/// segments. Thresholds are intentionally simple.
///
/// Single-threaded: all calls happen on `AudioRecorder.ioQueue`.
final class MicVAD {
    // MARK: - Tuning

    /// Sample rate of the stream being analysed, in Hz. Used to turn sample
    /// offsets into seconds.
    private let sampleRate: Double = 16_000
    /// Samples per analysis frame (20 ms @ 16 kHz).
    private let frameSize = 320
    /// Consecutive voiced frames required to open a span (~40 ms).
    private let openFrames = 2
    /// Consecutive unvoiced frames required to close a span (~200 ms hangover).
    private let closeFrames = 10
    /// The voicing threshold never drops below this RMS level.
    private let absoluteFloor: Float = 0.006
    /// The voicing threshold is the tracked noise floor times this factor.
    private let floorMultiplier: Float = 2.5
    /// Confidence stamped on every emitted span.
    private let spanConfidence = 0.9

    // MARK: - State

    /// Samples consumed so far in whole frames. Samples still waiting in `acc`
    /// are not counted, so the number of samples fed is
    /// `cursorSamples + acc.count`.
    private var cursorSamples = 0
    /// Running estimate of the background RMS level.
    private var noiseFloor: Float = 0.01
    /// Length of the current run of voiced frames.
    private var voicedRun = 0
    /// Length of the current run of unvoiced frames.
    private var silentRun = 0
    /// Whether a span is currently open.
    private var inSpeech = false
    /// Sample offset at which the currently open span began.
    private var spanStartSample = 0
    /// Samples received but not yet evaluated; always shorter than one frame
    /// once `drainFrames()` has returned.
    private var acc: [Float] = []

    /// Spans closed so far, in the order they were closed.
    private(set) var spans: [VADSpan] = []

    // MARK: - Feeding

    /// Queues channel 0 of `buffer` and evaluates every complete frame.
    ///
    /// Buffers without float channel data, or with a zero frame length, are
    /// ignored.
    func feed(_ buffer: AVAudioPCMBuffer) {
        guard let ch = buffer.floatChannelData, buffer.frameLength > 0 else { return }
        acc.append(contentsOf: UnsafeBufferPointer(start: ch[0], count: Int(buffer.frameLength)))
        drainFrames()
    }

    /// Advances the detector by `count` zero-valued samples.
    ///
    /// The outcome is identical to appending `count` zeros and draining, but
    /// whole silent frames are stepped directly so a long gap never has to be
    /// materialised as an array. Non-positive counts are ignored.
    func feedSilence(_ count: Int64) {
        guard count > 0 else { return }
        var remaining = Int(count)

        // Complete a partially buffered frame first so frame boundaries fall
        // exactly where they would if the zeros had simply been appended.
        if !acc.isEmpty {
            let pad = min(remaining, frameSize - acc.count)
            acc.append(contentsOf: repeatElement(0, count: pad))
            remaining -= pad
            drainFrames()
        }

        // `acc` is empty here whenever `remaining > 0`. An all-zero frame has
        // an RMS of exactly 0, so it can be stepped without buffering.
        for _ in 0 ..< remaining / frameSize {
            step(rms: 0)
            cursorSamples += frameSize
        }

        // Keep the sub-frame tail so it joins whatever is fed next.
        acc.append(contentsOf: repeatElement(0, count: remaining % frameSize))
    }

    /// Close any span still open at end of capture.
    ///
    /// The span ends at the end of the last voiced frame, matching the normal
    /// close path in `step(rms:)`, so trailing hangover silence is not counted
    /// as speech. The run counters are reset so that feeding again afterwards
    /// cannot reopen a span that starts inside the one just emitted.
    func finish() {
        guard inSpeech else { return }
        inSpeech = false
        appendSpan(startSample: spanStartSample,
                   endSample: cursorSamples - silentRun * frameSize)
        voicedRun = 0
        silentRun = 0
    }

    // MARK: - Internals

    /// Evaluates every complete frame buffered in `acc`, then drops the
    /// consumed samples. A trailing partial frame stays buffered.
    private func drainFrames() {
        var consumed = 0
        while consumed + frameSize <= acc.count {
            var energy: Float = 0
            for sample in acc[consumed ..< consumed + frameSize] {
                energy += sample * sample
            }
            step(rms: (energy / Float(frameSize)).squareRoot())
            cursorSamples += frameSize
            consumed += frameSize
        }
        if consumed > 0 { acc.removeFirst(consumed) }
    }

    /// Advances the open/close state machine by one frame.
    ///
    /// `cursorSamples` is the start sample of the frame being evaluated.
    ///
    /// - Parameter rms: Root-mean-square level of the frame.
    private func step(rms: Float) {
        // A NaN or infinite level must not reach the noise-floor average: once
        // in, it never decays back out, and the adaptive threshold is lost (or
        // becomes infinite) for the rest of the session.
        let usable = rms.isFinite
        if usable {
            // Follow drops in level quickly and rises slowly.
            if rms < noiseFloor {
                noiseFloor = 0.9 * noiseFloor + 0.1 * rms
            } else {
                noiseFloor = 0.995 * noiseFloor + 0.005 * rms
            }
        }

        let threshold = max(absoluteFloor, noiseFloor * floorMultiplier)
        // Non-finite frames count as unvoiced.
        let voiced = usable && rms > threshold

        if voiced {
            voicedRun += 1
            silentRun = 0
            if !inSpeech && voicedRun >= openFrames {
                inSpeech = true
                // Back-date the start to the first frame of the voiced run.
                spanStartSample = cursorSamples - (voicedRun - 1) * frameSize
            }
        } else {
            silentRun += 1
            voicedRun = 0
            if inSpeech && silentRun >= closeFrames {
                inSpeech = false
                // Rewind over the hangover: the span ends where the silent run began.
                appendSpan(startSample: spanStartSample,
                           endSample: cursorSamples - (closeFrames - 1) * frameSize)
            }
        }
    }

    /// Converts a sample range to seconds and records it as a span.
    ///
    /// A negative start is clamped to zero; empty or inverted ranges are
    /// dropped.
    private func appendSpan(startSample: Int, endSample: Int) {
        let start = Double(max(0, startSample)) / sampleRate
        let end = Double(endSample) / sampleRate
        guard end > start else { return }
        spans.append(VADSpan(start: start, end: end, confidence: spanConfidence))
    }
}