Phase 1: dual-track audio capture → mixed-mono 16 kHz WAV + mic VAD
AudioRecorder captures system audio (ScreenCaptureKit) + mic (AVAudioEngine) on a single serial ioQueue, one shared monotonic t0, time-driven writers (pad gaps / trim overlaps) so tracks stay aligned, and an energy mic-VAD for 'self' spans. AudioMixer sums the aligned tracks into mixed_mono_16k.wav. SessionController drives a serialized start/stop state machine, writes the session folder + self_vad.json, exposes live level meters, and finalizes on quit. Hardening from review: ioQueue single-domain (no races), stop() never hangs (mic-first teardown + bounded stopCapture), layout-agnostic mic deep-copy, discard-only video output to keep SCStream alive, VAD lockstep on committed frames, stable signing team in project.yml, single-instance enforcement.
This commit is contained in:
@@ -0,0 +1,98 @@
|
||||
import AVFoundation
|
||||
|
||||
/// A speaking span on the session `t0` timeline.
///
/// Both bounds are expressed in seconds. `Equatable` conformance is
/// synthesized, so two spans are equal when all three fields match.
struct VADSpan: Equatable {
    /// Span start, in seconds on the session `t0` timeline.
    let start: Double

    /// Span end, in seconds on the session `t0` timeline.
    let end: Double

    /// Detector confidence for this span (`MicVAD` emits a fixed 0.9).
    let confidence: Double
}
|
||||
|
||||
/// Lightweight energy-based voice-activity detector for the **mic** track (the
/// user). It is fed the *exact same* 16 kHz mono stream the mic WAV receives —
/// real samples via `feed` and timeline-gap silence via `feedSilence` — so its
/// sample cursor tracks the mic file position (in whole 20 ms frames), and span
/// times land on the same instants as `mixed_mono_16k.wav`.
///
/// Phase 3's `TimelineBuilder` will fold these in as high-confidence pre-seeded
/// "self" segments. Thresholds are intentionally simple and will be tuned later.
///
/// Not thread-safe. Single-threaded by design: all calls happen on
/// `AudioRecorder.ioQueue`.
final class MicVAD {
    // MARK: - Tuning

    private let sampleRate: Double = 16_000  // Hz; converts sample indices to seconds
    private let frameSize = 320              // 20 ms @ 16 kHz
    private let openFrames = 2               // ~40 ms above threshold to open
    private let closeFrames = 10             // ~200 ms hangover to close
    private let absoluteFloor: Float = 0.006 // threshold never drops below this RMS
    private let floorMultiplier: Float = 2.5 // threshold = noise floor × this

    // MARK: - State

    /// Samples consumed so far in whole frames. While `step` runs, this is the
    /// start sample of the frame being evaluated.
    private var cursorSamples = 0
    /// Adaptive noise-floor estimate (RMS); updated on every frame.
    private var noiseFloor: Float = 0.01
    /// Consecutive voiced frames ending at the current frame.
    private var voicedRun = 0
    /// Consecutive unvoiced frames ending at the current frame.
    private var silentRun = 0
    private var inSpeech = false
    private var spanStartSample = 0
    /// Samples received but not yet evaluated; always shorter than `frameSize`
    /// between calls.
    private var acc: [Float] = []

    /// Completed speaking spans, in the order they were closed.
    private(set) var spans: [VADSpan] = []

    // MARK: - Input

    /// Feeds real mic samples to the detector.
    ///
    /// Only channel 0 is read. Buffers with no float channel data or with zero
    /// frames are ignored.
    func feed(_ buffer: AVAudioPCMBuffer) {
        guard let ch = buffer.floatChannelData, buffer.frameLength > 0 else { return }
        acc.append(contentsOf: UnsafeBufferPointer(start: ch[0], count: Int(buffer.frameLength)))
        drainFrames()
    }

    /// Feeds `count` zero-valued samples (a timeline gap) to the detector.
    ///
    /// Produces exactly the same detector state as appending `count` zeros to
    /// the stream, but whole silent frames are stepped directly instead of
    /// being materialized, so memory use does not grow with the gap length.
    func feedSilence(_ count: Int64) {
        guard count > 0 else { return }
        var remaining = Int(count)

        // Top up a partially filled frame first so frame boundaries stay
        // exactly where they would be for a contiguous stream.
        if !acc.isEmpty {
            let fill = min(remaining, frameSize - acc.count)
            acc.append(contentsOf: repeatElement(0, count: fill))
            remaining -= fill
            drainFrames()
        }

        // An all-zero frame has an RMS of exactly 0.
        while remaining >= frameSize {
            commitFrame(rms: 0)
            remaining -= frameSize
        }

        // Keep the tail (shorter than one frame) for the next call.
        if remaining > 0 {
            acc.append(contentsOf: repeatElement(0, count: remaining))
        }
    }

    /// Close any span still open at end of capture.
    ///
    /// If the detector is inside the close hangover, the trailing silent frames
    /// are excluded, so the span ends at the last voiced frame — the same
    /// instant a regular hangover close would have recorded.
    func finish() {
        guard inSpeech else { return }
        inSpeech = false
        appendSpan(startSample: spanStartSample,
                   endSample: cursorSamples - silentRun * frameSize)
    }

    // MARK: - Framing

    /// Evaluates every complete frame buffered in `acc` and drops the consumed
    /// samples, leaving fewer than `frameSize` samples behind.
    private func drainFrames() {
        var offset = 0
        while offset + frameSize <= acc.count {
            let energy = acc[offset..<(offset + frameSize)].reduce(Float(0)) { $0 + $1 * $1 }
            commitFrame(rms: (energy / Float(frameSize)).squareRoot())
            offset += frameSize
        }
        if offset > 0 { acc.removeFirst(offset) }
    }

    /// Runs the detector on one frame, then advances the cursor past it.
    private func commitFrame(rms: Float) {
        step(rms: rms)
        cursorSamples += frameSize
    }

    // MARK: - Detection

    /// Updates the noise floor and the open/close state machine for one frame.
    ///
    /// `cursorSamples` is the start sample of the frame being evaluated.
    private func step(rms: Float) {
        // Asymmetric smoothing: follow drops quickly, rises slowly.
        if rms < noiseFloor {
            noiseFloor = 0.9 * noiseFloor + 0.1 * rms
        } else {
            noiseFloor = 0.995 * noiseFloor + 0.005 * rms
        }

        let threshold = max(absoluteFloor, noiseFloor * floorMultiplier)
        let voiced = rms > threshold

        if voiced {
            voicedRun += 1
            silentRun = 0
            if !inSpeech && voicedRun >= openFrames {
                inSpeech = true
                // Back-date the start to the first frame of the voiced run.
                spanStartSample = cursorSamples - (voicedRun - 1) * frameSize
            }
        } else {
            silentRun += 1
            voicedRun = 0
            if inSpeech && silentRun >= closeFrames {
                inSpeech = false
                // End at the start of the first silent frame of the hangover.
                appendSpan(startSample: spanStartSample,
                           endSample: cursorSamples - (closeFrames - 1) * frameSize)
            }
        }
    }

    /// Converts a sample range to seconds and records it; empty or inverted
    /// ranges are dropped.
    private func appendSpan(startSample: Int, endSample: Int) {
        let start = Double(max(0, startSample)) / sampleRate
        let end = Double(endSample) / sampleRate
        if end > start { spans.append(VADSpan(start: start, end: end, confidence: 0.9)) }
    }
}
|
||||
Reference in New Issue
Block a user