Phase 1: dual-track audio capture → mixed-mono 16 kHz WAV + mic VAD
AudioRecorder captures system audio (ScreenCaptureKit) + mic (AVAudioEngine) on a single serial ioQueue, one shared monotonic t0, time-driven writers (pad gaps / trim overlaps) so tracks stay aligned, and an energy mic-VAD for 'self' spans. AudioMixer sums the aligned tracks into mixed_mono_16k.wav. SessionController drives a serialized start/stop state machine, writes the session folder + self_vad.json, exposes live level meters, and finalizes on quit. Hardening from review: ioQueue single-domain (no races), stop() never hangs (mic-first teardown + bounded stopCapture), layout-agnostic mic deep-copy, discard-only video output to keep SCStream alive, VAD lockstep on committed frames, stable signing team in project.yml, single-instance enforcement.
This commit is contained in:
@@ -0,0 +1,67 @@
|
||||
import AVFoundation
|
||||
|
||||
/// Sums the two aligned 16 kHz mono tracks (mic + system) into the single
/// **mixed-mono 16 kHz WAV** that the backend receives. Both inputs are already
/// front-padded to the shared t0, so frame N of each file is the same instant.
/// Streamed in 1-second chunks to keep memory flat for long calls.
enum AudioMixer {
    /// Mixes `micURL` and `systemURL` into a new PCM-16 WAV at `outURL`.
    ///
    /// An input that cannot be opened is treated as absent and contributes
    /// silence; the output is as long as the longer readable input (and empty
    /// if neither can be read). The sum is hard-clamped to `[-1, 1]`.
    ///
    /// - Parameters:
    ///   - micURL: Mic track to read.
    ///   - systemURL: System-audio track to read.
    ///   - outURL: Destination file; created by this call.
    /// - Throws: If the output file cannot be created or a chunk cannot be written.
    static func mix(mic micURL: URL, system systemURL: URL, into outURL: URL) throws {
        // Deliberately `try?`: one missing track must not prevent mixing the other.
        let mic = try? AVAudioFile(forReading: micURL)
        let sys = try? AVAudioFile(forReading: systemURL)

        // On disk: 16-bit little-endian integer PCM, 16 kHz, mono.
        let settings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: 16_000,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsBigEndianKey: false,
        ]
        let out = try AVAudioFile(
            forWriting: outURL,
            settings: settings,
            commonFormat: .pcmFormatFloat32,
            interleaved: false)

        let outFormat = Resampler.targetFormat
        let chunk: AVAudioFramePosition = 16_000   // one second per iteration
        let total = max(mic?.length ?? 0, sys?.length ?? 0)
        var pos: AVAudioFramePosition = 0

        while pos < total {
            let frames = AVAudioFrameCount(min(chunk, total - pos))
            guard let mixBuf = AVAudioPCMBuffer(pcmFormat: outFormat, frameCapacity: frames),
                  let dst = mixBuf.floatChannelData?[0] else { break }
            mixBuf.frameLength = frames
            let count = Int(frames)
            memset(dst, 0, count * MemoryLayout<Float>.size)

            add(file: mic, at: pos, maxFrames: frames, into: dst)
            add(file: sys, at: pos, maxFrames: frames, into: dst)

            // Hard-clip the sum; samples already inside [-1, 1] are left untouched.
            for i in 0..<count {
                if dst[i] > 1 {
                    dst[i] = 1
                } else if dst[i] < -1 {
                    dst[i] = -1
                }
            }
            try out.write(from: mixBuf)
            pos += AVAudioFramePosition(frames)
        }
    }

    /// Adds up to `maxFrames` frames of `file`, starting at frame `pos`, onto `dst`.
    ///
    /// Does nothing when `file` is nil, when `pos` is at or past the end of the
    /// file, or when the read fails (best-effort: that slice stays as it was).
    private static func add(file: AVAudioFile?, at pos: AVAudioFramePosition,
                            maxFrames: AVAudioFrameCount, into dst: UnsafeMutablePointer<Float>) {
        guard let file, pos < file.length else { return }
        file.framePosition = pos
        let toRead = AVAudioFrameCount(min(AVAudioFramePosition(maxFrames), file.length - pos))
        guard toRead > 0,
              let buf = AVAudioPCMBuffer(pcmFormat: file.processingFormat, frameCapacity: toRead)
        else { return }

        do {
            try file.read(into: buf, frameCount: toRead)
        } catch {
            // Best-effort: an unreadable slice contributes nothing to the mix.
            return
        }
        guard let src = buf.floatChannelData?[0] else { return }
        // Only the frames actually read are summed.
        for i in 0..<Int(buf.frameLength) {
            dst[i] += src[i]
        }
    }
}
|
||||
@@ -0,0 +1,333 @@
|
||||
import AVFoundation
|
||||
import ScreenCaptureKit
|
||||
import CoreMedia
|
||||
import QuartzCore
|
||||
|
||||
/// Everything `AudioRecorder.stop()` reports about a finished recording.
struct RecordingResult {
    /// Location of the mic track.
    let micURL: URL
    /// Location of the system-audio track.
    let systemURL: URL
    /// Location of the mixed mic + system file.
    let mixedURL: URL
    /// Length in seconds of the longer of the two tracks (frames / 16 000).
    let duration: Double
    /// Speaking spans detected on the mic track.
    let selfSpans: [VADSpan]
    /// Wall-clock time (seconds since 1970) captured when recording started.
    let t0Unix: Double
    /// Non-nil if system-audio capture stopped early (e.g. SCStream error).
    let systemNote: String?
}
|
||||
|
||||
/// Dual-track local audio capture for Phase 1.
///
/// - System audio via `SCStream` (`capturesAudio`); its audio handler runs on
///   `ioQueue`. A discard-only video output runs on `screenQueue` purely to keep
///   SCStream's frame pipeline drained (an unconsumed video queue can stall the
///   whole stream) — frames are dropped instantly, never stored.
/// - Mic via `AVAudioEngine` input tap: the tap deep-copies the raw buffer and
///   hands it to `ioQueue`, where it is resampled and written.
/// - **`ioQueue` is the single isolation domain** for the writers, VAD, both
///   resamplers, and lifecycle flags.
/// - One shared monotonic `t0` (`CACurrentMediaTime`). Each buffer is placed at
///   its true `(startHost − t0)` frame (gaps padded, overlaps trimmed), so mic
///   and system stay aligned and the mix is a straight sum.
/// - Live peak levels are exposed via `currentLevels()` for the UI meter.
/// - `stop()` tears the mic down first and bounds `stopCapture()` with a timeout,
///   so a wedged stream can never block finalization. No video is written.
/// - An instance is single-use: `stop()` drains both resamplers, after which
///   they produce no further output.
final class AudioRecorder: NSObject, SCStreamDelegate, SCStreamOutput {
    private let micURL: URL
    private let systemURL: URL
    private let mixedURL: URL

    private let ioQueue = DispatchQueue(label: "xyz.ten31.audio.io")
    private let screenQueue = DispatchQueue(label: "xyz.ten31.audio.screen")

    // ioQueue-only state:
    private var t0Host: Double = 0
    private var t0Unix: Double = 0
    private var micWriter: MonoTrackWriter?
    private var systemWriter: MonoTrackWriter?
    private var vad: MicVAD?
    private var tornDown = true
    private let micResampler = Resampler()
    private let systemResampler = Resampler()

    // Cross-thread, guarded by levelLock:
    private let levelLock = NSLock()
    private var micPeak: Float = 0
    private var sysPeak: Float = 0
    private var streamStopped = false
    private var systemErrorMessage: String?

    private var engine: AVAudioEngine?
    private var stream: SCStream?

    /// - Parameters:
    ///   - micURL: Where the mic track is written.
    ///   - systemURL: Where the system-audio track is written.
    ///   - mixedURL: Where `stop()` writes the mixed file.
    init(micURL: URL, systemURL: URL, mixedURL: URL) {
        self.micURL = micURL
        self.systemURL = systemURL
        self.mixedURL = mixedURL
    }

    // MARK: - Lifecycle

    /// Creates both track writers, pins the shared `t0`, then starts the mic and
    /// the system-audio stream (in that order).
    ///
    /// - Throws: If a writer cannot be created or either source fails to start;
    ///   anything already started is torn down before the error is rethrown.
    func start() async throws {
        let t0 = CACurrentMediaTime()
        let t0u = Date().timeIntervalSince1970
        try ioQueue.sync {
            // Create both writers before touching state so a failure leaves it untouched.
            let mic = try MonoTrackWriter(url: self.micURL)
            let sys = try MonoTrackWriter(url: self.systemURL)
            self.t0Host = t0
            self.t0Unix = t0u
            self.micWriter = mic
            self.systemWriter = sys
            self.vad = MicVAD()
            self.tornDown = false
        }
        do {
            try startMic()
            try await startSystem()   // throws if Screen Recording is denied
        } catch {
            await abortStart()
            throw error
        }
    }

    /// Stops both sources, flushes the resamplers, writes the mixed file and
    /// returns a summary of the recording.
    func stop() async -> RecordingResult {
        // Stop the mic FIRST — always succeeds and halts mic capture immediately.
        teardownMic()

        // Stop system capture WITHOUT hanging: an already-errored stream can make
        // stopCapture() block forever, so skip it if it already stopped and bound
        // it with a timeout otherwise.
        if let stream, !flag({ self.streamStopped }) {
            await Self.stopCaptureWithTimeout(stream, seconds: 3)
        }
        stream = nil

        var micFrames: Int64 = 0
        var sysFrames: Int64 = 0
        var spans: [VADSpan] = []
        var t0u: Double = 0

        // Runs after every ingest block already queued on ioQueue.
        ioQueue.sync {
            if let tail = micResampler.drain() {
                // Feed the VAD only what the file actually committed.
                if (micWriter?.write(tail) ?? 0) > 0 { vad?.feed(tail) }
            }
            if let tail = systemResampler.drain() { systemWriter?.write(tail) }
            vad?.finish()
            micFrames = micWriter?.framesWritten ?? 0
            sysFrames = systemWriter?.framesWritten ?? 0
            spans = vad?.spans ?? []
            t0u = t0Unix
            tornDown = true
            // Writers are released before mixing, which reads the same files.
            micWriter = nil
            systemWriter = nil
            vad = nil
        }

        // Best-effort: a failed mix still returns the individual tracks.
        try? AudioMixer.mix(mic: micURL, system: systemURL, into: mixedURL)

        let duration = Double(max(micFrames, sysFrames)) / 16_000.0
        let note: String? = flag { self.systemErrorMessage }
        return RecordingResult(
            micURL: micURL, systemURL: systemURL, mixedURL: mixedURL,
            duration: duration, selfSpans: spans, t0Unix: t0u, systemNote: note)
    }

    /// Undoes a partially completed `start()`: stops whatever was started and
    /// drops the writers and VAD.
    private func abortStart() async {
        teardownMic()
        if let stream { await Self.stopCaptureWithTimeout(stream, seconds: 3) }
        stream = nil
        ioQueue.sync {
            tornDown = true
            micWriter = nil
            systemWriter = nil
            vad = nil
        }
    }

    /// Removes the input tap, stops the engine and releases it.
    private func teardownMic() {
        engine?.inputNode.removeTap(onBus: 0)
        engine?.stop()
        engine = nil
    }

    /// Latest peak levels (0…1) for each source; decays so a stalled source fades.
    func currentLevels() -> (mic: Float, system: Float) {
        levelLock.lock(); defer { levelLock.unlock() }
        let snapshot = (mic: micPeak, system: sysPeak)
        // Each read decays the stored peaks; ingest raises them again.
        micPeak *= 0.55
        sysPeak *= 0.55
        return snapshot
    }

    // MARK: - Ingest (ioQueue only)

    /// Places a resampled mic buffer on the timeline: pads any gap with silence,
    /// trims any overlap, writes it, and keeps the VAD in lockstep with what the
    /// file actually committed.
    private func ingestMic(_ buffer: AVAudioPCMBuffer, startHost: Double) {
        guard !tornDown, let writer = micWriter, let vad else { return }
        let expected = timelineFrame(forHost: startHost)
        if expected > writer.framesWritten {
            let padded = writer.padSilence(expected - writer.framesWritten)
            if padded > 0 { vad.feedSilence(padded) }
        }
        guard let chunk = Self.alignedChunk(buffer, written: writer.framesWritten, expected: expected)
        else { return }
        updateLevel(chunk, isMic: true)
        if writer.write(chunk) > 0 { vad.feed(chunk) }
    }

    /// Places a resampled system-audio buffer on the timeline (pad gaps, trim
    /// overlaps) and writes it.
    private func ingestSystem(_ buffer: AVAudioPCMBuffer, startHost: Double) {
        guard !tornDown, let writer = systemWriter else { return }
        let expected = timelineFrame(forHost: startHost)
        if expected > writer.framesWritten {
            writer.padSilence(expected - writer.framesWritten)
        }
        guard let chunk = Self.alignedChunk(buffer, written: writer.framesWritten, expected: expected)
        else { return }
        updateLevel(chunk, isMic: false)
        writer.write(chunk)
    }

    /// 16 kHz frame index of `startHost` relative to `t0Host`, never negative.
    private func timelineFrame(forHost startHost: Double) -> Int64 {
        max(0, Int64(((startHost - t0Host) * 16_000).rounded()))
    }

    /// Drops the leading frames of `buffer` that the track already contains.
    /// Returns nil when the whole buffer overlaps what has been written.
    private static func alignedChunk(_ buffer: AVAudioPCMBuffer,
                                     written: Int64, expected: Int64) -> AVAudioPCMBuffer? {
        let overlap = max(0, Int(written - expected))
        guard overlap < Int(buffer.frameLength) else { return nil }
        return trimFront(buffer, by: overlap)
    }

    /// Uses `stamped` only if it is within 5 s of `entry`; otherwise falls back
    /// to `entry` so an implausible timestamp cannot create a huge pad.
    private static func plausibleStart(stamped: Double, entry: Double) -> Double {
        abs(stamped - entry) < 5 ? stamped : entry
    }

    // MARK: - Mic (AVAudioEngine)

    /// Installs the input tap and starts the engine.
    ///
    /// - Throws: If no usable input format is available or the engine fails to start.
    private func startMic() throws {
        let engine = AVAudioEngine()
        let input = engine.inputNode
        let format = input.inputFormat(forBus: 0)

        // With no input device the format reports 0 Hz / 0 channels, and
        // installing a tap with it raises an Objective-C exception that Swift
        // cannot catch. Fail with a normal error instead.
        guard format.sampleRate > 0, format.channelCount > 0 else {
            throw NSError(domain: "Ten31", code: 2,
                          userInfo: [NSLocalizedDescriptionKey: "No microphone input available."])
        }

        input.installTap(onBus: 0, bufferSize: 4096, format: format) { [weak self] buffer, when in
            guard let self else { return }
            let entry = CACurrentMediaTime()
            let stamped = when.isHostTimeValid ? AudioRecorder.hostSeconds(when.hostTime) : entry
            let startHost = AudioRecorder.plausibleStart(stamped: stamped, entry: entry)
            // Copy now: the engine reuses the tap buffer after this block returns.
            guard let raw = AudioRecorder.copy(buffer) else { return }
            self.ioQueue.async {
                guard !self.tornDown, let resampled = self.micResampler.resample(raw) else { return }
                self.ingestMic(resampled, startHost: startHost)
            }
        }
        engine.prepare()
        try engine.start()
        self.engine = engine
    }

    // MARK: - System (ScreenCaptureKit)

    /// Starts an `SCStream` on the first display with audio capture enabled and
    /// a tiny discard-only video output.
    ///
    /// - Throws: If shareable content cannot be queried, no display exists, or
    ///   the stream fails to start.
    private func startSystem() async throws {
        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
        guard let display = content.displays.first else {
            throw NSError(domain: "Ten31", code: 1,
                          userInfo: [NSLocalizedDescriptionKey: "No display available for system-audio capture."])
        }
        let filter = SCContentFilter(display: display, excludingWindows: [])

        let config = SCStreamConfiguration()
        config.capturesAudio = true
        config.excludesCurrentProcessAudio = true
        config.sampleRate = 48_000
        config.channelCount = 2
        config.width = 2
        config.height = 2
        config.minimumFrameInterval = CMTime(value: 1, timescale: 2)   // ~2 fps tiny video
        config.queueDepth = 6

        let stream = SCStream(filter: filter, configuration: config, delegate: self)
        try stream.addStreamOutput(self, type: .audio, sampleHandlerQueue: ioQueue)
        // Discard-only video consumer keeps SCStream's frame queue drained so the
        // stream stays alive; frames are dropped immediately and never stored.
        try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: screenQueue)
        try await stream.startCapture()
        self.stream = stream
    }

    /// `SCStreamOutput`: resamples and ingests audio buffers; ignores video.
    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
                of type: SCStreamOutputType) {
        guard type == .audio else { return }   // .screen frames discarded here
        guard CMSampleBufferDataIsReady(sampleBuffer),
              let pcm = Self.pcmBuffer(from: sampleBuffer),
              let resampled = systemResampler.resample(pcm) else { return }
        let entry = CACurrentMediaTime()
        let pts = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
        let stamped = pts.isValid ? pts.seconds : entry
        ingestSystem(resampled, startHost: Self.plausibleStart(stamped: stamped, entry: entry))
    }

    /// `SCStreamDelegate`: records that the stream died and why, so `stop()` can
    /// skip `stopCapture()` and surface the message.
    func stream(_ stream: SCStream, didStopWithError error: Error) {
        levelLock.lock()
        streamStopped = true
        systemErrorMessage = error.localizedDescription
        levelLock.unlock()
    }

    // MARK: - Helpers

    /// Raises the stored peak for one source if this buffer's peak is higher.
    private func updateLevel(_ buffer: AVAudioPCMBuffer, isMic: Bool) {
        guard let ch = buffer.floatChannelData?[0] else { return }
        var peak: Float = 0
        for i in 0..<Int(buffer.frameLength) {
            let magnitude = abs(ch[i])
            if magnitude > peak { peak = magnitude }
        }
        levelLock.lock()
        if isMic {
            if peak > micPeak { micPeak = peak }
        } else {
            if peak > sysPeak { sysPeak = peak }
        }
        levelLock.unlock()
    }

    /// Read a levelLock-guarded value.
    private func flag<T>(_ body: () -> T) -> T {
        levelLock.lock(); defer { levelLock.unlock() }
        return body()
    }

    /// Asks `stream` to stop and returns when it has stopped or after `seconds`,
    /// whichever comes first.
    ///
    /// A task group cannot be used for this race: a group waits for all of its
    /// children before returning and cancellation is cooperative, so a
    /// `stopCapture()` that never completes would still block the caller. A
    /// one-shot continuation lets the timer win without waiting for the stop.
    private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async {
        /// Resumes the wrapped continuation exactly once, however many times
        /// `fire()` is called and from whichever thread.
        final class OneShot: @unchecked Sendable {
            private let lock = NSLock()
            private var continuation: CheckedContinuation<Void, Never>?

            init(_ continuation: CheckedContinuation<Void, Never>) {
                self.continuation = continuation
            }

            func fire() {
                lock.lock()
                let pending = continuation
                continuation = nil
                lock.unlock()
                pending?.resume()
            }
        }

        await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
            let gate = OneShot(continuation)
            // The stop error (if any) is ignored: the stream is being discarded.
            stream.stopCapture { _ in gate.fire() }
            DispatchQueue.global().asyncAfter(deadline: .now() + seconds) { gate.fire() }
        }
    }

    /// Deep-copy a PCM buffer (the engine reuses the tap buffer). Layout-agnostic.
    private static func copy(_ buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
        guard buffer.frameLength > 0,
              let out = AVAudioPCMBuffer(pcmFormat: buffer.format, frameCapacity: buffer.frameLength)
        else { return nil }
        out.frameLength = buffer.frameLength
        let src = UnsafeMutableAudioBufferListPointer(UnsafeMutablePointer(mutating: buffer.audioBufferList))
        let dst = UnsafeMutableAudioBufferListPointer(out.mutableAudioBufferList)
        guard src.count == dst.count else { return nil }
        // Copy each underlying AudioBuffer, never past the smaller of the two sizes.
        for i in 0..<src.count {
            guard let from = src[i].mData, let to = dst[i].mData else { return nil }
            memcpy(to, from, min(Int(src[i].mDataByteSize), Int(dst[i].mDataByteSize)))
        }
        return out
    }

    /// Returns `buffer` without its first `frames` frames (channel 0 only).
    /// Returns the buffer itself when `frames <= 0`, and nil when nothing would
    /// remain or the buffer has no float channel data.
    private static func trimFront(_ buffer: AVAudioPCMBuffer, by frames: Int) -> AVAudioPCMBuffer? {
        if frames <= 0 { return buffer }
        let total = Int(buffer.frameLength)
        guard frames < total, let src = buffer.floatChannelData?[0] else { return nil }
        let kept = AVAudioFrameCount(total - frames)
        guard let out = AVAudioPCMBuffer(pcmFormat: buffer.format, frameCapacity: kept),
              let dst = out.floatChannelData?[0] else { return nil }
        out.frameLength = kept
        memcpy(dst, src + frames, Int(kept) * MemoryLayout<Float>.size)
        return out
    }

    /// Converts a mach host-time tick count to seconds using the mach timebase.
    private static func hostSeconds(_ hostTime: UInt64) -> Double {
        var info = mach_timebase_info_data_t()
        mach_timebase_info(&info)
        return Double(hostTime) * Double(info.numer) / Double(info.denom) / 1_000_000_000.0
    }

    /// Copies the PCM payload of `sampleBuffer` into a new `AVAudioPCMBuffer` of
    /// the sample buffer's own format. Returns nil if the format cannot be read,
    /// the buffer is empty, or the copy fails.
    private static func pcmBuffer(from sampleBuffer: CMSampleBuffer) -> AVAudioPCMBuffer? {
        guard let fmtDesc = CMSampleBufferGetFormatDescription(sampleBuffer),
              let asbdPtr = CMAudioFormatDescriptionGetStreamBasicDescription(fmtDesc) else { return nil }
        var asbd = asbdPtr.pointee
        guard let format = AVAudioFormat(streamDescription: &asbd) else { return nil }
        let frames = AVAudioFrameCount(CMSampleBufferGetNumSamples(sampleBuffer))
        guard frames > 0,
              let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frames) else { return nil }
        buffer.frameLength = frames
        let status = CMSampleBufferCopyPCMDataIntoAudioBufferList(
            sampleBuffer, at: 0, frameCount: Int32(frames), into: buffer.mutableAudioBufferList)
        return status == noErr ? buffer : nil
    }
}
|
||||
@@ -0,0 +1,98 @@
|
||||
import AVFoundation
|
||||
|
||||
/// A speaking span on the session `t0` timeline (seconds).
struct VADSpan: Equatable {
    /// Span start, in seconds from `t0`.
    let start: Double
    /// Span end, in seconds from `t0`.
    let end: Double
    /// Detector confidence attached to the span.
    let confidence: Double
}
|
||||
|
||||
/// Lightweight energy-based voice-activity detector for the **mic** track (the
/// user). It is fed the *exact same* 16 kHz mono stream the mic WAV receives —
/// real samples via `feed` and timeline-gap silence via `feedSilence` — so its
/// internal sample cursor always equals the mic file position, and span times
/// land on the same instants as `mixed_mono_16k.wav`.
///
/// Phase 3's `TimelineBuilder` will fold these in as high-confidence pre-seeded
/// "self" segments. Thresholds are intentionally simple and will be tuned later.
///
/// Single-threaded: all calls happen on `AudioRecorder.ioQueue`.
final class MicVAD {
    private let frameSize = 320            // 20 ms @ 16 kHz
    private let openFrames = 2             // ~40 ms above threshold to open
    private let closeFrames = 10           // ~200 ms hangover to close
    private let absoluteFloor: Float = 0.006
    private let floorMultiplier: Float = 2.5

    private var cursorSamples = 0          // total samples evaluated so far
    private var noiseFloor: Float = 0.01
    private var voicedRun = 0
    private var silentRun = 0
    private var inSpeech = false
    private var spanStartSample = 0
    private var acc: [Float] = []          // samples not yet forming a full frame
    private(set) var spans: [VADSpan] = []

    /// Feeds channel 0 of `buffer` and evaluates every complete frame.
    func feed(_ buffer: AVAudioPCMBuffer) {
        guard let ch = buffer.floatChannelData, buffer.frameLength > 0 else { return }
        acc.append(contentsOf: UnsafeBufferPointer(start: ch[0], count: Int(buffer.frameLength)))
        drainFrames()
    }

    /// Feeds `count` zero samples (a timeline gap).
    ///
    /// The gap is fed in bounded slices, so memory stays flat however long it
    /// is. Frames are still evaluated in order with the same carried remainder,
    /// so the result matches feeding the whole gap at once.
    func feedSilence(_ count: Int64) {
        guard count > 0 else { return }
        let sliceLimit = Int64(frameSize * 50)   // one second per slice
        var remaining = count
        while remaining > 0 {
            let slice = Int(min(sliceLimit, remaining))
            acc.append(contentsOf: repeatElement(0, count: slice))
            drainFrames()
            remaining -= Int64(slice)
        }
    }

    /// Close any span still open at end of capture.
    func finish() {
        if inSpeech {
            appendSpan(startSample: spanStartSample, endSample: cursorSamples)
            inSpeech = false
        }
    }

    /// Evaluates every complete `frameSize` frame in `acc`, then drops them and
    /// keeps the remainder for the next call.
    private func drainFrames() {
        var consumed = 0
        while consumed + frameSize <= acc.count {
            var energy: Float = 0
            for sample in acc[consumed..<(consumed + frameSize)] {
                energy += sample * sample
            }
            step(rms: (energy / Float(frameSize)).squareRoot())
            // Advance only after `step`, which reads the cursor as this frame's start.
            cursorSamples += frameSize
            consumed += frameSize
        }
        if consumed > 0 { acc.removeFirst(consumed) }
    }

    /// Advances the detector by one frame of the given RMS level.
    /// `cursorSamples` is the start sample of the frame being evaluated.
    private func step(rms: Float) {
        // Track the noise floor: move quickly towards quieter frames, slowly towards louder ones.
        if rms < noiseFloor {
            noiseFloor = 0.9 * noiseFloor + 0.1 * rms
        } else {
            noiseFloor = 0.995 * noiseFloor + 0.005 * rms
        }

        let threshold = max(absoluteFloor, noiseFloor * floorMultiplier)
        let voiced = rms > threshold

        if voiced {
            voicedRun += 1
            silentRun = 0
            if !inSpeech && voicedRun >= openFrames {
                inSpeech = true
                // Back-date the start to the first frame of this voiced run.
                spanStartSample = cursorSamples - (voicedRun - 1) * frameSize
            }
        } else {
            silentRun += 1
            voicedRun = 0
            if inSpeech && silentRun >= closeFrames {
                inSpeech = false
                // End at the first frame of the silent run, not after the hangover.
                appendSpan(startSample: spanStartSample,
                           endSample: cursorSamples - (closeFrames - 1) * frameSize)
            }
        }
    }

    /// Records a span in seconds (samples / 16 000); empty or inverted spans are dropped.
    private func appendSpan(startSample: Int, endSample: Int) {
        let start = Double(max(0, startSample)) / 16_000.0
        let end = Double(endSample) / 16_000.0
        if end > start {
            spans.append(VADSpan(start: start, end: end, confidence: 0.9))
        }
    }
}
|
||||
@@ -0,0 +1,67 @@
|
||||
import AVFoundation
|
||||
|
||||
/// Sequential **16 kHz mono PCM-16 WAV** writer. Deliberately "dumb": it only
/// appends buffers and silence and tracks `framesWritten`. Time alignment to the
/// shared `t0` is done by the caller (`AudioRecorder`), which pads/trims using
/// each buffer's true host time so the mic and system tracks stay anchored to
/// the same timeline even if buffers are dropped or the hardware clocks drift.
///
/// Single-threaded: all calls happen on `AudioRecorder.ioQueue`.
final class MonoTrackWriter {
    private let file: AVAudioFile
    /// Total frames successfully appended so far.
    private(set) var framesWritten: Int64 = 0

    /// Creates the output file at `url`.
    /// - Throws: If the file cannot be created.
    init(url: URL) throws {
        let settings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: 16_000,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsBigEndianKey: false,
        ]
        // On disk = Int16 PCM; processing/buffer format = Float32 (matches Resampler).
        self.file = try AVAudioFile(
            forWriting: url,
            settings: settings,
            commonFormat: .pcmFormatFloat32,
            interleaved: false)
    }

    /// Writes the buffer; returns the number of frames actually committed (0 on
    /// failure). Callers feed the VAD this committed count to stay in lockstep.
    @discardableResult
    func write(_ buffer: AVAudioPCMBuffer) -> Int64 {
        guard buffer.frameLength > 0 else { return 0 }
        do {
            try file.write(from: buffer)
        } catch {
            return 0   // best-effort: drop a buffer rather than tear down
        }
        let committed = Int64(buffer.frameLength)
        framesWritten += committed
        return committed
    }

    /// Append `count` frames of silence (to fill timeline gaps); returns frames
    /// actually committed. Stops at the first failed write, so the result may be
    /// less than `count`.
    @discardableResult
    func padSilence(_ count: Int64) -> Int64 {
        guard count > 0 else { return 0 }
        let chunk: Int64 = 16_000

        // One zeroed buffer is reused for every chunk; only its frameLength
        // changes. If it cannot be allocated or zeroed, nothing is written.
        let capacity = AVAudioFrameCount(min(chunk, count))
        guard let silence = AVAudioPCMBuffer(pcmFormat: Resampler.targetFormat, frameCapacity: capacity),
              let channel = silence.floatChannelData?[0] else { return 0 }
        memset(channel, 0, Int(capacity) * MemoryLayout<Float>.size)

        var committed: Int64 = 0
        while committed < count {
            let n = AVAudioFrameCount(min(chunk, count - committed))
            silence.frameLength = n
            if write(silence) == 0 { break }
            committed += Int64(n)
        }
        return committed
    }
}
|
||||
@@ -0,0 +1,65 @@
|
||||
import AVFoundation
|
||||
|
||||
/// Converts arbitrary input PCM buffers to **16 kHz mono Float32**, maintaining
/// resampler state across calls. Reuse one instance per source stream so the
/// internal sample-rate converter stays continuous across buffers.
///
/// Not thread-safe: use one instance from a single thread. Both the mic and
/// system instances are driven exclusively from `AudioRecorder.ioQueue` (one per
/// source stream), kept continuous across buffers.
final class Resampler {
    /// The canonical Phase-1 audio format: 16 kHz, mono, Float32, deinterleaved.
    static let targetFormat = AVAudioFormat(
        commonFormat: .pcmFormatFloat32,
        sampleRate: 16_000,
        channels: 1,
        interleaved: false)!

    private var converter: AVAudioConverter?
    private var sourceFormat: AVAudioFormat?
    /// Set by `drain()`; once true, no further output is produced.
    private var ended = false

    /// 16 kHz mono buffer for `input`, or nil if conversion produced nothing.
    ///
    /// Also returns nil after `drain()` has been called, for an empty input, or
    /// when no converter can be created for the input's format.
    func resample(_ input: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
        guard !ended, input.frameLength > 0 else { return nil }

        // (Re)build the converter on first use and whenever the input format changes.
        if converter == nil || sourceFormat != input.format {
            converter = AVAudioConverter(from: input.format, to: Self.targetFormat)
            sourceFormat = input.format
        }
        guard let converter else { return nil }

        // Size the output for the rate-converted frame count, plus headroom.
        let ratio = Self.targetFormat.sampleRate / input.format.sampleRate
        let capacity = AVAudioFrameCount((Double(input.frameLength) * ratio).rounded(.up)) + 64
        guard let output = AVAudioPCMBuffer(pcmFormat: Self.targetFormat, frameCapacity: capacity) else {
            return nil
        }

        var handedOver = false
        var error: NSError?
        let status = converter.convert(to: output, error: &error) { _, inputStatus in
            // Supply `input` exactly once; further requests get "no data now"
            // so the stream is not marked as ended.
            if handedOver {
                inputStatus.pointee = .noDataNow
                return nil
            }
            handedOver = true
            inputStatus.pointee = .haveData
            return input
        }
        if status == .error || output.frameLength == 0 { return nil }
        return output
    }

    /// Flush the converter's internal tail at end of stream (call once on stop).
    ///
    /// Marks the resampler as ended. Returns nil if it had already ended, never
    /// received input, or the flush produced no frames.
    func drain() -> AVAudioPCMBuffer? {
        guard !ended, let converter else {
            ended = true
            return nil
        }
        ended = true
        guard let output = AVAudioPCMBuffer(pcmFormat: Self.targetFormat, frameCapacity: 8192) else {
            return nil
        }
        var error: NSError?
        let status = converter.convert(to: output, error: &error) { _, inputStatus in
            // No more input: signal end of stream so the converter emits its tail.
            inputStatus.pointee = .endOfStream
            return nil
        }
        if status == .error || output.frameLength == 0 { return nil }
        return output
    }
}
|
||||
Reference in New Issue
Block a user