import AVFoundation

/// Channel-verified "self" detection. The **mic track is, by definition, the local
/// user** — so instead of guessing self from the screen name, we mark self-speech
/// as the windows where the mic channel is active AND clearly louder than the
/// system channel (the remote participants). The `mic > system * margin` test is
/// meant to keep a remote person leaking faintly into the mic (room echo) from
/// being labeled as self, and self is identified the same way on every platform —
/// the user sets ONE name, no need to match per-app display names.
///
/// Runs at finalize over the two finished WAVs; the live capture path is untouched.
enum ChannelSelfVAD {
    /// Pure core (testable): self spans from per-window RMS envelopes of the two
    /// channels.
    ///
    /// Window `i` is *active* when `micRMS[i] > floor` and
    /// `micRMS[i] > systemRMS[i] * margin`. Consecutive active windows, plus any
    /// inactive gaps of at most `hangover` windows between them, form one span.
    /// A span runs from the start of its first active window to the end of its
    /// last active window (trailing hangover windows are not included).
    ///
    /// - Parameters:
    ///   - micRMS: RMS envelope of the mic channel, one value per window.
    ///   - systemRMS: RMS envelope of the system channel, one value per window.
    ///     Only the first `min(micRMS.count, systemRMS.count)` windows are examined.
    ///   - windowSec: Envelope resolution; window index × `windowSec` gives seconds.
    ///   - floor: Minimum mic RMS to count as voice.
    ///   - margin: Mic must exceed system by this factor (self dominates the room).
    ///   - hangover: Bridge gaps up to this many windows so one span isn't chopped.
    ///   - minWindows: Drop blips shorter than this many windows, measured from the
    ///     first to the last active window inclusive (bridged gaps count).
    ///   - confidence: Value stamped on every emitted span.
    /// - Returns: Spans in chronological order; empty if either envelope is empty.
    static func selfSpans(micRMS: [Float], systemRMS: [Float], windowSec: Double,
                          floor: Float = 0.01, margin: Float = 1.5, hangover: Int = 7,
                          minWindows: Int = 3, confidence: Double = 0.9) -> [VADSpan] {
        let n = min(micRMS.count, systemRMS.count)
        guard n > 0 else { return [] }

        var spans: [VADSpan] = []
        var start = -1        // first window of the open span, or -1 when none is open
        var lastActive = -1   // most recent active window seen

        // Emits the open span (if it is long enough) and resets the state. The span
        // end is derived from `lastActive`, not from the current scan position, so
        // the hangover tail never inflates a span.
        func closeSpan() {
            defer { start = -1 }
            guard start >= 0, lastActive - start + 1 >= minWindows else { return }
            spans.append(VADSpan(start: Double(start) * windowSec,
                                 end: Double(lastActive + 1) * windowSec,
                                 confidence: confidence))
        }

        for i in 0..<n {
            let active = micRMS[i] > floor && micRMS[i] > systemRMS[i] * margin
            if active {
                if start < 0 { start = i }
                lastActive = i
            } else if start >= 0 && i - lastActive > hangover {
                // The gap has outlasted the hangover: the span is over.
                closeSpan()
            }
        }
        // Flush a span still open when the envelope ends.
        closeSpan()
        return spans
    }

    /// File wrapper: read the two aligned 16 kHz mono WAVs into RMS envelopes and run
    /// the core with its default thresholds.
    ///
    /// - Parameters:
    ///   - micURL: Recording of the mic channel.
    ///   - systemURL: Recording of the system channel.
    ///   - windowSec: Envelope window length in seconds.
    /// - Returns: The detected self spans, or nil if either file can't be opened
    ///   (caller falls back to mic-VAD).
    static func selfSpans(micURL: URL, systemURL: URL, windowSec: Double = 0.03) -> [VADSpan]? {
        guard let mic = rmsEnvelope(of: micURL, windowSec: windowSec),
              let sys = rmsEnvelope(of: systemURL, windowSec: windowSec)
        else { return nil }
        return selfSpans(micRMS: mic, systemRMS: sys, windowSec: windowSec)
    }

    /// Computes the RMS envelope of channel 0 of an audio file: one value per
    /// `windowSec` of audio (at the file's own processing sample rate), plus one
    /// value for a trailing partial window if there is one.
    ///
    /// - Returns: nil if the file can't be opened. If reading fails part-way
    ///   through, the envelope computed so far is returned (best effort).
    private static func rmsEnvelope(of url: URL, windowSec: Double) -> [Float]? {
        guard let file = try? AVAudioFile(forReading: url) else { return nil }
        let sr = file.processingFormat.sampleRate
        let win = max(1, Int(sr * windowSec))          // frames per envelope window
        let chunkFrames = AVAudioFrameCount(win * 64)  // read 64 windows per chunk

        var env: [Float] = []
        var acc: Float = 0   // running sum of squares for the current window
        var accCount = 0     // frames accumulated into `acc`

        while file.framePosition < file.length {
            let remaining = file.length - file.framePosition
            let toRead = AVAudioFrameCount(min(AVAudioFramePosition(chunkFrames), remaining))
            guard toRead > 0,
                  let buf = AVAudioPCMBuffer(pcmFormat: file.processingFormat, frameCapacity: toRead),
                  (try? file.read(into: buf, frameCount: toRead)) != nil,
                  // A read that delivers nothing would not advance the file position;
                  // stop instead of looping forever.
                  buf.frameLength > 0,
                  let ch = buf.floatChannelData?[0]
            else { break }

            // Windows may straddle chunk boundaries, so `acc`/`accCount` persist
            // across reads.
            for i in 0..<Int(buf.frameLength) {
                let s = ch[i]
                acc += s * s
                accCount += 1
                if accCount == win {
                    env.append((acc / Float(accCount)).squareRoot())
                    acc = 0
                    accCount = 0
                }
            }
        }

        // Trailing partial window.
        if accCount > 0 { env.append((acc / Float(accCount)).squareRoot()) }
        return env
    }
}