diff --git a/Ten31Transcripts/Audio/ChannelSelfVAD.swift b/Ten31Transcripts/Audio/ChannelSelfVAD.swift
new file mode 100644
index 0000000..cf86092
--- /dev/null
+++ b/Ten31Transcripts/Audio/ChannelSelfVAD.swift
@@ -0,0 +1,99 @@
+import AVFoundation
+
+/// Channel-verified "self" detection. The **mic track is, by definition, the local
+/// user** — so instead of guessing self from the screen name, we mark self-speech
+/// as the windows where the mic channel is active AND clearly louder than the
+/// system channel (the remote participants). The `mic > system` test means a remote
+/// person leaking faintly into the mic (room echo) can never be mislabeled as self,
+/// and self is identified the same way on every platform — the user sets ONE name,
+/// no need to match per-app display names.
+///
+/// Runs at finalize over the two finished WAVs; the live capture path is untouched.
+enum ChannelSelfVAD {
+
+    /// Pure core (testable): self spans from per-window RMS envelopes of the two
+    /// channels. `windowSec` is the envelope resolution.
+    /// - `floor`: minimum mic RMS to count as voice.
+    /// - `margin`: mic must exceed system by this factor (self dominates the room).
+    /// - `hangover`: bridge gaps up to this many windows so one span isn't chopped.
+    /// - `minWindows`: drop blips shorter than this.
+    ///
+    /// Only the overlapping prefix of the two envelopes is examined. A span runs from
+    /// its first active window to the end of its last active window, so trailing
+    /// hangover silence is never included in the reported end time.
+    static func selfSpans(micRMS: [Float], systemRMS: [Float], windowSec: Double,
+                          floor: Float = 0.01, margin: Float = 1.5,
+                          hangover: Int = 7, minWindows: Int = 3,
+                          confidence: Double = 0.9) -> [VADSpan] {
+        let n = min(micRMS.count, systemRMS.count)
+        guard n > 0 else { return [] }
+        var spans: [VADSpan] = []
+        var start = -1
+        var lastActive = -1
+        // Emits the open span (if any and long enough), then resets to "no open span".
+        func close() {
+            defer { start = -1 }
+            guard start >= 0, lastActive - start + 1 >= minWindows else { return }
+            spans.append(VADSpan(start: Double(start) * windowSec,
+                                 end: Double(lastActive + 1) * windowSec,
+                                 confidence: confidence))
+        }
+        for i in 0..<n {
+            let active = micRMS[i] > floor && micRMS[i] > systemRMS[i] * margin
+            if active {
+                if start < 0 { start = i }
+                lastActive = i
+            } else if start >= 0 && i - lastActive > hangover {
+                close()
+            }
+        }
+        close()
+        return spans
+    }
+
+    /// File wrapper: read the two aligned 16 kHz mono WAVs into RMS envelopes and run
+    /// the core. Returns nil if either file can't be read (caller falls back to mic-VAD).
+    static func selfSpans(micURL: URL, systemURL: URL, windowSec: Double = 0.03) -> [VADSpan]? {
+        guard let mic = rmsEnvelope(of: micURL, windowSec: windowSec),
+              let sys = rmsEnvelope(of: systemURL, windowSec: windowSec) else { return nil }
+        return selfSpans(micRMS: mic, systemRMS: sys, windowSec: windowSec)
+    }
+
+    /// Per-window RMS of the file's first channel. Returns nil when the file can't be
+    /// opened OR a read fails part-way: a truncated envelope would silently drop every
+    /// self span after the failure, whereas nil lets the caller fall back.
+    private static func rmsEnvelope(of url: URL, windowSec: Double) -> [Float]? {
+        guard windowSec > 0, windowSec.isFinite,
+              let file = try? AVAudioFile(forReading: url) else { return nil }
+        let sr = file.processingFormat.sampleRate
+        let win = max(1, Int(sr * windowSec))
+        let chunkFrames = AVAudioFrameCount(win * 64)
+        var env: [Float] = []
+        env.reserveCapacity(Int(file.length) / win + 1)
+        var acc: Float = 0
+        var accCount = 0
+        while file.framePosition < file.length {
+            let remaining = file.length - file.framePosition
+            let toRead = AVAudioFrameCount(min(AVAudioFramePosition(chunkFrames), remaining))
+            guard toRead > 0,
+                  let buf = AVAudioPCMBuffer(pcmFormat: file.processingFormat, frameCapacity: toRead),
+                  (try? file.read(into: buf, frameCount: toRead)) != nil,
+                  let ch = buf.floatChannelData?[0] else { return nil }
+            // A read that yields no frames makes no progress; stop rather than spin.
+            guard buf.frameLength > 0 else { break }
+            for i in 0..<Int(buf.frameLength) {
+                let s = ch[i]
+                acc += s * s
+                accCount += 1
+                if accCount == win {
+                    env.append((acc / Float(win)).squareRoot())
+                    acc = 0
+                    accCount = 0
+                }
+            }
+        }
+        // Flush the final partial window so the tail of the recording is not lost.
+        if accCount > 0 { env.append((acc / Float(accCount)).squareRoot()) }
+        return env
+    }
+}
diff --git a/Ten31Transcripts/Session/SessionController.swift b/Ten31Transcripts/Session/SessionController.swift
index 279c308..efe9eb9 100644
--- a/Ten31Transcripts/Session/SessionController.swift
+++ b/Ten31Transcripts/Session/SessionController.swift
@@ -277,16 +277,30 @@ final class SessionController: ObservableObject {
     private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?)
         async -> (timeline: [VisualTimeline.Segment], visualRan: Bool) {
         let selfName = settings.selfName
+        let selfSpans = await channelSelfSpans(result: result, folder: folder)
         if let vc = visualCapture, let folder {
             visualCapture = nil
             let timeline = await vc.finish(
-                selfSpans: result.selfSpans, selfName: selfName,
+                selfSpans: selfSpans, selfName: selfName,
                 sessionId: folder.lastPathComponent, t0Unix: result.t0Unix,
                 durationSec: result.duration, folder: folder)
             return (timeline, true)
         }
         if let vc = visualCapture { await vc.cancel(); visualCapture = nil }
-        return (TranscriptPipeline.timeline(fromSelfSpans: result.selfSpans, selfName: selfName), false)
+        return (TranscriptPipeline.timeline(fromSelfSpans: selfSpans, selfName: selfName), false)
+    }
+
+    /// Self spans for the backend timeline, identified by CHANNEL: the mic track is
+    /// the local user, so self = mic active AND louder than system. This makes self
+    /// platform-independent (one name, no display-name matching) and stops a remote
+    /// speaker from being mislabeled as self. Falls back to the mic-VAD spans if the
+    /// tracks can't be read. Runs off the main actor (file I/O).
+    private func channelSelfSpans(result: RecordingResult, folder: URL?) async -> [VADSpan] {
+        guard let folder else { return result.selfSpans }
+        let mic = folder.appendingPathComponent("mic.wav")
+        let sys = folder.appendingPathComponent("system.wav")
+        let spans = await Task.detached { ChannelSelfVAD.selfSpans(micURL: mic, systemURL: sys) }.value
+        return spans ?? result.selfSpans
     }
 
     private func stop() {
diff --git a/Ten31TranscriptsTests/ChannelSelfVADTests.swift b/Ten31TranscriptsTests/ChannelSelfVADTests.swift
new file mode 100644
index 0000000..2832e8c
--- /dev/null
+++ b/Ten31TranscriptsTests/ChannelSelfVADTests.swift
@@ -0,0 +1,66 @@
+import XCTest
+@testable import Ten31Transcripts
+
+/// Channel-verified self detection: self = mic active AND louder than system, so a
+/// remote speaker (in the system channel) is never mislabeled as the local user.
+final class ChannelSelfVADTests: XCTestCase {
+    private let win = 0.05
+
+    func testSelfSpanWhereMicDominates() {
+        // Self talks windows 0–9 (mic loud, system silent), other talks 10–19.
+        let mic = Array(repeating: Float(0.15), count: 10) + Array(repeating: Float(0.0), count: 10)
+        let sys = Array(repeating: Float(0.0), count: 10) + Array(repeating: Float(0.15), count: 10)
+        let spans = ChannelSelfVAD.selfSpans(micRMS: mic, systemRMS: sys, windowSec: win)
+        XCTAssertEqual(spans.count, 1)
+        XCTAssertEqual(spans.first?.start ?? -1, 0.0, accuracy: 0.001)
+        // Last active window is 9, so the span ends at 10 windows = 0.5 s exactly.
+        XCTAssertEqual(spans.first?.end ?? -1, 0.5, accuracy: 0.001)
+    }
+
+    func testRemoteEchoIntoMicIsNotSelf() {
+        // Remote person loud in system, only faintly echoed into mic (below margin).
+        let mic = Array(repeating: Float(0.02), count: 20)
+        let sys = Array(repeating: Float(0.15), count: 20)
+        XCTAssertTrue(ChannelSelfVAD.selfSpans(micRMS: mic, systemRMS: sys, windowSec: win).isEmpty)
+    }
+
+    func testMicLouderButWithinMarginIsNotSelf() {
+        // Mic beats system, but not by the 1.5× margin (0.15 < 0.12 × 1.5 = 0.18).
+        let mic = Array(repeating: Float(0.15), count: 20)
+        let sys = Array(repeating: Float(0.12), count: 20)
+        XCTAssertTrue(ChannelSelfVAD.selfSpans(micRMS: mic, systemRMS: sys, windowSec: win).isEmpty)
+    }
+
+    func testSilenceProducesNoSpans() {
+        let q = Array(repeating: Float(0.003), count: 20)
+        XCTAssertTrue(ChannelSelfVAD.selfSpans(micRMS: q, systemRMS: q, windowSec: win).isEmpty)
+    }
+
+    func testShortBlipDropped() {
+        // 2 active windows < minWindows(3) → ignored.
+        let mic: [Float] = [0.2, 0.2] + Array(repeating: Float(0.0), count: 18)
+        let sys = Array(repeating: Float(0.0), count: 20)
+        XCTAssertTrue(ChannelSelfVAD.selfSpans(micRMS: mic, systemRMS: sys, windowSec: win).isEmpty)
+    }
+
+    func testHangoverBridgesShortGap() {
+        // Brief dip (2 windows) inside a self turn stays ONE span, not two.
+        let mic = Array(repeating: Float(0.2), count: 8) + [0.0, 0.0] + Array(repeating: Float(0.2), count: 8)
+        let sys = Array(repeating: Float(0.0), count: 18)
+        let spans = ChannelSelfVAD.selfSpans(micRMS: mic, systemRMS: sys, windowSec: win)
+        XCTAssertEqual(spans.count, 1)
+        XCTAssertEqual(spans.first?.start ?? -1, 0.0, accuracy: 0.001)
+        XCTAssertEqual(spans.first?.end ?? -1, 0.9, accuracy: 0.001)   // 18 windows
+    }
+
+    func testGapLongerThanHangoverSplitsSpans() {
+        // 10 silent windows > hangover(7) → the turn is closed and a new one opens.
+        let silence = Array(repeating: Float(0.0), count: 10)
+        let mic = Array(repeating: Float(0.2), count: 8) + silence + Array(repeating: Float(0.2), count: 8)
+        let sys = Array(repeating: Float(0.0), count: 26)
+        let spans = ChannelSelfVAD.selfSpans(micRMS: mic, systemRMS: sys, windowSec: win)
+        XCTAssertEqual(spans.count, 2)
+        XCTAssertEqual(spans.first?.end ?? -1, 0.4, accuracy: 0.001)    // windows 0–7
+        XCTAssertEqual(spans.last?.start ?? -1, 0.9, accuracy: 0.001)   // windows 18–25
+    }
+}