Channel-verified self identity: the mic track is you
Grant's insight + proven on real session audio: we capture self (mic) and others (system) as separate tracks, then throw the separation away by mixing to mono — so the backend has to re-guess who's who. Analysis of a real call showed the channels are cleanly separated (envelope corr 0.015, NO echo); Caitlyn's 'Go Bitcoin' was 11.8x louder in system than mic, yet the mono mix + noisy visual named it 'Grant'. ChannelSelfVAD marks self-speech as windows where the mic is active AND louder than system (mic > system x1.5). Benefits: (1) self is identified by CHANNEL, not by the on-screen name — set one name in Settings, no per-platform matching; (2) a remote speaker (or room echo) can never be mislabeled as self. Computed at finalize from the two finished WAVs; the live capture path is untouched. Falls back to mic-VAD if tracks can't be read. SessionController feeds these spans to the backend timeline. Validated on the real session: 16 self spans; 'Go Bitcoin' (72-74s) correctly EXCLUDED, Grant's 49.9-53.3s / 62.6-64s correctly INCLUDED. 33/33 XCTest (5 new).
This commit is contained in:
@@ -0,0 +1,85 @@
|
|||||||
|
import AVFoundation
|
||||||
|
|
||||||
|
/// Channel-verified "self" detection. The **mic track is, by definition, the local
/// user**, so instead of guessing self from the screen name, self-speech is marked
/// as the windows where the mic channel is active AND clearly louder than the
/// system channel (the remote participants). The `mic > system * margin` test keeps
/// a remote voice that only leaks faintly into the mic (room echo) from being
/// labeled as self, and self is identified the same way on every platform: the
/// user sets ONE name, with no need to match per-app display names.
///
/// Runs at finalize over the two finished WAVs; the live capture path is untouched.
enum ChannelSelfVAD {

    /// Pure core (testable): self spans from per-window RMS envelopes of the two
    /// channels.
    ///
    /// Window `i` is "active" when `micRMS[i] > floor` and
    /// `micRMS[i] > systemRMS[i] * margin`. Only the windows present in BOTH
    /// envelopes are examined; any tail of the longer envelope is ignored.
    ///
    /// - Parameters:
    ///   - micRMS: per-window RMS of the mic channel.
    ///   - systemRMS: per-window RMS of the system channel.
    ///   - windowSec: envelope resolution; window `i` covers
    ///     `i * windowSec ..< (i + 1) * windowSec`.
    ///   - floor: minimum mic RMS to count as voice.
    ///   - margin: mic must exceed system by this factor (self dominates the room).
    ///   - hangover: bridge gaps up to this many windows so one span isn't chopped.
    ///   - minWindows: drop blips shorter than this (measured first active window
    ///     to last active window, bridged gaps included).
    ///   - confidence: value stamped on every emitted span.
    /// - Returns: spans in ascending time order; empty when nothing qualifies.
    static func selfSpans(micRMS: [Float], systemRMS: [Float], windowSec: Double,
                          floor: Float = 0.01, margin: Float = 1.5,
                          hangover: Int = 7, minWindows: Int = 3,
                          confidence: Double = 0.9) -> [VADSpan] {
        var spans: [VADSpan] = []
        var runStart: Int?      // first active window of the open run, nil when no run is open
        var lastActive = 0      // most recent active window; only meaningful while a run is open

        // Emit the open run if it is long enough, then mark it closed either way.
        func flushRun() {
            defer { runStart = nil }
            guard let first = runStart, lastActive - first + 1 >= minWindows else { return }
            spans.append(VADSpan(start: Double(first) * windowSec,
                                 end: Double(lastActive + 1) * windowSec,
                                 confidence: confidence))
        }

        // `zip` stops at the shorter envelope, so only shared windows are compared.
        for (i, (mic, system)) in zip(micRMS, systemRMS).enumerated() {
            if mic > floor && mic > system * margin {
                if runStart == nil { runStart = i }
                lastActive = i
            } else if runStart != nil && i - lastActive > hangover {
                // The quiet stretch outlasted the hangover: the run ended at `lastActive`.
                flushRun()
            }
        }
        flushRun()
        return spans
    }

    /// File wrapper: read the two aligned mono WAVs into RMS envelopes and run the
    /// core with its default thresholds.
    ///
    /// - Returns: `nil` when either file can't be opened, when either envelope is
    ///   empty (a track with no audio gives nothing to compare against), or when
    ///   `windowSec` is not a usable positive duration. The caller is expected to
    ///   fall back to mic-VAD in that case.
    static func selfSpans(micURL: URL, systemURL: URL, windowSec: Double = 0.03) -> [VADSpan]? {
        guard let mic = rmsEnvelope(of: micURL, windowSec: windowSec),
              let sys = rmsEnvelope(of: systemURL, windowSec: windowSec),
              !mic.rms.isEmpty, !sys.rms.isEmpty else { return nil }
        // Time the spans with the duration the windows really cover (whole frames),
        // not the nominal request, so timestamps don't drift over a long session.
        // NOTE(review): if the two files ever differ in sample rate their windows
        // differ slightly in length; the mic's duration is used — confirm upstream
        // always writes both tracks at the same rate.
        return selfSpans(micRMS: mic.rms, systemRMS: sys.rms, windowSec: mic.windowSec)
    }

    /// RMS envelope of the first channel of the audio file at `url`.
    ///
    /// Each window spans a whole number of frames (the nearest to
    /// `sampleRate * windowSec`, at least 1). A final partial window is emitted
    /// with its own frame count as the divisor.
    ///
    /// - Returns: the envelope plus the exact duration of one window in seconds, or
    ///   `nil` if the file can't be opened, the read buffer can't be created, or
    ///   `windowSec` is non-finite, non-positive, or too large to buffer.
    ///   A read failure part-way through is treated as end of data (best effort):
    ///   the envelope gathered so far is returned.
    private static func rmsEnvelope(of url: URL, windowSec: Double) -> (rms: [Float], windowSec: Double)? {
        guard windowSec.isFinite, windowSec > 0,
              let file = try? AVAudioFile(forReading: url) else { return nil }
        let format = file.processingFormat
        let sampleRate = format.sampleRate
        // Round rather than truncate so the frame count is the closest match to the
        // requested duration; the bound keeps `win * 64` inside AVAudioFrameCount.
        // NaN or infinite products fail the comparison and return nil.
        let exactFrames = (sampleRate * windowSec).rounded()
        guard sampleRate > 0, exactFrames < Double(AVAudioFrameCount.max / 64) else { return nil }
        let win = max(1, Int(exactFrames))
        let chunkFrames = AVAudioFrameCount(win * 64)

        // One reusable buffer for the whole file instead of one allocation per chunk.
        guard let buf = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: chunkFrames) else { return nil }

        var env: [Float] = []
        env.reserveCapacity(Int(max(0, file.length)) / win + 1)
        var acc: Float = 0
        var accCount = 0

        while file.framePosition < file.length {
            let remaining = file.length - file.framePosition
            let toRead = AVAudioFrameCount(min(AVAudioFramePosition(chunkFrames), remaining))
            // A zero-length read would never advance `framePosition`; stop instead
            // of looping forever.
            guard toRead > 0,
                  (try? file.read(into: buf, frameCount: toRead)) != nil,
                  buf.frameLength > 0,
                  let ch = buf.floatChannelData?[0] else { break }
            for i in 0..<Int(buf.frameLength) {
                let v = ch[i]
                acc += v * v
                accCount += 1
                if accCount == win {
                    env.append((acc / Float(win)).squareRoot())
                    acc = 0
                    accCount = 0
                }
            }
        }
        // Trailing partial window.
        if accCount > 0 { env.append((acc / Float(accCount)).squareRoot()) }
        return (env, Double(win) / sampleRate)
    }
}
|
||||||
@@ -277,16 +277,30 @@ final class SessionController: ObservableObject {
|
|||||||
/// Stops visual capture and produces the speaker timeline for a finished recording.
///
/// Self spans are resolved first via `channelSelfSpans(result:folder:)`. If a
/// visual capture is running and a session folder exists, the capture is finished
/// with those spans and its timeline is returned with `visualRan == true`.
/// Otherwise any running capture is cancelled and the timeline is built from the
/// self spans alone, with `visualRan == false`.
private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?)
    async -> (timeline: [VisualTimeline.Segment], visualRan: Bool) {
    let localName = settings.selfName
    let channelSpans = await channelSelfSpans(result: result, folder: folder)

    // Timeline built without any visual input.
    func spansOnlyOutcome() -> (timeline: [VisualTimeline.Segment], visualRan: Bool) {
        (TranscriptPipeline.timeline(fromSelfSpans: channelSpans, selfName: localName), false)
    }

    guard let capture = visualCapture else {
        return spansOnlyOutcome()
    }
    guard let folder else {
        // A capture is running but there is nowhere to write: cancel it, then drop it.
        await capture.cancel()
        visualCapture = nil
        return spansOnlyOutcome()
    }

    // Release the stored capture before awaiting its result.
    visualCapture = nil
    let segments = await capture.finish(
        selfSpans: channelSpans, selfName: localName,
        sessionId: folder.lastPathComponent, t0Unix: result.t0Unix,
        durationSec: result.duration, folder: folder)
    return (segments, true)
}
|
||||||
|
|
||||||
|
/// Self spans for the backend timeline, identified by CHANNEL rather than by
/// display name: the mic track is the local user, so self-speech is where the mic
/// is active and louder than the system track.
///
/// Looks for `mic.wav` and `system.wav` inside `folder` and analyses them in a
/// detached task so the file I/O does not run on the caller's actor.
///
/// - Returns: the channel-verified spans, or `result.selfSpans` (the mic-VAD
///   spans) when there is no folder or the analysis returns `nil`.
private func channelSelfSpans(result: RecordingResult, folder: URL?) async -> [VADSpan] {
    guard let sessionFolder = folder else { return result.selfSpans }

    let micTrack = sessionFolder.appendingPathComponent("mic.wav")
    let systemTrack = sessionFolder.appendingPathComponent("system.wav")

    let analysis = Task.detached {
        ChannelSelfVAD.selfSpans(micURL: micTrack, systemURL: systemTrack)
    }
    if let channelSpans = await analysis.value {
        return channelSpans
    }
    return result.selfSpans
}
|
||||||
|
|
||||||
private func stop() {
|
private func stop() {
|
||||||
|
|||||||
@@ -0,0 +1,45 @@
|
|||||||
|
import XCTest
|
||||||
|
@testable import Ten31Transcripts
|
||||||
|
|
||||||
|
/// Channel-verified self detection: self = mic active AND louder than system, so a
/// remote speaker (in the system channel) is not labeled as the local user.
///
/// All cases drive the pure RMS-envelope core with its default thresholds
/// (floor 0.01, margin 1.5, hangover 7, minWindows 3).
final class ChannelSelfVADTests: XCTestCase {
    /// Envelope resolution used by every case, in seconds per window.
    private let win = 0.05

    func testSelfSpanWhereMicDominates() throws {
        // Self talks windows 0–9 (mic loud, system silent), other talks 10–19.
        let mic = Array(repeating: Float(0.15), count: 10) + Array(repeating: Float(0.0), count: 10)
        let sys = Array(repeating: Float(0.0), count: 10) + Array(repeating: Float(0.15), count: 10)
        let spans = ChannelSelfVAD.selfSpans(micRMS: mic, systemRMS: sys, windowSec: win)
        XCTAssertEqual(spans.count, 1)
        let span = try XCTUnwrap(spans.first)
        XCTAssertEqual(span.start, 0.0, accuracy: 0.001)
        // The span must end exactly after window 9 (10 * 0.05 s). The tolerance is
        // far below one window so an off-by-one-window result fails.
        XCTAssertEqual(span.end, 0.5, accuracy: 0.001)
    }

    func testRemoteEchoIntoMicIsNotSelf() {
        // Remote person loud in system, only faintly echoed into mic (below margin).
        let mic = Array(repeating: Float(0.02), count: 20)
        let sys = Array(repeating: Float(0.15), count: 20)
        XCTAssertTrue(ChannelSelfVAD.selfSpans(micRMS: mic, systemRMS: sys, windowSec: win).isEmpty)
    }

    func testSilenceProducesNoSpans() {
        // Both channels sit below the voice floor.
        let q = Array(repeating: Float(0.003), count: 20)
        XCTAssertTrue(ChannelSelfVAD.selfSpans(micRMS: q, systemRMS: q, windowSec: win).isEmpty)
    }

    func testShortBlipDropped() {
        // 2 active windows < minWindows(3) → ignored.
        let mic: [Float] = [0.2, 0.2] + Array(repeating: Float(0.0), count: 18)
        let sys = Array(repeating: Float(0.0), count: 20)
        XCTAssertTrue(ChannelSelfVAD.selfSpans(micRMS: mic, systemRMS: sys, windowSec: win).isEmpty)
    }

    func testHangoverBridgesShortGap() throws {
        // Brief dip (2 windows) inside a self turn stays ONE span, not two.
        let mic = Array(repeating: Float(0.2), count: 8) + [0.0, 0.0] + Array(repeating: Float(0.2), count: 8)
        let sys = Array(repeating: Float(0.0), count: 18)
        let spans = ChannelSelfVAD.selfSpans(micRMS: mic, systemRMS: sys, windowSec: win)
        XCTAssertEqual(spans.count, 1)
        // The single span must cover the whole turn, dip included: windows 0–17.
        let span = try XCTUnwrap(spans.first)
        XCTAssertEqual(span.start, 0.0, accuracy: 0.001)
        XCTAssertEqual(span.end, 0.9, accuracy: 0.001)
    }
}
|
||||||
Reference in New Issue
Block a user