Client: dual-channel label-merge (mic_file + system_file)

The backend shipped dual-channel mode; wire the client to it. We already capture
mic (you) and system (others) separately, so send them as two files instead of the
mono mix — fixing the misattribution at the source.

- SparkControlClient: labelMergeDual(mic_file, system_file, self_name, self_vad);
  multipart generalized to N files; shared POST/retry/decode extracted.
- SessionPackager.rebasedSelfVadData: emits chunk-local [{start,end}] spans for
  self_vad; sliceAudio is reused for both tracks.
- TranscriptPipeline.process: dual-channel chunking (slice mic+system, rebase
  timeline + self_vad per chunk) when system audio is healthy; mono mixed-file
  fallback (self folded into the timeline) otherwise.
- VisualCapture.finish: write the full visual_timeline.json (remote + self merged)
  but return REMOTE (vision) segments only — self travels via the mic channel.
- TranscriptAssembler: rank mic_channel highest (the user's own track wins).
- VoiceprintStore: store the clean mic_channel self voiceprint.
- SessionController: pass mic/system URLs + remote timeline + channel self-spans +
  self_name + systemHealthy; self_vad.json now reflects the channel-verified spans.

Validated END-TO-END against the live backend on the real misattributing session:
'Go Bitcoin' (remote) is now attributed to Unknown_0, NOT the user; the user's own
lines come back source=mic_channel; per-channel ASR recovered fuller remote text.
All 36 XCTest cases pass (4 new, covering self_vad rebase, mic_channel ranking, and voiceprint storage).
This commit is contained in:
Grant Gilliam
2026-06-06 13:15:29 -05:00
parent 2191486506
commit 53d7fcdac0
9 changed files with 199 additions and 62 deletions
@@ -65,8 +65,13 @@ final class SessionController: ObservableObject {
let folder: URL
let sessionId: String
let app: String
let micURL: URL
let systemURL: URL
let mixedURL: URL
let timeline: [VisualTimeline.Segment]
let timeline: [VisualTimeline.Segment] // remote visual names; self handled via the mic channel
let selfSpans: [VADSpan]
let selfName: String
let systemHealthy: Bool
}
private var lastProcess: ProcessInputs?
private var processTask: Task<Void, Never>?
@@ -275,19 +280,20 @@ final class SessionController: ObservableObject {
/// ran, otherwise the mic-VAD self spans alone. `visualRan` reports whether the
/// visual pipeline actually attached (for the after-session indicator).
private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?)
async -> (timeline: [VisualTimeline.Segment], visualRan: Bool) {
async -> (timeline: [VisualTimeline.Segment], selfSpans: [VADSpan], visualRan: Bool) {
let selfName = settings.selfName
let selfSpans = await channelSelfSpans(result: result, folder: folder)
if let vc = visualCapture, let folder {
visualCapture = nil
let timeline = await vc.finish(
// Remote (vision) segments only; self travels separately as the mic channel.
let remote = await vc.finish(
selfSpans: selfSpans, selfName: selfName,
sessionId: folder.lastPathComponent, t0Unix: result.t0Unix,
durationSec: result.duration, folder: folder)
return (timeline, true)
return (remote, selfSpans, true)
}
if let vc = visualCapture { await vc.cancel(); visualCapture = nil }
return (TranscriptPipeline.timeline(fromSelfSpans: selfSpans, selfName: selfName), false)
return ([], selfSpans, false)
}
/// Self spans for the backend timeline, identified by CHANNEL: the mic track is
@@ -312,26 +318,29 @@ final class SessionController: ObservableObject {
lifecycleTask = Task {
let result = await recorder.stop()
let visual = await self.stopVisualAndTimeline(result, folder: folder)
self.finish(result, timeline: visual.timeline, visualRan: visual.visualRan)
self.finish(result, timeline: visual.timeline, selfSpans: visual.selfSpans, visualRan: visual.visualRan)
}
}
private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment], visualRan: Bool) {
private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment],
selfSpans: [VADSpan], visualRan: Bool) {
recorder = nil
micLevel = 0
systemLevel = 0
warning = result.systemNote.map { "System audio stopped early: \($0)" }
transcriptStatus = .idle
if let folder = currentFolder {
writeSelfSpans(result, to: folder)
let visualCount = visualRan ? timeline.filter { $0.source == "vision" }.count : nil
writeSelfSpans(spans: selfSpans, result: result, to: folder)
let visualCount = visualRan ? timeline.count : nil // `timeline` is the remote vision segments
lastSession = SessionInfo(
folder: folder, mixedURL: result.mixedURL,
duration: result.duration, selfSpanCount: result.selfSpans.count,
duration: result.duration, selfSpanCount: selfSpans.count,
visualSegmentCount: visualCount)
lastProcess = ProcessInputs(
folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
mixedURL: result.mixedURL, timeline: timeline)
micURL: result.micURL, systemURL: result.systemURL, mixedURL: result.mixedURL,
timeline: timeline, selfSpans: selfSpans, selfName: settings.selfName,
systemHealthy: result.systemNote == nil)
}
let autoSend = settings.autoSendOnStop
currentFolder = nil
@@ -360,11 +369,12 @@ final class SessionController: ObservableObject {
baseURL: settings.backendBaseURL,
skipTLS: settings.skipTLSVerification,
voiceprints: voiceprints)
let timeline = inputs.timeline
do {
let speakers = try await pipeline.process(
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
mixedURL: inputs.mixedURL, timeline: timeline,
micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL,
timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName,
systemHealthy: inputs.systemHealthy,
progress: { done, total in
await MainActor.run { self.transcriptStatus = .processing(done, total) }
})
@@ -411,7 +421,7 @@ final class SessionController: ObservableObject {
let folder = currentFolder
let result = await recorder.stop()
let visual = await stopVisualAndTimeline(result, folder: folder)
finish(result, timeline: visual.timeline, visualRan: visual.visualRan)
finish(result, timeline: visual.timeline, selfSpans: visual.selfSpans, visualRan: visual.visualRan)
} else if lifecycleGeneration == gen {
break // settled: no new transition was spawned
}
@@ -461,15 +471,15 @@ final class SessionController: ObservableObject {
return f.string(from: Date())
}
/// Phase-1 preview of the mic-VAD "self" spans (the eventual
/// `visual_timeline.json` `mic_vad` segments). Lets us eyeball VAD quality.
private func writeSelfSpans(_ result: RecordingResult, to folder: URL) {
let segments = result.selfSpans.map { span -> [String: Any] in
/// Debug artifact: the channel-verified "self" spans actually sent to the backend
/// as `self_vad` (mic active AND louder than system). Lets us eyeball self detection.
private func writeSelfSpans(spans: [VADSpan], result: RecordingResult, to folder: URL) {
let segments = spans.map { span -> [String: Any] in
["start": span.start, "end": span.end, "name": "self",
"confidence": span.confidence, "source": "mic_vad"]
"confidence": span.confidence, "source": "mic_channel"]
}
let object: [String: Any] = [
"note": "Phase 1 mic-VAD self spans (preview of visual_timeline segments)",
"note": "channel-verified self spans (mic active and louder than system) — the self_vad sent to label-merge",
"t0_unix": result.t0Unix,
"duration_sec": result.duration,
"self_spans": segments,