Client: dual-channel label-merge (mic_file + system_file)

The backend shipped dual-channel mode; wire the client to it. We already capture mic (you) and system (others) separately, so send them as two files instead of the mono mix — fixing the misattribution at the source.

- SparkControlClient: labelMergeDual(mic_file, system_file, self_name, self_vad); multipart generalized to N files; shared POST/retry/decode extracted.
- SessionPackager.rebasedSelfVadData: chunk-local [{start,end}] for self_vad; sliceAudio reused for both tracks.
- TranscriptPipeline.process: dual-channel chunking (slice mic+system, rebase timeline + self_vad per chunk) when system audio is healthy; mono mixed-file fallback (self folded into the timeline) otherwise.
- VisualCapture.finish: write the full visual_timeline.json (remote + self merged) but return REMOTE (vision) segments only — self travels via the mic channel.
- TranscriptAssembler: rank mic_channel highest (the user's own track wins).
- VoiceprintStore: store the clean mic_channel self voiceprint.
- SessionController: pass mic/system URLs + remote timeline + channel self-spans + self_name + systemHealthy; self_vad.json now reflects the channel-verified spans.

Validated END-TO-END against the live backend on the real misattributing session: 'Go Bitcoin' (remote) is now attributed to Unknown_0, NOT the user; the user's own lines come back source=mic_channel; per-channel ASR recovered fuller remote text.

36/36 XCTest (4 new: self_vad rebase, mic_channel ranking + voiceprint storage).
2026-06-06 13:15:29 -05:00
parent 2191486506
commit 53d7fcdac0
9 changed files with 199 additions and 62 deletions
@@ -88,8 +88,7 @@ final class SparkControlClient {

    deinit { urlSession.finishTasksAndInvalidate() }

-    /// One `label-merge` call. `timeline` is the flat `[{start,end,name,confidence}]`
-    /// JSON (chunk-local seconds). Retries on `503 + Retry-After`.
+    /// Mono `label-merge`: one mixed-mono file + timeline. Retries on `503`.
    func labelMerge(audioURL: URL,
                    timeline: Data,
                    knownVoiceprints: [String: [Float]]?,
@@ -97,14 +96,46 @@ final class SparkControlClient {
                    minOverlap: Double? = nil,
                    voiceprintThreshold: Double? = nil,
                    maxRetries: Int = 3) async throws -> LabelMergeResponse {
-        guard let url = URL(string: baseURL + "/api/audio/label-merge") else {
-            throw SparkControlError.invalidHost
-        }
+        let fields = Self.commonFields(timeline: timeline, knownVoiceprints: knownVoiceprints,
+                                       transcribe: transcribe, minOverlap: minOverlap,
+                                       voiceprintThreshold: voiceprintThreshold)
+        let files = [(field: "file", filename: audioURL.lastPathComponent, data: try Data(contentsOf: audioURL))]
+        return try await perform(fields: fields, files: files, maxRetries: maxRetries)
+    }

+    /// Dual-channel `label-merge`: separate mic (local user) + system (remote)
+    /// tracks. The mic channel is attributed as `self_name`; `timeline` names only
+    /// the remote/system speakers; `selfVad` (optional) are the chunk-local windows
+    /// where the mic is genuinely the user (active and louder than system).
+    func labelMergeDual(micURL: URL,
+                        systemURL: URL,
+                        selfName: String,
+                        selfVad: Data?,
+                        timeline: Data,
+                        knownVoiceprints: [String: [Float]]?,
+                        transcribe: Bool,
+                        minOverlap: Double? = nil,
+                        voiceprintThreshold: Double? = nil,
+                        maxRetries: Int = 3) async throws -> LabelMergeResponse {
+        var fields = Self.commonFields(timeline: timeline, knownVoiceprints: knownVoiceprints,
+                                       transcribe: transcribe, minOverlap: minOverlap,
+                                       voiceprintThreshold: voiceprintThreshold)
+        fields["self_name"] = selfName
+        if let selfVad, let str = String(data: selfVad, encoding: .utf8) { fields["self_vad"] = str }
+        let files = [
+            (field: "mic_file", filename: micURL.lastPathComponent, data: try Data(contentsOf: micURL)),
+            (field: "system_file", filename: systemURL.lastPathComponent, data: try Data(contentsOf: systemURL)),
+        ]
+        return try await perform(fields: fields, files: files, maxRetries: maxRetries)
+    }
+
+    // MARK: - Transport
+
+    private static func commonFields(timeline: Data, knownVoiceprints: [String: [Float]]?,
+                                     transcribe: Bool, minOverlap: Double?,
+                                     voiceprintThreshold: Double?) -> [String: String] {
        var fields: [String: String] = ["transcribe": transcribe ? "true" : "false"]
-        if let timelineString = String(data: timeline, encoding: .utf8) {
-            fields["timeline"] = timelineString
-        }
+        if let timelineString = String(data: timeline, encoding: .utf8) { fields["timeline"] = timelineString }
        if let known = knownVoiceprints, !known.isEmpty,
           let data = try? JSONSerialization.data(withJSONObject: known.mapValues { $0.map { Double($0) } }),
           let str = String(data: data, encoding: .utf8) {
@@ -112,11 +143,17 @@ final class SparkControlClient {
        }
        if let minOverlap { fields["min_overlap"] = String(minOverlap) }
        if let voiceprintThreshold { fields["voiceprint_threshold"] = String(voiceprintThreshold) }
+        return fields
+    }

-        let audio = try Data(contentsOf: audioURL)
-        // Body doesn't change between retries — build it once.
-        let (body, contentType) = Self.multipart(fields: fields, fileField: "file",
-                                                 filename: audioURL.lastPathComponent, fileData: audio)
+    /// Shared POST + retry-on-503 + decode. Body is built once (constant across retries).
+    private func perform(fields: [String: String],
+                         files: [(field: String, filename: String, data: Data)],
+                         maxRetries: Int) async throws -> LabelMergeResponse {
+        guard let url = URL(string: baseURL + "/api/audio/label-merge") else {
+            throw SparkControlError.invalidHost
+        }
+        let (body, contentType) = Self.multipart(fields: fields, files: files)

        var attempt = 0
        while true {
@@ -158,8 +195,8 @@ final class SparkControlClient {
        return String(data: data, encoding: .utf8) ?? "unknown error"
    }

-    private static func multipart(fields: [String: String], fileField: String,
-                                  filename: String, fileData: Data) -> (Data, String) {
+    private static func multipart(fields: [String: String],
+                                  files: [(field: String, filename: String, data: Data)]) -> (Data, String) {
        let boundary = "Boundary-\(UUID().uuidString)"
        var body = Data()
        func append(_ s: String) { body.append(s.data(using: .utf8)!) }
@@ -169,11 +206,14 @@ final class SparkControlClient {
            append("Content-Disposition: form-data; name=\"\(name)\"\r\n\r\n")
            append("\(value)\r\n")
        }
-        append("--\(boundary)\r\n")
-        append("Content-Disposition: form-data; name=\"\(fileField)\"; filename=\"\(filename)\"\r\n")
-        append("Content-Type: audio/wav\r\n\r\n")
-        body.append(fileData)
-        append("\r\n--\(boundary)--\r\n")
+        for file in files {
+            append("--\(boundary)\r\n")
+            append("Content-Disposition: form-data; name=\"\(file.field)\"; filename=\"\(file.filename)\"\r\n")
+            append("Content-Type: audio/wav\r\n\r\n")
+            body.append(file.data)
+            append("\r\n")
+        }
+        append("--\(boundary)--\r\n")
        return (body, "multipart/form-data; boundary=\(boundary)")
    }
 }
@@ -48,9 +48,10 @@ final class VoiceprintStore {
            guard !Self.isUnknown(sp.name) else { continue }
            let acceptable: Bool
            switch sp.source {
-            case "visual":     acceptable = (sp.overlapConfidence ?? 0) >= minOverlapToStore
-            case "voiceprint": acceptable = true            // already matched a known print
-            default:           acceptable = false           // unmatched
+            case "mic_channel": acceptable = true           // the user's own clean mic voiceprint
+            case "visual":      acceptable = (sp.overlapConfidence ?? 0) >= minOverlapToStore
+            case "voiceprint":  acceptable = true            // already matched a known print
+            default:            acceptable = false           // unmatched
            }
            guard acceptable, let vector = sp.fingerprint ?? response.fingerprints[sp.name],
                  !vector.isEmpty else { continue }
@@ -65,8 +65,13 @@ final class SessionController: ObservableObject {
        let folder: URL
        let sessionId: String
        let app: String
+        let micURL: URL
+        let systemURL: URL
        let mixedURL: URL
-        let timeline: [VisualTimeline.Segment]
+        let timeline: [VisualTimeline.Segment]   // remote visual names; self handled via the mic channel
+        let selfSpans: [VADSpan]
+        let selfName: String
+        let systemHealthy: Bool
    }
    private var lastProcess: ProcessInputs?
    private var processTask: Task<Void, Never>?
@@ -275,19 +280,20 @@ final class SessionController: ObservableObject {
    /// ran, otherwise the mic-VAD self spans alone. `visualRan` reports whether the
    /// visual pipeline actually attached (for the after-session indicator).
    private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?)
-        async -> (timeline: [VisualTimeline.Segment], visualRan: Bool) {
+        async -> (timeline: [VisualTimeline.Segment], selfSpans: [VADSpan], visualRan: Bool) {
        let selfName = settings.selfName
        let selfSpans = await channelSelfSpans(result: result, folder: folder)
        if let vc = visualCapture, let folder {
            visualCapture = nil
-            let timeline = await vc.finish(
+            // Remote (vision) segments only; self travels separately as the mic channel.
+            let remote = await vc.finish(
                selfSpans: selfSpans, selfName: selfName,
                sessionId: folder.lastPathComponent, t0Unix: result.t0Unix,
                durationSec: result.duration, folder: folder)
-            return (timeline, true)
+            return (remote, selfSpans, true)
        }
        if let vc = visualCapture { await vc.cancel(); visualCapture = nil }
-        return (TranscriptPipeline.timeline(fromSelfSpans: selfSpans, selfName: selfName), false)
+        return ([], selfSpans, false)
    }

    /// Self spans for the backend timeline, identified by CHANNEL: the mic track is
@@ -312,26 +318,29 @@ final class SessionController: ObservableObject {
        lifecycleTask = Task {
            let result = await recorder.stop()
            let visual = await self.stopVisualAndTimeline(result, folder: folder)
-            self.finish(result, timeline: visual.timeline, visualRan: visual.visualRan)
+            self.finish(result, timeline: visual.timeline, selfSpans: visual.selfSpans, visualRan: visual.visualRan)
        }
    }

-    private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment], visualRan: Bool) {
+    private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment],
+                        selfSpans: [VADSpan], visualRan: Bool) {
        recorder = nil
        micLevel = 0
        systemLevel = 0
        warning = result.systemNote.map { "System audio stopped early: \($0)" }
        transcriptStatus = .idle
        if let folder = currentFolder {
-            writeSelfSpans(result, to: folder)
-            let visualCount = visualRan ? timeline.filter { $0.source == "vision" }.count : nil
+            writeSelfSpans(spans: selfSpans, result: result, to: folder)
+            let visualCount = visualRan ? timeline.count : nil   // `timeline` is the remote vision segments
            lastSession = SessionInfo(
                folder: folder, mixedURL: result.mixedURL,
-                duration: result.duration, selfSpanCount: result.selfSpans.count,
+                duration: result.duration, selfSpanCount: selfSpans.count,
                visualSegmentCount: visualCount)
            lastProcess = ProcessInputs(
                folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
-                mixedURL: result.mixedURL, timeline: timeline)
+                micURL: result.micURL, systemURL: result.systemURL, mixedURL: result.mixedURL,
+                timeline: timeline, selfSpans: selfSpans, selfName: settings.selfName,
+                systemHealthy: result.systemNote == nil)
        }
        let autoSend = settings.autoSendOnStop
        currentFolder = nil
@@ -360,11 +369,12 @@ final class SessionController: ObservableObject {
                baseURL: settings.backendBaseURL,
                skipTLS: settings.skipTLSVerification,
                voiceprints: voiceprints)
-            let timeline = inputs.timeline
            do {
                let speakers = try await pipeline.process(
                    sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
-                    mixedURL: inputs.mixedURL, timeline: timeline,
+                    micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL,
+                    timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName,
+                    systemHealthy: inputs.systemHealthy,
                    progress: { done, total in
                        await MainActor.run { self.transcriptStatus = .processing(done, total) }
                    })
@@ -411,7 +421,7 @@ final class SessionController: ObservableObject {
                let folder = currentFolder
                let result = await recorder.stop()
                let visual = await stopVisualAndTimeline(result, folder: folder)
-                finish(result, timeline: visual.timeline, visualRan: visual.visualRan)
+                finish(result, timeline: visual.timeline, selfSpans: visual.selfSpans, visualRan: visual.visualRan)
            } else if lifecycleGeneration == gen {
                break   // settled: no new transition was spawned
            }
@@ -461,15 +471,15 @@ final class SessionController: ObservableObject {
        return f.string(from: Date())
    }

-    /// Phase-1 preview of the mic-VAD "self" spans (the eventual
-    /// `visual_timeline.json` `mic_vad` segments). Lets us eyeball VAD quality.
-    private func writeSelfSpans(_ result: RecordingResult, to folder: URL) {
-        let segments = result.selfSpans.map { span -> [String: Any] in
+    /// Debug artifact: the channel-verified "self" spans actually sent to the backend
+    /// as `self_vad` (mic active AND louder than system). Lets us eyeball self detection.
+    private func writeSelfSpans(spans: [VADSpan], result: RecordingResult, to folder: URL) {
+        let segments = spans.map { span -> [String: Any] in
            ["start": span.start, "end": span.end, "name": "self",
-             "confidence": span.confidence, "source": "mic_vad"]
+             "confidence": span.confidence, "source": "mic_channel"]
        }
        let object: [String: Any] = [
-            "note": "Phase 1 mic-VAD self spans (preview of visual_timeline segments)",
+            "note": "channel-verified self spans (mic active and louder than system) — the self_vad sent to label-merge",
            "t0_unix": result.t0Unix,
            "duration_sec": result.duration,
            "self_spans": segments,
@@ -46,6 +46,18 @@ enum SessionPackager {
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }

+    /// Clip self-VAD spans to `[start, end)` and rebase to chunk-local seconds, as
+    /// the `self_vad` array `[{start,end}]` for dual-channel `label-merge`.
+    static func rebasedSelfVadData(_ spans: [VADSpan], start: Double, end: Double) throws -> Data {
+        let flat: [[String: Any]] = spans.compactMap { span in
+            let s = max(span.start, start)
+            let e = min(span.end, end)
+            guard e > s else { return nil }
+            return ["start": s - start, "end": e - start]
+        }
+        return try JSONSerialization.data(withJSONObject: flat, options: [])
+    }
+
    /// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
    static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
        let input = try AVAudioFile(forReading: source)
@@ -15,8 +15,10 @@ enum TranscriptAssembler {
    }

    /// Source ranking when the same name appears across chunks with different sources.
+    /// `mic_channel` (the local user's own microphone) is the most authoritative.
    private static func rank(_ source: String) -> Int {
        switch source {
+        case "mic_channel": return 4
        case "visual": return 3
        case "voiceprint": return 2
        default: return 1            // unmatched
@@ -12,16 +12,30 @@ final class TranscriptPipeline {
        self.voiceprints = voiceprints
    }

-    /// Process `mixedURL` against `timeline` (visual + self spans). Writes
-    /// `speakers.json` into `sessionFolder` and returns it. `progress(done,total)`
+    /// Process a finished session. **Dual-channel** when the system track is healthy
+    /// and present: mic (the local user) + system (remote) go as separate files, the
+    /// `timeline` names only the remote speakers, and `selfSpans` become `self_vad`.
+    /// Otherwise falls back to the **mono** mixed file with self folded into the
+    /// timeline. Writes `speakers.json` into `sessionFolder`. `progress(done,total)`
    /// is called per chunk.
    func process(sessionFolder: URL,
                 sessionId: String,
                 app: String,
+                 micURL: URL,
+                 systemURL: URL,
                 mixedURL: URL,
                 timeline: [VisualTimeline.Segment],
+                 selfSpans: [VADSpan],
+                 selfName: String,
+                 systemHealthy: Bool,
                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
-        let duration = SessionPackager.duration(of: mixedURL)
+        let fm = FileManager.default
+        let dual = systemHealthy
+            && fm.fileExists(atPath: micURL.path) && fm.fileExists(atPath: systemURL.path)
+            && SessionPackager.duration(of: systemURL) > 0
+        let duration = dual
+            ? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL))
+            : SessionPackager.duration(of: mixedURL)
        let plan = SessionPackager.planChunks(durationSec: duration)

        // Zero-duration / empty session → a valid empty speakers.json, no backend call.
@@ -33,32 +47,51 @@ final class TranscriptPipeline {
        }

        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
-        try? FileManager.default.createDirectory(at: chunksDir, withIntermediateDirectories: true)
-        defer { try? FileManager.default.removeItem(at: chunksDir) }   // cleanup on success OR throw
+        try? fm.createDirectory(at: chunksDir, withIntermediateDirectories: true)
+        defer { try? fm.removeItem(at: chunksDir) }   // cleanup on success OR throw

        // Start from stored voiceprints; accumulate this call's prints across chunks
        // for within-call unification (the store only persists high-confidence ones).
        var known = voiceprints.knownVoiceprints()
        var results: [TranscriptAssembler.ChunkResult] = []
+        // Mono fallback needs self folded into the timeline; dual sends it separately.
+        let monoTimeline = dual ? timeline
+            : timeline + Self.timeline(fromSelfSpans: selfSpans, selfName: selfName)

        for chunk in plan {
            try Task.checkCancellation()
            await progress?(chunk.index, plan.count)
-            let chunkURL = chunksDir.appendingPathComponent("chunk_\(String(format: "%03d", chunk.index)).wav")
-            try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
-            guard FileManager.default.fileExists(atPath: chunkURL.path) else { continue }  // empty slice → skip
+            let pad = String(format: "%03d", chunk.index)
+            let response: LabelMergeResponse

-            let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
-            let response = try await client.labelMerge(
-                audioURL: chunkURL, timeline: timelineData,
-                knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
+            if dual {
+                let micChunk = chunksDir.appendingPathComponent("chunk_\(pad)_mic.wav")
+                let sysChunk = chunksDir.appendingPathComponent("chunk_\(pad)_sys.wav")
+                try SessionPackager.sliceAudio(from: micURL, startSec: chunk.start, endSec: chunk.end, to: micChunk)
+                try SessionPackager.sliceAudio(from: systemURL, startSec: chunk.start, endSec: chunk.end, to: sysChunk)
+                guard fm.fileExists(atPath: micChunk.path), fm.fileExists(atPath: sysChunk.path) else { continue }
+                let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
+                let selfVadData = try SessionPackager.rebasedSelfVadData(selfSpans, start: chunk.start, end: chunk.end)
+                response = try await client.labelMergeDual(
+                    micURL: micChunk, systemURL: sysChunk, selfName: selfName, selfVad: selfVadData,
+                    timeline: timelineData, knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
+                try? fm.removeItem(at: micChunk); try? fm.removeItem(at: sysChunk)
+            } else {
+                let chunkURL = chunksDir.appendingPathComponent("chunk_\(pad).wav")
+                try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
+                guard fm.fileExists(atPath: chunkURL.path) else { continue }   // empty slice → skip
+                let timelineData = try SessionPackager.rebasedTimelineData(monoTimeline, start: chunk.start, end: chunk.end)
+                response = try await client.labelMerge(
+                    audioURL: chunkURL, timeline: timelineData,
+                    knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
+                try? fm.removeItem(at: chunkURL)
+            }

            for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
                known[name] = fp
            }
            voiceprints.update(with: response)
            results.append(.init(chunkStart: chunk.start, response: response))
-            try? FileManager.default.removeItem(at: chunkURL)
        }
        await progress?(plan.count, plan.count)

@@ -55,29 +55,35 @@ final class VisualCapture {
        }
    }

-    /// Stop capture, fold in the mic-VAD self spans, write `visual_timeline.json`
-    /// into the session folder, and return the merged segments for `label-merge`.
+    /// Stop capture and write `visual_timeline.json` (the full human-readable picture:
+    /// remote visual segments + the mic-VAD self spans, merged). Returns ONLY the
+    /// remote (vision) segments — in dual-channel mode the backend names the system
+    /// track from these, while self is handled by the mic channel + `self_vad`.
    func finish(selfSpans: [VADSpan], selfName: String,
                sessionId: String, t0Unix: Double, durationSec: Double,
                folder: URL) async -> [VisualTimeline.Segment] {
-        observer.addSelfSpans(selfSpans, selfName: selfName)
        let (rawSegments, rawGaps) = await observer.stop()

        // The observer stops slightly after audio fixes `durationSec`, so a trailing
        // gap/segment can run past it. Clamp ends so the JSON is internally consistent
        // (and we never hand the backend a segment longer than the audio).
-        let segments = Self.clampSegments(rawSegments, to: durationSec)
+        let vision = Self.clampSegments(rawSegments, to: durationSec)   // remote speakers
        let gaps = Self.clampGaps(rawGaps, to: durationSec)
+        let selfSegs = Self.clampSegments(selfSpans.map {
+            VisualTimeline.Segment(start: $0.start, end: $0.end, name: selfName,
+                                   confidence: $0.confidence, source: "mic_vad")
+        }, to: durationSec)

-        let names = Set(segments.map { $0.name })
+        let artifact = (vision + selfSegs).sorted { $0.start < $1.start }
+        let names = Set(artifact.map { $0.name })
        let participants = names.sorted().map {
            VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil)
        }
        let timeline = VisualTimeline(
            sessionId: sessionId, app: app.label, adapterVersion: adapter.adapterVersion,
            t0Unix: t0Unix, durationSec: durationSec, fpsSampled: adapter.preferredFPS,
-            selfName: selfName, participants: participants, segments: segments, visualGaps: gaps)
+            selfName: selfName, participants: participants, segments: artifact, visualGaps: gaps)
        try? timeline.write(to: folder.appendingPathComponent("visual_timeline.json"))
-        return segments
+        return vision
    }
 }