Client: dual-channel label-merge (mic_file + system_file)

The backend shipped dual-channel mode; wire the client to it. We already capture mic (you) and system (others) separately, so send them as two files instead of the mono mix — fixing the misattribution at the source.

- SparkControlClient: labelMergeDual(mic_file, system_file, self_name, self_vad); multipart generalized to N files; shared POST/retry/decode extracted.
- SessionPackager.rebasedSelfVadData: chunk-local [{start,end}] for self_vad; sliceAudio reused for both tracks.
- TranscriptPipeline.process: dual-channel chunking (slice mic+system, rebase timeline + self_vad per chunk) when system audio is healthy; mono mixed-file fallback (self folded into the timeline) otherwise.
- VisualCapture.finish: write the full visual_timeline.json (remote + self merged) but return REMOTE (vision) segments only — self travels via the mic channel.
- TranscriptAssembler: rank mic_channel highest (the user's own track wins).
- VoiceprintStore: store the clean mic_channel self voiceprint.
- SessionController: pass mic/system URLs + remote timeline + channel self-spans + self_name + systemHealthy; self_vad.json now reflects the channel-verified spans.

Validated END-TO-END against the live backend on the real misattributing session: 'Go Bitcoin' (remote) is now attributed to Unknown_0, NOT the user; the user's own lines come back source=mic_channel; per-channel ASR recovered fuller remote text. 36/36 XCTest pass (4 new: self_vad rebase, mic_channel ranking + voiceprint storage).
2026-06-06 13:15:29 -05:00
parent 2191486506
commit 53d7fcdac0
9 changed files with 199 additions and 62 deletions
@@ -12,16 +12,30 @@ final class TranscriptPipeline {
        self.voiceprints = voiceprints
    }

-    /// Process `mixedURL` against `timeline` (visual + self spans). Writes
-    /// `speakers.json` into `sessionFolder` and returns it. `progress(done,total)`
+    /// Process a finished session. **Dual-channel** when the system track is healthy
+    /// and present: mic (the local user) + system (remote) go as separate files, the
+    /// `timeline` names only the remote speakers, and `selfSpans` become `self_vad`.
+    /// Otherwise falls back to the **mono** mixed file with self folded into the
+    /// timeline. Writes `speakers.json` into `sessionFolder`. `progress(done,total)`
    /// is called per chunk.
    func process(sessionFolder: URL,
                 sessionId: String,
                 app: String,
+                 micURL: URL,
+                 systemURL: URL,
                 mixedURL: URL,
                 timeline: [VisualTimeline.Segment],
+                 selfSpans: [VADSpan],
+                 selfName: String,
+                 systemHealthy: Bool,
                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
-        let duration = SessionPackager.duration(of: mixedURL)
+        let fm = FileManager.default
+        let dual = systemHealthy
+            && fm.fileExists(atPath: micURL.path) && fm.fileExists(atPath: systemURL.path)
+            && SessionPackager.duration(of: systemURL) > 0
+        let duration = dual
+            ? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL))
+            : SessionPackager.duration(of: mixedURL)
        let plan = SessionPackager.planChunks(durationSec: duration)

        // Zero-duration / empty session → a valid empty speakers.json, no backend call.
@@ -33,32 +47,51 @@ final class TranscriptPipeline {
        }

        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
-        try? FileManager.default.createDirectory(at: chunksDir, withIntermediateDirectories: true)
-        defer { try? FileManager.default.removeItem(at: chunksDir) }   // cleanup on success OR throw
+        try? fm.createDirectory(at: chunksDir, withIntermediateDirectories: true)
+        defer { try? fm.removeItem(at: chunksDir) }   // cleanup on success OR throw

        // Start from stored voiceprints; accumulate this call's prints across chunks
        // for within-call unification (the store only persists high-confidence ones).
        var known = voiceprints.knownVoiceprints()
        var results: [TranscriptAssembler.ChunkResult] = []
+        // Mono fallback needs self folded into the timeline; dual sends it separately.
+        let monoTimeline = dual ? timeline
+            : timeline + Self.timeline(fromSelfSpans: selfSpans, selfName: selfName)

        for chunk in plan {
            try Task.checkCancellation()
            await progress?(chunk.index, plan.count)
-            let chunkURL = chunksDir.appendingPathComponent("chunk_\(String(format: "%03d", chunk.index)).wav")
-            try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
-            guard FileManager.default.fileExists(atPath: chunkURL.path) else { continue }  // empty slice → skip
+            let pad = String(format: "%03d", chunk.index)
+            let response: LabelMergeResponse

-            let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
-            let response = try await client.labelMerge(
-                audioURL: chunkURL, timeline: timelineData,
-                knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
+            if dual {
+                let micChunk = chunksDir.appendingPathComponent("chunk_\(pad)_mic.wav")
+                let sysChunk = chunksDir.appendingPathComponent("chunk_\(pad)_sys.wav")
+                try SessionPackager.sliceAudio(from: micURL, startSec: chunk.start, endSec: chunk.end, to: micChunk)
+                try SessionPackager.sliceAudio(from: systemURL, startSec: chunk.start, endSec: chunk.end, to: sysChunk)
+                guard fm.fileExists(atPath: micChunk.path), fm.fileExists(atPath: sysChunk.path) else { continue }
+                let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
+                let selfVadData = try SessionPackager.rebasedSelfVadData(selfSpans, start: chunk.start, end: chunk.end)
+                response = try await client.labelMergeDual(
+                    micURL: micChunk, systemURL: sysChunk, selfName: selfName, selfVad: selfVadData,
+                    timeline: timelineData, knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
+                try? fm.removeItem(at: micChunk); try? fm.removeItem(at: sysChunk)
+            } else {
+                let chunkURL = chunksDir.appendingPathComponent("chunk_\(pad).wav")
+                try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
+                guard fm.fileExists(atPath: chunkURL.path) else { continue }   // empty slice → skip
+                let timelineData = try SessionPackager.rebasedTimelineData(monoTimeline, start: chunk.start, end: chunk.end)
+                response = try await client.labelMerge(
+                    audioURL: chunkURL, timeline: timelineData,
+                    knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
+                try? fm.removeItem(at: chunkURL)
+            }

            for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
                known[name] = fp
            }
            voiceprints.update(with: response)
            results.append(.init(chunkStart: chunk.start, response: response))
-            try? FileManager.default.removeItem(at: chunkURL)
        }
        await progress?(plan.count, plan.count)