Client: dual-channel label-merge (mic_file + system_file)

The backend shipped dual-channel mode; wire the client to it. We already capture
mic (you) and system (others) separately, so send them as two files instead of the
mono mix — fixing the misattribution at the source.

- SparkControlClient: labelMergeDual(mic_file, system_file, self_name, self_vad);
  multipart generalized to N files; shared POST/retry/decode extracted.
- SessionPackager.rebasedSelfVadData: chunk-local [{start,end}] for self_vad;
  sliceAudio reused for both tracks.
- TranscriptPipeline.process: dual-channel chunking (slice mic+system, rebase
  timeline + self_vad per chunk) when system audio is healthy; mono mixed-file
  fallback (self folded into the timeline) otherwise.
- VisualCapture.finish: write the full visual_timeline.json (remote + self merged)
  but return REMOTE (vision) segments only — self travels via the mic channel.
- TranscriptAssembler: rank mic_channel highest (the user's own track wins).
- VoiceprintStore: store the clean mic_channel self voiceprint.
- SessionController: pass mic/system URLs + remote timeline + channel self-spans +
  self_name + systemHealthy; self_vad.json now reflects the channel-verified spans.

Validated END-TO-END against the live backend on the real misattributing session:
'Go Bitcoin' (remote) is now attributed to Unknown_0, NOT the user; the user's own
lines come back source=mic_channel; per-channel ASR recovered fuller remote text.
36/36 XCTest (4 new: self_vad rebase, mic_channel ranking + voiceprint storage).
This commit is contained in:
Grant Gilliam
2026-06-06 13:15:29 -05:00
parent 2191486506
commit 53d7fcdac0
9 changed files with 199 additions and 62 deletions
@@ -88,8 +88,7 @@ final class SparkControlClient {
deinit { urlSession.finishTasksAndInvalidate() }
/// One `label-merge` call. `timeline` is the flat `[{start,end,name,confidence}]`
/// JSON (chunk-local seconds). Retries on `503 + Retry-After`.
/// Mono `label-merge`: one mixed-mono file + timeline. Retries on `503`.
func labelMerge(audioURL: URL,
timeline: Data,
knownVoiceprints: [String: [Float]]?,
@@ -97,14 +96,46 @@ final class SparkControlClient {
minOverlap: Double? = nil,
voiceprintThreshold: Double? = nil,
maxRetries: Int = 3) async throws -> LabelMergeResponse {
guard let url = URL(string: baseURL + "/api/audio/label-merge") else {
throw SparkControlError.invalidHost
}
let fields = Self.commonFields(timeline: timeline, knownVoiceprints: knownVoiceprints,
transcribe: transcribe, minOverlap: minOverlap,
voiceprintThreshold: voiceprintThreshold)
let files = [(field: "file", filename: audioURL.lastPathComponent, data: try Data(contentsOf: audioURL))]
return try await perform(fields: fields, files: files, maxRetries: maxRetries)
}
/// Sends a dual-channel `label-merge` request: the mic (local user) track and the
/// system (remote) track are uploaded as two separate files.
///
/// The mic channel is attributed as `selfName`, while `timeline` names only the
/// remote/system speakers.
///
/// - Parameters:
///   - micURL: Audio file uploaded under the `mic_file` form field.
///   - systemURL: Audio file uploaded under the `system_file` form field.
///   - selfName: Sent as the `self_name` form field.
///   - selfVad: Optional chunk-local windows where the mic is genuinely the user
///     (active and louder than system). Sent as `self_vad` only when present and
///     decodable as UTF-8; otherwise the field is omitted.
///   - timeline, knownVoiceprints, transcribe, minOverlap, voiceprintThreshold:
///     Forwarded unchanged to `commonFields` to build the shared form fields.
///   - maxRetries: Forwarded unchanged to `perform`.
/// - Returns: The response produced by `perform`.
/// - Throws: Any error from reading either audio file, or from `perform`.
func labelMergeDual(micURL: URL,
                    systemURL: URL,
                    selfName: String,
                    selfVad: Data?,
                    timeline: Data,
                    knownVoiceprints: [String: [Float]]?,
                    transcribe: Bool,
                    minOverlap: Double? = nil,
                    voiceprintThreshold: Double? = nil,
                    maxRetries: Int = 3) async throws -> LabelMergeResponse {
    // Start from the fields shared with the mono call, then add the dual-only ones.
    var formFields = Self.commonFields(timeline: timeline,
                                       knownVoiceprints: knownVoiceprints,
                                       transcribe: transcribe,
                                       minOverlap: minOverlap,
                                       voiceprintThreshold: voiceprintThreshold)
    formFields["self_name"] = selfName
    if let vadJSON = selfVad.flatMap({ String(data: $0, encoding: .utf8) }) {
        formFields["self_vad"] = vadJSON
    }

    // Read the mic track first, then the system track.
    let micAudio = try Data(contentsOf: micURL)
    let systemAudio = try Data(contentsOf: systemURL)
    let uploads: [(field: String, filename: String, data: Data)] = [
        (field: "mic_file", filename: micURL.lastPathComponent, data: micAudio),
        (field: "system_file", filename: systemURL.lastPathComponent, data: systemAudio),
    ]

    return try await perform(fields: formFields, files: uploads, maxRetries: maxRetries)
}
// MARK: - Transport
private static func commonFields(timeline: Data, knownVoiceprints: [String: [Float]]?,
transcribe: Bool, minOverlap: Double?,
voiceprintThreshold: Double?) -> [String: String] {
var fields: [String: String] = ["transcribe": transcribe ? "true" : "false"]
if let timelineString = String(data: timeline, encoding: .utf8) {
fields["timeline"] = timelineString
}
if let timelineString = String(data: timeline, encoding: .utf8) { fields["timeline"] = timelineString }
if let known = knownVoiceprints, !known.isEmpty,
let data = try? JSONSerialization.data(withJSONObject: known.mapValues { $0.map { Double($0) } }),
let str = String(data: data, encoding: .utf8) {
@@ -112,11 +143,17 @@ final class SparkControlClient {
}
if let minOverlap { fields["min_overlap"] = String(minOverlap) }
if let voiceprintThreshold { fields["voiceprint_threshold"] = String(voiceprintThreshold) }
return fields
}
let audio = try Data(contentsOf: audioURL)
// Body doesn't change between retries; build it once.
let (body, contentType) = Self.multipart(fields: fields, fileField: "file",
filename: audioURL.lastPathComponent, fileData: audio)
/// Shared POST + retry-on-503 + decode. Body is built once (constant across retries).
private func perform(fields: [String: String],
files: [(field: String, filename: String, data: Data)],
maxRetries: Int) async throws -> LabelMergeResponse {
guard let url = URL(string: baseURL + "/api/audio/label-merge") else {
throw SparkControlError.invalidHost
}
let (body, contentType) = Self.multipart(fields: fields, files: files)
var attempt = 0
while true {
@@ -158,8 +195,8 @@ final class SparkControlClient {
return String(data: data, encoding: .utf8) ?? "unknown error"
}
private static func multipart(fields: [String: String], fileField: String,
filename: String, fileData: Data) -> (Data, String) {
private static func multipart(fields: [String: String],
files: [(field: String, filename: String, data: Data)]) -> (Data, String) {
let boundary = "Boundary-\(UUID().uuidString)"
var body = Data()
func append(_ s: String) { body.append(s.data(using: .utf8)!) }
@@ -169,11 +206,14 @@ final class SparkControlClient {
append("Content-Disposition: form-data; name=\"\(name)\"\r\n\r\n")
append("\(value)\r\n")
}
append("--\(boundary)\r\n")
append("Content-Disposition: form-data; name=\"\(fileField)\"; filename=\"\(filename)\"\r\n")
append("Content-Type: audio/wav\r\n\r\n")
body.append(fileData)
append("\r\n--\(boundary)--\r\n")
for file in files {
append("--\(boundary)\r\n")
append("Content-Disposition: form-data; name=\"\(file.field)\"; filename=\"\(file.filename)\"\r\n")
append("Content-Type: audio/wav\r\n\r\n")
body.append(file.data)
append("\r\n")
}
append("--\(boundary)--\r\n")
return (body, "multipart/form-data; boundary=\(boundary)")
}
}