Client: dual-channel label-merge (mic_file + system_file)
The backend shipped dual-channel mode; wire the client to it. We already capture
mic (you) and system (others) separately, so send them as two files instead of the
mono mix — fixing the misattribution at the source.
- SparkControlClient: labelMergeDual(mic_file, system_file, self_name, self_vad);
multipart generalized to N files; shared POST/retry/decode extracted.
- SessionPackager.rebasedSelfVadData: chunk-local [{start,end}] for self_vad;
sliceAudio reused for both tracks.
- TranscriptPipeline.process: dual-channel chunking (slice mic+system, rebase
timeline + self_vad per chunk) when system audio is healthy; mono mixed-file
fallback (self folded into the timeline) otherwise.
- VisualCapture.finish: write the full visual_timeline.json (remote + self merged)
but return REMOTE (vision) segments only — self travels via the mic channel.
- TranscriptAssembler: rank mic_channel highest (the user's own track wins).
- VoiceprintStore: store the clean mic_channel self voiceprint.
- SessionController: pass mic/system URLs + remote timeline + channel self-spans +
self_name + systemHealthy; self_vad.json now reflects the channel-verified spans.
Validated END-TO-END against the live backend on the real misattributing session:
'Go Bitcoin' (remote) is now attributed to Unknown_0, NOT the user; the user's own
lines come back source=mic_channel; per-channel ASR recovered fuller remote text.
All 36 XCTest cases pass (4 new, covering self_vad rebase, mic_channel ranking, and voiceprint storage).
This commit is contained in:
@@ -88,8 +88,7 @@ final class SparkControlClient {
|
||||
|
||||
deinit { urlSession.finishTasksAndInvalidate() }
|
||||
|
||||
/// One `label-merge` call. `timeline` is the flat `[{start,end,name,confidence}]`
|
||||
/// JSON (chunk-local seconds). Retries on `503 + Retry-After`.
|
||||
/// Mono `label-merge`: one mixed-mono file + timeline. Retries on `503`.
|
||||
func labelMerge(audioURL: URL,
|
||||
timeline: Data,
|
||||
knownVoiceprints: [String: [Float]]?,
|
||||
@@ -97,14 +96,46 @@ final class SparkControlClient {
|
||||
minOverlap: Double? = nil,
|
||||
voiceprintThreshold: Double? = nil,
|
||||
maxRetries: Int = 3) async throws -> LabelMergeResponse {
|
||||
guard let url = URL(string: baseURL + "/api/audio/label-merge") else {
|
||||
throw SparkControlError.invalidHost
|
||||
}
|
||||
let fields = Self.commonFields(timeline: timeline, knownVoiceprints: knownVoiceprints,
|
||||
transcribe: transcribe, minOverlap: minOverlap,
|
||||
voiceprintThreshold: voiceprintThreshold)
|
||||
let files = [(field: "file", filename: audioURL.lastPathComponent, data: try Data(contentsOf: audioURL))]
|
||||
return try await perform(fields: fields, files: files, maxRetries: maxRetries)
|
||||
}
|
||||
|
||||
/// Dual-channel `label-merge`: uploads the mic track and the system track as
/// two separate multipart files (`mic_file`, `system_file`) in one request.
///
/// The mic channel is attributed as `self_name`, so `timeline` should name only
/// the remote/system speakers.
///
/// - Parameters:
///   - micURL: File holding the local user's microphone audio.
///   - systemURL: File holding the system (remote participants) audio.
///   - selfName: Value sent as the `self_name` form field.
///   - selfVad: Optional chunk-local windows where the mic is genuinely the
///     user; sent as the `self_vad` form field. Omitted when `nil` or when the
///     bytes are not valid UTF-8.
///   - timeline: Timeline JSON, forwarded through `commonFields`.
///   - knownVoiceprints: Optional voiceprints, forwarded through `commonFields`.
///   - transcribe: Forwarded through `commonFields`.
///   - minOverlap: Optional override, forwarded through `commonFields`.
///   - voiceprintThreshold: Optional override, forwarded through `commonFields`.
///   - maxRetries: Retry budget handed to `perform`.
/// - Returns: The decoded response produced by `perform`.
/// - Throws: Any error raised while reading either audio file, or by `perform`.
func labelMergeDual(micURL: URL,
                    systemURL: URL,
                    selfName: String,
                    selfVad: Data?,
                    timeline: Data,
                    knownVoiceprints: [String: [Float]]?,
                    transcribe: Bool,
                    minOverlap: Double? = nil,
                    voiceprintThreshold: Double? = nil,
                    maxRetries: Int = 3) async throws -> LabelMergeResponse {
    // Start from the fields shared with the mono call, then add the
    // dual-channel-only ones.
    var formFields = Self.commonFields(timeline: timeline,
                                       knownVoiceprints: knownVoiceprints,
                                       transcribe: transcribe,
                                       minOverlap: minOverlap,
                                       voiceprintThreshold: voiceprintThreshold)
    formFields["self_name"] = selfName

    // `self_vad` is only attached when it is present and decodes as UTF-8.
    let vadJSON = selfVad.flatMap { String(data: $0, encoding: .utf8) }
    if let vadJSON {
        formFields["self_vad"] = vadJSON
    }

    // Read the mic track before the system track; a read failure on either
    // one aborts the call before any request is sent.
    let micData = try Data(contentsOf: micURL)
    let systemData = try Data(contentsOf: systemURL)
    let uploads: [(field: String, filename: String, data: Data)] = [
        (field: "mic_file", filename: micURL.lastPathComponent, data: micData),
        (field: "system_file", filename: systemURL.lastPathComponent, data: systemData),
    ]

    return try await perform(fields: formFields, files: uploads, maxRetries: maxRetries)
}
|
||||
|
||||
// MARK: - Transport
|
||||
|
||||
private static func commonFields(timeline: Data, knownVoiceprints: [String: [Float]]?,
|
||||
transcribe: Bool, minOverlap: Double?,
|
||||
voiceprintThreshold: Double?) -> [String: String] {
|
||||
var fields: [String: String] = ["transcribe": transcribe ? "true" : "false"]
|
||||
if let timelineString = String(data: timeline, encoding: .utf8) {
|
||||
fields["timeline"] = timelineString
|
||||
}
|
||||
if let timelineString = String(data: timeline, encoding: .utf8) { fields["timeline"] = timelineString }
|
||||
if let known = knownVoiceprints, !known.isEmpty,
|
||||
let data = try? JSONSerialization.data(withJSONObject: known.mapValues { $0.map { Double($0) } }),
|
||||
let str = String(data: data, encoding: .utf8) {
|
||||
@@ -112,11 +143,17 @@ final class SparkControlClient {
|
||||
}
|
||||
if let minOverlap { fields["min_overlap"] = String(minOverlap) }
|
||||
if let voiceprintThreshold { fields["voiceprint_threshold"] = String(voiceprintThreshold) }
|
||||
return fields
|
||||
}
|
||||
|
||||
let audio = try Data(contentsOf: audioURL)
|
||||
// Body doesn't change between retries — build it once.
|
||||
let (body, contentType) = Self.multipart(fields: fields, fileField: "file",
|
||||
filename: audioURL.lastPathComponent, fileData: audio)
|
||||
/// Shared POST + retry-on-503 + decode. Body is built once (constant across retries).
|
||||
private func perform(fields: [String: String],
|
||||
files: [(field: String, filename: String, data: Data)],
|
||||
maxRetries: Int) async throws -> LabelMergeResponse {
|
||||
guard let url = URL(string: baseURL + "/api/audio/label-merge") else {
|
||||
throw SparkControlError.invalidHost
|
||||
}
|
||||
let (body, contentType) = Self.multipart(fields: fields, files: files)
|
||||
|
||||
var attempt = 0
|
||||
while true {
|
||||
@@ -158,8 +195,8 @@ final class SparkControlClient {
|
||||
return String(data: data, encoding: .utf8) ?? "unknown error"
|
||||
}
|
||||
|
||||
private static func multipart(fields: [String: String], fileField: String,
|
||||
filename: String, fileData: Data) -> (Data, String) {
|
||||
private static func multipart(fields: [String: String],
|
||||
files: [(field: String, filename: String, data: Data)]) -> (Data, String) {
|
||||
let boundary = "Boundary-\(UUID().uuidString)"
|
||||
var body = Data()
|
||||
func append(_ s: String) { body.append(s.data(using: .utf8)!) }
|
||||
@@ -169,11 +206,14 @@ final class SparkControlClient {
|
||||
append("Content-Disposition: form-data; name=\"\(name)\"\r\n\r\n")
|
||||
append("\(value)\r\n")
|
||||
}
|
||||
append("--\(boundary)\r\n")
|
||||
append("Content-Disposition: form-data; name=\"\(fileField)\"; filename=\"\(filename)\"\r\n")
|
||||
append("Content-Type: audio/wav\r\n\r\n")
|
||||
body.append(fileData)
|
||||
append("\r\n--\(boundary)--\r\n")
|
||||
for file in files {
|
||||
append("--\(boundary)\r\n")
|
||||
append("Content-Disposition: form-data; name=\"\(file.field)\"; filename=\"\(file.filename)\"\r\n")
|
||||
append("Content-Type: audio/wav\r\n\r\n")
|
||||
body.append(file.data)
|
||||
append("\r\n")
|
||||
}
|
||||
append("--\(boundary)--\r\n")
|
||||
return (body, "multipart/form-data; boundary=\(boundary)")
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user