Client: dual-channel label-merge (mic_file + system_file)
The backend shipped dual-channel mode; wire the client to it. We already capture
mic (you) and system (others) separately, so send them as two files instead of the
mono mix — fixing the misattribution at the source.
- SparkControlClient: labelMergeDual(mic_file, system_file, self_name, self_vad);
multipart generalized to N files; shared POST/retry/decode extracted.
- SessionPackager.rebasedSelfVadData: chunk-local [{start,end}] for self_vad;
sliceAudio reused for both tracks.
- TranscriptPipeline.process: dual-channel chunking (slice mic+system, rebase
timeline + self_vad per chunk) when system audio is healthy; mono mixed-file
fallback (self folded into the timeline) otherwise.
- VisualCapture.finish: write the full visual_timeline.json (remote + self merged)
but return REMOTE (vision) segments only — self travels via the mic channel.
- TranscriptAssembler: rank mic_channel highest (the user's own track wins).
- VoiceprintStore: store the clean mic_channel self voiceprint.
- SessionController: pass mic/system URLs + remote timeline + channel self-spans +
self_name + systemHealthy; self_vad.json now reflects the channel-verified spans.
Validated END-TO-END against the live backend on the real misattributing session:
'Go Bitcoin' (remote) is now attributed to Unknown_0, NOT the user; the user's own
lines come back source=mic_channel; per-channel ASR recovered fuller remote text.
36/36 XCTest (4 new: self_vad rebase, mic_channel ranking + voiceprint storage).
This commit is contained in:
@@ -88,8 +88,7 @@ final class SparkControlClient {
|
|||||||
|
|
||||||
deinit { urlSession.finishTasksAndInvalidate() }
|
deinit { urlSession.finishTasksAndInvalidate() }
|
||||||
|
|
||||||
/// One `label-merge` call. `timeline` is the flat `[{start,end,name,confidence}]`
|
/// Mono `label-merge`: one mixed-mono file + timeline. Retries on `503`.
|
||||||
/// JSON (chunk-local seconds). Retries on `503 + Retry-After`.
|
|
||||||
func labelMerge(audioURL: URL,
|
func labelMerge(audioURL: URL,
|
||||||
timeline: Data,
|
timeline: Data,
|
||||||
knownVoiceprints: [String: [Float]]?,
|
knownVoiceprints: [String: [Float]]?,
|
||||||
@@ -97,14 +96,46 @@ final class SparkControlClient {
|
|||||||
minOverlap: Double? = nil,
|
minOverlap: Double? = nil,
|
||||||
voiceprintThreshold: Double? = nil,
|
voiceprintThreshold: Double? = nil,
|
||||||
maxRetries: Int = 3) async throws -> LabelMergeResponse {
|
maxRetries: Int = 3) async throws -> LabelMergeResponse {
|
||||||
guard let url = URL(string: baseURL + "/api/audio/label-merge") else {
|
let fields = Self.commonFields(timeline: timeline, knownVoiceprints: knownVoiceprints,
|
||||||
throw SparkControlError.invalidHost
|
transcribe: transcribe, minOverlap: minOverlap,
|
||||||
|
voiceprintThreshold: voiceprintThreshold)
|
||||||
|
let files = [(field: "file", filename: audioURL.lastPathComponent, data: try Data(contentsOf: audioURL))]
|
||||||
|
return try await perform(fields: fields, files: files, maxRetries: maxRetries)
|
||||||
}
|
}
|
||||||
|
|
||||||
var fields: [String: String] = ["transcribe": transcribe ? "true" : "false"]
|
/// Dual-channel `label-merge`: separate mic (local user) + system (remote)
|
||||||
if let timelineString = String(data: timeline, encoding: .utf8) {
|
/// tracks. The mic channel is attributed as `self_name`; `timeline` names only
|
||||||
fields["timeline"] = timelineString
|
/// the remote/system speakers; `selfVad` (optional) are the chunk-local windows
|
||||||
|
/// where the mic is genuinely the user (active and louder than system).
|
||||||
|
func labelMergeDual(micURL: URL,
|
||||||
|
systemURL: URL,
|
||||||
|
selfName: String,
|
||||||
|
selfVad: Data?,
|
||||||
|
timeline: Data,
|
||||||
|
knownVoiceprints: [String: [Float]]?,
|
||||||
|
transcribe: Bool,
|
||||||
|
minOverlap: Double? = nil,
|
||||||
|
voiceprintThreshold: Double? = nil,
|
||||||
|
maxRetries: Int = 3) async throws -> LabelMergeResponse {
|
||||||
|
var fields = Self.commonFields(timeline: timeline, knownVoiceprints: knownVoiceprints,
|
||||||
|
transcribe: transcribe, minOverlap: minOverlap,
|
||||||
|
voiceprintThreshold: voiceprintThreshold)
|
||||||
|
fields["self_name"] = selfName
|
||||||
|
if let selfVad, let str = String(data: selfVad, encoding: .utf8) { fields["self_vad"] = str }
|
||||||
|
let files = [
|
||||||
|
(field: "mic_file", filename: micURL.lastPathComponent, data: try Data(contentsOf: micURL)),
|
||||||
|
(field: "system_file", filename: systemURL.lastPathComponent, data: try Data(contentsOf: systemURL)),
|
||||||
|
]
|
||||||
|
return try await perform(fields: fields, files: files, maxRetries: maxRetries)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MARK: - Transport
|
||||||
|
|
||||||
|
private static func commonFields(timeline: Data, knownVoiceprints: [String: [Float]]?,
|
||||||
|
transcribe: Bool, minOverlap: Double?,
|
||||||
|
voiceprintThreshold: Double?) -> [String: String] {
|
||||||
|
var fields: [String: String] = ["transcribe": transcribe ? "true" : "false"]
|
||||||
|
if let timelineString = String(data: timeline, encoding: .utf8) { fields["timeline"] = timelineString }
|
||||||
if let known = knownVoiceprints, !known.isEmpty,
|
if let known = knownVoiceprints, !known.isEmpty,
|
||||||
let data = try? JSONSerialization.data(withJSONObject: known.mapValues { $0.map { Double($0) } }),
|
let data = try? JSONSerialization.data(withJSONObject: known.mapValues { $0.map { Double($0) } }),
|
||||||
let str = String(data: data, encoding: .utf8) {
|
let str = String(data: data, encoding: .utf8) {
|
||||||
@@ -112,11 +143,17 @@ final class SparkControlClient {
|
|||||||
}
|
}
|
||||||
if let minOverlap { fields["min_overlap"] = String(minOverlap) }
|
if let minOverlap { fields["min_overlap"] = String(minOverlap) }
|
||||||
if let voiceprintThreshold { fields["voiceprint_threshold"] = String(voiceprintThreshold) }
|
if let voiceprintThreshold { fields["voiceprint_threshold"] = String(voiceprintThreshold) }
|
||||||
|
return fields
|
||||||
|
}
|
||||||
|
|
||||||
let audio = try Data(contentsOf: audioURL)
|
/// Shared POST + retry-on-503 + decode. Body is built once (constant across retries).
|
||||||
// Body doesn't change between retries — build it once.
|
private func perform(fields: [String: String],
|
||||||
let (body, contentType) = Self.multipart(fields: fields, fileField: "file",
|
files: [(field: String, filename: String, data: Data)],
|
||||||
filename: audioURL.lastPathComponent, fileData: audio)
|
maxRetries: Int) async throws -> LabelMergeResponse {
|
||||||
|
guard let url = URL(string: baseURL + "/api/audio/label-merge") else {
|
||||||
|
throw SparkControlError.invalidHost
|
||||||
|
}
|
||||||
|
let (body, contentType) = Self.multipart(fields: fields, files: files)
|
||||||
|
|
||||||
var attempt = 0
|
var attempt = 0
|
||||||
while true {
|
while true {
|
||||||
@@ -158,8 +195,8 @@ final class SparkControlClient {
|
|||||||
return String(data: data, encoding: .utf8) ?? "unknown error"
|
return String(data: data, encoding: .utf8) ?? "unknown error"
|
||||||
}
|
}
|
||||||
|
|
||||||
private static func multipart(fields: [String: String], fileField: String,
|
private static func multipart(fields: [String: String],
|
||||||
filename: String, fileData: Data) -> (Data, String) {
|
files: [(field: String, filename: String, data: Data)]) -> (Data, String) {
|
||||||
let boundary = "Boundary-\(UUID().uuidString)"
|
let boundary = "Boundary-\(UUID().uuidString)"
|
||||||
var body = Data()
|
var body = Data()
|
||||||
func append(_ s: String) { body.append(s.data(using: .utf8)!) }
|
func append(_ s: String) { body.append(s.data(using: .utf8)!) }
|
||||||
@@ -169,11 +206,14 @@ final class SparkControlClient {
|
|||||||
append("Content-Disposition: form-data; name=\"\(name)\"\r\n\r\n")
|
append("Content-Disposition: form-data; name=\"\(name)\"\r\n\r\n")
|
||||||
append("\(value)\r\n")
|
append("\(value)\r\n")
|
||||||
}
|
}
|
||||||
|
for file in files {
|
||||||
append("--\(boundary)\r\n")
|
append("--\(boundary)\r\n")
|
||||||
append("Content-Disposition: form-data; name=\"\(fileField)\"; filename=\"\(filename)\"\r\n")
|
append("Content-Disposition: form-data; name=\"\(file.field)\"; filename=\"\(file.filename)\"\r\n")
|
||||||
append("Content-Type: audio/wav\r\n\r\n")
|
append("Content-Type: audio/wav\r\n\r\n")
|
||||||
body.append(fileData)
|
body.append(file.data)
|
||||||
append("\r\n--\(boundary)--\r\n")
|
append("\r\n")
|
||||||
|
}
|
||||||
|
append("--\(boundary)--\r\n")
|
||||||
return (body, "multipart/form-data; boundary=\(boundary)")
|
return (body, "multipart/form-data; boundary=\(boundary)")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -48,6 +48,7 @@ final class VoiceprintStore {
|
|||||||
guard !Self.isUnknown(sp.name) else { continue }
|
guard !Self.isUnknown(sp.name) else { continue }
|
||||||
let acceptable: Bool
|
let acceptable: Bool
|
||||||
switch sp.source {
|
switch sp.source {
|
||||||
|
case "mic_channel": acceptable = true // the user's own clean mic voiceprint
|
||||||
case "visual": acceptable = (sp.overlapConfidence ?? 0) >= minOverlapToStore
|
case "visual": acceptable = (sp.overlapConfidence ?? 0) >= minOverlapToStore
|
||||||
case "voiceprint": acceptable = true // already matched a known print
|
case "voiceprint": acceptable = true // already matched a known print
|
||||||
default: acceptable = false // unmatched
|
default: acceptable = false // unmatched
|
||||||
|
|||||||
@@ -65,8 +65,13 @@ final class SessionController: ObservableObject {
|
|||||||
let folder: URL
|
let folder: URL
|
||||||
let sessionId: String
|
let sessionId: String
|
||||||
let app: String
|
let app: String
|
||||||
|
let micURL: URL
|
||||||
|
let systemURL: URL
|
||||||
let mixedURL: URL
|
let mixedURL: URL
|
||||||
let timeline: [VisualTimeline.Segment]
|
let timeline: [VisualTimeline.Segment] // remote visual names; self handled via the mic channel
|
||||||
|
let selfSpans: [VADSpan]
|
||||||
|
let selfName: String
|
||||||
|
let systemHealthy: Bool
|
||||||
}
|
}
|
||||||
private var lastProcess: ProcessInputs?
|
private var lastProcess: ProcessInputs?
|
||||||
private var processTask: Task<Void, Never>?
|
private var processTask: Task<Void, Never>?
|
||||||
@@ -275,19 +280,20 @@ final class SessionController: ObservableObject {
|
|||||||
/// ran, otherwise the mic-VAD self spans alone. `visualRan` reports whether the
|
/// ran, otherwise the mic-VAD self spans alone. `visualRan` reports whether the
|
||||||
/// visual pipeline actually attached (for the after-session indicator).
|
/// visual pipeline actually attached (for the after-session indicator).
|
||||||
private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?)
|
private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?)
|
||||||
async -> (timeline: [VisualTimeline.Segment], visualRan: Bool) {
|
async -> (timeline: [VisualTimeline.Segment], selfSpans: [VADSpan], visualRan: Bool) {
|
||||||
let selfName = settings.selfName
|
let selfName = settings.selfName
|
||||||
let selfSpans = await channelSelfSpans(result: result, folder: folder)
|
let selfSpans = await channelSelfSpans(result: result, folder: folder)
|
||||||
if let vc = visualCapture, let folder {
|
if let vc = visualCapture, let folder {
|
||||||
visualCapture = nil
|
visualCapture = nil
|
||||||
let timeline = await vc.finish(
|
// Remote (vision) segments only; self travels separately as the mic channel.
|
||||||
|
let remote = await vc.finish(
|
||||||
selfSpans: selfSpans, selfName: selfName,
|
selfSpans: selfSpans, selfName: selfName,
|
||||||
sessionId: folder.lastPathComponent, t0Unix: result.t0Unix,
|
sessionId: folder.lastPathComponent, t0Unix: result.t0Unix,
|
||||||
durationSec: result.duration, folder: folder)
|
durationSec: result.duration, folder: folder)
|
||||||
return (timeline, true)
|
return (remote, selfSpans, true)
|
||||||
}
|
}
|
||||||
if let vc = visualCapture { await vc.cancel(); visualCapture = nil }
|
if let vc = visualCapture { await vc.cancel(); visualCapture = nil }
|
||||||
return (TranscriptPipeline.timeline(fromSelfSpans: selfSpans, selfName: selfName), false)
|
return ([], selfSpans, false)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Self spans for the backend timeline, identified by CHANNEL: the mic track is
|
/// Self spans for the backend timeline, identified by CHANNEL: the mic track is
|
||||||
@@ -312,26 +318,29 @@ final class SessionController: ObservableObject {
|
|||||||
lifecycleTask = Task {
|
lifecycleTask = Task {
|
||||||
let result = await recorder.stop()
|
let result = await recorder.stop()
|
||||||
let visual = await self.stopVisualAndTimeline(result, folder: folder)
|
let visual = await self.stopVisualAndTimeline(result, folder: folder)
|
||||||
self.finish(result, timeline: visual.timeline, visualRan: visual.visualRan)
|
self.finish(result, timeline: visual.timeline, selfSpans: visual.selfSpans, visualRan: visual.visualRan)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment], visualRan: Bool) {
|
private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment],
|
||||||
|
selfSpans: [VADSpan], visualRan: Bool) {
|
||||||
recorder = nil
|
recorder = nil
|
||||||
micLevel = 0
|
micLevel = 0
|
||||||
systemLevel = 0
|
systemLevel = 0
|
||||||
warning = result.systemNote.map { "System audio stopped early: \($0)" }
|
warning = result.systemNote.map { "System audio stopped early: \($0)" }
|
||||||
transcriptStatus = .idle
|
transcriptStatus = .idle
|
||||||
if let folder = currentFolder {
|
if let folder = currentFolder {
|
||||||
writeSelfSpans(result, to: folder)
|
writeSelfSpans(spans: selfSpans, result: result, to: folder)
|
||||||
let visualCount = visualRan ? timeline.filter { $0.source == "vision" }.count : nil
|
let visualCount = visualRan ? timeline.count : nil // `timeline` is the remote vision segments
|
||||||
lastSession = SessionInfo(
|
lastSession = SessionInfo(
|
||||||
folder: folder, mixedURL: result.mixedURL,
|
folder: folder, mixedURL: result.mixedURL,
|
||||||
duration: result.duration, selfSpanCount: result.selfSpans.count,
|
duration: result.duration, selfSpanCount: selfSpans.count,
|
||||||
visualSegmentCount: visualCount)
|
visualSegmentCount: visualCount)
|
||||||
lastProcess = ProcessInputs(
|
lastProcess = ProcessInputs(
|
||||||
folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
|
folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
|
||||||
mixedURL: result.mixedURL, timeline: timeline)
|
micURL: result.micURL, systemURL: result.systemURL, mixedURL: result.mixedURL,
|
||||||
|
timeline: timeline, selfSpans: selfSpans, selfName: settings.selfName,
|
||||||
|
systemHealthy: result.systemNote == nil)
|
||||||
}
|
}
|
||||||
let autoSend = settings.autoSendOnStop
|
let autoSend = settings.autoSendOnStop
|
||||||
currentFolder = nil
|
currentFolder = nil
|
||||||
@@ -360,11 +369,12 @@ final class SessionController: ObservableObject {
|
|||||||
baseURL: settings.backendBaseURL,
|
baseURL: settings.backendBaseURL,
|
||||||
skipTLS: settings.skipTLSVerification,
|
skipTLS: settings.skipTLSVerification,
|
||||||
voiceprints: voiceprints)
|
voiceprints: voiceprints)
|
||||||
let timeline = inputs.timeline
|
|
||||||
do {
|
do {
|
||||||
let speakers = try await pipeline.process(
|
let speakers = try await pipeline.process(
|
||||||
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
|
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
|
||||||
mixedURL: inputs.mixedURL, timeline: timeline,
|
micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL,
|
||||||
|
timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName,
|
||||||
|
systemHealthy: inputs.systemHealthy,
|
||||||
progress: { done, total in
|
progress: { done, total in
|
||||||
await MainActor.run { self.transcriptStatus = .processing(done, total) }
|
await MainActor.run { self.transcriptStatus = .processing(done, total) }
|
||||||
})
|
})
|
||||||
@@ -411,7 +421,7 @@ final class SessionController: ObservableObject {
|
|||||||
let folder = currentFolder
|
let folder = currentFolder
|
||||||
let result = await recorder.stop()
|
let result = await recorder.stop()
|
||||||
let visual = await stopVisualAndTimeline(result, folder: folder)
|
let visual = await stopVisualAndTimeline(result, folder: folder)
|
||||||
finish(result, timeline: visual.timeline, visualRan: visual.visualRan)
|
finish(result, timeline: visual.timeline, selfSpans: visual.selfSpans, visualRan: visual.visualRan)
|
||||||
} else if lifecycleGeneration == gen {
|
} else if lifecycleGeneration == gen {
|
||||||
break // settled: no new transition was spawned
|
break // settled: no new transition was spawned
|
||||||
}
|
}
|
||||||
@@ -461,15 +471,15 @@ final class SessionController: ObservableObject {
|
|||||||
return f.string(from: Date())
|
return f.string(from: Date())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Phase-1 preview of the mic-VAD "self" spans (the eventual
|
/// Debug artifact: the channel-verified "self" spans actually sent to the backend
|
||||||
/// `visual_timeline.json` `mic_vad` segments). Lets us eyeball VAD quality.
|
/// as `self_vad` (mic active AND louder than system). Lets us eyeball self detection.
|
||||||
private func writeSelfSpans(_ result: RecordingResult, to folder: URL) {
|
private func writeSelfSpans(spans: [VADSpan], result: RecordingResult, to folder: URL) {
|
||||||
let segments = result.selfSpans.map { span -> [String: Any] in
|
let segments = spans.map { span -> [String: Any] in
|
||||||
["start": span.start, "end": span.end, "name": "self",
|
["start": span.start, "end": span.end, "name": "self",
|
||||||
"confidence": span.confidence, "source": "mic_vad"]
|
"confidence": span.confidence, "source": "mic_channel"]
|
||||||
}
|
}
|
||||||
let object: [String: Any] = [
|
let object: [String: Any] = [
|
||||||
"note": "Phase 1 mic-VAD self spans (preview of visual_timeline segments)",
|
"note": "channel-verified self spans (mic active and louder than system) — the self_vad sent to label-merge",
|
||||||
"t0_unix": result.t0Unix,
|
"t0_unix": result.t0Unix,
|
||||||
"duration_sec": result.duration,
|
"duration_sec": result.duration,
|
||||||
"self_spans": segments,
|
"self_spans": segments,
|
||||||
|
|||||||
@@ -46,6 +46,18 @@ enum SessionPackager {
|
|||||||
return try JSONSerialization.data(withJSONObject: flat, options: [])
|
return try JSONSerialization.data(withJSONObject: flat, options: [])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Clip self-VAD spans to `[start, end)` and rebase to chunk-local seconds, as
|
||||||
|
/// the `self_vad` array `[{start,end}]` for dual-channel `label-merge`.
|
||||||
|
static func rebasedSelfVadData(_ spans: [VADSpan], start: Double, end: Double) throws -> Data {
|
||||||
|
let flat: [[String: Any]] = spans.compactMap { span in
|
||||||
|
let s = max(span.start, start)
|
||||||
|
let e = min(span.end, end)
|
||||||
|
guard e > s else { return nil }
|
||||||
|
return ["start": s - start, "end": e - start]
|
||||||
|
}
|
||||||
|
return try JSONSerialization.data(withJSONObject: flat, options: [])
|
||||||
|
}
|
||||||
|
|
||||||
/// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
|
/// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
|
||||||
static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
|
static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
|
||||||
let input = try AVAudioFile(forReading: source)
|
let input = try AVAudioFile(forReading: source)
|
||||||
|
|||||||
@@ -15,8 +15,10 @@ enum TranscriptAssembler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Source ranking when the same name appears across chunks with different sources.
|
/// Source ranking when the same name appears across chunks with different sources.
|
||||||
|
/// `mic_channel` (the local user's own microphone) is the most authoritative.
|
||||||
private static func rank(_ source: String) -> Int {
|
private static func rank(_ source: String) -> Int {
|
||||||
switch source {
|
switch source {
|
||||||
|
case "mic_channel": return 4
|
||||||
case "visual": return 3
|
case "visual": return 3
|
||||||
case "voiceprint": return 2
|
case "voiceprint": return 2
|
||||||
default: return 1 // unmatched
|
default: return 1 // unmatched
|
||||||
|
|||||||
@@ -12,16 +12,30 @@ final class TranscriptPipeline {
|
|||||||
self.voiceprints = voiceprints
|
self.voiceprints = voiceprints
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Process `mixedURL` against `timeline` (visual + self spans). Writes
|
/// Process a finished session. **Dual-channel** when the system track is healthy
|
||||||
/// `speakers.json` into `sessionFolder` and returns it. `progress(done,total)`
|
/// and present: mic (the local user) + system (remote) go as separate files, the
|
||||||
|
/// `timeline` names only the remote speakers, and `selfSpans` become `self_vad`.
|
||||||
|
/// Otherwise falls back to the **mono** mixed file with self folded into the
|
||||||
|
/// timeline. Writes `speakers.json` into `sessionFolder`. `progress(done,total)`
|
||||||
/// is called per chunk.
|
/// is called per chunk.
|
||||||
func process(sessionFolder: URL,
|
func process(sessionFolder: URL,
|
||||||
sessionId: String,
|
sessionId: String,
|
||||||
app: String,
|
app: String,
|
||||||
|
micURL: URL,
|
||||||
|
systemURL: URL,
|
||||||
mixedURL: URL,
|
mixedURL: URL,
|
||||||
timeline: [VisualTimeline.Segment],
|
timeline: [VisualTimeline.Segment],
|
||||||
|
selfSpans: [VADSpan],
|
||||||
|
selfName: String,
|
||||||
|
systemHealthy: Bool,
|
||||||
progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
|
progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
|
||||||
let duration = SessionPackager.duration(of: mixedURL)
|
let fm = FileManager.default
|
||||||
|
let dual = systemHealthy
|
||||||
|
&& fm.fileExists(atPath: micURL.path) && fm.fileExists(atPath: systemURL.path)
|
||||||
|
&& SessionPackager.duration(of: systemURL) > 0
|
||||||
|
let duration = dual
|
||||||
|
? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL))
|
||||||
|
: SessionPackager.duration(of: mixedURL)
|
||||||
let plan = SessionPackager.planChunks(durationSec: duration)
|
let plan = SessionPackager.planChunks(durationSec: duration)
|
||||||
|
|
||||||
// Zero-duration / empty session → a valid empty speakers.json, no backend call.
|
// Zero-duration / empty session → a valid empty speakers.json, no backend call.
|
||||||
@@ -33,32 +47,51 @@ final class TranscriptPipeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
|
let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
|
||||||
try? FileManager.default.createDirectory(at: chunksDir, withIntermediateDirectories: true)
|
try? fm.createDirectory(at: chunksDir, withIntermediateDirectories: true)
|
||||||
defer { try? FileManager.default.removeItem(at: chunksDir) } // cleanup on success OR throw
|
defer { try? fm.removeItem(at: chunksDir) } // cleanup on success OR throw
|
||||||
|
|
||||||
// Start from stored voiceprints; accumulate this call's prints across chunks
|
// Start from stored voiceprints; accumulate this call's prints across chunks
|
||||||
// for within-call unification (the store only persists high-confidence ones).
|
// for within-call unification (the store only persists high-confidence ones).
|
||||||
var known = voiceprints.knownVoiceprints()
|
var known = voiceprints.knownVoiceprints()
|
||||||
var results: [TranscriptAssembler.ChunkResult] = []
|
var results: [TranscriptAssembler.ChunkResult] = []
|
||||||
|
// Mono fallback needs self folded into the timeline; dual sends it separately.
|
||||||
|
let monoTimeline = dual ? timeline
|
||||||
|
: timeline + Self.timeline(fromSelfSpans: selfSpans, selfName: selfName)
|
||||||
|
|
||||||
for chunk in plan {
|
for chunk in plan {
|
||||||
try Task.checkCancellation()
|
try Task.checkCancellation()
|
||||||
await progress?(chunk.index, plan.count)
|
await progress?(chunk.index, plan.count)
|
||||||
let chunkURL = chunksDir.appendingPathComponent("chunk_\(String(format: "%03d", chunk.index)).wav")
|
let pad = String(format: "%03d", chunk.index)
|
||||||
try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
|
let response: LabelMergeResponse
|
||||||
guard FileManager.default.fileExists(atPath: chunkURL.path) else { continue } // empty slice → skip
|
|
||||||
|
|
||||||
|
if dual {
|
||||||
|
let micChunk = chunksDir.appendingPathComponent("chunk_\(pad)_mic.wav")
|
||||||
|
let sysChunk = chunksDir.appendingPathComponent("chunk_\(pad)_sys.wav")
|
||||||
|
try SessionPackager.sliceAudio(from: micURL, startSec: chunk.start, endSec: chunk.end, to: micChunk)
|
||||||
|
try SessionPackager.sliceAudio(from: systemURL, startSec: chunk.start, endSec: chunk.end, to: sysChunk)
|
||||||
|
guard fm.fileExists(atPath: micChunk.path), fm.fileExists(atPath: sysChunk.path) else { continue }
|
||||||
let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
|
let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
|
||||||
let response = try await client.labelMerge(
|
let selfVadData = try SessionPackager.rebasedSelfVadData(selfSpans, start: chunk.start, end: chunk.end)
|
||||||
|
response = try await client.labelMergeDual(
|
||||||
|
micURL: micChunk, systemURL: sysChunk, selfName: selfName, selfVad: selfVadData,
|
||||||
|
timeline: timelineData, knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
|
||||||
|
try? fm.removeItem(at: micChunk); try? fm.removeItem(at: sysChunk)
|
||||||
|
} else {
|
||||||
|
let chunkURL = chunksDir.appendingPathComponent("chunk_\(pad).wav")
|
||||||
|
try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
|
||||||
|
guard fm.fileExists(atPath: chunkURL.path) else { continue } // empty slice → skip
|
||||||
|
let timelineData = try SessionPackager.rebasedTimelineData(monoTimeline, start: chunk.start, end: chunk.end)
|
||||||
|
response = try await client.labelMerge(
|
||||||
audioURL: chunkURL, timeline: timelineData,
|
audioURL: chunkURL, timeline: timelineData,
|
||||||
knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
|
knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
|
||||||
|
try? fm.removeItem(at: chunkURL)
|
||||||
|
}
|
||||||
|
|
||||||
for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
|
for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
|
||||||
known[name] = fp
|
known[name] = fp
|
||||||
}
|
}
|
||||||
voiceprints.update(with: response)
|
voiceprints.update(with: response)
|
||||||
results.append(.init(chunkStart: chunk.start, response: response))
|
results.append(.init(chunkStart: chunk.start, response: response))
|
||||||
try? FileManager.default.removeItem(at: chunkURL)
|
|
||||||
}
|
}
|
||||||
await progress?(plan.count, plan.count)
|
await progress?(plan.count, plan.count)
|
||||||
|
|
||||||
|
|||||||
@@ -55,29 +55,35 @@ final class VisualCapture {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Stop capture, fold in the mic-VAD self spans, write `visual_timeline.json`
|
/// Stop capture and write `visual_timeline.json` (the full human-readable picture:
|
||||||
/// into the session folder, and return the merged segments for `label-merge`.
|
/// remote visual segments + the mic-VAD self spans, merged). Returns ONLY the
|
||||||
|
/// remote (vision) segments — in dual-channel mode the backend names the system
|
||||||
|
/// track from these, while self is handled by the mic channel + `self_vad`.
|
||||||
func finish(selfSpans: [VADSpan], selfName: String,
|
func finish(selfSpans: [VADSpan], selfName: String,
|
||||||
sessionId: String, t0Unix: Double, durationSec: Double,
|
sessionId: String, t0Unix: Double, durationSec: Double,
|
||||||
folder: URL) async -> [VisualTimeline.Segment] {
|
folder: URL) async -> [VisualTimeline.Segment] {
|
||||||
observer.addSelfSpans(selfSpans, selfName: selfName)
|
|
||||||
let (rawSegments, rawGaps) = await observer.stop()
|
let (rawSegments, rawGaps) = await observer.stop()
|
||||||
|
|
||||||
// The observer stops slightly after audio fixes `durationSec`, so a trailing
|
// The observer stops slightly after audio fixes `durationSec`, so a trailing
|
||||||
// gap/segment can run past it. Clamp ends so the JSON is internally consistent
|
// gap/segment can run past it. Clamp ends so the JSON is internally consistent
|
||||||
// (and we never hand the backend a segment longer than the audio).
|
// (and we never hand the backend a segment longer than the audio).
|
||||||
let segments = Self.clampSegments(rawSegments, to: durationSec)
|
let vision = Self.clampSegments(rawSegments, to: durationSec) // remote speakers
|
||||||
let gaps = Self.clampGaps(rawGaps, to: durationSec)
|
let gaps = Self.clampGaps(rawGaps, to: durationSec)
|
||||||
|
let selfSegs = Self.clampSegments(selfSpans.map {
|
||||||
|
VisualTimeline.Segment(start: $0.start, end: $0.end, name: selfName,
|
||||||
|
confidence: $0.confidence, source: "mic_vad")
|
||||||
|
}, to: durationSec)
|
||||||
|
|
||||||
let names = Set(segments.map { $0.name })
|
let artifact = (vision + selfSegs).sorted { $0.start < $1.start }
|
||||||
|
let names = Set(artifact.map { $0.name })
|
||||||
let participants = names.sorted().map {
|
let participants = names.sorted().map {
|
||||||
VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil)
|
VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil)
|
||||||
}
|
}
|
||||||
let timeline = VisualTimeline(
|
let timeline = VisualTimeline(
|
||||||
sessionId: sessionId, app: app.label, adapterVersion: adapter.adapterVersion,
|
sessionId: sessionId, app: app.label, adapterVersion: adapter.adapterVersion,
|
||||||
t0Unix: t0Unix, durationSec: durationSec, fpsSampled: adapter.preferredFPS,
|
t0Unix: t0Unix, durationSec: durationSec, fpsSampled: adapter.preferredFPS,
|
||||||
selfName: selfName, participants: participants, segments: segments, visualGaps: gaps)
|
selfName: selfName, participants: participants, segments: artifact, visualGaps: gaps)
|
||||||
try? timeline.write(to: folder.appendingPathComponent("visual_timeline.json"))
|
try? timeline.write(to: folder.appendingPathComponent("visual_timeline.json"))
|
||||||
return segments
|
return vision
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -29,6 +29,31 @@ final class Phase5Tests: XCTestCase {
|
|||||||
XCTAssertEqual(arr[1]["end"] as? Double, 110)
|
XCTAssertEqual(arr[1]["end"] as? Double, 110)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func testRebaseSelfVadClipsAndRebases() throws {
|
||||||
|
let spans = [VADSpan(start: 140, end: 160, confidence: 0.9),
|
||||||
|
VADSpan(start: 200, end: 260, confidence: 0.8)]
|
||||||
|
let data = try SessionPackager.rebasedSelfVadData(spans, start: 150, end: 300)
|
||||||
|
let arr = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [[String: Any]])
|
||||||
|
XCTAssertEqual(arr.count, 2)
|
||||||
|
XCTAssertEqual(arr[0]["start"] as? Double, 0)
|
||||||
|
XCTAssertEqual(arr[0]["end"] as? Double, 10) // 160 clipped at 150 base → 0–10
|
||||||
|
XCTAssertEqual(arr[1]["start"] as? Double, 50)
|
||||||
|
XCTAssertEqual(arr[1]["end"] as? Double, 110)
|
||||||
|
XCTAssertNil(arr[0]["name"]) // self_vad carries no name
|
||||||
|
}
|
||||||
|
|
||||||
|
func testAssembleRanksMicChannelOverVisual() throws {
|
||||||
|
// Same person resolved by visual in one chunk and by the mic channel in
|
||||||
|
// another → the mic-channel attribution (the user's own track) wins.
|
||||||
|
let visual = #"{"duration":100,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1]}],"segments":[],"fingerprints":{},"models":{}}"#
|
||||||
|
let mic = #"{"duration":100,"speakers":[{"cluster":"mic","name":"Grant","source":"mic_channel","fingerprint":[0.2]}],"segments":[],"fingerprints":{"Grant":[0.2]},"models":{}}"#
|
||||||
|
let rv = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(visual.utf8))
|
||||||
|
let rm = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(mic.utf8))
|
||||||
|
let asm = TranscriptAssembler.assemble(sessionId: "s", app: "meet",
|
||||||
|
chunks: [.init(chunkStart: 0, response: rv), .init(chunkStart: 100, response: rm)])
|
||||||
|
XCTAssertEqual(asm.speakersFile.speakers.first { $0.name == "Grant" }?.source, "mic_channel")
|
||||||
|
}
|
||||||
|
|
||||||
func testAssembleOffsetsAndUnifies() throws {
|
func testAssembleOffsetsAndUnifies() throws {
|
||||||
let resp0 = #"{"duration":150,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1,0.2]}],"segments":[{"start_ms":1000,"end_ms":2000,"speaker":"Grant","text":"hi"}],"fingerprints":{"Grant":[0.1,0.2]},"models":{"diarization":"x"}}"#
|
let resp0 = #"{"duration":150,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1,0.2]}],"segments":[{"start_ms":1000,"end_ms":2000,"speaker":"Grant","text":"hi"}],"fingerprints":{"Grant":[0.1,0.2]},"models":{"diarization":"x"}}"#
|
||||||
let resp1 = #"{"duration":100,"speakers":[{"cluster":"Speaker_0","name":"Sarah","source":"voiceprint","match_similarity":0.7,"fingerprint":[0.3,0.4]},{"cluster":"Speaker_1","name":"Unknown_0","source":"unmatched"}],"segments":[{"start_ms":500,"end_ms":1500,"speaker":"Sarah","text":"hello"}],"fingerprints":{"Sarah":[0.3,0.4]},"models":{"diarization":"x"}}"#
|
let resp1 = #"{"duration":100,"speakers":[{"cluster":"Speaker_0","name":"Sarah","source":"voiceprint","match_similarity":0.7,"fingerprint":[0.3,0.4]},{"cluster":"Speaker_1","name":"Unknown_0","source":"unmatched"}],"segments":[{"start_ms":500,"end_ms":1500,"speaker":"Sarah","text":"hello"}],"fingerprints":{"Sarah":[0.3,0.4]},"models":{"diarization":"x"}}"#
|
||||||
|
|||||||
@@ -23,6 +23,14 @@ final class VoiceprintStoreTests: XCTestCase {
|
|||||||
XCTAssertEqual(store.entries["Grant"]?.calls, 1)
|
XCTAssertEqual(store.entries["Grant"]?.calls, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func testStoresMicChannelSelf() throws {
|
||||||
|
let url = tempURL(); defer { try? FileManager.default.removeItem(at: url) }
|
||||||
|
let store = VoiceprintStore(fileURL: url)
|
||||||
|
let json = #"{"duration":10,"speakers":[{"cluster":"mic","name":"Grant","source":"mic_channel","fingerprint":[0.5,0.6]}],"segments":[],"fingerprints":{"Grant":[0.5,0.6]},"models":{}}"#
|
||||||
|
store.update(with: try JSONDecoder().decode(LabelMergeResponse.self, from: Data(json.utf8)))
|
||||||
|
XCTAssertEqual(store.knownVoiceprints()["Grant"], [0.5, 0.6]) // clean self print stored
|
||||||
|
}
|
||||||
|
|
||||||
func testPersistsAcrossInstancesAndIncrementsCalls() throws {
|
func testPersistsAcrossInstancesAndIncrementsCalls() throws {
|
||||||
let url = tempURL(); defer { try? FileManager.default.removeItem(at: url) }
|
let url = tempURL(); defer { try? FileManager.default.removeItem(at: url) }
|
||||||
let store = VoiceprintStore(fileURL: url)
|
let store = VoiceprintStore(fileURL: url)
|
||||||
|
|||||||
Reference in New Issue
Block a user