Client: dual-channel label-merge (mic_file + system_file)
The backend shipped dual-channel mode; wire the client to it. We already capture
mic (you) and system (others) separately, so send them as two files instead of the
mono mix — fixing the misattribution at the source.
- SparkControlClient: labelMergeDual(mic_file, system_file, self_name, self_vad);
multipart generalized to N files; shared POST/retry/decode extracted.
- SessionPackager.rebasedSelfVadData: chunk-local [{start,end}] for self_vad;
sliceAudio reused for both tracks.
- TranscriptPipeline.process: dual-channel chunking (slice mic+system, rebase
timeline + self_vad per chunk) when system audio is healthy; mono mixed-file
fallback (self folded into the timeline) otherwise.
- VisualCapture.finish: write the full visual_timeline.json (remote + self merged)
but return REMOTE (vision) segments only — self travels via the mic channel.
- TranscriptAssembler: rank mic_channel highest (the user's own track wins).
- VoiceprintStore: store the clean mic_channel self voiceprint.
- SessionController: pass mic/system URLs + remote timeline + channel self-spans +
self_name + systemHealthy; self_vad.json now reflects the channel-verified spans.
Validated end-to-end against the live backend on the real session that exhibited the
misattribution: 'Go Bitcoin' (remote) is now attributed to Unknown_0, NOT the user; the
user's own lines come back with source=mic_channel; per-channel ASR recovered fuller remote text.
36/36 XCTest cases pass (4 new, covering self_vad rebase, mic_channel ranking, and voiceprint storage).
This commit is contained in:
@@ -65,8 +65,13 @@ final class SessionController: ObservableObject {
|
||||
let folder: URL
|
||||
let sessionId: String
|
||||
let app: String
|
||||
let micURL: URL
|
||||
let systemURL: URL
|
||||
let mixedURL: URL
|
||||
let timeline: [VisualTimeline.Segment]
|
||||
let timeline: [VisualTimeline.Segment] // remote visual names; self handled via the mic channel
|
||||
let selfSpans: [VADSpan]
|
||||
let selfName: String
|
||||
let systemHealthy: Bool
|
||||
}
|
||||
private var lastProcess: ProcessInputs?
|
||||
private var processTask: Task<Void, Never>?
|
||||
@@ -275,19 +280,20 @@ final class SessionController: ObservableObject {
|
||||
/// ran, otherwise the mic-VAD self spans alone. `visualRan` reports whether the
|
||||
/// visual pipeline actually attached (for the after-session indicator).
|
||||
private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?)
|
||||
async -> (timeline: [VisualTimeline.Segment], visualRan: Bool) {
|
||||
async -> (timeline: [VisualTimeline.Segment], selfSpans: [VADSpan], visualRan: Bool) {
|
||||
let selfName = settings.selfName
|
||||
let selfSpans = await channelSelfSpans(result: result, folder: folder)
|
||||
if let vc = visualCapture, let folder {
|
||||
visualCapture = nil
|
||||
let timeline = await vc.finish(
|
||||
// Remote (vision) segments only; self travels separately as the mic channel.
|
||||
let remote = await vc.finish(
|
||||
selfSpans: selfSpans, selfName: selfName,
|
||||
sessionId: folder.lastPathComponent, t0Unix: result.t0Unix,
|
||||
durationSec: result.duration, folder: folder)
|
||||
return (timeline, true)
|
||||
return (remote, selfSpans, true)
|
||||
}
|
||||
if let vc = visualCapture { await vc.cancel(); visualCapture = nil }
|
||||
return (TranscriptPipeline.timeline(fromSelfSpans: selfSpans, selfName: selfName), false)
|
||||
return ([], selfSpans, false)
|
||||
}
|
||||
|
||||
/// Self spans for the backend timeline, identified by CHANNEL: the mic track is
|
||||
@@ -312,26 +318,29 @@ final class SessionController: ObservableObject {
|
||||
lifecycleTask = Task {
|
||||
let result = await recorder.stop()
|
||||
let visual = await self.stopVisualAndTimeline(result, folder: folder)
|
||||
self.finish(result, timeline: visual.timeline, visualRan: visual.visualRan)
|
||||
self.finish(result, timeline: visual.timeline, selfSpans: visual.selfSpans, visualRan: visual.visualRan)
|
||||
}
|
||||
}
|
||||
|
||||
private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment], visualRan: Bool) {
|
||||
private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment],
|
||||
selfSpans: [VADSpan], visualRan: Bool) {
|
||||
recorder = nil
|
||||
micLevel = 0
|
||||
systemLevel = 0
|
||||
warning = result.systemNote.map { "System audio stopped early: \($0)" }
|
||||
transcriptStatus = .idle
|
||||
if let folder = currentFolder {
|
||||
writeSelfSpans(result, to: folder)
|
||||
let visualCount = visualRan ? timeline.filter { $0.source == "vision" }.count : nil
|
||||
writeSelfSpans(spans: selfSpans, result: result, to: folder)
|
||||
let visualCount = visualRan ? timeline.count : nil // `timeline` is the remote vision segments
|
||||
lastSession = SessionInfo(
|
||||
folder: folder, mixedURL: result.mixedURL,
|
||||
duration: result.duration, selfSpanCount: result.selfSpans.count,
|
||||
duration: result.duration, selfSpanCount: selfSpans.count,
|
||||
visualSegmentCount: visualCount)
|
||||
lastProcess = ProcessInputs(
|
||||
folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
|
||||
mixedURL: result.mixedURL, timeline: timeline)
|
||||
micURL: result.micURL, systemURL: result.systemURL, mixedURL: result.mixedURL,
|
||||
timeline: timeline, selfSpans: selfSpans, selfName: settings.selfName,
|
||||
systemHealthy: result.systemNote == nil)
|
||||
}
|
||||
let autoSend = settings.autoSendOnStop
|
||||
currentFolder = nil
|
||||
@@ -360,11 +369,12 @@ final class SessionController: ObservableObject {
|
||||
baseURL: settings.backendBaseURL,
|
||||
skipTLS: settings.skipTLSVerification,
|
||||
voiceprints: voiceprints)
|
||||
let timeline = inputs.timeline
|
||||
do {
|
||||
let speakers = try await pipeline.process(
|
||||
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
|
||||
mixedURL: inputs.mixedURL, timeline: timeline,
|
||||
micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL,
|
||||
timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName,
|
||||
systemHealthy: inputs.systemHealthy,
|
||||
progress: { done, total in
|
||||
await MainActor.run { self.transcriptStatus = .processing(done, total) }
|
||||
})
|
||||
@@ -411,7 +421,7 @@ final class SessionController: ObservableObject {
|
||||
let folder = currentFolder
|
||||
let result = await recorder.stop()
|
||||
let visual = await stopVisualAndTimeline(result, folder: folder)
|
||||
finish(result, timeline: visual.timeline, visualRan: visual.visualRan)
|
||||
finish(result, timeline: visual.timeline, selfSpans: visual.selfSpans, visualRan: visual.visualRan)
|
||||
} else if lifecycleGeneration == gen {
|
||||
break // settled: no new transition was spawned
|
||||
}
|
||||
@@ -461,15 +471,15 @@ final class SessionController: ObservableObject {
|
||||
return f.string(from: Date())
|
||||
}
|
||||
|
||||
/// Phase-1 preview of the mic-VAD "self" spans (the eventual
|
||||
/// `visual_timeline.json` `mic_vad` segments). Lets us eyeball VAD quality.
|
||||
private func writeSelfSpans(_ result: RecordingResult, to folder: URL) {
|
||||
let segments = result.selfSpans.map { span -> [String: Any] in
|
||||
/// Debug artifact: the channel-verified "self" spans actually sent to the backend
|
||||
/// as `self_vad` (mic active AND louder than system). Lets us eyeball self detection.
|
||||
private func writeSelfSpans(spans: [VADSpan], result: RecordingResult, to folder: URL) {
|
||||
let segments = spans.map { span -> [String: Any] in
|
||||
["start": span.start, "end": span.end, "name": "self",
|
||||
"confidence": span.confidence, "source": "mic_vad"]
|
||||
"confidence": span.confidence, "source": "mic_channel"]
|
||||
}
|
||||
let object: [String: Any] = [
|
||||
"note": "Phase 1 mic-VAD self spans (preview of visual_timeline segments)",
|
||||
"note": "channel-verified self spans (mic active and louder than system) — the self_vad sent to label-merge",
|
||||
"t0_unix": result.t0Unix,
|
||||
"duration_sec": result.duration,
|
||||
"self_spans": segments,
|
||||
|
||||
@@ -46,6 +46,18 @@ enum SessionPackager {
|
||||
return try JSONSerialization.data(withJSONObject: flat, options: [])
|
||||
}
|
||||
|
||||
/// Builds the chunk-local `self_vad` payload for dual-channel `label-merge`.
///
/// Every span is intersected with the window `[start, end)`. Spans that do not
/// overlap the window are dropped; the rest are shifted so the window begins at 0.
/// - Parameters:
///   - spans: Self-VAD spans on the session timeline.
///   - start: Window start, in session seconds.
///   - end: Window end, in session seconds.
/// - Returns: JSON-encoded array of `{start, end}` objects in chunk-local seconds.
/// - Throws: Whatever `JSONSerialization.data(withJSONObject:options:)` throws.
static func rebasedSelfVadData(_ spans: [VADSpan], start: Double, end: Double) throws -> Data {
    var payload: [[String: Any]] = []
    for span in spans {
        // Intersect the span with the chunk window.
        let clippedStart = max(span.start, start)
        let clippedEnd = min(span.end, end)
        // Keep only spans with a non-empty overlap, rebased to the window origin.
        if clippedEnd > clippedStart {
            payload.append(["start": clippedStart - start, "end": clippedEnd - start])
        }
    }
    return try JSONSerialization.data(withJSONObject: payload, options: [])
}
|
||||
|
||||
/// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
|
||||
static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
|
||||
let input = try AVAudioFile(forReading: source)
|
||||
|
||||
@@ -15,8 +15,10 @@ enum TranscriptAssembler {
|
||||
}
|
||||
|
||||
/// Source ranking when the same name appears across chunks with different sources.
|
||||
/// `mic_channel` (the local user's own microphone) is the most authoritative.
|
||||
private static func rank(_ source: String) -> Int {
|
||||
switch source {
|
||||
case "mic_channel": return 4
|
||||
case "visual": return 3
|
||||
case "voiceprint": return 2
|
||||
default: return 1 // unmatched
|
||||
|
||||
@@ -12,16 +12,30 @@ final class TranscriptPipeline {
|
||||
self.voiceprints = voiceprints
|
||||
}
|
||||
|
||||
/// Process `mixedURL` against `timeline` (visual + self spans). Writes
|
||||
/// `speakers.json` into `sessionFolder` and returns it. `progress(done,total)`
|
||||
/// Process a finished session. **Dual-channel** when the system track is healthy
|
||||
/// and present: mic (the local user) + system (remote) go as separate files, the
|
||||
/// `timeline` names only the remote speakers, and `selfSpans` become `self_vad`.
|
||||
/// Otherwise falls back to the **mono** mixed file with self folded into the
|
||||
/// timeline. Writes `speakers.json` into `sessionFolder`. `progress(done,total)`
|
||||
/// is called per chunk.
|
||||
func process(sessionFolder: URL,
|
||||
sessionId: String,
|
||||
app: String,
|
||||
micURL: URL,
|
||||
systemURL: URL,
|
||||
mixedURL: URL,
|
||||
timeline: [VisualTimeline.Segment],
|
||||
selfSpans: [VADSpan],
|
||||
selfName: String,
|
||||
systemHealthy: Bool,
|
||||
progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
|
||||
let duration = SessionPackager.duration(of: mixedURL)
|
||||
let fm = FileManager.default
|
||||
let dual = systemHealthy
|
||||
&& fm.fileExists(atPath: micURL.path) && fm.fileExists(atPath: systemURL.path)
|
||||
&& SessionPackager.duration(of: systemURL) > 0
|
||||
let duration = dual
|
||||
? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL))
|
||||
: SessionPackager.duration(of: mixedURL)
|
||||
let plan = SessionPackager.planChunks(durationSec: duration)
|
||||
|
||||
// Zero-duration / empty session → a valid empty speakers.json, no backend call.
|
||||
@@ -33,32 +47,51 @@ final class TranscriptPipeline {
|
||||
}
|
||||
|
||||
let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
|
||||
try? FileManager.default.createDirectory(at: chunksDir, withIntermediateDirectories: true)
|
||||
defer { try? FileManager.default.removeItem(at: chunksDir) } // cleanup on success OR throw
|
||||
try? fm.createDirectory(at: chunksDir, withIntermediateDirectories: true)
|
||||
defer { try? fm.removeItem(at: chunksDir) } // cleanup on success OR throw
|
||||
|
||||
// Start from stored voiceprints; accumulate this call's prints across chunks
|
||||
// for within-call unification (the store only persists high-confidence ones).
|
||||
var known = voiceprints.knownVoiceprints()
|
||||
var results: [TranscriptAssembler.ChunkResult] = []
|
||||
// Mono fallback needs self folded into the timeline; dual sends it separately.
|
||||
let monoTimeline = dual ? timeline
|
||||
: timeline + Self.timeline(fromSelfSpans: selfSpans, selfName: selfName)
|
||||
|
||||
for chunk in plan {
|
||||
try Task.checkCancellation()
|
||||
await progress?(chunk.index, plan.count)
|
||||
let chunkURL = chunksDir.appendingPathComponent("chunk_\(String(format: "%03d", chunk.index)).wav")
|
||||
try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
|
||||
guard FileManager.default.fileExists(atPath: chunkURL.path) else { continue } // empty slice → skip
|
||||
let pad = String(format: "%03d", chunk.index)
|
||||
let response: LabelMergeResponse
|
||||
|
||||
let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
|
||||
let response = try await client.labelMerge(
|
||||
audioURL: chunkURL, timeline: timelineData,
|
||||
knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
|
||||
if dual {
|
||||
let micChunk = chunksDir.appendingPathComponent("chunk_\(pad)_mic.wav")
|
||||
let sysChunk = chunksDir.appendingPathComponent("chunk_\(pad)_sys.wav")
|
||||
try SessionPackager.sliceAudio(from: micURL, startSec: chunk.start, endSec: chunk.end, to: micChunk)
|
||||
try SessionPackager.sliceAudio(from: systemURL, startSec: chunk.start, endSec: chunk.end, to: sysChunk)
|
||||
guard fm.fileExists(atPath: micChunk.path), fm.fileExists(atPath: sysChunk.path) else { continue }
|
||||
let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
|
||||
let selfVadData = try SessionPackager.rebasedSelfVadData(selfSpans, start: chunk.start, end: chunk.end)
|
||||
response = try await client.labelMergeDual(
|
||||
micURL: micChunk, systemURL: sysChunk, selfName: selfName, selfVad: selfVadData,
|
||||
timeline: timelineData, knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
|
||||
try? fm.removeItem(at: micChunk); try? fm.removeItem(at: sysChunk)
|
||||
} else {
|
||||
let chunkURL = chunksDir.appendingPathComponent("chunk_\(pad).wav")
|
||||
try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
|
||||
guard fm.fileExists(atPath: chunkURL.path) else { continue } // empty slice → skip
|
||||
let timelineData = try SessionPackager.rebasedTimelineData(monoTimeline, start: chunk.start, end: chunk.end)
|
||||
response = try await client.labelMerge(
|
||||
audioURL: chunkURL, timeline: timelineData,
|
||||
knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
|
||||
try? fm.removeItem(at: chunkURL)
|
||||
}
|
||||
|
||||
for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
|
||||
known[name] = fp
|
||||
}
|
||||
voiceprints.update(with: response)
|
||||
results.append(.init(chunkStart: chunk.start, response: response))
|
||||
try? FileManager.default.removeItem(at: chunkURL)
|
||||
}
|
||||
await progress?(plan.count, plan.count)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user