53d7fcdac0
The backend shipped dual-channel mode; wire the client to it. We already capture
mic (you) and system (others) separately, so send them as two files instead of the
mono mix — fixing the misattribution at the source.
- SparkControlClient: labelMergeDual(mic_file, system_file, self_name, self_vad);
multipart generalized to N files; shared POST/retry/decode extracted.
- SessionPackager.rebasedSelfVadData: chunk-local [{start,end}] for self_vad;
sliceAudio reused for both tracks.
- TranscriptPipeline.process: dual-channel chunking (slice mic+system, rebase
timeline + self_vad per chunk) when system audio is healthy; mono mixed-file
fallback (self folded into the timeline) otherwise.
- VisualCapture.finish: write the full visual_timeline.json (remote + self merged)
but return REMOTE (vision) segments only — self travels via the mic channel.
- TranscriptAssembler: rank mic_channel highest (the user's own track wins).
- VoiceprintStore: store the clean mic_channel self voiceprint.
- SessionController: pass mic/system URLs + remote timeline + channel self-spans +
self_name + systemHealthy; self_vad.json now reflects the channel-verified spans.
Validated END-TO-END against the live backend on the real misattributing session:
'Go Bitcoin' (remote) is now attributed to Unknown_0, NOT the user; the user's own
lines come back source=mic_channel; per-channel ASR recovered fuller remote text.
36/36 XCTest (4 new: self_vad rebase, mic_channel ranking + voiceprint storage).
109 lines
6.1 KiB
Swift
109 lines
6.1 KiB
Swift
import Foundation
|
||
|
||
/// Drives a finished session through the backend: chunk → sequential
/// `label-merge` (accumulating voiceprints) → assemble `speakers.json` → persist
/// fingerprints. Requests are sequential by construction (one chunk at a time).
final class TranscriptPipeline {
    /// Backend client used for the per-chunk `label-merge` requests.
    private let client: SparkControlClient
    /// Source of the initial known voiceprints; updated with every chunk response.
    private let voiceprints: VoiceprintStore

    /// Creates a pipeline that talks to the backend at `baseURL`.
    ///
    /// - Parameters:
    ///   - baseURL: Passed through to `SparkControlClient`.
    ///   - skipTLS: Passed through to `SparkControlClient`.
    ///   - voiceprints: Store that seeds the known voiceprints and receives each
    ///     chunk's response via `update(with:)`.
    init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
        self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
        self.voiceprints = voiceprints
    }

    /// Process a finished session. **Dual-channel** when the system track is healthy
    /// and present: mic (the local user) + system (remote) go as separate files, the
    /// `timeline` names only the remote speakers, and `selfSpans` become `self_vad`.
    /// Otherwise falls back to the **mono** mixed file with self folded into the
    /// timeline. Writes `speakers.json` into `sessionFolder`.
    ///
    /// - Parameters:
    ///   - sessionFolder: Folder that receives `speakers.json`; a temporary `chunks`
    ///     subfolder is created inside it and removed before returning or throwing.
    ///   - sessionId: Forwarded to `TranscriptAssembler.assemble`.
    ///   - app: Forwarded to `TranscriptAssembler.assemble`.
    ///   - micURL: Mic track; sliced per chunk in dual-channel mode.
    ///   - systemURL: System track; sliced per chunk in dual-channel mode.
    ///   - mixedURL: Mixed track; sliced per chunk in the mono fallback.
    ///   - timeline: Visual timeline segments, rebased per chunk before sending.
    ///   - selfSpans: Self VAD spans; sent as `self_vad` in dual mode, folded into
    ///     the timeline in mono mode.
    ///   - selfName: Name attached to the local user's spans.
    ///   - systemHealthy: Must be `true` for dual-channel mode to be considered.
    ///   - progress: Called as `progress(chunk.index, plan.count)` before each
    ///     chunk, once as `(plan.count, plan.count)` after the last chunk, or once
    ///     as `(0, 0)` for an empty session.
    /// - Returns: The assembled speakers file that was written to disk.
    /// - Throws: `CancellationError` when the task is cancelled between chunks, and
    ///   any error from creating the scratch directory, slicing, rebasing, the
    ///   backend calls, or writing `speakers.json`.
    func process(sessionFolder: URL,
                 sessionId: String,
                 app: String,
                 micURL: URL,
                 systemURL: URL,
                 mixedURL: URL,
                 timeline: [VisualTimeline.Segment],
                 selfSpans: [VADSpan],
                 selfName: String,
                 systemHealthy: Bool,
                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
        let fm = FileManager.default
        let speakersURL = sessionFolder.appendingPathComponent("speakers.json")

        // Dual-channel needs a healthy system track, both files on disk, and a
        // non-empty system recording; anything else takes the mono path.
        let dual = systemHealthy
            && fm.fileExists(atPath: micURL.path) && fm.fileExists(atPath: systemURL.path)
            && SessionPackager.duration(of: systemURL) > 0
        let duration = dual
            ? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL))
            : SessionPackager.duration(of: mixedURL)
        let plan = SessionPackager.planChunks(durationSec: duration)

        // Zero-duration / empty session → a valid empty speakers.json, no backend call.
        if plan.isEmpty || duration <= 0 {
            let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
            try empty.speakersFile.write(to: speakersURL)
            await progress?(0, 0)
            return empty.speakersFile
        }

        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
        // Registered before the create so the scratch path is cleaned up on success
        // OR throw, including a throw from the create itself.
        defer { try? fm.removeItem(at: chunksDir) }
        // Propagate the failure: without the scratch directory no slice can be
        // written, and silently continuing could end in an empty speakers.json.
        try fm.createDirectory(at: chunksDir, withIntermediateDirectories: true)

        // Start from stored voiceprints; accumulate this call's prints across chunks
        // for within-call unification (the store only persists high-confidence ones).
        var known = voiceprints.knownVoiceprints()
        var results: [TranscriptAssembler.ChunkResult] = []
        // Mono fallback needs self folded into the timeline; dual sends it separately.
        let monoTimeline = dual
            ? timeline
            : timeline + Self.timeline(fromSelfSpans: selfSpans, selfName: selfName)

        for chunk in plan {
            try Task.checkCancellation()
            await progress?(chunk.index, plan.count)
            let pad = String(format: "%03d", chunk.index)
            let response: LabelMergeResponse

            if dual {
                let micChunk = chunksDir.appendingPathComponent("chunk_\(pad)_mic.wav")
                let sysChunk = chunksDir.appendingPathComponent("chunk_\(pad)_sys.wav")
                // Drop this chunk's slices on every exit from the branch (success,
                // skip, or throw) so scratch usage stays at one chunk.
                defer {
                    try? fm.removeItem(at: micChunk)
                    try? fm.removeItem(at: sysChunk)
                }
                try SessionPackager.sliceAudio(from: micURL, startSec: chunk.start, endSec: chunk.end, to: micChunk)
                try SessionPackager.sliceAudio(from: systemURL, startSec: chunk.start, endSec: chunk.end, to: sysChunk)
                // NOTE(review): the chunk is skipped when EITHER slice is missing. If
                // the two tracks differ in length this may drop the longer track's
                // tail — confirm against `sliceAudio`'s empty-slice behaviour.
                guard fm.fileExists(atPath: micChunk.path), fm.fileExists(atPath: sysChunk.path) else { continue }
                let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
                let selfVadData = try SessionPackager.rebasedSelfVadData(selfSpans, start: chunk.start, end: chunk.end)
                response = try await client.labelMergeDual(
                    micURL: micChunk, systemURL: sysChunk, selfName: selfName, selfVad: selfVadData,
                    timeline: timelineData, knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
            } else {
                let chunkURL = chunksDir.appendingPathComponent("chunk_\(pad).wav")
                // Same per-chunk cleanup as the dual branch.
                defer { try? fm.removeItem(at: chunkURL) }
                try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
                guard fm.fileExists(atPath: chunkURL.path) else { continue } // empty slice → skip
                let timelineData = try SessionPackager.rebasedTimelineData(monoTimeline, start: chunk.start, end: chunk.end)
                response = try await client.labelMerge(
                    audioURL: chunkURL, timeline: timelineData,
                    knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
            }

            // Carry named (non-unknown) fingerprints forward to later chunks.
            for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
                known[name] = fp
            }
            voiceprints.update(with: response)
            results.append(.init(chunkStart: chunk.start, response: response))
        }
        await progress?(plan.count, plan.count)

        let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
        try assembled.speakersFile.write(to: speakersURL)
        return assembled.speakersFile
    }

    /// Converts self VAD spans into timeline segments attributed to `selfName`,
    /// each tagged with source `"mic_vad"` and carrying the span's start, end and
    /// confidence. `process` appends these to the supplied timeline in the mono
    /// fallback.
    ///
    /// - Parameters:
    ///   - spans: Self spans to convert.
    ///   - selfName: Name assigned to every produced segment.
    /// - Returns: One segment per input span, in the same order.
    static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
        spans.map { .init(start: $0.start, end: $0.end, name: selfName, confidence: $0.confidence, source: "mic_vad") }
    }
}
|