Files
ten31-transcripts/Ten31Transcripts/Session/TranscriptPipeline.swift
T
Grant Gilliam 53d7fcdac0 Client: dual-channel label-merge (mic_file + system_file)
The backend shipped dual-channel mode; wire the client to it. We already capture
mic (you) and system (others) separately, so send them as two files instead of the
mono mix — fixing the misattribution at the source.

- SparkControlClient: labelMergeDual(mic_file, system_file, self_name, self_vad);
  multipart generalized to N files; shared POST/retry/decode extracted.
- SessionPackager.rebasedSelfVadData: chunk-local [{start,end}] for self_vad;
  sliceAudio reused for both tracks.
- TranscriptPipeline.process: dual-channel chunking (slice mic+system, rebase
  timeline + self_vad per chunk) when system audio is healthy; mono mixed-file
  fallback (self folded into the timeline) otherwise.
- VisualCapture.finish: write the full visual_timeline.json (remote + self merged)
  but return REMOTE (vision) segments only — self travels via the mic channel.
- TranscriptAssembler: rank mic_channel highest (the user's own track wins).
- VoiceprintStore: store the clean mic_channel self voiceprint.
- SessionController: pass mic/system URLs + remote timeline + channel self-spans +
  self_name + systemHealthy; self_vad.json now reflects the channel-verified spans.

Validated END-TO-END against the live backend on the real misattributing session:
'Go Bitcoin' (remote) is now attributed to Unknown_0, NOT the user; the user's own
lines come back source=mic_channel; per-channel ASR recovered fuller remote text.
36/36 XCTest (4 new: self_vad rebase, mic_channel ranking + voiceprint storage).
2026-06-06 13:15:29 -05:00

109 lines
6.1 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import Foundation
/// Drives a finished session through the backend: chunk -> sequential
/// `label-merge` (accumulating voiceprints) -> assemble `speakers.json` -> persist
/// fingerprints. Requests are sequential by construction (one chunk at a time).
final class TranscriptPipeline {
    /// Backend client used for the per-chunk `label-merge` requests.
    private let client: SparkControlClient
    /// Source of the initially known voiceprints; updated with every chunk response.
    private let voiceprints: VoiceprintStore

    /// Creates a pipeline that talks to the backend at `baseURL`.
    /// - Parameters:
    ///   - baseURL: Forwarded to `SparkControlClient`.
    ///   - skipTLS: Forwarded to `SparkControlClient`.
    ///   - voiceprints: Store queried for known voiceprints and updated per chunk.
    init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
        self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
        self.voiceprints = voiceprints
    }

    /// Process a finished session. **Dual-channel** when the system track is healthy
    /// and present: mic (the local user) + system (remote) go as separate files, the
    /// `timeline` names only the remote speakers, and `selfSpans` become `self_vad`.
    /// Otherwise falls back to the **mono** mixed file with self folded into the
    /// timeline. Writes `speakers.json` into `sessionFolder`.
    ///
    /// - Parameters:
    ///   - sessionFolder: Receives `speakers.json`; also hosts the temporary
    ///     `chunks/` directory, which is removed before this method returns or throws.
    ///   - sessionId: Passed through to `TranscriptAssembler.assemble`.
    ///   - app: Passed through to `TranscriptAssembler.assemble`.
    ///   - micURL: Mic track, sliced per chunk in dual-channel mode.
    ///   - systemURL: System track, sliced per chunk in dual-channel mode.
    ///   - mixedURL: Mixed track, sliced per chunk in the mono fallback.
    ///   - timeline: Visual timeline segments, rebased to each chunk's window.
    ///   - selfSpans: Self speech spans; sent as `self_vad` (dual) or folded into
    ///     the timeline under `selfName` (mono).
    ///   - selfName: Name attributed to the local user.
    ///   - systemHealthy: Must be `true` for dual-channel mode to be considered.
    ///   - progress: Called with `(chunk.index, plan.count)` before each chunk, with
    ///     `(plan.count, plan.count)` once all chunks are done, and with `(0, 0)`
    ///     for an empty session.
    /// - Returns: The assembled `SpeakersFile` that was written to disk.
    /// - Throws: `CancellationError` if the task is cancelled between chunks, or any
    ///   error from creating the chunk directory, slicing audio, encoding the
    ///   rebased timeline / self-VAD data, the backend request, or writing
    ///   `speakers.json`.
    func process(sessionFolder: URL,
                 sessionId: String,
                 app: String,
                 micURL: URL,
                 systemURL: URL,
                 mixedURL: URL,
                 timeline: [VisualTimeline.Segment],
                 selfSpans: [VADSpan],
                 selfName: String,
                 systemHealthy: Bool,
                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
        let fm = FileManager.default
        let speakersURL = sessionFolder.appendingPathComponent("speakers.json")

        // Dual-channel needs a healthy system capture, both files on disk, and a
        // non-empty system track.
        let dual = systemHealthy
            && fm.fileExists(atPath: micURL.path) && fm.fileExists(atPath: systemURL.path)
            && SessionPackager.duration(of: systemURL) > 0
        let duration = dual
            ? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL))
            : SessionPackager.duration(of: mixedURL)
        let plan = SessionPackager.planChunks(durationSec: duration)

        // Zero-duration / empty session -> a valid empty speakers.json, no backend call.
        if plan.isEmpty || duration <= 0 {
            let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
            try empty.speakersFile.write(to: speakersURL)
            await progress?(0, 0)
            return empty.speakersFile
        }

        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
        // Propagate a failure here: without the directory no chunk can be sliced, and
        // swallowing the error would only surface later as a less specific failure.
        // `withIntermediateDirectories: true` does not throw if it already exists.
        try fm.createDirectory(at: chunksDir, withIntermediateDirectories: true)
        defer { try? fm.removeItem(at: chunksDir) } // cleanup on success OR throw

        // Start from stored voiceprints; accumulate this call's prints across chunks
        // for within-call unification (the store only persists high-confidence ones).
        var known = voiceprints.knownVoiceprints()
        var results: [TranscriptAssembler.ChunkResult] = []

        // Mono fallback needs self folded into the timeline; dual sends it separately.
        let monoTimeline = dual
            ? timeline
            : timeline + Self.timeline(fromSelfSpans: selfSpans, selfName: selfName)

        for chunk in plan {
            try Task.checkCancellation()
            await progress?(chunk.index, plan.count)
            let pad = String(format: "%03d", chunk.index)
            let response: LabelMergeResponse

            if dual {
                let micChunk = chunksDir.appendingPathComponent("chunk_\(pad)_mic.wav")
                let sysChunk = chunksDir.appendingPathComponent("chunk_\(pad)_sys.wav")
                try SessionPackager.sliceAudio(from: micURL, startSec: chunk.start,
                                               endSec: chunk.end, to: micChunk)
                try SessionPackager.sliceAudio(from: systemURL, startSec: chunk.start,
                                               endSec: chunk.end, to: sysChunk)
                // Either slice missing -> skip this chunk (leftovers go with chunksDir).
                guard fm.fileExists(atPath: micChunk.path),
                      fm.fileExists(atPath: sysChunk.path) else { continue }
                let timelineData = try SessionPackager.rebasedTimelineData(
                    timeline, start: chunk.start, end: chunk.end)
                let selfVadData = try SessionPackager.rebasedSelfVadData(
                    selfSpans, start: chunk.start, end: chunk.end)
                response = try await client.labelMergeDual(
                    micURL: micChunk, systemURL: sysChunk, selfName: selfName, selfVad: selfVadData,
                    timeline: timelineData, knownVoiceprints: known.isEmpty ? nil : known,
                    transcribe: true)
                // Best-effort early cleanup; the deferred directory removal is the backstop.
                try? fm.removeItem(at: micChunk)
                try? fm.removeItem(at: sysChunk)
            } else {
                let chunkURL = chunksDir.appendingPathComponent("chunk_\(pad).wav")
                try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start,
                                               endSec: chunk.end, to: chunkURL)
                guard fm.fileExists(atPath: chunkURL.path) else { continue } // empty slice -> skip
                let timelineData = try SessionPackager.rebasedTimelineData(
                    monoTimeline, start: chunk.start, end: chunk.end)
                response = try await client.labelMerge(
                    audioURL: chunkURL, timeline: timelineData,
                    knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
                // Best-effort early cleanup; the deferred directory removal is the backstop.
                try? fm.removeItem(at: chunkURL)
            }

            // Carry named (non-unknown) fingerprints forward to the following chunks.
            for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
                known[name] = fp
            }
            voiceprints.update(with: response)
            results.append(.init(chunkStart: chunk.start, response: response))
        }

        await progress?(plan.count, plan.count)
        let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
        try assembled.speakersFile.write(to: speakersURL)
        return assembled.speakersFile
    }

    /// Build the `label-merge` timeline from mic-VAD self spans (Phase 1/2). Once
    /// the visual adapters land (Phase 3-4), their segments are merged in too.
    ///
    /// - Parameters:
    ///   - spans: Self speech spans; each one becomes a single segment with the
    ///     same start, end and confidence.
    ///   - selfName: Name assigned to every produced segment.
    /// - Returns: One segment per span, tagged with source `"mic_vad"`.
    static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
        spans.map {
            .init(start: $0.start, end: $0.end, name: selfName,
                  confidence: $0.confidence, source: "mic_vad")
        }
    }
}