import Foundation

/// Drives a finished session through the backend: chunk → sequential
/// `label-merge` (accumulating voiceprints) → assemble `speakers.json` → persist
/// fingerprints. Requests are sequential by construction (one chunk at a time).
final class TranscriptPipeline {
    /// File name of the assembled transcript written into the session folder.
    private static let speakersFileName = "speakers.json"

    private let client: SparkControlClient
    private let voiceprints: VoiceprintStore

    /// - Parameters:
    ///   - baseURL: Passed through to `SparkControlClient`.
    ///   - skipTLS: Passed through to `SparkControlClient`.
    ///   - voiceprints: Store that seeds the known voiceprints and is updated
    ///     with every chunk response.
    init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
        self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
        self.voiceprints = voiceprints
    }

    /// Process `mixedURL` against `timeline` (visual + self spans). Writes
    /// `speakers.json` into `sessionFolder` and returns it.
    ///
    /// - Parameters:
    ///   - sessionFolder: Folder that receives `speakers.json`; a temporary
    ///     `chunks` subfolder is created inside it and removed before returning.
    ///   - sessionId: Forwarded to `TranscriptAssembler.assemble`.
    ///   - app: Forwarded to `TranscriptAssembler.assemble`.
    ///   - mixedURL: Audio file that is sliced into chunks.
    ///   - timeline: Segments rebased per chunk and sent with each request.
    ///   - progress: Called as `progress(done, total)` before each chunk and once
    ///     more with `(total, total)` after the last one; `(0, 0)` for an empty
    ///     session.
    /// - Returns: The assembled `SpeakersFile` that was written to disk.
    /// - Throws: `CancellationError` if the task is cancelled between chunks, or
    ///   any error from creating the scratch directory, slicing, rebasing the
    ///   timeline, the backend call, or writing `speakers.json`.
    func process(sessionFolder: URL,
                 sessionId: String,
                 app: String,
                 mixedURL: URL,
                 timeline: [VisualTimeline.Segment],
                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
        let duration = SessionPackager.duration(of: mixedURL)
        let plan = SessionPackager.planChunks(durationSec: duration)
        let speakersURL = sessionFolder.appendingPathComponent(Self.speakersFileName)

        // Zero-duration / empty session → a valid empty speakers.json, no backend call.
        if plan.isEmpty || duration <= 0 {
            let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
            try empty.speakersFile.write(to: speakersURL)
            await progress?(0, 0)
            return empty.speakersFile
        }

        let fileManager = FileManager.default
        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)

        // A scratch folder left behind by an interrupted run may still hold old
        // chunk files; those would defeat the "empty slice" existence check
        // below, so start from a clean directory. Best-effort: the folder
        // usually does not exist.
        try? fileManager.removeItem(at: chunksDir)

        // Surface a creation failure instead of swallowing it: without the
        // directory no slice can be written, and the existence check below could
        // otherwise skip every chunk and yield a silently empty transcript.
        try fileManager.createDirectory(at: chunksDir, withIntermediateDirectories: true)
        defer { try? fileManager.removeItem(at: chunksDir) } // cleanup on success OR throw

        // Start from stored voiceprints; accumulate this call's prints across chunks
        // for within-call unification (the store only persists high-confidence ones).
        var known = voiceprints.knownVoiceprints()
        var results: [TranscriptAssembler.ChunkResult] = []

        for chunk in plan {
            try Task.checkCancellation()
            await progress?(chunk.index, plan.count)

            let chunkName = "chunk_\(String(format: "%03d", chunk.index)).wav"
            let chunkURL = chunksDir.appendingPathComponent(chunkName)
            try SessionPackager.sliceAudio(from: mixedURL,
                                           startSec: chunk.start,
                                           endSec: chunk.end,
                                           to: chunkURL)
            guard fileManager.fileExists(atPath: chunkURL.path) else { continue } // empty slice → skip

            let timelineData = try SessionPackager.rebasedTimelineData(timeline,
                                                                       start: chunk.start,
                                                                       end: chunk.end)
            let response = try await client.labelMerge(
                audioURL: chunkURL,
                timeline: timelineData,
                knownVoiceprints: known.isEmpty ? nil : known,
                transcribe: true)

            // Feed named speakers from this chunk into the next request.
            for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
                known[name] = fp
            }
            voiceprints.update(with: response)
            results.append(.init(chunkStart: chunk.start, response: response))

            // Drop the slice as soon as it has been consumed to keep disk use low.
            try? fileManager.removeItem(at: chunkURL)
        }

        await progress?(plan.count, plan.count)

        let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
        try assembled.speakersFile.write(to: speakersURL)
        return assembled.speakersFile
    }

    /// Build the `label-merge` timeline from mic-VAD self spans (Phase 1/2). Once
    /// the visual adapters land (Phase 3–4), their segments are merged in too.
    ///
    /// - Parameters:
    ///   - spans: Self-speech spans; each becomes one segment with the same
    ///     start, end and confidence.
    ///   - selfName: Name assigned to every produced segment.
    /// - Returns: One segment per span, tagged with source `"mic_vad"`.
    static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
        spans.map {
            .init(start: $0.start,
                  end: $0.end,
                  name: selfName,
                  confidence: $0.confidence,
                  source: "mic_vad")
        }
    }
}