4c086251d9
Native editor to fix speaker-ID errors after transcription (modeled on recap-relay's correction UX): rename a speaker in the legend, merge two speakers, or reassign an individual transcript line. Saving rewrites speakers.json, re-renders transcript.md + recap.html, and updates the voiceprint memory — so a correction compounds: naming an "Unknown" speaker teaches that voice for future calls. - SpeakerEditing (pure, tested): replaceSpeaker (rename = merge-onto-existing), reassign, netNameMap (compose ops), and remap (apply a name map to a recap's structured fields + whole-word free text, so summaries/extras update without re-LLM). - RecapEditModel (@MainActor): loads speakers.json (+ optional recap.json + cluster_fingerprints.json); on save writes the resolved speakers.json, re-renders, and reconciles voiceprints — merge keeps the survivor's print; rename/name-an-Unknown enrolls the cluster's fingerprint under the new name. - TranscriptEditorView (SwiftUI) + EditorWindow (AppKit window for the LSUIElement app); menu gains "Edit speakers". - Pipeline now persists cluster_fingerprints.json (every cluster incl. Unknown) and recap.json (RecapFile) so the editor can learn voices + re-render offline. - RecapModels made Codable; TranscriptAssembler exposes allFingerprints; VoiceprintStore gains enroll() + merge(). 52/52 XCTest (6 new, incl. a full rename→artifacts→voiceprint round-trip on disk).
116 lines
6.6 KiB
Swift
116 lines
6.6 KiB
Swift
import Foundation
|
||
|
||
/// Drives a finished session through the backend: chunk → sequential
|
||
/// `label-merge` (accumulating voiceprints) → assemble `speakers.json` → persist
|
||
/// fingerprints. Requests are sequential by construction (one chunk at a time).
|
||
final class TranscriptPipeline {
    /// Backend client used for the per-chunk `label-merge` requests.
    private let client: SparkControlClient
    /// Store that supplies the starting voiceprints and receives each chunk's response.
    private let voiceprints: VoiceprintStore

    /// Creates a pipeline that talks to the backend at `baseURL`.
    /// - Parameters:
    ///   - baseURL: Passed straight to `SparkControlClient`.
    ///   - skipTLS: Passed straight to `SparkControlClient`.
    ///   - voiceprints: Store read before the first chunk and updated after every chunk.
    init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
        self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
        self.voiceprints = voiceprints
    }

    /// Process a finished session. **Dual-channel** when the system track is healthy
    /// and present: mic (the local user) + system (remote) go as separate files, the
    /// `timeline` names only the remote speakers, and `selfSpans` become `self_vad`.
    /// Otherwise falls back to the **mono** mixed file with self folded into the
    /// timeline. Writes `speakers.json` into `sessionFolder` and, when the assembled
    /// result carries any fingerprints, a best-effort `cluster_fingerprints.json`.
    ///
    /// - Parameters:
    ///   - sessionFolder: Folder that receives `speakers.json`, `cluster_fingerprints.json`
    ///     and the temporary `chunks` directory (removed again before returning or throwing).
    ///   - sessionId: Forwarded to `TranscriptAssembler.assemble`.
    ///   - app: Forwarded to `TranscriptAssembler.assemble`.
    ///   - micURL: Mic recording; used only in dual-channel mode.
    ///   - systemURL: System recording; used only in dual-channel mode.
    ///   - mixedURL: Mixed recording; used only in mono mode.
    ///   - timeline: Visual timeline segments, rebased per chunk before being sent.
    ///   - selfSpans: Mic-VAD spans of the local user.
    ///   - selfName: Name attached to the local user's spans.
    ///   - systemHealthy: Must be `true` (together with both files existing and a
    ///     positive system-track duration) for dual-channel mode to be chosen.
    ///   - progress: Called with `(chunk.index, plan.count)` before each chunk, with
    ///     `(plan.count, plan.count)` after the last one, and with `(0, 0)` for an
    ///     empty session.
    /// - Returns: The assembled `SpeakersFile` that was written to disk.
    /// - Throws: `CancellationError` when the task is cancelled between chunks, and any
    ///   error raised by audio slicing, timeline rebasing, the backend client, or
    ///   writing `speakers.json`.
    func process(sessionFolder: URL,
                 sessionId: String,
                 app: String,
                 micURL: URL,
                 systemURL: URL,
                 mixedURL: URL,
                 timeline: [VisualTimeline.Segment],
                 selfSpans: [VADSpan],
                 selfName: String,
                 systemHealthy: Bool,
                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
        let fm = FileManager.default
        let dual = systemHealthy
            && fm.fileExists(atPath: micURL.path) && fm.fileExists(atPath: systemURL.path)
            && SessionPackager.duration(of: systemURL) > 0
        let duration = dual
            ? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL))
            : SessionPackager.duration(of: mixedURL)
        let plan = SessionPackager.planChunks(durationSec: duration)

        // Zero-duration / empty session → a valid empty speakers.json, no backend call.
        if plan.isEmpty || duration <= 0 {
            let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
            try empty.speakersFile.write(to: sessionFolder.appendingPathComponent("speakers.json"))
            await progress?(0, 0)
            return empty.speakersFile
        }

        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
        try? fm.createDirectory(at: chunksDir, withIntermediateDirectories: true)
        defer { try? fm.removeItem(at: chunksDir) }   // cleanup on success OR throw

        // Start from stored voiceprints; accumulate this call's prints across chunks
        // for within-call unification (the store only persists high-confidence ones).
        var known = voiceprints.knownVoiceprints()
        var results: [TranscriptAssembler.ChunkResult] = []
        // Mono fallback needs self folded into the timeline; dual sends it separately.
        let monoTimeline = dual ? timeline
            : timeline + Self.timeline(fromSelfSpans: selfSpans, selfName: selfName)

        for chunk in plan {
            try Task.checkCancellation()
            await progress?(chunk.index, plan.count)
            let pad = String(format: "%03d", chunk.index)
            let response: LabelMergeResponse

            if dual {
                let micChunk = chunksDir.appendingPathComponent("chunk_\(pad)_mic.wav")
                let sysChunk = chunksDir.appendingPathComponent("chunk_\(pad)_sys.wav")
                try SessionPackager.sliceAudio(from: micURL, startSec: chunk.start, endSec: chunk.end, to: micChunk)
                try SessionPackager.sliceAudio(from: systemURL, startSec: chunk.start, endSec: chunk.end, to: sysChunk)
                // Either slice missing → skip this chunk; leftovers go with `chunksDir`.
                guard fm.fileExists(atPath: micChunk.path), fm.fileExists(atPath: sysChunk.path) else { continue }
                let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
                let selfVadData = try SessionPackager.rebasedSelfVadData(selfSpans, start: chunk.start, end: chunk.end)
                response = try await client.labelMergeDual(
                    micURL: micChunk, systemURL: sysChunk, selfName: selfName, selfVad: selfVadData,
                    timeline: timelineData, knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
                try? fm.removeItem(at: micChunk); try? fm.removeItem(at: sysChunk)
            } else {
                let chunkURL = chunksDir.appendingPathComponent("chunk_\(pad).wav")
                try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
                guard fm.fileExists(atPath: chunkURL.path) else { continue }   // empty slice → skip
                let timelineData = try SessionPackager.rebasedTimelineData(monoTimeline, start: chunk.start, end: chunk.end)
                response = try await client.labelMerge(
                    audioURL: chunkURL, timeline: timelineData,
                    knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
                try? fm.removeItem(at: chunkURL)
            }

            // Carry named prints forward so later chunks in this call can match them.
            for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
                known[name] = fp
            }
            voiceprints.update(with: response)
            results.append(.init(chunkStart: chunk.start, response: response))
        }
        await progress?(plan.count, plan.count)

        let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
        try assembled.speakersFile.write(to: sessionFolder.appendingPathComponent("speakers.json"))
        // Persist every cluster's voiceprint (incl. Unknown) so the speaker editor can
        // teach the store a voice when the user renames an Unknown to a real name.
        if !assembled.allFingerprints.isEmpty {
            let payload = assembled.allFingerprints.mapValues { $0.map(Double.init) }
            // `JSONSerialization.data(withJSONObject:)` raises an Objective-C exception
            // (which `try?` cannot catch) when the object is not valid JSON — e.g. a NaN
            // or infinite component. Validate first so this stays best-effort and a bad
            // fingerprint never crashes after speakers.json is already on disk.
            if JSONSerialization.isValidJSONObject(payload),
               let data = try? JSONSerialization.data(withJSONObject: payload, options: [.sortedKeys]) {
                try? data.write(to: sessionFolder.appendingPathComponent("cluster_fingerprints.json"))
            }
        }
        return assembled.speakersFile
    }

    /// Build the `label-merge` timeline from mic-VAD self spans (Phase 1/2). Once
    /// the visual adapters land (Phase 3–4), their segments are merged in too.
    /// - Parameters:
    ///   - spans: Self spans; each one becomes exactly one segment with the same
    ///     `start`, `end` and `confidence`.
    ///   - selfName: Name assigned to every produced segment.
    /// - Returns: One segment per span, in input order, tagged with source `"mic_vad"`.
    static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
        spans.map { .init(start: $0.start, end: $0.end, name: selfName, confidence: $0.confidence, source: "mic_vad") }
    }
}
|