Speaker reconciliation + open/re-process any saved session

Reconciliation (the marry-the-signals layer): after transcription, before the recap,
SpeakerReconciler (1) MERGES non-self clusters whose voiceprints are highly similar
(cosine >= 0.82) — fixes a person split across chunks (the real 1-on-1 failure: one
remote came back as 'MH' + 'Unknown_0'); and (2) NAMES remaining non-self clusters
from transcript CONTENT via the gateway LLM (people addressed by name / self-intros),
conservative + confidence-gated, keeping the placeholder when unrevealed. The
mic-channel self is protected and never reassigned. Voice does the segmentation; the
fingerprint-merge fixes splits; the LLM adds the content signal that the visual and voiceprint passes lack.

- SpeakerReconciler: pure cosine merge (tested) + LLM content-naming pass; rewrites
  speakers.json before recap. SessionController.finishBackend shares one model lookup
  for reconcile + recap. Gated by settings.reconcileSpeakers (default on).
- Open saved session: menu 'Open saved session…' → folder picker. Edits it if already
  transcribed, else reconstructs inputs from disk (visual_timeline vision segs +
  channel self-spans) and runs transcribe → reconcile → recap, then opens the editor.
  Lets you evaluate/correct ANY past call, not just the in-memory last one.

Note (from real Signal data): visual naming is unreliable on Signal (sparse, misread
initials, lowercase/center names) — so reconciliation + the editor (which teaches
voiceprints on confirm) carry it; the editor remains the human arbiter. 59/59 XCTest.
This commit is contained in:
Grant Gilliam
2026-06-08 11:54:41 -05:00
parent f77f33ce04
commit 6d0c8be8c9
7 changed files with 317 additions and 39 deletions
@@ -361,52 +361,60 @@ final class SessionController: ObservableObject {
/// when visual capture ran, or the self spans alone otherwise. Safe to call
/// manually ("Send to backend") or automatically on stop.
func processLastSession() {
// NOTE(review): this is a rendered diff without +/- markers. The next two lines look
// like the pre-change checks, superseded by the combined guard that follows them.
// Confirm against the checked-out file.
guard let inputs = lastProcess else { return }
if case .processing = transcriptStatus { return }
// Nothing to do without remembered inputs, or while a run is already in flight.
guard let inputs = lastProcess, !isProcessing else { return }
// Mark the session as processing (0 of 1) and clear `recapURL` before the task starts.
transcriptStatus = .processing(0, 1)
recapURL = nil
// Keep the task handle in `processTask`; this path does not open the editor afterwards.
processTask = Task { await self.runBackend(inputs, openEditorWhenDone: false) }
}
/// Whether `transcriptStatus` is currently in its `.processing` case.
private var isProcessing: Bool {
    guard case .processing = transcriptStatus else { return false }
    return true
}
/// Transcribe + reconcile + recap one session's inputs. Shared by "Send to
/// backend" and "Open saved session".
///
/// Builds a `TranscriptPipeline` from the backend base URL, the TLS setting and the
/// voiceprints, awaits `pipeline.process(...)` while mirroring its progress into
/// `transcriptStatus`, then hands the result to `finishBackend`. A `CancellationError`
/// resets the status to `.idle`; any other error becomes
/// `.failed(error.localizedDescription)`.
///
/// NOTE(review): this span is a rendered diff without +/- markers, so two versions of
/// the body appear back to back. The inline `processTask = Task { ... }` block looks
/// like the pre-change code and the un-nested `do` block after it like the
/// replacement. Confirm against the checked-out file before relying on either.
///
/// - Parameters:
///   - inputs: Folder, session id, app label, mic/system/mixed URLs, timeline, self
///     spans, self name and system-health flag, all forwarded to `pipeline.process`.
///   - openEditorWhenDone: When true, `openEditor(folder:)` is called after
///     `finishBackend` returns.
private func runBackend(_ inputs: ProcessInputs, openEditorWhenDone: Bool) async {
let settings = self.settings
let voiceprints = self.voiceprints
// NOTE(review): apparent pre-change body starts here (inline task, recap via buildRecap).
processTask = Task {
let pipeline = TranscriptPipeline(
baseURL: settings.backendBaseURL,
skipTLS: settings.skipTLSVerification,
voiceprints: voiceprints)
do {
let speakers = try await pipeline.process(
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL,
timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName,
systemHealthy: inputs.systemHealthy,
progress: { done, total in
await MainActor.run { self.transcriptStatus = .processing(done, total) }
})
self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count)
// Best-effort readable recap (topic sections + extras) via the gateway LLM.
if settings.recapEnabled, !speakers.segments.isEmpty {
try Task.checkCancellation()
await self.buildRecap(speakers: speakers, inputs: inputs, settings: settings)
}
} catch is CancellationError {
self.transcriptStatus = .idle
} catch {
self.transcriptStatus = .failed(error.localizedDescription)
}
// NOTE(review): apparent replacement body starts here (no nested task, hands off to finishBackend).
let pipeline = TranscriptPipeline(baseURL: settings.backendBaseURL,
skipTLS: settings.skipTLSVerification, voiceprints: voiceprints)
do {
let speakers = try await pipeline.process(
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL,
timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName,
systemHealthy: inputs.systemHealthy,
// Progress updates are written to `transcriptStatus` on the main actor.
progress: { done, total in await MainActor.run { self.transcriptStatus = .processing(done, total) } })
// Publish the speaker and segment counts as soon as transcription returns.
self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count)
// Stop here if the task was cancelled during transcription; handled by the catch below.
try Task.checkCancellation()
await self.finishBackend(speakers: speakers, inputs: inputs, settings: settings)
if openEditorWhenDone { self.openEditor(folder: inputs.folder) }
} catch is CancellationError {
self.transcriptStatus = .idle
} catch {
self.transcriptStatus = .failed(error.localizedDescription)
}
}
/// Build `transcript.md` + `recap.html` from the finished `speakers.json` using
/// the gateway LLM. Best-effort: a missing LLM or any failure leaves the
/// transcript intact and just skips the recap.
private func buildRecap(speakers: SpeakersFile, inputs: ProcessInputs, settings: AppSettings) async {
let template = settings.defaultTemplate
/// Post-transcription LLM passes (best-effort, share one gateway model lookup):
/// reconcile speaker labels (merge split clusters + name from content), then build
/// the readable recap. A missing LLM or any failure leaves speakers.json intact.
private func finishBackend(speakers: SpeakersFile, inputs: ProcessInputs, settings: AppSettings) async {
let llm = GatewayLLMClient(baseURL: settings.backendBaseURL, skipTLS: settings.skipTLSVerification)
guard let model = await llm.chatModelId() else { return } // no LLM on the gateway skip
guard let model = await llm.chatModelId() else { return } // no LLM on the gateway skip both
var resolved = speakers
if settings.reconcileSpeakers, !speakers.segments.isEmpty {
self.transcriptStatus = .processing(0, 0)
let fps = RecapEditModel.loadFingerprints(inputs.folder.appendingPathComponent("cluster_fingerprints.json"))
resolved = await SpeakerReconciler.reconcile(file: speakers, fingerprints: fps,
selfName: inputs.selfName, llm: llm, model: model)
try? resolved.write(to: inputs.folder.appendingPathComponent("speakers.json"))
self.transcriptStatus = .done(speakers: resolved.speakers.count, segments: resolved.segments.count)
}
guard settings.recapEnabled, !resolved.segments.isEmpty else { return }
let analyzer = RecapAnalyzer(llm: llm, model: model)
guard let result = try? await analyzer.recap(file: speakers, template: template) else { return }
guard let result = try? await analyzer.recap(file: resolved, template: settings.defaultTemplate) else { return }
let title = Self.recapTitle(app: inputs.app, sessionId: inputs.sessionId)
try? RecapRenderer.write(file: speakers, result: result, title: title, to: inputs.folder)
try? RecapRenderer.write(file: resolved, result: result, title: title, to: inputs.folder)
try? RecapFile(title: title, result: result).write(to: inputs.folder.appendingPathComponent("recap.json"))
let url = inputs.folder.appendingPathComponent("recap.html")
if FileManager.default.fileExists(atPath: url.path) { self.recapURL = url }
@@ -433,14 +441,65 @@ final class SessionController: ObservableObject {
/// Open the speaker-correction editor for the last session.
func editLastSession() {
// NOTE(review): this is a rendered diff without +/- markers. The two lines below look
// like the start of the pre-change guard (its remainder now sits in `openEditor`);
// the `if let` after them is the replacement. Confirm against the checked-out file.
guard let folder = lastSession?.folder,
let model = RecapEditModel(folder: folder, voiceprints: voiceprints,
// No-op when there is no last session; otherwise delegate to `openEditor(folder:)`.
if let folder = lastSession?.folder { openEditor(folder: folder) }
}
/// Open the editor for any session folder that has a `speakers.json`.
///
/// Silently does nothing when no `RecapEditModel` can be built for the folder.
private func openEditor(folder: URL) {
    let editModel = RecapEditModel(
        folder: folder,
        voiceprints: voiceprints,
        baseURL: settings.backendBaseURL,
        skipTLS: settings.skipTLSVerification,
        templates: settings.recapTemplates,
        defaultTemplateId: settings.defaultTemplateId)
    if let editModel = editModel {
        EditorWindow.shared.show(model: editModel)
    }
}
/// Let the user choose a past session folder and open it.
///
/// A folder that already contains `speakers.json` goes straight to the editor.
/// Otherwise, provided `mic.wav` and `system.wav` exist and no run is in flight,
/// the inputs are rebuilt from disk and passed to `runBackend`, which is asked to
/// open the editor when it finishes. Cancelling the picker, missing raw tracks,
/// or an active run all return without doing anything.
func openSavedSession() {
    let picker = NSOpenPanel()
    picker.canChooseDirectories = true
    picker.canChooseFiles = false
    picker.allowsMultipleSelection = false
    picker.prompt = "Open"
    picker.message = "Choose a session folder"
    picker.directoryURL = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true)
    NSApp.activate(ignoringOtherApps: true)

    let response = picker.runModal()
    guard response == .OK, let chosen = picker.url else { return }

    let files = FileManager.default
    let speakersPath = chosen.appendingPathComponent("speakers.json").path
    guard !files.fileExists(atPath: speakersPath) else {
        // Already transcribed: just edit it.
        openEditor(folder: chosen)
        return
    }

    // Not transcribed yet: the raw tracks are required to (re)process.
    let micTrack = chosen.appendingPathComponent("mic.wav")
    let systemTrack = chosen.appendingPathComponent("system.wav")
    let hasRawTracks = files.fileExists(atPath: micTrack.path) && files.fileExists(atPath: systemTrack.path)
    guard hasRawTracks, !isProcessing else { return }

    transcriptStatus = .processing(0, 1)
    recapURL = nil
    let ownName = settings.selfName
    processTask = Task {
        // Self-span detection runs in a detached task; a nil result becomes an empty list.
        let detected = await Task.detached {
            ChannelSelfVAD.selfSpans(micURL: micTrack, systemURL: systemTrack)
        }.value
        let rebuilt = ProcessInputs(
            folder: chosen,
            sessionId: chosen.lastPathComponent,
            app: Self.appLabel(from: chosen),
            micURL: micTrack,
            systemURL: systemTrack,
            mixedURL: chosen.appendingPathComponent("mixed_mono_16k.wav"),
            timeline: Self.remoteTimeline(in: chosen),
            selfSpans: detected ?? [],
            selfName: ownName,
            systemHealthy: true)
        await self.runBackend(rebuilt, openEditorWhenDone: true)
    }
}
/// The remote (vision) visual-timeline segments saved for a session, if any.
///
/// Reads `visual_timeline.json` from `folder` and keeps only the segments whose
/// `source` is `"vision"`. An unreadable or undecodable file yields an empty array.
private static func remoteTimeline(in folder: URL) -> [VisualTimeline.Segment] {
    let timelineURL = folder.appendingPathComponent("visual_timeline.json")
    do {
        let raw = try Data(contentsOf: timelineURL)
        let decoded = try JSONDecoder().decode(VisualTimeline.self, from: raw)
        return decoded.segments.filter { segment in segment.source == "vision" }
    } catch {
        return []
    }
}
/// App label taken from the last `_`-separated piece of the session folder name
/// (a name ending in "_signal" gives "signal"). Falls back to "manual" when the
/// name splits into no pieces at all.
private static func appLabel(from folder: URL) -> String {
    let pieces = folder.lastPathComponent.split(separator: "_")
    guard let suffix = pieces.last else { return "manual" }
    return String(suffix)
}
private func fail(_ message: String) {
recorder = nil
visualCapture = nil // recorder.start() failed before visual started; nothing running