From 6d0c8be8c965a00ffefa6514ca3e86c7a6b299d3 Mon Sep 17 00:00:00 2001 From: Grant Gilliam Date: Mon, 8 Jun 2026 11:54:41 -0500 Subject: [PATCH] Speaker reconciliation + open/re-process any saved session MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reconciliation (the marry-the-signals layer): after transcription, before the recap, SpeakerReconciler (1) MERGES non-self clusters whose voiceprints are highly similar (cosine >= 0.82) — fixes a person split across chunks (the real 1-on-1 failure: one remote participant came back as 'MH' + 'Unknown_0'); and (2) NAMES remaining non-self clusters from transcript CONTENT via the gateway LLM (people addressed by name / self-intros), conservative + confidence-gated, keeping the placeholder when the name is not revealed. The mic-channel self is protected and never reassigned. Voice does the segmentation; the fingerprint-merge fixes splits; the LLM adds the content signal that the visual and voiceprint cues lack. - SpeakerReconciler: pure cosine merge (tested) + LLM content-naming pass; rewrites speakers.json before recap. SessionController.finishBackend shares one model lookup for reconcile + recap. Gated by settings.reconcileSpeakers (default on). - Open saved session: menu 'Open saved session…' → folder picker. Opens it in the editor if already transcribed; otherwise reconstructs the inputs from disk (visual_timeline vision segs + channel self-spans) and runs transcribe → reconcile → recap, then opens the editor. Lets you evaluate/correct ANY past call, not just the in-memory last one. Note (from real Signal data): visual naming is unreliable on Signal (sparse, misread initials, lowercase/center names) — so reconciliation + the editor (which teaches voiceprints on confirm) carry the naming; the editor remains the human arbiter. 59/59 XCTest. 
--- Ten31Transcripts/Recap/RecapEditModel.swift | 2 +- .../Session/SessionController.swift | 135 +++++++++++----- .../Session/SpeakerReconciler.swift | 146 ++++++++++++++++++ Ten31Transcripts/Settings/AppSettings.swift | 9 ++ Ten31Transcripts/UI/MenuBarView.swift | 4 + Ten31Transcripts/UI/SettingsView.swift | 1 + .../SpeakerReconcilerTests.swift | 59 +++++++ 7 files changed, 317 insertions(+), 39 deletions(-) create mode 100644 Ten31Transcripts/Session/SpeakerReconciler.swift create mode 100644 Ten31TranscriptsTests/SpeakerReconcilerTests.swift diff --git a/Ten31Transcripts/Recap/RecapEditModel.swift b/Ten31Transcripts/Recap/RecapEditModel.swift index 208fda3..8ddb475 100644 --- a/Ten31Transcripts/Recap/RecapEditModel.swift +++ b/Ten31Transcripts/Recap/RecapEditModel.swift @@ -170,7 +170,7 @@ final class RecapEditModel: ObservableObject { } } - private static func loadFingerprints(_ url: URL) -> [String: [Float]] { + static func loadFingerprints(_ url: URL) -> [String: [Float]] { guard let data = try? Data(contentsOf: url), let obj = try? JSONSerialization.jsonObject(with: data) as? [String: [Any]] else { return [:] } return obj.mapValues { $0.compactMap { ($0 as? NSNumber)?.floatValue } } diff --git a/Ten31Transcripts/Session/SessionController.swift b/Ten31Transcripts/Session/SessionController.swift index 0823158..06c75a2 100644 --- a/Ten31Transcripts/Session/SessionController.swift +++ b/Ten31Transcripts/Session/SessionController.swift @@ -361,52 +361,60 @@ final class SessionController: ObservableObject { /// when visual capture ran, or the self spans alone otherwise. Safe to call /// manually ("Send to backend") or automatically on stop. 
func processLastSession() { - guard let inputs = lastProcess else { return } - if case .processing = transcriptStatus { return } + guard let inputs = lastProcess, !isProcessing else { return } transcriptStatus = .processing(0, 1) recapURL = nil + processTask = Task { await self.runBackend(inputs, openEditorWhenDone: false) } + } + private var isProcessing: Bool { if case .processing = transcriptStatus { return true }; return false } + + /// Transcribe + reconcile + recap one session's inputs. Shared by "Send to + /// backend" and "Open saved session". + private func runBackend(_ inputs: ProcessInputs, openEditorWhenDone: Bool) async { let settings = self.settings - let voiceprints = self.voiceprints - processTask = Task { - let pipeline = TranscriptPipeline( - baseURL: settings.backendBaseURL, - skipTLS: settings.skipTLSVerification, - voiceprints: voiceprints) - do { - let speakers = try await pipeline.process( - sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app, - micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL, - timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName, - systemHealthy: inputs.systemHealthy, - progress: { done, total in - await MainActor.run { self.transcriptStatus = .processing(done, total) } - }) - self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count) - // Best-effort readable recap (topic sections + extras) via the gateway LLM. 
- if settings.recapEnabled, !speakers.segments.isEmpty { - try Task.checkCancellation() - await self.buildRecap(speakers: speakers, inputs: inputs, settings: settings) - } - } catch is CancellationError { - self.transcriptStatus = .idle - } catch { - self.transcriptStatus = .failed(error.localizedDescription) - } + let pipeline = TranscriptPipeline(baseURL: settings.backendBaseURL, + skipTLS: settings.skipTLSVerification, voiceprints: voiceprints) + do { + let speakers = try await pipeline.process( + sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app, + micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL, + timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName, + systemHealthy: inputs.systemHealthy, + progress: { done, total in await MainActor.run { self.transcriptStatus = .processing(done, total) } }) + self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count) + try Task.checkCancellation() + await self.finishBackend(speakers: speakers, inputs: inputs, settings: settings) + if openEditorWhenDone { self.openEditor(folder: inputs.folder) } + } catch is CancellationError { + self.transcriptStatus = .idle + } catch { + self.transcriptStatus = .failed(error.localizedDescription) } } - /// Build `transcript.md` + `recap.html` from the finished `speakers.json` using - /// the gateway LLM. Best-effort: a missing LLM or any failure leaves the - /// transcript intact and just skips the recap. - private func buildRecap(speakers: SpeakersFile, inputs: ProcessInputs, settings: AppSettings) async { - let template = settings.defaultTemplate + /// Post-transcription LLM passes (best-effort, share one gateway model lookup): + /// reconcile speaker labels (merge split clusters + name from content), then build + /// the readable recap. A missing LLM or any failure leaves speakers.json intact. 
+ private func finishBackend(speakers: SpeakersFile, inputs: ProcessInputs, settings: AppSettings) async { let llm = GatewayLLMClient(baseURL: settings.backendBaseURL, skipTLS: settings.skipTLSVerification) - guard let model = await llm.chatModelId() else { return } // no LLM on the gateway → skip + guard let model = await llm.chatModelId() else { return } // no LLM on the gateway → skip both + + var resolved = speakers + if settings.reconcileSpeakers, !speakers.segments.isEmpty { + self.transcriptStatus = .processing(0, 0) + let fps = RecapEditModel.loadFingerprints(inputs.folder.appendingPathComponent("cluster_fingerprints.json")) + resolved = await SpeakerReconciler.reconcile(file: speakers, fingerprints: fps, + selfName: inputs.selfName, llm: llm, model: model) + try? resolved.write(to: inputs.folder.appendingPathComponent("speakers.json")) + self.transcriptStatus = .done(speakers: resolved.speakers.count, segments: resolved.segments.count) + } + + guard settings.recapEnabled, !resolved.segments.isEmpty else { return } let analyzer = RecapAnalyzer(llm: llm, model: model) - guard let result = try? await analyzer.recap(file: speakers, template: template) else { return } + guard let result = try? await analyzer.recap(file: resolved, template: settings.defaultTemplate) else { return } let title = Self.recapTitle(app: inputs.app, sessionId: inputs.sessionId) - try? RecapRenderer.write(file: speakers, result: result, title: title, to: inputs.folder) + try? RecapRenderer.write(file: resolved, result: result, title: title, to: inputs.folder) try? RecapFile(title: title, result: result).write(to: inputs.folder.appendingPathComponent("recap.json")) let url = inputs.folder.appendingPathComponent("recap.html") if FileManager.default.fileExists(atPath: url.path) { self.recapURL = url } @@ -433,14 +441,65 @@ final class SessionController: ObservableObject { /// Open the speaker-correction editor for the last session. 
func editLastSession() { - guard let folder = lastSession?.folder, - let model = RecapEditModel(folder: folder, voiceprints: voiceprints, + if let folder = lastSession?.folder { openEditor(folder: folder) } + } + + /// Open the editor for any session folder that has a `speakers.json`. + private func openEditor(folder: URL) { + guard let model = RecapEditModel(folder: folder, voiceprints: voiceprints, baseURL: settings.backendBaseURL, skipTLS: settings.skipTLSVerification, templates: settings.recapTemplates, defaultTemplateId: settings.defaultTemplateId) else { return } EditorWindow.shared.show(model: model) } + /// Pick any past session folder and open it: edit it if already transcribed, + /// otherwise transcribe + reconcile + recap it first, then open the editor. + func openSavedSession() { + let panel = NSOpenPanel() + panel.canChooseDirectories = true + panel.canChooseFiles = false + panel.allowsMultipleSelection = false + panel.prompt = "Open" + panel.message = "Choose a session folder" + panel.directoryURL = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true) + NSApp.activate(ignoringOtherApps: true) + guard panel.runModal() == .OK, let folder = panel.url else { return } + let fm = FileManager.default + if fm.fileExists(atPath: folder.appendingPathComponent("speakers.json").path) { + openEditor(folder: folder) + return + } + // Not transcribed yet — needs the raw tracks to (re)process. + let mic = folder.appendingPathComponent("mic.wav") + let sys = folder.appendingPathComponent("system.wav") + guard fm.fileExists(atPath: mic.path), fm.fileExists(atPath: sys.path), !isProcessing else { return } + transcriptStatus = .processing(0, 1) + recapURL = nil + let selfName = settings.selfName + processTask = Task { + let selfSpans = await Task.detached { ChannelSelfVAD.selfSpans(micURL: mic, systemURL: sys) }.value ?? 
[] + let inputs = ProcessInputs( + folder: folder, sessionId: folder.lastPathComponent, app: Self.appLabel(from: folder), + micURL: mic, systemURL: sys, mixedURL: folder.appendingPathComponent("mixed_mono_16k.wav"), + timeline: Self.remoteTimeline(in: folder), selfSpans: selfSpans, + selfName: selfName, systemHealthy: true) + await self.runBackend(inputs, openEditorWhenDone: true) + } + } + + /// The remote (vision) visual-timeline segments saved for a session, if any. + private static func remoteTimeline(in folder: URL) -> [VisualTimeline.Segment] { + guard let data = try? Data(contentsOf: folder.appendingPathComponent("visual_timeline.json")), + let vt = try? JSONDecoder().decode(VisualTimeline.self, from: data) else { return [] } + return vt.segments.filter { $0.source == "vision" } + } + + /// App label from a session folder name like "…_signal". + private static func appLabel(from folder: URL) -> String { + folder.lastPathComponent.split(separator: "_").last.map(String.init) ?? "manual" + } + private func fail(_ message: String) { recorder = nil visualCapture = nil // recorder.start() failed before visual started; nothing running diff --git a/Ten31Transcripts/Session/SpeakerReconciler.swift b/Ten31Transcripts/Session/SpeakerReconciler.swift new file mode 100644 index 0000000..be92780 --- /dev/null +++ b/Ten31Transcripts/Session/SpeakerReconciler.swift @@ -0,0 +1,146 @@ +import Foundation + +/// Reconciles the backend's per-cluster speaker labels into cleaner identities: +/// 1. **Merge** non-self clusters whose voiceprints are highly similar — fixes one +/// person being split across chunks (e.g. "MH" + "Unknown_0" → one person). +/// 2. **Name** remaining non-self clusters from the transcript *content* (people +/// addressed by name, self-introductions) via the gateway LLM — fixes wrong/initial +/// labels that the visual cue produced. 
Conservative: keeps the current label when +/// the content doesn't clearly reveal a name; never touches the mic-channel self. +/// +/// The merge math is pure/testable; the naming pass is one LLM call. +enum SpeakerReconciler { + + /// Full reconciliation: merge by voiceprint, then name by content. + static func reconcile(file: SpeakersFile, fingerprints: [String: [Float]], selfName: String, + llm: GatewayLLMClient, model: String, + mergeThreshold: Double = 0.82) async -> SpeakersFile { + let protected = protectedNames(file, selfName: selfName) + let merged = mergeByFingerprint(file, fingerprints: fingerprints, protected: protected, threshold: mergeThreshold) + + // Name the non-self clusters from content. + let labels = SpeakerEditing.orderedSpeakers(merged.segments).filter { !protected.contains($0) } + guard !labels.isEmpty else { return merged } + let prompt = namingPrompt(file: merged, selfName: selfName, labels: labels) + guard let content = try? await llm.completeJSON(model: model, system: nil, user: prompt, maxTokens: 1024) else { + return merged + } + let names = parseNaming(content) + var renamed = merged + for (current, proposal) in names where current != proposal.name { + guard !proposal.name.isEmpty, proposal.confidence != "low", + !protected.contains(current), + !LabelMergeResponse.isUnknownName(proposal.name) else { continue } + renamed = apply(rename: current, to: proposal.name, source: "content", in: renamed) + } + return renamed + } + + // MARK: - Voiceprint merge (pure) + + static func protectedNames(_ file: SpeakersFile, selfName: String) -> Set { + var p: Set = [selfName] + for s in file.speakers where s.source == "mic_channel" { p.insert(s.name) } + return p + } + + static func cosine(_ a: [Float], _ b: [Float]) -> Double { + guard a.count == b.count, !a.isEmpty else { return 0 } + var dot = 0.0, na = 0.0, nb = 0.0 + for i in 0.. 
0, nb > 0 else { return 0 } + return dot / (na.squareRoot() * nb.squareRoot()) + } + + /// Greedily merge non-self clusters with cosine similarity ≥ threshold. The + /// survivor is the "better-named" one (a real name beats Unknown; higher + /// confidence wins ties). Segments + the speaker roster are remapped. + static func mergeByFingerprint(_ file: SpeakersFile, fingerprints: [String: [Float]], + protected: Set, threshold: Double) -> SpeakersFile { + let names = file.speakers.map { $0.name }.filter { !protected.contains($0) && fingerprints[$0] != nil } + guard names.count > 1 else { return file } + let rank = Dictionary(uniqueKeysWithValues: file.speakers.map { ($0.name, $0) }) + + var canonical: [String: String] = [:] // name -> survivor + for n in names { canonical[n] = n } + func find(_ x: String) -> String { var r = x; while canonical[r]! != r { r = canonical[r]! }; return r } + + for i in 0..= threshold { + let survivor = better(a, b, rank: rank) + let absorbed = survivor == a ? b : a + canonical[absorbed] = survivor + } + } + } + let map = Dictionary(uniqueKeysWithValues: names.map { ($0, find($0)) }).filter { $0.key != $0.value } + guard !map.isEmpty else { return file } + + let segments = file.segments.map { s in map[s.speaker].map { + SpeakersFile.Segment(start: s.start, end: s.end, speaker: $0, text: s.text) } ?? s } + let keep = SpeakerEditing.orderedSpeakers(segments) + let speakers = keep.map { rank[$0] ?? SpeakersFile.Speaker(name: $0, source: "reconciled", overlapConfidence: nil, matchSimilarity: nil) } + return SpeakersFile(sessionId: file.sessionId, app: file.app, durationSec: file.durationSec, + speakers: speakers, segments: segments, models: file.models) + } + + /// Prefer a real name over Unknown; otherwise the higher-confidence cluster. 
+ private static func better(_ a: String, _ b: String, rank: [String: SpeakersFile.Speaker]) -> String { + let au = LabelMergeResponse.isUnknownName(a), bu = LabelMergeResponse.isUnknownName(b) + if au != bu { return au ? b : a } + let ca = (rank[a]?.overlapConfidence ?? rank[a]?.matchSimilarity ?? 0) + let cb = (rank[b]?.overlapConfidence ?? rank[b]?.matchSimilarity ?? 0) + return ca >= cb ? a : b + } + + private static func apply(rename current: String, to new: String, source: String, in file: SpeakersFile) -> SpeakersFile { + let segments = SpeakerEditing.replaceSpeaker(current, with: new, in: file.segments) + let speakers = SpeakerEditing.orderedSpeakers(segments).map { name -> SpeakersFile.Speaker in + if name == new { return SpeakersFile.Speaker(name: new, source: source, overlapConfidence: nil, matchSimilarity: nil) } + return file.speakers.first { $0.name == name } ?? SpeakersFile.Speaker(name: name, source: "reconciled", overlapConfidence: nil, matchSimilarity: nil) + } + return SpeakersFile(sessionId: file.sessionId, app: file.app, durationSec: file.durationSec, + speakers: speakers, segments: segments, models: file.models) + } + + // MARK: - LLM content naming + + static func namingPrompt(file: SpeakersFile, selfName: String, labels: [String]) -> String { + let entries = RecapAnalyzer.entries(from: file) + let transcript = RecapAnalyzer.cappedTranscript(entries, maxChars: 20_000) + return """ + You are reconciling speaker labels in a diarized transcript. The voices were separated acoustically and labeled with placeholder initials or "Unknown_N". Your ONLY job is to map a placeholder to a person's REAL name when the conversation clearly reveals it — someone is addressed by name, introduces themselves, or is unambiguously referred to. If a label's real name is not clearly revealed, KEEP IT (return null). Never guess. 
+ + SELF (already correct — never reassign): \(selfName) + LABELS TO RESOLVE: \(labels.joined(separator: ", ")) + + TRANSCRIPT (each line is "[