Speaker reconciliation + open/re-process any saved session
Reconciliation (the marry-the-signals layer): after transcription and before the recap, SpeakerReconciler (1) MERGES non-self clusters whose voiceprints are highly similar (cosine >= 0.82) — this fixes a person being split across chunks (the real 1-on-1 failure: one remote speaker came back as 'MH' + 'Unknown_0'); and (2) NAMES the remaining non-self clusters from transcript CONTENT via the gateway LLM (people addressed by name / self-intros). The naming pass is conservative and confidence-gated, and it keeps the placeholder when the content does not reveal a name. The mic-channel self is protected and never reassigned. Voice does the segmentation; the fingerprint merge fixes splits; the LLM adds the content signal that the visual and voiceprint cues lack. - SpeakerReconciler: pure cosine merge (tested) + LLM content-naming pass; rewrites speakers.json before the recap. SessionController.finishBackend shares one model lookup for reconcile + recap. Gated by settings.reconcileSpeakers (default on). - Open saved session: menu 'Open saved session…' → folder picker. Opens the editor if the session is already transcribed; otherwise reconstructs the inputs from disk (visual_timeline vision segs + channel self-spans) and runs transcribe → reconcile → recap, then opens the editor. Lets you evaluate/correct ANY past call, not just the in-memory last one. Note (from real Signal data): visual naming is unreliable on Signal (sparse, misread initials, lowercase/center names) — so reconciliation + the editor (which teaches voiceprints on confirm) carry it; the editor remains the human arbiter. 59/59 XCTest.
This commit is contained in:
@@ -361,52 +361,60 @@ final class SessionController: ObservableObject {
|
||||
/// when visual capture ran, or the self spans alone otherwise. Safe to call
|
||||
/// manually ("Send to backend") or automatically on stop.
|
||||
func processLastSession() {
|
||||
guard let inputs = lastProcess else { return }
|
||||
if case .processing = transcriptStatus { return }
|
||||
guard let inputs = lastProcess, !isProcessing else { return }
|
||||
transcriptStatus = .processing(0, 1)
|
||||
recapURL = nil
|
||||
processTask = Task { await self.runBackend(inputs, openEditorWhenDone: false) }
|
||||
}
|
||||
|
||||
/// `true` while `transcriptStatus` is in its `.processing` state, `false` otherwise.
private var isProcessing: Bool {
    guard case .processing = transcriptStatus else { return false }
    return true
}
|
||||
|
||||
/// Transcribe + reconcile + recap one session's inputs. Shared by "Send to
|
||||
/// backend" and "Open saved session".
|
||||
private func runBackend(_ inputs: ProcessInputs, openEditorWhenDone: Bool) async {
|
||||
let settings = self.settings
|
||||
let voiceprints = self.voiceprints
|
||||
processTask = Task {
|
||||
let pipeline = TranscriptPipeline(
|
||||
baseURL: settings.backendBaseURL,
|
||||
skipTLS: settings.skipTLSVerification,
|
||||
voiceprints: voiceprints)
|
||||
do {
|
||||
let speakers = try await pipeline.process(
|
||||
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
|
||||
micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL,
|
||||
timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName,
|
||||
systemHealthy: inputs.systemHealthy,
|
||||
progress: { done, total in
|
||||
await MainActor.run { self.transcriptStatus = .processing(done, total) }
|
||||
})
|
||||
self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count)
|
||||
// Best-effort readable recap (topic sections + extras) via the gateway LLM.
|
||||
if settings.recapEnabled, !speakers.segments.isEmpty {
|
||||
try Task.checkCancellation()
|
||||
await self.buildRecap(speakers: speakers, inputs: inputs, settings: settings)
|
||||
}
|
||||
} catch is CancellationError {
|
||||
self.transcriptStatus = .idle
|
||||
} catch {
|
||||
self.transcriptStatus = .failed(error.localizedDescription)
|
||||
}
|
||||
let pipeline = TranscriptPipeline(baseURL: settings.backendBaseURL,
|
||||
skipTLS: settings.skipTLSVerification, voiceprints: voiceprints)
|
||||
do {
|
||||
let speakers = try await pipeline.process(
|
||||
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
|
||||
micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL,
|
||||
timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName,
|
||||
systemHealthy: inputs.systemHealthy,
|
||||
progress: { done, total in await MainActor.run { self.transcriptStatus = .processing(done, total) } })
|
||||
self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count)
|
||||
try Task.checkCancellation()
|
||||
await self.finishBackend(speakers: speakers, inputs: inputs, settings: settings)
|
||||
if openEditorWhenDone { self.openEditor(folder: inputs.folder) }
|
||||
} catch is CancellationError {
|
||||
self.transcriptStatus = .idle
|
||||
} catch {
|
||||
self.transcriptStatus = .failed(error.localizedDescription)
|
||||
}
|
||||
}
|
||||
|
||||
/// Build `transcript.md` + `recap.html` from the finished `speakers.json` using
|
||||
/// the gateway LLM. Best-effort: a missing LLM or any failure leaves the
|
||||
/// transcript intact and just skips the recap.
|
||||
private func buildRecap(speakers: SpeakersFile, inputs: ProcessInputs, settings: AppSettings) async {
|
||||
let template = settings.defaultTemplate
|
||||
/// Post-transcription LLM passes (best-effort, share one gateway model lookup):
|
||||
/// reconcile speaker labels (merge split clusters + name from content), then build
|
||||
/// the readable recap. A missing LLM or any failure leaves speakers.json intact.
|
||||
private func finishBackend(speakers: SpeakersFile, inputs: ProcessInputs, settings: AppSettings) async {
|
||||
let llm = GatewayLLMClient(baseURL: settings.backendBaseURL, skipTLS: settings.skipTLSVerification)
|
||||
guard let model = await llm.chatModelId() else { return } // no LLM on the gateway → skip
|
||||
guard let model = await llm.chatModelId() else { return } // no LLM on the gateway → skip both
|
||||
|
||||
var resolved = speakers
|
||||
if settings.reconcileSpeakers, !speakers.segments.isEmpty {
|
||||
self.transcriptStatus = .processing(0, 0)
|
||||
let fps = RecapEditModel.loadFingerprints(inputs.folder.appendingPathComponent("cluster_fingerprints.json"))
|
||||
resolved = await SpeakerReconciler.reconcile(file: speakers, fingerprints: fps,
|
||||
selfName: inputs.selfName, llm: llm, model: model)
|
||||
try? resolved.write(to: inputs.folder.appendingPathComponent("speakers.json"))
|
||||
self.transcriptStatus = .done(speakers: resolved.speakers.count, segments: resolved.segments.count)
|
||||
}
|
||||
|
||||
guard settings.recapEnabled, !resolved.segments.isEmpty else { return }
|
||||
let analyzer = RecapAnalyzer(llm: llm, model: model)
|
||||
guard let result = try? await analyzer.recap(file: speakers, template: template) else { return }
|
||||
guard let result = try? await analyzer.recap(file: resolved, template: settings.defaultTemplate) else { return }
|
||||
let title = Self.recapTitle(app: inputs.app, sessionId: inputs.sessionId)
|
||||
try? RecapRenderer.write(file: speakers, result: result, title: title, to: inputs.folder)
|
||||
try? RecapRenderer.write(file: resolved, result: result, title: title, to: inputs.folder)
|
||||
try? RecapFile(title: title, result: result).write(to: inputs.folder.appendingPathComponent("recap.json"))
|
||||
let url = inputs.folder.appendingPathComponent("recap.html")
|
||||
if FileManager.default.fileExists(atPath: url.path) { self.recapURL = url }
|
||||
@@ -433,14 +441,65 @@ final class SessionController: ObservableObject {
|
||||
|
||||
/// Open the speaker-correction editor for the last session.
|
||||
func editLastSession() {
|
||||
guard let folder = lastSession?.folder,
|
||||
let model = RecapEditModel(folder: folder, voiceprints: voiceprints,
|
||||
if let folder = lastSession?.folder { openEditor(folder: folder) }
|
||||
}
|
||||
|
||||
/// Show the speaker-correction editor for `folder`.
///
/// Builds a `RecapEditModel` from the folder plus the current voiceprints and
/// backend/template settings. The initializer is failable; when it returns
/// `nil` nothing is shown.
private func openEditor(folder: URL) {
    let editModel = RecapEditModel(
        folder: folder,
        voiceprints: voiceprints,
        baseURL: settings.backendBaseURL,
        skipTLS: settings.skipTLSVerification,
        templates: settings.recapTemplates,
        defaultTemplateId: settings.defaultTemplateId)
    if let editModel = editModel {
        EditorWindow.shared.show(model: editModel)
    }
}
|
||||
|
||||
/// Pick any past session folder and open it: edit it if already transcribed,
/// otherwise transcribe + reconcile + recap it first, then open the editor.
///
/// - A folder containing `speakers.json` is opened in the editor directly.
/// - Otherwise the folder must contain both `mic.wav` and `system.wav`. If a
///   track is missing, `transcriptStatus` becomes `.failed` with a message so
///   the pick does not silently do nothing.
/// - While another run is processing, an untranscribed folder is ignored so the
///   in-flight run's status is left untouched.
func openSavedSession() {
    let panel = NSOpenPanel()
    panel.canChooseDirectories = true
    panel.canChooseFiles = false
    panel.allowsMultipleSelection = false
    panel.prompt = "Open"
    panel.message = "Choose a session folder"
    panel.directoryURL = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true)
    NSApp.activate(ignoringOtherApps: true)
    guard panel.runModal() == .OK, let folder = panel.url else { return }

    let fm = FileManager.default
    if fm.fileExists(atPath: folder.appendingPathComponent("speakers.json").path) {
        openEditor(folder: folder)
        return
    }

    // Not transcribed yet. Never start a second run (or overwrite the status of
    // the one in flight) while a session is already being processed.
    guard !isProcessing else { return }

    // (Re)processing needs the raw tracks.
    let mic = folder.appendingPathComponent("mic.wav")
    let sys = folder.appendingPathComponent("system.wav")
    guard fm.fileExists(atPath: mic.path), fm.fileExists(atPath: sys.path) else {
        transcriptStatus = .failed(
            "\(folder.lastPathComponent) has no speakers.json and is missing mic.wav or system.wav")
        return
    }

    transcriptStatus = .processing(0, 1)
    recapURL = nil
    let selfName = settings.selfName
    processTask = Task {
        let selfSpans = await Task.detached { ChannelSelfVAD.selfSpans(micURL: mic, systemURL: sys) }.value ?? []
        // The detached VAD pass does not observe cancellation of this task, so
        // check here before starting the backend run. `.idle` matches what
        // `runBackend` reports when it is cancelled.
        guard !Task.isCancelled else {
            self.transcriptStatus = .idle
            return
        }
        let inputs = ProcessInputs(
            folder: folder, sessionId: folder.lastPathComponent, app: Self.appLabel(from: folder),
            micURL: mic, systemURL: sys, mixedURL: folder.appendingPathComponent("mixed_mono_16k.wav"),
            timeline: Self.remoteTimeline(in: folder), selfSpans: selfSpans,
            selfName: selfName, systemHealthy: true)
        await self.runBackend(inputs, openEditorWhenDone: true)
    }
}
|
||||
|
||||
/// Loads `visual_timeline.json` from `folder` and returns only the segments
/// whose `source` is `"vision"`. Returns an empty array when the file cannot be
/// read or decoded.
private static func remoteTimeline(in folder: URL) -> [VisualTimeline.Segment] {
    let timelineURL = folder.appendingPathComponent("visual_timeline.json")
    guard let raw = try? Data(contentsOf: timelineURL) else { return [] }
    guard let timeline = try? JSONDecoder().decode(VisualTimeline.self, from: raw) else { return [] }
    return timeline.segments.filter { segment in segment.source == "vision" }
}
|
||||
|
||||
/// App label from a session folder name like "…_signal": the text after the
/// last underscore.
///
/// Falls back to `"manual"` when the name has no underscore or nothing follows
/// the last one, so an arbitrary folder such as "recording" is not reported as
/// an app called "recording".
private static func appLabel(from folder: URL) -> String {
    // Keep empty pieces so a trailing underscore ("foo_") is seen as an empty
    // suffix rather than silently yielding "foo".
    let parts = folder.lastPathComponent.split(separator: "_", omittingEmptySubsequences: false)
    guard parts.count > 1, let suffix = parts.last, !suffix.isEmpty else { return "manual" }
    return String(suffix)
}
|
||||
|
||||
private func fail(_ message: String) {
|
||||
recorder = nil
|
||||
visualCapture = nil // recorder.start() failed before visual started; nothing running
|
||||
|
||||
@@ -0,0 +1,146 @@
|
||||
import Foundation
|
||||
|
||||
/// Reconciles the backend's per-cluster speaker labels into cleaner identities:
///   1. **Merge** non-self clusters whose voiceprints are highly similar — fixes one
///      person being split across chunks (e.g. "MH" + "Unknown_0" → one person).
///   2. **Name** remaining non-self clusters from the transcript *content* (people
///      addressed by name, self-introductions) via the gateway LLM — fixes wrong/initial
///      labels that the visual cue produced. Conservative: keeps the current label when
///      the content doesn't clearly reveal a name; never touches the mic-channel self.
///
/// The merge math is pure/testable; the naming pass is one LLM call.
enum SpeakerReconciler {

    /// Confidence values (lower-cased) that are trusted enough to apply a rename.
    /// Anything else — "low" or an unrecognised value — keeps the current label.
    private static let acceptedConfidences: Set<String> = ["high", "medium"]

    /// Full reconciliation: merge by voiceprint, then name by content.
    ///
    /// - Parameters:
    ///   - file: The transcription result to clean up.
    ///   - fingerprints: Voiceprint vector per speaker name; speakers without one are never merged.
    ///   - selfName: The local user's name; always protected.
    ///   - llm: Gateway client used for the single naming call.
    ///   - model: Chat model id passed to the gateway.
    ///   - mergeThreshold: Minimum cosine similarity for two clusters to be merged.
    /// - Returns: The reconciled file. If there is nothing to name, or the LLM call
    ///   fails, the merge-only result is returned.
    static func reconcile(file: SpeakersFile, fingerprints: [String: [Float]], selfName: String,
                          llm: GatewayLLMClient, model: String,
                          mergeThreshold: Double = 0.82) async -> SpeakersFile {
        let protected = protectedNames(file, selfName: selfName)
        let merged = mergeByFingerprint(file, fingerprints: fingerprints, protected: protected, threshold: mergeThreshold)

        // Name the non-self clusters from content.
        let labels = SpeakerEditing.orderedSpeakers(merged.segments).filter { !protected.contains($0) }
        guard !labels.isEmpty else { return merged }
        let prompt = namingPrompt(file: merged, selfName: selfName, labels: labels)
        guard let content = try? await llm.completeJSON(model: model, system: nil, user: prompt, maxTokens: 1024) else {
            return merged
        }
        let names = parseNaming(content)
        var renamed = merged
        // Walk the labels we actually offered, in transcript order. This ignores
        // any label the model invented and makes the outcome deterministic
        // (dictionary iteration order is not).
        for current in labels {
            guard let proposal = names[current], proposal.name != current, !proposal.name.isEmpty else { continue }
            let confidence = proposal.confidence.trimmingCharacters(in: .whitespacesAndNewlines)
            guard acceptedConfidences.contains(confidence),
                  // Never rename a remote cluster INTO a protected name: that would
                  // fold someone else's speech into the mic-channel self and
                  // overwrite the self's roster entry.
                  !protected.contains(proposal.name),
                  !LabelMergeResponse.isUnknownName(proposal.name) else { continue }
            renamed = apply(rename: current, to: proposal.name, source: "content", in: renamed)
        }
        return renamed
    }

    // MARK: - Voiceprint merge (pure)

    /// Names that must never be merged or renamed: `selfName` plus every roster
    /// entry whose source is `"mic_channel"`.
    static func protectedNames(_ file: SpeakersFile, selfName: String) -> Set<String> {
        var protected: Set<String> = [selfName]
        for speaker in file.speakers where speaker.source == "mic_channel" {
            protected.insert(speaker.name)
        }
        return protected
    }

    /// Cosine similarity of two equal-length vectors.
    ///
    /// Returns 0 when the lengths differ, the vectors are empty, or either has
    /// zero magnitude. Components are widened to `Double` before multiplying so
    /// the sums are not rounded to `Float` precision first.
    static func cosine(_ a: [Float], _ b: [Float]) -> Double {
        guard a.count == b.count, !a.isEmpty else { return 0 }
        var dot = 0.0, na = 0.0, nb = 0.0
        for (x, y) in zip(a, b) {
            let dx = Double(x), dy = Double(y)
            dot += dx * dy
            na += dx * dx
            nb += dy * dy
        }
        guard na > 0, nb > 0 else { return 0 }
        return dot / (na.squareRoot() * nb.squareRoot())
    }

    /// Greedily merge non-self clusters with cosine similarity ≥ threshold. The
    /// survivor is the "better-named" one (a real name beats Unknown; higher
    /// confidence wins ties). Segments + the speaker roster are remapped.
    ///
    /// Returns `file` unchanged when fewer than two candidates have a fingerprint
    /// or no pair reaches the threshold.
    static func mergeByFingerprint(_ file: SpeakersFile, fingerprints: [String: [Float]],
                                   protected: Set<String>, threshold: Double) -> SpeakersFile {
        // Candidates: non-protected roster names that have a fingerprint,
        // de-duplicated in roster order so a repeated name cannot trap the
        // dictionary initialisers below.
        var seen = Set<String>()
        let names = file.speakers.map { $0.name }.filter {
            !protected.contains($0) && fingerprints[$0] != nil && seen.insert($0).inserted
        }
        guard names.count > 1 else { return file }
        let rank = Dictionary(file.speakers.map { ($0.name, $0) }, uniquingKeysWith: { first, _ in first })

        // name -> survivor; every candidate starts as its own survivor.
        var canonical = Dictionary(uniqueKeysWithValues: names.map { ($0, $0) })
        func find(_ x: String) -> String {
            var root = x
            while let parent = canonical[root], parent != root { root = parent }
            return root
        }

        for i in 0..<names.count {
            for j in (i + 1)..<names.count {
                let a = find(names[i]), b = find(names[j])
                guard a != b, let fa = fingerprints[a], let fb = fingerprints[b] else { continue }
                if cosine(fa, fb) >= threshold {
                    let survivor = better(a, b, rank: rank)
                    let absorbed = survivor == a ? b : a
                    canonical[absorbed] = survivor
                }
            }
        }

        // Keep only the names that were actually absorbed into another one.
        var map: [String: String] = [:]
        for name in names {
            let root = find(name)
            if root != name { map[name] = root }
        }
        guard !map.isEmpty else { return file }

        let segments = file.segments.map { s in map[s.speaker].map {
            SpeakersFile.Segment(start: s.start, end: s.end, speaker: $0, text: s.text) } ?? s }
        let keep = SpeakerEditing.orderedSpeakers(segments)
        let speakers = keep.map { rank[$0] ?? SpeakersFile.Speaker(name: $0, source: "reconciled", overlapConfidence: nil, matchSimilarity: nil) }
        return SpeakersFile(sessionId: file.sessionId, app: file.app, durationSec: file.durationSec,
                            speakers: speakers, segments: segments, models: file.models)
    }

    /// Prefer a real name over Unknown; otherwise the higher-confidence cluster
    /// (`overlapConfidence`, falling back to `matchSimilarity`, then 0). `a` wins ties.
    private static func better(_ a: String, _ b: String, rank: [String: SpeakersFile.Speaker]) -> String {
        let au = LabelMergeResponse.isUnknownName(a), bu = LabelMergeResponse.isUnknownName(b)
        if au != bu { return au ? b : a }
        let ca = (rank[a]?.overlapConfidence ?? rank[a]?.matchSimilarity ?? 0)
        let cb = (rank[b]?.overlapConfidence ?? rank[b]?.matchSimilarity ?? 0)
        return ca >= cb ? a : b
    }

    /// Rename `current` to `new` in every segment and rebuild the roster from the
    /// segments. The entry for `new` is recreated with the given `source`; other
    /// speakers keep their existing roster entry when one exists.
    private static func apply(rename current: String, to new: String, source: String, in file: SpeakersFile) -> SpeakersFile {
        let segments = SpeakerEditing.replaceSpeaker(current, with: new, in: file.segments)
        let speakers = SpeakerEditing.orderedSpeakers(segments).map { name -> SpeakersFile.Speaker in
            if name == new { return SpeakersFile.Speaker(name: new, source: source, overlapConfidence: nil, matchSimilarity: nil) }
            return file.speakers.first { $0.name == name } ?? SpeakersFile.Speaker(name: name, source: "reconciled", overlapConfidence: nil, matchSimilarity: nil)
        }
        return SpeakersFile(sessionId: file.sessionId, app: file.app, durationSec: file.durationSec,
                            speakers: speakers, segments: segments, models: file.models)
    }

    // MARK: - LLM content naming

    /// Builds the user prompt for the naming pass: the instructions, the protected
    /// self name, the labels to resolve, and the transcript capped at 20,000 characters.
    static func namingPrompt(file: SpeakersFile, selfName: String, labels: [String]) -> String {
        let entries = RecapAnalyzer.entries(from: file)
        let transcript = RecapAnalyzer.cappedTranscript(entries, maxChars: 20_000)
        return """
        You are reconciling speaker labels in a diarized transcript. The voices were separated acoustically and labeled with placeholder initials or "Unknown_N". Your ONLY job is to map a placeholder to a person's REAL name when the conversation clearly reveals it — someone is addressed by name, introduces themselves, or is unambiguously referred to. If a label's real name is not clearly revealed, KEEP IT (return null). Never guess.

        SELF (already correct — never reassign): \(selfName)
        LABELS TO RESOLVE: \(labels.joined(separator: ", "))

        TRANSCRIPT (each line is "[<label> <MM:SS>] text"):
        \(transcript)

        Respond with ONLY valid JSON, no other text:
        {
        "speakers": [
        {"current": "<label>", "name": "Real Name" or null, "confidence": "high" | "medium" | "low"}
        ]
        }
        """
    }

    /// Parses the LLM's naming reply into `current label -> (name, confidence)`.
    ///
    /// Entries with a missing/empty `current`, or a missing, empty or literal
    /// "null" `name`, are dropped. A missing confidence defaults to "medium";
    /// confidence is lower-cased. Malformed JSON yields an empty dictionary.
    static func parseNaming(_ content: String) -> [String: (name: String, confidence: String)] {
        let cleaned = GatewayLLMClient.stripCodeFence(content)
        guard let o = (try? JSONSerialization.jsonObject(with: Data(cleaned.utf8))) as? [String: Any],
              let arr = o["speakers"] as? [[String: Any]] else { return [:] }
        var out: [String: (name: String, confidence: String)] = [:]
        for d in arr {
            guard let cur = (d["current"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines), !cur.isEmpty,
                  let name = (d["name"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines),
                  !name.isEmpty, name.lowercased() != "null" else { continue }
            let conf = (d["confidence"] as? String)?.lowercased() ?? "medium"
            out[cur] = (name, conf)
        }
        return out
    }
}
|
||||
Reference in New Issue
Block a user