diff --git a/Ten31Transcripts/Session/SessionController.swift b/Ten31Transcripts/Session/SessionController.swift index f12e045..8e05dae 100644 --- a/Ten31Transcripts/Session/SessionController.swift +++ b/Ten31Transcripts/Session/SessionController.swift @@ -470,12 +470,24 @@ final class SessionController: ObservableObject { guard panel.runModal() == .OK, let folder = panel.url else { return } let fm = FileManager.default if fm.fileExists(atPath: folder.appendingPathComponent("speakers.json").path) { - if !openEditor(folder: folder) { - Self.alert("Couldn't open this session — its transcript looks empty or unreadable.") + // Already transcribed — edit it, or re-process to apply newer logic. + switch Self.editOrReprocess() { + case .edit: + if !openEditor(folder: folder) { + Self.alert("Couldn't open this session — its transcript looks empty or unreadable.") + } + case .reprocess: reprocess(folder) + case .cancel: break } return } - // Not transcribed yet — needs the raw tracks to (re)process. + reprocess(folder) // not transcribed yet — must process + } + + /// Transcribe + reconcile + recap a saved session folder from its raw tracks, then + /// open the editor. Used by "Open saved session" (fresh, or re-process choice). + private func reprocess(_ folder: URL) { + let fm = FileManager.default let mic = folder.appendingPathComponent("mic.wav") let sys = folder.appendingPathComponent("system.wav") guard fm.fileExists(atPath: mic.path), fm.fileExists(atPath: sys.path) else { @@ -500,6 +512,22 @@ final class SessionController: ObservableObject { } } + private enum SavedAction { case edit, reprocess, cancel } + private static func editOrReprocess() -> SavedAction { + let a = NSAlert() + a.messageText = "This session is already transcribed" + a.informativeText = "Open the speaker editor, or re-process it from the audio to apply the latest naming/cleanup." 
+ a.addButton(withTitle: "Open Editor") // .alertFirstButtonReturn + a.addButton(withTitle: "Re-process") // .alertSecondButtonReturn + a.addButton(withTitle: "Cancel") // .alertThirdButtonReturn + NSApp.activate(ignoringOtherApps: true) + switch a.runModal() { + case .alertFirstButtonReturn: return .edit + case .alertSecondButtonReturn: return .reprocess + default: return .cancel + } + } + /// The remote (vision) visual-timeline segments saved for a session, if any. private static func remoteTimeline(in folder: URL) -> [VisualTimeline.Segment] { guard let data = try? Data(contentsOf: folder.appendingPathComponent("visual_timeline.json")), diff --git a/Ten31Transcripts/Session/SpeakerReconciler.swift b/Ten31Transcripts/Session/SpeakerReconciler.swift index be92780..e7b654e 100644 --- a/Ten31Transcripts/Session/SpeakerReconciler.swift +++ b/Ten31Transcripts/Session/SpeakerReconciler.swift @@ -11,31 +11,77 @@ import Foundation /// The merge math is pure/testable; the naming pass is one LLM call. enum SpeakerReconciler { - /// Full reconciliation: merge by voiceprint, then name by content. + /// Full reconciliation: merge by voiceprint → dissolve fragment clusters → name + /// remaining non-self clusters by content (guard-railed). static func reconcile(file: SpeakersFile, fingerprints: [String: [Float]], selfName: String, llm: GatewayLLMClient, model: String, mergeThreshold: Double = 0.82) async -> SpeakersFile { let protected = protectedNames(file, selfName: selfName) let merged = mergeByFingerprint(file, fingerprints: fingerprints, protected: protected, threshold: mergeThreshold) + let smoothed = smoothFragments(merged, protected: protected) // Name the non-self clusters from content. 
- let labels = SpeakerEditing.orderedSpeakers(merged.segments).filter { !protected.contains($0) } - guard !labels.isEmpty else { return merged } - let prompt = namingPrompt(file: merged, selfName: selfName, labels: labels) + let labels = SpeakerEditing.orderedSpeakers(smoothed.segments).filter { !protected.contains($0) } + guard !labels.isEmpty else { return smoothed } + // Names the LLM must NOT reuse for another speaker: self + everyone already named. + let forbidden = protected.union(labels.filter { !LabelMergeResponse.isUnknownName($0) }) + let prompt = namingPrompt(file: smoothed, selfName: selfName, labels: labels, forbidden: forbidden) guard let content = try? await llm.completeJSON(model: model, system: nil, user: prompt, maxTokens: 1024) else { - return merged + return smoothed } let names = parseNaming(content) - var renamed = merged + var renamed = smoothed + var used = Set(SpeakerEditing.orderedSpeakers(smoothed.segments)) for (current, proposal) in names where current != proposal.name { - guard !proposal.name.isEmpty, proposal.confidence != "low", - !protected.contains(current), - !LabelMergeResponse.isUnknownName(proposal.name) else { continue } - renamed = apply(rename: current, to: proposal.name, source: "content", in: renamed) + let new = proposal.name + guard !new.isEmpty, proposal.confidence != "low", + !protected.contains(current), !LabelMergeResponse.isUnknownName(new), + !protected.contains(new), // never assign the self/protected name to another voice + !(used.contains(new) && new != current) // never collide with an already-present different speaker + else { continue } + renamed = apply(rename: current, to: new, source: "content", in: renamed) + used.remove(current); used.insert(new) } return renamed } + /// Dissolve fragment clusters: a non-self "speaker" whose segments are MOSTLY tiny + /// (median duration ≤ `shortDur`) isn't a real participant — it's diarization + /// micro-fragments (single words split off mid-sentence; one stray longer 
segment + /// shouldn't rescue it, so we use the median, not the max). Reassign each of its + /// segments to the temporally-nearest real speaker. Pure/testable. + static func smoothFragments(_ file: SpeakersFile, protected: Set<String>, + shortDur: Double = 1.0, minSegs: Int = 3) -> SpeakersFile { + var durs: [String: [Double]] = [:] + for s in file.segments { durs[s.speaker, default: []].append(s.end - s.start) } + func isReal(_ name: String) -> Bool { + if protected.contains(name) { return true } + guard let d = durs[name], d.count >= minSegs else { return true } // too few to judge → keep + let sorted = d.sorted() + return sorted[sorted.count / 2] > shortDur // median > shortDur → real + } + guard file.segments.contains(where: { isReal($0.speaker) }), + file.segments.contains(where: { !isReal($0.speaker) }) else { return file } + + let out = file.segments.sorted { $0.start < $1.start } + var result = out + for i in out.indices where !isReal(out[i].speaker) { + var bestName: String?, bestGap = Double.greatestFiniteMagnitude + var j = i - 1 + while j >= 0 { if isReal(out[j].speaker) { let gap = out[i].start - out[j].end; if gap < bestGap { bestGap = gap; bestName = out[j].speaker }; break }; j -= 1 } + var k = i + 1 + while k < out.count { if isReal(out[k].speaker) { let gap = out[k].start - out[i].end; if gap < bestGap { bestGap = gap; bestName = out[k].speaker }; break }; k += 1 } + if let name = bestName { + let s = out[i] + result[i] = SpeakersFile.Segment(start: s.start, end: s.end, speaker: name, text: s.text) + } + } + let keep = SpeakerEditing.orderedSpeakers(result) + let speakers = keep.map { n in file.speakers.first { $0.name == n } ?? 
SpeakersFile.Speaker(name: n, source: "reconciled", overlapConfidence: nil, matchSimilarity: nil) } + return SpeakersFile(sessionId: file.sessionId, app: file.app, durationSec: file.durationSec, + speakers: speakers, segments: result, models: file.models) + } + // MARK: - Voiceprint merge (pure) static func protectedNames(_ file: SpeakersFile, selfName: String) -> Set<String> { @@ -108,19 +154,23 @@ enum SpeakerReconciler { // MARK: - LLM content naming - static func namingPrompt(file: SpeakersFile, selfName: String, labels: [String]) -> String { + static func namingPrompt(file: SpeakersFile, selfName: String, labels: [String], forbidden: Set<String>) -> String { let entries = RecapAnalyzer.entries(from: file) let transcript = RecapAnalyzer.cappedTranscript(entries, maxChars: 20_000) + let forbiddenList = forbidden.sorted().joined(separator: ", ") return """ - You are reconciling speaker labels in a diarized transcript. The voices were separated acoustically and labeled with placeholder initials or "Unknown_N". Your ONLY job is to map a placeholder to a person's REAL name when the conversation clearly reveals it — someone is addressed by name, introduces themselves, or is unambiguously referred to. If a label's real name is not clearly revealed, KEEP IT (return null). Never guess. + You are reconciling speaker labels in a diarized transcript. The voices were separated acoustically and labeled with placeholder initials or "Unknown_N". Your ONLY job is to map a placeholder to a person's REAL name when the conversation UNAMBIGUOUSLY reveals it — they introduce themselves ("this is Sarah"), or are directly addressed AND respond. Hearing a name mentioned is NOT enough; people are talked ABOUT without being on the call. When in doubt, return null. Precision matters far more than coverage — a wrong name is worse than no name. + + "\(selfName)" is the local user (their own channel) and is already correct. 
+ Do NOT assign any of these already-taken names to a different speaker: \(forbiddenList) + Each real name may be used for AT MOST ONE label. - SELF (already correct — never reassign): \(selfName) LABELS TO RESOLVE: \(labels.joined(separator: ", ")) TRANSCRIPT (each line is "[