Fix mis-attributed fragments + LLM naming guardrails + re-process saved sessions
Investigating Grant's real 38-min group call: 'Marty' was a GARBAGE cluster (192 segs, 0.37s mean, 186 ≤2 words, 125 single words flanked by the same other speaker — diarization micro-fragments split mid-sentence, then LLM-named 'Marty'). Same for 'Message'/'HI'.

- SpeakerReconciler.smoothFragments: dissolve non-self clusters whose MEDIAN segment duration ≤ 1s (≥3 segs) — reassign each fragment to the temporally-nearest real speaker. (Median, not max, so one stray long segment can't rescue a fragment cluster — the bug in the first cut.) On the real call: 7 speakers (3 junk) → 4 real (Marty/Message/HI absorbed into Grant/Jonathan/Me/MH). Runs before LLM naming.
- LLM naming guardrails: forbid assigning the self name or ANY already-taken name to another voice (fixes 'Grant' = the user's name pinned on a remote speaker); prompt demands self-intro / direct-address evidence (mention ≠ presence), 'precision over coverage', one name per speaker.
- Open saved session now offers Open Editor vs Re-process, so newer logic can be applied to past calls (+ always-visible progress from the prior fix).

NOTE: the self-name guardrail needs the app to KNOW the user's name — selfName is still 'Me', so set it in Settings (e.g. 'Grant') so the LLM can't reuse it.

62/62 XCTest.
This commit is contained in:
@@ -470,12 +470,24 @@ final class SessionController: ObservableObject {
|
|||||||
guard panel.runModal() == .OK, let folder = panel.url else { return }
|
guard panel.runModal() == .OK, let folder = panel.url else { return }
|
||||||
let fm = FileManager.default
|
let fm = FileManager.default
|
||||||
if fm.fileExists(atPath: folder.appendingPathComponent("speakers.json").path) {
|
if fm.fileExists(atPath: folder.appendingPathComponent("speakers.json").path) {
|
||||||
|
// Already transcribed — edit it, or re-process to apply newer logic.
|
||||||
|
switch Self.editOrReprocess() {
|
||||||
|
case .edit:
|
||||||
if !openEditor(folder: folder) {
|
if !openEditor(folder: folder) {
|
||||||
Self.alert("Couldn't open this session — its transcript looks empty or unreadable.")
|
Self.alert("Couldn't open this session — its transcript looks empty or unreadable.")
|
||||||
}
|
}
|
||||||
|
case .reprocess: reprocess(folder)
|
||||||
|
case .cancel: break
|
||||||
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Not transcribed yet — needs the raw tracks to (re)process.
|
reprocess(folder) // not transcribed yet — must process
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Transcribe + reconcile + recap a saved session folder from its raw tracks, then
|
||||||
|
/// open the editor. Used by "Open saved session" (fresh, or re-process choice).
|
||||||
|
private func reprocess(_ folder: URL) {
|
||||||
|
let fm = FileManager.default
|
||||||
let mic = folder.appendingPathComponent("mic.wav")
|
let mic = folder.appendingPathComponent("mic.wav")
|
||||||
let sys = folder.appendingPathComponent("system.wav")
|
let sys = folder.appendingPathComponent("system.wav")
|
||||||
guard fm.fileExists(atPath: mic.path), fm.fileExists(atPath: sys.path) else {
|
guard fm.fileExists(atPath: mic.path), fm.fileExists(atPath: sys.path) else {
|
||||||
@@ -500,6 +512,22 @@ final class SessionController: ObservableObject {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// What the user chose to do with a saved session that already has a transcript.
private enum SavedAction {
    /// Open the existing transcript in the speaker editor.
    case edit
    /// Run the pipeline again from the session's audio.
    case reprocess
    /// Dismiss the prompt without doing anything.
    case cancel
}

/// Asks whether an already-transcribed session should be edited or re-processed.
///
/// Shows a modal alert with three buttons and maps the pressed button to a
/// `SavedAction`. Any response other than the first or second button is
/// treated as `.cancel`.
private static func editOrReprocess() -> SavedAction {
    let prompt = NSAlert()
    prompt.messageText = "This session is already transcribed"
    prompt.informativeText = "Open the speaker editor, or re-process it from the audio to apply the latest naming/cleanup."
    // Insertion order fixes the modal response: first, second, then third button.
    for title in ["Open Editor", "Re-process", "Cancel"] {
        prompt.addButton(withTitle: title)
    }
    NSApp.activate(ignoringOtherApps: true)

    let response = prompt.runModal()
    if response == .alertFirstButtonReturn { return .edit }
    if response == .alertSecondButtonReturn { return .reprocess }
    return .cancel
}
|
||||||
|
|
||||||
/// The remote (vision) visual-timeline segments saved for a session, if any.
|
/// The remote (vision) visual-timeline segments saved for a session, if any.
|
||||||
private static func remoteTimeline(in folder: URL) -> [VisualTimeline.Segment] {
|
private static func remoteTimeline(in folder: URL) -> [VisualTimeline.Segment] {
|
||||||
guard let data = try? Data(contentsOf: folder.appendingPathComponent("visual_timeline.json")),
|
guard let data = try? Data(contentsOf: folder.appendingPathComponent("visual_timeline.json")),
|
||||||
|
|||||||
@@ -11,31 +11,77 @@ import Foundation
|
|||||||
/// The merge math is pure/testable; the naming pass is one LLM call.
|
/// The merge math is pure/testable; the naming pass is one LLM call.
|
||||||
enum SpeakerReconciler {
|
enum SpeakerReconciler {
|
||||||
|
|
||||||
/// Full reconciliation: merge by voiceprint, then name by content.
|
/// Full reconciliation: merge by voiceprint → dissolve fragment clusters → name
|
||||||
|
/// remaining non-self clusters by content (guard-railed).
|
||||||
static func reconcile(file: SpeakersFile, fingerprints: [String: [Float]], selfName: String,
|
static func reconcile(file: SpeakersFile, fingerprints: [String: [Float]], selfName: String,
|
||||||
llm: GatewayLLMClient, model: String,
|
llm: GatewayLLMClient, model: String,
|
||||||
mergeThreshold: Double = 0.82) async -> SpeakersFile {
|
mergeThreshold: Double = 0.82) async -> SpeakersFile {
|
||||||
let protected = protectedNames(file, selfName: selfName)
|
let protected = protectedNames(file, selfName: selfName)
|
||||||
let merged = mergeByFingerprint(file, fingerprints: fingerprints, protected: protected, threshold: mergeThreshold)
|
let merged = mergeByFingerprint(file, fingerprints: fingerprints, protected: protected, threshold: mergeThreshold)
|
||||||
|
let smoothed = smoothFragments(merged, protected: protected)
|
||||||
|
|
||||||
// Name the non-self clusters from content.
|
// Name the non-self clusters from content.
|
||||||
let labels = SpeakerEditing.orderedSpeakers(merged.segments).filter { !protected.contains($0) }
|
let labels = SpeakerEditing.orderedSpeakers(smoothed.segments).filter { !protected.contains($0) }
|
||||||
guard !labels.isEmpty else { return merged }
|
guard !labels.isEmpty else { return smoothed }
|
||||||
let prompt = namingPrompt(file: merged, selfName: selfName, labels: labels)
|
// Names the LLM must NOT reuse for another speaker: self + everyone already named.
|
||||||
|
let forbidden = protected.union(labels.filter { !LabelMergeResponse.isUnknownName($0) })
|
||||||
|
let prompt = namingPrompt(file: smoothed, selfName: selfName, labels: labels, forbidden: forbidden)
|
||||||
guard let content = try? await llm.completeJSON(model: model, system: nil, user: prompt, maxTokens: 1024) else {
|
guard let content = try? await llm.completeJSON(model: model, system: nil, user: prompt, maxTokens: 1024) else {
|
||||||
return merged
|
return smoothed
|
||||||
}
|
}
|
||||||
let names = parseNaming(content)
|
let names = parseNaming(content)
|
||||||
var renamed = merged
|
var renamed = smoothed
|
||||||
|
var used = Set(SpeakerEditing.orderedSpeakers(smoothed.segments))
|
||||||
for (current, proposal) in names where current != proposal.name {
|
for (current, proposal) in names where current != proposal.name {
|
||||||
guard !proposal.name.isEmpty, proposal.confidence != "low",
|
let new = proposal.name
|
||||||
!protected.contains(current),
|
guard !new.isEmpty, proposal.confidence != "low",
|
||||||
!LabelMergeResponse.isUnknownName(proposal.name) else { continue }
|
!protected.contains(current), !LabelMergeResponse.isUnknownName(new),
|
||||||
renamed = apply(rename: current, to: proposal.name, source: "content", in: renamed)
|
!protected.contains(new), // never assign the self/protected name to another voice
|
||||||
|
!(used.contains(new) && new != current) // never collide with an already-present different speaker
|
||||||
|
else { continue }
|
||||||
|
renamed = apply(rename: current, to: new, source: "content", in: renamed)
|
||||||
|
used.remove(current); used.insert(new)
|
||||||
}
|
}
|
||||||
return renamed
|
return renamed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Dissolves fragment clusters into the temporally-nearest real speaker.
///
/// A non-protected "speaker" whose segments are MOSTLY tiny (median duration
/// ≤ `shortDur`) isn't a real participant — it's diarization micro-fragments
/// (single words split off mid-sentence). The median is used rather than the
/// max so that one stray longer segment can't rescue such a cluster. For an
/// even number of segments the upper of the two middle durations is used.
///
/// Each segment of a fragment cluster is reassigned to whichever of its
/// closest preceding / following real-speaker segments (in start-time order)
/// has the smaller time gap; on a tie the preceding speaker wins. Pure/testable.
///
/// - Parameters:
///   - file: The speakers file to clean up.
///   - protected: Speaker names that are never treated as fragment clusters.
///   - shortDur: Median duration (seconds) at or below which a cluster counts
///     as fragments.
///   - minSegs: Minimum number of segments a cluster needs before it is
///     judged; smaller clusters are always kept.
/// - Returns: `file` unchanged when there is nothing to dissolve or no real
///   speaker to absorb the fragments; otherwise a copy whose segments are
///   sorted by start time, with fragment segments relabelled and the speaker
///   list reduced to the speakers that still own segments.
static func smoothFragments(_ file: SpeakersFile, protected: Set<String>,
                            shortDur: Double = 1.0, minSegs: Int = 3) -> SpeakersFile {
    // Collect segment durations per speaker label.
    var durations: [String: [Double]] = [:]
    for segment in file.segments {
        durations[segment.speaker, default: []].append(segment.end - segment.start)
    }

    // Classify every cluster exactly once (one sort per speaker) instead of
    // re-sorting its durations each time a segment is inspected.
    var fragmentNames = Set<String>()
    for (name, values) in durations where !protected.contains(name) && values.count >= minSegs {
        let ordered = values.sorted()
        if !(ordered[ordered.count / 2] > shortDur) { fragmentNames.insert(name) }
    }

    // Need at least one fragment cluster AND at least one real speaker to absorb it.
    guard !fragmentNames.isEmpty, fragmentNames.count < durations.count else { return file }

    let timeline = file.segments.sorted { $0.start < $1.start }

    // For each position, the index of the closest real-speaker segment before
    // and after it, computed in two linear passes.
    var previousReal = [Int?](repeating: nil, count: timeline.count)
    var nextReal = [Int?](repeating: nil, count: timeline.count)
    var lastReal: Int? = nil
    for index in timeline.indices {
        previousReal[index] = lastReal
        if !fragmentNames.contains(timeline[index].speaker) { lastReal = index }
    }
    lastReal = nil
    for index in timeline.indices.reversed() {
        nextReal[index] = lastReal
        if !fragmentNames.contains(timeline[index].speaker) { lastReal = index }
    }

    var result = timeline
    for index in timeline.indices where fragmentNames.contains(timeline[index].speaker) {
        let fragment = timeline[index]
        var bestName: String?
        var bestGap = Double.greatestFiniteMagnitude
        if let before = previousReal[index] {
            let gap = fragment.start - timeline[before].end
            if gap < bestGap { bestGap = gap; bestName = timeline[before].speaker }
        }
        if let after = nextReal[index] {
            let gap = timeline[after].start - fragment.end
            // Strict comparison: the preceding speaker keeps the fragment on a tie.
            if gap < bestGap { bestGap = gap; bestName = timeline[after].speaker }
        }
        if let name = bestName {
            result[index] = SpeakersFile.Segment(start: fragment.start, end: fragment.end,
                                                 speaker: name, text: fragment.text)
        }
    }

    // Keep only speakers that still own a segment, reusing their original
    // metadata when present.
    let keep = SpeakerEditing.orderedSpeakers(result)
    let speakers = keep.map { name in
        file.speakers.first(where: { $0.name == name })
            ?? SpeakersFile.Speaker(name: name, source: "reconciled",
                                    overlapConfidence: nil, matchSimilarity: nil)
    }
    return SpeakersFile(sessionId: file.sessionId, app: file.app, durationSec: file.durationSec,
                        speakers: speakers, segments: result, models: file.models)
}
|
||||||
|
|
||||||
// MARK: - Voiceprint merge (pure)
|
// MARK: - Voiceprint merge (pure)
|
||||||
|
|
||||||
static func protectedNames(_ file: SpeakersFile, selfName: String) -> Set<String> {
|
static func protectedNames(_ file: SpeakersFile, selfName: String) -> Set<String> {
|
||||||
@@ -108,19 +154,23 @@ enum SpeakerReconciler {
|
|||||||
|
|
||||||
// MARK: - LLM content naming
|
// MARK: - LLM content naming
|
||||||
|
|
||||||
static func namingPrompt(file: SpeakersFile, selfName: String, labels: [String]) -> String {
|
static func namingPrompt(file: SpeakersFile, selfName: String, labels: [String], forbidden: Set<String>) -> String {
|
||||||
let entries = RecapAnalyzer.entries(from: file)
|
let entries = RecapAnalyzer.entries(from: file)
|
||||||
let transcript = RecapAnalyzer.cappedTranscript(entries, maxChars: 20_000)
|
let transcript = RecapAnalyzer.cappedTranscript(entries, maxChars: 20_000)
|
||||||
|
let forbiddenList = forbidden.sorted().joined(separator: ", ")
|
||||||
return """
|
return """
|
||||||
You are reconciling speaker labels in a diarized transcript. The voices were separated acoustically and labeled with placeholder initials or "Unknown_N". Your ONLY job is to map a placeholder to a person's REAL name when the conversation clearly reveals it — someone is addressed by name, introduces themselves, or is unambiguously referred to. If a label's real name is not clearly revealed, KEEP IT (return null). Never guess.
|
You are reconciling speaker labels in a diarized transcript. The voices were separated acoustically and labeled with placeholder initials or "Unknown_N". Your ONLY job is to map a placeholder to a person's REAL name when the conversation UNAMBIGUOUSLY reveals it — they introduce themselves ("this is Sarah"), or are directly addressed AND respond. Hearing a name mentioned is NOT enough; people are talked ABOUT without being on the call. When in doubt, return null. Precision matters far more than coverage — a wrong name is worse than no name.
|
||||||
|
|
||||||
|
"\(selfName)" is the local user (their own channel) and is already correct.
|
||||||
|
Do NOT assign any of these already-taken names to a different speaker: \(forbiddenList)
|
||||||
|
Each real name may be used for AT MOST ONE label.
|
||||||
|
|
||||||
SELF (already correct — never reassign): \(selfName)
|
|
||||||
LABELS TO RESOLVE: \(labels.joined(separator: ", "))
|
LABELS TO RESOLVE: \(labels.joined(separator: ", "))
|
||||||
|
|
||||||
TRANSCRIPT (each line is "[<label> <MM:SS>] text"):
|
TRANSCRIPT (each line is "[<label> <MM:SS>] text"):
|
||||||
\(transcript)
|
\(transcript)
|
||||||
|
|
||||||
Respond with ONLY valid JSON, no other text:
|
Respond with ONLY valid JSON, no other text. Use "high" confidence only when a label introduced themselves or was directly addressed and answered:
|
||||||
{
|
{
|
||||||
"speakers": [
|
"speakers": [
|
||||||
{"current": "<label>", "name": "Real Name" or null, "confidence": "high" | "medium" | "low"}
|
{"current": "<label>", "name": "Real Name" or null, "confidence": "high" | "medium" | "low"}
|
||||||
|
|||||||
@@ -49,6 +49,30 @@ final class SpeakerReconcilerTests: XCTestCase {
|
|||||||
XCTAssertEqual(out.speakers.count, 2)
|
XCTAssertEqual(out.speakers.count, 2)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A cluster made mostly of micro-segments (the "Marty" pattern: median ≤ 1s)
/// is absorbed into the surrounding real speaker, even though it also owns
/// one longer stray segment.
func testSmoothDissolvesFragmentCluster() {
    let speakers = [sp("Grant", "content"), sp("Frag", "content")]
    // Four "Frag" segments: three micro (0.3s, 0.2s, 0.3s) plus one 2s stray.
    let segments = [
        seg(0, 4, "Grant"), seg(4.0, 4.3, "Frag"), seg(4.4, 8, "Grant"),
        seg(20, 24, "Grant"), seg(24.0, 24.2, "Frag"), seg(24.3, 28, "Grant"),
        seg(30, 30.3, "Frag"), seg(31, 33, "Frag"),
    ]

    let smoothed = SpeakerReconciler.smoothFragments(file(speakers, segments), protected: [])

    // median(Frag) = 0.3 ≤ 1 → dissolved, leaving only the real speaker.
    XCTAssertEqual(Set(smoothed.speakers.map { $0.name }), ["Grant"])
    XCTAssertFalse(smoothed.segments.contains { $0.speaker == "Frag" })
}
|
||||||
|
|
||||||
|
/// A speaker whose segments are mostly long survives smoothing even when one
/// of its segments is tiny.
func testSmoothKeepsRealSpeakerWithMostlyLongSegs() {
    // Durations 3s, 3s, 0.2s → median 3 → real.
    let input = file([sp("A", "content")],
                     [seg(0, 3, "A"), seg(3, 6, "A"), seg(6, 6.2, "A")])
    let names = SpeakerReconciler.smoothFragments(input, protected: []).speakers.map { $0.name }
    XCTAssertEqual(names, ["A"])
}
|
||||||
|
|
||||||
|
/// A protected (self) speaker is never dissolved, even when every one of its
/// segments is short enough to look like a fragment cluster.
func testSmoothProtectsSelfEvenIfAllShort() {
    // "Me" needs at least `minSegs` (default 3) short segments: with fewer, the
    // cluster is kept by the "too few to judge" rule and the `protected` check
    // is never what saves it.
    let f = file([sp("Me", "mic_channel"), sp("A", "content")],
                 [seg(0, 0.3, "Me"), seg(1, 4, "A"), seg(4, 4.2, "Me"), seg(5, 5.3, "Me")])
    let out = SpeakerReconciler.smoothFragments(f, protected: ["Me"])
    XCTAssertTrue(out.speakers.contains { $0.name == "Me" }) // self never dissolved
    // None of the self segments may be handed to the other speaker.
    XCTAssertEqual(out.segments.filter { $0.speaker == "Me" }.count, 3)
}
|
||||||
|
|
||||||
func testParseNamingDropsNullAndKeepsConfidence() {
|
func testParseNamingDropsNullAndKeepsConfidence() {
|
||||||
let json = #"{"speakers":[{"current":"MH","name":"Jonathan Kirkwood","confidence":"high"},{"current":"Unknown_0","name":null,"confidence":"low"}]}"#
|
let json = #"{"speakers":[{"current":"MH","name":"Jonathan Kirkwood","confidence":"high"},{"current":"Unknown_0","name":null,"confidence":"low"}]}"#
|
||||||
let m = SpeakerReconciler.parseNaming(json)
|
let m = SpeakerReconciler.parseNaming(json)
|
||||||
|
|||||||
Reference in New Issue
Block a user