Files
ten31-transcripts/Ten31Transcripts/Session/SpeakerReconciler.swift
T
Grant Gilliam 6d0c8be8c9 Speaker reconciliation + open/re-process any saved session
Reconciliation (the marry-the-signals layer): after transcription, before the recap,
SpeakerReconciler (1) MERGES non-self clusters whose voiceprints are highly similar
(cosine >= 0.82) — fixes a person split across chunks (the real 1-on-1 failure: one
remote came back as 'MH' + 'Unknown_0'); and (2) NAMES remaining non-self clusters
from transcript CONTENT via the gateway LLM (people addressed by name / self-intros),
conservative + confidence-gated, keeping the placeholder when unrevealed. The
mic-channel self is protected and never reassigned. Voice does the segmentation; the
fingerprint-merge fixes splits; the LLM adds the content signal visual/voiceprint lack.

- SpeakerReconciler: pure cosine merge (tested) + LLM content-naming pass; rewrites
  speakers.json before recap. SessionController.finishBackend shares one model lookup
  for reconcile + recap. Gated by settings.reconcileSpeakers (default on).
- Open saved session: menu 'Open saved session…' → folder picker. Edits it if already
  transcribed, else reconstructs inputs from disk (visual_timeline vision segs +
  channel self-spans) and runs transcribe → reconcile → recap, then opens the editor.
  Lets you evaluate/correct ANY past call, not just the in-memory last one.

Note (from real Signal data): visual naming is unreliable on Signal (sparse, misread
initials, lowercase/center names) — so reconciliation + the editor (which teaches
voiceprints on confirm) carry it; the editor remains the human arbiter. 59/59 XCTest.
2026-06-08 11:54:41 -05:00

147 lines
8.2 KiB
Swift

import Foundation
/// Reconciles the backend's per-cluster speaker labels into cleaner identities:
/// 1. **Merge** non-self clusters whose voiceprints are highly similar. This fixes one
///    person being split across chunks (e.g. "MH" + "Unknown_0" are really one person).
/// 2. **Name** remaining non-self clusters from the transcript *content* (people
///    addressed by name, self-introductions) via the gateway LLM. This fixes wrong or
///    initials-only labels that the visual cue produced. Conservative: keeps the current
///    label when the content doesn't clearly reveal a name; never touches the mic-channel self.
///
/// The merge math is pure/testable; the naming pass is one LLM call.
enum SpeakerReconciler {
/// Full reconciliation: merge by voiceprint, then name by content.
///
/// - Parameters:
///   - file: The speakers file to reconcile.
///   - fingerprints: Voiceprint vector per speaker label, used by the merge step.
///   - selfName: The local user's name; it and every `mic_channel` speaker are protected.
///   - llm: Gateway client used for the single content-naming call.
///   - model: Model identifier passed to the gateway.
///   - mergeThreshold: Minimum cosine similarity for two clusters to be merged.
/// - Returns: The merged and (where the content clearly reveals a name) renamed file.
///   If there is nothing to name, or the LLM call fails, the merge-only result is returned.
static func reconcile(file: SpeakersFile, fingerprints: [String: [Float]], selfName: String,
                      llm: GatewayLLMClient, model: String,
                      mergeThreshold: Double = 0.82) async -> SpeakersFile {
    let protected = protectedNames(file, selfName: selfName)
    let merged = mergeByFingerprint(file, fingerprints: fingerprints, protected: protected, threshold: mergeThreshold)
    // Only non-protected labels that actually appear in the merged segments are candidates.
    let labels = SpeakerEditing.orderedSpeakers(merged.segments).filter { !protected.contains($0) }
    guard !labels.isEmpty else { return merged }
    let prompt = namingPrompt(file: merged, selfName: selfName, labels: labels)
    // Deliberate best effort: a failed LLM call must not lose the merge result.
    guard let content = try? await llm.completeJSON(model: model, system: nil, user: prompt, maxTokens: 1024) else {
        return merged
    }
    let names = parseNaming(content)
    var renamed = merged
    // Walk `labels` rather than the parsed dictionary:
    //  * dictionary iteration order is unspecified, so chained or colliding proposals
    //    would otherwise produce a different result from run to run;
    //  * a `current` the LLM invented (not one of our labels) is ignored instead of
    //    being handed to `apply`, which would rewrite an unrelated roster entry.
    for current in labels {
        guard let proposal = names[current], proposal.name != current,
              !proposal.name.isEmpty, proposal.confidence != "low",
              // Never fold a remote cluster into the self / mic-channel speaker: that
              // would replace the protected roster entry with a "content" one.
              !protected.contains(proposal.name),
              !LabelMergeResponse.isUnknownName(proposal.name) else { continue }
        renamed = apply(rename: current, to: proposal.name, source: "content", in: renamed)
    }
    return renamed
}
// MARK: - Voiceprint merge (pure)
/// Names that reconciliation must leave alone: `selfName` plus the name of every
/// roster entry whose `source` is `"mic_channel"`.
static func protectedNames(_ file: SpeakersFile, selfName: String) -> Set<String> {
    let micChannelNames = file.speakers
        .filter { $0.source == "mic_channel" }
        .map { $0.name }
    return Set([selfName]).union(micChannelNames)
}
/// Cosine similarity of two vectors, accumulated in `Double`.
///
/// - Returns: `dot(a, b) / (|a| * |b|)`, or `0` when the vectors are empty, differ in
///   length, or either has zero magnitude.
static func cosine(_ a: [Float], _ b: [Float]) -> Double {
    guard a.count == b.count, !a.isEmpty else { return 0 }
    var dot = 0.0, normA = 0.0, normB = 0.0
    for (x, y) in zip(a, b) {
        // Widen BEFORE multiplying: a Float product can overflow to +inf (yielding NaN
        // below) or underflow to 0 (yielding a bogus 0) and loses precision either way.
        let dx = Double(x), dy = Double(y)
        dot += dx * dy
        normA += dx * dx
        normB += dy * dy
    }
    guard normA > 0, normB > 0 else { return 0 }
    return dot / (normA.squareRoot() * normB.squareRoot())
}
/// Greedily merges non-protected clusters whose voiceprint cosine similarity is at or
/// above `threshold`. The survivor of each merge is the "better-named" cluster (a real
/// name beats Unknown; higher confidence wins ties). Segments are relabelled to the
/// survivor and the roster is rebuilt from the speakers that still appear in segments.
///
/// - Parameters:
///   - file: The speakers file to merge.
///   - fingerprints: Voiceprint vector per speaker label; labels without one are not merged.
///   - protected: Labels that must never be merged (the self / mic-channel speakers).
///   - threshold: Minimum cosine similarity for two clusters to be considered one person.
/// - Returns: `file` unchanged when nothing merges, otherwise a new file with remapped
///   segments and roster.
static func mergeByFingerprint(_ file: SpeakersFile, fingerprints: [String: [Float]],
                               protected: Set<String>, threshold: Double) -> SpeakersFile {
    // Candidates: non-protected roster names that have a fingerprint. De-duplicated in
    // roster order so a repeated roster name cannot trap the dictionary builds below.
    var seen = Set<String>()
    let names = file.speakers.map { $0.name }.filter {
        !protected.contains($0) && fingerprints[$0] != nil && seen.insert($0).inserted
    }
    guard names.count > 1 else { return file }
    // `uniqueKeysWithValues:` is a runtime error on duplicate keys; keep the first roster
    // entry for a repeated name instead of crashing on malformed input.
    let rank = Dictionary(file.speakers.map { ($0.name, $0) }, uniquingKeysWith: { first, _ in first })
    var canonical = Dictionary(uniqueKeysWithValues: names.map { ($0, $0) }) // name -> survivor
    // Follow survivor links up to the root of a cluster (no force-unwraps: a missing link
    // simply ends the walk).
    func find(_ x: String) -> String {
        var root = x
        while let parent = canonical[root], parent != root { root = parent }
        return root
    }
    for i in 0..<names.count {
        for j in (i + 1)..<names.count {
            // Compare the current survivors of each side, not the raw names.
            let a = find(names[i]), b = find(names[j])
            guard a != b, let fa = fingerprints[a], let fb = fingerprints[b] else { continue }
            if cosine(fa, fb) >= threshold {
                let survivor = better(a, b, rank: rank)
                let absorbed = survivor == a ? b : a
                canonical[absorbed] = survivor
            }
        }
    }
    // Only names that were actually absorbed need remapping.
    var map: [String: String] = [:]
    for name in names {
        let root = find(name)
        if root != name { map[name] = root }
    }
    guard !map.isEmpty else { return file }
    let segments = file.segments.map { s in map[s.speaker].map {
        SpeakersFile.Segment(start: s.start, end: s.end, speaker: $0, text: s.text) } ?? s }
    let keep = SpeakerEditing.orderedSpeakers(segments)
    let speakers = keep.map { rank[$0] ?? SpeakersFile.Speaker(name: $0, source: "reconciled", overlapConfidence: nil, matchSimilarity: nil) }
    return SpeakersFile(sessionId: file.sessionId, app: file.app, durationSec: file.durationSec,
                        speakers: speakers, segments: segments, models: file.models)
}
/// Picks which of two cluster labels survives a merge: a real name wins over an
/// Unknown placeholder; when both are the same kind, the one with the higher
/// confidence wins (`overlapConfidence`, else `matchSimilarity`, else 0), with `a`
/// winning ties.
private static func better(_ a: String, _ b: String, rank: [String: SpeakersFile.Speaker]) -> String {
    let aIsUnknown = LabelMergeResponse.isUnknownName(a)
    let bIsUnknown = LabelMergeResponse.isUnknownName(b)
    // Exactly one side is a placeholder: keep the other.
    guard aIsUnknown == bIsUnknown else { return aIsUnknown ? b : a }
    let entryA = rank[a], entryB = rank[b]
    let scoreA = entryA?.overlapConfidence ?? entryA?.matchSimilarity ?? 0
    let scoreB = entryB?.overlapConfidence ?? entryB?.matchSimilarity ?? 0
    if scoreA >= scoreB { return a }
    return b
}
/// Relabels every segment spoken by `current` as `new` and rebuilds the roster from
/// the speakers that appear in the relabelled segments: the `new` entry is created
/// fresh with the given `source`, other names reuse their first existing roster entry,
/// and names without one get a `"reconciled"` placeholder entry.
private static func apply(rename current: String, to new: String, source: String, in file: SpeakersFile) -> SpeakersFile {
    let relabeled = SpeakerEditing.replaceSpeaker(current, with: new, in: file.segments)
    var roster: [SpeakersFile.Speaker] = []
    for label in SpeakerEditing.orderedSpeakers(relabeled) {
        if label == new {
            roster.append(SpeakersFile.Speaker(name: new, source: source, overlapConfidence: nil, matchSimilarity: nil))
        } else if let existing = file.speakers.first(where: { $0.name == label }) {
            roster.append(existing)
        } else {
            roster.append(SpeakersFile.Speaker(name: label, source: "reconciled", overlapConfidence: nil, matchSimilarity: nil))
        }
    }
    return SpeakersFile(sessionId: file.sessionId, app: file.app, durationSec: file.durationSec,
                        speakers: roster, segments: relabeled, models: file.models)
}
// MARK: - LLM content naming
/// Builds the prompt text for the content-naming pass.
///
/// The prompt embeds `selfName` (marked as already correct), the `labels` to resolve
/// joined with ", ", and a transcript rendered from `file`, and asks for a JSON object
/// with a `speakers` array of `{current, name, confidence}` entries.
///
/// - Parameters:
///   - file: Speakers file whose segments are rendered into the transcript.
///   - selfName: The protected self name, shown to the model as not to be reassigned.
///   - labels: Placeholder labels the model is asked to resolve.
/// - Returns: The complete prompt string.
static func namingPrompt(file: SpeakersFile, selfName: String, labels: [String]) -> String {
// Transcript rendering is delegated to RecapAnalyzer; `maxChars: 20_000` presumably
// caps the transcript length in characters — confirm in RecapAnalyzer.cappedTranscript.
let entries = RecapAnalyzer.entries(from: file)
let transcript = RecapAnalyzer.cappedTranscript(entries, maxChars: 20_000)
return """
You are reconciling speaker labels in a diarized transcript. The voices were separated acoustically and labeled with placeholder initials or "Unknown_N". Your ONLY job is to map a placeholder to a person's REAL name when the conversation clearly reveals it — someone is addressed by name, introduces themselves, or is unambiguously referred to. If a label's real name is not clearly revealed, KEEP IT (return null). Never guess.
SELF (already correct — never reassign): \(selfName)
LABELS TO RESOLVE: \(labels.joined(separator: ", "))
TRANSCRIPT (each line is "[<label> <MM:SS>] text"):
\(transcript)
Respond with ONLY valid JSON, no other text:
{
"speakers": [
{"current": "<label>", "name": "Real Name" or null, "confidence": "high" | "medium" | "low"}
]
}
"""
}
/// Parses the naming response into `current label -> (proposed name, confidence)`.
///
/// Expects (after code-fence stripping) a JSON object with a `speakers` array of
/// objects. An entry is kept only when `current` and `name` are non-empty strings and
/// `name` is not the literal text "null"; a JSON `null` name is skipped because it is
/// not a string. `confidence` is trimmed and lowercased, defaulting to `"medium"` when
/// absent or not a string. When the same `current` appears twice, the last entry wins.
///
/// - Parameter content: Raw LLM response text.
/// - Returns: The parsed proposals, or an empty dictionary when the payload is not the
///   expected JSON shape.
static func parseNaming(_ content: String) -> [String: (name: String, confidence: String)] {
    let cleaned = GatewayLLMClient.stripCodeFence(content)
    guard let root = (try? JSONSerialization.jsonObject(with: Data(cleaned.utf8))) as? [String: Any],
          let rows = root["speakers"] as? [[String: Any]] else { return [:] }
    // One normalisation for all three fields, so stray whitespace is handled uniformly.
    func trimmed(_ value: Any?) -> String? {
        (value as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
    }
    var out: [String: (name: String, confidence: String)] = [:]
    for row in rows {
        guard let current = trimmed(row["current"]), !current.isEmpty,
              let name = trimmed(row["name"]), !name.isEmpty,
              name.lowercased() != "null" else { continue }
        // Trim as well as lowercase: callers compare against "low", and an untrimmed
        // "low " would otherwise slip through the confidence gate.
        let confidence = trimmed(row["confidence"])?.lowercased() ?? "medium"
        out[current] = (name, confidence)
    }
    return out
}
}