a95f27ecd1
Fragments reabsorbed by smoothFragments (e.g. "I" then "need to switch it back") were left as separate transcript lines. Add SpeakerReconciler.mergeAdjacent to join consecutive same-speaker segments within 2s, concatenating their text. Wire it into SessionController.finishBackend AFTER reconcile/LLM naming. The collapse needs no LLM, so finishBackend no longer early-returns when the gateway has no chat model — it runs the collapse and re-persists speakers.json unconditionally, gating only the reconcile and recap passes on the model.
219 lines
13 KiB
Swift
219 lines
13 KiB
Swift
import Foundation
|
|
|
|
/// Reconciles the backend's per-cluster speaker labels into cleaner identities:
///   1. **Merge** non-self clusters whose voiceprints are highly similar — fixes one
///      person being split across chunks (e.g. "MH" + "Unknown_0" → one person).
///   2. **Smooth** fragment clusters: a non-self "speaker" made up mostly of tiny
///      segments is dissolved into the temporally-nearest real speaker.
///   3. **Name** remaining non-self clusters from the transcript *content* (people
///      addressed by name, self-introductions) via the gateway LLM — fixes wrong or
///      initials-only labels that the visual cue produced. Conservative: keeps the
///      current label when the content doesn't clearly reveal a name; never touches
///      the mic-channel self.
///
/// A separate pure pass, `mergeAdjacent`, collapses consecutive same-speaker
/// segments into single lines; it is not part of `reconcile` and needs no LLM.
///
/// The merge, smoothing and collapse math is pure/testable; the naming pass is one LLM call.
|
|
enum SpeakerReconciler {
|
|
|
|
/// Full reconciliation: merge by voiceprint → dissolve fragment clusters → name
/// remaining non-self clusters by content (guard-railed).
///
/// - Parameters:
///   - file: The speakers file to clean up.
///   - fingerprints: Voiceprint vectors keyed by speaker label (used by the merge pass).
///   - selfName: Label of the local user; protected, so never merged or renamed.
///   - llm: Gateway client used for the single naming request.
///   - model: Chat model identifier handed to the gateway.
///   - mergeThreshold: Minimum cosine similarity for two clusters to be merged.
/// - Returns: The reconciled file. When there is nothing to name, or the LLM call
///   fails, the merged + smoothed file is returned as-is (naming is best-effort).
static func reconcile(file: SpeakersFile, fingerprints: [String: [Float]], selfName: String,
                      llm: GatewayLLMClient, model: String,
                      mergeThreshold: Double = 0.82) async -> SpeakersFile {
    let protected = protectedNames(file, selfName: selfName)
    let merged = mergeByFingerprint(file, fingerprints: fingerprints, protected: protected, threshold: mergeThreshold)
    let smoothed = smoothFragments(merged, protected: protected)

    // Name the non-self clusters from content.
    let present = SpeakerEditing.orderedSpeakers(smoothed.segments)
    let labels = present.filter { !protected.contains($0) }
    guard !labels.isEmpty else { return smoothed }

    // Names the LLM must NOT reuse for another speaker: self + everyone already named.
    let forbidden = protected.union(labels.filter { !LabelMergeResponse.isUnknownName($0) })
    let prompt = namingPrompt(file: smoothed, selfName: selfName, labels: labels, forbidden: forbidden)

    // Best-effort: any gateway failure keeps the labels we already have.
    guard let content = try? await llm.completeJSON(model: model, system: nil, user: prompt, maxTokens: 1024) else {
        return smoothed
    }

    let proposals = parseNaming(content)
    var renamed = smoothed
    var used = Set(present)

    // Walk the labels in transcript order and look each proposal up, instead of
    // iterating the dictionary. Dictionary order varies between runs, so when two
    // labels are proposed the same name the winner would otherwise be random.
    // This also ignores proposals for labels that are not in the file, which would
    // otherwise reserve a name in `used` without renaming anything.
    for current in labels {
        guard let proposal = proposals[current] else { continue }
        let new = proposal.name
        guard new != current, !new.isEmpty, proposal.confidence != "low",
              !LabelMergeResponse.isUnknownName(new),
              !protected.contains(new),   // never assign the self/protected name to another voice
              !used.contains(new)         // never collide with an already-present different speaker
        else { continue }
        renamed = apply(rename: current, to: new, source: "content", in: renamed)
        used.remove(current)
        used.insert(new)
    }
    return renamed
}
|
|
|
|
/// Dissolve fragment clusters: a non-self "speaker" whose segments are MOSTLY tiny
/// (median duration ≤ `shortDur`) isn't a real participant — it's diarization
/// micro-fragments (single words split off mid-sentence; one stray longer segment
/// shouldn't rescue it, so we use the median, not the max). Each of its segments
/// is reassigned to the temporally-nearest real speaker. Pure/testable.
///
/// - Parameters:
///   - file: The speakers file to smooth.
///   - protected: Labels that are always treated as real speakers.
///   - shortDur: Median segment duration (seconds) at or below which a label is a fragment cluster.
///   - minSegs: Labels with fewer segments than this are too small to judge and are kept.
/// - Returns: `file` unchanged when there are no fragment clusters or no real speaker
///   to absorb them; otherwise a copy whose segments are sorted by start time with
///   fragment segments relabeled, and whose roster lists only the surviving speakers.
static func smoothFragments(_ file: SpeakersFile, protected: Set<String>,
                            shortDur: Double = 1.0, minSegs: Int = 3) -> SpeakersFile {
    var durations: [String: [Double]] = [:]
    for segment in file.segments {
        durations[segment.speaker, default: []].append(segment.end - segment.start)
    }

    // Classify every label once. The classification is loop-invariant, so doing it
    // here avoids re-sorting a speaker's durations on every neighbour probe below.
    var fragmentLabels = Set<String>()
    for (name, values) in durations where !protected.contains(name) && values.count >= minSegs {
        let sorted = values.sorted()
        if sorted[sorted.count / 2] > shortDur { continue }   // median > shortDur → real
        fragmentLabels.insert(name)
    }
    func isReal(_ name: String) -> Bool { !fragmentLabels.contains(name) }

    // Nothing to dissolve, or nobody real to dissolve into.
    guard !fragmentLabels.isEmpty,
          file.segments.contains(where: { isReal($0.speaker) }) else { return file }

    let ordered = file.segments.sorted { $0.start < $1.start }
    var result = ordered
    for i in ordered.indices where !isReal(ordered[i].speaker) {
        let fragment = ordered[i]
        var bestName: String?
        var bestGap = Double.greatestFiniteMagnitude

        // Closest real segment before this one (in start order)…
        if let before = ordered[..<i].last(where: { isReal($0.speaker) }) {
            let gap = fragment.start - before.end
            if gap < bestGap { bestGap = gap; bestName = before.speaker }
        }
        // …and after it. Strict `<` means a tie goes to the preceding speaker.
        if let after = ordered[(i + 1)...].first(where: { isReal($0.speaker) }) {
            let gap = after.start - fragment.end
            if gap < bestGap { bestGap = gap; bestName = after.speaker }
        }

        if let name = bestName {
            result[i] = SpeakersFile.Segment(start: fragment.start, end: fragment.end,
                                             speaker: name, text: fragment.text)
        }
    }

    // Rebuild the roster from the speakers that still own at least one segment.
    let keep = SpeakerEditing.orderedSpeakers(result)
    let speakers = keep.map { name in
        file.speakers.first { $0.name == name }
            ?? SpeakersFile.Speaker(name: name, source: "reconciled", overlapConfidence: nil, matchSimilarity: nil)
    }
    return SpeakersFile(sessionId: file.sessionId, app: file.app, durationSec: file.durationSec,
                        speakers: speakers, segments: result, models: file.models)
}
|
|
|
|
/// Joins back-to-back segments of one speaker into a single segment.
///
/// Segments are visited in start-time order. A segment is folded into the
/// previously emitted one when both have the same speaker and the pause between
/// them is at most `maxGap` seconds. Their texts are trimmed and joined with a
/// single space — so fragments reabsorbed by smoothing (e.g. "I" then "need to
/// switch it back") read as one clean line. Pure.
///
/// - Parameters:
///   - file: The speakers file whose segments should be collapsed.
///   - maxGap: Largest allowed gap, in seconds, between two segments that get merged.
/// - Returns: `file` itself when it has no segments; otherwise a copy with the
///   collapsed, start-ordered segments and the same roster and metadata.
static func mergeAdjacent(_ file: SpeakersFile, maxGap: Double = 2.0) -> SpeakersFile {
    guard !file.segments.isEmpty else { return file }
    let ordered = file.segments.sorted { $0.start < $1.start }

    var collapsed: [SpeakersFile.Segment] = []
    for segment in ordered {
        // Anything that cannot extend the previous output segment starts a new one.
        guard let previous = collapsed.last,
              previous.speaker == segment.speaker,
              segment.start - previous.end <= maxGap else {
            collapsed.append(segment)
            continue
        }

        // Keep only the non-blank pieces so a missing or empty text adds no stray space.
        var pieces: [String] = []
        for text in [previous.text, segment.text] {
            guard let trimmed = text?.trimmingCharacters(in: .whitespaces), !trimmed.isEmpty else { continue }
            pieces.append(trimmed)
        }
        let combined: String? = pieces.isEmpty ? nil : pieces.joined(separator: " ")

        collapsed[collapsed.count - 1] = SpeakersFile.Segment(start: previous.start,
                                                              end: max(previous.end, segment.end),
                                                              speaker: segment.speaker,
                                                              text: combined)
    }

    return SpeakersFile(sessionId: file.sessionId, app: file.app, durationSec: file.durationSec,
                        speakers: file.speakers, segments: collapsed, models: file.models)
}
|
|
|
|
// MARK: - Voiceprint merge (pure)
|
|
|
|
/// Labels the reconciliation passes must never merge, dissolve or rename:
/// `selfName` plus every roster entry whose source is `"mic_channel"`.
static func protectedNames(_ file: SpeakersFile, selfName: String) -> Set<String> {
    let micChannelNames = file.speakers
        .filter { $0.source == "mic_channel" }
        .map { $0.name }
    return Set(micChannelNames).union([selfName])
}
|
|
|
|
/// Cosine similarity of two equal-length vectors.
///
/// - Parameters:
///   - a: First vector.
///   - b: Second vector.
/// - Returns: `dot(a, b) / (‖a‖ · ‖b‖)`, or `0` when the vectors are empty, differ
///   in length, or either has zero (or non-comparable) magnitude.
static func cosine(_ a: [Float], _ b: [Float]) -> Double {
    guard a.count == b.count, !a.isEmpty else { return 0 }
    var dot = 0.0, normA = 0.0, normB = 0.0
    for (x, y) in zip(a, b) {
        // Widen to Double BEFORE multiplying: a Float product is rounded to 24 bits
        // and overflows to infinity for large components, turning the result into NaN.
        let dx = Double(x), dy = Double(y)
        dot += dx * dy
        normA += dx * dx
        normB += dy * dy
    }
    guard normA > 0, normB > 0 else { return 0 }
    return dot / (normA.squareRoot() * normB.squareRoot())
}
|
|
|
|
/// Greedily merge non-self clusters with cosine similarity ≥ threshold. The
/// survivor is the "better-named" one (a real name beats Unknown; higher
/// confidence wins ties). Segments + the speaker roster are remapped.
///
/// - Parameters:
///   - file: The speakers file to merge.
///   - fingerprints: Voiceprint vectors keyed by speaker label; labels without one are left alone.
///   - protected: Labels that never take part in merging.
///   - threshold: Minimum cosine similarity for two clusters to be merged.
/// - Returns: `file` unchanged when fewer than two candidates exist or nothing merges;
///   otherwise a copy with absorbed labels rewritten to their survivor.
static func mergeByFingerprint(_ file: SpeakersFile, fingerprints: [String: [Float]],
                               protected: Set<String>, threshold: Double) -> SpeakersFile {
    // Candidates: unprotected roster labels that have a voiceprint. De-duplicated so a
    // roster that repeats a name cannot trap in the dictionary initialisers below.
    var seen = Set<String>()
    let names = file.speakers.map { $0.name }.filter {
        !protected.contains($0) && fingerprints[$0] != nil && seen.insert($0).inserted
    }
    guard names.count > 1 else { return file }

    // First roster entry wins for a repeated name, matching the `first { $0.name == … }`
    // lookups used by the other passes in this file.
    let rank = Dictionary(file.speakers.map { ($0.name, $0) }, uniquingKeysWith: { first, _ in first })

    // Union-find without path compression: label -> survivor it was absorbed into.
    var canonical = Dictionary(uniqueKeysWithValues: names.map { ($0, $0) })
    func find(_ x: String) -> String {
        var root = x
        while let parent = canonical[root], parent != root { root = parent }
        return root
    }

    for i in 0..<names.count {
        for j in (i + 1)..<names.count {
            let a = find(names[i]), b = find(names[j])
            guard a != b, let fa = fingerprints[a], let fb = fingerprints[b] else { continue }
            if cosine(fa, fb) >= threshold {
                let survivor = better(a, b, rank: rank)
                let absorbed = survivor == a ? b : a
                canonical[absorbed] = survivor
            }
        }
    }

    // Only labels that actually changed need remapping.
    var remap: [String: String] = [:]
    for name in names {
        let root = find(name)
        if root != name { remap[name] = root }
    }
    guard !remap.isEmpty else { return file }

    let segments = file.segments.map { segment -> SpeakersFile.Segment in
        guard let survivor = remap[segment.speaker] else { return segment }
        return SpeakersFile.Segment(start: segment.start, end: segment.end,
                                    speaker: survivor, text: segment.text)
    }
    let keep = SpeakerEditing.orderedSpeakers(segments)
    let speakers = keep.map {
        rank[$0] ?? SpeakersFile.Speaker(name: $0, source: "reconciled", overlapConfidence: nil, matchSimilarity: nil)
    }
    return SpeakersFile(sessionId: file.sessionId, app: file.app, durationSec: file.durationSec,
                        speakers: speakers, segments: segments, models: file.models)
}
|
|
|
|
/// Picks which of two cluster labels should survive a merge.
///
/// A label that is not an "unknown" placeholder beats one that is. When both are
/// placeholders or both are real names, the label whose roster entry has the higher
/// score wins — `overlapConfidence`, falling back to `matchSimilarity`, then 0 —
/// and `a` is kept on a tie.
private static func better(_ a: String, _ b: String, rank: [String: SpeakersFile.Speaker]) -> String {
    switch (LabelMergeResponse.isUnknownName(a), LabelMergeResponse.isUnknownName(b)) {
    case (true, false):
        return b
    case (false, true):
        return a
    default:
        break   // same kind of label on both sides → decide by score
    }

    let scoreA = rank[a]?.overlapConfidence ?? rank[a]?.matchSimilarity ?? 0
    let scoreB = rank[b]?.overlapConfidence ?? rank[b]?.matchSimilarity ?? 0
    if scoreA >= scoreB {
        return a
    }
    return b
}
|
|
|
|
/// Renames speaker `current` to `new` across the file's segments and rebuilds the roster.
///
/// The roster follows the order in which speakers appear in the rewritten segments.
/// The renamed speaker gets a fresh entry tagged with `source` (no confidence
/// values); every other speaker keeps its existing entry, or gets a `"reconciled"`
/// placeholder entry when the roster has none.
private static func apply(rename current: String, to new: String, source: String, in file: SpeakersFile) -> SpeakersFile {
    let segments = SpeakerEditing.replaceSpeaker(current, with: new, in: file.segments)

    var roster: [SpeakersFile.Speaker] = []
    for name in SpeakerEditing.orderedSpeakers(segments) {
        if name == new {
            roster.append(SpeakersFile.Speaker(name: new, source: source, overlapConfidence: nil, matchSimilarity: nil))
        } else if let existing = file.speakers.first(where: { $0.name == name }) {
            roster.append(existing)
        } else {
            roster.append(SpeakersFile.Speaker(name: name, source: "reconciled", overlapConfidence: nil, matchSimilarity: nil))
        }
    }

    return SpeakersFile(sessionId: file.sessionId, app: file.app, durationSec: file.durationSec,
                        speakers: roster, segments: segments, models: file.models)
}
|
|
|
|
// MARK: - LLM content naming
|
|
|
|
/// Builds the user prompt for the content-naming LLM request.
///
/// The prompt contains the instructions, the local user's name, the names that
/// must not be reused (sorted, comma-separated), the labels to resolve, the
/// transcript capped at 20,000 characters, and the JSON shape expected back.
///
/// - Parameters:
///   - file: Speakers file the transcript is rendered from.
///   - selfName: Label of the local user, quoted in the prompt as already correct.
///   - labels: Placeholder labels the model is asked to resolve.
///   - forbidden: Names the model must not assign to a different speaker.
/// - Returns: The complete prompt text.
static func namingPrompt(file: SpeakersFile, selfName: String, labels: [String], forbidden: Set<String>) -> String {
    let transcript = RecapAnalyzer.cappedTranscript(RecapAnalyzer.entries(from: file), maxChars: 20_000)
    let takenNames = forbidden.sorted().joined(separator: ", ")
    let unresolved = labels.joined(separator: ", ")
    return """
    You are reconciling speaker labels in a diarized transcript. The voices were separated acoustically and labeled with placeholder initials or "Unknown_N". Your ONLY job is to map a placeholder to a person's REAL name when the conversation UNAMBIGUOUSLY reveals it — they introduce themselves ("this is Sarah"), or are directly addressed AND respond. Hearing a name mentioned is NOT enough; people are talked ABOUT without being on the call. When in doubt, return null. Precision matters far more than coverage — a wrong name is worse than no name.

    "\(selfName)" is the local user (their own channel) and is already correct.
    Do NOT assign any of these already-taken names to a different speaker: \(takenNames)
    Each real name may be used for AT MOST ONE label.

    LABELS TO RESOLVE: \(unresolved)

    TRANSCRIPT (each line is "[<label> <MM:SS>] text"):
    \(transcript)

    Respond with ONLY valid JSON, no other text. Use "high" confidence only when a label introduced themselves or was directly addressed and answered:
    {
    "speakers": [
    {"current": "<label>", "name": "Real Name" or null, "confidence": "high" | "medium" | "low"}
    ]
    }
    """
}
|
|
|
|
/// Parses the naming response into `label → (proposed name, confidence)`.
///
/// Expects a JSON object with a `"speakers"` array of objects carrying
/// `"current"`, `"name"` and optionally `"confidence"`. Entries are skipped when
/// `current` or `name` is missing, not a string, blank, or when `name` is the
/// literal text "null" in any letter case. A missing or non-string confidence
/// defaults to `"medium"`. If a label appears more than once, the last entry wins.
///
/// - Parameter content: Raw model output; a surrounding code fence is stripped first.
/// - Returns: The parsed proposals, or an empty dictionary when the payload is not
///   the expected JSON shape.
static func parseNaming(_ content: String) -> [String: (name: String, confidence: String)] {
    let cleaned = GatewayLLMClient.stripCodeFence(content)
    guard let root = (try? JSONSerialization.jsonObject(with: Data(cleaned.utf8))) as? [String: Any],
          let rows = root["speakers"] as? [[String: Any]] else { return [:] }

    var out: [String: (name: String, confidence: String)] = [:]
    for row in rows {
        guard let current = (row["current"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines),
              !current.isEmpty,
              let name = (row["name"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines),
              !name.isEmpty, name.lowercased() != "null" else { continue }
        // Trim confidence like the other fields so padded values such as "low " are
        // still recognised as "low" by callers that compare against the exact string.
        let confidence = (row["confidence"] as? String)?
            .trimmingCharacters(in: .whitespacesAndNewlines)
            .lowercased() ?? "medium"
        out[current] = (name, confidence)
    }
    return out
}
|
|
}
|