Collapse adjacent same-speaker segments after reconciliation
Fragments reabsorbed by smoothFragments (e.g. "I" then "need to switch it back") were left as separate transcript lines. Add SpeakerReconciler.mergeAdjacent to join consecutive same-speaker segments within 2s, concatenating their text. Wire it into SessionController.finishBackend AFTER reconcile/LLM naming. The collapse needs no LLM, so finishBackend no longer early-returns when the gateway has no chat model — it runs the collapse and re-persists speakers.json unconditionally, gating only the reconcile and recap passes on the model.
This commit is contained in:
@@ -393,24 +393,32 @@ final class SessionController: ObservableObject {
|
||||
}
|
||||
}
|
||||
|
||||
/// Post-transcription LLM passes (best-effort, share one gateway model lookup):
|
||||
/// reconcile speaker labels (merge split clusters + name from content), then build
|
||||
/// the readable recap. A missing LLM or any failure leaves speakers.json intact.
|
||||
/// Post-transcription cleanup + LLM passes. Speaker reconciliation (merge split
|
||||
/// clusters + content-naming) and the readable recap need the gateway LLM; the
|
||||
/// adjacent-segment collapse does not. So the collapse runs unconditionally and
|
||||
/// always re-persists `speakers.json`, while the LLM passes are skipped when no
|
||||
/// model is available. Any failure leaves the last good `speakers.json` intact.
|
||||
private func finishBackend(speakers: SpeakersFile, inputs: ProcessInputs, settings: AppSettings) async {
|
||||
let llm = GatewayLLMClient(baseURL: settings.backendBaseURL, skipTLS: settings.skipTLSVerification)
|
||||
guard let model = await llm.chatModelId() else { return } // no LLM on the gateway → skip both
|
||||
let model = await llm.chatModelId() // nil → no LLM on the gateway; LLM passes skipped
|
||||
|
||||
var resolved = speakers
|
||||
if settings.reconcileSpeakers, !speakers.segments.isEmpty {
|
||||
// Reconcile labels (needs the LLM): merge split clusters, dissolve fragments,
|
||||
// and name placeholders from transcript content.
|
||||
if let model, settings.reconcileSpeakers, !speakers.segments.isEmpty {
|
||||
self.transcriptStatus = .processing(0, 0)
|
||||
let fps = RecapEditModel.loadFingerprints(inputs.folder.appendingPathComponent("cluster_fingerprints.json"))
|
||||
resolved = await SpeakerReconciler.reconcile(file: speakers, fingerprints: fps,
|
||||
selfName: inputs.selfName, llm: llm, model: model)
|
||||
try? resolved.write(to: inputs.folder.appendingPathComponent("speakers.json"))
|
||||
self.transcriptStatus = .done(speakers: resolved.speakers.count, segments: resolved.segments.count)
|
||||
}
|
||||
// Collapse adjacent same-speaker segments (no LLM needed) so fragments
|
||||
// reabsorbed by smoothing read as one clean line, then persist. Always runs
|
||||
// — even when the LLM is unavailable — so the saved transcript is cleaned up.
|
||||
resolved = SpeakerReconciler.mergeAdjacent(resolved)
|
||||
try? resolved.write(to: inputs.folder.appendingPathComponent("speakers.json"))
|
||||
self.transcriptStatus = .done(speakers: resolved.speakers.count, segments: resolved.segments.count)
|
||||
|
||||
guard settings.recapEnabled, !resolved.segments.isEmpty else { return }
|
||||
guard let model, settings.recapEnabled, !resolved.segments.isEmpty else { return }
|
||||
let analyzer = RecapAnalyzer(llm: llm, model: model)
|
||||
guard let result = try? await analyzer.recap(file: resolved, template: settings.defaultTemplate) else { return }
|
||||
let title = Self.recapTitle(app: inputs.app, sessionId: inputs.sessionId)
|
||||
|
||||
@@ -82,6 +82,28 @@ enum SpeakerReconciler {
|
||||
speakers: speakers, segments: result, models: file.models)
|
||||
}
|
||||
|
||||
/// Joins runs of neighbouring segments that belong to the same speaker.
///
/// Segments are first ordered by `start`. Walking that order, a segment is folded
/// into the one before it when both carry the same `speaker` and the pause between
/// them (`start` of the later minus `end` of the earlier) is at most `maxGap`
/// seconds. The folded segment keeps the earlier `start`, takes the later of the
/// two `end` values, and carries both texts joined by a single space — so a
/// fragment such as "I" followed by "need to switch it back" becomes one line.
///
/// - Parameters:
///   - file: The speakers file whose segments should be collapsed. Not mutated.
///   - maxGap: Largest pause, in seconds, that still counts as the same utterance.
/// - Returns: A new `SpeakersFile` with the collapsed segments and every other
///   field copied from `file`; `file` itself when it has no segments.
static func mergeAdjacent(_ file: SpeakersFile, maxGap: Double = 2.0) -> SpeakersFile {
    guard !file.segments.isEmpty else { return file }

    let ordered = file.segments.sorted { $0.start < $1.start }

    let collapsed = ordered.reduce(into: [SpeakersFile.Segment]()) { merged, segment in
        // Start a new line unless this segment continues the previous speaker's turn.
        guard let previous = merged.last,
              previous.speaker == segment.speaker,
              segment.start - previous.end <= maxGap
        else {
            merged.append(segment)
            return
        }

        // Trim each side and drop blank pieces so the join never yields stray spaces.
        let pieces = [previous.text, segment.text]
            .compactMap { $0?.trimmingCharacters(in: .whitespaces) }
            .filter { !$0.isEmpty }
        let combinedText: String? = pieces.isEmpty ? nil : pieces.joined(separator: " ")

        // `max` keeps the span correct when the later segment ends inside the earlier one.
        merged[merged.count - 1] = .init(start: previous.start,
                                         end: max(previous.end, segment.end),
                                         speaker: segment.speaker,
                                         text: combinedText)
    }

    return SpeakersFile(sessionId: file.sessionId,
                        app: file.app,
                        durationSec: file.durationSec,
                        speakers: file.speakers,
                        segments: collapsed,
                        models: file.models)
}
|
||||
|
||||
// MARK: - Voiceprint merge (pure)
|
||||
|
||||
static func protectedNames(_ file: SpeakersFile, selfName: String) -> Set<String> {
|
||||
|
||||
Reference in New Issue
Block a user