diff --git a/Ten31Transcripts/Session/SessionController.swift b/Ten31Transcripts/Session/SessionController.swift index 8e05dae..75ef1be 100644 --- a/Ten31Transcripts/Session/SessionController.swift +++ b/Ten31Transcripts/Session/SessionController.swift @@ -393,24 +393,32 @@ final class SessionController: ObservableObject { } } - /// Post-transcription LLM passes (best-effort, share one gateway model lookup): - /// reconcile speaker labels (merge split clusters + name from content), then build - /// the readable recap. A missing LLM or any failure leaves speakers.json intact. + /// Post-transcription cleanup + LLM passes. Speaker reconciliation (merge split + /// clusters + content-naming) and the readable recap need the gateway LLM; the + /// adjacent-segment collapse does not. So the collapse runs unconditionally and + /// always re-persists `speakers.json`, while the LLM passes are skipped when no + /// model is available. Any failure leaves the last good `speakers.json` intact. private func finishBackend(speakers: SpeakersFile, inputs: ProcessInputs, settings: AppSettings) async { let llm = GatewayLLMClient(baseURL: settings.backendBaseURL, skipTLS: settings.skipTLSVerification) - guard let model = await llm.chatModelId() else { return } // no LLM on the gateway → skip both + let model = await llm.chatModelId() // nil → no LLM on the gateway; LLM passes skipped var resolved = speakers - if settings.reconcileSpeakers, !speakers.segments.isEmpty { + // Reconcile labels (needs the LLM): merge split clusters, dissolve fragments, + // and name placeholders from transcript content. + if let model, settings.reconcileSpeakers, !speakers.segments.isEmpty { self.transcriptStatus = .processing(0, 0) let fps = RecapEditModel.loadFingerprints(inputs.folder.appendingPathComponent("cluster_fingerprints.json")) resolved = await SpeakerReconciler.reconcile(file: speakers, fingerprints: fps, selfName: inputs.selfName, llm: llm, model: model) - try? 
resolved.write(to: inputs.folder.appendingPathComponent("speakers.json")) - self.transcriptStatus = .done(speakers: resolved.speakers.count, segments: resolved.segments.count) } + // Collapse adjacent same-speaker segments (no LLM needed) so fragments + // reabsorbed by smoothing read as one clean line, then persist. Always runs + // — even when the LLM is unavailable — so the saved transcript is cleaned up. + resolved = SpeakerReconciler.mergeAdjacent(resolved) + try? resolved.write(to: inputs.folder.appendingPathComponent("speakers.json")) + self.transcriptStatus = .done(speakers: resolved.speakers.count, segments: resolved.segments.count) - guard settings.recapEnabled, !resolved.segments.isEmpty else { return } + guard let model, settings.recapEnabled, !resolved.segments.isEmpty else { return } let analyzer = RecapAnalyzer(llm: llm, model: model) guard let result = try? await analyzer.recap(file: resolved, template: settings.defaultTemplate) else { return } let title = Self.recapTitle(app: inputs.app, sessionId: inputs.sessionId) diff --git a/Ten31Transcripts/Session/SpeakerReconciler.swift b/Ten31Transcripts/Session/SpeakerReconciler.swift index e7b654e..2a6205e 100644 --- a/Ten31Transcripts/Session/SpeakerReconciler.swift +++ b/Ten31Transcripts/Session/SpeakerReconciler.swift @@ -82,6 +82,28 @@ enum SpeakerReconciler { speakers: speakers, segments: result, models: file.models) } + /// Collapse consecutive segments from the SAME speaker separated by ≤ `maxGap` + /// seconds into one, joining their text — so fragments reabsorbed by smoothing + /// (e.g. "I" then "need to switch it back") read as a single clean line. Pure. 
/// Collapses runs of consecutive segments from the same speaker into single segments.
///
/// Segments are first ordered by `start`. A segment is folded into the one before it
/// when both carry the same `speaker` and the later one starts at most `maxGap` seconds
/// after the earlier one ends (overlapping segments therefore merge as well). The merged
/// segment keeps the earlier `start`, the larger of the two `end` values, and both texts
/// joined by a single space, so fragments such as "I" and "need to switch it back" read
/// as one line.
///
/// - Parameters:
///   - file: The speakers file whose segments should be collapsed. It is not mutated.
///   - maxGap: Largest pause, in seconds, that still counts as the same utterance.
///     Defaults to `2.0`.
/// - Returns: A copy of `file` with the collapsed, start-ordered segments; all other
///   fields are carried over unchanged. A file without segments is returned as-is.
static func mergeAdjacent(_ file: SpeakersFile, maxGap: Double = 2.0) -> SpeakersFile {
    guard !file.segments.isEmpty else { return file }
    let ordered = file.segments.sorted { $0.start < $1.start }

    var merged: [SpeakersFile.Segment] = []
    merged.reserveCapacity(ordered.count)
    for segment in ordered {
        // Start a new output segment unless this one continues the previous speaker's
        // run within the allowed pause.
        guard let previous = merged.last,
              previous.speaker == segment.speaker,
              segment.start - previous.end <= maxGap else {
            merged.append(segment)
            continue
        }
        // Trim newlines as well as spaces/tabs: a fragment ending in "\n" would
        // otherwise embed a line break in the middle of the joined line.
        let text = [previous.text, segment.text]
            .compactMap { $0?.trimmingCharacters(in: .whitespacesAndNewlines) }
            .filter { !$0.isEmpty }
            .joined(separator: " ")
        // `max` keeps the later end when the two segments overlap.
        merged[merged.count - 1] = .init(start: previous.start,
                                         end: max(previous.end, segment.end),
                                         speaker: segment.speaker,
                                         text: text.isEmpty ? nil : text)
    }
    return SpeakersFile(sessionId: file.sessionId, app: file.app, durationSec: file.durationSec,
                        speakers: file.speakers, segments: merged, models: file.models)
}
/// Checks the two conditions that must stop `mergeAdjacent` from collapsing segments:
/// a pause longer than `maxGap`, and a different speaker in between.
func testMergeAdjacentRespectsMaxGapAndSpeakerBoundaries() {
    // Same speaker, but the 4 s pause exceeds the 2 s limit, so both segments survive.
    let distantSegments = [
        SpeakersFile.Segment(start: 0, end: 1, speaker: "A", text: "one"),
        SpeakersFile.Segment(start: 5, end: 6, speaker: "A", text: "two"),
    ]
    let distant = SpeakerReconciler.mergeAdjacent(
        file([sp("A", "content")], distantSegments), maxGap: 2.0)
    XCTAssertEqual(distant.segments.count, 2)

    // A → B → A: the intervening speaker breaks the run, so all three segments remain.
    let interleavedSegments = [
        SpeakersFile.Segment(start: 0, end: 1, speaker: "A", text: "a1"),
        SpeakersFile.Segment(start: 1.2, end: 2, speaker: "B", text: "b"),
        SpeakersFile.Segment(start: 2.2, end: 3, speaker: "A", text: "a2"),
    ]
    let interleaved = SpeakerReconciler.mergeAdjacent(
        file([sp("A", "content"), sp("B", "content")], interleavedSegments), maxGap: 2.0)
    XCTAssertEqual(interleaved.segments.count, 3)
}