From a3e3406b28993282a9ebd8cb3e27b22a28b77986 Mon Sep 17 00:00:00 2001 From: Grant Gilliam Date: Tue, 9 Jun 2026 10:15:16 -0500 Subject: [PATCH] Make diarization chunk length configurable (Auto + presets) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Chunk size was hardcoded at 2.5-min bodies. Add a Settings control: Auto / Standard 2.5min / Large group 60s / Fine 90s. Shorter chunks keep fewer simultaneous speakers per window (Sortformer resolves ~4/chunk), useful for large calls, at some cost to speed and cross-chunk voice matching. - ChunkMode (new, pure/testable): mode → body seconds; Auto picks 60s when >4 participants were detected, else 150s; overlap + single-chunk threshold scale with the body length. - AppSettings.chunkMode (+ typed `chunk`); SettingsView picker with explanation. - TranscriptPipeline.process gains chunkSeconds; derives overlap/threshold from it. - SessionController resolves the body from the setting + the session's detected participant count (visual_timeline participants) for both send + re-process. - Participant roster now counts EVERY tile OCR'd, not just who spoke (TimelineBuilder.observedNames → VisualObserver → VisualCapture), so the Auto call-size signal is meaningful even though speaking-detection is sparse. Tests: ChunkMode resolution, overlap scaling, short-body re-chunking. 69 pass. 
--- Ten31Transcripts/Session/ChunkPlan.swift | 51 +++++++++++++++++++ .../Session/SessionController.swift | 15 +++++- .../Session/TranscriptPipeline.swift | 8 ++- Ten31Transcripts/Settings/AppSettings.swift | 11 ++++ Ten31Transcripts/UI/SettingsView.swift | 6 +++ Ten31Transcripts/Visual/TimelineBuilder.swift | 9 ++++ Ten31Transcripts/Visual/VisualCapture.swift | 5 +- Ten31Transcripts/Visual/VisualObserver.swift | 4 ++ Ten31TranscriptsTests/Phase5Tests.swift | 27 ++++++++++ 9 files changed, 133 insertions(+), 3 deletions(-) create mode 100644 Ten31Transcripts/Session/ChunkPlan.swift diff --git a/Ten31Transcripts/Session/ChunkPlan.swift b/Ten31Transcripts/Session/ChunkPlan.swift new file mode 100644 index 0000000..d55e015 --- /dev/null +++ b/Ten31Transcripts/Session/ChunkPlan.swift @@ -0,0 +1,51 @@ +import Foundation + +/// How long each diarization *body* chunk should be. Smaller chunks keep fewer +/// simultaneous speakers inside one window — Sortformer resolves at most ~4 speakers +/// per chunk, and the dual-channel split already spends the local user on the mic +/// track, so the system (remote) channel is what can saturate on a big call. The +/// cost of going smaller: weaker cross-chunk voiceprints, more cross-chunk speaker +/// splitting (the reconciler re-merges some), and more backend round-trips. +enum ChunkMode: String, CaseIterable, Identifiable, Codable { + case auto, standard, largeGroup, fine + + var id: String { rawValue } + + var label: String { + switch self { + case .auto: return "Auto (by call size)" + case .standard: return "Standard · 2.5 min" + case .largeGroup: return "Large group · 60 sec" + case .fine: return "Fine · 90 sec" + } + } + + /// Fixed body length, or nil for `.auto` (resolved from the participant count). + var fixedBodySeconds: Double? 
{ + switch self { + case .auto: return nil + case .standard: return 150 + case .largeGroup: return 60 + case .fine: return 90 + } + } + + /// More than this many detected participants makes `.auto` pick the short body, + /// so one chunk is less likely to exceed Sortformer's ~4-speaker resolution. + static let autoLargeThreshold = 4 + + /// Resolve the body length in seconds. `.auto` drops to 60s when more than + /// `autoLargeThreshold` participants were detected, else uses the 2.5-min default; + /// with no count available (audio-only) it stays at the 2.5-min default. + func bodySeconds(participantCount: Int?) -> Double { + if let fixed = fixedBodySeconds { return fixed } + if let n = participantCount, n > Self.autoLargeThreshold { return 60 } + return 150 + } + + /// Overlap margin scaled to the body length (~12%, clamped 8…15s) so a 60s chunk + /// isn't dominated by a fixed 15s margin while a 2.5-min chunk keeps the full 15s. + static func overlapSeconds(forBody body: Double) -> Double { + max(8, min(15, (body * 0.12).rounded())) + } +} diff --git a/Ten31Transcripts/Session/SessionController.swift b/Ten31Transcripts/Session/SessionController.swift index ea1fad7..1fb095b 100644 --- a/Ten31Transcripts/Session/SessionController.swift +++ b/Ten31Transcripts/Session/SessionController.swift @@ -378,12 +378,15 @@ final class SessionController: ObservableObject { let settings = self.settings let pipeline = TranscriptPipeline(baseURL: settings.backendBaseURL, skipTLS: settings.skipTLSVerification, voiceprints: voiceprints) + // Resolve the diarization chunk length from the setting; "Auto" uses the + // participant count the visual capture saw for this session. 
+ let chunkSeconds = settings.chunk.bodySeconds(participantCount: Self.participantCount(in: inputs.folder)) do { let speakers = try await pipeline.process( sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app, micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL, timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName, - systemHealthy: inputs.systemHealthy, + systemHealthy: inputs.systemHealthy, chunkSeconds: chunkSeconds, progress: { done, total in await MainActor.run { self.transcriptStatus = .processing(done, total) } }) self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count) try Task.checkCancellation() @@ -531,6 +534,16 @@ final class SessionController: ObservableObject { } } + /// Detected participant count from a session's visual timeline, for "Auto" chunk + /// sizing. Nil when there's no visual timeline (audio-only) so callers keep the + /// default body length. Counts everyone OCR'd on the call, not just who spoke. + private static func participantCount(in folder: URL) -> Int? { + guard let data = try? Data(contentsOf: folder.appendingPathComponent("visual_timeline.json")), + let vt = try? JSONDecoder().decode(VisualTimeline.self, from: data), + !vt.participants.isEmpty else { return nil } + return vt.participants.count + } + /// The remote (vision) visual-timeline segments saved for a session, if any. private static func remoteTimeline(in folder: URL) -> [VisualTimeline.Segment] { guard let data = try? 
Data(contentsOf: folder.appendingPathComponent("visual_timeline.json")), diff --git a/Ten31Transcripts/Session/TranscriptPipeline.swift b/Ten31Transcripts/Session/TranscriptPipeline.swift index 3df6e0e..cd02bd8 100644 --- a/Ten31Transcripts/Session/TranscriptPipeline.swift +++ b/Ten31Transcripts/Session/TranscriptPipeline.swift @@ -28,6 +28,7 @@ final class TranscriptPipeline { selfSpans: [VADSpan], selfName: String, systemHealthy: Bool, + chunkSeconds: Double = 150, progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile { let fm = FileManager.default let dual = systemHealthy @@ -36,7 +37,12 @@ final class TranscriptPipeline { let duration = dual ? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL)) : SessionPackager.duration(of: mixedURL) - let plan = SessionPackager.planChunks(durationSec: duration) + // Chunk to the requested body length; overlap and the single-chunk threshold + // scale with it (a 60s body shouldn't be cut by a fixed 15s margin or stay + // unchunked below the 2.5-min default threshold). + let overlap = ChunkMode.overlapSeconds(forBody: chunkSeconds) + let plan = SessionPackager.planChunks(durationSec: duration, chunkSeconds: chunkSeconds, + overlapSeconds: overlap, thresholdSec: chunkSeconds * 1.2) // Zero-duration / empty session → a valid empty speakers.json, no backend call. if plan.isEmpty || duration <= 0 { diff --git a/Ten31Transcripts/Settings/AppSettings.swift b/Ten31Transcripts/Settings/AppSettings.swift index e919824..d43919d 100644 --- a/Ten31Transcripts/Settings/AppSettings.swift +++ b/Ten31Transcripts/Settings/AppSettings.swift @@ -60,6 +60,15 @@ final class AppSettings: ObservableObject { didSet { defaults.set(reconcileSpeakers, forKey: Keys.reconcileSpeakers) } } + /// Diarization chunk length (raw value of `ChunkMode`). `.auto` shrinks chunks on + /// large calls so a window is less likely to exceed Sortformer's ~4-speaker cap. 
+ @Published var chunkMode: String { + didSet { defaults.set(chunkMode, forKey: Keys.chunkMode) } + } + + /// Typed accessor for `chunkMode`. + var chunk: ChunkMode { ChunkMode(rawValue: chunkMode) ?? .auto } + /// User-editable recap templates (takeaways categories per meeting type). @Published var recapTemplates: [RecapTemplate] { didSet { persist(recapTemplates, forKey: Keys.recapTemplates) } @@ -104,6 +113,7 @@ final class AppSettings: ObservableObject { self.autoSendOnStop = defaults.object(forKey: Keys.autoSend) as? Bool ?? false self.recapEnabled = defaults.object(forKey: Keys.recapEnabled) as? Bool ?? true self.reconcileSpeakers = defaults.object(forKey: Keys.reconcileSpeakers) as? Bool ?? true + self.chunkMode = defaults.string(forKey: Keys.chunkMode) ?? ChunkMode.auto.rawValue let loaded = (defaults.data(forKey: Keys.recapTemplates)) .flatMap { try? JSONDecoder().decode([RecapTemplate].self, from: $0) } @@ -126,6 +136,7 @@ final class AppSettings: ObservableObject { static let autoSend = "autoSendOnStop" static let recapEnabled = "recapEnabled" static let reconcileSpeakers = "reconcileSpeakers" + static let chunkMode = "chunkMode" static let recapTemplates = "recapTemplates" static let defaultTemplate = "defaultTemplateId" } diff --git a/Ten31Transcripts/UI/SettingsView.swift b/Ten31Transcripts/UI/SettingsView.swift index 6d17b48..c536808 100644 --- a/Ten31Transcripts/UI/SettingsView.swift +++ b/Ten31Transcripts/UI/SettingsView.swift @@ -39,6 +39,12 @@ struct SettingsView: View { Section("Transcription") { Toggle("Auto-send recordings to backend", isOn: $settings.autoSendOnStop) Toggle("Reconcile speakers (merge splits + name from content)", isOn: $settings.reconcileSpeakers) + Picker("Chunk length", selection: $settings.chunkMode) { + ForEach(ChunkMode.allCases) { Text($0.label).tag($0.rawValue) } + } + Text("How finely audio is split for diarization. 
Shorter chunks keep fewer simultaneous speakers per window (the diarizer resolves ~4 at a time), at some cost to speed and voice matching. Auto uses 60-sec chunks when more than \(ChunkMode.autoLargeThreshold) people are detected on the call, else 2.5 min.") + .font(.caption) + .foregroundStyle(.secondary) Toggle("Build readable recap (topics + highlights)", isOn: $settings.recapEnabled) HStack { Picker("Default recap template", selection: $settings.defaultTemplateId) { diff --git a/Ten31Transcripts/Visual/TimelineBuilder.swift b/Ten31Transcripts/Visual/TimelineBuilder.swift index a34554d..de52963 100644 --- a/Ten31Transcripts/Visual/TimelineBuilder.swift +++ b/Ten31Transcripts/Visual/TimelineBuilder.swift @@ -15,9 +15,15 @@ final class TimelineBuilder { private let closeFrames: Int private var aliases: [String: String] = [:] // normalized variant -> canonical private var states: [String: NameState] = [:] + private var observed: Set<String> = [] // every tile name seen (speaking or not) private var lastFrameT: Double = 0 private(set) var segments: [VisualTimeline.Segment] = [] + /// Every distinct participant name the adapter has OCR'd, whether or not they were + /// ever detected speaking — the call-size signal (drives "Auto" chunk sizing and a + /// complete participant roster, since speaking-detection is intentionally sparse). + var observedNames: [String] { observed.sorted() } + init(openFrames: Int = 2, closeFrames: Int = 2) { self.openFrames = max(1, openFrames) self.closeFrames = max(1, closeFrames) @@ -34,6 +40,9 @@ final class TimelineBuilder { func ingest(_ observations: [SpeakerObservation], at t: TimeInterval) { lastFrameT = t + // Record every tile seen (speaking or not) for the participant roster / call size. + for obs in observations where !obs.name.isEmpty { observed.insert(canonical(obs.name)) } + // Best confidence per canonical name that is speaking this frame. 
var speaking: [String: Double] = [:] for obs in observations where obs.speaking && !obs.name.isEmpty { diff --git a/Ten31Transcripts/Visual/VisualCapture.swift b/Ten31Transcripts/Visual/VisualCapture.swift index 638c9bb..597d23e 100644 --- a/Ten31Transcripts/Visual/VisualCapture.swift +++ b/Ten31Transcripts/Visual/VisualCapture.swift @@ -75,7 +75,10 @@ final class VisualCapture { }, to: durationSec) let artifact = (vision + selfSegs).sorted { $0.start < $1.start } - let names = Set(artifact.map { $0.name }) + // Roster = everyone OCR'd (speaking or not) ∪ the names that produced segments, + // so the participant count reflects true call size even when few people were + // detected speaking. Drives "Auto" chunk sizing downstream. + let names = Set(artifact.map { $0.name }).union(observer.participantNames()) let participants = names.sorted().map { VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil) } diff --git a/Ten31Transcripts/Visual/VisualObserver.swift b/Ten31Transcripts/Visual/VisualObserver.swift index 49ab1c0..de8a6d3 100644 --- a/Ten31Transcripts/Visual/VisualObserver.swift +++ b/Ten31Transcripts/Visual/VisualObserver.swift @@ -114,6 +114,10 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput { queue.sync { builder.mergeSelfSpans(spans, selfName: selfName) } } + /// Every distinct participant name OCR'd over the session (read on the builder's + /// queue; safe to call after `stop`). 
+ func participantNames() -> [String] { queue.sync { builder.observedNames } } + // MARK: - SCStreamOutput (on `queue`) func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer, diff --git a/Ten31TranscriptsTests/Phase5Tests.swift b/Ten31TranscriptsTests/Phase5Tests.swift index 8acbf02..e0a0f7d 100644 --- a/Ten31TranscriptsTests/Phase5Tests.swift +++ b/Ten31TranscriptsTests/Phase5Tests.swift @@ -37,6 +37,33 @@ final class Phase5Tests: XCTestCase { XCTAssertEqual(asm.speakersFile.segments[0].start, 152, accuracy: 0.01) } + func testChunkModeResolvesBodyLength() { + // Fixed presets ignore participant count. + XCTAssertEqual(ChunkMode.standard.bodySeconds(participantCount: 99), 150) + XCTAssertEqual(ChunkMode.largeGroup.bodySeconds(participantCount: 2), 60) + XCTAssertEqual(ChunkMode.fine.bodySeconds(participantCount: nil), 90) + // Auto: >4 detected → 60s, ≤4 → 150s, unknown → 150s. + XCTAssertEqual(ChunkMode.auto.bodySeconds(participantCount: 6), 60) + XCTAssertEqual(ChunkMode.auto.bodySeconds(participantCount: 4), 150) + XCTAssertEqual(ChunkMode.auto.bodySeconds(participantCount: nil), 150) + } + + func testChunkOverlapScalesWithBody() { + XCTAssertEqual(ChunkMode.overlapSeconds(forBody: 150), 15) // capped + XCTAssertEqual(ChunkMode.overlapSeconds(forBody: 60), 8) // floored (60*0.12=7.2→8) + XCTAssertEqual(ChunkMode.overlapSeconds(forBody: 90), 11) // 90*0.12=10.8→11 + } + + func testPlanChunksShortBodyChunksAShortCall() { + // A 100s call would be ONE chunk at the 2.5-min default, but at a 60s body it + // splits — so "Large group" actually re-chunks medium calls. 
+ let c = SessionPackager.planChunks(durationSec: 100, chunkSeconds: 60, + overlapSeconds: 8, thresholdSec: 72) + XCTAssertEqual(c.count, 2) + XCTAssertEqual(c[0].bodyStart, 0); XCTAssertEqual(c[0].bodyEnd, 60) + XCTAssertEqual(c[1].bodyStart, 60); XCTAssertEqual(c[1].bodyEnd, 100) + } + func testDropStuckSpansRemovesWholeCallCue() { let segs = [ VisualTimeline.Segment(start: 0, end: 1900, name: "Grant Gilliam", confidence: 1, source: "vision"), // stuck whole-call tile