From a3e3406b28993282a9ebd8cb3e27b22a28b77986 Mon Sep 17 00:00:00 2001
From: Grant Gilliam <grant@ten31.xyz>
Date: Tue, 9 Jun 2026 10:15:16 -0500
Subject: [PATCH] Make diarization chunk length configurable (Auto + presets)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Chunk size was hardcoded at 2.5-min bodies. Add a Settings control:
Auto / Standard 2.5min / Large group 60s / Fine 90s. Shorter chunks keep fewer
simultaneous speakers per window (Sortformer resolves ~4/chunk), useful for large
calls, at some cost to speed and cross-chunk voice matching.

- ChunkMode (new, pure/testable): mode → body seconds; Auto picks 60s when >4
  participants were detected, else 150s; overlap + single-chunk threshold scale
  with the body length.
- AppSettings.chunkMode (+ typed `chunk`); SettingsView picker with explanation.
- TranscriptPipeline.process gains chunkSeconds; derives overlap/threshold from it.
- SessionController resolves the body from the setting + the session's detected
  participant count (visual_timeline participants) for both send + re-process.
- Participant roster now counts EVERY tile OCR'd, not just who spoke
  (TimelineBuilder.observedNames → VisualObserver → VisualCapture), so the Auto
  call-size signal is meaningful even though speaking-detection is sparse.

Tests: ChunkMode resolution, overlap scaling, short-body re-chunking. 69 pass.
---
 Ten31Transcripts/Session/ChunkPlan.swift      | 51 +++++++++++++++++++
 .../Session/SessionController.swift           | 15 +++++-
 .../Session/TranscriptPipeline.swift          |  8 ++-
 Ten31Transcripts/Settings/AppSettings.swift   | 11 ++++
 Ten31Transcripts/UI/SettingsView.swift        |  6 +++
 Ten31Transcripts/Visual/TimelineBuilder.swift |  9 ++++
 Ten31Transcripts/Visual/VisualCapture.swift   |  5 +-
 Ten31Transcripts/Visual/VisualObserver.swift  |  4 ++
 Ten31TranscriptsTests/Phase5Tests.swift       | 27 ++++++++++
 9 files changed, 133 insertions(+), 3 deletions(-)
 create mode 100644 Ten31Transcripts/Session/ChunkPlan.swift

diff --git a/Ten31Transcripts/Session/ChunkPlan.swift b/Ten31Transcripts/Session/ChunkPlan.swift
new file mode 100644
index 0000000..d55e015
--- /dev/null
+++ b/Ten31Transcripts/Session/ChunkPlan.swift
@@ -0,0 +1,51 @@
+import Foundation
+
+/// How long each diarization *body* chunk should be. Smaller chunks keep fewer
+/// simultaneous speakers inside one window — Sortformer resolves at most ~4 speakers
+/// per chunk, and the dual-channel split already spends the local user on the mic
+/// track, so the system (remote) channel is what can saturate on a big call. The
+/// cost of going smaller: weaker cross-chunk voiceprints, more cross-chunk speaker
+/// splitting (the reconciler re-merges some), and more backend round-trips.
+enum ChunkMode: String, CaseIterable, Identifiable, Codable {
+    case auto, standard, largeGroup, fine
+
+    var id: String { rawValue }
+
+    var label: String {
+        switch self {
+        case .auto:       return "Auto (by call size)"
+        case .standard:   return "Standard · 2.5 min"
+        case .largeGroup: return "Large group · 60 sec"
+        case .fine:       return "Fine · 90 sec"
+        }
+    }
+
+    /// Fixed body length in seconds, or nil for `.auto` (resolved from the participant count).
+    var fixedBodySeconds: Double? {
+        switch self {
+        case .auto:       return nil
+        case .standard:   return 150
+        case .largeGroup: return 60
+        case .fine:       return 90
+        }
+    }
+
+    /// More than this many detected participants makes `.auto` pick the short (60s) body,
+    /// so one chunk is less likely to exceed Sortformer's ~4-speaker resolution.
+    static let autoLargeThreshold = 4
+
+    /// Resolve the body length in seconds. Fixed presets return their own length and ignore
+    /// `participantCount`; `.auto` returns 60s when more than `autoLargeThreshold` participants
+    /// were detected, else the 2.5-min (150s) default — including when the count is nil (audio-only).
+    func bodySeconds(participantCount: Int?) -> Double {
+        if let fixed = fixedBodySeconds { return fixed }
+        if let n = participantCount, n > Self.autoLargeThreshold { return 60 }
+        return 150
+    }
+
+    /// Overlap margin scaled to the body length — 12% of `body`, rounded to whole seconds, then
+    /// clamped to 8…15s — so a short chunk isn't dominated by a fixed 15s margin (60s → 8s, 90s → 11s, 150s → 15s).
+    static func overlapSeconds(forBody body: Double) -> Double {
+        max(8, min(15, (body * 0.12).rounded()))
+    }
+}
diff --git a/Ten31Transcripts/Session/SessionController.swift b/Ten31Transcripts/Session/SessionController.swift
index ea1fad7..1fb095b 100644
--- a/Ten31Transcripts/Session/SessionController.swift
+++ b/Ten31Transcripts/Session/SessionController.swift
@@ -378,12 +378,15 @@ final class SessionController: ObservableObject {
         let settings = self.settings
         let pipeline = TranscriptPipeline(baseURL: settings.backendBaseURL,
                                           skipTLS: settings.skipTLSVerification, voiceprints: voiceprints)
+        // Resolve the diarization chunk length from the setting; "Auto" uses the
+        // participant count the visual capture saw for this session.
+        let chunkSeconds = settings.chunk.bodySeconds(participantCount: Self.participantCount(in: inputs.folder))
         do {
             let speakers = try await pipeline.process(
                 sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
                 micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL,
                 timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName,
-                systemHealthy: inputs.systemHealthy,
+                systemHealthy: inputs.systemHealthy, chunkSeconds: chunkSeconds,
                 progress: { done, total in await MainActor.run { self.transcriptStatus = .processing(done, total) } })
             self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count)
             try Task.checkCancellation()
@@ -531,6 +534,16 @@ final class SessionController: ObservableObject {
         }
     }
 
+    /// Detected participant count from a session's `visual_timeline.json`, for "Auto" chunk sizing. Nil when that file
+    /// is missing, unreadable, undecodable, or lists no participants (e.g. audio-only), so callers keep the default body
+    /// length. Counts every listed participant, not just who spoke. NOTE(review): may include the local user — confirm.
+    private static func participantCount(in folder: URL) -> Int? {
+        guard let data = try? Data(contentsOf: folder.appendingPathComponent("visual_timeline.json")),
+              let vt = try? JSONDecoder().decode(VisualTimeline.self, from: data),
+              !vt.participants.isEmpty else { return nil }
+        return vt.participants.count
+    }
+
     /// The remote (vision) visual-timeline segments saved for a session, if any.
     private static func remoteTimeline(in folder: URL) -> [VisualTimeline.Segment] {
         guard let data = try? Data(contentsOf: folder.appendingPathComponent("visual_timeline.json")),
diff --git a/Ten31Transcripts/Session/TranscriptPipeline.swift b/Ten31Transcripts/Session/TranscriptPipeline.swift
index 3df6e0e..cd02bd8 100644
--- a/Ten31Transcripts/Session/TranscriptPipeline.swift
+++ b/Ten31Transcripts/Session/TranscriptPipeline.swift
@@ -28,6 +28,7 @@ final class TranscriptPipeline {
                  selfSpans: [VADSpan],
                  selfName: String,
                  systemHealthy: Bool,
+                 chunkSeconds: Double = 150,
                  progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
         let fm = FileManager.default
         let dual = systemHealthy
@@ -36,7 +37,12 @@ final class TranscriptPipeline {
         let duration = dual
             ? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL))
             : SessionPackager.duration(of: mixedURL)
-        let plan = SessionPackager.planChunks(durationSec: duration)
+        // Chunk to the requested body length; overlap and the single-chunk threshold
+        // scale with it (a 60s body shouldn't be cut by a fixed 15s margin or stay
+        // unchunked below the 2.5-min default threshold).
+        let overlap = ChunkMode.overlapSeconds(forBody: chunkSeconds)
+        let plan = SessionPackager.planChunks(durationSec: duration, chunkSeconds: chunkSeconds,
+                                              overlapSeconds: overlap, thresholdSec: chunkSeconds * 1.2)
 
         // Zero-duration / empty session → a valid empty speakers.json, no backend call.
         if plan.isEmpty || duration <= 0 {
diff --git a/Ten31Transcripts/Settings/AppSettings.swift b/Ten31Transcripts/Settings/AppSettings.swift
index e919824..d43919d 100644
--- a/Ten31Transcripts/Settings/AppSettings.swift
+++ b/Ten31Transcripts/Settings/AppSettings.swift
@@ -60,6 +60,15 @@ final class AppSettings: ObservableObject {
         didSet { defaults.set(reconcileSpeakers, forKey: Keys.reconcileSpeakers) }
     }
 
+    /// Diarization chunk length (raw value of `ChunkMode`). `.auto` shrinks chunks on
+    /// large calls so a window is less likely to exceed Sortformer's ~4-speaker cap.
+    @Published var chunkMode: String {
+        didSet { defaults.set(chunkMode, forKey: Keys.chunkMode) }
+    }
+
+    /// Typed accessor for `chunkMode`; an unrecognized stored raw value falls back to `.auto`.
+    var chunk: ChunkMode { ChunkMode(rawValue: chunkMode) ?? .auto }
+
     /// User-editable recap templates (takeaways categories per meeting type).
     @Published var recapTemplates: [RecapTemplate] {
         didSet { persist(recapTemplates, forKey: Keys.recapTemplates) }
@@ -104,6 +113,7 @@ final class AppSettings: ObservableObject {
         self.autoSendOnStop = defaults.object(forKey: Keys.autoSend) as? Bool ?? false
         self.recapEnabled = defaults.object(forKey: Keys.recapEnabled) as? Bool ?? true
         self.reconcileSpeakers = defaults.object(forKey: Keys.reconcileSpeakers) as? Bool ?? true
+        self.chunkMode = defaults.string(forKey: Keys.chunkMode) ?? ChunkMode.auto.rawValue
 
         let loaded = (defaults.data(forKey: Keys.recapTemplates))
             .flatMap { try? JSONDecoder().decode([RecapTemplate].self, from: $0) }
@@ -126,6 +136,7 @@ final class AppSettings: ObservableObject {
         static let autoSend = "autoSendOnStop"
         static let recapEnabled = "recapEnabled"
         static let reconcileSpeakers = "reconcileSpeakers"
+        static let chunkMode = "chunkMode"
         static let recapTemplates = "recapTemplates"
         static let defaultTemplate = "defaultTemplateId"
     }
diff --git a/Ten31Transcripts/UI/SettingsView.swift b/Ten31Transcripts/UI/SettingsView.swift
index 6d17b48..c536808 100644
--- a/Ten31Transcripts/UI/SettingsView.swift
+++ b/Ten31Transcripts/UI/SettingsView.swift
@@ -39,6 +39,12 @@ struct SettingsView: View {
             Section("Transcription") {
                 Toggle("Auto-send recordings to backend", isOn: $settings.autoSendOnStop)
                 Toggle("Reconcile speakers (merge splits + name from content)", isOn: $settings.reconcileSpeakers)
+                Picker("Chunk length", selection: $settings.chunkMode) {
+                    ForEach(ChunkMode.allCases) { Text($0.label).tag($0.rawValue) }
+                }
+                Text("How finely audio is split for diarization. Shorter chunks keep fewer simultaneous speakers per window (the diarizer resolves ~4 at a time), at some cost to speed and voice matching. Auto uses 60-sec chunks when more than \(ChunkMode.autoLargeThreshold) people are detected on the call, else 2.5 min.")
+                    .font(.caption)
+                    .foregroundStyle(.secondary)
                 Toggle("Build readable recap (topics + highlights)", isOn: $settings.recapEnabled)
                 HStack {
                     Picker("Default recap template", selection: $settings.defaultTemplateId) {
diff --git a/Ten31Transcripts/Visual/TimelineBuilder.swift b/Ten31Transcripts/Visual/TimelineBuilder.swift
index a34554d..de52963 100644
--- a/Ten31Transcripts/Visual/TimelineBuilder.swift
+++ b/Ten31Transcripts/Visual/TimelineBuilder.swift
@@ -15,9 +15,15 @@ final class TimelineBuilder {
     private let closeFrames: Int
     private var aliases: [String: String] = [:]      // normalized variant -> canonical
     private var states: [String: NameState] = [:]
+    private var observed: Set<String> = []           // every tile name seen (speaking or not)
     private var lastFrameT: Double = 0
     private(set) var segments: [VisualTimeline.Segment] = []
 
+    /// Every distinct non-empty tile name recorded by `ingest` (speaking or not), returned sorted — the call-size signal
+    /// (drives "Auto" chunk sizing and a complete participant roster, since speaking-detection is intentionally sparse).
+    /// NOTE(review): names are canonicalized when ingested; confirm later alias changes can't leave stale variants here.
+    var observedNames: [String] { observed.sorted() }
+
     init(openFrames: Int = 2, closeFrames: Int = 2) {
         self.openFrames = max(1, openFrames)
         self.closeFrames = max(1, closeFrames)
@@ -34,6 +40,9 @@ final class TimelineBuilder {
     func ingest(_ observations: [SpeakerObservation], at t: TimeInterval) {
         lastFrameT = t
 
+        // Record every tile seen (speaking or not) for the participant roster / call size.
+        for obs in observations where !obs.name.isEmpty { observed.insert(canonical(obs.name)) }
+
         // Best confidence per canonical name that is speaking this frame.
         var speaking: [String: Double] = [:]
         for obs in observations where obs.speaking && !obs.name.isEmpty {
diff --git a/Ten31Transcripts/Visual/VisualCapture.swift b/Ten31Transcripts/Visual/VisualCapture.swift
index 638c9bb..597d23e 100644
--- a/Ten31Transcripts/Visual/VisualCapture.swift
+++ b/Ten31Transcripts/Visual/VisualCapture.swift
@@ -75,7 +75,10 @@ final class VisualCapture {
         }, to: durationSec)
 
         let artifact = (vision + selfSegs).sorted { $0.start < $1.start }
-        let names = Set(artifact.map { $0.name })
+        // Roster = everyone OCR'd (speaking or not) ∪ the names that produced segments,
+        // so the participant count reflects true call size even when few people were
+        // detected speaking. Drives "Auto" chunk sizing downstream.
+        let names = Set(artifact.map { $0.name }).union(observer.participantNames())
         let participants = names.sorted().map {
             VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil)
         }
diff --git a/Ten31Transcripts/Visual/VisualObserver.swift b/Ten31Transcripts/Visual/VisualObserver.swift
index 49ab1c0..de8a6d3 100644
--- a/Ten31Transcripts/Visual/VisualObserver.swift
+++ b/Ten31Transcripts/Visual/VisualObserver.swift
@@ -114,6 +114,10 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
         queue.sync { builder.mergeSelfSpans(spans, selfName: selfName) }
     }
 
+    /// Every distinct participant name OCR'd over the session, read synchronously on the builder's
+    /// queue (safe to call after `stop`). NOTE(review): uses `queue.sync` — presumably must not be called from `queue` itself.
+    func participantNames() -> [String] { queue.sync { builder.observedNames } }
+
     // MARK: - SCStreamOutput (on `queue`)
 
     func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
diff --git a/Ten31TranscriptsTests/Phase5Tests.swift b/Ten31TranscriptsTests/Phase5Tests.swift
index 8acbf02..e0a0f7d 100644
--- a/Ten31TranscriptsTests/Phase5Tests.swift
+++ b/Ten31TranscriptsTests/Phase5Tests.swift
@@ -37,6 +37,33 @@ final class Phase5Tests: XCTestCase {
         XCTAssertEqual(asm.speakersFile.segments[0].start, 152, accuracy: 0.01)
     }
 
+    func testChunkModeResolvesBodyLength() {
+        // Fixed presets ignore participant count.
+        XCTAssertEqual(ChunkMode.standard.bodySeconds(participantCount: 99), 150)
+        XCTAssertEqual(ChunkMode.largeGroup.bodySeconds(participantCount: 2), 60)
+        XCTAssertEqual(ChunkMode.fine.bodySeconds(participantCount: nil), 90)
+        // Auto: >4 detected → 60s, ≤4 → 150s, unknown → 150s.
+        XCTAssertEqual(ChunkMode.auto.bodySeconds(participantCount: 6), 60)
+        XCTAssertEqual(ChunkMode.auto.bodySeconds(participantCount: 4), 150)
+        XCTAssertEqual(ChunkMode.auto.bodySeconds(participantCount: nil), 150)
+    }
+
+    func testChunkOverlapScalesWithBody() {
+        XCTAssertEqual(ChunkMode.overlapSeconds(forBody: 150), 15)   // capped at the 15s maximum (150*0.12=18→15)
+        XCTAssertEqual(ChunkMode.overlapSeconds(forBody: 60), 8)     // raised to the 8s minimum (60*0.12=7.2→7→8)
+        XCTAssertEqual(ChunkMode.overlapSeconds(forBody: 90), 11)    // in range, rounded (90*0.12=10.8→11)
+    }
+
+    func testPlanChunksShortBodyChunksAShortCall() {
+        // A 100s call would be ONE chunk at the 2.5-min default, but at a 60s body it
+        // splits — so "Large group" actually re-chunks medium calls.
+        let c = SessionPackager.planChunks(durationSec: 100, chunkSeconds: 60,
+                                           overlapSeconds: 8, thresholdSec: 72)
+        XCTAssertEqual(c.count, 2)
+        XCTAssertEqual(c[0].bodyStart, 0); XCTAssertEqual(c[0].bodyEnd, 60)
+        XCTAssertEqual(c[1].bodyStart, 60); XCTAssertEqual(c[1].bodyEnd, 100)
+    }
+
     func testDropStuckSpansRemovesWholeCallCue() {
         let segs = [
             VisualTimeline.Segment(start: 0, end: 1900, name: "Grant Gilliam", confidence: 1, source: "vision"), // stuck whole-call tile