import Foundation

/// How long each diarization *body* chunk should be.
///
/// Smaller chunks keep fewer simultaneous speakers inside one window — Sortformer
/// resolves at most ~4 speakers per chunk, and the dual-channel split already spends
/// the local user on the mic track, so the system (remote) channel is what can
/// saturate on a big call. The cost of going smaller: weaker cross-chunk voiceprints,
/// more cross-chunk speaker splitting (the reconciler re-merges some), and more
/// backend round-trips.
enum ChunkMode: String, CaseIterable, Identifiable, Codable {
    // Declaration order is the `allCases` order; keep it stable.
    case auto, standard, largeGroup, fine

    /// Default body length (2.5 min); also what `.auto` uses for small or unknown call sizes.
    private static let standardBodySeconds: Double = 150

    /// Short body length; also what `.auto` uses once the call exceeds `autoLargeThreshold`.
    private static let largeGroupBodySeconds: Double = 60

    /// Intermediate body length, only selectable explicitly via `.fine`.
    private static let fineBodySeconds: Double = 90

    /// More than this many detected participants makes `.auto` pick the short body,
    /// so one chunk is less likely to exceed Sortformer's ~4-speaker resolution.
    static let autoLargeThreshold = 4

    /// Stable identity for `Identifiable`: the case's raw string value.
    var id: String { rawValue }

    /// Human-readable name of the mode, including its duration where it is fixed.
    var label: String {
        switch self {
        case .auto: return "Auto (by call size)"
        case .standard: return "Standard · 2.5 min"
        case .largeGroup: return "Large group · 60 sec"
        case .fine: return "Fine · 90 sec"
        }
    }

    /// Fixed body length in seconds, or `nil` for `.auto` (which is resolved from the
    /// participant count by `bodySeconds(participantCount:)`).
    var fixedBodySeconds: Double? {
        switch self {
        case .auto: return nil
        case .standard: return Self.standardBodySeconds
        case .largeGroup: return Self.largeGroupBodySeconds
        case .fine: return Self.fineBodySeconds
        }
    }

    /// Resolves the body length in seconds.
    ///
    /// Fixed modes return their own length and ignore `participantCount`. `.auto`
    /// drops to the 60s body when more than `autoLargeThreshold` participants were
    /// detected, and otherwise uses the 2.5-min default.
    ///
    /// - Parameter participantCount: Number of detected participants, or `nil` when
    ///   no count is available (audio-only); `nil` keeps `.auto` at the 2.5-min default.
    /// - Returns: The body length in seconds.
    func bodySeconds(participantCount: Int?) -> Double {
        if let fixed = fixedBodySeconds {
            return fixed
        }
        if let count = participantCount, count > Self.autoLargeThreshold {
            return Self.largeGroupBodySeconds
        }
        return Self.standardBodySeconds
    }

    /// Overlap margin scaled to the body length (~12%, clamped 8…15s) so a 60s chunk
    /// isn't dominated by a fixed 15s margin while a 2.5-min chunk keeps the full 15s.
    ///
    /// For example: a 60s body yields 8s (7.2 rounds to 7, then is clamped up),
    /// a 90s body yields 11s, and a 150s body yields 15s (18 is clamped down).
    ///
    /// - Parameter body: Body length in seconds.
    /// - Returns: The overlap margin in whole seconds, within 8…15.
    static func overlapSeconds(forBody body: Double) -> Double {
        let proportional = (body * 0.12).rounded()
        // Keep this exact min/max nesting: it also decides the result for a NaN input.
        return max(8, min(15, proportional))
    }
}