Make diarization chunk length configurable (Auto + presets)
Chunk size was hardcoded at 2.5-min bodies. Add a Settings control: Auto / Standard 2.5min / Large group 60s / Fine 90s. Shorter chunks keep fewer simultaneous speakers per window (Sortformer resolves ~4/chunk), useful for large calls, at some cost to speed and cross-chunk voice matching. - ChunkMode (new, pure/testable): mode → body seconds; Auto picks 60s when >4 participants were detected, else 150s; overlap + single-chunk threshold scale with the body length. - AppSettings.chunkMode (+ typed `chunk`); SettingsView picker with explanation. - TranscriptPipeline.process gains chunkSeconds; derives overlap/threshold from it. - SessionController resolves the body from the setting + the session's detected participant count (visual_timeline participants) for both send + re-process. - Participant roster now counts EVERY tile OCR'd, not just who spoke (TimelineBuilder.observedNames → VisualObserver → VisualCapture), so the Auto call-size signal is meaningful even though speaking-detection is sparse. Tests: ChunkMode resolution, overlap scaling, short-body re-chunking. 69 pass.
This commit is contained in:
@@ -0,0 +1,51 @@
|
||||
import Foundation
|
||||
|
||||
/// How long each diarization *body* chunk should be.
///
/// Smaller chunks keep fewer simultaneous speakers inside one window — Sortformer
/// resolves at most ~4 speakers per chunk, and the dual-channel split already spends
/// the local user on the mic track, so the system (remote) channel is what can
/// saturate on a big call. The cost of going smaller: weaker cross-chunk voiceprints,
/// more cross-chunk speaker splitting (the reconciler re-merges some), and more
/// backend round-trips.
enum ChunkMode: String, CaseIterable, Identifiable, Codable {
    case auto, standard, largeGroup, fine

    // Single source of truth for the preset body lengths (seconds). `.auto` resolves
    // to one of these, so the fixed presets and the automatic choice cannot drift.
    private static let standardBodySeconds: Double = 150
    private static let largeGroupBodySeconds: Double = 60
    private static let fineBodySeconds: Double = 90

    // Overlap margin = this fraction of the body, rounded, clamped to the bounds below.
    private static let overlapFraction: Double = 0.12
    private static let minOverlapSeconds: Double = 8
    private static let maxOverlapSeconds: Double = 15

    /// Stable identity for list/picker use: the case's raw string value.
    var id: String { rawValue }

    /// Human-readable name of the mode, including its body length where fixed.
    var label: String {
        switch self {
        case .auto: return "Auto (by call size)"
        case .standard: return "Standard · 2.5 min"
        case .largeGroup: return "Large group · 60 sec"
        case .fine: return "Fine · 90 sec"
        }
    }

    /// Fixed body length in seconds, or nil for `.auto` (resolved from the
    /// participant count by `bodySeconds(participantCount:)`).
    var fixedBodySeconds: Double? {
        switch self {
        case .auto: return nil
        case .standard: return Self.standardBodySeconds
        case .largeGroup: return Self.largeGroupBodySeconds
        case .fine: return Self.fineBodySeconds
        }
    }

    /// More than this many detected participants makes `.auto` pick the short body,
    /// so one chunk is less likely to exceed Sortformer's ~4-speaker resolution.
    static let autoLargeThreshold = 4

    /// Resolves the body length in seconds.
    ///
    /// - Parameter participantCount: Number of participants detected for the session,
    ///   or nil when no count is available (audio-only).
    /// - Returns: The preset's fixed length for every mode except `.auto`. For `.auto`,
    ///   the large-group length (60s) when the count is strictly greater than
    ///   `autoLargeThreshold`, otherwise the standard length (150s) — including when
    ///   no count is available.
    func bodySeconds(participantCount: Int?) -> Double {
        if let fixed = fixedBodySeconds { return fixed }
        if let detected = participantCount, detected > Self.autoLargeThreshold {
            return Self.largeGroupBodySeconds
        }
        return Self.standardBodySeconds
    }

    /// Overlap margin scaled to the body length (~12%, clamped 8…15s) so a 60s chunk
    /// isn't dominated by a fixed 15s margin while a 2.5-min chunk keeps the full 15s.
    ///
    /// - Parameter body: Body chunk length in seconds.
    /// - Returns: The overlap in whole seconds, e.g. 8 for a 60s body (7.2 rounds to 7,
    ///   then is raised to the floor), 11 for 90s, 15 for 150s.
    static func overlapSeconds(forBody body: Double) -> Double {
        let scaled = (body * overlapFraction).rounded()
        return max(minOverlapSeconds, min(maxOverlapSeconds, scaled))
    }
}
|
||||
@@ -378,12 +378,15 @@ final class SessionController: ObservableObject {
|
||||
let settings = self.settings
|
||||
let pipeline = TranscriptPipeline(baseURL: settings.backendBaseURL,
|
||||
skipTLS: settings.skipTLSVerification, voiceprints: voiceprints)
|
||||
// Resolve the diarization chunk length from the setting; "Auto" uses the
|
||||
// participant count the visual capture saw for this session.
|
||||
let chunkSeconds = settings.chunk.bodySeconds(participantCount: Self.participantCount(in: inputs.folder))
|
||||
do {
|
||||
let speakers = try await pipeline.process(
|
||||
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
|
||||
micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL,
|
||||
timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName,
|
||||
systemHealthy: inputs.systemHealthy,
|
||||
systemHealthy: inputs.systemHealthy, chunkSeconds: chunkSeconds,
|
||||
progress: { done, total in await MainActor.run { self.transcriptStatus = .processing(done, total) } })
|
||||
self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count)
|
||||
try Task.checkCancellation()
|
||||
@@ -531,6 +534,16 @@ final class SessionController: ObservableObject {
|
||||
}
|
||||
}
|
||||
|
||||
/// Number of participants recorded in a session's `visual_timeline.json`, used to
/// size chunks in "Auto" mode. The roster is meant to cover everyone OCR'd on the
/// call, not just who spoke.
///
/// - Parameter folder: The session folder expected to contain `visual_timeline.json`.
/// - Returns: The participant count, or nil when the file can't be read or decoded,
///   or lists no participants (e.g. audio-only), so callers keep the default body
///   length.
private static func participantCount(in folder: URL) -> Int? {
    let timelineURL = folder.appendingPathComponent("visual_timeline.json")
    // Best-effort read: any failure simply means "no count available".
    guard let raw = try? Data(contentsOf: timelineURL) else { return nil }
    guard let decoded = try? JSONDecoder().decode(VisualTimeline.self, from: raw) else {
        return nil
    }
    let roster = decoded.participants
    return roster.isEmpty ? nil : roster.count
}
|
||||
|
||||
/// The remote (vision) visual-timeline segments saved for a session, if any.
|
||||
private static func remoteTimeline(in folder: URL) -> [VisualTimeline.Segment] {
|
||||
guard let data = try? Data(contentsOf: folder.appendingPathComponent("visual_timeline.json")),
|
||||
|
||||
@@ -28,6 +28,7 @@ final class TranscriptPipeline {
|
||||
selfSpans: [VADSpan],
|
||||
selfName: String,
|
||||
systemHealthy: Bool,
|
||||
chunkSeconds: Double = 150,
|
||||
progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
|
||||
let fm = FileManager.default
|
||||
let dual = systemHealthy
|
||||
@@ -36,7 +37,12 @@ final class TranscriptPipeline {
|
||||
let duration = dual
|
||||
? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL))
|
||||
: SessionPackager.duration(of: mixedURL)
|
||||
let plan = SessionPackager.planChunks(durationSec: duration)
|
||||
// Chunk to the requested body length; overlap and the single-chunk threshold
|
||||
// scale with it (a 60s body shouldn't be cut by a fixed 15s margin or stay
|
||||
// unchunked below the 2.5-min default threshold).
|
||||
let overlap = ChunkMode.overlapSeconds(forBody: chunkSeconds)
|
||||
let plan = SessionPackager.planChunks(durationSec: duration, chunkSeconds: chunkSeconds,
|
||||
overlapSeconds: overlap, thresholdSec: chunkSeconds * 1.2)
|
||||
|
||||
// Zero-duration / empty session → a valid empty speakers.json, no backend call.
|
||||
if plan.isEmpty || duration <= 0 {
|
||||
|
||||
Reference in New Issue
Block a user