Make diarization chunk length configurable (Auto + presets)
Chunk size was hardcoded to 2.5-minute bodies. Add a Settings control: Auto / Standard 2.5 min / Large group 60 s / Fine 90 s. Shorter chunks keep fewer simultaneous speakers per window (Sortformer resolves ~4 per chunk), which is useful for large calls, at some cost to speed and cross-chunk voice matching.
- ChunkMode (new, pure/testable): maps mode → body seconds; Auto picks 60 s when more than 4 participants were detected, else 150 s; the overlap and the single-chunk threshold scale with the body length.
- AppSettings.chunkMode (plus the typed `chunk` accessor); SettingsView picker with an explanation.
- TranscriptPipeline.process gains a chunkSeconds parameter and derives the overlap and threshold from it.
- SessionController resolves the body length from the setting plus the session's detected participant count (visual_timeline participants) for both send and re-process.
- The participant roster now counts EVERY tile OCR'd, not just those who spoke (TimelineBuilder.observedNames → VisualObserver → VisualCapture), so the Auto call-size signal is meaningful even though speaking detection is sparse.
Tests: ChunkMode resolution, overlap scaling, short-body re-chunking. 69 tests pass.
This commit is contained in:
@@ -0,0 +1,51 @@
|
|||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// Length preset for each diarization *body* chunk.
///
/// Shorter bodies keep fewer simultaneous speakers inside a single window. Sortformer
/// resolves at most ~4 speakers per chunk, and because the dual-channel split already
/// spends the local user on the mic track, it is the system (remote) channel that can
/// saturate on a big call. The price of shorter bodies: weaker cross-chunk voiceprints,
/// more cross-chunk speaker splitting (the reconciler re-merges some), and more
/// backend round-trips.
enum ChunkMode: String, CaseIterable, Identifiable, Codable {
    case auto
    case standard
    case largeGroup
    case fine

    /// Participant count that `.auto` must *exceed* before it switches to the short
    /// body, so a single chunk is less likely to outgrow Sortformer's ~4-speaker
    /// resolution.
    static let autoLargeThreshold = 4

    /// `Identifiable` identity; simply the raw string value of the case.
    var id: String { rawValue }

    /// Display text for this preset.
    var label: String {
        switch self {
        case .auto:
            return "Auto (by call size)"
        case .standard:
            return "Standard · 2.5 min"
        case .largeGroup:
            return "Large group · 60 sec"
        case .fine:
            return "Fine · 90 sec"
        }
    }

    /// The preset's body length in seconds, or `nil` for `.auto`, whose length is
    /// only known once a participant count is supplied to `bodySeconds(participantCount:)`.
    var fixedBodySeconds: Double? {
        switch self {
        case .standard:
            return 150
        case .fine:
            return 90
        case .largeGroup:
            return 60
        case .auto:
            return nil
        }
    }

    /// Resolves the body length in seconds.
    ///
    /// - Parameter participantCount: Detected participants, or `nil` when no count
    ///   is available (audio-only).
    /// - Returns: The fixed preset length when there is one. For `.auto`: 60 when
    ///   more than `autoLargeThreshold` participants were detected, otherwise the
    ///   2.5-minute default of 150 (also used when the count is `nil`).
    func bodySeconds(participantCount: Int?) -> Double {
        if let preset = fixedBodySeconds {
            return preset
        }
        // Only a known count strictly above the threshold shortens the body.
        guard let detected = participantCount, detected > Self.autoLargeThreshold else {
            return 150
        }
        return 60
    }

    /// Overlap margin for a given body length: about 12% of the body, rounded to a
    /// whole second and clamped to 8…15 s. This keeps a 60 s chunk from being
    /// dominated by a fixed 15 s margin while a 2.5-minute chunk keeps the full 15 s.
    ///
    /// - Parameter body: Body length in seconds.
    /// - Returns: Overlap in seconds, within 8…15.
    static func overlapSeconds(forBody body: Double) -> Double {
        let proportional = (body * 0.12).rounded()
        // Apply the upper cap first, then the lower floor.
        let capped = min(15, proportional)
        return max(8, capped)
    }
}
|
||||||
@@ -378,12 +378,15 @@ final class SessionController: ObservableObject {
|
|||||||
let settings = self.settings
|
let settings = self.settings
|
||||||
let pipeline = TranscriptPipeline(baseURL: settings.backendBaseURL,
|
let pipeline = TranscriptPipeline(baseURL: settings.backendBaseURL,
|
||||||
skipTLS: settings.skipTLSVerification, voiceprints: voiceprints)
|
skipTLS: settings.skipTLSVerification, voiceprints: voiceprints)
|
||||||
|
// Resolve the diarization chunk length from the setting; "Auto" uses the
|
||||||
|
// participant count the visual capture saw for this session.
|
||||||
|
let chunkSeconds = settings.chunk.bodySeconds(participantCount: Self.participantCount(in: inputs.folder))
|
||||||
do {
|
do {
|
||||||
let speakers = try await pipeline.process(
|
let speakers = try await pipeline.process(
|
||||||
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
|
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
|
||||||
micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL,
|
micURL: inputs.micURL, systemURL: inputs.systemURL, mixedURL: inputs.mixedURL,
|
||||||
timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName,
|
timeline: inputs.timeline, selfSpans: inputs.selfSpans, selfName: inputs.selfName,
|
||||||
systemHealthy: inputs.systemHealthy,
|
systemHealthy: inputs.systemHealthy, chunkSeconds: chunkSeconds,
|
||||||
progress: { done, total in await MainActor.run { self.transcriptStatus = .processing(done, total) } })
|
progress: { done, total in await MainActor.run { self.transcriptStatus = .processing(done, total) } })
|
||||||
self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count)
|
self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count)
|
||||||
try Task.checkCancellation()
|
try Task.checkCancellation()
|
||||||
@@ -531,6 +534,16 @@ final class SessionController: ObservableObject {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Number of participants recorded in a session's `visual_timeline.json`, used to
/// size chunks in "Auto" mode.
///
/// - Parameter folder: Session folder expected to contain `visual_timeline.json`.
/// - Returns: The participant count, or `nil` when the file cannot be read, cannot
///   be decoded as a `VisualTimeline`, or lists no participants; callers then
///   keep the default body length.
private static func participantCount(in folder: URL) -> Int? {
    let timelineURL = folder.appendingPathComponent("visual_timeline.json")
    // Read and decode failures are deliberately folded into "no count available".
    guard let raw = try? Data(contentsOf: timelineURL) else { return nil }
    guard let timeline = try? JSONDecoder().decode(VisualTimeline.self, from: raw) else { return nil }
    let roster = timeline.participants
    if roster.isEmpty { return nil }
    return roster.count
}
|
||||||
|
|
||||||
/// The remote (vision) visual-timeline segments saved for a session, if any.
|
/// The remote (vision) visual-timeline segments saved for a session, if any.
|
||||||
private static func remoteTimeline(in folder: URL) -> [VisualTimeline.Segment] {
|
private static func remoteTimeline(in folder: URL) -> [VisualTimeline.Segment] {
|
||||||
guard let data = try? Data(contentsOf: folder.appendingPathComponent("visual_timeline.json")),
|
guard let data = try? Data(contentsOf: folder.appendingPathComponent("visual_timeline.json")),
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ final class TranscriptPipeline {
|
|||||||
selfSpans: [VADSpan],
|
selfSpans: [VADSpan],
|
||||||
selfName: String,
|
selfName: String,
|
||||||
systemHealthy: Bool,
|
systemHealthy: Bool,
|
||||||
|
chunkSeconds: Double = 150,
|
||||||
progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
|
progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
|
||||||
let fm = FileManager.default
|
let fm = FileManager.default
|
||||||
let dual = systemHealthy
|
let dual = systemHealthy
|
||||||
@@ -36,7 +37,12 @@ final class TranscriptPipeline {
|
|||||||
let duration = dual
|
let duration = dual
|
||||||
? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL))
|
? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL))
|
||||||
: SessionPackager.duration(of: mixedURL)
|
: SessionPackager.duration(of: mixedURL)
|
||||||
let plan = SessionPackager.planChunks(durationSec: duration)
|
// Chunk to the requested body length; overlap and the single-chunk threshold
|
||||||
|
// scale with it (a 60s body shouldn't be cut by a fixed 15s margin or stay
|
||||||
|
// unchunked below the 2.5-min default threshold).
|
||||||
|
let overlap = ChunkMode.overlapSeconds(forBody: chunkSeconds)
|
||||||
|
let plan = SessionPackager.planChunks(durationSec: duration, chunkSeconds: chunkSeconds,
|
||||||
|
overlapSeconds: overlap, thresholdSec: chunkSeconds * 1.2)
|
||||||
|
|
||||||
// Zero-duration / empty session → a valid empty speakers.json, no backend call.
|
// Zero-duration / empty session → a valid empty speakers.json, no backend call.
|
||||||
if plan.isEmpty || duration <= 0 {
|
if plan.isEmpty || duration <= 0 {
|
||||||
|
|||||||
@@ -60,6 +60,15 @@ final class AppSettings: ObservableObject {
|
|||||||
didSet { defaults.set(reconcileSpeakers, forKey: Keys.reconcileSpeakers) }
|
didSet { defaults.set(reconcileSpeakers, forKey: Keys.reconcileSpeakers) }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Selected diarization chunk length, stored as the raw value of a `ChunkMode`.
/// `.auto` shrinks chunks on large calls so a window is less likely to exceed
/// Sortformer's ~4-speaker cap. Every assignment is written through to `defaults`.
@Published var chunkMode: String {
    didSet {
        defaults.set(chunkMode, forKey: Keys.chunkMode)
    }
}

/// `chunkMode` as a typed `ChunkMode`; a raw value that matches no case resolves
/// to `.auto`.
var chunk: ChunkMode {
    guard let mode = ChunkMode(rawValue: chunkMode) else { return .auto }
    return mode
}
|
||||||
|
|
||||||
/// User-editable recap templates (takeaways categories per meeting type).
|
/// User-editable recap templates (takeaways categories per meeting type).
|
||||||
@Published var recapTemplates: [RecapTemplate] {
|
@Published var recapTemplates: [RecapTemplate] {
|
||||||
didSet { persist(recapTemplates, forKey: Keys.recapTemplates) }
|
didSet { persist(recapTemplates, forKey: Keys.recapTemplates) }
|
||||||
@@ -104,6 +113,7 @@ final class AppSettings: ObservableObject {
|
|||||||
self.autoSendOnStop = defaults.object(forKey: Keys.autoSend) as? Bool ?? false
|
self.autoSendOnStop = defaults.object(forKey: Keys.autoSend) as? Bool ?? false
|
||||||
self.recapEnabled = defaults.object(forKey: Keys.recapEnabled) as? Bool ?? true
|
self.recapEnabled = defaults.object(forKey: Keys.recapEnabled) as? Bool ?? true
|
||||||
self.reconcileSpeakers = defaults.object(forKey: Keys.reconcileSpeakers) as? Bool ?? true
|
self.reconcileSpeakers = defaults.object(forKey: Keys.reconcileSpeakers) as? Bool ?? true
|
||||||
|
self.chunkMode = defaults.string(forKey: Keys.chunkMode) ?? ChunkMode.auto.rawValue
|
||||||
|
|
||||||
let loaded = (defaults.data(forKey: Keys.recapTemplates))
|
let loaded = (defaults.data(forKey: Keys.recapTemplates))
|
||||||
.flatMap { try? JSONDecoder().decode([RecapTemplate].self, from: $0) }
|
.flatMap { try? JSONDecoder().decode([RecapTemplate].self, from: $0) }
|
||||||
@@ -126,6 +136,7 @@ final class AppSettings: ObservableObject {
|
|||||||
static let autoSend = "autoSendOnStop"
|
static let autoSend = "autoSendOnStop"
|
||||||
static let recapEnabled = "recapEnabled"
|
static let recapEnabled = "recapEnabled"
|
||||||
static let reconcileSpeakers = "reconcileSpeakers"
|
static let reconcileSpeakers = "reconcileSpeakers"
|
||||||
|
static let chunkMode = "chunkMode"
|
||||||
static let recapTemplates = "recapTemplates"
|
static let recapTemplates = "recapTemplates"
|
||||||
static let defaultTemplate = "defaultTemplateId"
|
static let defaultTemplate = "defaultTemplateId"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,6 +39,12 @@ struct SettingsView: View {
|
|||||||
Section("Transcription") {
|
Section("Transcription") {
|
||||||
Toggle("Auto-send recordings to backend", isOn: $settings.autoSendOnStop)
|
Toggle("Auto-send recordings to backend", isOn: $settings.autoSendOnStop)
|
||||||
Toggle("Reconcile speakers (merge splits + name from content)", isOn: $settings.reconcileSpeakers)
|
Toggle("Reconcile speakers (merge splits + name from content)", isOn: $settings.reconcileSpeakers)
|
||||||
|
Picker("Chunk length", selection: $settings.chunkMode) {
|
||||||
|
ForEach(ChunkMode.allCases) { Text($0.label).tag($0.rawValue) }
|
||||||
|
}
|
||||||
|
Text("How finely audio is split for diarization. Shorter chunks keep fewer simultaneous speakers per window (the diarizer resolves ~4 at a time), at some cost to speed and voice matching. Auto uses 60-sec chunks when more than \(ChunkMode.autoLargeThreshold) people are detected on the call, else 2.5 min.")
|
||||||
|
.font(.caption)
|
||||||
|
.foregroundStyle(.secondary)
|
||||||
Toggle("Build readable recap (topics + highlights)", isOn: $settings.recapEnabled)
|
Toggle("Build readable recap (topics + highlights)", isOn: $settings.recapEnabled)
|
||||||
HStack {
|
HStack {
|
||||||
Picker("Default recap template", selection: $settings.defaultTemplateId) {
|
Picker("Default recap template", selection: $settings.defaultTemplateId) {
|
||||||
|
|||||||
@@ -15,9 +15,15 @@ final class TimelineBuilder {
|
|||||||
private let closeFrames: Int
|
private let closeFrames: Int
|
||||||
private var aliases: [String: String] = [:] // normalized variant -> canonical
|
private var aliases: [String: String] = [:] // normalized variant -> canonical
|
||||||
private var states: [String: NameState] = [:]
|
private var states: [String: NameState] = [:]
|
||||||
|
private var observed: Set<String> = [] // every tile name seen (speaking or not)
|
||||||
private var lastFrameT: Double = 0
|
private var lastFrameT: Double = 0
|
||||||
private(set) var segments: [VisualTimeline.Segment] = []
|
private(set) var segments: [VisualTimeline.Segment] = []
|
||||||
|
|
||||||
|
/// All distinct names collected in `observed`, in ascending sorted order. These are
/// tile names seen whether or not the participant was ever detected speaking, which
/// makes this the call-size signal behind "Auto" chunk sizing and a complete
/// participant roster (speaking-detection is intentionally sparse).
var observedNames: [String] {
    let roster = observed.sorted()
    return roster
}
|
||||||
|
|
||||||
init(openFrames: Int = 2, closeFrames: Int = 2) {
|
init(openFrames: Int = 2, closeFrames: Int = 2) {
|
||||||
self.openFrames = max(1, openFrames)
|
self.openFrames = max(1, openFrames)
|
||||||
self.closeFrames = max(1, closeFrames)
|
self.closeFrames = max(1, closeFrames)
|
||||||
@@ -34,6 +40,9 @@ final class TimelineBuilder {
|
|||||||
func ingest(_ observations: [SpeakerObservation], at t: TimeInterval) {
|
func ingest(_ observations: [SpeakerObservation], at t: TimeInterval) {
|
||||||
lastFrameT = t
|
lastFrameT = t
|
||||||
|
|
||||||
|
// Record every tile seen (speaking or not) for the participant roster / call size.
|
||||||
|
for obs in observations where !obs.name.isEmpty { observed.insert(canonical(obs.name)) }
|
||||||
|
|
||||||
// Best confidence per canonical name that is speaking this frame.
|
// Best confidence per canonical name that is speaking this frame.
|
||||||
var speaking: [String: Double] = [:]
|
var speaking: [String: Double] = [:]
|
||||||
for obs in observations where obs.speaking && !obs.name.isEmpty {
|
for obs in observations where obs.speaking && !obs.name.isEmpty {
|
||||||
|
|||||||
@@ -75,7 +75,10 @@ final class VisualCapture {
|
|||||||
}, to: durationSec)
|
}, to: durationSec)
|
||||||
|
|
||||||
let artifact = (vision + selfSegs).sorted { $0.start < $1.start }
|
let artifact = (vision + selfSegs).sorted { $0.start < $1.start }
|
||||||
let names = Set(artifact.map { $0.name })
|
// Roster = everyone OCR'd (speaking or not) ∪ the names that produced segments,
|
||||||
|
// so the participant count reflects true call size even when few people were
|
||||||
|
// detected speaking. Drives "Auto" chunk sizing downstream.
|
||||||
|
let names = Set(artifact.map { $0.name }).union(observer.participantNames())
|
||||||
let participants = names.sorted().map {
|
let participants = names.sorted().map {
|
||||||
VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil)
|
VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -114,6 +114,10 @@ final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
|
|||||||
queue.sync { builder.mergeSelfSpans(spans, selfName: selfName) }
|
queue.sync { builder.mergeSelfSpans(spans, selfName: selfName) }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the builder's `observedNames`: every distinct participant name OCR'd
/// over the session. The read is performed synchronously on `queue`; per the
/// original note it is safe to call after `stop`.
// NOTE(review): because this blocks on `queue.sync`, it presumably must not be
// called from `queue` itself if that queue is serial; confirm.
func participantNames() -> [String] {
    return queue.sync {
        builder.observedNames
    }
}
|
||||||
|
|
||||||
// MARK: - SCStreamOutput (on `queue`)
|
// MARK: - SCStreamOutput (on `queue`)
|
||||||
|
|
||||||
func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
|
func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
|
||||||
|
|||||||
@@ -37,6 +37,33 @@ final class Phase5Tests: XCTestCase {
|
|||||||
XCTAssertEqual(asm.speakersFile.segments[0].start, 152, accuracy: 0.01)
|
XCTAssertEqual(asm.speakersFile.segments[0].start, 152, accuracy: 0.01)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// `ChunkMode.bodySeconds(participantCount:)`: fixed presets ignore the participant
/// count, while `.auto` picks 60 s above four detected participants and 150 s
/// otherwise (including when the count is unknown).
func testChunkModeResolvesBodyLength() {
    let expectations: [(mode: ChunkMode, participants: Int?, seconds: Double)] = [
        // Fixed presets ignore participant count.
        (.standard, 99, 150),
        (.largeGroup, 2, 60),
        (.fine, nil, 90),
        // Auto: >4 detected → 60s, ≤4 → 150s, unknown → 150s.
        (.auto, 6, 60),
        (.auto, 4, 150),
        (.auto, nil, 150),
    ]
    for expectation in expectations {
        let resolved = expectation.mode.bodySeconds(participantCount: expectation.participants)
        XCTAssertEqual(resolved, expectation.seconds)
    }
}
|
||||||
|
|
||||||
|
/// `ChunkMode.overlapSeconds(forBody:)` is ~12% of the body, rounded, clamped to 8…15 s.
func testChunkOverlapScalesWithBody() {
    let expectations: [(body: Double, overlap: Double)] = [
        (150, 15),  // 150*0.12=18 → capped at 15
        (60, 8),    // 60*0.12=7.2 → rounds to 7, floored at 8
        (90, 11),   // 90*0.12=10.8 → rounds to 11
    ]
    for expectation in expectations {
        XCTAssertEqual(ChunkMode.overlapSeconds(forBody: expectation.body), expectation.overlap)
    }
}
|
||||||
|
|
||||||
|
/// A 100 s call would be a single chunk at the 2.5-minute default, but with a 60 s
/// body (72 s threshold) it must split, so "Large group" really re-chunks
/// medium-length calls.
func testPlanChunksShortBodyChunksAShortCall() {
    let chunks = SessionPackager.planChunks(
        durationSec: 100,
        chunkSeconds: 60,
        overlapSeconds: 8,
        thresholdSec: 72
    )
    XCTAssertEqual(chunks.count, 2)

    // First body covers the full 60 s.
    let head = chunks[0]
    XCTAssertEqual(head.bodyStart, 0)
    XCTAssertEqual(head.bodyEnd, 60)

    // Second body takes the remaining 40 s up to the end of the call.
    let tail = chunks[1]
    XCTAssertEqual(tail.bodyStart, 60)
    XCTAssertEqual(tail.bodyEnd, 100)
}
|
||||||
|
|
||||||
func testDropStuckSpansRemovesWholeCallCue() {
|
func testDropStuckSpansRemovesWholeCallCue() {
|
||||||
let segs = [
|
let segs = [
|
||||||
VisualTimeline.Segment(start: 0, end: 1900, name: "Grant Gilliam", confidence: 1, source: "vision"), // stuck whole-call tile
|
VisualTimeline.Segment(start: 0, end: 1900, name: "Grant Gilliam", confidence: 1, source: "vision"), // stuck whole-call tile
|
||||||
|
|||||||
Reference in New Issue
Block a user