Files
ten31-transcripts/Ten31Transcripts/Visual/VisualCapture.swift
T
Grant Gilliam a3e3406b28 Make diarization chunk length configurable (Auto + presets)
Chunk size was hardcoded at 2.5-min bodies. Add a Settings control:
Auto / Standard 2.5min / Large group 60s / Fine 90s. Shorter chunks keep fewer
simultaneous speakers per window (Sortformer resolves ~4/chunk), useful for large
calls, at some cost to speed and cross-chunk voice matching.

- ChunkMode (new, pure/testable): mode → body seconds; Auto picks 60s when >4
  participants were detected, else 150s; overlap + single-chunk threshold scale
  with the body length.
- AppSettings.chunkMode (+ typed `chunk`); SettingsView picker with explanation.
- TranscriptPipeline.process gains chunkSeconds; derives overlap/threshold from it.
- SessionController resolves the body from the setting + the session's detected
  participant count (visual_timeline participants) for both send + re-process.
- Participant roster now counts EVERY tile OCR'd, not just who spoke
  (TimelineBuilder.observedNames → VisualObserver → VisualCapture), so the Auto
  call-size signal is meaningful even though speaking-detection is sparse.

Tests: ChunkMode resolution, overlap scaling, short-body re-chunking. 69 pass.
2026-06-09 10:15:16 -05:00

93 lines
4.7 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import Foundation
import CoreGraphics
/// Owns the visual side of one recording session: picks the app's adapter, runs a
/// `VisualObserver` over the call window, and on stop writes `visual_timeline.json`
/// and returns the speaker segments for the backend hand-off.
///
/// Strictly best-effort: if there's no adapter for the app, or the window can't be
/// captured, the session simply records audio-only — visuals never block or break
/// the proven audio path. `init?` returns nil when the app has no visual adapter.
@available(macOS 13.0, *)
final class VisualCapture {
    /// The detected call app this capture was created for.
    let app: CallDetector.DetectedApp
    /// Adapter resolved for `app`; supplies the sampling FPS and adapter version.
    private let adapter: any AppAdapter
    /// Observer that performs the actual window capture for this session.
    private let observer: VisualObserver

    /// Creates a capture for `app`, or returns nil when `AdapterRegistry` has no
    /// adapter for it. The observer is configured with the adapter's preferred FPS.
    init?(app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?, t0Host: Double) {
        guard let resolved = AdapterRegistry.adapter(for: app) else { return nil }
        self.app = app
        self.adapter = resolved
        self.observer = VisualObserver(
            bundleID: bundleID,
            windowID: windowID,
            adapter: resolved,
            t0Host: t0Host,
            fps: resolved.preferredFPS
        )
    }

    /// Start window capture. Throws if the window isn't capturable (no window yet,
    /// Screen Recording denied) — the caller catches and falls back to audio-only.
    func start() async throws {
        try await observer.start()
    }

    /// Stop and discard capture without writing anything (used when the session
    /// ends before capture was fully adopted).
    func cancel() async {
        // The observer's collected output is intentionally thrown away here.
        _ = await observer.stop()
    }

    /// Clamp segment ends to the audio duration and drop any segment whose clamped
    /// end is not after its start. Keeps `visual_timeline.json` internally
    /// consistent and never sends the backend a segment longer than the audio.
    /// When `duration` is not greater than zero the input is returned unchanged.
    static func clampSegments(_ segs: [VisualTimeline.Segment], to duration: Double) -> [VisualTimeline.Segment] {
        guard duration > 0 else { return segs }
        return segs.compactMap { seg -> VisualTimeline.Segment? in
            let cappedEnd = min(seg.end, duration)
            return cappedEnd > seg.start
                ? VisualTimeline.Segment(start: seg.start, end: cappedEnd, name: seg.name,
                                         confidence: seg.confidence, source: seg.source)
                : nil
        }
    }

    /// Gap counterpart of `clampSegments`: clamp gap ends to the audio duration and
    /// drop any gap whose clamped end is not after its start. When `duration` is
    /// not greater than zero the input is returned unchanged.
    static func clampGaps(_ gaps: [VisualTimeline.Gap], to duration: Double) -> [VisualTimeline.Gap] {
        guard duration > 0 else { return gaps }
        return gaps.compactMap { gap -> VisualTimeline.Gap? in
            let cappedEnd = min(gap.end, duration)
            return cappedEnd > gap.start
                ? VisualTimeline.Gap(start: gap.start, end: cappedEnd, reason: gap.reason)
                : nil
        }
    }

    /// Stop capture and write `visual_timeline.json` (the full human-readable picture:
    /// remote visual segments + the mic-VAD self spans, merged). Returns ONLY the
    /// remote (vision) segments — in dual-channel mode the backend names the system
    /// track from these, while self is handled by the mic channel + `self_vad`.
    ///
    /// A failure to write the JSON file is ignored; the remote segments are returned
    /// either way.
    func finish(selfSpans: [VADSpan], selfName: String,
                sessionId: String, t0Unix: Double, durationSec: Double,
                folder: URL) async -> [VisualTimeline.Segment] {
        let (observedSegments, observedGaps) = await observer.stop()

        // The observer stops slightly after audio fixes `durationSec`, so a trailing
        // gap/segment can run past it. Clamp ends so the JSON is internally consistent
        // (and we never hand the backend a segment longer than the audio).
        let remoteSegments = Self.clampSegments(observedSegments, to: durationSec) // remote speakers
        let clampedGaps = Self.clampGaps(observedGaps, to: durationSec)

        // Self speech comes from mic VAD spans, relabelled as timeline segments.
        let micSegments = selfSpans.map { span in
            VisualTimeline.Segment(start: span.start, end: span.end, name: selfName,
                                   confidence: span.confidence, source: "mic_vad")
        }
        let selfSegments = Self.clampSegments(micSegments, to: durationSec)

        let merged = (remoteSegments + selfSegments).sorted { lhs, rhs in lhs.start < rhs.start }

        // Roster = everyone OCR'd (speaking or not) plus the names that produced
        // segments, so the participant count reflects true call size even when few
        // people were detected speaking. Drives "Auto" chunk sizing downstream.
        var rosterNames = Set(merged.map { $0.name })
        rosterNames.formUnion(observer.participantNames())
        let participants = rosterNames.sorted().map { name in
            VisualTimeline.Participant(name: name, isSelf: name == selfName ? true : nil, aliases: nil)
        }

        let timeline = VisualTimeline(
            sessionId: sessionId,
            app: app.label,
            adapterVersion: adapter.adapterVersion,
            t0Unix: t0Unix,
            durationSec: durationSec,
            fpsSampled: adapter.preferredFPS,
            selfName: selfName,
            participants: participants,
            segments: merged,
            visualGaps: clampedGaps
        )

        // Best-effort write: visuals must never break the session.
        let destination = folder.appendingPathComponent("visual_timeline.json")
        try? timeline.write(to: destination)

        return remoteSegments
    }
}