a3e3406b28
Chunk size was hardcoded at 2.5-min bodies. Add a Settings control: Auto / Standard 2.5min / Large group 60s / Fine 90s. Shorter chunks keep fewer simultaneous speakers per window (Sortformer resolves ~4/chunk), useful for large calls, at some cost to speed and cross-chunk voice matching. - ChunkMode (new, pure/testable): mode → body seconds; Auto picks 60s when >4 participants were detected, else 150s; overlap + single-chunk threshold scale with the body length. - AppSettings.chunkMode (+ typed `chunk`); SettingsView picker with explanation. - TranscriptPipeline.process gains chunkSeconds; derives overlap/threshold from it. - SessionController resolves the body from the setting + the session's detected participant count (visual_timeline participants) for both send + re-process. - Participant roster now counts EVERY tile OCR'd, not just who spoke (TimelineBuilder.observedNames → VisualObserver → VisualCapture), so the Auto call-size signal is meaningful even though speaking-detection is sparse. Tests: ChunkMode resolution, overlap scaling, short-body re-chunking. 69 pass.
93 lines
4.7 KiB
Swift
93 lines
4.7 KiB
Swift
import Foundation
|
||
import CoreGraphics
|
||
|
||
/// Owns the visual side of one recording session: picks the app's adapter, runs a
/// `VisualObserver` over the call window, and on stop writes `visual_timeline.json`
/// and returns the speaker segments for the backend hand-off.
///
/// Strictly best-effort: if there's no adapter for the app, or the window can't be
/// captured, the session simply records audio-only — visuals never block or break
/// the proven audio path. `init?` returns nil when the app has no visual adapter.
@available(macOS 13.0, *)
final class VisualCapture {
    /// The detected call app this capture was created for.
    let app: CallDetector.DetectedApp
    /// App-specific adapter resolved from `AdapterRegistry` at init time.
    private let adapter: any AppAdapter
    /// Observer that performs the actual window capture for this session.
    private let observer: VisualObserver

    /// Creates a capture for `app`, or returns nil when `AdapterRegistry` has no
    /// adapter for it.
    ///
    /// - Parameters:
    ///   - app: The detected call app; used to look up the adapter.
    ///   - bundleID: Forwarded unchanged to the `VisualObserver`.
    ///   - windowID: Forwarded unchanged to the `VisualObserver` (may be nil).
    ///   - t0Host: Forwarded unchanged to the `VisualObserver`.
    init?(app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?, t0Host: Double) {
        guard let adapter = AdapterRegistry.adapter(for: app) else { return nil }
        self.app = app
        self.adapter = adapter
        // The observer samples at whatever rate the adapter prefers.
        self.observer = VisualObserver(bundleID: bundleID, windowID: windowID, adapter: adapter,
                                       t0Host: t0Host, fps: adapter.preferredFPS)
    }

    /// Start window capture. Throws if the window isn't capturable (no window yet,
    /// Screen Recording denied) — the caller catches and falls back to audio-only.
    func start() async throws {
        try await observer.start()
    }

    /// Stop and discard capture without writing anything (used when the session
    /// ends before capture was fully adopted).
    func cancel() async {
        // The observer's result is intentionally dropped.
        _ = await observer.stop()
    }

    /// Clamp segment ends to the audio duration; drop any that become empty. Keeps
    /// `visual_timeline.json` internally consistent and never sends the backend a
    /// segment longer than the audio. (`duration <= 0` → passthrough.)
    ///
    /// - Parameters:
    ///   - segs: Segments to clamp; starts are never modified.
    ///   - duration: Upper bound applied to each segment's `end`.
    /// - Returns: The segments with `end` capped at `duration`, in their original
    ///   order, omitting any whose capped `end` is not strictly after its `start`.
    static func clampSegments(_ segs: [VisualTimeline.Segment], to duration: Double) -> [VisualTimeline.Segment] {
        guard duration > 0 else { return segs }
        return segs.compactMap { s in
            let end = min(s.end, duration)
            guard end > s.start else { return nil }
            return .init(start: s.start, end: end, name: s.name, confidence: s.confidence, source: s.source)
        }
    }

    /// Clamp gap ends to the audio duration; drop any that become empty. Same rule
    /// as `clampSegments`, applied to gaps. (`duration <= 0` → passthrough.)
    ///
    /// - Parameters:
    ///   - gaps: Gaps to clamp; starts are never modified.
    ///   - duration: Upper bound applied to each gap's `end`.
    /// - Returns: The gaps with `end` capped at `duration`, in their original order,
    ///   omitting any whose capped `end` is not strictly after its `start`.
    static func clampGaps(_ gaps: [VisualTimeline.Gap], to duration: Double) -> [VisualTimeline.Gap] {
        guard duration > 0 else { return gaps }
        return gaps.compactMap { g in
            let end = min(g.end, duration)
            guard end > g.start else { return nil }
            return .init(start: g.start, end: end, reason: g.reason)
        }
    }

    /// Stop capture and write `visual_timeline.json` (the full human-readable picture:
    /// remote visual segments + the mic-VAD self spans, merged). Returns ONLY the
    /// remote (vision) segments — in dual-channel mode the backend names the system
    /// track from these, while self is handled by the mic channel + `self_vad`.
    ///
    /// - Parameters:
    ///   - selfSpans: Mic-VAD spans; each becomes a segment named `selfName` with
    ///     source `"mic_vad"`.
    ///   - selfName: Name given to the self segments and used to flag the self
    ///     participant in the roster.
    ///   - sessionId: Recorded in the timeline as-is.
    ///   - t0Unix: Recorded in the timeline as-is.
    ///   - durationSec: Audio duration; every segment and gap end is clamped to it.
    ///   - folder: Directory that receives `visual_timeline.json`.
    /// - Returns: The clamped vision segments only (self segments are not included).
    func finish(selfSpans: [VADSpan], selfName: String,
                sessionId: String, t0Unix: Double, durationSec: Double,
                folder: URL) async -> [VisualTimeline.Segment] {
        let (rawSegments, rawGaps) = await observer.stop()

        // The observer stops slightly after audio fixes `durationSec`, so a trailing
        // gap/segment can run past it. Clamp ends so the JSON is internally consistent
        // (and we never hand the backend a segment longer than the audio).
        let vision = Self.clampSegments(rawSegments, to: durationSec)   // remote speakers
        let gaps = Self.clampGaps(rawGaps, to: durationSec)
        let selfSegs = Self.clampSegments(selfSpans.map {
            VisualTimeline.Segment(start: $0.start, end: $0.end, name: selfName,
                                   confidence: $0.confidence, source: "mic_vad")
        }, to: durationSec)

        let artifact = (vision + selfSegs).sorted { $0.start < $1.start }
        // Roster = everyone OCR'd (speaking or not) ∪ the names that produced segments,
        // so the participant count reflects true call size even when few people were
        // detected speaking. Drives "Auto" chunk sizing downstream.
        let names = Set(artifact.map(\.name)).union(observer.participantNames())
        let participants = names.sorted().map {
            VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil)
        }
        let timeline = VisualTimeline(
            sessionId: sessionId, app: app.label, adapterVersion: adapter.adapterVersion,
            t0Unix: t0Unix, durationSec: durationSec, fpsSampled: adapter.preferredFPS,
            selfName: selfName, participants: participants, segments: artifact, visualGaps: gaps)

        // Writing the artifact stays best-effort: a failure must not break the audio
        // path or change the return value. It is logged rather than swallowed so a
        // missing `visual_timeline.json` can be diagnosed.
        do {
            try timeline.write(to: folder.appendingPathComponent("visual_timeline.json"))
        } catch {
            NSLog("VisualCapture: failed to write visual_timeline.json: %@", String(describing: error))
        }
        return vision
    }
}
|