Files
ten31-transcripts/Ten31Transcripts/Session/SessionController.swift
T
Grant Gilliam 880b56e426 Wire visual capture into the recording lifecycle (failure-isolated)
Visual capture now runs alongside audio: on call start the session picks the
app's adapter, captures the call window on the SAME monotonic clock as the audio
(AudioRecorder.sharedT0Host), and on stop writes visual_timeline.json and hands
the backend the visual segments with mic-VAD self-spans merged. Any visual
failure (no adapter, no window, Screen Recording denied) leaves the session
recording audio-only — the proven path is never blocked or broken.

- CallDetector now emits DetectedCall{app, bundleID, windowID}: the exact
  CGWindowID of the matched Meet browser window (native apps → nil → largest).
- VisualCapture wraps VisualObserver + AdapterRegistry, writes visual_timeline.json.
- AudioRecorder.sharedT0Host() exposes the shared t0 for frame alignment.

Hardened per a 3-lens adversarial review (concurrency / failure-isolation /
data-flow), all 6 confirmed findings fixed:
- P0 (critical): startVisual could adopt a stale capture into a DIFFERENT session
  (cross-session SCStream leak + visual_timeline.json written to the wrong
  folder). Now gated on session identity — generation + recorder ===, still
  .recording — with fail-closed adoption; otherwise the stream is cancelled.
- P1: observer captured the browser's largest window, not the detected Meet
  window. Now targets the exact CGWindowID (pickWindowIndex, unit-tested),
  largest-area only as fallback.
- P2: a startVisual orphaned by a concurrent stop could leak a stream on quit.
  inFlightVisual is registered before the await and drained in prepareForTermination.
- P3: trailing visual gap/segment ends could exceed duration_sec. Clamped in
  VisualCapture (clampSegments/clampGaps, unit-tested).
- P4: capture pixel size used NSScreen.main scale; now uses the scale of the
  display actually hosting the window (OCR clarity on secondary displays).
- VisualObserver.stop() bounds stopCapture() with a 3s timeout (mirrors audio) so
  a wedged stream can't hang finalization.

25/25 XCTest pass. Live validation on real calls still pending.
2026-06-06 10:18:52 -05:00

460 lines
20 KiB
Swift

import Foundation
import Combine
import AppKit
import CoreGraphics
/// Summary of a finished recording session, exposed via
/// `SessionController.lastSession`.
struct SessionInfo: Equatable {
/// Session folder the recording's files were written to.
let folder: URL
/// Location of the mixed recording, as reported by the recorder result.
let mixedURL: URL
/// Recording duration reported by the recorder (also written out as `duration_sec`).
let duration: Double
/// Number of mic-VAD "self" spans in the recording.
let selfSpanCount: Int
}
/// Owns a single recording session: creates the session folder, drives
/// `AudioRecorder` start/stop, tracks elapsed time, and writes the Phase-1
/// preview of mic-VAD self spans. It also reacts to call detection, attaches
/// best-effort visual capture, and hands finished sessions to the backend.
///
/// The lifecycle is serialized through an explicit state machine so start and
/// stop can never interleave (`.starting` → `.recording` → `.finishing`).
@MainActor
final class SessionController: ObservableObject {
/// Lifecycle state of the controller. Start and stop are gated on this value so
/// the two transitions cannot overlap.
enum State: Equatable {
/// No session in progress (the initial state, and the state after a session finishes).
case idle
/// `start()` has run and the recorder's asynchronous start is still pending.
case starting
/// The recorder started successfully; a session is being recorded.
case recording
/// The recorder is being stopped and the session finalized.
case finishing
/// Starting a session failed; the associated value is the failure message.
case error(String)
}
/// Backend transcription status for the most recent session.
enum TranscriptStatus: Equatable {
/// Nothing is being processed (initial value; also set when a session
/// finishes or a processing run is cancelled).
case idle
/// A processing run is in progress; the values come from the pipeline's
/// progress callback.
case processing(Int, Int) // chunk done, total
/// The pipeline returned; carries the speaker and segment counts of its result.
case done(speakers: Int, segments: Int)
/// The pipeline threw; carries the error's localized description.
case failed(String)
}
/// Set in init so `AppDelegate.applicationShouldTerminate` can finalize a
/// recording in progress before the app quits.
static weak var shared: SessionController?
/// Current lifecycle state; see `State`.
@Published private(set) var state: State = .idle
/// Seconds since recording started, refreshed by the timer while a session
/// runs and reset to 0 when the session ends or fails.
@Published private(set) var elapsed: TimeInterval = 0
/// Summary of the most recently finished session (nil until one finishes).
@Published private(set) var lastSession: SessionInfo?
/// Live input peak levels (0–1) while recording, for the UI meters.
@Published private(set) var micLevel: Float = 0
@Published private(set) var systemLevel: Float = 0
/// Surfaced after a session if system audio stopped early.
@Published private(set) var warning: String?
/// Mirrored from `CallDetector` for the UI.
@Published private(set) var detectionStatus: CallDetector.Status = .disabled
/// Backend transcription status for the last session.
@Published private(set) var transcriptStatus: TranscriptStatus = .idle
/// App settings injected at init (output folder, auto-record, backend options).
private let settings: AppSettings
/// Voiceprint store backed by `voiceprints.json` in the output folder;
/// re-created when the output folder setting changes.
private var voiceprints: VoiceprintStore
/// Call detector whose start/end callbacks drive auto-recording.
private let detector = CallDetector()
/// Combine subscriptions created in init.
private var cancellables = Set<AnyCancellable>()
/// Label of the current session: used in the session folder name and passed
/// to the backend as the app name.
private var currentLabel = "manual"
/// Inputs needed to (re)process the last finished session through the backend.
private struct ProcessInputs {
/// Session folder handed to the pipeline as `sessionFolder`.
let folder: URL
/// Session identifier; `finish` sets it to the folder's last path component.
let sessionId: String
/// App name sent to the backend; `finish` sets it to the session label.
let app: String
/// Mixed recording to transcribe.
let mixedURL: URL
/// Timeline segments produced when the session was stopped.
let timeline: [VisualTimeline.Segment]
}
/// Backend inputs for the most recently finished session (nil until one finishes).
private var lastProcess: ProcessInputs?
/// The most recently spawned backend transcription Task, kept so it can be cancelled.
private var processTask: Task<Void, Never>?
/// Recorder of the current session; nil when no session is active.
private var recorder: AudioRecorder?
/// Visual capture for the current session (nil for manual recordings, apps with
/// no adapter, or when the window can't be captured — those record audio-only).
private var visualCapture: VisualCapture?
/// A visual capture whose `start()` is in flight (registered before the await),
/// so `prepareForTermination` can tear it down if its start-Task is orphaned.
private var inFlightVisual: VisualCapture?
/// App + capture target to start visual capture for, set at `start()`. `windowID`
/// pins the exact detected window (e.g. the Meet browser window); nil → largest.
private var pendingCapture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)?
/// Folder of the current session; cleared when the session finishes or fails.
private var currentFolder: URL?
/// Wall-clock time at which the recorder finished starting; basis for `elapsed`.
private var startTime: Date?
/// Repeating timer that refreshes `elapsed` and the level meters.
private var timer: Timer?
/// True when the current session was started by call detection (not the user).
private var autoStarted = false
/// Set if a detected call ends while we're still in `.starting`.
private var pendingAutoStop = false
/// The in-flight start or stop Task, so `prepareForTermination` can await it.
private var lifecycleTask: Task<Void, Never>?
/// Bumped each time a start/stop Task is spawned (Task is a value type, so this
/// is how `prepareForTermination` detects a newly-spawned transition).
private var lifecycleGeneration = 0
/// Creates the controller, registers it as `shared`, and wires up call
/// detection plus the settings observers.
///
/// - Parameter settings: App settings; supplies the output folder for the
///   voiceprint store and the auto-record toggle that enables the detector.
init(settings: AppSettings) {
    self.settings = settings
    let voiceprintsURL = settings.outputFolderURL.appendingPathComponent("voiceprints.json")
    self.voiceprints = VoiceprintStore(fileURL: voiceprintsURL)
    SessionController.shared = self

    // Detector callbacks and status mirroring.
    detector.onCallStart = { [weak self] call in self?.handleCallStart(call) }
    detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
    detector.$status
        .sink { [weak self] status in self?.detectionStatus = status }
        .store(in: &cancellables)

    // Re-point the voiceprint DB if the output folder changes. The in-flight
    // pipeline keeps its own captured reference, so this can't disrupt a run.
    settings.$outputFolderPath
        .dropFirst()
        .sink { [weak self] path in
            guard let self else { return }
            let expanded = (path as NSString).expandingTildeInPath
            let dir = URL(fileURLWithPath: expanded, isDirectory: true)
            self.voiceprints = VoiceprintStore(fileURL: dir.appendingPathComponent("voiceprints.json"))
        }
        .store(in: &cancellables)

    // Follow the auto-record toggle: enable/disable the detector.
    settings.$autoRecordOnDetection
        .sink { [weak self] isEnabled in
            guard let self else { return }
            guard !isEnabled else {
                self.detector.enable()
                return
            }
            self.detector.disable()
            // Don't leave an auto-started session running with no detector —
            // handle both .recording and the in-flight .starting case.
            guard self.autoStarted else { return }
            if self.state == .recording {
                self.stop()
            } else if self.state == .starting {
                self.pendingAutoStop = true
            }
        }
        .store(in: &cancellables)
}
// MARK: - Auto-detection
/// Detector callback: auto-starts a session for the detected call.
///
/// Does nothing when auto-record is off or when a session is already
/// starting, recording, or finishing.
private func handleCallStart(_ call: CallDetector.DetectedCall) {
    guard settings.autoRecordOnDetection else { return }
    let canStart: Bool
    switch state {
    case .idle, .error:
        canStart = true
    case .starting, .recording, .finishing:
        canStart = false // don't disturb an active session
    }
    guard canStart else { return }
    start(label: call.app.label, auto: true,
          capture: (call.app, call.bundleID, call.windowID))
}
/// Detector callback: ends a session that call detection started.
///
/// Manual recordings are never auto-stopped. If the session is still
/// starting, the stop is deferred via `pendingAutoStop` and resolved when
/// the start completes.
private func handleCallEnd() {
    guard autoStarted else { return }
    if state == .recording {
        stop()
    } else if state == .starting {
        pendingAutoStop = true
    }
    // .idle, .error and .finishing need no action.
}
/// True while a session is starting, recording, or finishing.
var isBusy: Bool {
    switch state {
    case .starting, .recording, .finishing:
        return true
    case .idle, .error:
        return false
    }
}
/// Starts a manual recording when idle (or after an error) and stops the
/// current one when recording. Calls made mid-transition are ignored.
func toggle() {
    guard state != .starting, state != .finishing else { return }
    if state == .recording {
        stop()
    } else {
        start()
    }
}
// MARK: - Start / Stop
/// Begins a new session: creates the session folder, builds the recorder, and
/// spawns the lifecycle Task that starts audio and then (best-effort) visual
/// capture.
///
/// - Parameters:
///   - label: Suffix of the session folder name; also stored as the session label.
///   - auto: `true` when call detection (not the user) started the session.
///   - capture: Detected app/window to capture visually; `nil` records audio-only.
private func start(label: String = "manual", auto: Bool = false,
                   capture: (app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?)? = nil) {
    let sessionFolder: URL
    do {
        sessionFolder = try makeSessionFolder(label: label)
    } catch {
        fail("Couldn't create session folder: \(error.localizedDescription)")
        return
    }

    // Per-session bookkeeping.
    currentFolder = sessionFolder
    currentLabel = label
    autoStarted = auto
    pendingAutoStop = false
    pendingCapture = capture

    let newRecorder = AudioRecorder(
        micURL: sessionFolder.appendingPathComponent("mic.wav"),
        systemURL: sessionFolder.appendingPathComponent("system.wav"),
        mixedURL: sessionFolder.appendingPathComponent("mixed_mono_16k.wav"))
    recorder = newRecorder
    warning = nil
    state = .starting

    lifecycleGeneration += 1
    let sessionGeneration = lifecycleGeneration
    lifecycleTask = Task {
        do {
            try await newRecorder.start() // self-tears-down if it throws
        } catch {
            self.handleStartFailure(error)
            return
        }
        self.state = .recording
        self.startTime = Date()
        self.startTimer()

        if self.pendingAutoStop {
            // A detected call may have ended while we were still starting.
            self.pendingAutoStop = false
            self.stop()
        } else {
            // Attach visual capture on the SAME clock (best-effort, audio-only on
            // failure). Pass this session's generation + recorder so a slow start
            // can't adopt itself into a different session that began meanwhile.
            await self.startVisual(t0Host: newRecorder.sharedT0Host(),
                                   generation: sessionGeneration, recorder: newRecorder)
        }
    }
}
/// Turns a recorder start failure into an actionable error state.
///
/// The common case is Screen Recording getting re-checked after a rebuild (the
/// SCStream auth check fails even though CGPreflight reports granted). When the
/// error text looks permission-related, re-prompt and open the Screen Recording
/// settings pane instead of surfacing a cryptic TCC error; otherwise report the
/// error's own description.
private func handleStartFailure(_ error: Error) {
    let lowered = error.localizedDescription.lowercased()
    let permissionHints = ["declined", "tcc", "screen", "permission"]
    guard permissionHints.contains(where: { lowered.contains($0) }) else {
        fail("Couldn't start recording: \(error.localizedDescription)")
        return
    }

    _ = CGRequestScreenCaptureAccess()
    let pane = "x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture"
    if let settingsURL = URL(string: pane) {
        NSWorkspace.shared.open(settingsURL)
    }
    fail("Screen Recording needs re-approval for this build. Toggle Ten31Transcripts off then on in System Settings ▸ Screen Recording, then restart the app.")
}
// MARK: - Visual capture
/// Best-effort: start window capture for the detected app on the audio clock.
/// Any failure (no adapter, no window, Screen Recording denied) leaves
/// `visualCapture` nil and the session records audio-only.
///
/// `generation`/`recorder` identify the session that launched this; because
/// `vc.start()` is a slow async call, a stop + a fresh start can complete during
/// it. We adopt the stream ONLY back into the same session — otherwise we cancel
/// it, so a stale capture can never attach to (or leak into) a different session.
///
/// - Parameters:
///   - t0Host: Shared audio clock origin that the capture is aligned to.
///   - generation: `lifecycleGeneration` value of the launching session.
///   - recorder: Recorder of the launching session, compared by identity.
private func startVisual(t0Host: Double, generation: Int, recorder: AudioRecorder) async {
    guard let capture = pendingCapture else { return } // manual recording → audio-only
    pendingCapture = nil
    guard let vc = VisualCapture(app: capture.app, bundleID: capture.bundleID,
                                 windowID: capture.windowID, t0Host: t0Host) else { return }
    // Register the live capture before the await so a quit (prepareForTermination)
    // can drain it even if this start-Task gets orphaned by a concurrent stop.
    inFlightVisual = vc
    defer { if inFlightVisual === vc { inFlightVisual = nil } }
    do {
        try await vc.start()
        // Adopt only if THIS session still owns the slot (same generation, same
        // recorder, still recording); otherwise discard rather than leak/misattach.
        guard generation == lifecycleGeneration, self.recorder === recorder,
              case .recording = state else {
            await vc.cancel()
            return
        }
        // Adopt synchronously, with no suspension point between the identity
        // check above and the assignment. Awaiting the displaced capture's
        // cancel first would let a stop run in that window and leave `vc`
        // attached to a session that has already been finalized.
        let displaced = visualCapture
        visualCapture = vc
        if let displaced { await displaced.cancel() } // fail-closed
    } catch {
        await vc.cancel() // tear down any partial stream; never break recording
    }
}
/// Ends visual capture (if any) and produces the timeline for the backend.
///
/// When a capture is attached and a session folder is known, the capture is
/// finished with the recorder's self spans and its result is returned. Without
/// a folder the capture is cancelled. In every other case the timeline is built
/// from the mic-VAD self spans alone.
///
/// - Parameters:
///   - result: Recorder result supplying self spans, `t0Unix` and duration.
///   - folder: Session folder, or `nil` if none is known.
/// - Returns: The timeline segments to hand to the backend.
private func stopVisualAndTimeline(_ result: RecordingResult, folder: URL?) async -> [VisualTimeline.Segment] {
    let selfName = settings.selfName
    guard let capture = visualCapture else {
        return TranscriptPipeline.timeline(fromSelfSpans: result.selfSpans, selfName: selfName)
    }
    if let folder {
        visualCapture = nil
        return await capture.finish(
            selfSpans: result.selfSpans, selfName: selfName,
            sessionId: folder.lastPathComponent, t0Unix: result.t0Unix,
            durationSec: result.duration, folder: folder)
    }
    // A capture with nowhere to write: tear it down and fall back to self spans.
    await capture.cancel()
    visualCapture = nil
    return TranscriptPipeline.timeline(fromSelfSpans: result.selfSpans, selfName: selfName)
}
/// Moves the session to `.finishing` and spawns the Task that stops the
/// recorder, finalizes visual capture, and completes the session. Does nothing
/// when there is no recorder.
private func stop() {
    guard let activeRecorder = recorder else { return }
    state = .finishing
    stopTimer()
    // Capture the folder now; the Task runs after this method returns.
    let sessionFolder = currentFolder
    lifecycleGeneration += 1
    lifecycleTask = Task {
        let recording = await activeRecorder.stop()
        let segments = await self.stopVisualAndTimeline(recording, folder: sessionFolder)
        self.finish(recording, timeline: segments)
    }
}
/// Completes a stopped session: releases the recorder, records the session
/// summary and backend inputs, resets per-session state to `.idle`, and
/// auto-sends to the backend when that setting is on.
///
/// - Parameters:
///   - result: Recorder result for the session that just stopped.
///   - timeline: Timeline segments to keep for backend processing.
private func finish(_ result: RecordingResult, timeline: [VisualTimeline.Segment]) {
    recorder = nil
    micLevel = 0
    systemLevel = 0
    if let note = result.systemNote {
        warning = "System audio stopped early: \(note)"
    } else {
        warning = nil
    }
    transcriptStatus = .idle

    if let sessionFolder = currentFolder {
        writeSelfSpans(result, to: sessionFolder)
        lastSession = SessionInfo(
            folder: sessionFolder, mixedURL: result.mixedURL,
            duration: result.duration, selfSpanCount: result.selfSpans.count)
        lastProcess = ProcessInputs(
            folder: sessionFolder, sessionId: sessionFolder.lastPathComponent, app: currentLabel,
            mixedURL: result.mixedURL, timeline: timeline)
    }

    // Read the setting before resetting state, then return to idle.
    let shouldAutoSend = settings.autoSendOnStop
    currentFolder = nil
    autoStarted = false
    pendingAutoStop = false
    elapsed = 0
    state = .idle

    guard shouldAutoSend else { return }
    processLastSession()
}
// MARK: - Backend transcription
/// Send the last finished session to the backend → `speakers.json`. The
/// timeline is the session's visual segments (with mic-VAD self spans merged)
/// when visual capture ran, or the self spans alone otherwise. Safe to call
/// manually ("Send to backend") or automatically on stop.
///
/// Does nothing when no session has finished yet or a run is already marked
/// `.processing`. Status updates from the spawned Task are published only while
/// its session is still the last finished one: `finish` resets the status for
/// each new session, and a run still in flight for an older session must not
/// overwrite it.
func processLastSession() {
    guard let inputs = lastProcess else { return }
    if case .processing = transcriptStatus { return }
    transcriptStatus = .processing(0, 1)
    // Capture by value so later settings/voiceprint changes can't disrupt this run.
    let settings = self.settings
    let voiceprints = self.voiceprints
    let sessionId = inputs.sessionId
    processTask = Task {
        let pipeline = TranscriptPipeline(
            baseURL: settings.backendBaseURL,
            skipTLS: settings.skipTLSVerification,
            voiceprints: voiceprints)
        do {
            let speakers = try await pipeline.process(
                sessionFolder: inputs.folder, sessionId: sessionId, app: inputs.app,
                mixedURL: inputs.mixedURL, timeline: inputs.timeline,
                progress: { done, total in
                    await MainActor.run {
                        self.publishTranscriptStatus(.processing(done, total), forSession: sessionId)
                    }
                })
            self.publishTranscriptStatus(
                .done(speakers: speakers.speakers.count, segments: speakers.segments.count),
                forSession: sessionId)
        } catch is CancellationError {
            self.publishTranscriptStatus(.idle, forSession: sessionId)
        } catch {
            self.publishTranscriptStatus(.failed(error.localizedDescription), forSession: sessionId)
        }
    }
}

/// Publishes `status` only if `sessionId` still identifies the last finished
/// session; updates from a run that belongs to an older session are dropped.
private func publishTranscriptStatus(_ status: TranscriptStatus, forSession sessionId: String) {
    guard lastProcess?.sessionId == sessionId else { return }
    transcriptStatus = status
}
/// Abandons the current start attempt: clears all per-session state, stops the
/// timer, zeroes the meters, and enters `.error` with the given message.
///
/// - Parameter message: Failure text stored in the `.error` state.
private func fail(_ message: String) {
    stopTimer()

    // Drop per-session references. recorder.start() failed before visual
    // started, so nothing is running that would need cancelling.
    recorder = nil
    visualCapture = nil
    inFlightVisual = nil
    pendingCapture = nil
    currentFolder = nil
    autoStarted = false
    pendingAutoStop = false

    // Reset what the UI observes, then publish the error.
    micLevel = 0
    systemLevel = 0
    elapsed = 0
    state = .error(message)
}
/// Called from `applicationShouldTerminate`: flush any in-progress session so
/// its WAV headers are finalized before the process exits. Handles quit while
/// `.starting` and `.finishing`, not just `.recording`.
///
/// Also cancels backend transcription, both before and after the drain:
/// finishing a session during the drain can auto-send it and spawn a new
/// transcription Task, which would otherwise be left running at exit.
func prepareForTermination() async {
    // Cancel any in-flight backend transcription (audio is already saved; the
    // user can resend). The pipeline's checkCancellation + defer clean up chunks.
    processTask?.cancel()
    // Drain whatever lifecycle Task is in flight until nothing is busy. A Stop
    // click landing in an await window can spawn a new stop Task, so loop
    // rather than awaiting a single captured task.
    while isBusy {
        let gen = lifecycleGeneration
        await lifecycleTask?.value
        if state == .recording, let recorder {
            state = .finishing
            stopTimer()
            let folder = currentFolder
            let result = await recorder.stop()
            let timeline = await stopVisualAndTimeline(result, folder: folder)
            finish(result, timeline: timeline)
        } else if lifecycleGeneration == gen {
            break // settled: no new transition was spawned
        }
    }
    // `finish` (run inline above or by a drained stop Task) may have auto-sent
    // the session, replacing `processTask` after the cancel at the top. Cancel
    // again so that run is not left in flight at exit; cancelling twice is harmless.
    processTask?.cancel()
    // A visual start-Task orphaned by a concurrent stop may still hold a live
    // stream that nothing else will tear down before exit — drain it here.
    if let vc = inFlightVisual {
        inFlightVisual = nil
        await vc.cancel()
    }
}
// MARK: - Timer
/// Schedules the repeating 0.1 s timer that refreshes `elapsed` and the
/// mic/system level meters on the main actor.
private func startTimer() {
    timer = Timer.scheduledTimer(withTimeInterval: 0.1, repeats: true) { [weak self] _ in
        Task { @MainActor in
            guard let self else { return }
            if let startedAt = self.startTime {
                self.elapsed = Date().timeIntervalSince(startedAt)
            }
            guard let activeRecorder = self.recorder else { return }
            let levels = activeRecorder.currentLevels()
            self.micLevel = levels.mic
            self.systemLevel = levels.system
        }
    }
}
/// Invalidates the refresh timer (if one is running) and clears the reference.
private func stopTimer() {
    if let activeTimer = timer {
        activeTimer.invalidate()
    }
    timer = nil
}
// MARK: - Files
/// Creates `<output folder>/sessions/<timestamp>_<label>` (including any
/// missing parent directories) and returns its URL.
///
/// - Parameter label: Suffix appended to the timestamp in the folder name.
/// - Throws: Whatever `FileManager.createDirectory` throws.
private func makeSessionFolder(label: String) throws -> URL {
    let sessionFolder = settings.outputFolderURL
        .appendingPathComponent("sessions", isDirectory: true)
        .appendingPathComponent("\(Self.timestamp())_\(label)", isDirectory: true)
    try FileManager.default.createDirectory(at: sessionFolder, withIntermediateDirectories: true)
    return sessionFolder
}
/// Formats the current date as `yyyy-MM-dd'T'HH-mm-ss` using the fixed
/// `en_US_POSIX` locale, for use in session folder names.
private static func timestamp() -> String {
    let formatter = DateFormatter()
    formatter.locale = Locale(identifier: "en_US_POSIX")
    formatter.dateFormat = "yyyy-MM-dd'T'HH-mm-ss"
    let now = Date()
    return formatter.string(from: now)
}
/// Writes `self_vad.json` into the session folder: the Phase-1 preview of the
/// mic-VAD "self" spans (the eventual `visual_timeline.json` `mic_vad`
/// segments), so VAD quality can be eyeballed. Best-effort: serialization or
/// write failures are ignored.
///
/// - Parameters:
///   - result: Recorder result supplying the spans, `t0Unix` and duration.
///   - folder: Session folder to write into.
private func writeSelfSpans(_ result: RecordingResult, to folder: URL) {
    var spans: [[String: Any]] = []
    for span in result.selfSpans {
        spans.append([
            "start": span.start,
            "end": span.end,
            "name": "self",
            "confidence": span.confidence,
            "source": "mic_vad",
        ])
    }
    let payload: [String: Any] = [
        "note": "Phase 1 mic-VAD self spans (preview of visual_timeline segments)",
        "t0_unix": result.t0Unix,
        "duration_sec": result.duration,
        "self_spans": spans,
    ]
    guard let data = try? JSONSerialization.data(
        withJSONObject: payload, options: [.prettyPrinted, .sortedKeys]) else { return }
    let destination = folder.appendingPathComponent("self_vad.json")
    try? data.write(to: destination)
}
}