863136aeec
Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
378 lines
15 KiB
Swift
378 lines
15 KiB
Swift
import Foundation
|
||
import Combine
|
||
import AppKit
|
||
import CoreGraphics
|
||
|
||
/// Immutable summary of a finished recording session, published for the UI.
struct SessionInfo: Equatable {
    /// Directory that holds this session's files.
    let folder: URL

    /// Location of the session's mixed-down audio file.
    let mixedURL: URL

    /// Length of the recording as reported by the recorder
    /// (presumably seconds — confirm against `RecordingResult.duration`).
    let duration: Double

    /// How many mic-VAD "self" spans the session produced.
    let selfSpanCount: Int
}
|
||
|
||
/// Owns a single recording session end to end: creates the session folder,
/// drives `AudioRecorder` start/stop, tracks elapsed time and input levels,
/// writes the mic-VAD self-span preview, reacts to `CallDetector` events for
/// auto-record, and hands finished sessions to `TranscriptPipeline`.
///
/// The lifecycle is serialized through an explicit state machine so start and
/// stop can never interleave (`.starting` → `.recording` → `.finishing`).
@MainActor
final class SessionController: ObservableObject {
    /// Recording lifecycle state.
    enum State: Equatable {
        case idle
        case starting
        case recording
        case finishing
        case error(String)
    }

    /// Backend transcription status for the most recent session.
    enum TranscriptStatus: Equatable {
        case idle
        case processing(Int, Int) // chunk done, total
        case done(speakers: Int, segments: Int)
        case failed(String)
    }

    /// Set in init so `AppDelegate.applicationShouldTerminate` can finalize a
    /// recording in progress before the app quits.
    static weak var shared: SessionController?

    @Published private(set) var state: State = .idle
    @Published private(set) var elapsed: TimeInterval = 0
    @Published private(set) var lastSession: SessionInfo?
    /// Live input peak levels (0…1) while recording, for the UI meters.
    @Published private(set) var micLevel: Float = 0
    @Published private(set) var systemLevel: Float = 0
    /// Surfaced after a session if system audio stopped early.
    @Published private(set) var warning: String?
    /// Mirrored from `CallDetector` for the UI.
    @Published private(set) var detectionStatus: CallDetector.Status = .disabled
    /// Backend transcription status for the last session.
    @Published private(set) var transcriptStatus: TranscriptStatus = .idle

    private let settings: AppSettings
    private var voiceprints: VoiceprintStore
    private let detector = CallDetector()
    private var cancellables = Set<AnyCancellable>()
    private var currentLabel = "manual"

    /// Inputs needed to (re)process the last finished session through the backend.
    private struct ProcessInputs {
        let folder: URL
        let sessionId: String
        let app: String
        let mixedURL: URL
        let selfSpans: [VADSpan]
    }

    private var lastProcess: ProcessInputs?
    private var processTask: Task<Void, Never>?
    /// Identifies the pipeline run that currently owns `transcriptStatus`.
    /// Bumped whenever a run starts or a new session finishes, so a run that
    /// belongs to an older session can no longer overwrite the status.
    private var processGeneration = 0
    private var recorder: AudioRecorder?
    private var currentFolder: URL?
    private var startTime: Date?
    private var timer: Timer?
    /// True when the current session was started by call detection (not the user).
    private var autoStarted = false
    /// Set if a detected call ends while we're still in `.starting`.
    private var pendingAutoStop = false
    /// The in-flight start or stop Task, so `prepareForTermination` can await it.
    private var lifecycleTask: Task<Void, Never>?
    /// Bumped each time a start/stop Task is spawned (Task is a value type, so this
    /// is how `prepareForTermination` detects a newly-spawned transition).
    private var lifecycleGeneration = 0
    /// True while `prepareForTermination` is draining; suppresses backend sends
    /// so quitting never kicks off a fresh upload.
    private var isTerminating = false

    /// Formatter for session-folder timestamps. Built once; the time zone is
    /// auto-updating so output matches a freshly created formatter.
    private static let folderTimestampFormatter: DateFormatter = {
        let formatter = DateFormatter()
        formatter.locale = Locale(identifier: "en_US_POSIX")
        formatter.timeZone = .autoupdatingCurrent
        formatter.dateFormat = "yyyy-MM-dd'T'HH-mm-ss"
        return formatter
    }()

    /// Wires the controller to `settings` and the call detector.
    ///
    /// - Parameter settings: App settings; the output folder, auto-record,
    ///   auto-send and backend options are read from it.
    init(settings: AppSettings) {
        self.settings = settings
        self.voiceprints = VoiceprintStore(
            fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json"))
        SessionController.shared = self

        detector.onCallStart = { [weak self] app in self?.handleCallStart(app) }
        detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
        detector.$status
            .sink { [weak self] status in self?.detectionStatus = status }
            .store(in: &cancellables)

        // Re-point the voiceprint DB if the output folder changes. The in-flight
        // pipeline keeps its own captured reference, so this can't disrupt a run.
        settings.$outputFolderPath
            .dropFirst()
            .sink { [weak self] path in
                guard let self else { return }
                let dir = URL(fileURLWithPath: (path as NSString).expandingTildeInPath, isDirectory: true)
                self.voiceprints = VoiceprintStore(fileURL: dir.appendingPathComponent("voiceprints.json"))
            }
            .store(in: &cancellables)

        settings.$autoRecordOnDetection
            .sink { [weak self] on in
                guard let self else { return }
                if on {
                    self.detector.enable()
                    return
                }
                self.detector.disable()
                // Don't leave an auto-started session running with no detector —
                // handle both .recording and the in-flight .starting case.
                guard self.autoStarted else { return }
                switch self.state {
                case .recording: self.stop()
                case .starting: self.pendingAutoStop = true
                case .idle, .finishing, .error: break
                }
            }
            .store(in: &cancellables)
    }

    // MARK: - Auto-detection

    /// Starts an auto-recording for a detected call, unless auto-record is off
    /// or a session is already active.
    private func handleCallStart(_ app: CallDetector.DetectedApp) {
        guard settings.autoRecordOnDetection else { return }
        switch state {
        case .idle, .error: start(label: app.label, auto: true)
        case .starting, .recording, .finishing: break // don't disturb an active session
        }
    }

    /// Stops the session when the detected call ends.
    private func handleCallEnd() {
        // Only auto-stop a session we auto-started; never a manual recording.
        guard autoStarted else { return }
        switch state {
        case .recording: stop()
        case .starting: pendingAutoStop = true // resolved when start() completes
        case .idle, .error, .finishing: break
        }
    }

    /// True while a session is starting, recording, or finishing.
    var isBusy: Bool {
        switch state {
        case .starting, .recording, .finishing: return true
        case .idle, .error: return false
        }
    }

    /// Record-button action: starts from idle/error, stops while recording.
    func toggle() {
        switch state {
        case .idle, .error: start()
        case .recording: stop()
        case .starting, .finishing: break // ignore taps mid-transition
        }
    }

    // MARK: - Start / Stop

    /// Creates the session folder and recorder, then starts recording
    /// asynchronously, moving `.starting` → `.recording` (or `.error`).
    ///
    /// - Parameters:
    ///   - label: Suffix for the session folder name; also stored as the app
    ///     name passed to the backend.
    ///   - auto: `true` when call detection (not the user) started the session.
    private func start(label: String = "manual", auto: Bool = false) {
        let folder: URL
        do {
            folder = try makeSessionFolder(label: label)
        } catch {
            fail("Couldn't create session folder: \(error.localizedDescription)")
            return
        }
        currentFolder = folder
        currentLabel = label
        autoStarted = auto
        pendingAutoStop = false
        let recorder = AudioRecorder(
            micURL: folder.appendingPathComponent("mic.wav"),
            systemURL: folder.appendingPathComponent("system.wav"),
            mixedURL: folder.appendingPathComponent("mixed_mono_16k.wav"))
        self.recorder = recorder
        warning = nil
        state = .starting

        lifecycleGeneration += 1
        lifecycleTask = Task {
            do {
                try await recorder.start() // self-tears-down if it throws
                self.state = .recording
                self.startTime = Date()
                self.startTimer()
                // A detected call may have ended while we were still starting.
                if self.pendingAutoStop {
                    self.pendingAutoStop = false
                    self.stop()
                }
            } catch {
                self.handleStartFailure(error)
            }
        }
    }

    /// Map a recorder start failure to an actionable message. The common case is
    /// Screen Recording getting re-checked after a rebuild (the SCStream auth
    /// check fails even though CGPreflight reports granted), so re-prompt and open
    /// the right Settings pane rather than show a cryptic TCC error.
    private func handleStartFailure(_ error: Error) {
        let msg = error.localizedDescription.lowercased()
        // NOTE(review): matching is by substring, so any error that mentions
        // "permission" or "tcc" is routed to the Screen Recording advice —
        // confirm that other permission failures cannot land here.
        let screenIssue = msg.contains("declined") || msg.contains("tcc")
            || msg.contains("screen") || msg.contains("permission")
        if screenIssue {
            _ = CGRequestScreenCaptureAccess()
            if let url = URL(string: "x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture") {
                NSWorkspace.shared.open(url)
            }
            fail("Screen Recording needs re-approval for this build. Toggle Ten31Transcripts off then on in System Settings ▸ Screen Recording, then restart the app.")
        } else {
            fail("Couldn't start recording: \(error.localizedDescription)")
        }
    }

    /// Moves to `.finishing` and stops the recorder asynchronously; `finish`
    /// runs once the recorder reports its result.
    private func stop() {
        guard let recorder else { return }
        state = .finishing
        stopTimer()
        lifecycleGeneration += 1
        lifecycleTask = Task {
            let result = await recorder.stop()
            self.finish(result)
        }
    }

    /// Publishes the finished session, resets per-session state to idle, and
    /// auto-sends to the backend when enabled (never while the app is quitting).
    private func finish(_ result: RecordingResult) {
        recorder = nil
        startTime = nil
        micLevel = 0
        systemLevel = 0
        warning = result.systemNote.map { "System audio stopped early: \($0)" }
        // A pipeline still running for an older session keeps working, but from
        // here on the status belongs to this session, so detach it first.
        processGeneration += 1
        transcriptStatus = .idle
        if let folder = currentFolder {
            writeSelfSpans(result, to: folder)
            lastSession = SessionInfo(
                folder: folder, mixedURL: result.mixedURL,
                duration: result.duration, selfSpanCount: result.selfSpans.count)
            lastProcess = ProcessInputs(
                folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
                mixedURL: result.mixedURL, selfSpans: result.selfSpans)
        }
        let autoSend = settings.autoSendOnStop && !isTerminating
        currentFolder = nil
        autoStarted = false
        pendingAutoStop = false
        elapsed = 0
        state = .idle
        if autoSend { processLastSession() }
    }

    // MARK: - Backend transcription

    /// Send the last finished session to the backend → `speakers.json`. Uses the
    /// mic-VAD self spans as the timeline for now; visual segments (Phase 3–4) get
    /// merged in once the adapters land. Safe to call manually ("Send to backend")
    /// or automatically on stop.
    ///
    /// Does nothing when there is no finished session, when a send for the
    /// current session is already in flight, or while the app is terminating.
    func processLastSession() {
        guard !isTerminating, let inputs = lastProcess else { return }
        if case .processing = transcriptStatus { return }
        processGeneration += 1
        let generation = processGeneration
        transcriptStatus = .processing(0, 1)

        let settings = self.settings
        let voiceprints = self.voiceprints
        processTask = Task {
            let pipeline = TranscriptPipeline(
                baseURL: settings.backendBaseURL,
                skipTLS: settings.skipTLSVerification,
                voiceprints: voiceprints)
            let timeline = TranscriptPipeline.timeline(
                fromSelfSpans: inputs.selfSpans, selfName: settings.selfName)
            do {
                let speakers = try await pipeline.process(
                    sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
                    mixedURL: inputs.mixedURL, timeline: timeline,
                    progress: { done, total in
                        await MainActor.run {
                            self.publishTranscriptStatus(.processing(done, total), for: generation)
                        }
                    })
                self.publishTranscriptStatus(
                    .done(speakers: speakers.speakers.count, segments: speakers.segments.count),
                    for: generation)
            } catch {
                // A cancelled run may fail with an error other than
                // `CancellationError` (networking layers commonly report their own
                // "cancelled" error), so also consult the task's cancelled flag
                // rather than showing a cancellation as a failure.
                let wasCancelled = error is CancellationError || Task.isCancelled
                self.publishTranscriptStatus(
                    wasCancelled ? .idle : .failed(error.localizedDescription),
                    for: generation)
            }
        }
    }

    /// Applies `status` only if `generation` still owns the transcript status,
    /// dropping writes from a pipeline run that has been superseded.
    private func publishTranscriptStatus(_ status: TranscriptStatus, for generation: Int) {
        guard generation == processGeneration else { return }
        transcriptStatus = status
    }

    /// Abandons the current session and surfaces `message` as the error state.
    private func fail(_ message: String) {
        recorder = nil
        currentFolder = nil
        startTime = nil
        autoStarted = false
        pendingAutoStop = false
        stopTimer()
        micLevel = 0
        systemLevel = 0
        elapsed = 0
        state = .error(message)
    }

    /// Called from `applicationShouldTerminate`: flush any in-progress session so
    /// its WAV headers are finalized before the process exits. Handles quit while
    /// `.starting` and `.finishing`, not just `.recording`.
    func prepareForTermination() async {
        // Finalizing a recording below runs `finish`, which would otherwise
        // auto-send and start a brand-new upload just as the app exits.
        isTerminating = true
        defer { isTerminating = false }
        // Cancel any in-flight backend transcription (audio is already saved; the
        // user can resend). The pipeline's checkCancellation + defer clean up chunks.
        processTask?.cancel()
        // Drain whatever lifecycle Task is in flight until nothing is busy. A Stop
        // click landing in an await window can spawn a new stop Task, so loop
        // rather than awaiting a single captured task.
        while isBusy {
            let gen = lifecycleGeneration
            await lifecycleTask?.value
            if state == .recording, let recorder {
                state = .finishing
                stopTimer()
                finish(await recorder.stop())
            } else if lifecycleGeneration == gen {
                break // settled: no new transition was spawned
            }
        }
    }

    // MARK: - Timer

    /// Starts the 0.1 s UI tick that refreshes `elapsed` and the level meters.
    private func startTimer() {
        timer = Timer.scheduledTimer(withTimeInterval: 0.1, repeats: true) { [weak self] _ in
            Task { @MainActor in
                guard let self else { return }
                if let start = self.startTime { self.elapsed = Date().timeIntervalSince(start) }
                if let recorder = self.recorder {
                    let levels = recorder.currentLevels()
                    self.micLevel = levels.mic
                    self.systemLevel = levels.system
                }
            }
        }
    }

    /// Invalidates and drops the UI tick timer.
    private func stopTimer() {
        timer?.invalidate()
        timer = nil
    }

    // MARK: - Files

    /// Creates `<output>/sessions/<timestamp>_<label>` and returns its URL.
    ///
    /// - Throws: Whatever `FileManager.createDirectory` throws.
    private func makeSessionFolder(label: String) throws -> URL {
        // Keep the label a single path component: a "/" would nest directories
        // (and make the session id differ from the folder name), and ":" is
        // shown as a path separator by Finder.
        let safeLabel = label
            .components(separatedBy: CharacterSet(charactersIn: "/:"))
            .joined(separator: "-")
        let base = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true)
        let folder = base.appendingPathComponent("\(Self.timestamp())_\(safeLabel)", isDirectory: true)
        try FileManager.default.createDirectory(at: folder, withIntermediateDirectories: true)
        return folder
    }

    /// Current local time formatted as `yyyy-MM-dd'T'HH-mm-ss` for folder names.
    private static func timestamp() -> String {
        folderTimestampFormatter.string(from: Date())
    }

    /// Phase-1 preview of the mic-VAD "self" spans (the eventual
    /// `visual_timeline.json` `mic_vad` segments). Lets us eyeball VAD quality.
    ///
    /// Best-effort: a failure is logged and never interrupts finishing a session.
    private func writeSelfSpans(_ result: RecordingResult, to folder: URL) {
        let segments = result.selfSpans.map { span -> [String: Any] in
            ["start": span.start, "end": span.end, "name": "self",
             "confidence": span.confidence, "source": "mic_vad"]
        }
        let object: [String: Any] = [
            "note": "Phase 1 mic-VAD self spans (preview of visual_timeline segments)",
            "t0_unix": result.t0Unix,
            "duration_sec": result.duration,
            "self_spans": segments,
        ]
        // `JSONSerialization.data` raises an Objective-C exception — which `try`
        // cannot catch — for values it can't encode (e.g. NaN or infinite
        // numbers), so validate first instead of risking a crash.
        guard JSONSerialization.isValidJSONObject(object) else {
            NSLog("SessionController: skipped self_vad.json (payload is not valid JSON)")
            return
        }
        do {
            let data = try JSONSerialization.data(
                withJSONObject: object, options: [.prettyPrinted, .sortedKeys])
            try data.write(to: folder.appendingPathComponent("self_vad.json"))
        } catch {
            NSLog("SessionController: couldn't write self_vad.json: %@", error.localizedDescription)
        }
    }
}
|