Phases 2-6: detection, visual timeline, backend hand-off, voiceprints

Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
2026-06-06 00:15:49 -05:00
parent fd7e1a5907
commit 863136aeec
27 changed files with 2108 additions and 22 deletions
@@ -1,6 +1,7 @@
 import Foundation
 import Combine
 import AppKit
+import CoreGraphics

 struct SessionInfo: Equatable {
    let folder: URL
@@ -25,6 +26,14 @@ final class SessionController: ObservableObject {
        case error(String)
    }

+    /// Backend transcription status for the most recent session.
+    enum TranscriptStatus: Equatable {
+        case idle
+        case processing(Int, Int)              // chunk done, total
+        case done(speakers: Int, segments: Int)
+        case failed(String)
+    }
+
    /// Set in init so `AppDelegate.applicationShouldTerminate` can finalize a
    /// recording in progress before the app quits.
    static weak var shared: SessionController?
@@ -37,12 +46,34 @@ final class SessionController: ObservableObject {
    @Published private(set) var systemLevel: Float = 0
    /// Surfaced after a session if system audio stopped early.
    @Published private(set) var warning: String?
+    /// Mirrored from `CallDetector` for the UI.
+    @Published private(set) var detectionStatus: CallDetector.Status = .disabled
+    /// Backend transcription status for the last session.
+    @Published private(set) var transcriptStatus: TranscriptStatus = .idle

    private let settings: AppSettings
+    private var voiceprints: VoiceprintStore
+    private let detector = CallDetector()
+    private var cancellables = Set<AnyCancellable>()
+    private var currentLabel = "manual"
+    /// Inputs needed to (re)process the last finished session through the backend.
+    private struct ProcessInputs {
+        let folder: URL
+        let sessionId: String
+        let app: String
+        let mixedURL: URL
+        let selfSpans: [VADSpan]
+    }
+    private var lastProcess: ProcessInputs?
+    private var processTask: Task<Void, Never>?
    private var recorder: AudioRecorder?
    private var currentFolder: URL?
    private var startTime: Date?
    private var timer: Timer?
+    /// True when the current session was started by call detection (not the user).
+    private var autoStarted = false
+    /// Set if a detected call ends while we're still in `.starting`.
+    private var pendingAutoStop = false
    /// The in-flight start or stop Task, so `prepareForTermination` can await it.
    private var lifecycleTask: Task<Void, Never>?
    /// Bumped each time a start/stop Task is spawned (Task is a value type, so this
@@ -51,7 +82,64 @@ final class SessionController: ObservableObject {

    init(settings: AppSettings) {
        self.settings = settings
+        self.voiceprints = VoiceprintStore(
+            fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json"))
        SessionController.shared = self
+
+        detector.onCallStart = { [weak self] app in self?.handleCallStart(app) }
+        detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
+        detector.$status
+            .sink { [weak self] status in self?.detectionStatus = status }
+            .store(in: &cancellables)
+        // Re-point the voiceprint DB if the output folder changes. The in-flight
+        // pipeline keeps its own captured reference, so this can't disrupt a run.
+        settings.$outputFolderPath
+            .dropFirst()
+            .sink { [weak self] path in
+                guard let self else { return }
+                let dir = URL(fileURLWithPath: (path as NSString).expandingTildeInPath, isDirectory: true)
+                self.voiceprints = VoiceprintStore(fileURL: dir.appendingPathComponent("voiceprints.json"))
+            }
+            .store(in: &cancellables)
+        settings.$autoRecordOnDetection
+            .sink { [weak self] on in
+                guard let self else { return }
+                if on {
+                    self.detector.enable()
+                } else {
+                    self.detector.disable()
+                    // Don't leave an auto-started session running with no detector —
+                    // handle both .recording and the in-flight .starting case.
+                    if self.autoStarted {
+                        switch self.state {
+                        case .recording: self.stop()
+                        case .starting:  self.pendingAutoStop = true
+                        default:         break
+                        }
+                    }
+                }
+            }
+            .store(in: &cancellables)
+    }
+
+    // MARK: - Auto-detection
+
+    private func handleCallStart(_ app: CallDetector.DetectedApp) {
+        guard settings.autoRecordOnDetection else { return }
+        switch state {
+        case .idle, .error: start(label: app.label, auto: true)
+        case .starting, .recording, .finishing: break   // don't disturb an active session
+        }
+    }
+
+    private func handleCallEnd() {
+        // Only auto-stop a session we auto-started; never a manual recording.
+        guard autoStarted else { return }
+        switch state {
+        case .recording: stop()
+        case .starting:  pendingAutoStop = true   // resolved when start() completes
+        case .idle, .error, .finishing: break
+        }
    }

    var isBusy: Bool {
@@ -68,15 +156,18 @@ final class SessionController: ObservableObject {

    // MARK: - Start / Stop

-    private func start() {
+    private func start(label: String = "manual", auto: Bool = false) {
        let folder: URL
        do {
-            folder = try makeSessionFolder()
+            folder = try makeSessionFolder(label: label)
        } catch {
            fail("Couldn't create session folder: \(error.localizedDescription)")
            return
        }
        currentFolder = folder
+        currentLabel = label
+        autoStarted = auto
+        pendingAutoStop = false
        let recorder = AudioRecorder(
            micURL: folder.appendingPathComponent("mic.wav"),
            systemURL: folder.appendingPathComponent("system.wav"),
@@ -92,12 +183,36 @@ final class SessionController: ObservableObject {
                self.state = .recording
                self.startTime = Date()
                self.startTimer()
+                // A detected call may have ended while we were still starting.
+                if self.pendingAutoStop {
+                    self.pendingAutoStop = false
+                    self.stop()
+                }
            } catch {
-                self.fail("Couldn't start recording: \(error.localizedDescription)")
+                self.handleStartFailure(error)
            }
        }
    }

+    /// Map a recorder start failure to an actionable message. The common case is
+    /// Screen Recording getting re-checked after a rebuild (the SCStream auth
+    /// check fails even though CGPreflight reports granted), so re-prompt and open
+    /// the right Settings pane rather than show a cryptic TCC error.
+    private func handleStartFailure(_ error: Error) {
+        let msg = error.localizedDescription.lowercased()
+        let screenIssue = msg.contains("declined") || msg.contains("tcc")
+            || msg.contains("screen") || msg.contains("permission")
+        if screenIssue {
+            _ = CGRequestScreenCaptureAccess()
+            if let url = URL(string: "x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture") {
+                NSWorkspace.shared.open(url)
+            }
+            fail("Screen Recording needs re-approval for this build. Toggle Ten31Transcripts off then on in System Settings ▸ Screen Recording, then restart the app.")
+        } else {
+            fail("Couldn't start recording: \(error.localizedDescription)")
+        }
+    }
+
    private func stop() {
        guard let recorder else { return }
        state = .finishing
@@ -114,20 +229,66 @@ final class SessionController: ObservableObject {
        micLevel = 0
        systemLevel = 0
        warning = result.systemNote.map { "System audio stopped early: \($0)" }
+        transcriptStatus = .idle
        if let folder = currentFolder {
            writeSelfSpans(result, to: folder)
            lastSession = SessionInfo(
                folder: folder, mixedURL: result.mixedURL,
                duration: result.duration, selfSpanCount: result.selfSpans.count)
+            lastProcess = ProcessInputs(
+                folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
+                mixedURL: result.mixedURL, selfSpans: result.selfSpans)
        }
+        let autoSend = settings.autoSendOnStop
        currentFolder = nil
+        autoStarted = false
+        pendingAutoStop = false
        elapsed = 0
        state = .idle
+        if autoSend { processLastSession() }
+    }
+
+    // MARK: - Backend transcription
+
+    /// Send the last finished session to the backend → `speakers.json`. Uses the
+    /// mic-VAD self spans as the timeline for now; visual segments (Phase 3–4) get
+    /// merged in once the adapters land. Safe to call manually ("Send to backend")
+    /// or automatically on stop.
+    func processLastSession() {
+        guard let inputs = lastProcess else { return }
+        if case .processing = transcriptStatus { return }
+        transcriptStatus = .processing(0, 1)
+
+        let settings = self.settings
+        let voiceprints = self.voiceprints
+        processTask = Task {
+            let pipeline = TranscriptPipeline(
+                baseURL: settings.backendBaseURL,
+                skipTLS: settings.skipTLSVerification,
+                voiceprints: voiceprints)
+            let timeline = TranscriptPipeline.timeline(
+                fromSelfSpans: inputs.selfSpans, selfName: settings.selfName)
+            do {
+                let speakers = try await pipeline.process(
+                    sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
+                    mixedURL: inputs.mixedURL, timeline: timeline,
+                    progress: { done, total in
+                        await MainActor.run { self.transcriptStatus = .processing(done, total) }
+                    })
+                self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count)
+            } catch is CancellationError {
+                self.transcriptStatus = .idle
+            } catch {
+                self.transcriptStatus = .failed(error.localizedDescription)
+            }
+        }
    }

    private func fail(_ message: String) {
        recorder = nil
        currentFolder = nil
+        autoStarted = false
+        pendingAutoStop = false
        stopTimer()
        micLevel = 0
        systemLevel = 0
@@ -139,6 +300,9 @@ final class SessionController: ObservableObject {
    /// its WAV headers are finalized before the process exits. Handles quit while
    /// `.starting` and `.finishing`, not just `.recording`.
    func prepareForTermination() async {
+        // Cancel any in-flight backend transcription (audio is already saved; the
+        // user can resend). The pipeline's checkCancellation + defer clean up chunks.
+        processTask?.cancel()
        // Drain whatever lifecycle Task is in flight until nothing is busy. A Stop
        // click landing in an await window can spawn a new stop Task, so loop
        // rather than awaiting a single captured task.
@@ -178,9 +342,9 @@ final class SessionController: ObservableObject {

    // MARK: - Files

-    private func makeSessionFolder() throws -> URL {
+    private func makeSessionFolder(label: String) throws -> URL {
        let base = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true)
-        let folder = base.appendingPathComponent("\(Self.timestamp())_manual", isDirectory: true)
+        let folder = base.appendingPathComponent("\(Self.timestamp())_\(label)", isDirectory: true)
        try FileManager.default.createDirectory(at: folder, withIntermediateDirectories: true)
        return folder
    }
@@ -0,0 +1,85 @@
+import Foundation
+import AVFoundation
+
+/// Splits a long session into backend-sized chunks and produces, per chunk, the
+/// sliced audio and the timeline rebased to chunk-local seconds.
+///
+/// The diarizer caps at 4 speakers/chunk and has request limits, so calls > ~3
+/// min are chunked into ~2–3 min windows; names + voiceprints unify speakers
+/// across chunks (handled in the pipeline).
+enum SessionPackager {
+    struct PlannedChunk: Equatable {
+        let index: Int
+        let start: Double      // global seconds
+        let end: Double
+    }
+
+    /// One chunk if short; otherwise even ~`chunkSeconds` windows.
+    static func planChunks(durationSec: Double,
+                           chunkSeconds: Double = 150,
+                           thresholdSec: Double = 180) -> [PlannedChunk] {
+        guard durationSec > thresholdSec else {
+            return [PlannedChunk(index: 0, start: 0, end: durationSec)]
+        }
+        var chunks: [PlannedChunk] = []
+        var start = 0.0
+        var index = 0
+        while start < durationSec - 0.001 {
+            let end = min(start + chunkSeconds, durationSec)
+            chunks.append(PlannedChunk(index: index, start: start, end: end))
+            start = end
+            index += 1
+        }
+        return chunks
+    }
+
+    /// Clip segments to `[start, end)` and rebase to chunk-local seconds, then
+    /// emit the flat `label-merge` array `[{start,end,name,confidence}]`.
+    static func rebasedTimelineData(_ segments: [VisualTimeline.Segment],
+                                    start: Double, end: Double) throws -> Data {
+        let flat: [[String: Any]] = segments.compactMap { seg in
+            let s = max(seg.start, start)
+            let e = min(seg.end, end)
+            guard e > s else { return nil }
+            return ["start": s - start, "end": e - start, "name": seg.name, "confidence": seg.confidence]
+        }
+        return try JSONSerialization.data(withJSONObject: flat, options: [])
+    }
+
+    /// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
+    static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
+        let input = try AVAudioFile(forReading: source)
+        let sr = input.fileFormat.sampleRate
+        let startFrame = AVAudioFramePosition((startSec * sr).rounded())
+        let endFrame = min(input.length, AVAudioFramePosition((endSec * sr).rounded()))
+        guard endFrame > startFrame else { return }
+
+        let settings: [String: Any] = [
+            AVFormatIDKey: kAudioFormatLinearPCM,
+            AVSampleRateKey: sr,
+            AVNumberOfChannelsKey: 1,
+            AVLinearPCMBitDepthKey: 16,
+            AVLinearPCMIsFloatKey: false,
+            AVLinearPCMIsBigEndianKey: false,
+        ]
+        let output = try AVAudioFile(forWriting: dest, settings: settings,
+                                     commonFormat: .pcmFormatFloat32, interleaved: false)
+        input.framePosition = startFrame
+        var remaining = AVAudioFrameCount(endFrame - startFrame)
+        let block: AVAudioFrameCount = 16_000
+        while remaining > 0 {
+            let n = min(block, remaining)
+            guard let buffer = AVAudioPCMBuffer(pcmFormat: input.processingFormat, frameCapacity: n) else { break }
+            try input.read(into: buffer, frameCount: n)
+            if buffer.frameLength == 0 { break }
+            try output.write(from: buffer)
+            remaining -= buffer.frameLength
+        }
+    }
+
+    /// Duration (seconds) of a WAV.
+    static func duration(of url: URL) -> Double {
+        guard let file = try? AVAudioFile(forReading: url), file.fileFormat.sampleRate > 0 else { return 0 }
+        return Double(file.length) / file.fileFormat.sampleRate
+    }
+}
@@ -0,0 +1,45 @@
+import Foundation
+
+/// `speakers.json` — the final stored output (docs §6): per-chunk `label-merge`
+/// results concatenated, timestamps offset back to global seconds, names unified.
+/// This is the hand-off to the downstream summarizer; the app stops here.
+struct SpeakersFile: Codable {
+    let sessionId: String
+    let app: String
+    let durationSec: Double
+    let speakers: [Speaker]
+    let segments: [Segment]
+    let models: [String: String]
+
+    struct Speaker: Codable, Equatable {
+        let name: String
+        let source: String
+        let overlapConfidence: Double?
+        let matchSimilarity: Double?
+        enum CodingKeys: String, CodingKey {
+            case name, source
+            case overlapConfidence = "overlap_confidence"
+            case matchSimilarity = "match_similarity"
+        }
+    }
+
+    struct Segment: Codable, Equatable {
+        let start: Double
+        let end: Double
+        let speaker: String
+        let text: String?
+    }
+
+    enum CodingKeys: String, CodingKey {
+        case sessionId = "session_id"
+        case app
+        case durationSec = "duration_sec"
+        case speakers, segments, models
+    }
+
+    func write(to url: URL) throws {
+        let encoder = JSONEncoder()
+        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
+        try encoder.encode(self).write(to: url)
+    }
+}
@@ -0,0 +1,78 @@
+import Foundation
+
+/// Concatenates per-chunk `label-merge` results into one global `speakers.json`:
+/// segment times offset back to global seconds, speakers unified across chunks by
+/// name, and fingerprints collected for the voiceprint store.
+enum TranscriptAssembler {
+    struct ChunkResult {
+        let chunkStart: Double           // global seconds
+        let response: LabelMergeResponse
+    }
+
+    struct Assembled {
+        let speakersFile: SpeakersFile
+        let fingerprints: [String: [Float]]   // name -> 192-dim, for VoiceprintStore
+    }
+
+    /// Source ranking when the same name appears across chunks with different sources.
+    private static func rank(_ source: String) -> Int {
+        switch source {
+        case "visual": return 3
+        case "voiceprint": return 2
+        default: return 1            // unmatched
+        }
+    }
+
+    private static func isUnknown(_ name: String) -> Bool {
+        LabelMergeResponse.isUnknownName(name)
+    }
+
+    static func assemble(sessionId: String, app: String, chunks: [ChunkResult]) -> Assembled {
+        var segments: [SpeakersFile.Segment] = []
+        var bestSpeaker: [String: SpeakersFile.Speaker] = [:]
+        var fingerprints: [String: [Float]] = [:]
+        var models: [String: String] = [:]
+        var duration = 0.0
+
+        for chunk in chunks {
+            let offset = chunk.chunkStart
+            // Audio length from the chunk window, so silent/all-unknown calls still
+            // report a real duration (not just the last segment's end).
+            duration = max(duration, offset + chunk.response.duration)
+
+            for seg in chunk.response.segments {
+                let start = seg.startSeconds + offset
+                let end = seg.endSeconds + offset
+                segments.append(.init(start: start, end: end, speaker: seg.speaker, text: seg.text))
+                duration = max(duration, end)
+            }
+
+            for sp in chunk.response.speakers {
+                let candidate = SpeakersFile.Speaker(
+                    name: sp.name, source: sp.source,
+                    overlapConfidence: sp.overlapConfidence, matchSimilarity: sp.matchSimilarity)
+                if let existing = bestSpeaker[sp.name] {
+                    if rank(sp.source) > rank(existing.source) { bestSpeaker[sp.name] = candidate }
+                } else {
+                    bestSpeaker[sp.name] = candidate
+                }
+                // Collect named fingerprints only (never Unknown_N / Speaker_unknown).
+                if !isUnknown(sp.name), let fp = sp.fingerprint, fp.count > 0 {
+                    fingerprints[sp.name] = fp
+                }
+            }
+            for (name, fp) in chunk.response.fingerprints where !isUnknown(name) && fp.count > 0 {
+                fingerprints[name] = fp
+            }
+        }
+
+        segments.sort { $0.start < $1.start }
+        let speakers = bestSpeaker.values.sorted { $0.name < $1.name }
+        models = chunks.last?.response.models ?? [:]
+
+        let file = SpeakersFile(
+            sessionId: sessionId, app: app, durationSec: duration,
+            speakers: speakers, segments: segments, models: models)
+        return Assembled(speakersFile: file, fingerprints: fingerprints)
+    }
+}
@@ -0,0 +1,75 @@
+import Foundation
+
+/// Drives a finished session through the backend: chunk → sequential
+/// `label-merge` (accumulating voiceprints) → assemble `speakers.json` → persist
+/// fingerprints. Requests are sequential by construction (one chunk at a time).
+final class TranscriptPipeline {
+    private let client: SparkControlClient
+    private let voiceprints: VoiceprintStore
+
+    init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
+        self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
+        self.voiceprints = voiceprints
+    }
+
+    /// Process `mixedURL` against `timeline` (visual + self spans). Writes
+    /// `speakers.json` into `sessionFolder` and returns it. `progress(done,total)`
+    /// is called per chunk.
+    func process(sessionFolder: URL,
+                 sessionId: String,
+                 app: String,
+                 mixedURL: URL,
+                 timeline: [VisualTimeline.Segment],
+                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
+        let duration = SessionPackager.duration(of: mixedURL)
+        let plan = SessionPackager.planChunks(durationSec: duration)
+
+        // Zero-duration / empty session → a valid empty speakers.json, no backend call.
+        if plan.isEmpty || duration <= 0 {
+            let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
+            try empty.speakersFile.write(to: sessionFolder.appendingPathComponent("speakers.json"))
+            await progress?(0, 0)
+            return empty.speakersFile
+        }
+
+        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
+        try? FileManager.default.createDirectory(at: chunksDir, withIntermediateDirectories: true)
+        defer { try? FileManager.default.removeItem(at: chunksDir) }   // cleanup on success OR throw
+
+        // Start from stored voiceprints; accumulate this call's prints across chunks
+        // for within-call unification (the store only persists high-confidence ones).
+        var known = voiceprints.knownVoiceprints()
+        var results: [TranscriptAssembler.ChunkResult] = []
+
+        for chunk in plan {
+            try Task.checkCancellation()
+            await progress?(chunk.index, plan.count)
+            let chunkURL = chunksDir.appendingPathComponent("chunk_\(String(format: "%03d", chunk.index)).wav")
+            try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
+            guard FileManager.default.fileExists(atPath: chunkURL.path) else { continue }  // empty slice → skip
+
+            let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
+            let response = try await client.labelMerge(
+                audioURL: chunkURL, timeline: timelineData,
+                knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
+
+            for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
+                known[name] = fp
+            }
+            voiceprints.update(with: response)
+            results.append(.init(chunkStart: chunk.start, response: response))
+            try? FileManager.default.removeItem(at: chunkURL)
+        }
+        await progress?(plan.count, plan.count)
+
+        let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
+        try assembled.speakersFile.write(to: sessionFolder.appendingPathComponent("speakers.json"))
+        return assembled.speakersFile
+    }
+
+    /// Build the `label-merge` timeline from mic-VAD self spans (Phase 1/2). Once
+    /// the visual adapters land (Phase 3–4), their segments are merged in too.
+    static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
+        spans.map { .init(start: $0.start, end: $0.end, name: selfName, confidence: $0.confidence, source: "mic_vad") }
+    }
+}