Phases 2-6: detection, visual timeline, backend hand-off, voiceprints

Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
2026-06-06 00:15:49 -05:00
parent fd7e1a5907
commit 863136aeec
27 changed files with 2108 additions and 22 deletions
@@ -0,0 +1,31 @@
+import Foundation
+import CoreVideo
+
+/// Adapter for Signal Desktop. Signal draws a coloured ring around the active
+/// speaker's avatar/initials; names may also be reachable through the Electron
+/// Accessibility tree (preferred over OCR once we enable it). The geometry and
+/// threshold values here are first-pass and will be calibrated against real
+/// Signal screenshots.
+struct SignalAdapter: AppAdapter {
+    static let bundleIDs = ["org.whispersystems.signal-desktop"]
+    let adapterVersion = "signal-0.1.0"
+    let preferredFPS = 3
+
+    /// Shared grid analyzer, configured once with Signal's tile geometry.
+    private let analyzer: GridCallAnalyzer
+
+    init() {
+        // Signal tiles are squarish with initials centred; tune with fixtures.
+        var tuning = GridCallAnalyzer.Config()
+        tuning.tileExpandX = 1.6
+        tuning.tileExpandY = 1.8
+        analyzer = GridCallAnalyzer(config: tuning)
+    }
+
+    /// Analyzes one captured frame by forwarding it to the grid analyzer.
+    func analyze(frame: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
+        let observations = analyzer.analyze(pixelBuffer: frame, at: t)
+        return observations
+    }
+
+    /// `CGImage` entry point, exposed for fixture/synthetic tests.
+    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
+        let observations = analyzer.analyze(cgImage: cgImage, at: t)
+        return observations
+    }
+}
@@ -156,31 +156,44 @@ final class AudioRecorder: NSObject, SCStreamDelegate, SCStreamOutput {

    // MARK: - Ingest (ioQueue only)

+    /// Write audio CONTINUOUSLY; re-anchor to the timestamp only when drift is a
+    /// real gap (> ~100 ms), not per-buffer timestamp jitter. Correcting every
+    /// buffer injects/strips a few samples each time → audible rhythmic glitching.
+    /// The shared t0 still bounds mic/system skew to the tolerance, well within
+    /// what the backend merge needs.
+    private static let driftTolerance: Int64 = 1600   // 100 ms @ 16 kHz
+
    private func ingestMic(_ buffer: AVAudioPCMBuffer, startHost: Double) {
        guard !tornDown, let writer = micWriter, let vad else { return }
-        let expected = max(0, Int64(((startHost - t0Host) * 16_000).rounded()))
-        if expected > writer.framesWritten {
-            let padded = writer.padSilence(expected - writer.framesWritten)
+        let drift = max(0, Int64(((startHost - t0Host) * 16_000).rounded())) - writer.framesWritten   // expected frame position (from host time, 16 kHz) minus frames already written
+        var chunk: AVAudioPCMBuffer? = buffer
+        if drift > Self.driftTolerance {                  // timestamp is ahead of the file by more than the tolerance: a real gap → pad silence to realign
+            let padded = writer.padSilence(drift)
            if padded > 0 { vad.feedSilence(padded) }
+        } else if drift < -Self.driftTolerance {          // file is ahead of the timestamp by more than the tolerance → trim the overlapping front
+            let trim = Int(-drift)
+            if trim >= Int(buffer.frameLength) { return }   // the whole buffer is overlap → drop it
+            chunk = Self.trimFront(buffer, by: trim)
        }
-        let startIdx = max(0, Int(writer.framesWritten - expected))
-        if startIdx >= Int(buffer.frameLength) { return }
-        guard let chunk = Self.trimFront(buffer, by: startIdx) else { return }
-        updateLevel(chunk, isMic: true)
-        if writer.write(chunk) > 0 { vad.feed(chunk) }
+        guard let out = chunk else { return }
+        updateLevel(out, isMic: true)
+        if writer.write(out) > 0 { vad.feed(out) }   // the VAD is fed only when the writer reports frames written
    }

    private func ingestSystem(_ buffer: AVAudioPCMBuffer, startHost: Double) {
        guard !tornDown, let writer = systemWriter else { return }
-        let expected = max(0, Int64(((startHost - t0Host) * 16_000).rounded()))
-        if expected > writer.framesWritten {
-            writer.padSilence(expected - writer.framesWritten)
+        let drift = max(0, Int64(((startHost - t0Host) * 16_000).rounded())) - writer.framesWritten   // expected frame position (from host time, 16 kHz) minus frames already written
+        var chunk: AVAudioPCMBuffer? = buffer
+        if drift > Self.driftTolerance {   // timestamp ahead of the file beyond the tolerance → pad silence to realign
+            writer.padSilence(drift)
+        } else if drift < -Self.driftTolerance {   // file ahead of the timestamp beyond the tolerance → trim the overlapping front
+            let trim = Int(-drift)
+            if trim >= Int(buffer.frameLength) { return }   // the whole buffer is overlap → drop it
+            chunk = Self.trimFront(buffer, by: trim)
        }
-        let startIdx = max(0, Int(writer.framesWritten - expected))
-        if startIdx >= Int(buffer.frameLength) { return }
-        guard let chunk = Self.trimFront(buffer, by: startIdx) else { return }
-        updateLevel(chunk, isMic: false)
-        writer.write(chunk)
+        guard let out = chunk else { return }
+        updateLevel(out, isMic: false)
+        writer.write(out)
    }

    // MARK: - Mic (AVAudioEngine)
@@ -24,7 +24,12 @@ final class Resampler {
        guard !ended, input.frameLength > 0 else { return nil }

        if converter == nil || sourceFormat != input.format {
-            converter = AVAudioConverter(from: input.format, to: Self.targetFormat)
+            let c = AVAudioConverter(from: input.format, to: Self.targetFormat)
+            // Highest-quality sample-rate conversion: best anti-aliasing on the
+            // 48k→16k downsample, which avoids harsh artifacts on loud/bright speech.
+            c?.sampleRateConverterQuality = .max
+            c?.sampleRateConverterAlgorithm = AVSampleRateConverterAlgorithm_Mastering
+            converter = c
            sourceFormat = input.format
        }
        guard let converter else { return nil }
@@ -0,0 +1,179 @@
+import Foundation
+
+/// Decoded body of a `POST /api/audio/label-merge` response (verified against
+/// the live backend). Segments come in two shapes: `transcribe=true` yields
+/// `start_ms`/`end_ms` + `text`, `transcribe=false` yields `start_s`/`end_s` +
+/// `confidence`; both are accepted.
+struct LabelMergeResponse: Decodable {
+    let duration: Double
+    let speakers: [Speaker]
+    let segments: [Segment]
+    let fingerprints: [String: [Float]]
+    let models: [String: String]?
+
+    /// True for the backend's "unmatched" labels, which are never persisted as
+    /// a named voiceprint.
+    static func isUnknownName(_ name: String) -> Bool {
+        if name == "Speaker_unknown" { return true }
+        return name.hasPrefix("Unknown_")
+    }
+
+    /// One diarized speaker cluster and how the backend resolved its name.
+    struct Speaker: Decodable {
+        let cluster: String
+        let name: String
+        let source: String                 // visual | voiceprint | unmatched
+        let overlapConfidence: Double?
+        let matchSimilarity: Double?
+        let fingerprint: [Float]?
+
+        enum CodingKeys: String, CodingKey {
+            case cluster
+            case name
+            case source
+            case overlapConfidence = "overlap_confidence"
+            case matchSimilarity = "match_similarity"
+            case fingerprint
+        }
+    }
+
+    /// One labelled span; carries millisecond or second bounds depending on the
+    /// response shape.
+    struct Segment: Decodable {
+        let startMs: Int?
+        let endMs: Int?
+        let startS: Double?
+        let endS: Double?
+        let speaker: String
+        let text: String?
+        let confidence: Double?
+
+        enum CodingKeys: String, CodingKey {
+            case startMs = "start_ms"
+            case endMs = "end_ms"
+            case startS = "start_s"
+            case endS = "end_s"
+            case speaker
+            case text
+            case confidence
+        }
+
+        /// Start time in seconds regardless of which shape the backend used
+        /// (seconds field first, then milliseconds, else 0).
+        var startSeconds: Double {
+            if let seconds = startS { return seconds }
+            if let millis = startMs { return Double(millis) / 1000 }
+            return 0
+        }
+
+        /// End time in seconds, resolved the same way as `startSeconds`.
+        var endSeconds: Double {
+            if let seconds = endS { return seconds }
+            if let millis = endMs { return Double(millis) / 1000 }
+            return 0
+        }
+    }
+}
+
+/// Failures surfaced by `SparkControlClient`, each with a user-facing message.
+enum SparkControlError: Error, LocalizedError {
+    case invalidHost
+    case tooLarge                       // 413
+    case server(Int, String)            // other non-2xx with {"detail":...}
+    case decode(String)
+    case retriesExhausted
+
+    var errorDescription: String? {
+        let message: String
+        switch self {
+        case .invalidHost:
+            message = "Invalid backend host URL."
+        case .tooLarge:
+            message = "Audio chunk exceeds the backend's 200 MB limit."
+        case let .server(status, body):
+            message = "Backend error \(status): \(body)"
+        case let .decode(reason):
+            message = "Couldn't decode backend response: \(reason)"
+        case .retriesExhausted:
+            message = "Backend stayed busy (503) after retries."
+        }
+        return message
+    }
+}
+
+/// Talks to SparkControl's `label-merge`. **Callers must invoke sequentially**
+/// (one audio request in flight) — concurrent audio requests trip a GPU race
+/// (503). The Phase-5 pipeline drives one chunk at a time, satisfying this.
+final class SparkControlClient {
+    /// Wait used when a 503 carries no usable `Retry-After` value.
+    private static let defaultRetryDelay: Double = 5
+    /// Upper bound on a single retry wait, so a bogus header can neither stall
+    /// the pipeline nor overflow the nanosecond conversion.
+    private static let maxRetryDelay: Double = 120
+
+    private let baseURL: String
+    private let urlSession: URLSession
+
+    /// - Parameters:
+    ///   - baseURL: Backend root; surrounding whitespace and one trailing `/` are dropped.
+    ///   - skipTLS: When true the session is given an `InsecureTrustDelegate`.
+    init(baseURL: String, skipTLS: Bool) {
+        let trimmed = baseURL.trimmingCharacters(in: .whitespacesAndNewlines)
+        self.baseURL = trimmed.hasSuffix("/") ? String(trimmed.dropLast()) : trimmed
+        let config = URLSessionConfiguration.ephemeral
+        config.timeoutIntervalForRequest = 600     // diarization can take up to ~600s
+        config.timeoutIntervalForResource = 900
+        config.waitsForConnectivity = false
+        let delegate: URLSessionDelegate? = skipTLS ? InsecureTrustDelegate() : nil
+        self.urlSession = URLSession(configuration: config, delegate: delegate, delegateQueue: nil)
+    }
+
+    deinit { urlSession.finishTasksAndInvalidate() }
+
+    /// One `label-merge` call. `timeline` is the flat `[{start,end,name,confidence}]`
+    /// JSON (chunk-local seconds). Retries on `503`, waiting for the (bounded)
+    /// `Retry-After` interval between attempts.
+    ///
+    /// - Throws: `SparkControlError.invalidHost` if the endpoint URL can't be built,
+    ///   `.tooLarge` on 413, `.retriesExhausted` after more than `maxRetries` 503s,
+    ///   `.server` on any other non-2xx status, `.decode` if the body can't be
+    ///   decoded; file-read and transport errors are rethrown as-is.
+    func labelMerge(audioURL: URL,
+                    timeline: Data,
+                    knownVoiceprints: [String: [Float]]?,
+                    transcribe: Bool,
+                    minOverlap: Double? = nil,
+                    voiceprintThreshold: Double? = nil,
+                    maxRetries: Int = 3) async throws -> LabelMergeResponse {
+        guard let url = URL(string: baseURL + "/api/audio/label-merge") else {
+            throw SparkControlError.invalidHost
+        }
+
+        var fields: [String: String] = ["transcribe": transcribe ? "true" : "false"]
+        if let timelineString = String(data: timeline, encoding: .utf8) {
+            fields["timeline"] = timelineString
+        }
+        if let known = knownVoiceprints, !known.isEmpty {
+            let payload = known.mapValues { $0.map { Double($0) } }
+            // `data(withJSONObject:)` raises an Objective-C exception (not a Swift
+            // error) on NaN/∞, which `try?` cannot catch — validate first.
+            if JSONSerialization.isValidJSONObject(payload),
+               let data = try? JSONSerialization.data(withJSONObject: payload),
+               let str = String(data: data, encoding: .utf8) {
+                fields["known_voiceprints"] = str
+            }
+        }
+        if let minOverlap { fields["min_overlap"] = String(minOverlap) }
+        if let voiceprintThreshold { fields["voiceprint_threshold"] = String(voiceprintThreshold) }
+
+        let audio = try Data(contentsOf: audioURL)
+        // Body doesn't change between retries — build it once.
+        let (body, contentType) = Self.multipart(fields: fields, fileField: "file",
+                                                 filename: audioURL.lastPathComponent, fileData: audio)
+
+        var attempt = 0
+        while true {
+            var request = URLRequest(url: url)
+            request.httpMethod = "POST"
+            request.setValue(contentType, forHTTPHeaderField: "Content-Type")
+            request.httpBody = body
+
+            let (data, response) = try await urlSession.data(for: request)
+            guard let http = response as? HTTPURLResponse else {
+                throw SparkControlError.decode("no HTTP response")
+            }
+
+            switch http.statusCode {
+            case 200..<300:
+                do {
+                    return try JSONDecoder().decode(LabelMergeResponse.self, from: data)
+                } catch {
+                    throw SparkControlError.decode(error.localizedDescription)
+                }
+            case 503:
+                attempt += 1
+                if attempt > maxRetries { throw SparkControlError.retriesExhausted }
+                let delay = Self.retryDelaySeconds(fromHeader: http.value(forHTTPHeaderField: "Retry-After"))
+                // `delay` is finite and bounded, so the UInt64 conversion cannot trap.
+                try await Task.sleep(nanoseconds: UInt64(delay * 1_000_000_000))
+            case 413:
+                throw SparkControlError.tooLarge
+            default:
+                throw SparkControlError.server(http.statusCode, Self.detail(from: data))
+            }
+        }
+    }
+
+    // MARK: - Helpers
+
+    /// Converts a `Retry-After` header value into a bounded wait in seconds.
+    /// Missing, non-numeric (e.g. an HTTP-date) or non-finite values fall back to
+    /// `defaultRetryDelay`; numeric values are clamped to `1...maxRetryDelay`.
+    private static func retryDelaySeconds(fromHeader header: String?) -> Double {
+        guard let raw = header?.trimmingCharacters(in: .whitespaces),
+              let seconds = Double(raw), seconds.isFinite else { return defaultRetryDelay }
+        return min(max(1, seconds), maxRetryDelay)
+    }
+
+    /// Extracts the `"detail"` string from a JSON error body, else the raw body text.
+    private static func detail(from data: Data) -> String {
+        if let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
+           let detail = obj["detail"] as? String { return detail }
+        return String(data: data, encoding: .utf8) ?? "unknown error"
+    }
+
+    /// Builds a `multipart/form-data` body: every text field, then the file part.
+    /// - Returns: The body and the matching `Content-Type` header value.
+    private static func multipart(fields: [String: String], fileField: String,
+                                  filename: String, fileData: Data) -> (Data, String) {
+        let boundary = "Boundary-\(UUID().uuidString)"
+        var body = Data()
+        func append(_ s: String) { body.append(Data(s.utf8)) }
+
+        // A quote or CR/LF in the file name would terminate the quoted header
+        // value early and corrupt the part headers — replace them.
+        let safeFilename = filename
+            .components(separatedBy: CharacterSet(charactersIn: "\"\r\n"))
+            .joined(separator: "_")
+
+        for (name, value) in fields {
+            append("--\(boundary)\r\n")
+            append("Content-Disposition: form-data; name=\"\(name)\"\r\n\r\n")
+            append("\(value)\r\n")
+        }
+        append("--\(boundary)\r\n")
+        append("Content-Disposition: form-data; name=\"\(fileField)\"; filename=\"\(safeFilename)\"\r\n")
+        append("Content-Type: audio/wav\r\n\r\n")
+        body.append(fileData)
+        append("\r\n--\(boundary)--\r\n")
+        return (body, "multipart/form-data; boundary=\(boundary)")
+    }
+}
@@ -0,0 +1,104 @@
+import Foundation
+
+/// Local persistence of named voiceprints — the compounding-identity layer.
+///
+/// File `~/Ten31Transcripts/voiceprints.json`:
+///   `{ "<name>": { "vector": [192 floats], "updated": <iso>, "calls": <int> } }`
+///
+/// On send → `knownVoiceprints()` feeds `label-merge`. On response → `update(with:)`
+/// stores/refreshes vectors for speakers resolved by **visual** (overlap ≥ ~0.8)
+/// or **voiceprint** match. Never stores `Unknown_N` / `Speaker_unknown`, and
+/// never stores a vector containing NaN/∞ (it could not be encoded to JSON).
+///
+/// Thread-safe (lock-guarded); the sequential pipeline is the only writer.
+final class VoiceprintStore {
+    /// One persisted voiceprint.
+    struct Entry: Codable, Equatable {
+        var vector: [Float]
+        var updated: String     // ISO-8601 timestamp of the last refresh
+        var calls: Int          // number of responses that refreshed this entry
+    }
+
+    private let url: URL
+    private let minOverlapToStore: Double
+    private let lock = NSLock()
+    private var entriesStore: [String: Entry] = [:]
+
+    /// - Parameters:
+    ///   - fileURL: Location of the JSON file; loaded immediately if readable.
+    ///   - minOverlapToStore: Minimum `overlap_confidence` for a visually-named
+    ///     speaker to be persisted.
+    init(fileURL: URL, minOverlapToStore: Double = 0.8) {
+        self.url = fileURL
+        self.minOverlapToStore = minOverlapToStore
+        load()
+    }
+
+    /// Snapshot of all stored entries.
+    var entries: [String: Entry] {
+        lock.lock(); defer { lock.unlock() }
+        return entriesStore
+    }
+
+    /// Vectors keyed by name, for the `known_voiceprints` field.
+    func knownVoiceprints() -> [String: [Float]] {
+        lock.lock(); defer { lock.unlock() }
+        return entriesStore.mapValues { $0.vector }
+    }
+
+    /// Persist fingerprints from a `label-merge` response for confidently-named
+    /// speakers only.
+    func update(with response: LabelMergeResponse) {
+        lock.lock(); defer { lock.unlock() }
+        let now = ISO8601DateFormatter().string(from: Date())
+        for sp in response.speakers {
+            guard !Self.isUnknown(sp.name) else { continue }
+            let acceptable: Bool
+            switch sp.source {
+            case "visual":     acceptable = (sp.overlapConfidence ?? 0) >= minOverlapToStore
+            case "voiceprint": acceptable = true            // already matched a known print
+            default:           acceptable = false           // unmatched
+            }
+            // A single NaN/∞ would make JSONEncoder throw in save(), silently
+            // blocking persistence of the whole store — reject such vectors here.
+            guard acceptable, let vector = sp.fingerprint ?? response.fingerprints[sp.name],
+                  !vector.isEmpty, vector.allSatisfy({ $0.isFinite }) else { continue }
+            var entry = entriesStore[sp.name] ?? Entry(vector: vector, updated: now, calls: 0)
+            entry.vector = vector
+            entry.updated = now
+            entry.calls += 1
+            entriesStore[sp.name] = entry
+        }
+        save()
+    }
+
+    /// Moves the entry stored under `old` to `new` (replacing any entry already
+    /// stored under `new`). No-op if `old` is not present.
+    func rename(_ old: String, to new: String) {
+        lock.lock(); defer { lock.unlock() }
+        guard let e = entriesStore.removeValue(forKey: old) else { return }
+        entriesStore[new] = e
+        save()
+    }
+
+    /// Deletes the entry for `name`, if any, and rewrites the file.
+    func remove(_ name: String) {
+        lock.lock(); defer { lock.unlock() }
+        entriesStore.removeValue(forKey: name)
+        save()
+    }
+
+    /// Deletes every entry and rewrites the file.
+    func reset() {
+        lock.lock(); defer { lock.unlock() }
+        entriesStore = [:]
+        save()
+    }
+
+    // MARK: - Persistence (call with lock held)
+
+    /// Best-effort load; a missing or undecodable file leaves the store empty.
+    private func load() {
+        guard let data = try? Data(contentsOf: url),
+              let decoded = try? JSONDecoder().decode([String: Entry].self, from: data) else { return }
+        entriesStore = decoded
+    }
+
+    /// Best-effort save. Written atomically so an interrupted write can't leave
+    /// a truncated `voiceprints.json` behind.
+    private func save() {
+        let encoder = JSONEncoder()
+        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
+        try? FileManager.default.createDirectory(at: url.deletingLastPathComponent(),
+                                                 withIntermediateDirectories: true)
+        if let data = try? encoder.encode(entriesStore) { try? data.write(to: url, options: .atomic) }
+    }
+
+    private static func isUnknown(_ name: String) -> Bool {
+        LabelMergeResponse.isUnknownName(name)
+    }
+}
@@ -0,0 +1,61 @@
+import CoreAudio
+import Foundation
+
+/// Lists the PIDs of processes currently using an audio **input** (the mic), via
+/// the CoreAudio process-object API (macOS 14+).
+///
+/// This is how we attribute mic usage to a *specific* app — e.g. "is Signal in a
+/// call?" — which is far more robust than matching window titles, and it works
+/// uniformly for Zoom/Teams/Signal and browser calls (Meet). It also lets us
+/// ignore our own recording: we look at the *call app's* PID, not the global mic,
+/// so a call's end is detected even while we keep the mic open.
+///
+/// Approach mirrors fastrepl/anarlog's `list_mic_using_apps`.
+@available(macOS 14.0, *)
+enum AudioInputProcesses {
+    /// PIDs of every audio process object that reports running input.
+    /// Returns an empty set if any CoreAudio query fails.
+    static func micUsingPIDs() -> Set<pid_t> {
+        var listAddr = AudioObjectPropertyAddress(
+            mSelector: kAudioHardwarePropertyProcessObjectList,
+            mScope: kAudioObjectPropertyScopeGlobal,
+            mElement: kAudioObjectPropertyElementMain)
+
+        var dataSize: UInt32 = 0
+        guard AudioObjectGetPropertyDataSize(
+            AudioObjectID(kAudioObjectSystemObject), &listAddr, 0, nil, &dataSize) == noErr,
+            dataSize > 0 else { return [] }
+
+        let stride = MemoryLayout<AudioObjectID>.size
+        let count = Int(dataSize) / stride
+        var processes = [AudioObjectID](repeating: 0, count: count)
+        guard AudioObjectGetPropertyData(
+            AudioObjectID(kAudioObjectSystemObject), &listAddr, 0, nil, &dataSize, &processes) == noErr
+        else { return [] }
+
+        // `dataSize` now holds the bytes actually written. If the process list
+        // shrank between the two calls, the tail of `processes` is still the
+        // zero fill, so only look at the entries that were really returned.
+        let returned = min(count, Int(dataSize) / stride)
+
+        var pids = Set<pid_t>()
+        for process in processes.prefix(returned) where isRunningInput(process) {
+            if let pid = pid(of: process) { pids.insert(pid) }
+        }
+        return pids
+    }
+
+    /// True when the process object reports a non-zero `IsRunningInput`;
+    /// a failed query counts as false.
+    private static func isRunningInput(_ process: AudioObjectID) -> Bool {
+        var addr = AudioObjectPropertyAddress(
+            mSelector: kAudioProcessPropertyIsRunningInput,
+            mScope: kAudioObjectPropertyScopeGlobal,
+            mElement: kAudioObjectPropertyElementMain)
+        var value: UInt32 = 0
+        var size = UInt32(MemoryLayout<UInt32>.size)
+        guard AudioObjectGetPropertyData(process, &addr, 0, nil, &size, &value) == noErr else { return false }
+        return value != 0
+    }
+
+    /// PID behind a process object, or nil if the query fails.
+    private static func pid(of process: AudioObjectID) -> pid_t? {
+        var addr = AudioObjectPropertyAddress(
+            mSelector: kAudioProcessPropertyPID,
+            mScope: kAudioObjectPropertyScopeGlobal,
+            mElement: kAudioObjectPropertyElementMain)
+        var value: pid_t = 0
+        var size = UInt32(MemoryLayout<pid_t>.size)
+        guard AudioObjectGetPropertyData(process, &addr, 0, nil, &size, &value) == noErr else { return nil }
+        return value
+    }
+}
@@ -0,0 +1,226 @@
+import AppKit
+import CoreGraphics
+import Combine
+
+/// Detects when the user joins/leaves a call and reports it via callbacks.
+///
+/// Heuristic: the mic is live system-wide (`MicActivityMonitor`) AND
+/// `detectApp()` identifies a known call app. On macOS 14+ that means a
+/// Zoom/Teams/Signal process (by bundle ID) is itself using the mic, or a browser
+/// is using the mic and owns a window whose title looks like a Meet call. On
+/// older systems it falls back to call-window titles alone. Window titles are
+/// read via `CGWindowList`, using the Screen Recording permission. Both start
+/// and stop are debounced so a quick unrelated mic use doesn't trigger.
+///
+/// Main-actor: all evaluation runs on the main thread.
+@MainActor
+final class CallDetector: ObservableObject {
+
+    enum DetectedApp: String, Equatable {
+        case zoom, teams, signal, meet
+        var label: String { rawValue }
+        var display: String {
+            switch self {
+            case .zoom:   return "Zoom"
+            case .teams:  return "Microsoft Teams"
+            case .signal: return "Signal"
+            case .meet:   return "Google Meet"
+            }
+        }
+    }
+
+    enum Status: Equatable {
+        case disabled
+        case listening
+        case inCall(DetectedApp)
+    }
+
+    @Published private(set) var status: Status = .disabled
+
+    /// Invoked once the open debounce confirms a call has started.
+    var onCallStart: ((DetectedApp) -> Void)?
+    /// Invoked once the close debounce confirms the call has ended.
+    var onCallEnd: (() -> Void)?
+
+    private let mic = MicActivityMonitor()
+    private var pollTimer: Timer?
+    private var openTimer: Timer?
+    private var closeTimer: Timer?
+    private var inCall = false
+    private var currentApp: DetectedApp?
+    private var enabled = false
+
+    private let openDelay: TimeInterval = 2.0
+    private let closeDelay: TimeInterval = 4.0
+    private let pollInterval: TimeInterval = 3.0
+
+    private static let nativeApps: [(id: String, app: DetectedApp)] = [
+        ("us.zoom.xos", .zoom),
+        ("com.microsoft.teams2", .teams),
+        ("com.microsoft.teams", .teams),
+        ("org.whispersystems.signal-desktop", .signal),
+    ]
+    private static let browserIDs: Set<String> = [
+        "org.mozilla.firefox", "com.google.Chrome", "com.apple.Safari",
+        "company.thebrowser.Browser", "com.brave.Browser", "com.microsoft.edgemac",
+    ]
+
+    /// Starts mic monitoring and periodic polling. No-op if already enabled.
+    func enable() {
+        guard !enabled else { return }
+        enabled = true
+        mic.onChange = { [weak self] _ in self?.evaluate() }
+        mic.start()
+        status = .listening
+        pollTimer = Timer.scheduledTimer(withTimeInterval: pollInterval, repeats: true) { [weak self] _ in
+            Task { @MainActor in self?.evaluate() }
+        }
+        evaluate()
+    }
+
+    /// Stops monitoring and clears all state. Does NOT fire `onCallEnd`.
+    func disable() {
+        guard enabled else { return }
+        enabled = false
+        mic.stop()
+        pollTimer?.invalidate(); pollTimer = nil
+        cancelOpen(); cancelClose()
+        inCall = false
+        currentApp = nil
+        status = .disabled
+    }
+
+    // MARK: - Evaluation
+
+    /// Re-reads the current signal and arms/cancels the open/close debounce timers.
+    private func evaluate() {
+        guard enabled else { return }
+        let candidate = mic.isRunning ? detectApp() : nil
+
+        if let candidate {
+            cancelClose()
+            if inCall {
+                currentApp = candidate
+                status = .inCall(candidate)
+            } else if openTimer == nil {
+                openTimer = Timer.scheduledTimer(withTimeInterval: openDelay, repeats: false) { [weak self] _ in
+                    Task { @MainActor in self?.fireOpen() }
+                }
+            }
+        } else {
+            cancelOpen()
+            if inCall && closeTimer == nil {
+                closeTimer = Timer.scheduledTimer(withTimeInterval: closeDelay, repeats: false) { [weak self] _ in
+                    Task { @MainActor in self?.fireClose() }
+                }
+            }
+        }
+    }
+
+    private func fireOpen() {
+        openTimer = nil
+        // Re-resolve the app at fire time (the debounce window may have changed it).
+        guard enabled, mic.isRunning, let app = detectApp(), !inCall else { return }
+        inCall = true
+        currentApp = app
+        status = .inCall(app)
+        onCallStart?(app)
+    }
+
+    private func fireClose() {
+        closeTimer = nil
+        guard enabled, inCall else { return }
+        // Re-check at fire time, mirroring fireOpen(). This runs from a Task
+        // queued by the timer, so an evaluate() that saw the call come back can
+        // run in between; its cancelClose() can't stop a timer that has already
+        // fired. Without this check the call would be ended while still live.
+        if mic.isRunning, let app = detectApp() {
+            currentApp = app
+            status = .inCall(app)
+            return
+        }
+        inCall = false
+        currentApp = nil
+        status = .listening
+        onCallEnd?()
+    }
+
+    private func cancelOpen() { openTimer?.invalidate(); openTimer = nil }
+    private func cancelClose() { closeTimer?.invalidate(); closeTimer = nil }
+
+    // MARK: - App detection
+
+    /// A call is active when a known call app is actually using the mic.
+    /// On macOS 14+ we attribute mic usage per-process (robust start AND stop,
+    /// works for Signal/Zoom/Teams/Meet, ignores our own recording). On macOS 13
+    /// we fall back to the per-app call-window heuristic.
+    private func detectApp() -> DetectedApp? {
+        if #available(macOS 14.0, *) {
+            return detectViaMicAttribution()
+        }
+        return detectViaWindowTitle()
+    }
+
+    @available(macOS 14.0, *)
+    private func detectViaMicAttribution() -> DetectedApp? {
+        let micPIDs = AudioInputProcesses.micUsingPIDs()
+        guard !micPIDs.isEmpty else { return nil }
+        let selfPID = NSRunningApplication.current.processIdentifier
+
+        for app in NSWorkspace.shared.runningApplications {
+            let pid = app.processIdentifier
+            guard pid != selfPID, micPIDs.contains(pid), let id = app.bundleIdentifier else { continue }
+            if let native = Self.nativeApps.first(where: { $0.id == id }) {
+                return native.app          // Signal/Zoom/Teams using the mic = in a call
+            }
+            // A browser using the mic + a Meet window = a Meet call. The mic state
+            // gives reliable start/stop; the window check keeps non-Meet browser
+            // mic use (other web apps) from being mislabeled as a Meet recording.
+            if Self.browserIDs.contains(id), pidHasMeetWindow(pid) {
+                return .meet
+            }
+        }
+        return nil
+    }
+
+    /// True when `pid` owns at least one window whose title looks like a Meet call.
+    private func pidHasMeetWindow(_ pid: pid_t) -> Bool {
+        guard let info = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]]
+        else { return false }
+        for w in info {
+            guard let wpid = w[kCGWindowOwnerPID as String] as? pid_t, wpid == pid,
+                  let title = w[kCGWindowName as String] as? String else { continue }
+            if Self.looksLikeMeet(title) { return true }
+        }
+        return false
+    }
+
+    /// macOS 13 fallback: detect by the presence of a call WINDOW per app.
+    private func detectViaWindowTitle() -> DetectedApp? {
+        var pidToApp: [pid_t: DetectedApp] = [:]
+        var browserPIDs = Set<pid_t>()
+        for app in NSWorkspace.shared.runningApplications {
+            guard let id = app.bundleIdentifier else { continue }
+            if let native = Self.nativeApps.first(where: { $0.id == id }) {
+                pidToApp[app.processIdentifier] = native.app
+            } else if Self.browserIDs.contains(id) {
+                browserPIDs.insert(app.processIdentifier)
+            }
+        }
+        guard !pidToApp.isEmpty || !browserPIDs.isEmpty else { return nil }
+        guard let infoList = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]] else {
+            return nil
+        }
+        for info in infoList {
+            guard let pid = info[kCGWindowOwnerPID as String] as? pid_t,
+                  let title = info[kCGWindowName as String] as? String,
+                  !title.isEmpty else { continue }
+            if browserPIDs.contains(pid), Self.looksLikeMeet(title) { return .meet }
+            if let app = pidToApp[pid], Self.isCallWindow(app, title) { return app }
+        }
+        return nil
+    }
+
+    /// Per-app in-call window-title signatures (macOS 13 fallback only).
+    private static func isCallWindow(_ app: DetectedApp, _ title: String) -> Bool {
+        let t = title.lowercased()
+        switch app {
+        case .zoom:   return t.contains("zoom meeting") || t.contains("meeting")
+        case .teams:  return t.contains("meeting")
+        case .signal: return t.contains("signal call") || t.contains("group call")
+        case .meet:   return false   // handled via the browser path above
+        }
+    }
+
+    /// Match an ACTIVE Google Meet call. Verified against real Firefox titles:
+    /// in a call the title is "Meet - <code>" (e.g. "Meet - tjh-pixe-ier"), while
+    /// the home/lobby/"you left" pages are bare "Meet" or "Google Meet". Matching
+    /// only the "Meet - …" form is what lets auto-STOP fire when you leave (and
+    /// avoids false-starting on the home page). Also excludes "Zoom Meeting" etc.
+    private static func looksLikeMeet(_ title: String) -> Bool {
+        let t = title.lowercased()
+        return t.hasPrefix("meet - ") || t.hasPrefix("meet – ") || t.hasPrefix("meet — ")
+    }
+}
@@ -0,0 +1,125 @@
+import CoreAudio
+import Foundation
+
+/// Watches whether *any* app is using the default input device (the system-wide
+/// "mic is live" signal), via CoreAudio property listeners. Re-binds when the
+/// default input device changes (e.g. you plug in a headset mid-call).
+///
+/// Threading: ALL CoreAudio state (deviceID, listener blocks, `started`,
+/// `generation`) and all Add/Remove calls are confined to the serial `queue`.
+/// `isRunning` and `liveGeneration` are written and read only on the main
+/// thread. `onChange` fires on main.
+///
+/// Stale readings: `deliver` hops from `queue` to main asynchronously, so a
+/// reading may still be waiting in the main queue when `stop()` returns. Every
+/// reading is therefore stamped with the `generation` it was taken in and is
+/// dropped on main once `stop()` has moved `liveGeneration` past it. Without
+/// that, a late reading would overwrite the `isRunning = false` reset and fire
+/// `onChange` after the monitor was stopped.
+final class MicActivityMonitor {
+    private(set) var isRunning = false           // main-thread only
+    var onChange: ((Bool) -> Void)?
+
+    private let queue = DispatchQueue(label: "xyz.ten31.micmonitor")
+
+    // main-thread only: the generation whose readings are still accepted.
+    private var liveGeneration = 0
+
+    // queue-confined:
+    private var deviceID = AudioObjectID(kAudioObjectUnknown)
+    private var runningBlock: AudioObjectPropertyListenerBlock?
+    private var defaultDeviceBlock: AudioObjectPropertyListenerBlock?
+    private var started = false
+    private var generation = 0                   // bumped by every end()
+
+    private static let runningAddr = AudioObjectPropertyAddress(
+        mSelector: kAudioDevicePropertyDeviceIsRunningSomewhere,
+        mScope: kAudioObjectPropertyScopeGlobal,
+        mElement: kAudioObjectPropertyElementMain)
+
+    private static let defaultDeviceAddr = AudioObjectPropertyAddress(
+        mSelector: kAudioHardwarePropertyDefaultInputDevice,
+        mScope: kAudioObjectPropertyScopeGlobal,
+        mElement: kAudioObjectPropertyElementMain)
+
+    /// Starts monitoring. The listeners are installed asynchronously on `queue`;
+    /// the first reading arrives later via `onChange` / `isRunning` on main.
+    /// Calling it while already started is a no-op.
+    func start() { queue.async { self.begin() } }
+
+    /// Called on the main thread (by the @MainActor CallDetector). Resets
+    /// `isRunning` so a subsequent enable()'s synchronous evaluation can't read a
+    /// stale `true` before the fresh reading arrives, and retires every reading
+    /// that was taken before this call but has not reached main yet.
+    ///
+    /// Blocks until `queue` has removed the listeners, so it must not be called
+    /// from `queue` itself.
+    func stop() {
+        liveGeneration = queue.sync { () -> Int in
+            self.end()
+            return self.generation
+        }
+        isRunning = false
+    }
+
+    // MARK: - queue-confined
+
+    /// Installs the default-input-device listener and binds to the current device.
+    private func begin() {
+        guard !started else { return }
+        started = true
+        var addr = Self.defaultDeviceAddr
+        let block: AudioObjectPropertyListenerBlock = { [weak self] _, _ in
+            self?.rebindRunning()   // delivered on `queue`
+        }
+        defaultDeviceBlock = block
+        AudioObjectAddPropertyListenerBlock(AudioObjectID(kAudioObjectSystemObject), &addr, queue, block)
+        bindRunning()
+    }
+
+    /// Removes both listeners and opens a new generation so that readings
+    /// produced up to this point are ignored once they reach main.
+    private func end() {
+        started = false
+        generation += 1
+        if let block = defaultDeviceBlock {
+            var addr = Self.defaultDeviceAddr
+            AudioObjectRemovePropertyListenerBlock(AudioObjectID(kAudioObjectSystemObject), &addr, queue, block)
+            defaultDeviceBlock = nil
+        }
+        unbindRunning()
+    }
+
+    /// Attaches the "is running somewhere" listener to the current default input
+    /// device and publishes an initial reading.
+    private func bindRunning() {
+        guard started else { return }
+        deviceID = Self.defaultInputDevice()
+        guard deviceID != AudioObjectID(kAudioObjectUnknown) else { return }
+        var addr = Self.runningAddr
+        let block: AudioObjectPropertyListenerBlock = { [weak self] _, _ in
+            guard let self else { return }
+            self.deliver(Self.isDeviceRunning(self.deviceID))   // on `queue`
+        }
+        runningBlock = block
+        // Install the listener BEFORE the initial read so a flip during setup is
+        // caught (either by the now-installed listener or the post-install read).
+        AudioObjectAddPropertyListenerBlock(deviceID, &addr, queue, block)
+        deliver(Self.isDeviceRunning(deviceID))
+    }
+
+    /// Detaches the running listener (if one is bound) and forgets the device.
+    private func unbindRunning() {
+        if deviceID != AudioObjectID(kAudioObjectUnknown), let block = runningBlock {
+            var addr = Self.runningAddr
+            AudioObjectRemovePropertyListenerBlock(deviceID, &addr, queue, block)
+        }
+        runningBlock = nil
+        deviceID = AudioObjectID(kAudioObjectUnknown)
+    }
+
+    /// Default input device changed: move the running listener to the new device.
+    private func rebindRunning() {
+        guard started else { return }
+        unbindRunning()
+        bindRunning()
+    }
+
+    /// Publishes a reading on main. Called on `queue`; the reading is stamped with
+    /// the current generation and discarded on main if `stop()` has run since.
+    private func deliver(_ running: Bool) {
+        let stamp = generation
+        DispatchQueue.main.async {
+            guard stamp == self.liveGeneration else { return }   // taken before stop()
+            let changed = running != self.isRunning
+            self.isRunning = running
+            if changed { self.onChange?(running) }
+        }
+    }
+
+    // MARK: - CoreAudio reads (use local address copies)
+
+    /// The current default input device, or `kAudioObjectUnknown` if the query fails.
+    private static func defaultInputDevice() -> AudioObjectID {
+        var addr = defaultDeviceAddr
+        var device = AudioObjectID(kAudioObjectUnknown)
+        var size = UInt32(MemoryLayout<AudioObjectID>.size)
+        let status = AudioObjectGetPropertyData(
+            AudioObjectID(kAudioObjectSystemObject), &addr, 0, nil, &size, &device)
+        return status == noErr ? device : AudioObjectID(kAudioObjectUnknown)
+    }
+
+    /// Whether `device` reports "running somewhere"; `false` for an unknown
+    /// device or when the property read fails.
+    private static func isDeviceRunning(_ device: AudioObjectID) -> Bool {
+        guard device != AudioObjectID(kAudioObjectUnknown) else { return false }
+        var addr = runningAddr
+        var value: UInt32 = 0
+        var size = UInt32(MemoryLayout<UInt32>.size)
+        let status = AudioObjectGetPropertyData(device, &addr, 0, nil, &size, &value)
+        return status == noErr && value != 0
+    }
+}
@@ -1,6 +1,7 @@
 import Foundation
 import Combine
 import AppKit
+import CoreGraphics

 struct SessionInfo: Equatable {
    let folder: URL
@@ -25,6 +26,14 @@ final class SessionController: ObservableObject {
        case error(String)
    }

+    /// Backend transcription status for the most recent session.
+    enum TranscriptStatus: Equatable {
+        /// Nothing in flight: the initial value, and what the status returns to
+        /// when a recording finishes or a run is cancelled.
+        case idle
+        /// A run is in progress; payload is (chunks done, total chunks).
+        case processing(Int, Int)              // chunk done, total
+        /// The run finished; counts are taken from the resulting speakers file.
+        case done(speakers: Int, segments: Int)
+        /// The run threw; payload is the error's localized description.
+        case failed(String)
+    }
+
    /// Set in init so `AppDelegate.applicationShouldTerminate` can finalize a
    /// recording in progress before the app quits.
    static weak var shared: SessionController?
@@ -37,12 +46,34 @@ final class SessionController: ObservableObject {
    @Published private(set) var systemLevel: Float = 0
    /// Surfaced after a session if system audio stopped early.
    @Published private(set) var warning: String?
+    /// Mirrored from `CallDetector` for the UI.
+    @Published private(set) var detectionStatus: CallDetector.Status = .disabled
+    /// Backend transcription status for the last session.
+    @Published private(set) var transcriptStatus: TranscriptStatus = .idle

    private let settings: AppSettings
+    private var voiceprints: VoiceprintStore
+    private let detector = CallDetector()
+    private var cancellables = Set<AnyCancellable>()
+    private var currentLabel = "manual"
+    /// Inputs needed to (re)process the last finished session through the backend.
+    /// Captured when a recording finishes so "Send to backend" can be repeated
+    /// without the recorder.
+    private struct ProcessInputs {
+        /// Session folder; `speakers.json` is written into it.
+        let folder: URL
+        /// Session identifier: the session folder's last path component.
+        let sessionId: String
+        /// Label the session was started with ("manual" or the detected app's label).
+        let app: String
+        /// The recorder's mixed audio file for the session.
+        let mixedURL: URL
+        /// Mic-VAD spans attributed to the user, used to seed the timeline.
+        let selfSpans: [VADSpan]
+    }
+    private var lastProcess: ProcessInputs?
+    private var processTask: Task<Void, Never>?
    private var recorder: AudioRecorder?
    private var currentFolder: URL?
    private var startTime: Date?
    private var timer: Timer?
+    /// True when the current session was started by call detection (not the user).
+    private var autoStarted = false
+    /// Set if a detected call ends while we're still in `.starting`.
+    private var pendingAutoStop = false
    /// The in-flight start or stop Task, so `prepareForTermination` can await it.
    private var lifecycleTask: Task<Void, Never>?
    /// Bumped each time a start/stop Task is spawned (Task is a value type, so this
@@ -51,7 +82,64 @@ final class SessionController: ObservableObject {

+    /// Wires the controller to `settings`: opens the voiceprint store under the
+    /// output folder, registers itself as `shared`, hooks the call detector's
+    /// callbacks, and subscribes to the settings that affect detection and the
+    /// voiceprint location.
    init(settings: AppSettings) {
        self.settings = settings
+        self.voiceprints = VoiceprintStore(
+            fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json"))
        SessionController.shared = self
+
+        detector.onCallStart = { [weak self] app in self?.handleCallStart(app) }
+        detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
+        // Mirror the detector's status into our own published property for the UI.
+        detector.$status
+            .sink { [weak self] status in self?.detectionStatus = status }
+            .store(in: &cancellables)
+        // Re-point the voiceprint DB if the output folder changes. The in-flight
+        // pipeline keeps its own captured reference, so this can't disrupt a run.
+        settings.$outputFolderPath
+            .dropFirst()
+            .sink { [weak self] path in
+                guard let self else { return }
+                let dir = URL(fileURLWithPath: (path as NSString).expandingTildeInPath, isDirectory: true)
+                self.voiceprints = VoiceprintStore(fileURL: dir.appendingPathComponent("voiceprints.json"))
+            }
+            .store(in: &cancellables)
+        // NOTE(review): unlike the folder subscription there is no dropFirst()
+        // here, so this presumably also runs once for the stored value at launch
+        // and is what enables the detector initially — confirm.
+        settings.$autoRecordOnDetection
+            .sink { [weak self] on in
+                guard let self else { return }
+                if on {
+                    self.detector.enable()
+                } else {
+                    self.detector.disable()
+                    // Don't leave an auto-started session running with no detector —
+                    // handle both .recording and the in-flight .starting case.
+                    if self.autoStarted {
+                        switch self.state {
+                        case .recording: self.stop()
+                        case .starting:  self.pendingAutoStop = true
+                        default:         break
+                        }
+                    }
+                }
+            }
+            .store(in: &cancellables)
+    }
+
+    // MARK: - Auto-detection
+
+    /// Reacts to the detector reporting a call in `app`. Starts an auto session
+    /// labelled after the app, but only while auto-record is enabled and no
+    /// session is already starting, recording or finishing.
+    private func handleCallStart(_ app: CallDetector.DetectedApp) {
+        guard settings.autoRecordOnDetection else { return }
+        switch state {
+        case .starting, .recording, .finishing:
+            // A session is already active; leave it alone.
+            return
+        case .idle, .error:
+            start(label: app.label, auto: true)
+        }
+    }
+
+    /// Reacts to the detector reporting that the call ended. Manual recordings
+    /// are never touched; only a session that call detection started is stopped.
+    private func handleCallEnd() {
+        guard autoStarted else { return }
+        switch state {
+        case .idle, .error, .finishing:
+            return
+        case .starting:
+            // Too early to stop: start() checks this flag once recording is live.
+            pendingAutoStop = true
+        case .recording:
+            stop()
+        }
    }

    var isBusy: Bool {
@@ -68,15 +156,18 @@ final class SessionController: ObservableObject {

    // MARK: - Start / Stop

-    private func start() {
+    private func start(label: String = "manual", auto: Bool = false) {
        let folder: URL
        do {
-            folder = try makeSessionFolder()
+            folder = try makeSessionFolder(label: label)
        } catch {
            fail("Couldn't create session folder: \(error.localizedDescription)")
            return
        }
        currentFolder = folder
+        currentLabel = label
+        autoStarted = auto
+        pendingAutoStop = false
        let recorder = AudioRecorder(
            micURL: folder.appendingPathComponent("mic.wav"),
            systemURL: folder.appendingPathComponent("system.wav"),
@@ -92,12 +183,36 @@ final class SessionController: ObservableObject {
                self.state = .recording
                self.startTime = Date()
                self.startTimer()
+                // A detected call may have ended while we were still starting.
+                if self.pendingAutoStop {
+                    self.pendingAutoStop = false
+                    self.stop()
+                }
            } catch {
-                self.fail("Couldn't start recording: \(error.localizedDescription)")
+                self.handleStartFailure(error)
            }
        }
    }

+    /// Turns a recorder start failure into a message the user can act on. The
+    /// usual culprit is Screen Recording being re-checked after a rebuild (the
+    /// SCStream auth check fails even though CGPreflight reports granted). When
+    /// the error text hints at that, re-prompt and open the matching Settings
+    /// pane instead of surfacing a cryptic TCC error; otherwise report the error
+    /// as-is.
+    private func handleStartFailure(_ error: Error) {
+        let description = error.localizedDescription
+        let lowered = description.lowercased()
+        let permissionHints = ["declined", "tcc", "screen", "permission"]
+        guard permissionHints.contains(where: { lowered.contains($0) }) else {
+            fail("Couldn't start recording: \(description)")
+            return
+        }
+
+        _ = CGRequestScreenCaptureAccess()
+        let settingsPane = "x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture"
+        if let url = URL(string: settingsPane) {
+            NSWorkspace.shared.open(url)
+        }
+        fail("Screen Recording needs re-approval for this build. Toggle Ten31Transcripts off then on in System Settings ▸ Screen Recording, then restart the app.")
+    }
+
    private func stop() {
        guard let recorder else { return }
        state = .finishing
@@ -114,20 +229,66 @@ final class SessionController: ObservableObject {
        micLevel = 0
        systemLevel = 0
        warning = result.systemNote.map { "System audio stopped early: \($0)" }
+        transcriptStatus = .idle
        if let folder = currentFolder {
            writeSelfSpans(result, to: folder)
            lastSession = SessionInfo(
                folder: folder, mixedURL: result.mixedURL,
                duration: result.duration, selfSpanCount: result.selfSpans.count)
+            lastProcess = ProcessInputs(
+                folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
+                mixedURL: result.mixedURL, selfSpans: result.selfSpans)
        }
+        let autoSend = settings.autoSendOnStop
        currentFolder = nil
+        autoStarted = false
+        pendingAutoStop = false
        elapsed = 0
        state = .idle
+        if autoSend { processLastSession() }
+    }
+
+    // MARK: - Backend transcription
+
+    /// Send the last finished session to the backend → `speakers.json`. Uses the
+    /// mic-VAD self spans as the timeline for now; visual segments (Phase 3–4) get
+    /// merged in once the adapters land. Safe to call manually ("Send to backend")
+    /// or automatically on stop.
+    ///
+    /// Does nothing when no session has finished yet or while the status is
+    /// `.processing`. The outcome is reported through `transcriptStatus`.
+    func processLastSession() {
+        guard let inputs = lastProcess else { return }
+        // NOTE(review): this guard keys off `transcriptStatus`, but finishing a
+        // recording resets that status to `.idle`; if an earlier run's task is
+        // still in flight at that point a second run could start alongside it —
+        // confirm whether `processTask` should gate re-entry instead.
+        if case .processing = transcriptStatus { return }
+        // Placeholder progress until the pipeline reports the real chunk count.
+        transcriptStatus = .processing(0, 1)
+
+        // Snapshot what the task needs so later changes to the controller's
+        // settings/voiceprint references don't affect this run.
+        let settings = self.settings
+        let voiceprints = self.voiceprints
+        processTask = Task {
+            let pipeline = TranscriptPipeline(
+                baseURL: settings.backendBaseURL,
+                skipTLS: settings.skipTLSVerification,
+                voiceprints: voiceprints)
+            let timeline = TranscriptPipeline.timeline(
+                fromSelfSpans: inputs.selfSpans, selfName: settings.selfName)
+            do {
+                let speakers = try await pipeline.process(
+                    sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
+                    mixedURL: inputs.mixedURL, timeline: timeline,
+                    progress: { done, total in
+                        await MainActor.run { self.transcriptStatus = .processing(done, total) }
+                    })
+                self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count)
+            } catch is CancellationError {
+                // Cancelled (e.g. by prepareForTermination): not a failure.
+                self.transcriptStatus = .idle
+            } catch {
+                self.transcriptStatus = .failed(error.localizedDescription)
+            }
+        }
    }

    private func fail(_ message: String) {
        recorder = nil
        currentFolder = nil
+        autoStarted = false
+        pendingAutoStop = false
        stopTimer()
        micLevel = 0
        systemLevel = 0
@@ -139,6 +300,9 @@ final class SessionController: ObservableObject {
    /// its WAV headers are finalized before the process exits. Handles quit while
    /// `.starting` and `.finishing`, not just `.recording`.
    func prepareForTermination() async {
+        // Cancel any in-flight backend transcription (audio is already saved; the
+        // user can resend). The pipeline's checkCancellation + defer clean up chunks.
+        processTask?.cancel()
        // Drain whatever lifecycle Task is in flight until nothing is busy. A Stop
        // click landing in an await window can spawn a new stop Task, so loop
        // rather than awaiting a single captured task.
@@ -178,9 +342,9 @@ final class SessionController: ObservableObject {

    // MARK: - Files

-    private func makeSessionFolder() throws -> URL {
+    /// Creates (including intermediate directories) and returns the folder
+    /// `<output>/sessions/<timestamp>_<label>` for a new session.
+    /// - Parameter label: Suffix for the folder name, e.g. "manual" or the
+    ///   detected app's label.
+    /// - Throws: Whatever `FileManager.createDirectory` throws.
+    private func makeSessionFolder(label: String) throws -> URL {
        let base = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true)
-        let folder = base.appendingPathComponent("\(Self.timestamp())_manual", isDirectory: true)
+        let folder = base.appendingPathComponent("\(Self.timestamp())_\(label)", isDirectory: true)
        try FileManager.default.createDirectory(at: folder, withIntermediateDirectories: true)
        return folder
    }
@@ -0,0 +1,85 @@
+import Foundation
+import AVFoundation
+
+/// Splits a long session into backend-sized chunks and produces, per chunk, the
+/// sliced audio and the timeline rebased to chunk-local seconds.
+///
+/// The diarizer caps at 4 speakers/chunk and has request limits, so calls > ~3
+/// min are chunked into ~2–3 min windows; names + voiceprints unify speakers
+/// across chunks (handled in the pipeline).
+enum SessionPackager {
+    /// One planned window of the session, in global (session) seconds.
+    struct PlannedChunk: Equatable {
+        let index: Int
+        let start: Double      // global seconds
+        let end: Double
+    }
+
+    /// Plans the chunk windows for a session.
+    ///
+    /// Sessions no longer than `thresholdSec` yield a single chunk covering
+    /// `[0, durationSec]`. Longer sessions are cut into consecutive windows of
+    /// `chunkSeconds`; the last window is whatever remains and may be shorter.
+    ///
+    /// - Parameters:
+    ///   - durationSec: Total session length in seconds.
+    ///   - chunkSeconds: Window length. A non-positive (or NaN) value cannot
+    ///     advance the plan, so it also falls back to a single chunk.
+    ///   - thresholdSec: Sessions up to this length are not split.
+    /// - Returns: Chunks in order, indexed from 0.
+    static func planChunks(durationSec: Double,
+                           chunkSeconds: Double = 150,
+                           thresholdSec: Double = 180) -> [PlannedChunk] {
+        // `chunkSeconds > 0` guards the loop below: with a zero or negative
+        // window `start` would never move forward and the loop would not end.
+        guard durationSec > thresholdSec, chunkSeconds > 0 else {
+            return [PlannedChunk(index: 0, start: 0, end: durationSec)]
+        }
+        var chunks: [PlannedChunk] = []
+        var start = 0.0
+        var index = 0
+        // The 1 ms slack avoids emitting a degenerate trailing chunk when
+        // floating-point accumulation lands a hair short of `durationSec`.
+        while start < durationSec - 0.001 {
+            let end = min(start + chunkSeconds, durationSec)
+            chunks.append(PlannedChunk(index: index, start: start, end: end))
+            start = end
+            index += 1
+        }
+        return chunks
+    }
+
+    /// Clip segments to `[start, end)` and rebase to chunk-local seconds, then
+    /// emit the flat `label-merge` array `[{start,end,name,confidence}]`.
+    ///
+    /// Segments that do not overlap the window are dropped.
+    /// - Throws: Errors from `JSONSerialization.data(withJSONObject:)`.
+    static func rebasedTimelineData(_ segments: [VisualTimeline.Segment],
+                                    start: Double, end: Double) throws -> Data {
+        let flat: [[String: Any]] = segments.compactMap { seg in
+            let s = max(seg.start, start)
+            let e = min(seg.end, end)
+            guard e > s else { return nil }
+            return ["start": s - start, "end": e - start, "name": seg.name, "confidence": seg.confidence]
+        }
+        return try JSONSerialization.data(withJSONObject: flat, options: [])
+    }
+
+    /// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
+    ///
+    /// The range is clamped to the file (`startSec` below zero starts at the
+    /// first frame, `endSec` past the end stops at the last). When the clamped
+    /// range is empty nothing is written and `dest` is not created, which is how
+    /// callers detect an empty slice.
+    /// - Throws: Errors from opening, reading or writing the audio files.
+    static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
+        let input = try AVAudioFile(forReading: source)
+        let sr = input.fileFormat.sampleRate
+        // Clamp to the first frame: a negative position is not a valid seek target.
+        let startFrame = max(0, AVAudioFramePosition((startSec * sr).rounded()))
+        let endFrame = min(input.length, AVAudioFramePosition((endSec * sr).rounded()))
+        guard endFrame > startFrame else { return }
+
+        let settings: [String: Any] = [
+            AVFormatIDKey: kAudioFormatLinearPCM,
+            AVSampleRateKey: sr,
+            AVNumberOfChannelsKey: 1,
+            AVLinearPCMBitDepthKey: 16,
+            AVLinearPCMIsFloatKey: false,
+            AVLinearPCMIsBigEndianKey: false,
+        ]
+        let output = try AVAudioFile(forWriting: dest, settings: settings,
+                                     commonFormat: .pcmFormatFloat32, interleaved: false)
+        input.framePosition = startFrame
+        var remaining = AVAudioFrameCount(endFrame - startFrame)
+        let block: AVAudioFrameCount = 16_000
+        // One buffer is reused for every pass; `read` resets its frameLength.
+        guard let buffer = AVAudioPCMBuffer(pcmFormat: input.processingFormat, frameCapacity: block) else { return }
+        while remaining > 0 {
+            try input.read(into: buffer, frameCount: min(block, remaining))
+            if buffer.frameLength == 0 { break }   // source ended early
+            try output.write(from: buffer)
+            remaining -= buffer.frameLength
+        }
+    }
+
+    /// Duration (seconds) of a WAV, or 0 when the file cannot be opened or
+    /// reports a non-positive sample rate.
+    static func duration(of url: URL) -> Double {
+        guard let file = try? AVAudioFile(forReading: url), file.fileFormat.sampleRate > 0 else { return 0 }
+        return Double(file.length) / file.fileFormat.sampleRate
+    }
+}
@@ -0,0 +1,45 @@
+import Foundation
+
+/// `speakers.json` — the final stored output (docs §6): per-chunk `label-merge`
+/// results concatenated, timestamps offset back to global seconds, names unified.
+/// This is the hand-off to the downstream summarizer; the app stops here.
+struct SpeakersFile: Codable {
+    let sessionId: String
+    let app: String
+    let durationSec: Double
+    let speakers: [Speaker]
+    let segments: [Segment]
+    let models: [String: String]
+
+    /// One unified speaker. The two scores are optional and encoded under
+    /// snake_case keys.
+    struct Speaker: Codable, Equatable {
+        let name: String
+        let source: String
+        let overlapConfidence: Double?
+        let matchSimilarity: Double?
+        enum CodingKeys: String, CodingKey {
+            case name, source
+            case overlapConfidence = "overlap_confidence"
+            case matchSimilarity = "match_similarity"
+        }
+    }
+
+    /// One attributed span; `text` is optional.
+    struct Segment: Codable, Equatable {
+        let start: Double
+        let end: Double
+        let speaker: String
+        let text: String?
+    }
+
+    enum CodingKeys: String, CodingKey {
+        case sessionId = "session_id"
+        case app
+        case durationSec = "duration_sec"
+        case speakers, segments, models
+    }
+
+    /// Encodes the file as pretty-printed JSON with sorted keys and writes it to
+    /// `url`.
+    ///
+    /// The write is atomic, so an interrupted write (for example the app
+    /// quitting mid-run) cannot leave a truncated `speakers.json` behind for the
+    /// downstream consumer.
+    /// - Throws: Encoding errors, or the error from writing the data.
+    func write(to url: URL) throws {
+        let encoder = JSONEncoder()
+        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
+        let payload = try encoder.encode(self)
+        try payload.write(to: url, options: .atomic)
+    }
+}
@@ -0,0 +1,78 @@
+import Foundation
+
+/// Merges the per-chunk `label-merge` responses of one session into a single
+/// global `speakers.json`: segment times are shifted back to session seconds,
+/// speakers are unified across chunks by name, and named fingerprints are
+/// gathered for the voiceprint store.
+enum TranscriptAssembler {
+    /// A backend response together with where its chunk starts in the session.
+    struct ChunkResult {
+        let chunkStart: Double           // global seconds
+        let response: LabelMergeResponse
+    }
+
+    /// Result of `assemble`: the file to store plus the fingerprints to persist.
+    struct Assembled {
+        let speakersFile: SpeakersFile
+        let fingerprints: [String: [Float]]   // name -> 192-dim, for VoiceprintStore
+    }
+
+    /// Priority of a speaker `source` when one name shows up in several chunks
+    /// with different sources; anything unrecognised ranks lowest (unmatched).
+    private static func rank(_ source: String) -> Int {
+        let priorities = ["visual": 3, "voiceprint": 2]
+        return priorities[source] ?? 1
+    }
+
+    private static func isUnknown(_ name: String) -> Bool {
+        LabelMergeResponse.isUnknownName(name)
+    }
+
+    /// Builds the session-wide speakers file from `chunks`.
+    ///
+    /// - Segments are offset by their chunk's start and sorted by start time.
+    /// - For a repeated speaker name the entry with the strictly higher source
+    ///   rank wins; on a tie the earlier chunk's entry is kept.
+    /// - Fingerprints are kept for named speakers only (never unknown names) and
+    ///   later chunks overwrite earlier ones.
+    /// - `models` is taken from the last chunk's response.
+    static func assemble(sessionId: String, app: String, chunks: [ChunkResult]) -> Assembled {
+        var allSegments: [SpeakersFile.Segment] = []
+        var speakerByName: [String: SpeakersFile.Speaker] = [:]
+        var prints: [String: [Float]] = [:]
+        var totalDuration = 0.0
+
+        for chunk in chunks {
+            let base = chunk.chunkStart
+            let response = chunk.response
+            // Use the chunk's own audio length so silent/all-unknown calls still
+            // report a real duration rather than just the last segment's end.
+            totalDuration = max(totalDuration, base + response.duration)
+
+            for seg in response.segments {
+                let globalStart = seg.startSeconds + base
+                let globalEnd = seg.endSeconds + base
+                allSegments.append(SpeakersFile.Segment(
+                    start: globalStart, end: globalEnd, speaker: seg.speaker, text: seg.text))
+                totalDuration = max(totalDuration, globalEnd)
+            }
+
+            for sp in response.speakers {
+                let outranksExisting = speakerByName[sp.name]
+                    .map { rank(sp.source) > rank($0.source) } ?? true
+                if outranksExisting {
+                    speakerByName[sp.name] = SpeakersFile.Speaker(
+                        name: sp.name, source: sp.source,
+                        overlapConfidence: sp.overlapConfidence, matchSimilarity: sp.matchSimilarity)
+                }
+                // Named fingerprints only (never Unknown_N / Speaker_unknown).
+                if let fp = sp.fingerprint, !fp.isEmpty, !isUnknown(sp.name) {
+                    prints[sp.name] = fp
+                }
+            }
+            for (name, fp) in response.fingerprints {
+                guard !isUnknown(name), !fp.isEmpty else { continue }
+                prints[name] = fp
+            }
+        }
+
+        let orderedSegments = allSegments.sorted { $0.start < $1.start }
+        let orderedSpeakers = speakerByName.values.sorted { $0.name < $1.name }
+        let models = chunks.last?.response.models ?? [:]
+
+        let file = SpeakersFile(
+            sessionId: sessionId, app: app, durationSec: totalDuration,
+            speakers: orderedSpeakers, segments: orderedSegments, models: models)
+        return Assembled(speakersFile: file, fingerprints: prints)
+    }
+}
@@ -0,0 +1,75 @@
+import Foundation
+
+/// Drives a finished session through the backend: chunk → sequential
+/// `label-merge` (accumulating voiceprints) → assemble `speakers.json` → persist
+/// fingerprints. Requests are sequential by construction (one chunk at a time).
+final class TranscriptPipeline {
+    private let client: SparkControlClient
+    private let voiceprints: VoiceprintStore
+
+    /// - Parameters:
+    ///   - baseURL: Backend base URL handed to the `SparkControlClient`.
+    ///   - skipTLS: Passed through to the client's `skipTLS` option.
+    ///   - voiceprints: Store read for known voiceprints and updated per chunk.
+    init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
+        self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
+        self.voiceprints = voiceprints
+    }
+
+    /// Process `mixedURL` against `timeline` (visual + self spans). Writes
+    /// `speakers.json` into `sessionFolder` and returns it. `progress(done,total)`
+    /// is called per chunk.
+    ///
+    /// `progress` receives `(chunk index, chunk count)` before each chunk and
+    /// `(count, count)` once all chunks are done; an empty session reports
+    /// `(0, 0)`. Temporary chunk audio lives in `<sessionFolder>/chunks`, which
+    /// is removed when the call exits, whether it returns or throws.
+    ///
+    /// - Throws: `CancellationError` if the task is cancelled between chunks, and
+    ///   any error from slicing audio, the backend request, or writing the file.
+    func process(sessionFolder: URL,
+                 sessionId: String,
+                 app: String,
+                 mixedURL: URL,
+                 timeline: [VisualTimeline.Segment],
+                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
+        let duration = SessionPackager.duration(of: mixedURL)
+        let plan = SessionPackager.planChunks(durationSec: duration)
+
+        // Zero-duration / empty session → a valid empty speakers.json, no backend call.
+        if plan.isEmpty || duration <= 0 {
+            let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
+            try empty.speakersFile.write(to: sessionFolder.appendingPathComponent("speakers.json"))
+            await progress?(0, 0)
+            return empty.speakersFile
+        }
+
+        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
+        // Best-effort: if the directory can't be created, slicing into it throws below.
+        try? FileManager.default.createDirectory(at: chunksDir, withIntermediateDirectories: true)
+        defer { try? FileManager.default.removeItem(at: chunksDir) }   // cleanup on success OR throw
+
+        // Start from stored voiceprints; accumulate this call's prints across chunks
+        // for within-call unification (the store only persists high-confidence ones).
+        var known = voiceprints.knownVoiceprints()
+        var results: [TranscriptAssembler.ChunkResult] = []
+
+        for chunk in plan {
+            // Stop between chunks if the surrounding task was cancelled.
+            try Task.checkCancellation()
+            await progress?(chunk.index, plan.count)
+            let chunkURL = chunksDir.appendingPathComponent("chunk_\(String(format: "%03d", chunk.index)).wav")
+            try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
+            guard FileManager.default.fileExists(atPath: chunkURL.path) else { continue }  // empty slice → skip
+
+            let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
+            let response = try await client.labelMerge(
+                audioURL: chunkURL, timeline: timelineData,
+                knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
+
+            // Feed this chunk's named fingerprints into the next chunk's request.
+            for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
+                known[name] = fp
+            }
+            voiceprints.update(with: response)
+            results.append(.init(chunkStart: chunk.start, response: response))
+            // The chunk audio is no longer needed once its response is in hand.
+            try? FileManager.default.removeItem(at: chunkURL)
+        }
+        await progress?(plan.count, plan.count)
+
+        let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
+        try assembled.speakersFile.write(to: sessionFolder.appendingPathComponent("speakers.json"))
+        return assembled.speakersFile
+    }
+
+    /// Build the `label-merge` timeline from mic-VAD self spans (Phase 1/2). Once
+    /// the visual adapters land (Phase 3–4), their segments are merged in too.
+    ///
+    /// Every span becomes one segment named `selfName` with source "mic_vad",
+    /// keeping the span's start, end and confidence.
+    static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
+        spans.map { .init(start: $0.start, end: $0.end, name: selfName, confidence: $0.confidence, source: "mic_vad") }
+    }
+}
@@ -32,6 +32,21 @@ final class AppSettings: ObservableObject {
        didSet { defaults.set(adapterEnabled, forKey: Keys.adapterEnabled) }
    }

+    /// Whether call detection may start and stop recordings on its own. Saved to
+    /// defaults on every change; the initializer falls back to `true` when no
+    /// value has been stored yet.
+    @Published var autoRecordOnDetection: Bool {
+        didSet { defaults.set(autoRecordOnDetection, forKey: Keys.autoRecord) }
+    }
+
+    /// The user's name, pre-seeded into the timeline for mic-VAD "self" spans.
+    @Published var selfName: String {
+        didSet { defaults.set(selfName, forKey: Keys.selfName) }
+    }
+
+    /// Auto-send a finished recording to the backend for transcription. Default
+    /// off while developing; flip on for hands-free transcripts.
+    @Published var autoSendOnStop: Bool {
+        didSet { defaults.set(autoSendOnStop, forKey: Keys.autoSend) }
+    }
+
    /// Output folder as a resolved file URL (expands a leading `~`).
    var outputFolderURL: URL {
        URL(fileURLWithPath: (outputFolderPath as NSString).expandingTildeInPath,
@@ -55,6 +70,10 @@ final class AppSettings: ObservableObject {
        self.adapterEnabled = stored ?? Dictionary(
            uniqueKeysWithValues: Self.adapterKeys.map { ($0.key, true) }
        )
+
+        self.autoRecordOnDetection = defaults.object(forKey: Keys.autoRecord) as? Bool ?? true
+        self.selfName = defaults.string(forKey: Keys.selfName) ?? "Me"
+        self.autoSendOnStop = defaults.object(forKey: Keys.autoSend) as? Bool ?? false
    }

    private enum Keys {
@@ -62,5 +81,8 @@ final class AppSettings: ObservableObject {
        static let skipTLS = "skipTLSVerification"
        static let outputFolder = "outputFolderPath"
        static let adapterEnabled = "adapterEnabled"
+        static let autoRecord = "autoRecordOnDetection"
+        static let selfName = "selfName"
+        static let autoSend = "autoSendOnStop"
    }
 }
@@ -46,6 +46,9 @@ struct MenuBarView: View {
                        .foregroundStyle(.secondary)
                }
            }
+            Text(detectionText)
+                .font(.caption)
+                .foregroundStyle(.secondary)

            Button {
                session.toggle()
@@ -84,6 +87,15 @@ struct MenuBarView: View {
                        .font(.caption)
                }
                .buttonStyle(.link)
+
+                HStack {
+                    Button("Send to backend") { session.processLastSession() }
+                        .disabled(transcriptProcessing)
+                    Spacer()
+                }
+                if !transcriptText.isEmpty {
+                    Text(transcriptText).font(.caption).foregroundStyle(transcriptColor)
+                }
            }
        }
    }
@@ -114,6 +126,36 @@ struct MenuBarView: View {
        return String(format: "%02d:%02d", total / 60, total % 60)
    }

+    /// One-line caption describing the call detector's current state.
+    private var detectionText: String {
+        let caption: String
+        switch session.detectionStatus {
+        case .disabled:
+            caption = "Auto-detect off"
+        case .listening:
+            caption = "Listening for calls…"
+        case .inCall(let activeApp):
+            caption = "In call: \(activeApp.display)"
+        }
+        return caption
+    }
+
+    /// `true` only while the transcript status is `.processing`.
+    private var transcriptProcessing: Bool {
+        guard case .processing = session.transcriptStatus else { return false }
+        return true
+    }
+
+    /// Status line for the last transcript run; empty while nothing has happened.
+    private var transcriptText: String {
+        switch session.transcriptStatus {
+        case .idle:
+            return ""
+        case .processing(let finished, let total):
+            return "Transcribing… chunk \(finished)/\(total)"
+        case .done(let speakerCount, let segmentCount):
+            return "Transcript ready · \(speakerCount) speakers · \(segmentCount) segments"
+        case .failed(let reason):
+            return "Transcript failed: \(reason)"
+        }
+    }
+
+    /// Tint for the transcript status line: red on failure, green on success,
+    /// secondary otherwise. Cases are listed explicitly so a newly added status
+    /// has to pick a colour at compile time.
+    private var transcriptColor: Color {
+        switch session.transcriptStatus {
+        case .failed:
+            return .red
+        case .done:
+            return .green
+        case .idle, .processing:
+            return .secondary
+        }
+    }
+
    private var header: some View {
        VStack(alignment: .leading, spacing: 2) {
            Text("Ten31 Transcripts").font(.headline)
@@ -14,6 +14,22 @@ struct SettingsView: View {
                       isOn: $settings.skipTLSVerification)
            }

+            Section("Call detection") {
+                Toggle("Auto-record when a call is detected", isOn: $settings.autoRecordOnDetection)
+                Text("Detects Zoom, Teams, Signal, and Google Meet (any browser).")
+                    .font(.caption)
+                    .foregroundStyle(.secondary)
+            }
+
+            Section("Transcription") {
+                TextField("Your name", text: $settings.selfName)
+                    .textFieldStyle(.roundedBorder)
+                Toggle("Auto-send recordings to backend", isOn: $settings.autoSendOnStop)
+                Text("Your name labels the mic-VAD \"self\" spans. Auto-send transcribes each recording on stop.")
+                    .font(.caption)
+                    .foregroundStyle(.secondary)
+            }
+
            Section("Output") {
                HStack {
                    Text(settings.outputFolderPath)
@@ -0,0 +1,82 @@
+import Foundation
+import CoreGraphics
+
+/// Renders a CGImage to an RGBA8 buffer once, then answers cheap colour queries
+/// over pixel regions. Used to score the active-speaker highlight (a saturated
+/// coloured border/ring) around participant tiles.
+struct FrameSampler {
+    let width: Int
+    let height: Int
+    private let pixels: [UInt8]      // RGBA8, row-major, top-left origin
+
+    /// Rasterises `cgImage` into an 8-bit RGBA buffer (device RGB, premultiplied
+    /// alpha). Fails for an empty image or when the bitmap context can't be made.
+    init?(cgImage: CGImage) {
+        let w = cgImage.width, h = cgImage.height
+        guard w > 0, h > 0 else { return nil }
+        var buffer = [UInt8](repeating: 0, count: w * h * 4)
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        let info = CGImageAlphaInfo.premultipliedLast.rawValue
+        // The context renders directly into `buffer`'s storage. That pointer is
+        // only valid inside the closure, so the draw has to happen in here too
+        // rather than on a context that has escaped it.
+        let rendered = buffer.withUnsafeMutableBytes { raw -> Bool in
+            guard let ctx = CGContext(data: raw.baseAddress, width: w, height: h, bitsPerComponent: 8,
+                                      bytesPerRow: w * 4, space: colorSpace, bitmapInfo: info) else {
+                return false
+            }
+            ctx.draw(cgImage, in: CGRect(x: 0, y: 0, width: w, height: h))
+            return true
+        }
+        guard rendered else { return nil }
+        self.width = w
+        self.height = h
+        self.pixels = buffer
+    }
+
+    /// Mean HSV saturation (0…1) over a pixel rect (top-left origin), sampled on a grid.
+    ///
+    /// - Parameters:
+    ///   - rect: Region in pixels; it is clipped to the image. Empty, null or
+    ///     non-finite rects yield 0.
+    ///   - samples: Target number of samples per axis; values below 1 are treated as 1.
+    /// - Returns: The mean saturation of the sampled pixels, or 0 when nothing is sampled.
+    func meanSaturation(inPixelRect rect: CGRect, samples: Int = 24) -> Double {
+        guard let xs = Self.clampedSpan(rect.minX, rect.maxX, limit: width),
+              let ys = Self.clampedSpan(rect.minY, rect.maxY, limit: height) else { return 0 }
+        let perAxis = max(1, samples)            // keeps the divisions below well-defined
+        let stepX = max(1, xs.count / perAxis)
+        let stepY = max(1, ys.count / perAxis)
+        var sum = 0.0, count = 0
+        for y in stride(from: ys.lowerBound, to: ys.upperBound, by: stepY) {
+            for x in stride(from: xs.lowerBound, to: xs.upperBound, by: stepX) {
+                sum += saturationAndValue(x: x, y: y).saturation
+                count += 1
+            }
+        }
+        return count > 0 ? sum / Double(count) : 0
+    }
+
+    /// Highest mean saturation among the four strips just inside `rect`'s edges
+    /// (the tile border), excluding the interior — that's where the speaking
+    /// highlight lives. Each strip is at least 2 px thick.
+    func borderSaturation(inPixelRect rect: CGRect, thicknessFraction: Double = 0.12) -> Double {
+        let t = max(2.0, min(rect.width, rect.height) * thicknessFraction)
+        let top = CGRect(x: rect.minX, y: rect.minY, width: rect.width, height: t)
+        let bottom = CGRect(x: rect.minX, y: rect.maxY - t, width: rect.width, height: t)
+        let left = CGRect(x: rect.minX, y: rect.minY, width: t, height: rect.height)
+        let right = CGRect(x: rect.maxX - t, y: rect.minY, width: t, height: rect.height)
+        return [top, bottom, left, right].map { meanSaturation(inPixelRect: $0) }.max() ?? 0
+    }
+
+    /// Grid-sampled pixel positions (top-left origin) that are strongly saturated
+    /// AND bright enough to be a UI highlight — i.e. the speaking ring/border.
+    ///
+    /// - Parameters:
+    ///   - threshold: Saturation (0…1) a pixel must exceed.
+    ///   - minBrightness: Largest channel value (0…255) a pixel must exceed.
+    ///   - gridStep: Pixel spacing of the sampling grid; values below 1 are treated as 1.
+    func saturatedPoints(threshold: Double = 0.5, minBrightness: Double = 60, gridStep: Int = 6) -> [CGPoint] {
+        let step = max(1, gridStep)     // a non-positive step would never advance
+        var points: [CGPoint] = []
+        for y in stride(from: 0, to: height, by: step) {
+            for x in stride(from: 0, to: width, by: step) {
+                let (saturation, value) = saturationAndValue(x: x, y: y)
+                if saturation > threshold && value > minBrightness {
+                    points.append(CGPoint(x: x, y: y))
+                }
+            }
+        }
+        return points
+    }
+
+    // MARK: - Internal
+
+    /// Saturation (0…1) and largest channel value (0…255) of the pixel at (x, y).
+    private func saturationAndValue(x: Int, y: Int) -> (saturation: Double, value: Double) {
+        let i = (y * width + x) * 4
+        let r = Double(pixels[i]), g = Double(pixels[i + 1]), b = Double(pixels[i + 2])
+        let mx = max(r, g, b), mn = min(r, g, b)
+        return (mx > 0 ? (mx - mn) / mx : 0, mx)
+    }
+
+    /// Clips `[lower, upper)` to `0..<limit` and truncates to whole pixels.
+    /// Returns `nil` for an empty result or non-finite edges (e.g. `CGRect.null`),
+    /// which would otherwise trap in the `Int` conversion.
+    private static func clampedSpan(_ lower: CGFloat, _ upper: CGFloat, limit: Int) -> Range<Int>? {
+        guard lower.isFinite, upper.isFinite else { return nil }
+        let cap = CGFloat(limit)
+        let lo = Int(min(max(lower, 0), cap))
+        let hi = Int(min(max(upper, 0), cap))
+        return hi > lo ? lo..<hi : nil
+    }
+}
@@ -0,0 +1,94 @@
+import Foundation
+import CoreGraphics
+import CoreVideo
+import CoreImage
+
+/// Shared engine for tile-grid conferencing UIs (Signal/Zoom/Teams): OCR the
+/// name/initials on each tile, then flag the active speaker(s) from the saturated
+/// coloured highlight found near each label.
+///
+/// The geometry in `Config` is a first pass; tile expansion and the saturation
+/// threshold get calibrated per app against real screenshot fixtures. The
+/// detection *logic* (read names; pick the highlighted tile) is validated with
+/// synthetic frames.
+struct GridCallAnalyzer {
+    /// Tunable geometry and thresholds.
+    struct Config {
+        /// Horizontal growth applied to a text bbox to approximate its tile
+        /// (only affects the reported bbox).
+        var tileExpandX: Double = 1.8
+        /// Vertical growth applied to a text bbox to approximate its tile.
+        var tileExpandY: Double = 2.6
+        /// OCR results below this confidence are ignored.
+        var minTextConfidence: Float = 0.3
+        /// Strings longer than this (after trimming) are discarded.
+        var maxNameLength: Int = 40
+        /// Radius around a label's centre, as a fraction of max(frame W, H), in
+        /// which highlight points are counted for that label.
+        var highlightRadiusFraction: Double = 0.22
+        /// Fewest highlight points a label needs to count as speaking.
+        var minHighlightPoints: Int = 6
+        /// A label must also reach this fraction of the busiest label's count.
+        var highlightShareOfMax: Double = 0.35
+    }
+
+    var config = Config()
+    var recognizer = TextRecognizer()
+
+    /// Converts the pixel buffer to a `CGImage` and analyzes it; an unconvertible
+    /// buffer yields no observations.
+    func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
+        Self.cgImage(from: pixelBuffer).map { analyze(cgImage: $0, at: t) } ?? []
+    }
+
+    /// Produces one observation per accepted OCR label in `cgImage`, all stamped
+    /// with time `t`.
+    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
+        let labels = recognizer.recognize(in: cgImage).filter { candidate in
+            candidate.confidence >= config.minTextConfidence && !cleaned(candidate.text).isEmpty
+        }
+        guard !labels.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }
+
+        let imageW = cgImage.width, imageH = cgImage.height
+
+        // Collect highlight pixels once, then count how many fall within the
+        // radius of each label's centre. A point can count towards several labels.
+        let highlights = sampler.saturatedPoints()
+        let reach = Double(max(imageW, imageH)) * config.highlightRadiusFraction
+        let reachSquared = reach * reach
+
+        var hits: [Int] = []
+        hits.reserveCapacity(labels.count)
+        for label in labels {
+            let centreX = Double(label.boundingBox.midX) * Double(imageW)
+            let centreY = (1 - Double(label.boundingBox.midY)) * Double(imageH)   // flip Y to top-left origin
+            var inside = 0
+            for point in highlights {
+                let dx = Double(point.x) - centreX
+                let dy = Double(point.y) - centreY
+                if dx * dx + dy * dy <= reachSquared { inside += 1 }
+            }
+            hits.append(inside)
+        }
+
+        let busiest = hits.max() ?? 0
+        let required = max(config.minHighlightPoints,
+                           Int(Double(busiest) * config.highlightShareOfMax))
+        let anyHighlight = busiest >= config.minHighlightPoints
+
+        return zip(labels, hits).map { label, count in
+            SpeakerObservation(name: cleaned(label.text),
+                               speaking: anyHighlight && count >= required,
+                               bbox: tileRect(label.boundingBox, imageW: imageW, imageH: imageH),
+                               confidence: Double(label.confidence),
+                               t: t)
+        }
+    }
+
+    /// Maps a Vision normalized bbox (bottom-left origin) to a pixel rect
+    /// (top-left origin), grown around the text centre by the `Config` expansion
+    /// factors and clipped to the image bounds.
+    private func tileRect(_ box: CGRect, imageW: Int, imageH: Int) -> CGRect {
+        let frameW = Double(imageW), frameH = Double(imageH)
+        let textW = box.width * frameW
+        let textH = box.height * frameH
+        let centreX = (box.midX) * frameW
+        let centreY = (1 - box.midY) * frameH          // flip Y to top-left origin
+        let tileW = textW * config.tileExpandX
+        let tileH = textH * config.tileExpandY
+        let tile = CGRect(x: centreX - tileW / 2, y: centreY - tileH / 2, width: tileW, height: tileH)
+        let frame = CGRect(x: 0, y: 0, width: frameW, height: frameH)
+        return tile.intersection(frame)
+    }
+
+    /// Trims surrounding whitespace; over-long strings collapse to "" so the
+    /// caller's filter rejects them.
+    private func cleaned(_ s: String) -> String {
+        let trimmed = s.trimmingCharacters(in: .whitespacesAndNewlines)
+        guard trimmed.count <= config.maxNameLength else { return "" }
+        return trimmed
+    }
+
+    /// Shared across frames; allocating a context per frame is costly.
+    private static let ciContext = CIContext()
+
+    /// Renders `pixelBuffer` to a `CGImage` covering its full extent.
+    static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
+        let image = CIImage(cvPixelBuffer: pixelBuffer)
+        return ciContext.createCGImage(image, from: image.extent)
+    }
+}
@@ -0,0 +1,36 @@
+import Foundation
+import CoreGraphics
+import CoreVideo
+
+/// One per-frame observation from an app adapter: a participant tile, whether its
+/// active-speaker cue is showing, and where it is. `name` may be a full name,
+/// initials (Signal), or "" if unknown. `t` is seconds relative to the session t0.
+struct SpeakerObservation: Equatable {
+    /// Full name, initials, or "" when the participant is unknown.
+    let name: String
+    /// Whether the participant's active-speaker cue is showing in this frame.
+    let speaking: Bool
+    /// Where the tile is in the frame.
+    /// NOTE(review): the coordinate space is set by the producing adapter —
+    /// presumably pixels with a top-left origin; confirm against each adapter.
+    let bbox: CGRect
+    let confidence: Double   // 0…1
+    /// Seconds relative to the session t0.
+    let t: TimeInterval
+}
+
+/// Per-app screen-reading strategy. Each conferencing app gets one implementation
+/// that knows that app's tile layout, name placement, and active-speaker cue.
+/// Adapters must be testable offline against still-image fixtures.
+protocol AppAdapter {
+    /// Bundle identifiers of the app(s) this adapter reads.
+    static var bundleIDs: [String] { get }
+    /// Version tag for this adapter's logic.
+    /// NOTE(review): presumably what ends up in the timeline's `adapter_version` — confirm.
+    var adapterVersion: String { get }
+    /// Frame rate (frames per second) the adapter would like to be fed at.
+    var preferredFPS: Int { get }
+
+    /// Analyze one frame; return the speakers visible and whether each is speaking.
+    /// Must process in-memory and never persist the frame.
+    func analyze(frame: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation]
+
+    /// Optional: participant names from the app's Accessibility tree (Electron
+    /// apps like Signal expose these), preferred over OCR when available.
+    /// `nil` means the adapter has no Accessibility source.
+    func namesFromAccessibility() -> [String]?
+}
+
+/// Defaults so an adapter only has to supply its bundle IDs, version and
+/// `analyze(frame:at:)`.
+extension AppAdapter {
+    /// No Accessibility-derived names unless the adapter overrides this.
+    func namesFromAccessibility() -> [String]? { nil }
+    /// Three frames per second unless the adapter asks for something else.
+    var preferredFPS: Int { 3 }
+}
@@ -0,0 +1,59 @@
+import Foundation
+import Vision
+import CoreVideo
+import CoreGraphics
+
+/// Thin wrapper over Vision's text recognition, used by adapters to read names /
+/// initials off participant tiles. Runs on the Neural Engine; no permission
+/// needed. Works on any frame, so adapters can be developed against still images.
+struct TextRecognizer {
+    /// One recognized string together with where Vision found it.
+    struct Result {
+        let text: String
+        let confidence: Float
+        /// Normalized Vision bounding box (origin bottom-left, 0…1).
+        let boundingBox: CGRect
+    }
+
+    var recognitionLevel: VNRequestTextRecognitionLevel = .accurate
+    var minimumTextHeight: Float = 0          // 0 = Vision default
+    var usesLanguageCorrection = false        // names/initials aren't dictionary words
+
+    /// Recognize text in `pixelBuffer`, optionally limited to a normalized region
+    /// of interest (origin bottom-left, matching Vision's coordinate space).
+    /// Returns an empty array when Vision fails or finds nothing.
+    func recognize(in pixelBuffer: CVPixelBuffer, regionOfInterest: CGRect? = nil) -> [Result] {
+        perform(on: VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:]),
+                regionOfInterest: regionOfInterest)
+    }
+
+    /// Convenience for fixtures/tests: recognize text in a CGImage. Same
+    /// configuration and failure behaviour as the pixel-buffer variant.
+    func recognize(in cgImage: CGImage, regionOfInterest: CGRect? = nil) -> [Result] {
+        perform(on: VNImageRequestHandler(cgImage: cgImage, options: [:]),
+                regionOfInterest: regionOfInterest)
+    }
+
+    /// Builds a text request from this recognizer's settings, runs it on
+    /// `handler`, and keeps the top candidate of every observation.
+    private func perform(on handler: VNImageRequestHandler, regionOfInterest: CGRect?) -> [Result] {
+        let request = VNRecognizeTextRequest()
+        request.recognitionLevel = recognitionLevel
+        request.usesLanguageCorrection = usesLanguageCorrection
+        if minimumTextHeight > 0 { request.minimumTextHeight = minimumTextHeight }
+        if let regionOfInterest { request.regionOfInterest = regionOfInterest }
+
+        guard (try? handler.perform([request])) != nil,
+              let observations = request.results else { return [] }
+
+        return observations.compactMap { observation in
+            observation.topCandidates(1).first.map { best in
+                Result(text: best.string, confidence: best.confidence,
+                       boundingBox: observation.boundingBox)
+            }
+        }
+    }
+}
@@ -0,0 +1,127 @@
+import Foundation
+
+/// Turns noisy per-frame `SpeakerObservation`s into clean
+/// `(start, end, name, confidence)` segments.
+///
+/// - Hysteresis: open a segment after `openFrames` consecutive speaking frames,
+///   close after `closeFrames` quiet frames — rides out UI-cue lag/flicker.
+/// - Overlaps allowed: each name is tracked independently (crosstalk).
+/// - mic-VAD "self" spans are merged in as high-confidence segments.
+/// - OCR name variants are normalized via an alias table.
+///
+/// Pure logic, no UI/capture deps — fully unit-testable offline.
+final class TimelineBuilder {
+    /// Consecutive speaking frames needed before a segment opens (always ≥ 1).
+    private let openFrames: Int
+    /// Consecutive quiet frames needed before an open segment closes (always ≥ 1).
+    private let closeFrames: Int
+    private var aliases: [String: String] = [:]      // normalized variant -> canonical
+    /// Hysteresis state per canonical name. An entry is created the first time a
+    /// name is seen speaking and is never removed.
+    private var states: [String: NameState] = [:]
+    /// Time of the most recently ingested frame. Written by `ingest`; nothing in
+    /// this class reads it back.
+    private var lastFrameT: Double = 0
+    /// Finished segments, in the order they were closed or merged until
+    /// `finish()` sorts them by start time.
+    private(set) var segments: [VisualTimeline.Segment] = []
+
+    /// Both thresholds are clamped to at least 1 frame.
+    init(openFrames: Int = 2, closeFrames: Int = 2) {
+        self.openFrames = max(1, openFrames)
+        self.closeFrames = max(1, closeFrames)
+    }
+
+    /// Register that `variant` (e.g. "Sarah J") should map to `canonical`
+    /// (e.g. "Sarah Jones"). Matching is case-insensitive and ignores surrounding
+    /// whitespace.
+    func addAlias(_ variant: String, canonical: String) {
+        aliases[Self.normalize(variant)] = canonical
+    }
+
+    /// Ingest one frame's observations (all sharing time `t`). Names not present
+    /// (or present but not speaking) count as a quiet frame for any open segment.
+    func ingest(_ observations: [SpeakerObservation], at t: TimeInterval) {
+        lastFrameT = t
+
+        // Best confidence per canonical name that is speaking this frame.
+        var speaking: [String: Double] = [:]
+        for obs in observations where obs.speaking && !obs.name.isEmpty {
+            let name = canonical(obs.name)
+            speaking[name] = max(speaking[name] ?? 0, obs.confidence)
+        }
+
+        // Step every name we already track, plus any name first seen this frame.
+        let names = Set(states.keys).union(speaking.keys)
+        for name in names {
+            var st = states[name] ?? NameState()
+            if let conf = speaking[name] {
+                // Speaking frame: a fresh run remembers where it started, so a
+                // segment opened later is back-dated to the first frame of the run.
+                if st.voiced == 0 { st.runStart = t }
+                st.voiced += 1
+                st.silent = 0
+                st.lastVoicedT = t
+                if !st.open && st.voiced >= openFrames {
+                    st.open = true
+                    st.segStart = st.runStart
+                    st.confSum = 0
+                    st.confN = 0
+                }
+                // Confidence is averaged only over frames seen while the segment is open.
+                if st.open { st.confSum += conf; st.confN += 1 }
+            } else {
+                // Quiet frame: any quiet frame breaks the voiced run; an open
+                // segment survives until `closeFrames` quiet frames in a row.
+                st.silent += 1
+                st.voiced = 0
+                if st.open && st.silent >= closeFrames {
+                    closeSegment(name: name, state: st)
+                    st.open = false
+                }
+            }
+            states[name] = st
+        }
+    }
+
+    /// Merge mic-VAD self spans (the user) as high-confidence segments. Spans
+    /// with `end <= start` are skipped. Segments are appended unsorted; ordering
+    /// is only restored by `finish()`.
+    func mergeSelfSpans(_ spans: [VADSpan], selfName: String) {
+        for span in spans where span.end > span.start {
+            segments.append(.init(start: span.start, end: span.end,
+                                  name: selfName, confidence: span.confidence, source: "mic_vad"))
+        }
+    }
+
+    /// Force-close any open segments at `t` (used when a visual gap begins, so a
+    /// segment isn't carried across the gap).
+    ///
+    /// NOTE(review): `t` is not used by the body — each segment ends at its own
+    /// last speaking frame (`lastVoicedT`), not at `t`.
+    func closeOpenSegments(at t: TimeInterval) {
+        for (name, st) in states where st.open {
+            closeSegment(name: name, state: st)
+            // Reset the counters too, so hysteresis starts from scratch after the gap.
+            states[name]?.open = false
+            states[name]?.voiced = 0
+            states[name]?.silent = 0
+        }
+    }
+
+    /// Close any still-open segments at end of capture, then sort all segments
+    /// by start time.
+    func finish() {
+        for (name, st) in states where st.open {
+            closeSegment(name: name, state: st)
+            states[name]?.open = false
+        }
+        segments.sort { $0.start < $1.start }
+    }
+
+    // MARK: - Internal
+
+    /// Per-name hysteresis bookkeeping.
+    private struct NameState {
+        var voiced = 0                  // consecutive speaking frames in the current run
+        var silent = 0                  // consecutive quiet frames
+        var open = false                // whether a segment is currently open
+        var runStart: Double = 0        // time of the first frame of the current speaking run
+        var segStart: Double = 0        // start time of the open segment
+        var lastVoicedT: Double = 0     // time of the latest speaking frame
+        var confSum: Double = 0         // sum of confidences while open
+        var confN = 0                   // number of confidences in `confSum`
+    }
+
+    /// Appends a "vision" segment spanning `segStart…lastVoicedT`. Zero-length
+    /// (or inverted) spans are dropped. Confidence is the mean collected while
+    /// open, falling back to 0.8 if none was collected.
+    private func closeSegment(name: String, state st: NameState) {
+        guard st.lastVoicedT > st.segStart else { return }
+        let confidence = st.confN > 0 ? st.confSum / Double(st.confN) : 0.8
+        segments.append(.init(start: st.segStart, end: st.lastVoicedT,
+                              name: name, confidence: confidence, source: "vision"))
+    }
+
+    /// Alias lookup on the normalized name; without an alias, the raw name with
+    /// surrounding whitespace trimmed (case preserved).
+    private func canonical(_ raw: String) -> String {
+        let key = Self.normalize(raw)
+        return aliases[key] ?? raw.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Lowercases and trims surrounding whitespace; used as the alias-table key.
+    private static func normalize(_ s: String) -> String {
+        s.lowercased().trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+}
@@ -0,0 +1,131 @@
+import Foundation
+import ScreenCaptureKit
+import CoreMedia
+import QuartzCore
+import AppKit
+
+/// Window-scoped visual capture: streams the call window's own rendered content
+/// at ~`fps`, hands each frame to the app adapter, and **releases it immediately
+/// — frames are never written to disk**. Builds the speaker timeline and records
+/// `visual_gap`s when the window is minimized (SCK delivers non-live frames).
+///
+/// Window visibility/focus is NOT required — SCK captures a window even when it's
+/// occluded or on another Space; only minimization freezes the backing buffer.
+@available(macOS 13.0, *)
+final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
+    private let bundleID: String
+    private let adapter: any AppAdapter
+    /// Host-clock time (`CACurrentMediaTime`) of session t0; subtracted from
+    /// every timestamp so the timeline is session-relative.
+    private let t0Host: Double
+    private let fps: Int
+    /// Serial queue that receives frames and guards `builder`, `gaps`, `gapStart`.
+    private let queue = DispatchQueue(label: "xyz.ten31.visual")
+
+    private var stream: SCStream?
+    private let builder = TimelineBuilder()
+    private var gaps: [VisualTimeline.Gap] = []
+    /// Start of the gap currently in progress, or `nil` when frames are live.
+    private var gapStart: Double?
+
+    /// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
+    var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?
+
+    /// - Parameters:
+    ///   - bundleID: Bundle identifier of the app whose window is captured.
+    ///   - adapter: Analyzer that turns each frame into speaker observations.
+    ///   - t0Host: `CACurrentMediaTime()` at session start.
+    ///   - fps: Target sampling rate; clamped to at least 1.
+    init(bundleID: String, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
+        self.bundleID = bundleID
+        self.adapter = adapter
+        self.t0Host = t0Host
+        self.fps = max(1, fps)
+    }
+
+    /// Starts capturing the largest window owned by `bundleID`.
+    /// - Throws: An `NSError` (domain "Ten31", code 2) when the app has no
+    ///   window, or whatever ScreenCaptureKit throws while setting up the stream.
+    func start() async throws {
+        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
+        // The call window: the largest window owned by the target app.
+        let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }
+        guard let window = candidates.max(by: { $0.frame.width * $0.frame.height < $1.frame.width * $1.frame.height }) else {
+            throw NSError(domain: "Ten31", code: 2,
+                          userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
+        }
+
+        let filter = SCContentFilter(desktopIndependentWindow: window)
+        let config = SCStreamConfiguration()
+        config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(fps))
+        config.queueDepth = 3
+        config.showsCursor = false
+        config.pixelFormat = kCVPixelFormatType_32BGRA
+        // window.frame is in points; capture at native pixels so OCR can read small
+        // initials/names (a half-res Retina capture badly hurts recognition).
+        let scale = NSScreen.main?.backingScaleFactor ?? 2
+        config.width = max(2, Int(window.frame.width * scale))
+        config.height = max(2, Int(window.frame.height * scale))
+
+        let stream = SCStream(filter: filter, configuration: config, delegate: self)
+        try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: queue)
+        try await stream.startCapture()
+        self.stream = stream
+    }
+
+    /// Stops capture, closes a gap still in progress and any open segments, and
+    /// returns everything collected (segments sorted by start time).
+    func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
+        if let stream { try? await stream.stopCapture() }
+        stream = nil
+        // Hop onto the frame queue so the read can't race an in-flight frame.
+        return queue.sync {
+            if let gs = gapStart {
+                gaps.append(.init(start: gs, end: CACurrentMediaTime() - t0Host, reason: "minimized"))
+                gapStart = nil
+            }
+            builder.finish()
+            return (builder.segments, gaps)
+        }
+    }
+
+    /// Merge mic-VAD self spans into the visual timeline (call before `stop`'s read,
+    /// or fold in afterwards in the packager).
+    func addSelfSpans(_ spans: [VADSpan], selfName: String) {
+        queue.sync { builder.mergeSelfSpans(spans, selfName: selfName) }
+    }
+
+    // MARK: - SCStreamOutput (on `queue`)
+
+    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
+                of type: SCStreamOutputType) {
+        guard type == .screen, CMSampleBufferDataIsReady(sampleBuffer) else { return }
+        let now = CACurrentMediaTime() - t0Host
+
+        switch frameKind(sampleBuffer) {
+        case .idle:
+            // Window is live but static (no pixel change) — no new info, not a gap.
+            return
+        case .gap:
+            // Minimized/blanked: the backing buffer is frozen. Open a gap once and
+            // close any open speaker segments so none is carried across it.
+            if gapStart == nil {
+                gapStart = now
+                builder.closeOpenSegments(at: now)
+            }
+            return
+        case .live:
+            if let gs = gapStart {
+                gaps.append(.init(start: gs, end: now, reason: "minimized"))
+                gapStart = nil
+            }
+            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
+            let observations = adapter.analyze(frame: pixelBuffer, at: now)  // frame released after this scope
+            builder.ingest(observations, at: now)
+            onObservations?(observations, now)
+        }
+    }
+
+    // MARK: - SCStreamDelegate
+
+    func stream(_ stream: SCStream, didStopWithError error: Error) {
+        // Surface the failure instead of dropping it silently; `stop()` still
+        // returns whatever was collected up to this point.
+        NSLog("VisualObserver: capture of %@ stopped with error: %@", bundleID, error.localizedDescription)
+    }
+
+    // MARK: - Internal
+
+    private enum FrameKind { case live, idle, gap }
+
+    /// Classifies a sample buffer by its `SCFrameStatus` attachment.
+    ///
+    /// SCK delivers `.complete` only when content changes, `.idle` for a static
+    /// (but visible) window, and `.blank`/`.suspended`/`.stopped` when frozen.
+    /// `.started` only marks the stream's first frame; it says nothing about the
+    /// window being minimized, so it is ignored like `.idle` rather than opening
+    /// a spurious gap at the start of every capture. A buffer without a readable
+    /// status is treated as live.
+    private func frameKind(_ sampleBuffer: CMSampleBuffer) -> FrameKind {
+        guard let attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, createIfNecessary: false)
+                as? [[SCStreamFrameInfo: Any]],
+              let raw = attachments.first?[.status] as? Int,
+              let status = SCFrameStatus(rawValue: raw) else { return .live }
+        switch status {
+        case .complete:
+            return .live
+        case .idle, .started:
+            return .idle
+        case .blank, .suspended, .stopped:
+            return .gap
+        @unknown default:
+            return .gap        // unknown future status: keep the previous conservative mapping
+        }
+    }
+}
@@ -0,0 +1,72 @@
+import Foundation
+
+/// `visual_timeline.json` (schema 1.1) — the app's primary visual output. Times
+/// are seconds relative to session t0. Segments may overlap (crosstalk).
+struct VisualTimeline: Codable {
+    var schemaVersion = "1.1"
+    let sessionId: String
+    let app: String
+    let adapterVersion: String
+    let t0Unix: Double
+    let durationSec: Double
+    let fpsSampled: Int
+    let selfName: String?
+    let participants: [Participant]
+    let segments: [Segment]
+    let visualGaps: [Gap]
+
+    /// A person on the call; `isSelf` and `aliases` are omitted from the JSON when `nil`.
+    struct Participant: Codable {
+        let name: String
+        let isSelf: Bool?
+        let aliases: [String]?
+        enum CodingKeys: String, CodingKey {
+            case name
+            case isSelf = "is_self"
+            case aliases
+        }
+    }
+
+    /// One stretch of speech attributed to `name`.
+    struct Segment: Codable, Equatable {
+        let start: Double
+        let end: Double
+        let name: String
+        let confidence: Double
+        let source: String   // vision | accessibility | fused | mic_vad
+    }
+
+    /// A stretch during which the call window could not be observed.
+    struct Gap: Codable, Equatable {
+        let start: Double
+        let end: Double
+        let reason: String   // minimized | tab_switched
+    }
+
+    enum CodingKeys: String, CodingKey {
+        case schemaVersion = "schema_version"
+        case sessionId = "session_id"
+        case app
+        case adapterVersion = "adapter_version"
+        case t0Unix = "t0_unix"
+        case durationSec = "duration_sec"
+        case fpsSampled = "fps_sampled"
+        case selfName = "self_name"
+        case participants
+        case segments
+        case visualGaps = "visual_gaps"
+    }
+
+    /// Write the rich `visual_timeline.json` (pretty-printed, keys sorted).
+    /// The write is atomic, so a failure part-way cannot leave a truncated file.
+    /// - Throws: Encoding errors, or the file-system error from the write.
+    func write(to url: URL) throws {
+        let encoder = JSONEncoder()
+        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
+        try encoder.encode(self).write(to: url, options: .atomic)
+    }
+
+    /// The flat array `label-merge` wants: `[{start,end,name,confidence}]`,
+    /// dropping `source`. Slice/rebase to chunk-local seconds happens in Phase 5.
+    ///
+    /// Encoded with `JSONEncoder` so a non-finite number (NaN/∞) surfaces as a
+    /// thrown `EncodingError`; `JSONSerialization` would raise an Objective-C
+    /// exception for such values, which `try` cannot catch.
+    /// - Throws: `EncodingError` when a segment holds a value JSON cannot represent.
+    func flatTimelineData() throws -> Data {
+        struct FlatSegment: Encodable {
+            let start: Double
+            let end: Double
+            let name: String
+            let confidence: Double
+        }
+        let flat = segments.map {
+            FlatSegment(start: $0.start, end: $0.end, name: $0.name, confidence: $0.confidence)
+        }
+        return try JSONEncoder().encode(flat)
+    }
+}