Phases 2-6: detection, visual timeline, backend hand-off, voiceprints

Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
2026-06-06 00:15:49 -05:00
parent fd7e1a5907
commit 863136aeec
27 changed files with 2108 additions and 22 deletions
@@ -0,0 +1,82 @@
+import Foundation
+import CoreGraphics
+
+/// Renders a CGImage to an RGBA8 buffer once, then answers cheap colour queries
+/// over pixel regions. Used to score the active-speaker highlight (a saturated
+/// coloured border/ring) around participant tiles.
+struct FrameSampler {
+    let width: Int
+    let height: Int
+    private let pixels: [UInt8]      // RGBA8, row-major, top-left origin
+
+    /// Renders `cgImage` into an RGBA8 (premultiplied-alpha) buffer.
+    /// Returns `nil` for an empty image or when the bitmap context can't be created.
+    init?(cgImage: CGImage) {
+        let w = cgImage.width, h = cgImage.height
+        guard w > 0, h > 0 else { return nil }
+        var buffer = [UInt8](repeating: 0, count: w * h * 4)
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        let info = CGImageAlphaInfo.premultipliedLast.rawValue
+        // Create the context AND draw inside the closure: the buffer pointer is only
+        // valid for the duration of `withUnsafeMutableBytes`, so neither it nor a
+        // context wrapping it may be used after the closure returns.
+        let drawn = buffer.withUnsafeMutableBytes { raw -> Bool in
+            guard let ctx = CGContext(data: raw.baseAddress, width: w, height: h, bitsPerComponent: 8,
+                                      bytesPerRow: w * 4, space: colorSpace, bitmapInfo: info) else {
+                return false
+            }
+            ctx.draw(cgImage, in: CGRect(x: 0, y: 0, width: w, height: h))
+            return true
+        }
+        guard drawn else { return nil }
+        self.width = w
+        self.height = h
+        self.pixels = buffer
+    }
+
+    /// Mean HSV saturation (0…1) over a pixel rect (top-left origin), sampled on a grid.
+    ///
+    /// - Parameters:
+    ///   - rect: Region to sample; it is clamped to the frame. Null, NaN or
+    ///     non-overlapping rects yield `0`.
+    ///   - samples: Approximate number of samples per axis; values below 1 are
+    ///     treated as 1.
+    /// - Returns: The mean saturation of the sampled pixels, or `0` if none were sampled.
+    func meanSaturation(inPixelRect rect: CGRect, samples: Int = 24) -> Double {
+        let minX = Double(rect.minX), maxX = Double(rect.maxX)
+        let minY = Double(rect.minY), maxY = Double(rect.maxY)
+        // CGRect.null has infinite coordinates; converting those to Int would trap.
+        guard minX.isFinite, maxX.isFinite, minY.isFinite, maxY.isFinite else { return 0 }
+        // Clamp in floating point first so huge-but-finite values can't overflow Int.
+        let x0 = Int(max(0, min(Double(width), minX))), x1 = Int(max(0, min(Double(width), maxX)))
+        let y0 = Int(max(0, min(Double(height), minY))), y1 = Int(max(0, min(Double(height), maxY)))
+        guard x1 > x0, y1 > y0 else { return 0 }
+        let perAxis = max(1, samples)          // avoid dividing by zero
+        let stepX = max(1, (x1 - x0) / perAxis)
+        let stepY = max(1, (y1 - y0) / perAxis)
+        var sum = 0.0, count = 0
+        for y in stride(from: y0, to: y1, by: stepY) {
+            for x in stride(from: x0, to: x1, by: stepX) {
+                sum += saturationAndValue(at: (y * width + x) * 4).saturation
+                count += 1
+            }
+        }
+        return count > 0 ? sum / Double(count) : 0
+    }
+
+    /// Strongest edge saturation of `rect`: the mean saturation is computed for each
+    /// of the four strips just inside the rect's edges (top, bottom, left, right)
+    /// and the maximum of those four means is returned. The interior is excluded —
+    /// the border is where the speaking highlight lives.
+    ///
+    /// - Parameters:
+    ///   - rect: Tile rect in pixels (top-left origin).
+    ///   - thicknessFraction: Strip thickness as a fraction of the rect's shorter
+    ///     side; the thickness is never less than 2 pixels.
+    func borderSaturation(inPixelRect rect: CGRect, thicknessFraction: Double = 0.12) -> Double {
+        let t = max(2.0, min(rect.width, rect.height) * thicknessFraction)
+        let top = CGRect(x: rect.minX, y: rect.minY, width: rect.width, height: t)
+        let bottom = CGRect(x: rect.minX, y: rect.maxY - t, width: rect.width, height: t)
+        let left = CGRect(x: rect.minX, y: rect.minY, width: t, height: rect.height)
+        let right = CGRect(x: rect.maxX - t, y: rect.minY, width: t, height: rect.height)
+        return [top, bottom, left, right].map { meanSaturation(inPixelRect: $0) }.max() ?? 0
+    }
+
+    /// Grid-sampled pixel positions (top-left origin) that are strongly saturated
+    /// AND bright enough to be a UI highlight — i.e. the speaking ring/border.
+    ///
+    /// - Parameters:
+    ///   - threshold: Saturation (0…1) a pixel must exceed.
+    ///   - minBrightness: Max-channel value (0…255) a pixel must exceed.
+    ///   - gridStep: Pixel stride in both axes; values below 1 are treated as 1 so
+    ///     the scan always advances.
+    func saturatedPoints(threshold: Double = 0.5, minBrightness: Double = 60, gridStep: Int = 6) -> [CGPoint] {
+        let step = max(1, gridStep)
+        var points: [CGPoint] = []
+        for y in stride(from: 0, to: height, by: step) {
+            for x in stride(from: 0, to: width, by: step) {
+                let pixel = saturationAndValue(at: (y * width + x) * 4)
+                if pixel.saturation > threshold && pixel.value > minBrightness {
+                    points.append(CGPoint(x: x, y: y))
+                }
+            }
+        }
+        return points
+    }
+
+    /// HSV saturation (0…1) and max channel (0…255) of the pixel whose first byte
+    /// is at offset `i` in `pixels`. Alpha is ignored.
+    private func saturationAndValue(at i: Int) -> (saturation: Double, value: Double) {
+        let r = Double(pixels[i]), g = Double(pixels[i + 1]), b = Double(pixels[i + 2])
+        let mx = max(r, g, b), mn = min(r, g, b)
+        return (mx > 0 ? (mx - mn) / mx : 0, mx)
+    }
+}
@@ -0,0 +1,94 @@
+import Foundation
+import CoreGraphics
+import CoreVideo
+import CoreImage
+
+/// Shared engine for tile-grid conferencing UIs (Signal/Zoom/Teams): OCR the
+/// name/initials on each tile, then mark the active speaker(s) by the saturated
+/// coloured highlight around their tile.
+///
+/// Geometry (`Config`) is a first pass; the exact tile expansion and saturation
+/// threshold get calibrated per app against real screenshot fixtures. The
+/// detection *logic* (read names; pick the highlighted tile) is validated with
+/// synthetic frames.
+struct GridCallAnalyzer {
+    /// Tunable geometry and thresholds for the grid analysis.
+    struct Config {
+        var tileExpandX = 1.8        // text bbox width multiplier → approximate tile width
+        var tileExpandY = 2.6        // text bbox height multiplier → approximate tile height
+        var minTextConfidence: Float = 0.3
+        var maxNameLength = 40
+        /// A label counts as "speaking" when enough strongly-saturated highlight
+        /// samples lie within `highlightRadiusFraction` of the label's centre.
+        var highlightRadiusFraction = 0.22   // of max(frame W,H)
+        var minHighlightPoints = 6
+        var highlightShareOfMax = 0.35       // must be ≥ this fraction of the busiest label
+    }
+
+    var config = Config()
+    var recognizer = TextRecognizer()
+
+    /// Converts `pixelBuffer` to a `CGImage` and analyzes it; returns `[]` if the
+    /// conversion fails.
+    func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
+        if let image = Self.cgImage(from: pixelBuffer) {
+            return analyze(cgImage: image, at: t)
+        }
+        return []
+    }
+
+    /// OCRs name labels in `cgImage` and flags the ones surrounded by highlight
+    /// colour. Returns one observation per accepted label, stamped with `t`;
+    /// returns `[]` when no label passes the filters or the frame can't be sampled.
+    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
+        // Keep confident recognitions whose trimmed text is short enough to be a name.
+        var labels: [(name: String, box: CGRect, conf: Double)] = []
+        for hit in recognizer.recognize(in: cgImage) where hit.confidence >= config.minTextConfidence {
+            let name = cleaned(hit.text)
+            if !name.isEmpty {
+                labels.append((name, hit.boundingBox, Double(hit.confidence)))
+            }
+        }
+        guard !labels.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }
+
+        let imageW = cgImage.width, imageH = cgImage.height
+        let highlights = sampler.saturatedPoints()
+        let reach = Double(max(imageW, imageH)) * config.highlightRadiusFraction
+        let reachSquared = reach * reach
+
+        // Per label: how many highlight samples fall within `reach` of its centre.
+        // A sample is counted for every label it is close to, not just the nearest.
+        var hitCounts: [Int] = []
+        hitCounts.reserveCapacity(labels.count)
+        for label in labels {
+            let centerX = Double(label.box.midX) * Double(imageW)
+            let centerY = (1 - Double(label.box.midY)) * Double(imageH)   // flip Y to top-left origin
+            var hits = 0
+            for point in highlights {
+                let dx = Double(point.x) - centerX
+                let dy = Double(point.y) - centerY
+                if dx * dx + dy * dy <= reachSquared { hits += 1 }
+            }
+            hitCounts.append(hits)
+        }
+
+        let busiest = hitCounts.max() ?? 0
+        let required = max(config.minHighlightPoints, Int(Double(busiest) * config.highlightShareOfMax))
+        let anyHighlight = busiest >= config.minHighlightPoints
+
+        return zip(labels, hitCounts).map { label, hits in
+            SpeakerObservation(name: label.name,
+                               speaking: anyHighlight && hits >= required,
+                               bbox: tileRect(label.box, imageW: imageW, imageH: imageH),
+                               confidence: label.conf,
+                               t: t)
+        }
+    }
+
+    /// Maps a Vision normalized bbox (bottom-left origin) to a pixel rect with a
+    /// top-left origin, scaled about the text centre by `tileExpandX/Y` to
+    /// approximate the whole tile, then clipped to the image bounds.
+    private func tileRect(_ box: CGRect, imageW: Int, imageH: Int) -> CGRect {
+        let frameW = Double(imageW), frameH = Double(imageH)
+        let centerX = Double(box.midX) * frameW
+        let centerY = (1 - Double(box.midY)) * frameH      // flip Y to top-left origin
+        let textW = Double(box.width) * frameW
+        let textH = Double(box.height) * frameH
+        let tileW = textW * config.tileExpandX
+        let tileH = textH * config.tileExpandY
+        let tile = CGRect(x: centerX - tileW / 2, y: centerY - tileH / 2, width: tileW, height: tileH)
+        return tile.intersection(CGRect(x: 0, y: 0, width: frameW, height: frameH))
+    }
+
+    /// Trims surrounding whitespace; text longer than `maxNameLength` is rejected
+    /// by returning "".
+    private func cleaned(_ s: String) -> String {
+        let trimmed = s.trimmingCharacters(in: .whitespacesAndNewlines)
+        guard trimmed.count <= config.maxNameLength else { return "" }
+        return trimmed
+    }
+
+    // One shared context: allocating a CIContext per frame is costly.
+    private static let ciContext = CIContext()
+
+    /// Renders `pixelBuffer` to a `CGImage` through the shared `CIContext`.
+    static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
+        let source = CIImage(cvPixelBuffer: pixelBuffer)
+        return ciContext.createCGImage(source, from: source.extent)
+    }
+}
@@ -0,0 +1,36 @@
+import Foundation
+import CoreGraphics
+import CoreVideo
+
+/// One per-frame observation from an app adapter: a participant tile, whether its
+/// active-speaker cue is showing, and where it is. `name` may be a full name,
+/// initials (Signal), or "" if unknown. `t` is seconds relative to the session t0.
+struct SpeakerObservation: Equatable {
+    let name: String         // full name, initials, or "" when unknown
+    let speaking: Bool       // true while the tile's active-speaker cue is showing
+    let bbox: CGRect         // NOTE(review): GridCallAnalyzer supplies a top-left-origin pixel rect — confirm other adapters agree
+    let confidence: Double   // 0…1
+    let t: TimeInterval      // seconds relative to the session t0
+}
+
+/// Per-app screen-reading strategy. Each conferencing app gets one implementation
+/// that knows that app's tile layout, name placement, and active-speaker cue.
+/// Adapters must be testable offline against still-image fixtures.
+protocol AppAdapter {
+    static var bundleIDs: [String] { get }   // presumably the bundle identifiers of the apps this adapter handles — confirm against the caller
+    var adapterVersion: String { get }       // NOTE(review): looks like the value recorded as `adapter_version` in the timeline — confirm
+    var preferredFPS: Int { get }            // frames per second the adapter would like sampled; a default is supplied by the protocol extension
+
+    /// Analyze one frame; return the speakers visible and whether each is speaking.
+    /// Must process in-memory and never persist the frame.
+    func analyze(frame: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation]
+
+    /// Optional: participant names from the app's Accessibility tree (Electron
+    /// apps like Signal expose these), preferred over OCR when available.
+    func namesFromAccessibility() -> [String]?
+}
+
+extension AppAdapter {
+    /// Default sampling rate for adapters that don't declare their own: 3 fps.
+    var preferredFPS: Int {
+        return 3
+    }
+
+    /// Default: no Accessibility-derived names (`nil`).
+    func namesFromAccessibility() -> [String]? {
+        return nil
+    }
+}
@@ -0,0 +1,59 @@
+import Foundation
+import Vision
+import CoreVideo
+import CoreGraphics
+
+/// Thin wrapper over Vision's text recognition, used by adapters to read names /
+/// initials off participant tiles. Runs on the Neural Engine; no permission
+/// needed. Works on any frame, so adapters can be developed against still images.
+struct TextRecognizer {
+    struct Result {
+        let text: String
+        let confidence: Float
+        /// Normalized Vision bounding box (origin bottom-left, 0…1).
+        /// NOTE(review): when a `regionOfInterest` is supplied, Vision is understood
+        /// to report boxes relative to that region rather than the full image —
+        /// confirm before mixing ROI results with full-frame geometry.
+        let boundingBox: CGRect
+    }
+
+    var recognitionLevel: VNRequestTextRecognitionLevel = .accurate
+    var minimumTextHeight: Float = 0          // 0 = Vision default
+    var usesLanguageCorrection = false        // names/initials aren't dictionary words
+
+    /// Recognize text in `pixelBuffer`, optionally limited to a normalized region
+    /// of interest (origin bottom-left, matching Vision's coordinate space).
+    /// Returns `[]` if Vision fails or finds nothing.
+    func recognize(in pixelBuffer: CVPixelBuffer, regionOfInterest: CGRect? = nil) -> [Result] {
+        run(VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:]),
+            regionOfInterest: regionOfInterest)
+    }
+
+    /// Convenience for fixtures/tests: recognize text in a CGImage.
+    /// Returns `[]` if Vision fails or finds nothing.
+    func recognize(in cgImage: CGImage, regionOfInterest: CGRect? = nil) -> [Result] {
+        run(VNImageRequestHandler(cgImage: cgImage, options: [:]),
+            regionOfInterest: regionOfInterest)
+    }
+
+    /// Shared pipeline for both overloads: configure one text request from this
+    /// recognizer's settings, run it on `handler`, and keep the top candidate of
+    /// every observation. A Vision error is treated as "no text" (best-effort).
+    private func run(_ handler: VNImageRequestHandler, regionOfInterest: CGRect?) -> [Result] {
+        let request = VNRecognizeTextRequest()
+        request.recognitionLevel = recognitionLevel
+        request.usesLanguageCorrection = usesLanguageCorrection
+        if minimumTextHeight > 0 { request.minimumTextHeight = minimumTextHeight }
+        if let roi = regionOfInterest { request.regionOfInterest = roi }
+
+        do {
+            try handler.perform([request])
+        } catch {
+            return []
+        }
+
+        guard let observations = request.results else { return [] }
+        return observations.compactMap { obs in
+            guard let top = obs.topCandidates(1).first else { return nil }
+            return Result(text: top.string, confidence: top.confidence, boundingBox: obs.boundingBox)
+        }
+    }
+}
@@ -0,0 +1,127 @@
+import Foundation
+
+/// Turns noisy per-frame `SpeakerObservation`s into clean
+/// `(start, end, name, confidence)` segments.
+///
+/// - Hysteresis: open a segment after `openFrames` consecutive speaking frames,
+///   close after `closeFrames` quiet frames — rides out UI-cue lag/flicker.
+/// - Overlaps allowed: each name is tracked independently (crosstalk).
+/// - mic-VAD "self" spans are merged in as high-confidence segments.
+/// - OCR name variants are normalized via an alias table.
+///
+/// Pure logic, no UI/capture deps — fully unit-testable offline.
+final class TimelineBuilder {
+    private let openFrames: Int
+    private let closeFrames: Int
+    private var aliases: [String: String] = [:]      // normalized variant -> canonical
+    private var states: [String: NameState] = [:]    // canonical name -> hysteresis state
+    private(set) var segments: [VisualTimeline.Segment] = []
+
+    /// - Parameters:
+    ///   - openFrames: Consecutive speaking frames needed to open a segment (min 1).
+    ///   - closeFrames: Consecutive quiet frames needed to close it (min 1).
+    init(openFrames: Int = 2, closeFrames: Int = 2) {
+        self.openFrames = max(1, openFrames)
+        self.closeFrames = max(1, closeFrames)
+    }
+
+    /// Register that `variant` (e.g. "Sarah J") should map to `canonical`
+    /// (e.g. "Sarah Jones"). Matching ignores case and surrounding whitespace.
+    func addAlias(_ variant: String, canonical: String) {
+        aliases[Self.normalize(variant)] = canonical
+    }
+
+    /// Ingest one frame's observations (all sharing time `t`). Names not present
+    /// (or present but not speaking) count as a quiet frame for any open segment.
+    /// Observations with an empty name are ignored.
+    func ingest(_ observations: [SpeakerObservation], at t: TimeInterval) {
+        // Best confidence per canonical name that is speaking this frame.
+        var speaking: [String: Double] = [:]
+        for obs in observations where obs.speaking && !obs.name.isEmpty {
+            let name = canonical(obs.name)
+            speaking[name] = max(speaking[name] ?? 0, obs.confidence)
+        }
+
+        // Every name seen so far gets a voiced or a quiet tick for this frame.
+        let names = Set(states.keys).union(speaking.keys)
+        for name in names {
+            var st = states[name] ?? NameState()
+            if let conf = speaking[name] {
+                if st.voiced == 0 { st.runStart = t }      // first frame of a new voiced run
+                st.voiced += 1
+                st.silent = 0
+                st.lastVoicedT = t
+                if !st.open && st.voiced >= openFrames {
+                    // Back-date the segment to where the voiced run began.
+                    st.open = true
+                    st.segStart = st.runStart
+                    st.confSum = 0
+                    st.confN = 0
+                }
+                if st.open { st.confSum += conf; st.confN += 1 }
+            } else {
+                st.silent += 1
+                st.voiced = 0
+                if st.open && st.silent >= closeFrames {
+                    closeSegment(name: name, state: st)
+                    st.open = false
+                }
+            }
+            states[name] = st
+        }
+    }
+
+    /// Merge mic-VAD self spans (the user) as high-confidence segments with source
+    /// "mic_vad". Spans whose end is not after their start are skipped. The new
+    /// segments are appended as-is; ordering is only established by `finish()`.
+    func mergeSelfSpans(_ spans: [VADSpan], selfName: String) {
+        for span in spans where span.end > span.start {
+            segments.append(.init(start: span.start, end: span.end,
+                                  name: selfName, confidence: span.confidence, source: "mic_vad"))
+        }
+    }
+
+    /// Force-close any open segments (used when a visual gap begins, so a segment
+    /// isn't carried across the gap). Each segment ends at its own last voiced
+    /// frame; `t` marks when the gap began but is not used as the end time.
+    func closeOpenSegments(at t: TimeInterval) {
+        for (name, st) in states where st.open {
+            closeSegment(name: name, state: st)
+            states[name]?.open = false
+            states[name]?.voiced = 0
+            states[name]?.silent = 0
+        }
+    }
+
+    /// Close any still-open segments at end of capture and put `segments` into a
+    /// deterministic order.
+    func finish() {
+        for (name, st) in states where st.open {
+            closeSegment(name: name, state: st)
+            states[name]?.open = false
+        }
+        // Segments are appended in dictionary/set iteration order, which varies
+        // between runs, so sorting on `start` alone would leave ties in an
+        // arbitrary order. Break ties on end, then name, for a stable output.
+        segments.sort { ($0.start, $0.end, $0.name) < ($1.start, $1.end, $1.name) }
+    }
+
+    // MARK: - Internal
+
+    /// Per-name hysteresis state.
+    private struct NameState {
+        var voiced = 0                 // consecutive speaking frames
+        var silent = 0                 // consecutive quiet frames
+        var open = false               // a segment is currently open
+        var runStart: Double = 0       // time of the first frame in the current voiced run
+        var segStart: Double = 0       // start time of the open segment
+        var lastVoicedT: Double = 0    // time of the most recent speaking frame
+        var confSum: Double = 0        // confidence accumulated since the segment opened
+        var confN = 0                  // number of frames in `confSum`
+    }
+
+    /// Appends a "vision" segment spanning `segStart…lastVoicedT` with the mean
+    /// confidence seen while open. Zero-length segments are dropped.
+    private func closeSegment(name: String, state st: NameState) {
+        guard st.lastVoicedT > st.segStart else { return }
+        let confidence = st.confN > 0 ? st.confSum / Double(st.confN) : 0.8
+        segments.append(.init(start: st.segStart, end: st.lastVoicedT,
+                              name: name, confidence: confidence, source: "vision"))
+    }
+
+    /// Alias-table lookup; falls back to the trimmed raw name when no alias matches.
+    private func canonical(_ raw: String) -> String {
+        let key = Self.normalize(raw)
+        return aliases[key] ?? raw.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Alias key: lowercased, surrounding whitespace removed.
+    private static func normalize(_ s: String) -> String {
+        s.lowercased().trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+}
@@ -0,0 +1,131 @@
+import Foundation
+import ScreenCaptureKit
+import CoreMedia
+import QuartzCore
+import AppKit
+
+/// Window-scoped visual capture: streams the call window's own rendered content
+/// at ~`fps`, hands each frame to the app adapter, and **releases it immediately
+/// — frames are never written to disk**. Builds the speaker timeline and records
+/// `visual_gap`s when the window is minimized (SCK delivers non-live frames).
+///
+/// Window visibility/focus is NOT required — SCK captures a window even when it's
+/// occluded or on another Space; only minimization freezes the backing buffer.
+@available(macOS 13.0, *)
+final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
+    private let bundleID: String
+    private let adapter: any AppAdapter
+    private let t0Host: Double          // CACurrentMediaTime() value that corresponds to session t0
+    private let fps: Int
+    private let queue = DispatchQueue(label: "xyz.ten31.visual")
+
+    private var stream: SCStream?
+    // The three properties below are only touched on `queue`.
+    private let builder = TimelineBuilder()
+    private var gaps: [VisualTimeline.Gap] = []
+    private var gapStart: Double?       // start of the gap currently in progress, if any
+
+    /// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
+    /// Invoked on the capture queue.
+    var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?
+
+    /// - Parameters:
+    ///   - bundleID: Bundle identifier of the app whose window is captured.
+    ///   - adapter: Analyzer that turns each frame into speaker observations.
+    ///   - t0Host: `CACurrentMediaTime()` reading at session start; all reported
+    ///     times are relative to it.
+    ///   - fps: Capture rate; values below 1 are raised to 1.
+    init(bundleID: String, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
+        self.bundleID = bundleID
+        self.adapter = adapter
+        self.t0Host = t0Host
+        self.fps = max(1, fps)
+    }
+
+    /// Starts capturing the target app's largest window.
+    /// - Throws: An `NSError` (domain "Ten31", code 2) when the app has no window,
+    ///   or whatever ScreenCaptureKit throws while setting up or starting the stream.
+    func start() async throws {
+        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
+        // The call window: the largest window owned by the target app.
+        let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }
+        guard let window = candidates.max(by: { $0.frame.width * $0.frame.height < $1.frame.width * $1.frame.height }) else {
+            throw NSError(domain: "Ten31", code: 2,
+                          userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
+        }
+
+        let filter = SCContentFilter(desktopIndependentWindow: window)
+        let config = SCStreamConfiguration()
+        config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(fps))
+        config.queueDepth = 3
+        config.showsCursor = false
+        config.pixelFormat = kCVPixelFormatType_32BGRA
+        // window.frame is in points; capture at native pixels so OCR can read small
+        // initials/names (a half-res Retina capture badly hurts recognition).
+        let scale = NSScreen.main?.backingScaleFactor ?? 2
+        config.width = max(2, Int(window.frame.width * scale))
+        config.height = max(2, Int(window.frame.height * scale))
+
+        let stream = SCStream(filter: filter, configuration: config, delegate: self)
+        try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: queue)
+        try await stream.startCapture()
+        self.stream = stream
+    }
+
+    /// Stops capture, closes any gap still in progress, finishes the timeline and
+    /// returns the collected segments and gaps. A failure to stop the stream is
+    /// ignored (best-effort teardown).
+    func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
+        if let stream { try? await stream.stopCapture() }
+        stream = nil
+        // Hop onto the capture queue so any frame still being handled finishes first.
+        return queue.sync {
+            if let gs = gapStart {
+                gaps.append(.init(start: gs, end: CACurrentMediaTime() - t0Host, reason: "minimized"))
+                gapStart = nil
+            }
+            builder.finish()
+            return (builder.segments, gaps)
+        }
+    }
+
+    /// Merge mic-VAD self spans into the visual timeline (call before `stop`'s read,
+    /// or fold in afterwards in the packager).
+    func addSelfSpans(_ spans: [VADSpan], selfName: String) {
+        queue.sync { builder.mergeSelfSpans(spans, selfName: selfName) }
+    }
+
+    // MARK: - SCStreamOutput (on `queue`)
+
+    /// Per-frame callback: classifies the frame, maintains gap bookkeeping, and
+    /// feeds live frames through the adapter into the timeline builder.
+    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
+                of type: SCStreamOutputType) {
+        guard type == .screen, CMSampleBufferDataIsReady(sampleBuffer) else { return }
+        let now = CACurrentMediaTime() - t0Host
+
+        switch frameKind(sampleBuffer) {
+        case .idle:
+            // Nothing new to analyze (static window, or the stream's start marker)
+            // — not a gap.
+            return
+        case .gap:
+            // Minimized/blanked: the backing buffer is frozen. Open a gap once and
+            // close any open speaker segments so none is carried across it.
+            if gapStart == nil {
+                gapStart = now
+                builder.closeOpenSegments(at: now)
+            }
+            return
+        case .live:
+            if let gs = gapStart {
+                gaps.append(.init(start: gs, end: now, reason: "minimized"))
+                gapStart = nil
+            }
+            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
+            let observations = adapter.analyze(frame: pixelBuffer, at: now)  // frame released after this scope
+            builder.ingest(observations, at: now)
+            onObservations?(observations, now)
+        }
+    }
+
+    // Stream errors are deliberately ignored here; `stop()` still returns whatever
+    // was collected up to that point.
+    func stream(_ stream: SCStream, didStopWithError error: Error) {}
+
+    private enum FrameKind { case live, idle, gap }
+
+    /// Classifies a sample buffer by its `SCFrameStatus` attachment:
+    /// - `.complete` → `.live` (content changed; analyze it);
+    /// - `.idle` and `.started` → `.idle` (nothing to analyze, but NOT a gap —
+    ///   `.started` only marks the first frame after the stream starts, so treating
+    ///   it as frozen would record a bogus "minimized" gap at the start of every
+    ///   capture);
+    /// - `.blank` / `.suspended` / `.stopped` → `.gap` (backing buffer frozen).
+    /// A buffer without a readable status is treated as `.live`.
+    private func frameKind(_ sampleBuffer: CMSampleBuffer) -> FrameKind {
+        guard let attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, createIfNecessary: false)
+                as? [[SCStreamFrameInfo: Any]],
+              let raw = attachments.first?[.status] as? Int,
+              let status = SCFrameStatus(rawValue: raw) else { return .live }
+        switch status {
+        case .complete:
+            return .live
+        case .idle, .started:
+            return .idle
+        case .blank, .suspended, .stopped:
+            return .gap
+        @unknown default:
+            return .gap        // keep the previous conservative handling for future statuses
+        }
+    }
+}
@@ -0,0 +1,72 @@
+import Foundation
+
+/// `visual_timeline.json` (schema 1.1) — the app's primary visual output. Times
+/// are seconds relative to session t0. Segments may overlap (crosstalk).
+struct VisualTimeline: Codable {
+    var schemaVersion = "1.1"
+    let sessionId: String
+    let app: String
+    let adapterVersion: String
+    let t0Unix: Double
+    let durationSec: Double
+    let fpsSampled: Int
+    let selfName: String?
+    let participants: [Participant]
+    let segments: [Segment]
+    let visualGaps: [Gap]
+
+    /// One call participant; `is_self` and `aliases` are optional in the JSON.
+    struct Participant: Codable {
+        let name: String
+        let isSelf: Bool?
+        let aliases: [String]?
+        enum CodingKeys: String, CodingKey {
+            case name
+            case isSelf = "is_self"
+            case aliases
+        }
+    }
+
+    /// A span during which `name` was speaking.
+    struct Segment: Codable, Equatable {
+        let start: Double
+        let end: Double
+        let name: String
+        let confidence: Double
+        let source: String   // vision | accessibility | fused | mic_vad
+    }
+
+    /// A span with no usable visual information.
+    struct Gap: Codable, Equatable {
+        let start: Double
+        let end: Double
+        let reason: String   // minimized | tab_switched
+    }
+
+    // Swift camelCase properties ↔ snake_case JSON keys.
+    enum CodingKeys: String, CodingKey {
+        case schemaVersion = "schema_version"
+        case sessionId = "session_id"
+        case app
+        case adapterVersion = "adapter_version"
+        case t0Unix = "t0_unix"
+        case durationSec = "duration_sec"
+        case fpsSampled = "fps_sampled"
+        case selfName = "self_name"
+        case participants
+        case segments
+        case visualGaps = "visual_gaps"
+    }
+
+    /// Write the rich `visual_timeline.json` (pretty-printed, keys sorted).
+    /// The write is atomic, so a failure can't leave a truncated file at `url`.
+    /// - Throws: Encoding errors, or the file-system error from the write.
+    func write(to url: URL) throws {
+        let encoder = JSONEncoder()
+        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
+        try encoder.encode(self).write(to: url, options: .atomic)
+    }
+
+    /// The flat array `label-merge` wants: `[{start,end,name,confidence}]`,
+    /// dropping `source`. Slice/rebase to chunk-local seconds happens in Phase 5.
+    /// - Throws: `EncodingError.invalidValue` when a segment holds a number JSON
+    ///   can't represent (NaN or ±infinity), or any `JSONSerialization` error.
+    func flatTimelineData() throws -> Data {
+        let flat = segments.map { seg -> [String: Any] in
+            ["start": seg.start, "end": seg.end, "name": seg.name, "confidence": seg.confidence]
+        }
+        // `JSONSerialization.data(withJSONObject:)` raises an Objective-C exception
+        // (a crash, not a Swift error) for invalid objects, so validate first.
+        guard JSONSerialization.isValidJSONObject(flat) else {
+            throw EncodingError.invalidValue(flat, EncodingError.Context(
+                codingPath: [CodingKeys.segments],
+                debugDescription: "Timeline segments contain a non-finite number and cannot be written as JSON."))
+        }
+        return try JSONSerialization.data(withJSONObject: flat, options: [])
+    }
+}