commit b2ae3a62b976c9a78ad94d52cf7d667dfc2bfe7f Author: Grant Gilliam Date: Fri Jun 5 19:33:53 2026 -0500 Phase 0: menu-bar scaffold, permissions, backend health check Native SwiftUI menu-bar app (LSUIElement, macOS 13+), generated from project.yml via XcodeGen. Includes: - PermissionsManager (Microphone / Screen Recording / Accessibility) + UI - SparkControlHealth: GET /api/status over self-signed TLS (InsecureTrustDelegate) - AppSettings persistence (host, TLS-skip, output folder, adapter toggles) - Menu-bar panel + Settings, app sandbox & hardened runtime off (LAN tool) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0c09c6f --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +# macOS +.DS_Store + +# Xcode project is generated from project.yml by XcodeGen. +# Regenerate any time with: xcodegen generate +*.xcodeproj +*.xcworkspace +xcuserdata/ +DerivedData/ +build/ + +# Swift / SwiftPM +.build/ + +# App output (never commit recordings or transcripts) +/Ten31Transcripts-output/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..6273494 --- /dev/null +++ b/README.md @@ -0,0 +1,64 @@ +# Ten31 Transcripts + +Native macOS menu-bar app that auto-detects conference calls, records local audio, +builds a visual-derived speaker timeline, and hands audio + timeline to the +SparkControl backend for naming/transcription. See `docs/` for the full spec. + +This repo is at **Phase 0** (scaffold, permissions, backend health check). + +## One-time setup + +1. **Install Xcode** from the Mac App Store (free; ~40 GB). Open it once and + accept the license prompt. +2. **Install XcodeGen** (generates the Xcode project from `project.yml`): + ```sh + brew install xcodegen + ``` +3. **Generate the project:** + ```sh + xcodegen generate + ``` + This creates `Ten31Transcripts.xcodeproj` (git-ignored — regenerate any time). +4. **Open it:** + ```sh + open Ten31Transcripts.xcodeproj + ``` +5. 
In Xcode, select the **Ten31Transcripts** target → **Signing & Capabilities**: + - Check **Automatically manage signing**. + - For **Team**, pick your personal team (sign in with your Apple ID — free; no + paid developer account needed). A stable team keeps macOS from re-asking for + permissions on every rebuild. +6. Press **Run** (⌘R). + +## What Phase 0 does + +- Launches as a menu-bar-only app (no Dock icon). +- Menu panel shows live status for the three permissions it needs — **Microphone**, + **Screen Recording**, **Accessibility** — with Grant / Open Settings buttons. +- Shows a **backend health check** (`GET /api/status`) against the configured host. +- **Settings:** backend base URL, skip-TLS toggle (on by default for the + self-signed cert), output folder, and adapter toggles (inert this phase). + +No audio capture, call detection, screen reading, or backend hand-off yet — those +arrive in Phases 1–6 (`docs/04_BUILD_PLAN.md`). + +## Project layout + +``` +project.yml # XcodeGen recipe → generates the .xcodeproj +Ten31Transcripts/ + App/ Ten31TranscriptsApp.swift, AppDelegate.swift + UI/ MenuBarView, SettingsView, PermissionRow + Permissions/PermissionsManager.swift + Backend/ SparkControlHealth.swift, InsecureTrustDelegate.swift + Settings/ AppSettings.swift + Support/ Info.plist, Ten31Transcripts.entitlements +Ten31TranscriptsTests/ # placeholder; real tests land in Phase 3 +``` + +## Notes + +- **App Sandbox is off** and **Hardened Runtime is off** — this is a personal, + LAN-only tool that must observe other apps. Revisit only if distributing. +- The default backend host is `https://your-spark-backend.local:62419` (editable in + Settings). 
diff --git a/Ten31Transcripts/App/AppDelegate.swift b/Ten31Transcripts/App/AppDelegate.swift new file mode 100644 index 0000000..96b4c6b --- /dev/null +++ b/Ten31Transcripts/App/AppDelegate.swift @@ -0,0 +1,10 @@ +import AppKit + +final class AppDelegate: NSObject, NSApplicationDelegate { + func applicationDidFinishLaunching(_ notification: Notification) { + // Run as a menu-bar accessory (no Dock icon, no main window). + // LSUIElement in Info.plist already enforces this; set it explicitly too + // so behavior is unambiguous regardless of how the app is launched. + NSApp.setActivationPolicy(.accessory) + } +} diff --git a/Ten31Transcripts/App/Ten31TranscriptsApp.swift b/Ten31Transcripts/App/Ten31TranscriptsApp.swift new file mode 100644 index 0000000..94e3e58 --- /dev/null +++ b/Ten31Transcripts/App/Ten31TranscriptsApp.swift @@ -0,0 +1,28 @@ +import SwiftUI + +/// Menu-bar-only app entry point. +/// +/// `LSUIElement` (set in Info.plist) keeps the app out of the Dock; the +/// `MenuBarExtra` scene provides the status-bar item and its panel. Phase 0 only +/// wires up permissions, settings, and a backend health check — no audio, +/// capture, or call detection yet. 
+@main +struct Ten31TranscriptsApp: App { + @NSApplicationDelegateAdaptor(AppDelegate.self) private var appDelegate + + @StateObject private var settings = AppSettings() + @StateObject private var permissions = PermissionsManager() + @StateObject private var health = SparkControlHealth() + + var body: some Scene { + MenuBarExtra { + MenuBarView() + .environmentObject(settings) + .environmentObject(permissions) + .environmentObject(health) + } label: { + Image(systemName: "waveform.circle") + } + .menuBarExtraStyle(.window) + } +} diff --git a/Ten31Transcripts/Backend/InsecureTrustDelegate.swift b/Ten31Transcripts/Backend/InsecureTrustDelegate.swift new file mode 100644 index 0000000..84681ba --- /dev/null +++ b/Ten31Transcripts/Backend/InsecureTrustDelegate.swift @@ -0,0 +1,24 @@ +import Foundation + +/// URLSession delegate that trusts the server certificate without validation. +/// +/// SparkControl sits behind a Start9 self-signed Root CA on the LAN, so default +/// trust evaluation rejects it. This delegate is used **only** when the +/// "Skip TLS verification" setting is on. It trusts any server certificate — +/// acceptable for a personal tool on a trusted local network and nothing else. +final class InsecureTrustDelegate: NSObject, URLSessionDelegate { + func urlSession( + _ session: URLSession, + didReceive challenge: URLAuthenticationChallenge, + completionHandler: @escaping (URLSession.AuthChallengeDisposition, URLCredential?) 
-> Void + ) { + guard + challenge.protectionSpace.authenticationMethod == NSURLAuthenticationMethodServerTrust, + let serverTrust = challenge.protectionSpace.serverTrust + else { + completionHandler(.performDefaultHandling, nil) + return + } + completionHandler(.useCredential, URLCredential(trust: serverTrust)) + } +} diff --git a/Ten31Transcripts/Backend/SparkControlHealth.swift b/Ten31Transcripts/Backend/SparkControlHealth.swift new file mode 100644 index 0000000..06fc666 --- /dev/null +++ b/Ten31Transcripts/Backend/SparkControlHealth.swift @@ -0,0 +1,66 @@ +import Foundation +import Combine + +/// Performs the Phase 0 backend reachability check: `GET {baseURL}/api/status`. +/// +/// This is a thin slice — the full `SparkControlClient` (label-merge, multipart, +/// sequential queueing, retries) arrives in Phase 5. +@MainActor +final class SparkControlHealth: ObservableObject { + + enum Status: Equatable { + case unknown + case checking + case online(String) + case offline(String) + } + + @Published private(set) var status: Status = .unknown + @Published private(set) var lastChecked: Date? + + func check(baseURL: String, skipTLS: Bool) async { + status = .checking + + let trimmed = baseURL.trimmingCharacters(in: .whitespacesAndNewlines) + let base = trimmed.hasSuffix("/") ? String(trimmed.dropLast()) : trimmed + guard !base.isEmpty, let url = URL(string: base + "/api/status") else { + status = .offline("Invalid host URL") + return + } + + let config = URLSessionConfiguration.ephemeral + config.timeoutIntervalForRequest = 8 + config.waitsForConnectivity = false + + let delegate: URLSessionDelegate? = skipTLS ? InsecureTrustDelegate() : nil + let session = URLSession(configuration: config, delegate: delegate, delegateQueue: nil) + defer { session.finishTasksAndInvalidate() } + + do { + let (data, response) = try await session.data(from: url) + lastChecked = Date() + guard let http = response as? 
HTTPURLResponse else { + status = .offline("No HTTP response") + return + } + if (200..<300).contains(http.statusCode) { + status = .online(Self.summarize(data) ?? "Reachable") + } else { + status = .offline("HTTP \(http.statusCode)") + } + } catch { + lastChecked = Date() + status = .offline(error.localizedDescription) + } + } + + /// Best-effort one-line summary of the `/api/status` body, if it's JSON. + private static func summarize(_ data: Data) -> String? { + guard let object = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else { + return nil + } + if let s = object["status"] as? String { return s } + if let s = object["state"] as? String { return s } + return "Reachable" + } +} diff --git a/Ten31Transcripts/Permissions/PermissionsManager.swift b/Ten31Transcripts/Permissions/PermissionsManager.swift new file mode 100644 index 0000000..b4937d1 --- /dev/null +++ b/Ten31Transcripts/Permissions/PermissionsManager.swift @@ -0,0 +1,89 @@ +import AVFoundation +import CoreGraphics +import ApplicationServices +import AppKit +import Combine + +enum PermissionState { + case granted + case denied + case notDetermined +} + +/// Tracks and requests the three TCC permissions the app needs. +/// +/// - Microphone: AVFoundation authorization (has a real "not determined" state). +/// - Screen Recording: CoreGraphics preflight/request (binary granted/denied). +/// - Accessibility: AXIsProcessTrusted (binary granted/denied). +@MainActor +final class PermissionsManager: ObservableObject { + + @Published private(set) var microphone: PermissionState = .notDetermined + @Published private(set) var screenRecording: PermissionState = .notDetermined + @Published private(set) var accessibility: PermissionState = .notDetermined + + init() { + refresh() + } + + func refresh() { + microphone = Self.microphoneState() + screenRecording = CGPreflightScreenCaptureAccess() ? .granted : .denied + accessibility = AXIsProcessTrusted() ? 
.granted : .denied + } + + // MARK: - Requests + + func requestMicrophone() { + AVCaptureDevice.requestAccess(for: .audio) { _ in + Task { @MainActor in self.refresh() } + } + } + + /// Triggers the system Screen Recording prompt on first call. The user must + /// still toggle the app on in System Settings; `refresh()` reflects it after. + func requestScreenRecording() { + _ = CGRequestScreenCaptureAccess() + refresh() + } + + /// Shows the Accessibility trust prompt (deep-links to the right pane). + func requestAccessibility() { + // Literal is the value of `kAXTrustedCheckOptionPrompt`; used directly to + // stay robust across SDK import shapes of that constant. + let options = ["AXTrustedCheckOptionPrompt": true] as CFDictionary + _ = AXIsProcessTrustedWithOptions(options) + refresh() + } + + func openSettings(_ pane: SettingsPane) { + guard let url = URL(string: pane.urlString) else { return } + NSWorkspace.shared.open(url) + } + + // MARK: - Helpers + + private static func microphoneState() -> PermissionState { + switch AVCaptureDevice.authorizationStatus(for: .audio) { + case .authorized: return .granted + case .denied, .restricted: return .denied + case .notDetermined: return .notDetermined + @unknown default: return .notDetermined + } + } + + enum SettingsPane { + case microphone + case screenRecording + case accessibility + + var urlString: String { + let root = "x-apple.systempreferences:com.apple.preference.security?" + switch self { + case .microphone: return root + "Privacy_Microphone" + case .screenRecording: return root + "Privacy_ScreenCapture" + case .accessibility: return root + "Privacy_Accessibility" + } + } + } +} diff --git a/Ten31Transcripts/Settings/AppSettings.swift b/Ten31Transcripts/Settings/AppSettings.swift new file mode 100644 index 0000000..445f899 --- /dev/null +++ b/Ten31Transcripts/Settings/AppSettings.swift @@ -0,0 +1,66 @@ +import Foundation +import Combine + +/// User-facing settings, persisted to `UserDefaults`. 
+/// +/// Phase 0 scope: backend host + TLS-skip, output folder, and adapter toggles. +/// The adapter toggles persist but do nothing yet (adapters arrive in Phase 3–4). +@MainActor +final class AppSettings: ObservableObject { + + /// Adapters the app will eventually run, in display order. + static let adapterKeys: [(key: String, label: String)] = [ + ("zoom", "Zoom"), + ("teams", "Microsoft Teams"), + ("signal", "Signal"), + ("meet", "Google Meet"), + ] + + @Published var backendBaseURL: String { + didSet { defaults.set(backendBaseURL, forKey: Keys.backendBaseURL) } + } + + @Published var skipTLSVerification: Bool { + didSet { defaults.set(skipTLSVerification, forKey: Keys.skipTLS) } + } + + @Published var outputFolderPath: String { + didSet { defaults.set(outputFolderPath, forKey: Keys.outputFolder) } + } + + @Published var adapterEnabled: [String: Bool] { + didSet { defaults.set(adapterEnabled, forKey: Keys.adapterEnabled) } + } + + /// Output folder as a resolved file URL (expands a leading `~`). + var outputFolderURL: URL { + URL(fileURLWithPath: (outputFolderPath as NSString).expandingTildeInPath, + isDirectory: true) + } + + private let defaults: UserDefaults + + init(defaults: UserDefaults = .standard) { + self.defaults = defaults + + self.backendBaseURL = defaults.string(forKey: Keys.backendBaseURL) + ?? "https://your-spark-backend.local:62419" + + self.skipTLSVerification = defaults.object(forKey: Keys.skipTLS) as? Bool ?? true + + self.outputFolderPath = defaults.string(forKey: Keys.outputFolder) + ?? "~/Ten31Transcripts" + + let stored = defaults.dictionary(forKey: Keys.adapterEnabled) as? [String: Bool] + self.adapterEnabled = stored ?? 
Dictionary( + uniqueKeysWithValues: Self.adapterKeys.map { ($0.key, true) } + ) + } + + private enum Keys { + static let backendBaseURL = "backendBaseURL" + static let skipTLS = "skipTLSVerification" + static let outputFolder = "outputFolderPath" + static let adapterEnabled = "adapterEnabled" + } +} diff --git a/Ten31Transcripts/Support/Info.plist b/Ten31Transcripts/Support/Info.plist new file mode 100644 index 0000000..c3d9db2 --- /dev/null +++ b/Ten31Transcripts/Support/Info.plist @@ -0,0 +1,43 @@ + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + $(EXECUTABLE_NAME) + CFBundleIdentifier + $(PRODUCT_BUNDLE_IDENTIFIER) + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + $(PRODUCT_NAME) + CFBundleDisplayName + Ten31 Transcripts + CFBundlePackageType + APPL + CFBundleShortVersionString + $(MARKETING_VERSION) + CFBundleVersion + $(CURRENT_PROJECT_VERSION) + LSMinimumSystemVersion + $(MACOSX_DEPLOYMENT_TARGET) + LSUIElement + + NSPrincipalClass + NSApplication + NSHumanReadableCopyright + Ten31 + NSMicrophoneUsageDescription + Ten31 Transcripts records your microphone during calls to build the local audio track. + NSAppleEventsUsageDescription + Ten31 Transcripts reads the active browser tab's URL to detect Google Meet calls. + NSLocalNetworkUsageDescription + Ten31 Transcripts connects to your SparkControl server on the local network. 
+ NSAppTransportSecurity + + NSAllowsLocalNetworking + + + + diff --git a/Ten31Transcripts/Support/Ten31Transcripts.entitlements b/Ten31Transcripts/Support/Ten31Transcripts.entitlements new file mode 100644 index 0000000..65115f0 --- /dev/null +++ b/Ten31Transcripts/Support/Ten31Transcripts.entitlements @@ -0,0 +1,11 @@ + + + + + + com.apple.security.app-sandbox + + + diff --git a/Ten31Transcripts/UI/MenuBarView.swift b/Ten31Transcripts/UI/MenuBarView.swift new file mode 100644 index 0000000..63b387c --- /dev/null +++ b/Ten31Transcripts/UI/MenuBarView.swift @@ -0,0 +1,116 @@ +import SwiftUI +import AppKit + +/// The menu-bar panel: permission statuses, backend health, and a link to +/// Settings. Shown when the user clicks the status-bar item. +struct MenuBarView: View { + @EnvironmentObject private var settings: AppSettings + @EnvironmentObject private var permissions: PermissionsManager + @EnvironmentObject private var health: SparkControlHealth + + var body: some View { + NavigationStack { + VStack(alignment: .leading, spacing: 12) { + header + Divider() + permissionsSection + Divider() + backendSection + Divider() + footer + } + .padding(14) + .frame(width: 320) + } + .onAppear { permissions.refresh() } + .task { await refreshHealth() } + } + + private var header: some View { + VStack(alignment: .leading, spacing: 2) { + Text("Ten31 Transcripts").font(.headline) + Text("Phase 0 · setup & status") + .font(.caption) + .foregroundStyle(.secondary) + } + } + + private var permissionsSection: some View { + VStack(alignment: .leading, spacing: 8) { + Text("Permissions").font(.subheadline).bold() + PermissionRow( + title: "Microphone", + state: permissions.microphone, + onGrant: permissions.requestMicrophone, + onOpenSettings: { permissions.openSettings(.microphone) } + ) + PermissionRow( + title: "Screen Recording", + state: permissions.screenRecording, + onGrant: permissions.requestScreenRecording, + onOpenSettings: { permissions.openSettings(.screenRecording) } + ) 
+ PermissionRow( + title: "Accessibility", + state: permissions.accessibility, + onGrant: permissions.requestAccessibility, + onOpenSettings: { permissions.openSettings(.accessibility) } + ) + } + } + + private var backendSection: some View { + VStack(alignment: .leading, spacing: 8) { + HStack { + Text("Backend").font(.subheadline).bold() + Spacer() + Button("Check") { Task { await refreshHealth() } } + .disabled(health.status == .checking) + } + Text(settings.backendBaseURL) + .font(.caption) + .foregroundStyle(.secondary) + .lineLimit(1) + .truncationMode(.middle) + HStack(spacing: 8) { + StatusDot(color: healthColor) + Text(healthText).font(.caption) + } + } + } + + private var footer: some View { + HStack { + NavigationLink("Settings…") { + SettingsView() + } + Spacer() + Button("Quit") { NSApplication.shared.terminate(nil) } + } + } + + private func refreshHealth() async { + await health.check( + baseURL: settings.backendBaseURL, + skipTLS: settings.skipTLSVerification + ) + } + + private var healthColor: Color { + switch health.status { + case .online: return .green + case .offline: return .red + case .checking: return .orange + case .unknown: return .gray + } + } + + private var healthText: String { + switch health.status { + case .unknown: return "Not checked yet" + case .checking: return "Checking…" + case .online(let detail): return "Online · \(detail)" + case .offline(let error): return "Offline · \(error)" + } + } +} diff --git a/Ten31Transcripts/UI/PermissionRow.swift b/Ten31Transcripts/UI/PermissionRow.swift new file mode 100644 index 0000000..6b76563 --- /dev/null +++ b/Ten31Transcripts/UI/PermissionRow.swift @@ -0,0 +1,54 @@ +import SwiftUI + +/// Small status indicator dot. +struct StatusDot: View { + let color: Color + var body: some View { + Circle().fill(color).frame(width: 9, height: 9) + } +} + +/// One permission line: status dot, label, and a context-appropriate action. 
+struct PermissionRow: View { + let title: String + let state: PermissionState + let onGrant: () -> Void + let onOpenSettings: () -> Void + + var body: some View { + HStack(spacing: 8) { + StatusDot(color: dotColor) + Text(title) + Spacer() + actionButton + } + } + + private var dotColor: Color { + switch state { + case .granted: return .green + case .denied: return .red + case .notDetermined: return .orange + } + } + + @ViewBuilder + private var actionButton: some View { + switch state { + case .granted: + Image(systemName: "checkmark.circle.fill").foregroundStyle(.green) + case .notDetermined: + // Native prompt (Microphone). The request also registers the app. + Button("Grant", action: onGrant) + case .denied: + // Screen Recording / Accessibility report binary granted/denied, so + // "not yet asked" looks like denied. "Grant" calls the request API, + // which registers the app in the relevant list and shows the system + // prompt the first time; "Open Settings" is the manual fallback. + HStack(spacing: 6) { + Button("Grant", action: onGrant) + Button("Open Settings", action: onOpenSettings) + } + } + } +} diff --git a/Ten31Transcripts/UI/SettingsView.swift b/Ten31Transcripts/UI/SettingsView.swift new file mode 100644 index 0000000..7d81868 --- /dev/null +++ b/Ten31Transcripts/UI/SettingsView.swift @@ -0,0 +1,67 @@ +import SwiftUI +import AppKit + +/// Settings panel (pushed from the menu-bar panel). 
+struct SettingsView: View { + @EnvironmentObject private var settings: AppSettings + + var body: some View { + Form { + Section("SparkControl backend") { + TextField("Base URL", text: $settings.backendBaseURL) + .textFieldStyle(.roundedBorder) + Toggle("Skip TLS verification (self-signed cert)", + isOn: $settings.skipTLSVerification) + } + + Section("Output") { + HStack { + Text(settings.outputFolderPath) + .lineLimit(1) + .truncationMode(.middle) + .foregroundStyle(.secondary) + Spacer() + Button("Choose…", action: chooseFolder) + } + } + + Section("Adapters") { + Text("Inert in Phase 0 — these toggles only persist for now.") + .font(.caption) + .foregroundStyle(.secondary) + ForEach(AppSettings.adapterKeys, id: \.key) { adapter in + Toggle(adapter.label, isOn: binding(for: adapter.key)) + } + } + } + .formStyle(.grouped) + .frame(width: 320) + .navigationTitle("Settings") + } + + private func binding(for key: String) -> Binding<Bool> { + Binding( + get: { settings.adapterEnabled[key] ?? true }, + set: { settings.adapterEnabled[key] = $0 } + ) + } + + private func chooseFolder() { + let panel = NSOpenPanel() + panel.canChooseDirectories = true + panel.canChooseFiles = false + panel.allowsMultipleSelection = false + panel.prompt = "Choose" + panel.directoryURL = settings.outputFolderURL + + // The app is a menu-bar accessory and this is invoked from the transient + // MenuBarExtra(.window) popover. Use the async begin(...) API rather than + // runModal() — a nested modal loop can let the popover dismiss the panel + // out from under it. Activate first so the panel comes to the front. 
+ NSApp.activate(ignoringOtherApps: true) + panel.begin { response in + guard response == .OK, let url = panel.url else { return } + settings.outputFolderPath = url.path + } + } +} diff --git a/Ten31TranscriptsTests/Ten31TranscriptsTests.swift b/Ten31TranscriptsTests/Ten31TranscriptsTests.swift new file mode 100644 index 0000000..5439a59 --- /dev/null +++ b/Ten31TranscriptsTests/Ten31TranscriptsTests.swift @@ -0,0 +1,9 @@ +import XCTest + +/// Placeholder so the test target is wired and green from Phase 0. +/// Real coverage (adapter fixture tests) arrives in Phase 3. +final class Ten31TranscriptsTests: XCTestCase { + func testScaffoldBuilds() throws { + XCTAssertTrue(true) + } +} diff --git a/docs/01_PROJECT_BRIEF.md b/docs/01_PROJECT_BRIEF.md new file mode 100644 index 0000000..e9b5baf --- /dev/null +++ b/docs/01_PROJECT_BRIEF.md @@ -0,0 +1,151 @@ +# Project Brief — Ten31 Transcripts + +> Local macOS app that auto-detects conference calls, records local audio, and +> produces a **visual-derived, timestamped speaker timeline** — then hands the +> mixed audio + that timeline to the operator's **SparkControl** backend, which +> diarizes, names the speakers (majority-overlap vote against the timeline), and +> returns named transcript segments. A growing **voiceprint library** recovers +> speakers even when the visual cue is missing. + +Master context document. Read this first, then `02_ARCHITECTURE.md`, +`03_DATA_CONTRACTS.md`, `04_BUILD_PLAN.md`. The SparkControl API is now fully +specified — see `03_DATA_CONTRACTS.md` (and the source `AUDIO_API.md`). + +--- + +## 1. What we are building + +A lightweight, always-running **menu-bar app on macOS** that: + +1. **Detects** when the user joins a call in Google Meet, Zoom, Microsoft Teams, + or Signal. +2. **Records two local audio tracks** — system audio (everyone else) and the + user's microphone (the user) — and **mixes them to one 16 kHz mono WAV** for + the backend. +3. 
**Watches the call window** at ~2–4 fps and, per app, reads participant + **names** and the **active-speaker cue**, producing a + `(start, end, name, confidence)` **visual timeline** — its best guess at who + was talking when. +4. **Discards every video frame after extraction.** No video is ever written to + disk. Only audio + the derived timeline persist locally. +5. On call end, **POSTs the mixed audio + the visual timeline (+ the known + voiceprint library) to `POST /api/audio/label-merge`** on SparkControl, which + returns **named, speaker-attributed transcript segments** and a **voiceprint + per speaker**. +6. **Persists the returned voiceprints** keyed by name, so the next call can pass + them as `known_voiceprints` and recover a speaker by voice when the visual cue + is absent (camera off, a bad OCR frame). + +The app's job ends at receiving and storing the named segments from SparkControl. +**All transcription, diarization, and the name-merge happen on the backend.** Do +not build transcription, diarization, or the merge vote in this app. + +## 2. Why the visual timeline still matters (the core idea) + +Audio diarization (NVIDIA Sortformer on the backend) is excellent at +**segmentation** — precise speaker boundaries — but its clusters are **anonymous** +(`Speaker_0`, `Speaker_1`…). It cannot name anyone. + +The screen *already knows the names*. Each app visually marks the active speaker +(colored tile border, animated audio bars, a ring around an avatar) next to that +person's name/initials. + +So responsibilities split cleanly: +- **Audio (backend)** owns *segmentation* — the exact *when*. +- **Visual capture (this app)** owns *identity* — the *who*. +- **`label-merge` (backend)** fuses them: it diarizes, then assigns each cluster + the timeline name with the most temporal overlap. The visual track needn't be + perfect — it only needs to win the per-cluster vote. 
+ +**New compounding layer — the voiceprint library.** Every named cluster comes +back from the backend with a 192-dim TitaNet voiceprint. The app persists these +keyed by name and replays them as `known_voiceprints` next time. Resolution order +per cluster becomes: **visual overlap → voiceprint match → `Unknown_N`** (never +mislabeled). So the screen capture *enrolls a voice library for free*, and over a +few calls the system can name regulars even with cameras off. + +## 3. Hard scope boundaries + +**In scope (this app):** +- Call detection for Meet / Zoom / Teams / Signal. +- Dual-track local audio capture + mix-to-mono for the backend. +- Low-fps window capture → OCR (names) + active-speaker cue detection. +- Per-app "adapter" modules encapsulating each app's UI quirks. +- Building the visual timeline; **mic-VAD self-labeling** (the mic track is the + user, so hot-mic spans pre-seed the user's name into the timeline). +- Chunking long calls (~2–3 min) and calling `label-merge` **sequentially**. +- A local **voiceprint store** (persist + replay named voiceprints). +- Storing the backend's named transcript segments locally. +- A minimal menu-bar UI: status, manual start/stop, recent sessions, adapter + toggles, backend host/health, output folder. + +**Out of scope (owned by the backend):** +- Transcription, diarization, the name-merge vote, summarization/analysis. + +**Explicitly not doing:** saving video; cloud anything. Everything stays on the +operator's LAN. + +## 4. Key decisions (now resolved against the real contract) + +| Decision | Choice | Why | +|---|---|---| +| Language / framework | Native Swift + SwiftUI menu-bar app (`LSUIElement`) | System audio, window capture, Vision all native; one codebase. | +| Audio capture | ScreenCaptureKit (system audio) + AVFoundation (mic) | No virtual audio device; works with headphones; macOS 13+. 
| +| Backend audio format | **Mixed-mono 16 kHz WAV** | Diarizer separates speakers from one mixed stream; 16 kHz is ideal. | +| Call detection | CoreAudio "mic running somewhere" + known-app / Meet-tab heuristic | Clean live-mic signal + app disambiguation. | +| Speaker naming | **Backend, via `POST /api/audio/label-merge`** | One call does diarize + overlap-vote naming + transcription. No client merge. | +| Identity recovery | **Local voiceprint library** replayed as `known_voiceprints` | Recovers camera-off / OCR-missed speakers by voice; compounds over calls. | +| Self-identity | mic-VAD → pre-seed user's name in timeline | The mic track is the user; gives the backend a strong prior + enrolls the user's voiceprint immediately. | +| Requests | **Sequential, one audio request in flight** | Parallel audio requests trip a backend GPU race (`503 + Retry-After`). | +| Long calls | Chunk ~2–3 min, sequential, stitch via names+voiceprints | Diarizer caps at **4 speakers/chunk**; voiceprints + names unify across chunks. | +| Transport / TLS | `multipart/form-data`, file field `file`; self-signed Start9 cert (skip verify or trust the Root CA); **no auth on LAN** | Matches every other SparkControl endpoint. | +| Timing | Batch after call (sync endpoints, no polling) | Endpoints are synchronous; no job/poll machinery needed. | + +### On forking Hyprnote +Unchanged recommendation: the audio capture is the trivial part (~200 lines of +native Swift) and the rest (Vision screen-reading) is native too. Build native; +use Hyprnote's capture/detection only as reference. Fork remains an override. + +## 5. Target apps & identifiers + +| App | Join via | Bundle ID(s) | Speaking cue / names | +|---|---|---|---| +| Zoom | Native | `us.zoom.xos` | Colored tile border; name label in tile. | +| Microsoft Teams | Native (new) | `com.microsoft.teams2` (new), `com.microsoft.teams` (classic) | Colored ring/border; labeled. 
| +| Signal | Native (Electron) | `org.whispersystems.signal-desktop` | Ring around avatar/initials; try Accessibility names first. | +| Google Meet | Browser tab | `com.google.Chrome`, `com.apple.Safari`, `company.thebrowser.Browser` (Arc)… | Canvas video → Vision for the cue; DOM names → Accessibility/AppleScript; confirm via active-tab URL `meet.google.com`. | + +Four required adapters; adding a 5th must be one new file conforming to the +`AppAdapter` protocol. + +## 6. The backend (do not rebuild) — now concrete + +SparkControl, on the operator's Start9 LAN, fronting two DGX Sparks: +- **STT:** NVIDIA Parakeet TDT 0.6B — `POST /v1/audio/transcriptions` (OpenAI-compatible). +- **Diarization:** NVIDIA Sortformer 4spk-v1 — `POST /api/audio/diarize-chunk` + (anonymous clusters + voiceprints) and `POST /api/audio/transcribe-with-speakers`. +- **Embeddings:** NVIDIA TitaNet (192-dim voiceprints). +- **★ Primary endpoint for this app:** `POST /api/audio/label-merge` — diarize + + name from the visual timeline (+ voiceprint fallback), optionally transcribe, + in one synchronous call. +- Health/discovery: `GET /api/status`, `GET /api/endpoints`, `GET /v1/models`. + +Full request/response shapes, curl examples, limits, and error formats are in +`03_DATA_CONTRACTS.md`. + +## 7. Remaining open items (small) + +1. **Base URL — RESOLVED.** `https://your-spark-backend.local:62419`, also + reachable via the server's LAN IP address (prefer the `.local` form; it survives IP + changes). Ship the `.local` host as the default; keep it editable in settings. + Service-discovery at `GET /api/endpoints`. +2. **Send trigger** — assume auto-POST on call end; expose a "hold for review" + toggle if the user wants to eyeball the timeline first. +3. **Retention** — keep the session folder after a successful hand-off, or prune + audio and keep only `speakers.json` + voiceprints? Default: keep everything, + user-configurable. +4. 
**Voiceprint update policy** — overwrite vs running-average a person's stored + voiceprint across calls (see `02_ARCHITECTURE.md §2.9`). Start simple + (store/refresh latest high-confidence), refine later. +5. **Signing** — stable identity so macOS doesn't re-prompt for permissions on + each rebuild. diff --git a/docs/02_ARCHITECTURE.md b/docs/02_ARCHITECTURE.md new file mode 100644 index 0000000..752aa03 --- /dev/null +++ b/docs/02_ARCHITECTURE.md @@ -0,0 +1,248 @@ +# Architecture — Ten31 Transcripts + +Companion to `01_PROJECT_BRIEF.md`. Module layout, data flow, the per-app adapter +pattern, the macOS APIs, and the SparkControl integration (now fully specified). + +--- + +## 1. High-level data flow + +``` + ┌─────────────────────────────────────────┐ + │ CallDetector │ + │ CoreAudio "mic running somewhere" │ + │ + known-app / Meet-tab heuristic │ + └───────────────┬───────────────────────────┘ + │ callStarted(app, window) + ▼ + ┌──────────────────────────────────────────────────────────┐ + │ SessionController │ + │ owns one Session; shared t0; start/stop; on end package │ + └───────┬───────────────────────────┬──────────────────────┘ + │ │ + ▼ ▼ + ┌────────────────────────┐ ┌───────────────────────────────────┐ + │ AudioRecorder │ │ VisualObserver │ + │ SCStream system audio │ │ SCStream window frames @2–4fps │ + │ AVAudioEngine mic │ │ │ (frames released, never saved)│ + │ → mic.wav, system.wav │ │ ▼ │ + │ → mixed_mono_16k.wav │ │ AppAdapter (per app) │ + │ + mic VAD → self spans │ │ OCR names + active-speaker cue │ + └────────────┬───────────┘ │ → SpeakerObservation │ + │ └──────────────┬────────────────────┘ + │ ▼ + │ ┌───────────────────────────┐ + │ self spans ───▶│ TimelineBuilder │ + │ │ debounce/coalesce + merge │ + │ │ mic-VAD self spans │ + │ │ → visual_timeline.json │ + │ └──────────────┬────────────┘ + │ │ + ▼ ▼ + ┌──────────────────────────────────────────────────────────┐ + │ SessionPackager │ + │ mixed_mono_16k.wav + visual_timeline.json + 
manifest │ + │ + chunk plan (if call > ~3 min) │ + └───────────────────────────┬──────────────────────────────┘ + ▼ + ┌──────────────────────────────────────┐ + │ SparkControlClient │ + │ per chunk, SEQUENTIAL: │ + │ POST /api/audio/label-merge │ + │ file=chunk.wav │ + │ timeline= │ + │ known_voiceprints= │ + │ transcribe=true │ + │ → named segments + per-speaker prints │ + └───────────────┬───────────┬────────────┘ + │ │ + offset+stitch│ │ fingerprints (keyed by name) + ▼ ▼ + ┌────────────────┐ ┌────────────────────┐ + │ speakers.json │ │ VoiceprintStore │ + │ (named, global) │ │ persist/replay │ + └────────────────┘ └────────────────────┘ +``` + +## 2. Modules + +### 2.1 `CallDetector` +Fire `callStarted(app:window:)` / `callEnded()`. +- **Mic active system-wide:** CoreAudio `kAudioDevicePropertyDeviceIsRunningSomewhere` + on the default input device (listener, not poll). +- **App present/active:** `NSWorkspace` running/frontmost vs the bundle-ID table. +- **Meet (browser):** when a browser is frontmost + mic live, read the active-tab + URL (AppleScript/Accessibility); confirm `meet.google.com`. +- **Heuristic:** `mic_running` AND (`known_native_app_active` OR `browser+meet_tab`). + Debounce ~2 s open; end when mic quiet > N s and the app/tab leaves foreground + or quits. +- Output: app id + the call window (`SCWindow`) for the `VisualObserver`. + +### 2.2 `AudioRecorder` +- **System audio:** `SCStream` with `capturesAudio = true` (mixer-level; works with + headphones; no BlackHole). macOS 13+. +- **Mic:** `AVAudioEngine` input tap. +- Outputs: `mic.wav`, `system.wav`, and the backend deliverable + **`mixed_mono_16k.wav`** (mic + system summed → mono → 16 kHz PCM WAV). +- **Shared `t0`** (`CACurrentMediaTime`) stamped once; every audio sample and + visual observation is relative to it. Non-negotiable — the merge depends on it. +- **Mic VAD:** run lightweight energy/VAD on the mic track to emit "the user is + speaking" spans. 
These feed `TimelineBuilder` as pre-seeded **self** segments + (high confidence) so the backend names the user even when their own tile isn't + read — and so the user's voiceprint enrolls on call one. + +### 2.3 `VisualObserver` +- `SCStream` scoped (via `SCContentFilter`) to the **specific call window**. +- Throttle to adapter fps (default 3). Hand each frame to the adapter; **release + immediately — never persist a frame.** +- **Window visibility / focus is NOT required.** SCK captures a window's own + rendered content even when it's in the background, occluded by other apps, or + on another Space. The user can work in other apps during the call and visual + capture continues normally. (This is a key reason for window capture over + display capture — also more private.) +- **Capture liveness — the one real failure mode.** Two states stop fresh frames: + 1. **Minimized to the Dock** — macOS may freeze the window's backing buffer, so + SCK delivers stale/duplicate frames. Detect minimization + (`SCWindow.isOnScreen == false` / window state) and **pause visual analysis + + flag a `visual_gap` for that span** rather than emitting bogus observations. + 2. **Browser tab switched away (Meet only)** — see §2.4 Meet note. + In both cases **audio keeps recording**, and the backend voiceprint fallback + still names previously-heard speakers — so a gap only costs naming precision for + *new, never-seen* speakers during that exact window. Record gaps in + `visual_timeline.json` (a `visual_gaps: [{start,end,reason}]` array) so the + cause is auditable; `TimelineBuilder` must not interpolate across a gap. + +### 2.4 `AppAdapter` (protocol) + four implementations +```swift +protocol AppAdapter { + static var bundleIDs: [String] { get } + var preferredFPS: Int { get } + func analyze(frame: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] + func namesFromAccessibility() -> [String]? 
// optional +} +struct SpeakerObservation { + let name: String // OCR'd / a11y name; "" if unknown + let speaking: Bool // active-speaker cue detected + let bbox: CGRect + let confidence: Double // 0..1 + let t: TimeInterval // relative to session t0 +} +``` +Per-adapter cues: +- **Zoom** (`us.zoom.xos`): colored tile border = active speaker; OCR the tile + name label; handle speaker + gallery layouts. +- **Teams** (`com.microsoft.teams2`): colored ring/border; labeled; like Zoom. +- **Signal** (`org.whispersystems.signal-desktop`): ring around avatar/initials; + try `namesFromAccessibility()` first (Electron a11y tree), OCR fallback. +- **Meet** (browser): **hybrid** — names via Accessibility/AppleScript (DOM text), + speaking cue via Vision (canvas/WebGL animated bars / tile highlight), fused by + tile position. Most likely to need iteration. + - **Tab-switch caveat (Meet-specific):** if Meet is a browser *tab* and the user + switches to a different tab **in the same window**, the browser stops rendering + the Meet tab → SCK captures a frozen last-frame (a `visual_gap`). Switching to a + different *app* is fine; switching tabs is not. Mitigations, in order: (1) + detect the active-tab URL leaving `meet.google.com` and flag a `visual_gap` + (don't emit stale observations); (2) prefer capturing Meet in a **dedicated + browser window / PWA / standalone window** so tab-switching can't blank it + — surface this as a one-time setup tip in the UI; (3) names still come from the + a11y/DOM tree where available, and audio + voiceprint fallback carry identity + through the gap regardless. + +Each adapter is **testable offline** against PNG/JPEG frame fixtures. + +### 2.5 `TimelineBuilder` +Turn noisy per-frame observations into clean `(start, end, name, confidence)` +segments. +- Group by name; open a segment after K consecutive speaking frames (e.g. 2), + close after M quiet frames (e.g. 2) — hysteresis rides out UI-cue lag/flicker. +- **Allow overlaps** (crosstalk). 
Do not force one speaker per instant. +- Merge in the mic-VAD **self** spans (the user) with high confidence. +- Normalize OCR name variants ("Sarah J" → "Sarah Jones") via a per-session + alias table. +- Emit `visual_timeline.json` (schema in `03_DATA_CONTRACTS.md`). The flat + `segments` array maps directly onto the `timeline` field `label-merge` wants. + +### 2.6 `SessionPackager` +Write the session folder and, if the call is longer than ~3 min, produce a +**chunk plan**: ~2–3 min windows on `mixed_mono_16k.wav`, each with its +**timeline slice rebased to chunk-local seconds**. +``` +~/Ten31Transcripts/sessions/2026-06-05T14-03_zoom/ + mic.wav system.wav mixed_mono_16k.wav + visual_timeline.json + manifest.json + (chunks/ produced transiently if chunking) + speakers.json # written after backend hand-off +``` + +### 2.7 `SparkControlClient` +Deliver to SparkControl. **Primary path = `POST /api/audio/label-merge`** with +`file`, `timeline`, `known_voiceprints`, `transcribe=true`. +- **Sequential only** — one audio request in flight (parallel ⇒ `503 + Retry-After`). +- **Self-signed TLS** — skip verification (`URLSession` delegate trusting the + Start9 cert) or trust the Root CA. **No auth on the LAN.** +- **Per chunk:** call `label-merge` with that chunk's audio + rebased timeline + + the **accumulated** voiceprints; offset returned timestamps back to global and + append. Names unify across chunks because the same names/voiceprints are passed + each time; new voiceprints accumulate into the store. +- Retry on `503` after `Retry-After`; on hard failure keep the session folder and + surface "Resend" in the UI. +- Limits to respect: **200 MB/request** (`413`), transcription timeout ~300 s, + diarization ~600 s. Chunking keeps requests well under these. +- See `03_DATA_CONTRACTS.md §4` for exact fields and a real response. 
+ +### 2.8 result assembly → `speakers.json` +Concatenate the per-chunk `label-merge` results into one global, named, +speaker-attributed transcript (timestamps offset to session time). This is the +seam to the user's existing summarizer. The app does not analyze past this. + +### 2.9 `VoiceprintStore` +Local persistence of named voiceprints — the compounding-identity layer. +- File: `~/Ten31Transcripts/voiceprints.json` → + `{ "<name>": { "vector": [192 floats], "updated": "<ISO-8601 timestamp>", "calls": <int> } }`. +- **On send:** load all entries → pass as `known_voiceprints` to `label-merge`. +- **On response:** for each speaker resolved by **visual** (or a high-similarity + **voiceprint** match), store/refresh that name's vector. **Never** store + `Unknown_N`. +- **Update policy (open, start simple):** overwrite with the latest + high-confidence vector, or keep a running mean per name. v1 = store latest with + `overlap_confidence ≥ ~0.8`; refine to averaging later (`01 §7.4`). +- Editable/clearable from the menu-bar UI (rename, delete a person, reset). + +### 2.10 `MenuBarUI` (SwiftUI, `LSUIElement`) +Status (idle / detected / recording / uploading), manual start/stop, recent +sessions (open folder, resend, delete), adapter toggles, **backend host + a +health check** (`GET /api/status`), output folder, voiceprint manager, and a +permissions checklist (Screen Recording, Microphone, Accessibility). + +## 3. macOS frameworks & permissions + +| Need | Framework | Permission | +|---|---|---| +| System audio + window frames | ScreenCaptureKit | Screen Recording | +| Microphone | AVFoundation / CoreAudio | Microphone | +| Meet/Signal names, tab URL | Accessibility (AXUIElement) / AppleScript | Accessibility + Automation | +| OCR + cue analysis | Vision (`VNRecognizeTextRequest`) | none | +| App/tab detection | AppKit (`NSWorkspace`) | none | + +Stable signing identity avoids permission re-prompts on rebuild. + +## 4.
Performance +Window-scoped capture + 3 fps + Vision-on-Neural-Engine is light; audio is cheap; +frames are released immediately so memory stays flat. The app idles near-zero +until a call starts. Backend requests are sequential and chunked, so they never +saturate the GPU. + +## 5. The merge — now done by the backend +The app no longer implements the overlap vote. `label-merge` resolves each +anonymous cluster in order: +1. **visual** — timeline name with the most temporal overlap (`source: "visual"`, + `overlap_confidence`); +2. **voiceprint** — closest `known_voiceprints` match above `voiceprint_threshold` + (`source: "voiceprint"`, `match_similarity`); +3. **`Unknown_N`** (`source: "unmatched"`) — never guessed/mislabeled. +The app's contribution is a good timeline (incl. mic-VAD self spans) and an +ever-growing voiceprint library. `min_overlap` and `voiceprint_threshold` are +tunable request params if precision needs adjusting. +``` +``` diff --git a/docs/03_DATA_CONTRACTS.md b/docs/03_DATA_CONTRACTS.md new file mode 100644 index 0000000..0a91528 --- /dev/null +++ b/docs/03_DATA_CONTRACTS.md @@ -0,0 +1,214 @@ +# Data Contracts — Ten31 Transcripts + +Companion to docs 01/02. Defines the files the app produces/stores and the **real +SparkControl contract** (source of truth: `AUDIO_API.md`). The `label-merge` +endpoint is the app's primary integration point. + +--- + +## 1. `visual_timeline.json` (the app's primary output) + +Rich, app-native record of who-was-speaking-when, inferred from the screen. +Times are **seconds relative to session `t0`** (the shared audio clock). 
+ +```jsonc +{ + "schema_version": "1.1", + "session_id": "2026-06-05T14-03_zoom", + "app": "zoom", // zoom | teams | signal | meet + "adapter_version": "zoom-0.3.1", + "t0_unix": 1749135780.123, + "duration_sec": 2841.7, + "fps_sampled": 3, + "self_name": "Grant", // user's name; segments from mic VAD + "participants": [ + { "name": "Grant", "is_self": true }, + { "name": "Sarah Jones", "aliases": ["Sarah J"] }, + { "name": "Dmitri" } + ], + "segments": [ + { "start": 0.0, "end": 4.5, "name": "Grant", "confidence": 0.97, "source": "mic_vad" }, + { "start": 4.8, "end": 9.3, "name": "Sarah Jones", "confidence": 0.82, "source": "vision" } + ], + "visual_gaps": [ + { "start": 120.4, "end": 138.9, "reason": "minimized" } // reason: minimized | tab_switched + ] +} +``` +- Segments **may overlap** (crosstalk) — do not flatten. +- `source ∈ {vision, accessibility, fused, mic_vad}`. +- The user (mic-VAD `is_self`) is pre-seeded so the backend names them and + enrolls their voiceprint on call one. + +### 1.1 Mapping to the `label-merge` `timeline` field +`label-merge` wants a **flat JSON array** of `{start, end, name, confidence}` +(seconds). Build it by taking `segments` and dropping `source`: +```json +[{"start":0.0,"end":4.5,"name":"Grant","confidence":0.97}, + {"start":4.8,"end":9.3,"name":"Sarah Jones","confidence":0.82}] +``` +When chunking, **slice to the chunk window and rebase to chunk-local seconds** +(subtract chunk start) before sending. + +## 2. `manifest.json` +```jsonc +{ + "session_id": "2026-06-05T14-03_zoom", + "app": "zoom", + "t0_unix": 1749135780.123, + "duration_sec": 2841.7, + "audio": { + "mixed": { "file": "mixed_mono_16k.wav", "sr": 16000, "channels": 1, "sha256": "..." }, + "mic": { "file": "mic.wav", "sr": 16000, "channels": 1, "sha256": "..." }, + "system": { "file": "system.wav", "sr": 16000, "channels": 1, "sha256": "..." 
} + }, + "chunking": { "enabled": true, "chunk_sec": 150, "chunks": 19 }, + "visual_timeline": "visual_timeline.json", + "backend_result": "speakers.json", + "adapter_versions": { "zoom": "0.3.1" }, + "app_version": "0.1.0" +} +``` +(`mixed_mono_16k.wav` is the one the backend gets; the separate tracks are kept +locally — the mic track is the user's known identity / VAD source.) + +--- + +## 3. SparkControl — connection (real) + +- **Base URL (confirmed):** `https://your-spark-backend.local:62419` — also reachable at + the host's raw LAN IP (the `.local` form survives IP changes; + **prefer it as the default**). Service-discovery JSON is at + `GET /api/endpoints` (returns current vLLM / Parakeet / Kokoro URLs). All audio + endpoints in §4–§5 hang off this base. Still **make it a setting** so the host + can change, but ship `https://your-spark-backend.local:62419` as the default. +- **TLS:** Start9 self-signed Root CA. Either skip verification (`URLSession` + delegate trusting the cert; curl `-k`; `rejectUnauthorized:false`) **or** install + the Start9 Root CA into the trust store. +- **Auth:** **none on the LAN.** No token/key today. +- **Limits:** **200 MB/request** (`413` over); timeouts ~300 s (transcription), + ~600 s (diarization). **Send audio requests SEQUENTIALLY** — concurrent audio + trips a GPU FFT race → `503 + Retry-After`. +- **Transport:** `multipart/form-data`, audio file field name **`file`** (bytes, + not base64/path). +- **All endpoints are synchronous** (no job IDs / polling). +- **Errors:** JSON `{"detail": "..."}`; `400` malformed, `413` too large, `503 + + Retry-After` transient (retry after the interval). +- **Health/discovery:** `GET /api/status`, `GET /api/endpoints`, `GET /v1/models`. + +--- + +## 4. ★ `POST /api/audio/label-merge` — the app's primary call + +Diarize + name clusters from the visual timeline (majority temporal overlap), +with voiceprint fallback, optionally transcribed. Synchronous.
**Stateless** — +the app owns the timeline and the voiceprint library. + +**Multipart fields:** +| field | required | notes | +|---|---|---| +| `file` | **yes** | mixed-mono WAV (the chunk, when chunking) | +| `timeline` | **yes** | flat JSON array `[{"start","end","name","confidence"}]`, chunk-local seconds (§1.1) | +| `known_voiceprints` | no | JSON `{"<name>":[192 floats], ...}` from `VoiceprintStore` | +| `transcribe` | no | `"true"` to also return per-segment text (default false) | +| `min_overlap` | no | min fraction of a cluster's time overlapping the winning name (default `0.0`) | +| `voiceprint_threshold` | no | cosine sim to accept a voiceprint match (default `0.5`) | + +```bash +curl -k -X POST https://<host>/api/audio/label-merge \ + -F "file=@chunk_003.wav" \ + -F 'timeline=[{"start":0,"end":4.5,"name":"Grant"},{"start":4.8,"end":9.3,"name":"Sarah Jones"}]' \ + -F 'known_voiceprints={"Grant":[/*192*/],"Sarah Jones":[/*192*/]}' \ + -F "transcribe=true" +``` + +**Response (`transcribe=true`):** +```jsonc +{ + "duration": 9.259, + "speakers": [ + { "cluster": "Speaker_0", "name": "Grant", "source": "visual", + "overlap_confidence": 0.9866, "fingerprint": [/*192 floats*/] }, + { "cluster": "Speaker_1", "name": "Sarah Jones", "source": "voiceprint", + "match_similarity": 0.71, "fingerprint": [/*192 floats*/] } + ], + "segments": [ + { "start_ms": 0, "end_ms": 4480, "speaker": "Grant", + "text": "Good morning everyone. I think the energy thesis is strong this quarter." }, + { "start_ms": 4800, "end_ms": 9040, "speaker": "Sarah Jones", + "text": "I agree, but I am worried about the lockup terms and the fee load this time." } + ], + "fingerprints": { "Grant": [/*192*/], "Sarah Jones": [/*192*/] }, + "models": { "diarization": "nvidia/diar_sortformer_4spk-v1", + "embedding": "nvidia/speakerverification_en_titanet_large" } +} +``` +- `transcribe=false` → segments are `{start_s, end_s, speaker, confidence}` (no text).
+- **Resolution order per cluster:** `visual` → `voiceprint` (with + `match_similarity`) → `Unknown_N` (`source:"unmatched"`, never mislabeled). +- **Persist `fingerprints`** keyed by name into `VoiceprintStore` (skip `Unknown_N`). +- **Diarizer caps at 4 speakers/chunk** and takes **no `num_speakers` hint** — chunk + for larger/longer calls; names + voiceprints unify speakers across chunks. + +--- + +## 5. Other SparkControl endpoints (reference / fallback) + +### 5.1 `POST /v1/audio/transcriptions` (OpenAI-compatible, sync) +Plain STT. Fields: `file` (req), `model` (default `parakeet-tdt-0.6b-v3`), +`response_format` (`json` | `verbose_json` | `text`), `language`, `temperature`, +`prompt`. `verbose_json` returns word- **and** segment-level timestamps + full +`text`. Use only if you ever want transcript without speaker labels. + +### 5.2 `POST /api/audio/diarize-chunk` (sync) +Field: `file`. Returns anonymous clusters `{start_s, end_s, speaker, confidence}`, +`speakers_detected`, and a 192-dim `fingerprints` map per local speaker. Use if +you ever want to do the merge client-side instead of via `label-merge` (not the +default path). + +### 5.3 `POST /api/audio/transcribe-with-speakers` (sync) +Field: `file`. ASR + diarization merged into anonymous speaker-attributed blocks +`{start_ms, end_ms, speaker, text}`. (Anonymous — no naming. `label-merge` is the +named version.) + +--- + +## 6. `speakers.json` — final stored output (target end state) + +Per-chunk `label-merge` results concatenated, **timestamps offset back to session +(global) seconds**, names unified across chunks. This is the hand-off to the +user's downstream summarizer; the app stops here. 
+ +```jsonc +{ + "session_id": "2026-06-05T14-03_zoom", + "app": "zoom", + "duration_sec": 2841.7, + "speakers": [ + { "name": "Grant", "source": "visual", "overlap_confidence": 0.99 }, + { "name": "Sarah Jones", "source": "voiceprint", "match_similarity": 0.71 }, + { "name": "Unknown_0", "source": "unmatched" } + ], + "segments": [ + { "start": 0.0, "end": 4.48, "speaker": "Grant", "text": "..." }, + { "start": 4.80, "end": 9.04, "speaker": "Sarah Jones", "text": "..." }, + { "start": 152.3, "end": 158.1, "speaker": "Unknown_0", "text": "..." } + ], + "models": { "diarization": "nvidia/diar_sortformer_4spk-v1", + "embedding": "nvidia/speakerverification_en_titanet_large", + "transcription": "parakeet-tdt-0.6b-v3" } +} +``` +(Convert backend `start_ms`/`end_ms` → seconds; add the chunk offset.) + +## 7. `voiceprints.json` — local voiceprint library +```jsonc +{ + "Grant": { "vector": [/*192 floats*/], "updated": "2026-06-05T14:51Z", "calls": 12 }, + "Sarah Jones": { "vector": [/*192 floats*/], "updated": "2026-06-02T09:10Z", "calls": 3 } +} +``` +Loaded → `known_voiceprints` on every `label-merge` call. Updated from response +`fingerprints` for `visual`/high-confidence `voiceprint` speakers only. Never +stores `Unknown_N`. Update policy (`02 §2.9`): start = store latest with +`overlap_confidence ≥ ~0.8`; consider per-name running mean later. diff --git a/docs/04_BUILD_PLAN.md b/docs/04_BUILD_PLAN.md new file mode 100644 index 0000000..e1c6faa --- /dev/null +++ b/docs/04_BUILD_PLAN.md @@ -0,0 +1,123 @@ +# Build Plan — Ten31 Transcripts + +Companion to docs 01–03. Phased plan for the Claude Code session, each phase with +a demoable milestone. Build in order; the risky/novel work (visual adapters) is +isolated for independent tuning. The SparkControl contract is now known +(`03_DATA_CONTRACTS.md`), so Phase 5 wires the real endpoints. + +--- + +## Ground rules +- **Native Swift + SwiftUI**, menu-bar app (`LSUIElement = true`). 
macOS 13.0+ + (ScreenCaptureKit system audio). +- **Never write video frames to disk.** Process in-memory, release immediately; + assert this in `VisualObserver` review. +- **One shared monotonic `t0`** for audio + visual timestamps — wire first. +- **Backend deliverable = `mixed_mono_16k.wav`.** Keep `mic.wav`/`system.wav` + locally (mic = the user, and the VAD source for self-labeling). +- **SparkControl calls are SEQUENTIAL** (one audio request in flight) over + self-signed TLS, **no auth**, primary endpoint `POST /api/audio/label-merge`. +- Every adapter must be testable offline against image fixtures. + +--- + +## Phase 0 — Scaffold, permissions, backend ping (milestone: launches + green) +- Menu-bar shell; permissions onboarding (Screen Recording, Microphone, + Accessibility) with status + deep links. +- Settings: output folder, adapter toggles, **SparkControl base URL** + TLS-skip + toggle. +- `GET /api/status` health check shown in the menu. +- **Milestone:** app sits in the menu bar; permissions green; backend reachable. + +## Phase 1 — Audio capture + mix (milestone: clean mixed-mono WAV) +- `AudioRecorder`: SCK system audio → `system.wav`; AVAudioEngine mic → `mic.wav`; + shared `t0`; produce **`mixed_mono_16k.wav`**. +- Mic VAD → user "self" speaking spans (held for the timeline). +- Manual start/stop for now. +- **Milestone:** record any call manually → tracks line up, mixed-mono plays back + clean, mic-VAD spans look right. + +## Phase 2 — Call detection (milestone: hands-free start/stop) +- `CallDetector`: CoreAudio mic-running listener + `NSWorkspace` app/frontmost + + Meet active-tab-URL check; debounce. +- Wire detector → `SessionController` → `AudioRecorder`. +- **Milestone:** joining a call in any of the four apps auto-starts; leaving + auto-stops; a quick mic test does not false-trigger. 
+ +## Phase 3 — VisualObserver + Zoom adapter (milestone: visual_timeline.json) +- `VisualObserver`: window-scoped `SCStream` @3 fps → frames → adapter → release. + Background/occluded windows capture fine; detect **minimized** (and, for Meet, + **tab-switched**) → pause analysis + record a `visual_gap`, never emit stale + frames. Audio keeps recording through gaps. +- `AppAdapter` protocol + `ZoomAdapter` (tile-border cue + OCR names; speaker & + gallery layouts). +- `TimelineBuilder`: hysteresis, overlap-allowed, merge mic-VAD self spans, alias + normalization → `visual_timeline.json` (+ the flat `timeline` array form). +- **Offline fixtures:** ~30 sample Zoom frames; unit-test/tune thresholds without + a live call. +- **Milestone:** a real Zoom call yields a `visual_timeline.json` whose segments + visibly match who spoke (incl. the user from mic VAD). + +## Phase 4 — Remaining adapters (milestone: all four produce timelines) +- `TeamsAdapter`, `SignalAdapter` (a11y names first), `MeetAdapter` (hybrid: a11y + names + Vision cue). Fixtures per app; per-adapter fps/threshold config. +- **Milestone:** each app yields a sensible timeline; adapters independently + toggleable. + +## Phase 5 — SparkControl hand-off via label-merge (milestone: end-to-end named transcript) +- `SessionPackager`: write session folder; if call > ~3 min, build the chunk plan + (~2–3 min windows) with **timeline sliced + rebased to chunk-local seconds**. +- `SparkControlClient`: + - Self-signed TLS handling; multipart `file`; **sequential** requests. + - Per chunk → `POST /api/audio/label-merge` with `file`, chunk `timeline`, + `known_voiceprints` (from store), `transcribe=true`. + - Handle `503 + Retry-After` (wait, retry), `413`, `{"detail":...}` errors. + - Offset returned `start_ms/end_ms` back to global seconds; concatenate. +- Assemble `speakers.json` (named, global, speaker-attributed transcript). +- Failed sessions stay on disk with a "Resend" affordance. 
+- **Milestone:** finish a call → `mixed_mono_16k.wav` + timeline delivered → a + `speakers.json` with real names + accurate boundaries lands locally. Test a + >3-min call to exercise chunking + cross-chunk name unification. + +## Phase 6 — Voiceprint library (milestone: identity that compounds) +- `VoiceprintStore` (`voiceprints.json`): load → `known_voiceprints`; on response, + store/refresh `fingerprint` for `visual`/high-confidence speakers; skip + `Unknown_N`. +- Verify the three resolution paths against the backend: (a) **visual** name wins; + (b) **voiceprint** recovery for a camera-off / OCR-missed speaker; (c) + **`Unknown_N`** when neither matches. +- Voiceprint manager in the UI (rename, delete, reset). +- **Milestone:** a regular contact gets named on a later call **with their camera + off**, purely from a stored voiceprint. + +## Phase 7 — Polish (milestone: daily-driver quality) +- Recent-sessions list (open/resend/delete); alias-table editor. +- Optional on-screen "recording" indicator toggle. +- Adapter health: warn when detection confidence drops sharply (catches UI + redesigns early); keep fixtures for quick re-tuning. +- Login-item / launch-at-login. + +--- + +## Risk register +1. **UI fragility (highest ongoing cost):** apps redesign speaking cues / name + placement. Isolate all pixel/color/label logic in adapters; Phase-7 + confidence-drop warning + fixtures keep re-tuning fast. +2. **Active-speaker lag / crosstalk:** UI cue trails speech and flickers. Mitigated + by treating visual as a *prior* (the backend votes), hysteresis, and the + voiceprint fallback. +3. **Meet-in-browser:** canvas video + DOM names; the hybrid adapter needs the most + iteration — budget extra time. +4. **4-speaker-per-chunk cap (Sortformer):** within a single chunk, >4 simultaneous + speakers can't all be separated. Chunking + voiceprints + the visual timeline + mitigate across chunks; for rare large calls, note it as a known limit. +5. 
**Sequential GPU constraint:** never fire parallel audio requests; queue them. +6. **Permission re-prompts on rebuild:** stable signing identity. + +## Definition of done (v1) +Join a call in any of the four apps → app auto-records dual-track audio, mixes to +mono, and builds a visual speaker timeline (incl. mic-VAD self spans) → on +hang-up the audio + timeline go to `label-merge` (chunked + sequential as needed) +→ a `speakers.json` with **real names and accurate boundaries** is produced, the +**voiceprint library is updated**, and **no video is ever written to disk** and +**no transcription/diarization/merge logic lives in this app.** diff --git a/project.yml b/project.yml new file mode 100644 index 0000000..7fe9914 --- /dev/null +++ b/project.yml @@ -0,0 +1,49 @@ +name: Ten31Transcripts + +options: + bundleIdPrefix: xyz.ten31 + deploymentTarget: + macOS: "13.0" + createIntermediateGroups: true + groupSortPosition: top + +settings: + base: + MARKETING_VERSION: "0.1.0" + CURRENT_PROJECT_VERSION: "1" + SWIFT_VERSION: "5.0" + CODE_SIGN_STYLE: Automatic + # Leave the team empty; pick your free personal team in Xcode's + # Signing & Capabilities tab on first open (see README). + DEVELOPMENT_TEAM: "" + +targets: + Ten31Transcripts: + type: application + platform: macOS + sources: + - path: Ten31Transcripts + settings: + base: + PRODUCT_NAME: Ten31Transcripts + PRODUCT_BUNDLE_IDENTIFIER: xyz.ten31.transcripts + INFOPLIST_FILE: Ten31Transcripts/Support/Info.plist + CODE_SIGN_ENTITLEMENTS: Ten31Transcripts/Support/Ten31Transcripts.entitlements + GENERATE_INFOPLIST_FILE: NO + ENABLE_HARDENED_RUNTIME: NO + MACOSX_DEPLOYMENT_TARGET: "13.0" + scheme: + testTargets: + - Ten31TranscriptsTests + + Ten31TranscriptsTests: + type: bundle.unit-test + platform: macOS + sources: + - path: Ten31TranscriptsTests + settings: + base: + PRODUCT_BUNDLE_IDENTIFIER: xyz.ten31.transcripts.tests + GENERATE_INFOPLIST_FILE: YES + dependencies: + - target: Ten31Transcripts