Phases 2-6: detection, visual timeline, backend hand-off, voiceprints

Phase 2 (call detection): CallDetector using CoreAudio per-process mic
attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet,
ignoring our own recording; auto-record toggle. Built; pending live multi-app
confirmation by the user.

Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation,
TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema
1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR +
saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver
(window capture; frames released, never saved; minimized->visual_gap, idle != gap).
Synthetic-frame tested; adapter geometry pending real Signal fixtures + live
VisualObserver validation.

Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential,
TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline
slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated
END-TO-END against the live backend (chunk -> label-merge -> speakers.json).

Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named
fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status,
auto-send toggle (default off) + self-name setting.

All adversarial-review findings fixed. App + XCTest suite build; tests pass.
This commit is contained in:
Grant Gilliam
2026-06-06 00:15:49 -05:00
parent fd7e1a5907
commit 863136aeec
27 changed files with 2108 additions and 22 deletions
@@ -1,6 +1,7 @@
import Foundation
import Combine
import AppKit
import CoreGraphics
struct SessionInfo: Equatable {
let folder: URL
@@ -25,6 +26,14 @@ final class SessionController: ObservableObject {
case error(String)
}
/// Backend transcription status for the most recent session.
enum TranscriptStatus: Equatable {
case idle
case processing(Int, Int) // chunk done, total
case done(speakers: Int, segments: Int)
case failed(String)
}
/// Set in init so `AppDelegate.applicationShouldTerminate` can finalize a
/// recording in progress before the app quits.
static weak var shared: SessionController?
@@ -37,12 +46,34 @@ final class SessionController: ObservableObject {
@Published private(set) var systemLevel: Float = 0
/// Surfaced after a session if system audio stopped early.
@Published private(set) var warning: String?
/// Mirrored from `CallDetector` for the UI.
@Published private(set) var detectionStatus: CallDetector.Status = .disabled
/// Backend transcription status for the last session.
@Published private(set) var transcriptStatus: TranscriptStatus = .idle
private let settings: AppSettings
private var voiceprints: VoiceprintStore
private let detector = CallDetector()
private var cancellables = Set<AnyCancellable>()
private var currentLabel = "manual"
/// Inputs needed to (re)process the last finished session through the backend.
private struct ProcessInputs {
let folder: URL
let sessionId: String
let app: String
let mixedURL: URL
let selfSpans: [VADSpan]
}
private var lastProcess: ProcessInputs?
private var processTask: Task<Void, Never>?
private var recorder: AudioRecorder?
private var currentFolder: URL?
private var startTime: Date?
private var timer: Timer?
/// True when the current session was started by call detection (not the user).
private var autoStarted = false
/// Set if a detected call ends while we're still in `.starting`.
private var pendingAutoStop = false
/// The in-flight start or stop Task, so `prepareForTermination` can await it.
private var lifecycleTask: Task<Void, Never>?
/// Bumped each time a start/stop Task is spawned (Task is a value type, so this
@@ -51,7 +82,64 @@ final class SessionController: ObservableObject {
init(settings: AppSettings) {
self.settings = settings
self.voiceprints = VoiceprintStore(
fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json"))
SessionController.shared = self
detector.onCallStart = { [weak self] app in self?.handleCallStart(app) }
detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
detector.$status
.sink { [weak self] status in self?.detectionStatus = status }
.store(in: &cancellables)
// Re-point the voiceprint DB if the output folder changes. The in-flight
// pipeline keeps its own captured reference, so this can't disrupt a run.
settings.$outputFolderPath
.dropFirst()
.sink { [weak self] path in
guard let self else { return }
let dir = URL(fileURLWithPath: (path as NSString).expandingTildeInPath, isDirectory: true)
self.voiceprints = VoiceprintStore(fileURL: dir.appendingPathComponent("voiceprints.json"))
}
.store(in: &cancellables)
settings.$autoRecordOnDetection
.sink { [weak self] on in
guard let self else { return }
if on {
self.detector.enable()
} else {
self.detector.disable()
// Don't leave an auto-started session running with no detector
// handle both .recording and the in-flight .starting case.
if self.autoStarted {
switch self.state {
case .recording: self.stop()
case .starting: self.pendingAutoStop = true
default: break
}
}
}
}
.store(in: &cancellables)
}
// MARK: - Auto-detection
private func handleCallStart(_ app: CallDetector.DetectedApp) {
guard settings.autoRecordOnDetection else { return }
switch state {
case .idle, .error: start(label: app.label, auto: true)
case .starting, .recording, .finishing: break // don't disturb an active session
}
}
private func handleCallEnd() {
// Only auto-stop a session we auto-started; never a manual recording.
guard autoStarted else { return }
switch state {
case .recording: stop()
case .starting: pendingAutoStop = true // resolved when start() completes
case .idle, .error, .finishing: break
}
}
var isBusy: Bool {
@@ -68,15 +156,18 @@ final class SessionController: ObservableObject {
// MARK: - Start / Stop
private func start() {
private func start(label: String = "manual", auto: Bool = false) {
let folder: URL
do {
folder = try makeSessionFolder()
folder = try makeSessionFolder(label: label)
} catch {
fail("Couldn't create session folder: \(error.localizedDescription)")
return
}
currentFolder = folder
currentLabel = label
autoStarted = auto
pendingAutoStop = false
let recorder = AudioRecorder(
micURL: folder.appendingPathComponent("mic.wav"),
systemURL: folder.appendingPathComponent("system.wav"),
@@ -92,12 +183,36 @@ final class SessionController: ObservableObject {
self.state = .recording
self.startTime = Date()
self.startTimer()
// A detected call may have ended while we were still starting.
if self.pendingAutoStop {
self.pendingAutoStop = false
self.stop()
}
} catch {
self.fail("Couldn't start recording: \(error.localizedDescription)")
self.handleStartFailure(error)
}
}
}
/// Map a recorder start failure to an actionable message. The common case is
/// Screen Recording getting re-checked after a rebuild (the SCStream auth
/// check fails even though CGPreflight reports granted), so re-prompt and open
/// the right Settings pane rather than show a cryptic TCC error.
private func handleStartFailure(_ error: Error) {
let msg = error.localizedDescription.lowercased()
let screenIssue = msg.contains("declined") || msg.contains("tcc")
|| msg.contains("screen") || msg.contains("permission")
if screenIssue {
_ = CGRequestScreenCaptureAccess()
if let url = URL(string: "x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture") {
NSWorkspace.shared.open(url)
}
fail("Screen Recording needs re-approval for this build. Toggle Ten31Transcripts off then on in System Settings ▸ Screen Recording, then restart the app.")
} else {
fail("Couldn't start recording: \(error.localizedDescription)")
}
}
private func stop() {
guard let recorder else { return }
state = .finishing
@@ -114,20 +229,66 @@ final class SessionController: ObservableObject {
micLevel = 0
systemLevel = 0
warning = result.systemNote.map { "System audio stopped early: \($0)" }
transcriptStatus = .idle
if let folder = currentFolder {
writeSelfSpans(result, to: folder)
lastSession = SessionInfo(
folder: folder, mixedURL: result.mixedURL,
duration: result.duration, selfSpanCount: result.selfSpans.count)
lastProcess = ProcessInputs(
folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
mixedURL: result.mixedURL, selfSpans: result.selfSpans)
}
let autoSend = settings.autoSendOnStop
currentFolder = nil
autoStarted = false
pendingAutoStop = false
elapsed = 0
state = .idle
if autoSend { processLastSession() }
}
// MARK: - Backend transcription
/// Send the last finished session to the backend `speakers.json`. Uses the
/// mic-VAD self spans as the timeline for now; visual segments (Phase 34) get
/// merged in once the adapters land. Safe to call manually ("Send to backend")
/// or automatically on stop.
func processLastSession() {
guard let inputs = lastProcess else { return }
if case .processing = transcriptStatus { return }
transcriptStatus = .processing(0, 1)
let settings = self.settings
let voiceprints = self.voiceprints
processTask = Task {
let pipeline = TranscriptPipeline(
baseURL: settings.backendBaseURL,
skipTLS: settings.skipTLSVerification,
voiceprints: voiceprints)
let timeline = TranscriptPipeline.timeline(
fromSelfSpans: inputs.selfSpans, selfName: settings.selfName)
do {
let speakers = try await pipeline.process(
sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
mixedURL: inputs.mixedURL, timeline: timeline,
progress: { done, total in
await MainActor.run { self.transcriptStatus = .processing(done, total) }
})
self.transcriptStatus = .done(speakers: speakers.speakers.count, segments: speakers.segments.count)
} catch is CancellationError {
self.transcriptStatus = .idle
} catch {
self.transcriptStatus = .failed(error.localizedDescription)
}
}
}
private func fail(_ message: String) {
recorder = nil
currentFolder = nil
autoStarted = false
pendingAutoStop = false
stopTimer()
micLevel = 0
systemLevel = 0
@@ -139,6 +300,9 @@ final class SessionController: ObservableObject {
/// its WAV headers are finalized before the process exits. Handles quit while
/// `.starting` and `.finishing`, not just `.recording`.
func prepareForTermination() async {
// Cancel any in-flight backend transcription (audio is already saved; the
// user can resend). The pipeline's checkCancellation + defer clean up chunks.
processTask?.cancel()
// Drain whatever lifecycle Task is in flight until nothing is busy. A Stop
// click landing in an await window can spawn a new stop Task, so loop
// rather than awaiting a single captured task.
@@ -178,9 +342,9 @@ final class SessionController: ObservableObject {
// MARK: - Files
private func makeSessionFolder() throws -> URL {
private func makeSessionFolder(label: String) throws -> URL {
let base = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true)
let folder = base.appendingPathComponent("\(Self.timestamp())_manual", isDirectory: true)
let folder = base.appendingPathComponent("\(Self.timestamp())_\(label)", isDirectory: true)
try FileManager.default.createDirectory(at: folder, withIntermediateDirectories: true)
return folder
}
@@ -0,0 +1,85 @@
import Foundation
import AVFoundation
/// Splits a long session into backend-sized chunks and produces, per chunk, the
/// sliced audio and the timeline rebased to chunk-local seconds.
///
/// The diarizer caps at 4 speakers/chunk and has request limits, so calls > ~3
/// min are chunked into ~23 min windows; names + voiceprints unify speakers
/// across chunks (handled in the pipeline).
enum SessionPackager {
struct PlannedChunk: Equatable {
let index: Int
let start: Double // global seconds
let end: Double
}
/// One chunk if short; otherwise even ~`chunkSeconds` windows.
static func planChunks(durationSec: Double,
chunkSeconds: Double = 150,
thresholdSec: Double = 180) -> [PlannedChunk] {
guard durationSec > thresholdSec else {
return [PlannedChunk(index: 0, start: 0, end: durationSec)]
}
var chunks: [PlannedChunk] = []
var start = 0.0
var index = 0
while start < durationSec - 0.001 {
let end = min(start + chunkSeconds, durationSec)
chunks.append(PlannedChunk(index: index, start: start, end: end))
start = end
index += 1
}
return chunks
}
/// Clip segments to `[start, end)` and rebase to chunk-local seconds, then
/// emit the flat `label-merge` array `[{start,end,name,confidence}]`.
static func rebasedTimelineData(_ segments: [VisualTimeline.Segment],
start: Double, end: Double) throws -> Data {
let flat: [[String: Any]] = segments.compactMap { seg in
let s = max(seg.start, start)
let e = min(seg.end, end)
guard e > s else { return nil }
return ["start": s - start, "end": e - start, "name": seg.name, "confidence": seg.confidence]
}
return try JSONSerialization.data(withJSONObject: flat, options: [])
}
/// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
let input = try AVAudioFile(forReading: source)
let sr = input.fileFormat.sampleRate
let startFrame = AVAudioFramePosition((startSec * sr).rounded())
let endFrame = min(input.length, AVAudioFramePosition((endSec * sr).rounded()))
guard endFrame > startFrame else { return }
let settings: [String: Any] = [
AVFormatIDKey: kAudioFormatLinearPCM,
AVSampleRateKey: sr,
AVNumberOfChannelsKey: 1,
AVLinearPCMBitDepthKey: 16,
AVLinearPCMIsFloatKey: false,
AVLinearPCMIsBigEndianKey: false,
]
let output = try AVAudioFile(forWriting: dest, settings: settings,
commonFormat: .pcmFormatFloat32, interleaved: false)
input.framePosition = startFrame
var remaining = AVAudioFrameCount(endFrame - startFrame)
let block: AVAudioFrameCount = 16_000
while remaining > 0 {
let n = min(block, remaining)
guard let buffer = AVAudioPCMBuffer(pcmFormat: input.processingFormat, frameCapacity: n) else { break }
try input.read(into: buffer, frameCount: n)
if buffer.frameLength == 0 { break }
try output.write(from: buffer)
remaining -= buffer.frameLength
}
}
/// Duration (seconds) of a WAV.
static func duration(of url: URL) -> Double {
guard let file = try? AVAudioFile(forReading: url), file.fileFormat.sampleRate > 0 else { return 0 }
return Double(file.length) / file.fileFormat.sampleRate
}
}
@@ -0,0 +1,45 @@
import Foundation
/// `speakers.json` the final stored output (docs §6): per-chunk `label-merge`
/// results concatenated, timestamps offset back to global seconds, names unified.
/// This is the hand-off to the downstream summarizer; the app stops here.
struct SpeakersFile: Codable {
let sessionId: String
let app: String
let durationSec: Double
let speakers: [Speaker]
let segments: [Segment]
let models: [String: String]
struct Speaker: Codable, Equatable {
let name: String
let source: String
let overlapConfidence: Double?
let matchSimilarity: Double?
enum CodingKeys: String, CodingKey {
case name, source
case overlapConfidence = "overlap_confidence"
case matchSimilarity = "match_similarity"
}
}
struct Segment: Codable, Equatable {
let start: Double
let end: Double
let speaker: String
let text: String?
}
enum CodingKeys: String, CodingKey {
case sessionId = "session_id"
case app
case durationSec = "duration_sec"
case speakers, segments, models
}
func write(to url: URL) throws {
let encoder = JSONEncoder()
encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
try encoder.encode(self).write(to: url)
}
}
@@ -0,0 +1,78 @@
import Foundation
/// Concatenates per-chunk `label-merge` results into one global `speakers.json`:
/// segment times offset back to global seconds, speakers unified across chunks by
/// name, and fingerprints collected for the voiceprint store.
enum TranscriptAssembler {
struct ChunkResult {
let chunkStart: Double // global seconds
let response: LabelMergeResponse
}
struct Assembled {
let speakersFile: SpeakersFile
let fingerprints: [String: [Float]] // name -> 192-dim, for VoiceprintStore
}
/// Source ranking when the same name appears across chunks with different sources.
private static func rank(_ source: String) -> Int {
switch source {
case "visual": return 3
case "voiceprint": return 2
default: return 1 // unmatched
}
}
private static func isUnknown(_ name: String) -> Bool {
LabelMergeResponse.isUnknownName(name)
}
static func assemble(sessionId: String, app: String, chunks: [ChunkResult]) -> Assembled {
var segments: [SpeakersFile.Segment] = []
var bestSpeaker: [String: SpeakersFile.Speaker] = [:]
var fingerprints: [String: [Float]] = [:]
var models: [String: String] = [:]
var duration = 0.0
for chunk in chunks {
let offset = chunk.chunkStart
// Audio length from the chunk window, so silent/all-unknown calls still
// report a real duration (not just the last segment's end).
duration = max(duration, offset + chunk.response.duration)
for seg in chunk.response.segments {
let start = seg.startSeconds + offset
let end = seg.endSeconds + offset
segments.append(.init(start: start, end: end, speaker: seg.speaker, text: seg.text))
duration = max(duration, end)
}
for sp in chunk.response.speakers {
let candidate = SpeakersFile.Speaker(
name: sp.name, source: sp.source,
overlapConfidence: sp.overlapConfidence, matchSimilarity: sp.matchSimilarity)
if let existing = bestSpeaker[sp.name] {
if rank(sp.source) > rank(existing.source) { bestSpeaker[sp.name] = candidate }
} else {
bestSpeaker[sp.name] = candidate
}
// Collect named fingerprints only (never Unknown_N / Speaker_unknown).
if !isUnknown(sp.name), let fp = sp.fingerprint, fp.count > 0 {
fingerprints[sp.name] = fp
}
}
for (name, fp) in chunk.response.fingerprints where !isUnknown(name) && fp.count > 0 {
fingerprints[name] = fp
}
}
segments.sort { $0.start < $1.start }
let speakers = bestSpeaker.values.sorted { $0.name < $1.name }
models = chunks.last?.response.models ?? [:]
let file = SpeakersFile(
sessionId: sessionId, app: app, durationSec: duration,
speakers: speakers, segments: segments, models: models)
return Assembled(speakersFile: file, fingerprints: fingerprints)
}
}
@@ -0,0 +1,75 @@
import Foundation
/// Drives a finished session through the backend: chunk sequential
/// `label-merge` (accumulating voiceprints) assemble `speakers.json` persist
/// fingerprints. Requests are sequential by construction (one chunk at a time).
final class TranscriptPipeline {
private let client: SparkControlClient
private let voiceprints: VoiceprintStore
init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
self.voiceprints = voiceprints
}
/// Process `mixedURL` against `timeline` (visual + self spans). Writes
/// `speakers.json` into `sessionFolder` and returns it. `progress(done,total)`
/// is called per chunk.
func process(sessionFolder: URL,
sessionId: String,
app: String,
mixedURL: URL,
timeline: [VisualTimeline.Segment],
progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
let duration = SessionPackager.duration(of: mixedURL)
let plan = SessionPackager.planChunks(durationSec: duration)
// Zero-duration / empty session a valid empty speakers.json, no backend call.
if plan.isEmpty || duration <= 0 {
let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
try empty.speakersFile.write(to: sessionFolder.appendingPathComponent("speakers.json"))
await progress?(0, 0)
return empty.speakersFile
}
let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
try? FileManager.default.createDirectory(at: chunksDir, withIntermediateDirectories: true)
defer { try? FileManager.default.removeItem(at: chunksDir) } // cleanup on success OR throw
// Start from stored voiceprints; accumulate this call's prints across chunks
// for within-call unification (the store only persists high-confidence ones).
var known = voiceprints.knownVoiceprints()
var results: [TranscriptAssembler.ChunkResult] = []
for chunk in plan {
try Task.checkCancellation()
await progress?(chunk.index, plan.count)
let chunkURL = chunksDir.appendingPathComponent("chunk_\(String(format: "%03d", chunk.index)).wav")
try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
guard FileManager.default.fileExists(atPath: chunkURL.path) else { continue } // empty slice skip
let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
let response = try await client.labelMerge(
audioURL: chunkURL, timeline: timelineData,
knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
known[name] = fp
}
voiceprints.update(with: response)
results.append(.init(chunkStart: chunk.start, response: response))
try? FileManager.default.removeItem(at: chunkURL)
}
await progress?(plan.count, plan.count)
let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
try assembled.speakersFile.write(to: sessionFolder.appendingPathComponent("speakers.json"))
return assembled.speakersFile
}
/// Build the `label-merge` timeline from mic-VAD self spans (Phase 1/2). Once
/// the visual adapters land (Phase 34), their segments are merged in too.
static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
spans.map { .init(start: $0.start, end: $0.end, name: selfName, confidence: $0.confidence, source: "mic_vad") }
}
}