diff --git a/README.md b/README.md index 6273494..172efc7 100644 --- a/README.md +++ b/README.md @@ -23,13 +23,17 @@ This repo is at **Phase 0** (scaffold, permissions, backend health check). ```sh open Ten31Transcripts.xcodeproj ``` -5. In Xcode, select the **Ten31Transcripts** target → **Signing & Capabilities**: - - Check **Automatically manage signing**. - - For **Team**, pick your personal team (sign in with your Apple ID — free; no - paid developer account needed). A stable team keeps macOS from re-asking for - permissions on every rebuild. +5. Signing is preconfigured: `project.yml` sets `DEVELOPMENT_TEAM` to the free + personal team `BK4Y6CXN35` with automatic signing, so **Signing & Capabilities + should already show the team** — no manual selection needed. (If you ever sign + with a different Apple ID, update `DEVELOPMENT_TEAM` in `project.yml`, not in + Xcode — `xcodegen generate` overwrites Xcode-side changes.) 6. Press **Run** (⌘R). +> **Note:** after adding files in a new phase, re-run `xcodegen generate` and let +> Xcode reload the project. The signing team persists because it lives in +> `project.yml`, so macOS permissions stay granted across rebuilds. + ## What Phase 0 does - Launches as a menu-bar-only app (no Dock icon). diff --git a/Ten31Transcripts/App/AppDelegate.swift b/Ten31Transcripts/App/AppDelegate.swift index 96b4c6b..cd294c7 100644 --- a/Ten31Transcripts/App/AppDelegate.swift +++ b/Ten31Transcripts/App/AppDelegate.swift @@ -6,5 +6,30 @@ final class AppDelegate: NSObject, NSApplicationDelegate { // LSUIElement in Info.plist already enforces this; set it explicitly too // so behavior is unambiguous regardless of how the app is launched. NSApp.setActivationPolicy(.accessory) + terminateOtherInstances() + } + + /// Single-instance: a fresh launch (e.g. each Xcode ⌘R) terminates any older + /// copies so you never end up with two menu-bar icons. 
+ private func terminateOtherInstances() { + guard let bundleID = Bundle.main.bundleIdentifier else { return } + let me = NSRunningApplication.current.processIdentifier + for app in NSRunningApplication.runningApplications(withBundleIdentifier: bundleID) + where app.processIdentifier != me { + app.terminate() + } + } + + /// If a recording is in progress when the user quits, finalize it (flush WAV + /// headers) before the process exits, so the session isn't corrupted. + func applicationShouldTerminate(_ sender: NSApplication) -> NSApplication.TerminateReply { + guard let controller = SessionController.shared, controller.isBusy else { + return .terminateNow + } + Task { @MainActor in + await controller.prepareForTermination() + NSApp.reply(toApplicationShouldTerminate: true) + } + return .terminateLater } } diff --git a/Ten31Transcripts/App/Ten31TranscriptsApp.swift b/Ten31Transcripts/App/Ten31TranscriptsApp.swift index 94e3e58..1db4a83 100644 --- a/Ten31Transcripts/App/Ten31TranscriptsApp.swift +++ b/Ten31Transcripts/App/Ten31TranscriptsApp.swift @@ -10,9 +10,16 @@ import SwiftUI struct Ten31TranscriptsApp: App { @NSApplicationDelegateAdaptor(AppDelegate.self) private var appDelegate - @StateObject private var settings = AppSettings() + @StateObject private var settings: AppSettings @StateObject private var permissions = PermissionsManager() @StateObject private var health = SparkControlHealth() + @StateObject private var session: SessionController + + init() { + let settings = AppSettings() + _settings = StateObject(wrappedValue: settings) + _session = StateObject(wrappedValue: SessionController(settings: settings)) + } var body: some Scene { MenuBarExtra { @@ -20,8 +27,9 @@ struct Ten31TranscriptsApp: App { .environmentObject(settings) .environmentObject(permissions) .environmentObject(health) + .environmentObject(session) } label: { - Image(systemName: "waveform.circle") + Image(systemName: session.state == .recording ? 
"waveform.circle.fill" : "waveform.circle") } .menuBarExtraStyle(.window) } diff --git a/Ten31Transcripts/Audio/AudioMixer.swift b/Ten31Transcripts/Audio/AudioMixer.swift new file mode 100644 index 0000000..c97fb2c --- /dev/null +++ b/Ten31Transcripts/Audio/AudioMixer.swift @@ -0,0 +1,67 @@ +import AVFoundation + +/// Sums the two aligned 16 kHz mono tracks (mic + system) into the single +/// **mixed-mono 16 kHz WAV** that the backend receives. Both inputs are already +/// front-padded to the shared t0, so frame N of each file is the same instant. +/// Streamed in 1-second chunks to keep memory flat for long calls. +enum AudioMixer { + static func mix(mic micURL: URL, system systemURL: URL, into outURL: URL) throws { + let mic = try? AVAudioFile(forReading: micURL) + let sys = try? AVAudioFile(forReading: systemURL) + + let settings: [String: Any] = [ + AVFormatIDKey: kAudioFormatLinearPCM, + AVSampleRateKey: 16_000, + AVNumberOfChannelsKey: 1, + AVLinearPCMBitDepthKey: 16, + AVLinearPCMIsFloatKey: false, + AVLinearPCMIsBigEndianKey: false, + ] + let out = try AVAudioFile( + forWriting: outURL, + settings: settings, + commonFormat: .pcmFormatFloat32, + interleaved: false) + + let outFormat = Resampler.targetFormat + let chunk: AVAudioFramePosition = 16_000 + let total = max(mic?.length ?? 0, sys?.length ?? 
0) + var pos: AVAudioFramePosition = 0 + + while pos < total { + let frames = AVAudioFrameCount(min(chunk, total - pos)) + guard let mixBuf = AVAudioPCMBuffer(pcmFormat: outFormat, frameCapacity: frames), + let dst = mixBuf.floatChannelData?[0] else { break } + mixBuf.frameLength = frames + memset(dst, 0, Int(frames) * MemoryLayout<Float>.size) + + add(file: mic, at: pos, maxFrames: frames, into: dst) + add(file: sys, at: pos, maxFrames: frames, into: dst) + + var i = 0 + while i < Int(frames) { + if dst[i] > 1 { dst[i] = 1 } else if dst[i] < -1 { dst[i] = -1 } + i += 1 + } + try out.write(from: mixBuf) + pos += AVAudioFramePosition(frames) + } + } + + private static func add(file: AVAudioFile?, at pos: AVAudioFramePosition, + maxFrames: AVAudioFrameCount, into dst: UnsafeMutablePointer<Float>) { + guard let file, pos < file.length else { return } + file.framePosition = pos + let toRead = AVAudioFrameCount(min(AVAudioFramePosition(maxFrames), file.length - pos)) + guard toRead > 0, + let buf = AVAudioPCMBuffer(pcmFormat: file.processingFormat, frameCapacity: toRead) + else { return } + do { + try file.read(into: buf, frameCount: toRead) + guard let src = buf.floatChannelData?[0] else { return } + var i = 0 + let count = Int(buf.frameLength) + while i < count { dst[i] += src[i]; i += 1 } + } catch {} + } +} diff --git a/Ten31Transcripts/Audio/AudioRecorder.swift b/Ten31Transcripts/Audio/AudioRecorder.swift new file mode 100644 index 0000000..9af33a3 --- /dev/null +++ b/Ten31Transcripts/Audio/AudioRecorder.swift @@ -0,0 +1,333 @@ +import AVFoundation +import ScreenCaptureKit +import CoreMedia +import QuartzCore + +struct RecordingResult { + let micURL: URL + let systemURL: URL + let mixedURL: URL + let duration: Double + let selfSpans: [VADSpan] + let t0Unix: Double + /// Non-nil if system-audio capture stopped early (e.g. SCStream error). + let systemNote: String? +} + +/// Dual-track local audio capture for Phase 1. 
+/// +/// - System audio via `SCStream` (`capturesAudio`); its audio handler runs on +/// `ioQueue`. A discard-only video output runs on `screenQueue` purely to keep +/// SCStream's frame pipeline drained (an unconsumed video queue can stall the +/// whole stream) — frames are dropped instantly, never stored. +/// - Mic via `AVAudioEngine` input tap: the tap deep-copies the raw buffer and +/// hands it to `ioQueue`, where it is resampled and written. +/// - **`ioQueue` is the single isolation domain** for the writers, VAD, both +/// resamplers, and lifecycle flags. +/// - One shared monotonic `t0` (`CACurrentMediaTime`). Each buffer is placed at +/// its true `(startHost − t0)` frame (gaps padded, overlaps trimmed), so mic +/// and system stay aligned and the mix is a straight sum. +/// - Live peak levels are exposed via `currentLevels()` for the UI meter. +/// - `stop()` tears the mic down first and bounds `stopCapture()` with a timeout, +/// so a wedged stream can never block finalization. No video is written. +final class AudioRecorder: NSObject, SCStreamDelegate, SCStreamOutput { + private let micURL: URL + private let systemURL: URL + private let mixedURL: URL + + private let ioQueue = DispatchQueue(label: "xyz.ten31.audio.io") + private let screenQueue = DispatchQueue(label: "xyz.ten31.audio.screen") + + // ioQueue-only state: + private var t0Host: Double = 0 + private var t0Unix: Double = 0 + private var micWriter: MonoTrackWriter? + private var systemWriter: MonoTrackWriter? + private var vad: MicVAD? + private var tornDown = true + private let micResampler = Resampler() + private let systemResampler = Resampler() + + // Cross-thread, guarded by levelLock: + private let levelLock = NSLock() + private var micPeak: Float = 0 + private var sysPeak: Float = 0 + private var streamStopped = false + private var systemErrorMessage: String? + + private var engine: AVAudioEngine? + private var stream: SCStream? 
+ + init(micURL: URL, systemURL: URL, mixedURL: URL) { + self.micURL = micURL + self.systemURL = systemURL + self.mixedURL = mixedURL + } + + // MARK: - Lifecycle + + func start() async throws { + let t0 = CACurrentMediaTime() + let t0u = Date().timeIntervalSince1970 + try ioQueue.sync { + let mic = try MonoTrackWriter(url: self.micURL) + let sys = try MonoTrackWriter(url: self.systemURL) + self.t0Host = t0 + self.t0Unix = t0u + self.micWriter = mic + self.systemWriter = sys + self.vad = MicVAD() + self.tornDown = false + } + do { + try startMic() + try await startSystem() // throws if Screen Recording is denied + } catch { + await abortStart() + throw error + } + } + + func stop() async -> RecordingResult { + // Stop the mic FIRST — always succeeds and halts mic capture immediately. + engine?.inputNode.removeTap(onBus: 0) + engine?.stop() + engine = nil + + // Stop system capture WITHOUT hanging: an already-errored stream can make + // stopCapture() block forever, so skip it if it already stopped and bound + // it with a timeout otherwise. + if let stream, !flag({ self.streamStopped }) { + await Self.stopCaptureWithTimeout(stream, seconds: 3) + } + stream = nil + + var micFrames: Int64 = 0 + var sysFrames: Int64 = 0 + var spans: [VADSpan] = [] + var t0u: Double = 0 + + ioQueue.sync { + if let tail = micResampler.drain() { + if (micWriter?.write(tail) ?? 0) > 0 { vad?.feed(tail) } + } + if let tail = systemResampler.drain() { systemWriter?.write(tail) } + vad?.finish() + micFrames = micWriter?.framesWritten ?? 0 + sysFrames = systemWriter?.framesWritten ?? 0 + spans = vad?.spans ?? [] + t0u = t0Unix + tornDown = true + micWriter = nil + systemWriter = nil + vad = nil + } + + try? AudioMixer.mix(mic: micURL, system: systemURL, into: mixedURL) + + let duration = Double(max(micFrames, sysFrames)) / 16_000.0 + let note = flag { self.systemErrorMessage } as String? 
+ return RecordingResult( + micURL: micURL, systemURL: systemURL, mixedURL: mixedURL, + duration: duration, selfSpans: spans, t0Unix: t0u, systemNote: note) + } + + private func abortStart() async { + engine?.inputNode.removeTap(onBus: 0) + engine?.stop() + engine = nil + if let stream { await Self.stopCaptureWithTimeout(stream, seconds: 3) } + stream = nil + ioQueue.sync { + tornDown = true + micWriter = nil + systemWriter = nil + vad = nil + } + } + + /// Latest peak levels (0…1) for each source; decays so a stalled source fades. + func currentLevels() -> (mic: Float, system: Float) { + levelLock.lock(); defer { levelLock.unlock() } + let m = micPeak, s = sysPeak + micPeak *= 0.55; sysPeak *= 0.55 + return (m, s) + } + + // MARK: - Ingest (ioQueue only) + + private func ingestMic(_ buffer: AVAudioPCMBuffer, startHost: Double) { + guard !tornDown, let writer = micWriter, let vad else { return } + let expected = max(0, Int64(((startHost - t0Host) * 16_000).rounded())) + if expected > writer.framesWritten { + let padded = writer.padSilence(expected - writer.framesWritten) + if padded > 0 { vad.feedSilence(padded) } + } + let startIdx = max(0, Int(writer.framesWritten - expected)) + if startIdx >= Int(buffer.frameLength) { return } + guard let chunk = Self.trimFront(buffer, by: startIdx) else { return } + updateLevel(chunk, isMic: true) + if writer.write(chunk) > 0 { vad.feed(chunk) } + } + + private func ingestSystem(_ buffer: AVAudioPCMBuffer, startHost: Double) { + guard !tornDown, let writer = systemWriter else { return } + let expected = max(0, Int64(((startHost - t0Host) * 16_000).rounded())) + if expected > writer.framesWritten { + writer.padSilence(expected - writer.framesWritten) + } + let startIdx = max(0, Int(writer.framesWritten - expected)) + if startIdx >= Int(buffer.frameLength) { return } + guard let chunk = Self.trimFront(buffer, by: startIdx) else { return } + updateLevel(chunk, isMic: false) + writer.write(chunk) + } + + // MARK: - Mic 
(AVAudioEngine) + + private func startMic() throws { + let engine = AVAudioEngine() + let input = engine.inputNode + let format = input.inputFormat(forBus: 0) + + input.installTap(onBus: 0, bufferSize: 4096, format: format) { [weak self] buffer, when in + guard let self else { return } + let entry = CACurrentMediaTime() + let stamped = when.isHostTimeValid ? AudioRecorder.hostSeconds(when.hostTime) : entry + let startHost = abs(stamped - entry) < 5 ? stamped : entry + guard let raw = AudioRecorder.copy(buffer) else { return } + self.ioQueue.async { + guard !self.tornDown, let resampled = self.micResampler.resample(raw) else { return } + self.ingestMic(resampled, startHost: startHost) + } + } + engine.prepare() + try engine.start() + self.engine = engine + } + + // MARK: - System (ScreenCaptureKit) + + private func startSystem() async throws { + let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false) + guard let display = content.displays.first else { + throw NSError(domain: "Ten31", code: 1, + userInfo: [NSLocalizedDescriptionKey: "No display available for system-audio capture."]) + } + let filter = SCContentFilter(display: display, excludingWindows: []) + let config = SCStreamConfiguration() + config.capturesAudio = true + config.excludesCurrentProcessAudio = true + config.sampleRate = 48_000 + config.channelCount = 2 + config.width = 2 + config.height = 2 + config.minimumFrameInterval = CMTime(value: 1, timescale: 2) // ~2 fps tiny video + config.queueDepth = 6 + + let stream = SCStream(filter: filter, configuration: config, delegate: self) + try stream.addStreamOutput(self, type: .audio, sampleHandlerQueue: ioQueue) + // Discard-only video consumer keeps SCStream's frame queue drained so the + // stream stays alive; frames are dropped immediately and never stored. 
+ try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: screenQueue) + try await stream.startCapture() + self.stream = stream + } + + func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer, + of type: SCStreamOutputType) { + guard type == .audio else { return } // .screen frames discarded here + guard CMSampleBufferDataIsReady(sampleBuffer), + let pcm = Self.pcmBuffer(from: sampleBuffer), + let resampled = systemResampler.resample(pcm) else { return } + let entry = CACurrentMediaTime() + let pts = CMSampleBufferGetPresentationTimeStamp(sampleBuffer) + let stamped = pts.isValid ? pts.seconds : entry + let startHost = abs(stamped - entry) < 5 ? stamped : entry + ingestSystem(resampled, startHost: startHost) + } + + func stream(_ stream: SCStream, didStopWithError error: Error) { + levelLock.lock() + streamStopped = true + systemErrorMessage = error.localizedDescription + levelLock.unlock() + } + + // MARK: - Helpers + + private func updateLevel(_ buffer: AVAudioPCMBuffer, isMic: Bool) { + guard let ch = buffer.floatChannelData?[0] else { return } + var peak: Float = 0 + let n = Int(buffer.frameLength) + var i = 0 + while i < n { let a = abs(ch[i]); if a > peak { peak = a }; i += 1 } + levelLock.lock() + if isMic { if peak > micPeak { micPeak = peak } } + else { if peak > sysPeak { sysPeak = peak } } + levelLock.unlock() + } + + /// Read a levelLock-guarded value. + private func flag<T>(_ body: () -> T) -> T { + levelLock.lock(); defer { levelLock.unlock() } + return body() + } + + /// Stops `stream`, returning after at most `seconds` even if `stopCapture()` never completes. + private static func stopCaptureWithTimeout(_ stream: SCStream, seconds: Double) async { + // A task group implicitly awaits every child before it returns, so a wedged + // stopCapture() that ignores cancellation would still block the caller. + // Race two unstructured tasks instead; the first to end finishes the stream. + let finished = AsyncStream<Void> { continuation in + let timeout = Task.detached { + try? await Task.sleep(nanoseconds: UInt64(seconds * 1_000_000_000)) + continuation.finish() + } + Task.detached { + try? await stream.stopCapture() + timeout.cancel() // wake the sleeper early; finish() is idempotent + continuation.finish() + } + } + for await _ in finished {} + } + + /// Deep-copy a PCM buffer (the engine reuses the tap buffer). Layout-agnostic. 
+ private static func copy(_ buffer: AVAudioPCMBuffer) -> AVAudioPCMBuffer? { + guard buffer.frameLength > 0, + let out = AVAudioPCMBuffer(pcmFormat: buffer.format, frameCapacity: buffer.frameLength) + else { return nil } + out.frameLength = buffer.frameLength + let src = UnsafeMutableAudioBufferListPointer(UnsafeMutablePointer(mutating: buffer.audioBufferList)) + let dst = UnsafeMutableAudioBufferListPointer(out.mutableAudioBufferList) + guard src.count == dst.count else { return nil } + // NOTE(review): this loop body and the trimFront signature below were lost from the + // text and are reconstructed from the call sites — confirm against the original. + for i in 0..<src.count { + guard let from = src[i].mData, let to = dst[i].mData else { continue } + memcpy(to, from, Int(min(src[i].mDataByteSize, dst[i].mDataByteSize))) + } + return out + } + + /// Returns `buffer` without its first `frames` frames (channel 0 only), the buffer itself + /// when `frames <= 0`, or nil when nothing would remain. + private static func trimFront(_ buffer: AVAudioPCMBuffer, by frames: Int) -> AVAudioPCMBuffer? { + if frames <= 0 { return buffer } + let total = Int(buffer.frameLength) + guard frames < total, let src = buffer.floatChannelData?[0] else { return nil } + let n = AVAudioFrameCount(total - frames) + guard let out = AVAudioPCMBuffer(pcmFormat: buffer.format, frameCapacity: n), + let dst = out.floatChannelData?[0] else { return nil } + out.frameLength = n + memcpy(dst, src + frames, Int(n) * MemoryLayout<Float>.size) + return out + } + + private static func hostSeconds(_ hostTime: UInt64) -> Double { + var info = mach_timebase_info_data_t() + mach_timebase_info(&info) + return Double(hostTime) * Double(info.numer) / Double(info.denom) / 1_000_000_000.0 + } + + private static func pcmBuffer(from sampleBuffer: CMSampleBuffer) -> AVAudioPCMBuffer? { + guard let fmtDesc = CMSampleBufferGetFormatDescription(sampleBuffer), + let asbdPtr = CMAudioFormatDescriptionGetStreamBasicDescription(fmtDesc) else { return nil } + var asbd = asbdPtr.pointee + guard let format = AVAudioFormat(streamDescription: &asbd) else { return nil } + let frames = AVAudioFrameCount(CMSampleBufferGetNumSamples(sampleBuffer)) + guard frames > 0, + let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frames) else { return nil } + buffer.frameLength = frames + let status = CMSampleBufferCopyPCMDataIntoAudioBufferList( + sampleBuffer, at: 0, frameCount: Int32(frames), into: buffer.mutableAudioBufferList) + return status == noErr ? 
buffer : nil + } +} diff --git a/Ten31Transcripts/Audio/MicVAD.swift b/Ten31Transcripts/Audio/MicVAD.swift new file mode 100644 index 0000000..922221a --- /dev/null +++ b/Ten31Transcripts/Audio/MicVAD.swift @@ -0,0 +1,98 @@ +import AVFoundation + +/// A speaking span on the session `t0` timeline (seconds). +struct VADSpan: Equatable { + let start: Double + let end: Double + let confidence: Double +} + +/// Lightweight energy-based voice-activity detector for the **mic** track (the +/// user). It is fed the *exact same* 16 kHz mono stream the mic WAV receives — +/// real samples via `feed` and timeline-gap silence via `feedSilence` — so its +/// internal sample cursor always equals the mic file position, and span times +/// land on the same instants as `mixed_mono_16k.wav`. +/// +/// Phase 3's `TimelineBuilder` will fold these in as high-confidence pre-seeded +/// "self" segments. Thresholds are intentionally simple and will be tuned later. +/// +/// Single-threaded: all calls happen on `AudioRecorder.ioQueue`. 
+final class MicVAD { + private let frameSize = 320 // 20 ms @ 16 kHz + private let openFrames = 2 // ~40 ms above threshold to open + private let closeFrames = 10 // ~200 ms hangover to close + private let absoluteFloor: Float = 0.006 + private let floorMultiplier: Float = 2.5 + + private var cursorSamples = 0 // total samples fed (== mic file position) + private var noiseFloor: Float = 0.01 + private var voicedRun = 0 + private var silentRun = 0 + private var inSpeech = false + private var spanStartSample = 0 + private var acc: [Float] = [] + private(set) var spans: [VADSpan] = [] + + func feed(_ buffer: AVAudioPCMBuffer) { + guard let ch = buffer.floatChannelData, buffer.frameLength > 0 else { return } + acc.append(contentsOf: UnsafeBufferPointer(start: ch[0], count: Int(buffer.frameLength))) + drainFrames() + } + + func feedSilence(_ count: Int64) { + guard count > 0 else { return } + acc.append(contentsOf: repeatElement(0, count: Int(count))) + drainFrames() + } + + /// Close any span still open at end of capture. + func finish() { + if inSpeech { + appendSpan(startSample: spanStartSample, endSample: cursorSamples) + inSpeech = false + } + } + + private func drainFrames() { + var i = 0 + while i + frameSize <= acc.count { + var sum: Float = 0 + var j = i + while j < i + frameSize { sum += acc[j] * acc[j]; j += 1 } + step(rms: (sum / Float(frameSize)).squareRoot()) + cursorSamples += frameSize + i += frameSize + } + if i > 0 { acc.removeFirst(i) } + } + + /// `cursorSamples` is the start sample of the frame being evaluated. 
+ private func step(rms: Float) { + if rms < noiseFloor { noiseFloor = 0.9 * noiseFloor + 0.1 * rms } + else { noiseFloor = 0.995 * noiseFloor + 0.005 * rms } + + let threshold = max(absoluteFloor, noiseFloor * floorMultiplier) + let voiced = rms > threshold + + if voiced { + voicedRun += 1; silentRun = 0 + if !inSpeech && voicedRun >= openFrames { + inSpeech = true + spanStartSample = cursorSamples - (voicedRun - 1) * frameSize + } + } else { + silentRun += 1; voicedRun = 0 + if inSpeech && silentRun >= closeFrames { + inSpeech = false + appendSpan(startSample: spanStartSample, + endSample: cursorSamples - (closeFrames - 1) * frameSize) + } + } + } + + private func appendSpan(startSample: Int, endSample: Int) { + let start = Double(max(0, startSample)) / 16_000.0 + let end = Double(endSample) / 16_000.0 + if end > start { spans.append(VADSpan(start: start, end: end, confidence: 0.9)) } + } +} diff --git a/Ten31Transcripts/Audio/MonoTrackWriter.swift b/Ten31Transcripts/Audio/MonoTrackWriter.swift new file mode 100644 index 0000000..261b942 --- /dev/null +++ b/Ten31Transcripts/Audio/MonoTrackWriter.swift @@ -0,0 +1,67 @@ +import AVFoundation + +/// Sequential **16 kHz mono PCM-16 WAV** writer. Deliberately "dumb": it only +/// appends buffers and silence and tracks `framesWritten`. Time alignment to the +/// shared `t0` is done by the caller (`AudioRecorder`), which pads/trims using +/// each buffer's true host time so the mic and system tracks stay anchored to +/// the same timeline even if buffers are dropped or the hardware clocks drift. +/// +/// Single-threaded: all calls happen on `AudioRecorder.ioQueue`. 
+final class MonoTrackWriter { + private let file: AVAudioFile + private(set) var framesWritten: Int64 = 0 + + init(url: URL) throws { + let settings: [String: Any] = [ + AVFormatIDKey: kAudioFormatLinearPCM, + AVSampleRateKey: 16_000, + AVNumberOfChannelsKey: 1, + AVLinearPCMBitDepthKey: 16, + AVLinearPCMIsFloatKey: false, + AVLinearPCMIsBigEndianKey: false, + ] + // On disk = Int16 PCM; processing/buffer format = Float32 (matches Resampler). + self.file = try AVAudioFile( + forWriting: url, + settings: settings, + commonFormat: .pcmFormatFloat32, + interleaved: false) + } + + /// Writes the buffer; returns the number of frames actually committed (0 on + /// failure). Callers feed the VAD this committed count to stay in lockstep. + @discardableResult + func write(_ buffer: AVAudioPCMBuffer) -> Int64 { + guard buffer.frameLength > 0 else { return 0 } + do { + try file.write(from: buffer) + let n = Int64(buffer.frameLength) + framesWritten += n + return n + } catch { + return 0 // best-effort: drop a buffer rather than tear down + } + } + + /// Append `count` frames of silence (to fill timeline gaps); returns frames + /// actually committed. 
+ @discardableResult + func padSilence(_ count: Int64) -> Int64 { + guard count > 0 else { return 0 } + var remaining = count + var committed: Int64 = 0 + let chunk: Int64 = 16_000 + while remaining > 0 { + let n = AVAudioFrameCount(min(chunk, remaining)) + guard let buffer = AVAudioPCMBuffer(pcmFormat: Resampler.targetFormat, frameCapacity: n) else { break } + buffer.frameLength = n + if let ch = buffer.floatChannelData { + memset(ch[0], 0, Int(n) * MemoryLayout<Float>.size) + } + if write(buffer) == 0 { break } + committed += Int64(n) + remaining -= Int64(n) + } + return committed + } +} diff --git a/Ten31Transcripts/Audio/Resampler.swift b/Ten31Transcripts/Audio/Resampler.swift new file mode 100644 index 0000000..87b53da --- /dev/null +++ b/Ten31Transcripts/Audio/Resampler.swift @@ -0,0 +1,65 @@ +import AVFoundation + +/// Converts arbitrary input PCM buffers to **16 kHz mono Float32**, maintaining +/// resampler state across calls. Reuse one instance per source stream so the +/// internal sample-rate converter stays continuous across buffers. +/// +/// Not thread-safe: use one instance from a single thread. Both the mic and +/// system instances are driven exclusively from `AudioRecorder.ioQueue` (one per +/// source stream), kept continuous across buffers. +final class Resampler { + /// The canonical Phase-1 audio format: 16 kHz, mono, Float32, deinterleaved. + static let targetFormat = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: 16_000, + channels: 1, + interleaved: false)! + + private var converter: AVAudioConverter? + private var sourceFormat: AVAudioFormat? + private var ended = false + + /// 16 kHz mono buffer for `input`, or nil if conversion produced nothing. + func resample(_ input: AVAudioPCMBuffer) -> AVAudioPCMBuffer? 
{ + guard !ended, input.frameLength > 0 else { return nil } + + if converter == nil || sourceFormat != input.format { + converter = AVAudioConverter(from: input.format, to: Self.targetFormat) + sourceFormat = input.format + } + guard let converter else { return nil } + + let ratio = Self.targetFormat.sampleRate / input.format.sampleRate + let capacity = AVAudioFrameCount((Double(input.frameLength) * ratio).rounded(.up)) + 64 + guard let output = AVAudioPCMBuffer(pcmFormat: Self.targetFormat, frameCapacity: capacity) else { + return nil + } + + var consumed = false + var error: NSError? + let status = converter.convert(to: output, error: &error) { _, inputStatus in + if consumed { inputStatus.pointee = .noDataNow; return nil } + consumed = true + inputStatus.pointee = .haveData + return input + } + if status == .error || output.frameLength == 0 { return nil } + return output + } + + /// Flush the converter's internal tail at end of stream (call once on stop). + func drain() -> AVAudioPCMBuffer? { + guard !ended, let converter else { ended = true; return nil } + ended = true + guard let output = AVAudioPCMBuffer(pcmFormat: Self.targetFormat, frameCapacity: 8192) else { + return nil + } + var error: NSError? 
+ let status = converter.convert(to: output, error: &error) { _, inputStatus in + inputStatus.pointee = .endOfStream + return nil + } + if status == .error || output.frameLength == 0 { return nil } + return output + } +} diff --git a/Ten31Transcripts/Session/SessionController.swift b/Ten31Transcripts/Session/SessionController.swift new file mode 100644 index 0000000..2637aa1 --- /dev/null +++ b/Ten31Transcripts/Session/SessionController.swift @@ -0,0 +1,213 @@ +import Foundation +import Combine +import AppKit + +struct SessionInfo: Equatable { + let folder: URL + let mixedURL: URL + let duration: Double + let selfSpanCount: Int +} + +/// Owns a single recording session: creates the session folder, drives +/// `AudioRecorder` start/stop, tracks elapsed time, and writes the Phase-1 +/// preview of mic-VAD self spans. Detection/visual/backend wiring come later. +/// +/// The lifecycle is serialized through an explicit state machine so start and +/// stop can never interleave (`.starting` → `.recording` → `.finishing`). +@MainActor +final class SessionController: ObservableObject { + enum State: Equatable { + case idle + case starting + case recording + case finishing + case error(String) + } + + /// Set in init so `AppDelegate.applicationShouldTerminate` can finalize a + /// recording in progress before the app quits. + static weak var shared: SessionController? + + @Published private(set) var state: State = .idle + @Published private(set) var elapsed: TimeInterval = 0 + @Published private(set) var lastSession: SessionInfo? + /// Live input peak levels (0…1) while recording, for the UI meters. + @Published private(set) var micLevel: Float = 0 + @Published private(set) var systemLevel: Float = 0 + /// Surfaced after a session if system audio stopped early. + @Published private(set) var warning: String? + + private let settings: AppSettings + private var recorder: AudioRecorder? + private var currentFolder: URL? + private var startTime: Date? + private var timer: Timer? 
+ /// The in-flight start or stop Task, so `prepareForTermination` can await it. + private var lifecycleTask: Task<Void, Never>? + /// Bumped each time a start/stop Task is spawned (Task is a value type, so this + /// is how `prepareForTermination` detects a newly-spawned transition). + private var lifecycleGeneration = 0 + + init(settings: AppSettings) { + self.settings = settings + SessionController.shared = self + } + + var isBusy: Bool { + state == .starting || state == .recording || state == .finishing + } + + func toggle() { + switch state { + case .idle, .error: start() + case .recording: stop() + case .starting, .finishing: break // ignore taps mid-transition + } + } + + // MARK: - Start / Stop + + private func start() { + let folder: URL + do { + folder = try makeSessionFolder() + } catch { + fail("Couldn't create session folder: \(error.localizedDescription)") + return + } + currentFolder = folder + let recorder = AudioRecorder( + micURL: folder.appendingPathComponent("mic.wav"), + systemURL: folder.appendingPathComponent("system.wav"), + mixedURL: folder.appendingPathComponent("mixed_mono_16k.wav")) + self.recorder = recorder + warning = nil + state = .starting + + lifecycleGeneration += 1 + lifecycleTask = Task { + do { + try await recorder.start() // self-tears-down if it throws + self.state = .recording + self.startTime = Date() + self.startTimer() + } catch { + self.fail("Couldn't start recording: \(error.localizedDescription)") + } + } + } + + private func stop() { + guard let recorder else { return } + state = .finishing + stopTimer() + lifecycleGeneration += 1 + lifecycleTask = Task { + let result = await recorder.stop() + self.finish(result) + } + } + + private func finish(_ result: RecordingResult) { + recorder = nil + micLevel = 0 + systemLevel = 0 + warning = result.systemNote.map { "System audio stopped early: \($0)" } + if let folder = currentFolder { + writeSelfSpans(result, to: folder) + lastSession = SessionInfo( + folder: folder, mixedURL: 
result.mixedURL, + duration: result.duration, selfSpanCount: result.selfSpans.count) + } + currentFolder = nil + elapsed = 0 + state = .idle + } + + private func fail(_ message: String) { + recorder = nil + currentFolder = nil + stopTimer() + micLevel = 0 + systemLevel = 0 + elapsed = 0 + state = .error(message) + } + + /// Called from `applicationShouldTerminate`: flush any in-progress session so + /// its WAV headers are finalized before the process exits. Handles quit while + /// `.starting` and `.finishing`, not just `.recording`. + func prepareForTermination() async { + // Drain whatever lifecycle Task is in flight until nothing is busy. A Stop + // click landing in an await window can spawn a new stop Task, so loop + // rather than awaiting a single captured task. + while isBusy { + let gen = lifecycleGeneration + await lifecycleTask?.value + if state == .recording, let recorder { + state = .finishing + stopTimer() + finish(await recorder.stop()) + } else if lifecycleGeneration == gen { + break // settled: no new transition was spawned + } + } + } + + // MARK: - Timer + + private func startTimer() { + timer = Timer.scheduledTimer(withTimeInterval: 0.1, repeats: true) { [weak self] _ in + Task { @MainActor in + guard let self else { return } + if let start = self.startTime { self.elapsed = Date().timeIntervalSince(start) } + if let recorder = self.recorder { + let levels = recorder.currentLevels() + self.micLevel = levels.mic + self.systemLevel = levels.system + } + } + } + } + + private func stopTimer() { + timer?.invalidate() + timer = nil + } + + // MARK: - Files + + private func makeSessionFolder() throws -> URL { + let base = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true) + let folder = base.appendingPathComponent("\(Self.timestamp())_manual", isDirectory: true) + try FileManager.default.createDirectory(at: folder, withIntermediateDirectories: true) + return folder + } + + private static func timestamp() -> String { + let f 
= DateFormatter() + f.locale = Locale(identifier: "en_US_POSIX") + f.dateFormat = "yyyy-MM-dd'T'HH-mm-ss" + return f.string(from: Date()) + } + + /// Phase-1 preview of the mic-VAD "self" spans (the eventual + /// `visual_timeline.json` `mic_vad` segments). Lets us eyeball VAD quality. + private func writeSelfSpans(_ result: RecordingResult, to folder: URL) { + let segments = result.selfSpans.map { span -> [String: Any] in + ["start": span.start, "end": span.end, "name": "self", + "confidence": span.confidence, "source": "mic_vad"] + } + let object: [String: Any] = [ + "note": "Phase 1 mic-VAD self spans (preview of visual_timeline segments)", + "t0_unix": result.t0Unix, + "duration_sec": result.duration, + "self_spans": segments, + ] + if let data = try? JSONSerialization.data(withJSONObject: object, + options: [.prettyPrinted, .sortedKeys]) { + try? data.write(to: folder.appendingPathComponent("self_vad.json")) + } + } +} diff --git a/Ten31Transcripts/UI/LevelBar.swift b/Ten31Transcripts/UI/LevelBar.swift new file mode 100644 index 0000000..6080666 --- /dev/null +++ b/Ten31Transcripts/UI/LevelBar.swift @@ -0,0 +1,38 @@ +import SwiftUI + +/// A small horizontal audio level meter. `level` is a peak amplitude (0…1); +/// it's mapped to a dBFS scale (−60 dB … 0 dB) so normal speech is clearly visible. 
+struct LevelBar: View { + let label: String + let level: Float + + var body: some View { + HStack(spacing: 8) { + Text(label) + .font(.caption2) + .foregroundStyle(.secondary) + .frame(width: 48, alignment: .leading) + GeometryReader { geo in + ZStack(alignment: .leading) { + RoundedRectangle(cornerRadius: 2).fill(Color.secondary.opacity(0.2)) + RoundedRectangle(cornerRadius: 2) + .fill(color) + .frame(width: geo.size.width * fraction) + } + } + .frame(height: 6) + } + } + + private var fraction: CGFloat { + guard level > 0 else { return 0 } + let db = 20 * log10(Double(level)) // −∞ … 0 + return CGFloat(min(1, max(0, (db + 60) / 60))) + } + + private var color: Color { + if fraction < 0.02 { return .gray } + if fraction > 0.9 { return .red } + return .green + } +} diff --git a/Ten31Transcripts/UI/MenuBarView.swift b/Ten31Transcripts/UI/MenuBarView.swift index 63b387c..4ee27ef 100644 --- a/Ten31Transcripts/UI/MenuBarView.swift +++ b/Ten31Transcripts/UI/MenuBarView.swift @@ -7,12 +7,15 @@ struct MenuBarView: View { @EnvironmentObject private var settings: AppSettings @EnvironmentObject private var permissions: PermissionsManager @EnvironmentObject private var health: SparkControlHealth + @EnvironmentObject private var session: SessionController var body: some View { NavigationStack { VStack(alignment: .leading, spacing: 12) { header Divider() + recordingSection + Divider() permissionsSection Divider() backendSection @@ -26,6 +29,91 @@ struct MenuBarView: View { .task { await refreshHealth() } } + // MARK: Recording + + private var canRecord: Bool { + permissions.microphone == .granted && permissions.screenRecording == .granted + } + + private var recordingSection: some View { + VStack(alignment: .leading, spacing: 8) { + HStack { + Text("Recording").font(.subheadline).bold() + Spacer() + if session.state == .recording { + Text(timeString(session.elapsed)) + .font(.system(.caption, design: .monospaced)) + .foregroundStyle(.secondary) + } + } + + Button { + 
session.toggle() + } label: { + Label(recordButtonTitle, systemImage: recordButtonIcon) + .frame(maxWidth: .infinity) + } + .controlSize(.large) + .tint(session.state == .recording ? .red : .accentColor) + .disabled(recordButtonDisabled) + + if session.state == .recording { + LevelBar(label: "Mic", level: session.micLevel) + LevelBar(label: "System", level: session.systemLevel) + } + + if !canRecord && !session.isBusy { + Text("Grant Microphone + Screen Recording above to record.") + .font(.caption) + .foregroundStyle(.secondary) + } + + if case .error(let message) = session.state { + Text(message).font(.caption).foregroundStyle(.red) + } + + if let warning = session.warning { + Text(warning).font(.caption).foregroundStyle(.orange) + } + + if let last = session.lastSession { + Button { + NSWorkspace.shared.activateFileViewerSelecting([last.mixedURL]) + } label: { + Text("Last: \(Int(last.duration.rounded()))s · \(last.selfSpanCount) self-spans — reveal in Finder") + .font(.caption) + } + .buttonStyle(.link) + } + } + } + + private var recordButtonTitle: String { + switch session.state { + case .starting: return "Starting…" + case .recording: return "Stop Recording" + case .finishing: return "Finishing…" + case .idle, .error: return "Start Recording" + } + } + + private var recordButtonIcon: String { + session.state == .recording ? 
"stop.circle.fill" : "record.circle" + } + + private var recordButtonDisabled: Bool { + switch session.state { + case .starting, .finishing: return true + case .recording: return false + case .idle, .error: return !canRecord + } + } + + private func timeString(_ t: TimeInterval) -> String { + let total = Int(t) + return String(format: "%02d:%02d", total / 60, total % 60) + } + private var header: some View { VStack(alignment: .leading, spacing: 2) { Text("Ten31 Transcripts").font(.headline) diff --git a/project.yml b/project.yml index 7fe9914..06de0f4 100644 --- a/project.yml +++ b/project.yml @@ -13,9 +13,11 @@ settings: CURRENT_PROJECT_VERSION: "1" SWIFT_VERSION: "5.0" CODE_SIGN_STYLE: Automatic - # Leave the team empty; pick your free personal team in Xcode's - # Signing & Capabilities tab on first open (see README). - DEVELOPMENT_TEAM: "" + # Grant's free personal team (cert OU). Baked in so `xcodegen generate` keeps + # a STABLE signing identity across regenerations — macOS ties TCC permission + # grants (Mic / Screen Recording / Accessibility) to this identity, so a + # stable team is what makes those permissions persist across rebuilds. + DEVELOPMENT_TEAM: "BK4Y6CXN35" targets: Ten31Transcripts: