Phases 2-6: detection, visual timeline, backend hand-off, voiceprints
Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
This commit is contained in:
@@ -0,0 +1,31 @@
|
|||||||
|
import Foundation
|
||||||
|
import CoreVideo
|
||||||
|
|
||||||
|
/// `AppAdapter` for Signal Desktop calls.
///
/// Signal draws avatars/initials and puts a coloured ring around whoever is
/// speaking. Names could alternatively come from the Electron Accessibility
/// tree (preferred over OCR once enabled). The tile-expansion factors below
/// are first-pass values, to be calibrated against real Signal screenshots.
struct SignalAdapter: AppAdapter {
    static let bundleIDs = ["org.whispersystems.signal-desktop"]
    let adapterVersion = "signal-0.1.0"
    let preferredFPS = 3

    /// Grid analyzer configured with Signal-specific tile geometry.
    private let analyzer: GridCallAnalyzer

    init() {
        analyzer = Self.makeAnalyzer()
    }

    /// Builds the analyzer with Signal's tile geometry applied on top of the
    /// default `GridCallAnalyzer.Config`.
    private static func makeAnalyzer() -> GridCallAnalyzer {
        // Signal tiles are squarish with initials centred; tune with fixtures.
        var signalConfig = GridCallAnalyzer.Config()
        signalConfig.tileExpandX = 1.6
        signalConfig.tileExpandY = 1.8
        return GridCallAnalyzer(config: signalConfig)
    }

    /// Analyzes one captured frame and returns the speaker observations the
    /// grid analyzer produced for timestamp `time`.
    func analyze(frame: CVPixelBuffer, at time: TimeInterval) -> [SpeakerObservation] {
        return analyzer.analyze(pixelBuffer: frame, at: time)
    }

    /// Same analysis for a `CGImage`; exposed for fixture/synthetic tests.
    func analyze(cgImage: CGImage, at time: TimeInterval) -> [SpeakerObservation] {
        return analyzer.analyze(cgImage: cgImage, at: time)
    }
}
|
||||||
@@ -156,31 +156,44 @@ final class AudioRecorder: NSObject, SCStreamDelegate, SCStreamOutput {
|
|||||||
|
|
||||||
// MARK: - Ingest (ioQueue only)
|
// MARK: - Ingest (ioQueue only)
|
||||||
|
|
||||||
|
/// Write audio CONTINUOUSLY; re-anchor to the timestamp only when drift is a
|
||||||
|
/// real gap (> ~100 ms), not per-buffer timestamp jitter. Correcting every
|
||||||
|
/// buffer injects/strips a few samples each time → audible rhythmic glitching.
|
||||||
|
/// The shared t0 still bounds mic/system skew to the tolerance, well within
|
||||||
|
/// what the backend merge needs.
|
||||||
|
private static let driftTolerance: Int64 = 1600 // 100 ms @ 16 kHz
|
||||||
|
|
||||||
/// Appends one microphone buffer to the mic track (ioQueue only).
///
/// The buffer's expected sample position is derived from `startHost`
/// relative to the shared `t0Host` at 16 kHz. Audio is written continuously;
/// the track is re-anchored to the timestamp only when the drift between the
/// expected position and `writer.framesWritten` exceeds `driftTolerance`:
/// - drift > tolerance: a real gap, so silence is padded (and mirrored to the VAD);
/// - drift < -tolerance: the writer is far ahead, so the overlapping front of
///   the buffer is trimmed (or the whole buffer dropped if fully overlapped).
///
/// - Parameters:
///   - buffer: Mic audio to append. NOTE(review): assumed already 16 kHz — confirm.
///   - startHost: Host-clock start time of `buffer`, in the same units as `t0Host`.
private func ingestMic(_ buffer: AVAudioPCMBuffer, startHost: Double) {
    guard !tornDown, let writer = micWriter, let vad else { return }

    // Signed distance (in frames) between where this buffer should start and
    // where the writer currently is.
    let drift = max(0, Int64(((startHost - t0Host) * 16_000).rounded())) - writer.framesWritten
    var chunk: AVAudioPCMBuffer? = buffer

    if drift > Self.driftTolerance {            // real gap → pad to realign
        let padded = writer.padSilence(drift)
        if padded > 0 { vad.feedSilence(padded) }
    } else if drift < -Self.driftTolerance {    // far ahead → trim overlap
        let trim = Int(-drift)
        if trim >= Int(buffer.frameLength) { return }
        chunk = Self.trimFront(buffer, by: trim)
    }

    guard let out = chunk else { return }
    updateLevel(out, isMic: true)
    // Only feed the VAD what was actually written, so both stay in step.
    if writer.write(out) > 0 { vad.feed(out) }
}
|
||||||
|
|
||||||
/// Appends one system-audio buffer to the system track (ioQueue only).
///
/// Uses the same re-anchoring rule as the mic path: audio is written
/// continuously, and the track is realigned to the buffer's timestamp only
/// when the drift between the expected position (from `startHost` relative to
/// `t0Host` at 16 kHz) and `writer.framesWritten` exceeds `driftTolerance`.
/// A positive drift is padded with silence; a negative one trims the
/// overlapping front of the buffer, or drops it if fully overlapped.
///
/// - Parameters:
///   - buffer: System audio to append. NOTE(review): assumed already 16 kHz — confirm.
///   - startHost: Host-clock start time of `buffer`, in the same units as `t0Host`.
private func ingestSystem(_ buffer: AVAudioPCMBuffer, startHost: Double) {
    guard !tornDown, let writer = systemWriter else { return }

    // Signed distance (in frames) between where this buffer should start and
    // where the writer currently is.
    let drift = max(0, Int64(((startHost - t0Host) * 16_000).rounded())) - writer.framesWritten
    var chunk: AVAudioPCMBuffer? = buffer

    if drift > Self.driftTolerance {            // real gap → pad to realign
        writer.padSilence(drift)
    } else if drift < -Self.driftTolerance {    // far ahead → trim overlap
        let trim = Int(-drift)
        if trim >= Int(buffer.frameLength) { return }
        chunk = Self.trimFront(buffer, by: trim)
    }

    guard let out = chunk else { return }
    updateLevel(out, isMic: false)
    writer.write(out)
}
|
||||||
|
|
||||||
// MARK: - Mic (AVAudioEngine)
|
// MARK: - Mic (AVAudioEngine)
|
||||||
|
|||||||
@@ -24,7 +24,12 @@ final class Resampler {
|
|||||||
guard !ended, input.frameLength > 0 else { return nil }
|
guard !ended, input.frameLength > 0 else { return nil }
|
||||||
|
|
||||||
if converter == nil || sourceFormat != input.format {
|
if converter == nil || sourceFormat != input.format {
|
||||||
converter = AVAudioConverter(from: input.format, to: Self.targetFormat)
|
let c = AVAudioConverter(from: input.format, to: Self.targetFormat)
|
||||||
|
// Highest-quality sample-rate conversion: best anti-aliasing on the
|
||||||
|
// 48k→16k downsample, which avoids harsh artifacts on loud/bright speech.
|
||||||
|
c?.sampleRateConverterQuality = .max
|
||||||
|
c?.sampleRateConverterAlgorithm = AVSampleRateConverterAlgorithm_Mastering
|
||||||
|
converter = c
|
||||||
sourceFormat = input.format
|
sourceFormat = input.format
|
||||||
}
|
}
|
||||||
guard let converter else { return nil }
|
guard let converter else { return nil }
|
||||||
|
|||||||
@@ -0,0 +1,179 @@
|
|||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// Payload returned by `POST /api/audio/label-merge` (shape verified against
/// the live backend).
///
/// Segments arrive in one of two shapes depending on the request:
/// `transcribe=true` yields `start_ms`/`end_ms` plus `text`, while
/// `transcribe=false` yields `start_s`/`end_s` plus `confidence`. Both are
/// decoded into the same `Segment` type with optional fields.
struct LabelMergeResponse: Decodable {
    let duration: Double
    let speakers: [Speaker]
    let segments: [Segment]
    let fingerprints: [String: [Float]]
    let models: [String: String]?

    /// Whether `name` is one of the backend's "unmatched" labels
    /// (`Unknown_…` or `Speaker_unknown`). Such labels must never be
    /// persisted as a named voiceprint.
    static func isUnknownName(_ name: String) -> Bool {
        if name == "Speaker_unknown" { return true }
        return name.hasPrefix("Unknown_")
    }

    /// One diarization cluster and the name the backend resolved for it.
    struct Speaker: Decodable {
        let cluster: String
        let name: String
        /// How the name was resolved: visual | voiceprint | unmatched.
        let source: String
        let overlapConfidence: Double?
        let matchSimilarity: Double?
        let fingerprint: [Float]?

        enum CodingKeys: String, CodingKey {
            case cluster
            case name
            case source
            case fingerprint
            case overlapConfidence = "overlap_confidence"
            case matchSimilarity = "match_similarity"
        }
    }

    /// One labelled span of audio, in either the millisecond or the second shape.
    struct Segment: Decodable {
        let startMs: Int?
        let endMs: Int?
        let startS: Double?
        let endS: Double?
        let speaker: String
        let text: String?
        let confidence: Double?

        enum CodingKeys: String, CodingKey {
            case startMs = "start_ms"
            case endMs = "end_ms"
            case startS = "start_s"
            case endS = "end_s"
            case speaker
            case text
            case confidence
        }

        /// Segment start in seconds: `start_s` if present, otherwise
        /// `start_ms / 1000`, otherwise 0.
        var startSeconds: Double {
            if let startS { return startS }
            guard let startMs else { return 0 }
            return Double(startMs) / 1000
        }

        /// Segment end in seconds: `end_s` if present, otherwise
        /// `end_ms / 1000`, otherwise 0.
        var endSeconds: Double {
            if let endS { return endS }
            guard let endMs else { return 0 }
            return Double(endMs) / 1000
        }
    }
}
|
||||||
|
|
||||||
|
/// Failures surfaced by the SparkControl backend client.
enum SparkControlError: Error, LocalizedError {
    /// The configured base URL could not be turned into a request URL.
    case invalidHost
    /// HTTP 413 — the uploaded chunk was rejected as too large.
    case tooLarge
    /// Any other non-2xx status, with the backend's `{"detail": ...}` text.
    case server(Int, String)
    /// The response could not be interpreted; carries a description.
    case decode(String)
    /// The backend kept answering 503 until the retry budget ran out.
    case retriesExhausted

    /// Human-readable message for each case.
    var errorDescription: String? {
        let message: String
        switch self {
        case .invalidHost:
            message = "Invalid backend host URL."
        case .tooLarge:
            message = "Audio chunk exceeds the backend's 200 MB limit."
        case let .server(code, detail):
            message = "Backend error \(code): \(detail)"
        case let .decode(msg):
            message = "Couldn't decode backend response: \(msg)"
        case .retriesExhausted:
            message = "Backend stayed busy (503) after retries."
        }
        return message
    }
}
|
||||||
|
|
||||||
|
/// Talks to SparkControl's `label-merge`. **Callers must invoke sequentially**
/// (one audio request in flight) — concurrent audio requests trip a GPU race
/// (503). The Phase-5 pipeline drives one chunk at a time, satisfying this.
final class SparkControlClient {
    private let baseURL: String
    private let urlSession: URLSession

    /// Back-off used when a 503 carries no usable `Retry-After` value.
    private static let defaultRetryDelay: Double = 5
    /// Upper bound on a single back-off sleep. Also keeps the nanosecond
    /// conversion inside `UInt64` range no matter what the server sends.
    private static let maxRetryDelay: Double = 600

    /// - Parameters:
    ///   - baseURL: Backend base URL; surrounding whitespace and one trailing
    ///     slash are stripped.
    ///   - skipTLS: When true, the session is given an `InsecureTrustDelegate`.
    init(baseURL: String, skipTLS: Bool) {
        let trimmed = baseURL.trimmingCharacters(in: .whitespacesAndNewlines)
        self.baseURL = trimmed.hasSuffix("/") ? String(trimmed.dropLast()) : trimmed
        let config = URLSessionConfiguration.ephemeral
        config.timeoutIntervalForRequest = 600   // diarization can take up to ~600s
        config.timeoutIntervalForResource = 900
        config.waitsForConnectivity = false
        let delegate: URLSessionDelegate? = skipTLS ? InsecureTrustDelegate() : nil
        self.urlSession = URLSession(configuration: config, delegate: delegate, delegateQueue: nil)
    }

    deinit { urlSession.finishTasksAndInvalidate() }

    /// One `label-merge` call. `timeline` is the flat `[{start,end,name,confidence}]`
    /// JSON (chunk-local seconds). Retries on `503`, sleeping for the
    /// server's `Retry-After` (clamped to 1…`maxRetryDelay` seconds) or
    /// `defaultRetryDelay` when the header is missing or unusable.
    ///
    /// - Parameters:
    ///   - audioURL: Local audio file; read fully into memory and uploaded as `file`.
    ///   - timeline: UTF-8 JSON timeline. Omitted from the request if not valid UTF-8.
    ///   - knownVoiceprints: Optional name → vector map sent as `known_voiceprints`.
    ///     Omitted when nil, empty, or not JSON-representable (NaN/∞ values).
    ///   - transcribe: Sent as the `transcribe` form field.
    ///   - minOverlap: Optional `min_overlap` form field.
    ///   - voiceprintThreshold: Optional `voiceprint_threshold` form field.
    ///   - maxRetries: Number of 503 retries before giving up.
    /// - Returns: The decoded response on any 2xx status.
    /// - Throws: `SparkControlError.invalidHost`, `.tooLarge` (413),
    ///   `.retriesExhausted`, `.server` (other non-2xx), `.decode`; plus any
    ///   error from reading `audioURL`, from `URLSession`, or from task cancellation.
    func labelMerge(audioURL: URL,
                    timeline: Data,
                    knownVoiceprints: [String: [Float]]?,
                    transcribe: Bool,
                    minOverlap: Double? = nil,
                    voiceprintThreshold: Double? = nil,
                    maxRetries: Int = 3) async throws -> LabelMergeResponse {
        guard let url = URL(string: baseURL + "/api/audio/label-merge") else {
            throw SparkControlError.invalidHost
        }

        var fields: [String: String] = ["transcribe": transcribe ? "true" : "false"]
        if let timelineString = String(data: timeline, encoding: .utf8) {
            fields["timeline"] = timelineString
        }
        if let known = knownVoiceprints, !known.isEmpty {
            let payload = known.mapValues { $0.map { Double($0) } }
            // JSONSerialization raises an (uncatchable) exception for NaN/∞,
            // so validate before serializing.
            if JSONSerialization.isValidJSONObject(payload),
               let data = try? JSONSerialization.data(withJSONObject: payload),
               let str = String(data: data, encoding: .utf8) {
                fields["known_voiceprints"] = str
            }
        }
        if let minOverlap { fields["min_overlap"] = String(minOverlap) }
        if let voiceprintThreshold { fields["voiceprint_threshold"] = String(voiceprintThreshold) }

        let audio = try Data(contentsOf: audioURL)
        // Body doesn't change between retries — build it once.
        let (body, contentType) = Self.multipart(fields: fields, fileField: "file",
                                                 filename: audioURL.lastPathComponent, fileData: audio)

        var attempt = 0
        while true {
            var request = URLRequest(url: url)
            request.httpMethod = "POST"
            request.setValue(contentType, forHTTPHeaderField: "Content-Type")
            request.httpBody = body

            let (data, response) = try await urlSession.data(for: request)
            guard let http = response as? HTTPURLResponse else {
                throw SparkControlError.decode("no HTTP response")
            }

            switch http.statusCode {
            case 200..<300:
                do {
                    return try JSONDecoder().decode(LabelMergeResponse.self, from: data)
                } catch {
                    throw SparkControlError.decode(error.localizedDescription)
                }
            case 503:
                attempt += 1
                if attempt > maxRetries { throw SparkControlError.retriesExhausted }
                let delay = Self.retryDelayNanoseconds(
                    retryAfter: http.value(forHTTPHeaderField: "Retry-After"))
                try await Task.sleep(nanoseconds: delay)
            case 413:
                throw SparkControlError.tooLarge
            default:
                throw SparkControlError.server(http.statusCode, Self.detail(from: data))
            }
        }
    }

    // MARK: - Helpers

    /// Converts a `Retry-After` header value into a sleep duration.
    ///
    /// Only the delta-seconds form is understood; anything else (missing,
    /// HTTP-date, NaN/∞) falls back to `defaultRetryDelay`. The result is
    /// clamped to 1…`maxRetryDelay` seconds so a hostile or buggy header can
    /// neither busy-loop nor overflow the `UInt64` conversion.
    private static func retryDelayNanoseconds(retryAfter header: String?) -> UInt64 {
        let parsed = header
            .map { $0.trimmingCharacters(in: .whitespaces) }
            .flatMap { Double($0) }
        let seconds: Double
        if let parsed, parsed.isFinite {
            seconds = min(max(1, parsed), maxRetryDelay)
        } else {
            seconds = defaultRetryDelay
        }
        return UInt64(seconds * 1_000_000_000)
    }

    /// Extracts the backend's `{"detail": "..."}` message, falling back to the
    /// raw body text, then to `"unknown error"`.
    private static func detail(from data: Data) -> String {
        if let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
           let detail = obj["detail"] as? String { return detail }
        return String(data: data, encoding: .utf8) ?? "unknown error"
    }

    /// Percent-escapes the characters that would break out of a quoted
    /// `Content-Disposition` parameter (`"`, CR, LF).
    private static func escapedForQuotedParameter(_ value: String) -> String {
        value
            .replacingOccurrences(of: "\r", with: "%0D")
            .replacingOccurrences(of: "\n", with: "%0A")
            .replacingOccurrences(of: "\"", with: "%22")
    }

    /// Builds a `multipart/form-data` body: every text field first, then the
    /// file part (sent as `audio/wav`).
    ///
    /// - Returns: The body and the matching `Content-Type` header value.
    private static func multipart(fields: [String: String], fileField: String,
                                  filename: String, fileData: Data) -> (Data, String) {
        let boundary = "Boundary-\(UUID().uuidString)"
        var body = Data()
        func append(_ s: String) { body.append(Data(s.utf8)) }

        for (name, value) in fields {
            append("--\(boundary)\r\n")
            append("Content-Disposition: form-data; name=\"\(escapedForQuotedParameter(name))\"\r\n\r\n")
            append("\(value)\r\n")
        }
        append("--\(boundary)\r\n")
        append("Content-Disposition: form-data; name=\"\(escapedForQuotedParameter(fileField))\"; "
               + "filename=\"\(escapedForQuotedParameter(filename))\"\r\n")
        append("Content-Type: audio/wav\r\n\r\n")
        body.append(fileData)
        append("\r\n--\(boundary)--\r\n")
        return (body, "multipart/form-data; boundary=\(boundary)")
    }
}
|
||||||
@@ -0,0 +1,104 @@
|
|||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// Local persistence of named voiceprints — the compounding-identity layer.
///
/// File `~/Ten31Transcripts/voiceprints.json`:
/// `{ "<name>": { "vector": [192 floats], "updated": <iso>, "calls": <int> } }`
///
/// On send → `knownVoiceprints()` feeds `label-merge`. On response → `update(with:)`
/// stores/refreshes vectors for speakers resolved by **visual** (overlap ≥ ~0.8)
/// or **voiceprint** match. Never stores `Unknown_N` / `Speaker_unknown`.
///
/// Thread-safe (lock-guarded); the sequential pipeline is the only writer.
final class VoiceprintStore {
    /// One persisted voiceprint.
    struct Entry: Codable, Equatable {
        /// Speaker embedding as returned by the backend.
        var vector: [Float]
        /// ISO-8601 timestamp of the last refresh.
        var updated: String
        /// Number of responses that have refreshed this entry.
        var calls: Int
    }

    private let url: URL
    private let minOverlapToStore: Double
    private let lock = NSLock()
    private var entriesStore: [String: Entry] = [:]

    /// - Parameters:
    ///   - fileURL: Location of the JSON store; loaded immediately if present.
    ///   - minOverlapToStore: Minimum `overlap_confidence` a visually-resolved
    ///     speaker needs before its fingerprint is persisted.
    init(fileURL: URL, minOverlapToStore: Double = 0.8) {
        self.url = fileURL
        self.minOverlapToStore = minOverlapToStore
        load()
    }

    /// Snapshot of all stored entries.
    var entries: [String: Entry] {
        lock.lock(); defer { lock.unlock() }
        return entriesStore
    }

    /// Vectors keyed by name, for the `known_voiceprints` field.
    func knownVoiceprints() -> [String: [Float]] {
        lock.lock(); defer { lock.unlock() }
        return entriesStore.mapValues { $0.vector }
    }

    /// Persist fingerprints from a `label-merge` response for confidently-named
    /// speakers only.
    ///
    /// A speaker is stored when its name is not an "unknown" label and either
    /// its source is `visual` with overlap ≥ `minOverlapToStore`, or its source
    /// is `voiceprint`. Empty vectors and vectors containing NaN/∞ are skipped:
    /// a non-finite value cannot be encoded as JSON and would make every
    /// subsequent save fail.
    func update(with response: LabelMergeResponse) {
        lock.lock(); defer { lock.unlock() }
        let now = ISO8601DateFormatter().string(from: Date())
        for sp in response.speakers {
            guard !Self.isUnknown(sp.name) else { continue }
            let acceptable: Bool
            switch sp.source {
            case "visual": acceptable = (sp.overlapConfidence ?? 0) >= minOverlapToStore
            case "voiceprint": acceptable = true   // already matched a known print
            default: acceptable = false            // unmatched
            }
            // Prefer the per-speaker fingerprint; fall back to the response-level map.
            guard acceptable,
                  let vector = sp.fingerprint ?? response.fingerprints[sp.name],
                  !vector.isEmpty,
                  vector.allSatisfy(\.isFinite) else { continue }
            var entry = entriesStore[sp.name] ?? Entry(vector: vector, updated: now, calls: 0)
            entry.vector = vector
            entry.updated = now
            entry.calls += 1
            entriesStore[sp.name] = entry
        }
        save()
    }

    /// Moves the entry stored under `old` to `new`, replacing any entry already
    /// stored under `new`.
    ///
    /// Does nothing when `old` has no entry, when the names are equal, or when
    /// `new` is empty or one of the backend's "unknown" labels (which must
    /// never be persisted as a named voiceprint).
    func rename(_ old: String, to new: String) {
        lock.lock(); defer { lock.unlock() }
        guard old != new, !new.isEmpty, !Self.isUnknown(new) else { return }
        guard let e = entriesStore.removeValue(forKey: old) else { return }
        entriesStore[new] = e
        save()
    }

    /// Deletes the entry for `name` (if any) and rewrites the store.
    func remove(_ name: String) {
        lock.lock(); defer { lock.unlock() }
        entriesStore.removeValue(forKey: name)
        save()
    }

    /// Deletes every entry and rewrites the store.
    func reset() {
        lock.lock(); defer { lock.unlock() }
        entriesStore = [:]
        save()
    }

    // MARK: - Persistence (call with lock held)

    /// Best-effort load: a missing or undecodable file leaves the store empty.
    private func load() {
        guard let data = try? Data(contentsOf: url),
              let decoded = try? JSONDecoder().decode([String: Entry].self, from: data) else { return }
        entriesStore = decoded
    }

    /// Best-effort save. Written atomically so a crash or power loss mid-write
    /// cannot leave a truncated file that `load()` would then discard.
    private func save() {
        let encoder = JSONEncoder()
        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
        try? FileManager.default.createDirectory(at: url.deletingLastPathComponent(),
                                                 withIntermediateDirectories: true)
        if let data = try? encoder.encode(entriesStore) {
            try? data.write(to: url, options: .atomic)
        }
    }

    private static func isUnknown(_ name: String) -> Bool {
        LabelMergeResponse.isUnknownName(name)
    }
}
|
||||||
@@ -0,0 +1,61 @@
|
|||||||
|
import CoreAudio
|
||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// Enumerates the PIDs of processes that currently have an audio **input**
/// (the mic) running, using the CoreAudio process-object API (macOS 14+).
///
/// Attributing mic usage to a *specific* process answers questions such as
/// "is Signal in a call?" more robustly than matching window titles, and works
/// the same way for Zoom/Teams/Signal and browser calls (Meet). Because callers
/// look at the *call app's* PID rather than the global mic state, a call's end
/// can be detected even while this app keeps the mic open.
///
/// Approach mirrors fastrepl/anarlog's `list_mic_using_apps`.
@available(macOS 14.0, *)
enum AudioInputProcesses {
    /// PIDs of all processes whose audio process object reports running input.
    /// Returns an empty set if the process list cannot be read.
    static func micUsingPIDs() -> Set<pid_t> {
        let systemObject = AudioObjectID(kAudioObjectSystemObject)
        var address = globalAddress(kAudioHardwarePropertyProcessObjectList)

        // First ask how many bytes the process-object list occupies…
        var byteCount: UInt32 = 0
        let sizeStatus = AudioObjectGetPropertyDataSize(systemObject, &address, 0, nil, &byteCount)
        guard sizeStatus == noErr, byteCount > 0 else { return [] }

        // …then fetch the list itself into a buffer of that size.
        let slots = Int(byteCount) / MemoryLayout<AudioObjectID>.size
        var objects = [AudioObjectID](repeating: 0, count: slots)
        let listStatus = AudioObjectGetPropertyData(systemObject, &address, 0, nil, &byteCount, &objects)
        guard listStatus == noErr else { return [] }

        return Set(objects
            .filter { isRunningInput($0) }
            .compactMap { pid(of: $0) })
    }

    /// Whether `process` reports `kAudioProcessPropertyIsRunningInput` as
    /// non-zero. A failed read counts as "not running".
    private static func isRunningInput(_ process: AudioObjectID) -> Bool {
        let flag: UInt32? = readScalar(kAudioProcessPropertyIsRunningInput, of: process, initial: 0)
        guard let flag else { return false }
        return flag != 0
    }

    /// The PID behind an audio process object, or nil if the read fails.
    private static func pid(of process: AudioObjectID) -> pid_t? {
        readScalar(kAudioProcessPropertyPID, of: process, initial: 0)
    }

    /// Property address in the global scope / main element for `selector`.
    private static func globalAddress(_ selector: AudioObjectPropertySelector) -> AudioObjectPropertyAddress {
        AudioObjectPropertyAddress(
            mSelector: selector,
            mScope: kAudioObjectPropertyScopeGlobal,
            mElement: kAudioObjectPropertyElementMain)
    }

    /// Reads one fixed-size property value from `object`; nil on any error.
    private static func readScalar<T>(_ selector: AudioObjectPropertySelector,
                                      of object: AudioObjectID,
                                      initial: T) -> T? {
        var address = globalAddress(selector)
        var value = initial
        var size = UInt32(MemoryLayout<T>.size)
        let status = AudioObjectGetPropertyData(object, &address, 0, nil, &size, &value)
        return status == noErr ? value : nil
    }
}
|
||||||
@@ -0,0 +1,226 @@
|
|||||||
|
import AppKit
|
||||||
|
import CoreGraphics
|
||||||
|
import Combine
|
||||||
|
|
||||||
|
/// Detects when the user joins/leaves a call and reports it via callbacks.
|
||||||
|
///
|
||||||
|
/// Heuristic: the mic is live system-wide AND a known call app is present —
|
||||||
|
/// Zoom/Teams/Signal by bundle ID, or Google Meet by a browser window whose
|
||||||
|
/// title looks like a Meet call (read via `CGWindowList`, using the Screen
|
||||||
|
/// Recording permission). Debounced so a quick unrelated mic use doesn't trigger.
|
||||||
|
///
|
||||||
|
/// Main-actor: all evaluation runs on the main thread.
|
||||||
|
@MainActor
|
||||||
|
final class CallDetector: ObservableObject {
|
||||||
|
|
||||||
|
enum DetectedApp: String, Equatable {
|
||||||
|
case zoom, teams, signal, meet
|
||||||
|
var label: String { rawValue }
|
||||||
|
var display: String {
|
||||||
|
switch self {
|
||||||
|
case .zoom: return "Zoom"
|
||||||
|
case .teams: return "Microsoft Teams"
|
||||||
|
case .signal: return "Signal"
|
||||||
|
case .meet: return "Google Meet"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
enum Status: Equatable {
|
||||||
|
case disabled
|
||||||
|
case listening
|
||||||
|
case inCall(DetectedApp)
|
||||||
|
}
|
||||||
|
|
||||||
|
@Published private(set) var status: Status = .disabled
|
||||||
|
|
||||||
|
var onCallStart: ((DetectedApp) -> Void)?
|
||||||
|
var onCallEnd: (() -> Void)?
|
||||||
|
|
||||||
|
private let mic = MicActivityMonitor()
|
||||||
|
private var pollTimer: Timer?
|
||||||
|
private var openTimer: Timer?
|
||||||
|
private var closeTimer: Timer?
|
||||||
|
private var inCall = false
|
||||||
|
private var currentApp: DetectedApp?
|
||||||
|
private var enabled = false
|
||||||
|
|
||||||
|
private let openDelay: TimeInterval = 2.0
|
||||||
|
private let closeDelay: TimeInterval = 4.0
|
||||||
|
private let pollInterval: TimeInterval = 3.0
|
||||||
|
|
||||||
|
private static let nativeApps: [(id: String, app: DetectedApp)] = [
|
||||||
|
("us.zoom.xos", .zoom),
|
||||||
|
("com.microsoft.teams2", .teams),
|
||||||
|
("com.microsoft.teams", .teams),
|
||||||
|
("org.whispersystems.signal-desktop", .signal),
|
||||||
|
]
|
||||||
|
private static let browserIDs: Set<String> = [
|
||||||
|
"org.mozilla.firefox", "com.google.Chrome", "com.apple.Safari",
|
||||||
|
"company.thebrowser.Browser", "com.brave.Browser", "com.microsoft.edgemac",
|
||||||
|
]
|
||||||
|
|
||||||
|
func enable() {
|
||||||
|
guard !enabled else { return }
|
||||||
|
enabled = true
|
||||||
|
mic.onChange = { [weak self] _ in self?.evaluate() }
|
||||||
|
mic.start()
|
||||||
|
status = .listening
|
||||||
|
pollTimer = Timer.scheduledTimer(withTimeInterval: pollInterval, repeats: true) { [weak self] _ in
|
||||||
|
Task { @MainActor in self?.evaluate() }
|
||||||
|
}
|
||||||
|
evaluate()
|
||||||
|
}
|
||||||
|
|
||||||
|
func disable() {
|
||||||
|
guard enabled else { return }
|
||||||
|
enabled = false
|
||||||
|
mic.stop()
|
||||||
|
pollTimer?.invalidate(); pollTimer = nil
|
||||||
|
cancelOpen(); cancelClose()
|
||||||
|
inCall = false
|
||||||
|
currentApp = nil
|
||||||
|
status = .disabled
|
||||||
|
}
|
||||||
|
|
||||||
|
// MARK: - Evaluation
|
||||||
|
|
||||||
|
private func evaluate() {
|
||||||
|
guard enabled else { return }
|
||||||
|
let candidate = mic.isRunning ? detectApp() : nil
|
||||||
|
|
||||||
|
if let candidate {
|
||||||
|
cancelClose()
|
||||||
|
if inCall {
|
||||||
|
currentApp = candidate
|
||||||
|
status = .inCall(candidate)
|
||||||
|
} else if openTimer == nil {
|
||||||
|
openTimer = Timer.scheduledTimer(withTimeInterval: openDelay, repeats: false) { [weak self] _ in
|
||||||
|
Task { @MainActor in self?.fireOpen() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
cancelOpen()
|
||||||
|
if inCall && closeTimer == nil {
|
||||||
|
closeTimer = Timer.scheduledTimer(withTimeInterval: closeDelay, repeats: false) { [weak self] _ in
|
||||||
|
Task { @MainActor in self?.fireClose() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private func fireOpen() {
|
||||||
|
openTimer = nil
|
||||||
|
// Re-resolve the app at fire time (the debounce window may have changed it).
|
||||||
|
guard enabled, mic.isRunning, let app = detectApp(), !inCall else { return }
|
||||||
|
inCall = true
|
||||||
|
currentApp = app
|
||||||
|
status = .inCall(app)
|
||||||
|
onCallStart?(app)
|
||||||
|
}
|
||||||
|
|
||||||
|
private func fireClose() {
|
||||||
|
closeTimer = nil
|
||||||
|
guard enabled, inCall else { return }
|
||||||
|
inCall = false
|
||||||
|
currentApp = nil
|
||||||
|
status = .listening
|
||||||
|
onCallEnd?()
|
||||||
|
}
|
||||||
|
|
||||||
|
private func cancelOpen() { openTimer?.invalidate(); openTimer = nil }
|
||||||
|
private func cancelClose() { closeTimer?.invalidate(); closeTimer = nil }
|
||||||
|
|
||||||
|
// MARK: - App detection
|
||||||
|
|
||||||
|
/// A call is active when a known call app is actually using the mic.
|
||||||
|
/// On macOS 14+ we attribute mic usage per-process (robust start AND stop,
|
||||||
|
/// works for Signal/Zoom/Teams/Meet, ignores our own recording). On macOS 13
|
||||||
|
/// we fall back to the per-app call-window heuristic.
|
||||||
|
private func detectApp() -> DetectedApp? {
|
||||||
|
if #available(macOS 14.0, *) {
|
||||||
|
return detectViaMicAttribution()
|
||||||
|
}
|
||||||
|
return detectViaWindowTitle()
|
||||||
|
}
|
||||||
|
|
||||||
|
@available(macOS 14.0, *)
|
||||||
|
private func detectViaMicAttribution() -> DetectedApp? {
|
||||||
|
let micPIDs = AudioInputProcesses.micUsingPIDs()
|
||||||
|
guard !micPIDs.isEmpty else { return nil }
|
||||||
|
let selfPID = NSRunningApplication.current.processIdentifier
|
||||||
|
|
||||||
|
for app in NSWorkspace.shared.runningApplications {
|
||||||
|
let pid = app.processIdentifier
|
||||||
|
guard pid != selfPID, micPIDs.contains(pid), let id = app.bundleIdentifier else { continue }
|
||||||
|
if let native = Self.nativeApps.first(where: { $0.id == id }) {
|
||||||
|
return native.app // Signal/Zoom/Teams using the mic = in a call
|
||||||
|
}
|
||||||
|
// A browser using the mic + a Meet window = a Meet call. The mic state
|
||||||
|
// gives reliable start/stop; the window check keeps non-Meet browser
|
||||||
|
// mic use (other web apps) from being mislabeled as a Meet recording.
|
||||||
|
if Self.browserIDs.contains(id), pidHasMeetWindow(pid) {
|
||||||
|
return .meet
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
private func pidHasMeetWindow(_ pid: pid_t) -> Bool {
|
||||||
|
guard let info = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]]
|
||||||
|
else { return false }
|
||||||
|
for w in info {
|
||||||
|
guard let wpid = w[kCGWindowOwnerPID as String] as? pid_t, wpid == pid,
|
||||||
|
let title = w[kCGWindowName as String] as? String else { continue }
|
||||||
|
if Self.looksLikeMeet(title) { return true }
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
/// macOS 13 fallback: detect by the presence of a call WINDOW per app.
|
||||||
|
private func detectViaWindowTitle() -> DetectedApp? {
|
||||||
|
var pidToApp: [pid_t: DetectedApp] = [:]
|
||||||
|
var browserPIDs = Set<pid_t>()
|
||||||
|
for app in NSWorkspace.shared.runningApplications {
|
||||||
|
guard let id = app.bundleIdentifier else { continue }
|
||||||
|
if let native = Self.nativeApps.first(where: { $0.id == id }) {
|
||||||
|
pidToApp[app.processIdentifier] = native.app
|
||||||
|
} else if Self.browserIDs.contains(id) {
|
||||||
|
browserPIDs.insert(app.processIdentifier)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
guard !pidToApp.isEmpty || !browserPIDs.isEmpty else { return nil }
|
||||||
|
guard let infoList = CGWindowListCopyWindowInfo([.excludeDesktopElements], kCGNullWindowID) as? [[String: Any]] else {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
for info in infoList {
|
||||||
|
guard let pid = info[kCGWindowOwnerPID as String] as? pid_t,
|
||||||
|
let title = info[kCGWindowName as String] as? String,
|
||||||
|
!title.isEmpty else { continue }
|
||||||
|
if browserPIDs.contains(pid), Self.looksLikeMeet(title) { return .meet }
|
||||||
|
if let app = pidToApp[pid], Self.isCallWindow(app, title) { return app }
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Per-app in-call window-title signatures (macOS 13 fallback only).
|
||||||
|
private static func isCallWindow(_ app: DetectedApp, _ title: String) -> Bool {
|
||||||
|
let t = title.lowercased()
|
||||||
|
switch app {
|
||||||
|
case .zoom: return t.contains("zoom meeting") || t.contains("meeting")
|
||||||
|
case .teams: return t.contains("meeting")
|
||||||
|
case .signal: return t.contains("signal call") || t.contains("group call")
|
||||||
|
case .meet: return false // handled via the browser path above
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Match an ACTIVE Google Meet call by window title.
///
/// Per the original author's check against real Firefox titles: in a call the
/// title is "Meet - <code>" (e.g. "Meet - tjh-pixe-ier"), while the
/// home/lobby/"you left" pages are bare "Meet" or "Google Meet". Accepting only
/// the "Meet <dash> …" prefix is what lets auto-STOP fire when you leave, avoids
/// false-starting on the home page, and excludes titles like "Zoom Meeting".
/// - Parameter title: A browser window title (compared case-insensitively).
/// - Returns: `true` when the title starts with "meet" followed by a spaced
///   hyphen, en dash, or em dash.
private static func looksLikeMeet(_ title: String) -> Bool {
    let lowered = title.lowercased()
    // Hyphen-minus, en dash and em dash variants of the separator.
    let inCallPrefixes = ["meet - ", "meet – ", "meet — "]
    return inCallPrefixes.contains { lowered.hasPrefix($0) }
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,125 @@
|
|||||||
|
import CoreAudio
|
||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// Watches whether *any* app is using the default input device (the system-wide
/// "mic is live" signal), via CoreAudio property listeners. Re-binds when the
/// default input device changes (e.g. you plug in a headset mid-call).
///
/// Threading: ALL CoreAudio state (deviceID, listener blocks, `started`) and all
/// Add/Remove calls are confined to the serial `queue`. `isRunning` is written
/// and read only on the main thread (via `deliver`). `onChange` fires on main.
final class MicActivityMonitor {
    /// Last delivered "default input device is running somewhere" reading.
    private(set) var isRunning = false   // main-thread only
    /// Invoked on the main thread whenever `isRunning` flips; receives the new value.
    var onChange: ((Bool) -> Void)?

    private let queue = DispatchQueue(label: "xyz.ten31.micmonitor")

    // queue-confined:
    private var deviceID = AudioObjectID(kAudioObjectUnknown)
    private var runningBlock: AudioObjectPropertyListenerBlock?
    private var defaultDeviceBlock: AudioObjectPropertyListenerBlock?
    private var started = false

    private static let runningAddr = AudioObjectPropertyAddress(
        mSelector: kAudioDevicePropertyDeviceIsRunningSomewhere,
        mScope: kAudioObjectPropertyScopeGlobal,
        mElement: kAudioObjectPropertyElementMain)

    private static let defaultDeviceAddr = AudioObjectPropertyAddress(
        mSelector: kAudioHardwarePropertyDefaultInputDevice,
        mScope: kAudioObjectPropertyScopeGlobal,
        mElement: kAudioObjectPropertyElementMain)

    /// Begin monitoring. Asynchronous: listeners are installed on `queue`, and the
    /// first reading arrives later on the main thread.
    func start() { queue.async { self.begin() } }

    /// Called on the main thread (by the @MainActor CallDetector). Resets
    /// `isRunning` so a subsequent enable()'s synchronous evaluation can't read a
    /// stale `true` before the fresh reading arrives.
    ///
    /// Blocks until listener removal has completed on `queue`. The reset does not
    /// fire `onChange`.
    func stop() {
        queue.sync { self.end() }
        isRunning = false
    }

    // MARK: - queue-confined

    /// Install the default-input-device listener, then bind to the current device.
    /// No-op if already started.
    private func begin() {
        guard !started else { return }
        started = true
        var addr = Self.defaultDeviceAddr
        let block: AudioObjectPropertyListenerBlock = { [weak self] _, _ in
            self?.rebindRunning()          // delivered on `queue`
        }
        defaultDeviceBlock = block
        AudioObjectAddPropertyListenerBlock(AudioObjectID(kAudioObjectSystemObject), &addr, queue, block)
        bindRunning()
    }

    /// Remove both listeners and clear all queue-confined state.
    private func end() {
        started = false
        if let block = defaultDeviceBlock {
            var addr = Self.defaultDeviceAddr
            AudioObjectRemovePropertyListenerBlock(AudioObjectID(kAudioObjectSystemObject), &addr, queue, block)
            defaultDeviceBlock = nil
        }
        unbindRunning()
    }

    /// Attach the "is running somewhere" listener to the current default input
    /// device and deliver an initial reading.
    private func bindRunning() {
        guard started else { return }
        deviceID = Self.defaultInputDevice()
        guard deviceID != AudioObjectID(kAudioObjectUnknown) else {
            // No default input device (e.g. the only mic was just unplugged).
            // There is nothing to listen to, but a previously delivered `true`
            // must not stay latched: report "not running", the same answer
            // `isDeviceRunning` gives for an unknown device.
            deliver(false)
            return
        }
        var addr = Self.runningAddr
        let block: AudioObjectPropertyListenerBlock = { [weak self] _, _ in
            guard let self else { return }
            self.deliver(Self.isDeviceRunning(self.deviceID))   // on `queue`
        }
        runningBlock = block
        // Install the listener BEFORE the initial read so a flip during setup is
        // caught (either by the now-installed listener or the post-install read).
        AudioObjectAddPropertyListenerBlock(deviceID, &addr, queue, block)
        deliver(Self.isDeviceRunning(deviceID))
    }

    /// Detach the running listener (if one was installed) and forget the device.
    private func unbindRunning() {
        if deviceID != AudioObjectID(kAudioObjectUnknown), let block = runningBlock {
            var addr = Self.runningAddr
            AudioObjectRemovePropertyListenerBlock(deviceID, &addr, queue, block)
        }
        runningBlock = nil
        deviceID = AudioObjectID(kAudioObjectUnknown)
    }

    /// Default input device changed: move the running listener to the new device.
    private func rebindRunning() {
        guard started else { return }
        unbindRunning()
        bindRunning()
    }

    /// Hop to the main thread, store the reading, and notify only on a change.
    private func deliver(_ running: Bool) {
        DispatchQueue.main.async {
            let changed = running != self.isRunning
            self.isRunning = running
            if changed { self.onChange?(running) }
        }
    }

    // MARK: - CoreAudio reads (use local address copies)

    /// Current default input device, or `kAudioObjectUnknown` if the query fails.
    private static func defaultInputDevice() -> AudioObjectID {
        var addr = defaultDeviceAddr
        var device = AudioObjectID(kAudioObjectUnknown)
        var size = UInt32(MemoryLayout<AudioObjectID>.size)
        let status = AudioObjectGetPropertyData(
            AudioObjectID(kAudioObjectSystemObject), &addr, 0, nil, &size, &device)
        return status == noErr ? device : AudioObjectID(kAudioObjectUnknown)
    }

    /// Whether `device` reports `DeviceIsRunningSomewhere`; `false` for an unknown
    /// device or a failed read.
    private static func isDeviceRunning(_ device: AudioObjectID) -> Bool {
        guard device != AudioObjectID(kAudioObjectUnknown) else { return false }
        var addr = runningAddr
        var value: UInt32 = 0
        var size = UInt32(MemoryLayout<UInt32>.size)
        let status = AudioObjectGetPropertyData(device, &addr, 0, nil, &size, &value)
        return status == noErr && value != 0
    }
}
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
import Foundation
|
import Foundation
|
||||||
import Combine
|
import Combine
|
||||||
import AppKit
|
import AppKit
|
||||||
|
import CoreGraphics
|
||||||
|
|
||||||
struct SessionInfo: Equatable {
|
struct SessionInfo: Equatable {
|
||||||
let folder: URL
|
let folder: URL
|
||||||
@@ -25,6 +26,14 @@ final class SessionController: ObservableObject {
|
|||||||
case error(String)
|
case error(String)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Backend transcription status for the most recent session.
/// Published via `transcriptStatus` and driven by `processLastSession()`.
|
||||||
|
enum TranscriptStatus: Equatable {
|
||||||
|
/// No backend run to report: the initial value, and the value set again when
/// a session finishes or a run ends with `CancellationError`.
case idle
|
||||||
|
/// A run is in flight; payload is (chunks done, total chunks). Set to (0, 1)
/// before the pipeline has reported its first progress callback.
case processing(Int, Int) // chunk done, total
|
||||||
|
/// The run succeeded; payload is the speaker and segment counts of the
/// returned speakers file.
case done(speakers: Int, segments: Int)
|
||||||
|
/// The run threw; payload is the error's `localizedDescription`.
case failed(String)
|
||||||
|
}
|
||||||
|
|
||||||
/// Set in init so `AppDelegate.applicationShouldTerminate` can finalize a
|
/// Set in init so `AppDelegate.applicationShouldTerminate` can finalize a
|
||||||
/// recording in progress before the app quits.
|
/// recording in progress before the app quits.
|
||||||
static weak var shared: SessionController?
|
static weak var shared: SessionController?
|
||||||
@@ -37,12 +46,34 @@ final class SessionController: ObservableObject {
|
|||||||
@Published private(set) var systemLevel: Float = 0
|
@Published private(set) var systemLevel: Float = 0
|
||||||
/// Surfaced after a session if system audio stopped early.
|
/// Surfaced after a session if system audio stopped early.
|
||||||
@Published private(set) var warning: String?
|
@Published private(set) var warning: String?
|
||||||
|
/// Mirrored from `CallDetector` for the UI.
|
||||||
|
@Published private(set) var detectionStatus: CallDetector.Status = .disabled
|
||||||
|
/// Backend transcription status for the last session.
|
||||||
|
@Published private(set) var transcriptStatus: TranscriptStatus = .idle
|
||||||
|
|
||||||
private let settings: AppSettings
|
private let settings: AppSettings
|
||||||
|
private var voiceprints: VoiceprintStore
|
||||||
|
private let detector = CallDetector()
|
||||||
|
private var cancellables = Set<AnyCancellable>()
|
||||||
|
private var currentLabel = "manual"
|
||||||
|
/// Inputs needed to (re)process the last finished session through the backend.
/// Captured when a recording finishes and read by `processLastSession()`.
|
||||||
|
private struct ProcessInputs {
|
||||||
|
/// Session folder; passed to the pipeline as `sessionFolder`.
let folder: URL
|
||||||
|
/// Session identifier; populated from the folder's last path component.
let sessionId: String
|
||||||
|
/// Label the session was started with ("manual" or the detected app's label).
let app: String
|
||||||
|
/// The recorder's mixed audio file for the session.
let mixedURL: URL
|
||||||
|
/// The recorder's self spans, used to build the timeline sent to the backend.
let selfSpans: [VADSpan]
|
||||||
|
}
|
||||||
|
private var lastProcess: ProcessInputs?
|
||||||
|
private var processTask: Task<Void, Never>?
|
||||||
private var recorder: AudioRecorder?
|
private var recorder: AudioRecorder?
|
||||||
private var currentFolder: URL?
|
private var currentFolder: URL?
|
||||||
private var startTime: Date?
|
private var startTime: Date?
|
||||||
private var timer: Timer?
|
private var timer: Timer?
|
||||||
|
/// True when the current session was started by call detection (not the user).
|
||||||
|
private var autoStarted = false
|
||||||
|
/// Set if a detected call ends while we're still in `.starting`.
|
||||||
|
private var pendingAutoStop = false
|
||||||
/// The in-flight start or stop Task, so `prepareForTermination` can await it.
|
/// The in-flight start or stop Task, so `prepareForTermination` can await it.
|
||||||
private var lifecycleTask: Task<Void, Never>?
|
private var lifecycleTask: Task<Void, Never>?
|
||||||
/// Bumped each time a start/stop Task is spawned (Task is a value type, so this
|
/// Bumped each time a start/stop Task is spawned (Task is a value type, so this
|
||||||
@@ -51,7 +82,64 @@ final class SessionController: ObservableObject {
|
|||||||
|
|
||||||
/// Wires the controller to `settings` and to call detection.
///
/// Sets up the voiceprint store under the output folder, registers `shared`,
/// forwards detector callbacks, mirrors the detector status for the UI, and
/// subscribes to the settings that affect detection and storage.
/// - Parameter settings: App settings; observed for output-folder and
///   auto-record changes.
init(settings: AppSettings) {
    self.settings = settings
    self.voiceprints = VoiceprintStore(
        fileURL: settings.outputFolderURL.appendingPathComponent("voiceprints.json"))
    SessionController.shared = self

    detector.onCallStart = { [weak self] app in self?.handleCallStart(app) }
    detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
    detector.$status
        .sink { [weak self] status in self?.detectionStatus = status }
        .store(in: &cancellables)

    // Re-point the voiceprint DB if the output folder changes. The in-flight
    // pipeline keeps its own captured reference, so this can't disrupt a run.
    // The URL is built from the emitted `path`: @Published emits from willSet,
    // so a value read back from `settings` at this point could still be the old
    // one (presumably why `settings.outputFolderURL` is not used here).
    settings.$outputFolderPath
        .dropFirst()
        .sink { [weak self] path in
            guard let self else { return }
            let dir = URL(fileURLWithPath: (path as NSString).expandingTildeInPath, isDirectory: true)
            self.voiceprints = VoiceprintStore(fileURL: dir.appendingPathComponent("voiceprints.json"))
        }
        .store(in: &cancellables)

    settings.$autoRecordOnDetection
        .sink { [weak self] on in
            guard let self else { return }
            if on {
                self.detector.enable()
            } else {
                self.detector.disable()
                // Don't leave an auto-started session running with no detector.
                // Turning detection off is treated exactly like the call ending:
                // handleCallEnd() stops a `.recording` auto session, defers the
                // stop for an in-flight `.starting` one, and never touches a
                // manual recording.
                self.handleCallEnd()
            }
        }
        .store(in: &cancellables)
}
|
||||||
|
|
||||||
|
// MARK: - Auto-detection
|
||||||
|
|
||||||
|
/// Detector callback: a call was detected in `app`.
///
/// Starts an auto-labelled recording only when auto-record is enabled and no
/// session is starting, recording, or finishing.
private func handleCallStart(_ app: CallDetector.DetectedApp) {
    if !settings.autoRecordOnDetection { return }
    switch state {
    case .starting, .recording, .finishing:
        return // don't disturb an active session
    case .idle, .error:
        start(label: app.label, auto: true)
    }
}
|
||||||
|
|
||||||
|
/// Detector callback: the detected call ended.
///
/// Only auto-stops a session that call detection started; a manual recording
/// is never touched.
private func handleCallEnd() {
    if !autoStarted { return }
    switch state {
    case .starting:
        // Too early to stop; the start path resolves this flag once it completes.
        pendingAutoStop = true
    case .recording:
        stop()
    case .idle, .error, .finishing:
        return
    }
}
|
||||||
|
|
||||||
var isBusy: Bool {
|
var isBusy: Bool {
|
||||||
@@ -68,15 +156,18 @@ final class SessionController: ObservableObject {
|
|||||||
|
|
||||||
// MARK: - Start / Stop
|
// MARK: - Start / Stop
|
||||||
|
|
||||||
private func start() {
|
private func start(label: String = "manual", auto: Bool = false) {
|
||||||
let folder: URL
|
let folder: URL
|
||||||
do {
|
do {
|
||||||
folder = try makeSessionFolder()
|
folder = try makeSessionFolder(label: label)
|
||||||
} catch {
|
} catch {
|
||||||
fail("Couldn't create session folder: \(error.localizedDescription)")
|
fail("Couldn't create session folder: \(error.localizedDescription)")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
currentFolder = folder
|
currentFolder = folder
|
||||||
|
currentLabel = label
|
||||||
|
autoStarted = auto
|
||||||
|
pendingAutoStop = false
|
||||||
let recorder = AudioRecorder(
|
let recorder = AudioRecorder(
|
||||||
micURL: folder.appendingPathComponent("mic.wav"),
|
micURL: folder.appendingPathComponent("mic.wav"),
|
||||||
systemURL: folder.appendingPathComponent("system.wav"),
|
systemURL: folder.appendingPathComponent("system.wav"),
|
||||||
@@ -92,9 +183,33 @@ final class SessionController: ObservableObject {
|
|||||||
self.state = .recording
|
self.state = .recording
|
||||||
self.startTime = Date()
|
self.startTime = Date()
|
||||||
self.startTimer()
|
self.startTimer()
|
||||||
} catch {
|
// A detected call may have ended while we were still starting.
|
||||||
self.fail("Couldn't start recording: \(error.localizedDescription)")
|
if self.pendingAutoStop {
|
||||||
|
self.pendingAutoStop = false
|
||||||
|
self.stop()
|
||||||
}
|
}
|
||||||
|
} catch {
|
||||||
|
self.handleStartFailure(error)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Map a recorder start failure to an actionable message. The common case is
/// Screen Recording getting re-checked after a rebuild (the SCStream auth
/// check fails even though CGPreflight reports granted), so re-prompt and open
/// the right Settings pane rather than show a cryptic TCC error.
///
/// Classification is a keyword match on the lowercased `localizedDescription`.
/// NOTE(review): "permission" would also match a non-screen permission error
/// (e.g. microphone) — confirm against the recorder's actual error strings.
/// - Parameter error: The error thrown while starting the recorder.
private func handleStartFailure(_ error: Error) {
    let description = error.localizedDescription
    let lowered = description.lowercased()
    let screenMarkers = ["declined", "tcc", "screen", "permission"]

    guard screenMarkers.contains(where: { lowered.contains($0) }) else {
        fail("Couldn't start recording: \(description)")
        return
    }

    // Looks like a Screen Recording problem: re-request access and take the
    // user straight to the relevant privacy pane.
    _ = CGRequestScreenCaptureAccess()
    if let pane = URL(string: "x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture") {
        NSWorkspace.shared.open(pane)
    }
    fail("Screen Recording needs re-approval for this build. Toggle Ten31Transcripts off then on in System Settings ▸ Screen Recording, then restart the app.")
}
|
||||||
|
|
||||||
@@ -114,20 +229,66 @@ final class SessionController: ObservableObject {
|
|||||||
micLevel = 0
|
micLevel = 0
|
||||||
systemLevel = 0
|
systemLevel = 0
|
||||||
warning = result.systemNote.map { "System audio stopped early: \($0)" }
|
warning = result.systemNote.map { "System audio stopped early: \($0)" }
|
||||||
|
transcriptStatus = .idle
|
||||||
if let folder = currentFolder {
|
if let folder = currentFolder {
|
||||||
writeSelfSpans(result, to: folder)
|
writeSelfSpans(result, to: folder)
|
||||||
lastSession = SessionInfo(
|
lastSession = SessionInfo(
|
||||||
folder: folder, mixedURL: result.mixedURL,
|
folder: folder, mixedURL: result.mixedURL,
|
||||||
duration: result.duration, selfSpanCount: result.selfSpans.count)
|
duration: result.duration, selfSpanCount: result.selfSpans.count)
|
||||||
|
lastProcess = ProcessInputs(
|
||||||
|
folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
|
||||||
|
mixedURL: result.mixedURL, selfSpans: result.selfSpans)
|
||||||
}
|
}
|
||||||
|
let autoSend = settings.autoSendOnStop
|
||||||
currentFolder = nil
|
currentFolder = nil
|
||||||
|
autoStarted = false
|
||||||
|
pendingAutoStop = false
|
||||||
elapsed = 0
|
elapsed = 0
|
||||||
state = .idle
|
state = .idle
|
||||||
|
if autoSend { processLastSession() }
|
||||||
|
}
|
||||||
|
|
||||||
|
// MARK: - Backend transcription
|
||||||
|
|
||||||
|
/// Token identifying the newest backend run. Bumped each time
/// `processLastSession()` spawns a task (Task is a value type, so a counter is
/// used to recognise "is this still the current run?"). Only the run holding
/// the latest token may write `transcriptStatus`.
private var processGeneration = 0

/// Send the last finished session to the backend → `speakers.json`. Uses the
/// mic-VAD self spans as the timeline for now; visual segments (Phase 3–4) get
/// merged in once the adapters land. Safe to call manually ("Send to backend")
/// or automatically on stop.
///
/// No-op when there is no finished session or the status is already
/// `.processing`. Finishing a newer recording resets the status to `.idle`
/// without stopping an earlier run, so an older task can still be in flight
/// when this is called again; the generation token keeps that older task's
/// progress and outcome from overwriting the status of the run started here.
/// The older run is left to finish so its own `speakers.json` is still written.
func processLastSession() {
    guard let inputs = lastProcess else { return }
    if case .processing = transcriptStatus { return }

    processGeneration += 1
    let generation = processGeneration
    transcriptStatus = .processing(0, 1)

    // Snapshot what the run needs so later settings changes can't affect it.
    let settings = self.settings
    let voiceprints = self.voiceprints
    processTask = Task {
        let pipeline = TranscriptPipeline(
            baseURL: settings.backendBaseURL,
            skipTLS: settings.skipTLSVerification,
            voiceprints: voiceprints)
        let timeline = TranscriptPipeline.timeline(
            fromSelfSpans: inputs.selfSpans, selfName: settings.selfName)

        let outcome: TranscriptStatus
        do {
            let speakers = try await pipeline.process(
                sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
                mixedURL: inputs.mixedURL, timeline: timeline,
                progress: { done, total in
                    await MainActor.run {
                        // Ignore progress from a run that has been superseded.
                        if generation == self.processGeneration {
                            self.transcriptStatus = .processing(done, total)
                        }
                    }
                })
            outcome = .done(speakers: speakers.speakers.count, segments: speakers.segments.count)
        } catch is CancellationError {
            outcome = .idle
        } catch {
            // A cancelled request can surface as a transport error rather than
            // CancellationError; that is still a cancellation, not a failure.
            outcome = Task.isCancelled ? .idle : .failed(error.localizedDescription)
        }
        if generation == self.processGeneration {
            self.transcriptStatus = outcome
        }
    }
}
|
||||||
|
|
||||||
private func fail(_ message: String) {
|
private func fail(_ message: String) {
|
||||||
recorder = nil
|
recorder = nil
|
||||||
currentFolder = nil
|
currentFolder = nil
|
||||||
|
autoStarted = false
|
||||||
|
pendingAutoStop = false
|
||||||
stopTimer()
|
stopTimer()
|
||||||
micLevel = 0
|
micLevel = 0
|
||||||
systemLevel = 0
|
systemLevel = 0
|
||||||
@@ -139,6 +300,9 @@ final class SessionController: ObservableObject {
|
|||||||
/// its WAV headers are finalized before the process exits. Handles quit while
|
/// its WAV headers are finalized before the process exits. Handles quit while
|
||||||
/// `.starting` and `.finishing`, not just `.recording`.
|
/// `.starting` and `.finishing`, not just `.recording`.
|
||||||
func prepareForTermination() async {
|
func prepareForTermination() async {
|
||||||
|
// Cancel any in-flight backend transcription (audio is already saved; the
|
||||||
|
// user can resend). The pipeline's checkCancellation + defer clean up chunks.
|
||||||
|
processTask?.cancel()
|
||||||
// Drain whatever lifecycle Task is in flight until nothing is busy. A Stop
|
// Drain whatever lifecycle Task is in flight until nothing is busy. A Stop
|
||||||
// click landing in an await window can spawn a new stop Task, so loop
|
// click landing in an await window can spawn a new stop Task, so loop
|
||||||
// rather than awaiting a single captured task.
|
// rather than awaiting a single captured task.
|
||||||
@@ -178,9 +342,9 @@ final class SessionController: ObservableObject {
|
|||||||
|
|
||||||
// MARK: - Files
|
// MARK: - Files
|
||||||
|
|
||||||
/// Create (and return) a fresh session folder under `<output>/sessions`, named
/// `<timestamp>_<label>`.
/// - Parameter label: Suffix for the folder name ("manual" or a detected app label).
/// - Throws: Any error from creating the directory.
private func makeSessionFolder(label: String) throws -> URL {
    let sessionsRoot = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true)
    let folderName = "\(Self.timestamp())_\(label)"
    let folder = sessionsRoot.appendingPathComponent(folderName, isDirectory: true)
    try FileManager.default.createDirectory(at: folder, withIntermediateDirectories: true)
    return folder
}
|
||||||
|
|||||||
@@ -0,0 +1,85 @@
|
|||||||
|
import Foundation
|
||||||
|
import AVFoundation
|
||||||
|
|
||||||
|
/// Splits a long session into backend-sized chunks and produces, per chunk, the
/// sliced audio and the timeline rebased to chunk-local seconds.
///
/// The diarizer caps at 4 speakers/chunk and has request limits, so calls > ~3
/// min are chunked into ~2–3 min windows; names + voiceprints unify speakers
/// across chunks (handled in the pipeline).
enum SessionPackager {
    /// One planned window of the session, in global (whole-recording) seconds.
    struct PlannedChunk: Equatable {
        let index: Int
        let start: Double   // global seconds
        let end: Double
    }

    /// One chunk if short; otherwise consecutive `chunkSeconds` windows, with the
    /// last window holding whatever remains (it may be shorter than the rest).
    /// - Parameters:
    ///   - durationSec: Total session length in seconds.
    ///   - chunkSeconds: Window length used once the session is split.
    ///   - thresholdSec: Sessions no longer than this stay in a single chunk.
    /// - Returns: Windows covering `[0, durationSec]` in order. A non-positive
    ///   `chunkSeconds` cannot advance through the audio, so it also yields the
    ///   single whole-session chunk instead of looping forever.
    static func planChunks(durationSec: Double,
                           chunkSeconds: Double = 150,
                           thresholdSec: Double = 180) -> [PlannedChunk] {
        guard durationSec > thresholdSec, chunkSeconds > 0 else {
            return [PlannedChunk(index: 0, start: 0, end: durationSec)]
        }
        var chunks: [PlannedChunk] = []
        var start = 0.0
        var index = 0
        // The 1 ms slack avoids emitting a vanishingly small trailing chunk
        // caused by floating-point accumulation.
        while start < durationSec - 0.001 {
            let end = min(start + chunkSeconds, durationSec)
            chunks.append(PlannedChunk(index: index, start: start, end: end))
            start = end
            index += 1
        }
        return chunks
    }

    /// Clip segments to `[start, end)` and rebase to chunk-local seconds, then
    /// emit the flat `label-merge` array `[{start,end,name,confidence}]`.
    /// Segments with no overlap with the window are dropped.
    /// - Throws: Any error from JSON serialization.
    static func rebasedTimelineData(_ segments: [VisualTimeline.Segment],
                                    start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = segments.compactMap { seg in
            let s = max(seg.start, start)
            let e = min(seg.end, end)
            guard e > s else { return nil }
            return ["start": s - start, "end": e - start, "name": seg.name, "confidence": seg.confidence]
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }

    /// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
    ///
    /// The output is 16-bit little-endian mono PCM at the source's sample rate.
    /// When the requested range is empty (after clamping to the file), nothing is
    /// written and `dest` is not created.
    /// - Throws: Any error from opening, reading, or writing the audio files.
    static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
        let input = try AVAudioFile(forReading: source)
        let sr = input.fileFormat.sampleRate
        // Clamp to the file on both sides: a negative start would otherwise be
        // assigned to `framePosition` below.
        let startFrame = max(0, AVAudioFramePosition((startSec * sr).rounded()))
        let endFrame = min(input.length, AVAudioFramePosition((endSec * sr).rounded()))
        guard endFrame > startFrame else { return }

        let settings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: sr,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsBigEndianKey: false,
        ]
        let output = try AVAudioFile(forWriting: dest, settings: settings,
                                     commonFormat: .pcmFormatFloat32, interleaved: false)
        input.framePosition = startFrame
        var remaining = AVAudioFrameCount(endFrame - startFrame)
        let block: AVAudioFrameCount = 16_000
        // One reusable buffer: each read overwrites it and sets `frameLength`.
        guard let buffer = AVAudioPCMBuffer(pcmFormat: input.processingFormat, frameCapacity: block) else { return }
        while remaining > 0 {
            try input.read(into: buffer, frameCount: min(block, remaining))
            if buffer.frameLength == 0 { break }   // hit end of file early
            try output.write(from: buffer)
            remaining -= buffer.frameLength
        }
    }

    /// Duration (seconds) of a WAV; 0 when the file can't be opened or reports a
    /// non-positive sample rate.
    static func duration(of url: URL) -> Double {
        guard let file = try? AVAudioFile(forReading: url), file.fileFormat.sampleRate > 0 else { return 0 }
        return Double(file.length) / file.fileFormat.sampleRate
    }
}
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// `speakers.json` — the final stored output (docs §6): per-chunk `label-merge`
/// results concatenated, timestamps offset back to global seconds, names unified.
/// This is the hand-off to the downstream summarizer; the app stops here.
///
/// JSON keys are snake_case (`session_id`, `duration_sec`, …) via `CodingKeys`.
struct SpeakersFile: Codable {
    let sessionId: String
    let app: String
    let durationSec: Double
    let speakers: [Speaker]
    let segments: [Segment]
    let models: [String: String]

    /// One speaker entry of the file.
    struct Speaker: Codable, Equatable {
        let name: String
        let source: String
        let overlapConfidence: Double?
        let matchSimilarity: Double?
        enum CodingKeys: String, CodingKey {
            case name, source
            case overlapConfidence = "overlap_confidence"
            case matchSimilarity = "match_similarity"
        }
    }

    /// One attributed span; `start`/`end` are in global seconds.
    struct Segment: Codable, Equatable {
        let start: Double
        let end: Double
        let speaker: String
        let text: String?
    }

    enum CodingKeys: String, CodingKey {
        case sessionId = "session_id"
        case app
        case durationSec = "duration_sec"
        case speakers, segments, models
    }

    /// Encode as pretty-printed JSON with sorted keys and write it to `url`.
    ///
    /// The write is atomic, so an interrupted write (crash, quit, full disk)
    /// cannot leave a truncated `speakers.json` for the downstream consumer; any
    /// existing file is replaced only once the new contents are complete.
    /// - Throws: Encoding errors, or any error from writing the file.
    func write(to url: URL) throws {
        let encoder = JSONEncoder()
        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
        try encoder.encode(self).write(to: url, options: .atomic)
    }
}
|
||||||
@@ -0,0 +1,78 @@
|
|||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// Concatenates per-chunk `label-merge` results into one global `speakers.json`:
/// segment times offset back to global seconds, speakers unified across chunks by
/// name, and fingerprints collected for the voiceprint store.
enum TranscriptAssembler {
    /// One backend response together with where its chunk starts in the session.
    struct ChunkResult {
        let chunkStart: Double          // global seconds
        let response: LabelMergeResponse
    }

    /// The assembled file plus the named fingerprints gathered along the way.
    struct Assembled {
        let speakersFile: SpeakersFile
        let fingerprints: [String: [Float]]   // name -> 192-dim, for VoiceprintStore
    }

    /// Source ranking when the same name appears across chunks with different
    /// sources: "visual" beats "voiceprint", which beats anything else (unmatched).
    private static func rank(_ source: String) -> Int {
        let ranking = ["visual": 3, "voiceprint": 2]
        return ranking[source] ?? 1
    }

    private static func isUnknown(_ name: String) -> Bool {
        LabelMergeResponse.isUnknownName(name)
    }

    /// Merge chunk responses into a single `SpeakersFile`.
    ///
    /// - Segments are shifted by their chunk's start and sorted by start time.
    /// - For each speaker name the entry with the highest-ranked source wins; on
    ///   equal rank the first one seen is kept.
    /// - Fingerprints are kept for named speakers only; later chunks overwrite
    ///   earlier ones, and a chunk's `fingerprints` map overrides its per-speaker
    ///   fingerprints.
    /// - `models` is taken from the last chunk; duration is the furthest chunk or
    ///   segment end seen.
    static func assemble(sessionId: String, app: String, chunks: [ChunkResult]) -> Assembled {
        var collected: [SpeakersFile.Segment] = []
        var speakerByName: [String: SpeakersFile.Speaker] = [:]
        var prints: [String: [Float]] = [:]
        var totalDuration = 0.0

        for chunk in chunks {
            let base = chunk.chunkStart
            let reply = chunk.response
            // Audio length from the chunk window, so silent/all-unknown calls
            // still report a real duration (not just the last segment's end).
            totalDuration = max(totalDuration, base + reply.duration)

            for seg in reply.segments {
                let shiftedEnd = seg.endSeconds + base
                collected.append(SpeakersFile.Segment(
                    start: seg.startSeconds + base, end: shiftedEnd,
                    speaker: seg.speaker, text: seg.text))
                totalDuration = max(totalDuration, shiftedEnd)
            }

            for sp in reply.speakers {
                // Every real rank is >= 1, so Int.min means "no entry yet".
                let incumbentRank = speakerByName[sp.name].map { rank($0.source) } ?? Int.min
                if rank(sp.source) > incumbentRank {
                    speakerByName[sp.name] = SpeakersFile.Speaker(
                        name: sp.name, source: sp.source,
                        overlapConfidence: sp.overlapConfidence, matchSimilarity: sp.matchSimilarity)
                }
                // Collect named fingerprints only (never Unknown_N / Speaker_unknown).
                if !isUnknown(sp.name), let fp = sp.fingerprint, !fp.isEmpty {
                    prints[sp.name] = fp
                }
            }

            for (name, fp) in reply.fingerprints {
                if !isUnknown(name) && !fp.isEmpty { prints[name] = fp }
            }
        }

        collected.sort { $0.start < $1.start }
        let file = SpeakersFile(
            sessionId: sessionId, app: app, durationSec: totalDuration,
            speakers: speakerByName.values.sorted { $0.name < $1.name },
            segments: collected,
            models: chunks.last?.response.models ?? [:])
        return Assembled(speakersFile: file, fingerprints: prints)
    }
}
|
||||||
@@ -0,0 +1,75 @@
|
|||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// Drives a finished session through the backend: chunk → sequential
/// `label-merge` (accumulating voiceprints) → assemble `speakers.json` → persist
/// fingerprints. Requests are sequential by construction (one chunk at a time).
final class TranscriptPipeline {
    private let client: SparkControlClient
    private let voiceprints: VoiceprintStore

    /// - Parameters:
    ///   - baseURL: Backend base URL, forwarded to `SparkControlClient`.
    ///   - skipTLS: Forwarded to `SparkControlClient`.
    ///   - voiceprints: Store that seeds the known voiceprints and receives every chunk response.
    init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
        self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
        self.voiceprints = voiceprints
    }

    /// Process `mixedURL` against `timeline` (visual + self spans). Writes
    /// `speakers.json` into `sessionFolder` and returns it. `progress(done,total)`
    /// is called per chunk, and once more with `(total, total)` at the end
    /// (`(0, 0)` for an empty session).
    ///
    /// - Throws: Cancellation, chunk-directory creation, audio slicing, timeline
    ///   encoding, backend and file-write errors are all propagated. The temporary
    ///   `chunks` directory is removed on every exit path.
    func process(sessionFolder: URL,
                 sessionId: String,
                 app: String,
                 mixedURL: URL,
                 timeline: [VisualTimeline.Segment],
                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
        let duration = SessionPackager.duration(of: mixedURL)
        let plan = SessionPackager.planChunks(durationSec: duration)
        let speakersURL = sessionFolder.appendingPathComponent("speakers.json")

        // Zero-duration / empty session → a valid empty speakers.json, no backend call.
        if plan.isEmpty || duration <= 0 {
            let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
            try empty.speakersFile.write(to: speakersURL)
            await progress?(0, 0)
            return empty.speakersFile
        }

        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
        // Registered before creation so a partially created directory is cleaned up too.
        defer { try? FileManager.default.removeItem(at: chunksDir) } // cleanup on success OR throw
        // Surface the real cause (permissions, disk full, …) instead of letting the
        // first slice fail later with a less helpful error.
        try FileManager.default.createDirectory(at: chunksDir, withIntermediateDirectories: true)

        // Start from stored voiceprints; accumulate this call's prints across chunks
        // for within-call unification (the store only persists high-confidence ones).
        var known = voiceprints.knownVoiceprints()
        var results: [TranscriptAssembler.ChunkResult] = []

        for chunk in plan {
            try Task.checkCancellation()
            await progress?(chunk.index, plan.count)

            let chunkName = "chunk_\(String(format: "%03d", chunk.index)).wav"
            let chunkURL = chunksDir.appendingPathComponent(chunkName)
            try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
            guard FileManager.default.fileExists(atPath: chunkURL.path) else { continue } // empty slice → skip

            let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
            let response = try await client.labelMerge(
                audioURL: chunkURL, timeline: timelineData,
                knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)

            // Named fingerprints from this chunk seed the next chunk's request.
            for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
                known[name] = fp
            }
            voiceprints.update(with: response)
            results.append(.init(chunkStart: chunk.start, response: response))
            try? FileManager.default.removeItem(at: chunkURL) // best effort; the defer removes the folder anyway
        }
        await progress?(plan.count, plan.count)

        let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
        try assembled.speakersFile.write(to: speakersURL)
        return assembled.speakersFile
    }

    /// Build the `label-merge` timeline from mic-VAD self spans (Phase 1/2). Once
    /// the visual adapters land (Phase 3–4), their segments are merged in too.
    ///
    /// Zero-length or inverted spans are dropped, matching
    /// `TimelineBuilder.mergeSelfSpans`.
    static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
        spans
            .filter { $0.end > $0.start }
            .map { .init(start: $0.start, end: $0.end, name: selfName, confidence: $0.confidence, source: "mic_vad") }
    }
}
|
||||||
@@ -32,6 +32,21 @@ final class AppSettings: ObservableObject {
|
|||||||
didSet { defaults.set(adapterEnabled, forKey: Keys.adapterEnabled) }
|
didSet { defaults.set(adapterEnabled, forKey: Keys.adapterEnabled) }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Whether recording starts automatically when a call is detected. Every change
/// is written to `defaults` under `Keys.autoRecord`; `init` falls back to `true`
/// when no value is stored.
@Published var autoRecordOnDetection: Bool {
|
||||||
|
didSet { defaults.set(autoRecordOnDetection, forKey: Keys.autoRecord) }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The user's name, pre-seeded into the timeline for mic-VAD "self" spans.
|
||||||
|
/// Every change is written to `defaults` under `Keys.selfName`; `init` falls
/// back to "Me" when no value is stored.
@Published var selfName: String {
|
||||||
|
didSet { defaults.set(selfName, forKey: Keys.selfName) }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Auto-send a finished recording to the backend for transcription. Default
|
||||||
|
/// off while developing; flip on for hands-free transcripts.
|
||||||
|
/// Every change is written to `defaults` under `Keys.autoSend`.
@Published var autoSendOnStop: Bool {
|
||||||
|
didSet { defaults.set(autoSendOnStop, forKey: Keys.autoSend) }
|
||||||
|
}
|
||||||
|
|
||||||
/// Output folder as a resolved file URL (expands a leading `~`).
|
/// Output folder as a resolved file URL (expands a leading `~`).
|
||||||
var outputFolderURL: URL {
|
var outputFolderURL: URL {
|
||||||
URL(fileURLWithPath: (outputFolderPath as NSString).expandingTildeInPath,
|
URL(fileURLWithPath: (outputFolderPath as NSString).expandingTildeInPath,
|
||||||
@@ -55,6 +70,10 @@ final class AppSettings: ObservableObject {
|
|||||||
self.adapterEnabled = stored ?? Dictionary(
|
self.adapterEnabled = stored ?? Dictionary(
|
||||||
uniqueKeysWithValues: Self.adapterKeys.map { ($0.key, true) }
|
uniqueKeysWithValues: Self.adapterKeys.map { ($0.key, true) }
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.autoRecordOnDetection = defaults.object(forKey: Keys.autoRecord) as? Bool ?? true
|
||||||
|
self.selfName = defaults.string(forKey: Keys.selfName) ?? "Me"
|
||||||
|
self.autoSendOnStop = defaults.object(forKey: Keys.autoSend) as? Bool ?? false
|
||||||
}
|
}
|
||||||
|
|
||||||
private enum Keys {
|
private enum Keys {
|
||||||
@@ -62,5 +81,8 @@ final class AppSettings: ObservableObject {
|
|||||||
static let skipTLS = "skipTLSVerification"
|
static let skipTLS = "skipTLSVerification"
|
||||||
static let outputFolder = "outputFolderPath"
|
static let outputFolder = "outputFolderPath"
|
||||||
static let adapterEnabled = "adapterEnabled"
|
static let adapterEnabled = "adapterEnabled"
|
||||||
|
static let autoRecord = "autoRecordOnDetection"
|
||||||
|
static let selfName = "selfName"
|
||||||
|
static let autoSend = "autoSendOnStop"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -46,6 +46,9 @@ struct MenuBarView: View {
|
|||||||
.foregroundStyle(.secondary)
|
.foregroundStyle(.secondary)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Text(detectionText)
|
||||||
|
.font(.caption)
|
||||||
|
.foregroundStyle(.secondary)
|
||||||
|
|
||||||
Button {
|
Button {
|
||||||
session.toggle()
|
session.toggle()
|
||||||
@@ -84,6 +87,15 @@ struct MenuBarView: View {
|
|||||||
.font(.caption)
|
.font(.caption)
|
||||||
}
|
}
|
||||||
.buttonStyle(.link)
|
.buttonStyle(.link)
|
||||||
|
|
||||||
|
HStack {
|
||||||
|
Button("Send to backend") { session.processLastSession() }
|
||||||
|
.disabled(transcriptProcessing)
|
||||||
|
Spacer()
|
||||||
|
}
|
||||||
|
if !transcriptText.isEmpty {
|
||||||
|
Text(transcriptText).font(.caption).foregroundStyle(transcriptColor)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -114,6 +126,36 @@ struct MenuBarView: View {
|
|||||||
return String(format: "%02d:%02d", total / 60, total % 60)
|
return String(format: "%02d:%02d", total / 60, total % 60)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Caption describing the current call-detection state.
private var detectionText: String {
    let caption: String
    switch session.detectionStatus {
    case .disabled:
        caption = "Auto-detect off"
    case .listening:
        caption = "Listening for calls…"
    case .inCall(let detectedApp):
        caption = "In call: \(detectedApp.display)"
    }
    return caption
}
|
||||||
|
|
||||||
|
/// `true` while a transcript is being produced (status is `.processing`).
private var transcriptProcessing: Bool {
    guard case .processing = session.transcriptStatus else { return false }
    return true
}
|
||||||
|
|
||||||
|
/// Status line for the transcript; empty while idle so the caller can hide it.
private var transcriptText: String {
    let message: String
    switch session.transcriptStatus {
    case .idle:
        message = ""
    case .processing(let done, let total):
        message = "Transcribing… chunk \(done)/\(total)"
    case .done(let speakerCount, let segmentCount):
        message = "Transcript ready · \(speakerCount) speakers · \(segmentCount) segments"
    case .failed(let reason):
        message = "Transcript failed: \(reason)"
    }
    return message
}
|
||||||
|
|
||||||
|
/// Colour for the transcript status line: red on failure, green when done,
/// secondary otherwise.
///
/// The cases are listed explicitly (no `default:`) so that adding a new
/// transcript status becomes a compile error here instead of silently
/// rendering as secondary.
private var transcriptColor: Color {
    switch session.transcriptStatus {
    case .failed:
        return .red
    case .done:
        return .green
    case .idle, .processing:
        return .secondary
    }
}
|
||||||
|
|
||||||
private var header: some View {
|
private var header: some View {
|
||||||
VStack(alignment: .leading, spacing: 2) {
|
VStack(alignment: .leading, spacing: 2) {
|
||||||
Text("Ten31 Transcripts").font(.headline)
|
Text("Ten31 Transcripts").font(.headline)
|
||||||
|
|||||||
@@ -14,6 +14,22 @@ struct SettingsView: View {
|
|||||||
isOn: $settings.skipTLSVerification)
|
isOn: $settings.skipTLSVerification)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Section("Call detection") {
|
||||||
|
Toggle("Auto-record when a call is detected", isOn: $settings.autoRecordOnDetection)
|
||||||
|
Text("Detects Zoom, Teams, Signal, and Google Meet (any browser).")
|
||||||
|
.font(.caption)
|
||||||
|
.foregroundStyle(.secondary)
|
||||||
|
}
|
||||||
|
|
||||||
|
Section("Transcription") {
|
||||||
|
TextField("Your name", text: $settings.selfName)
|
||||||
|
.textFieldStyle(.roundedBorder)
|
||||||
|
Toggle("Auto-send recordings to backend", isOn: $settings.autoSendOnStop)
|
||||||
|
Text("Your name labels the mic-VAD \"self\" spans. Auto-send transcribes each recording on stop.")
|
||||||
|
.font(.caption)
|
||||||
|
.foregroundStyle(.secondary)
|
||||||
|
}
|
||||||
|
|
||||||
Section("Output") {
|
Section("Output") {
|
||||||
HStack {
|
HStack {
|
||||||
Text(settings.outputFolderPath)
|
Text(settings.outputFolderPath)
|
||||||
|
|||||||
@@ -0,0 +1,82 @@
|
|||||||
|
import Foundation
|
||||||
|
import CoreGraphics
|
||||||
|
|
||||||
|
/// Renders a CGImage to an RGBA8 buffer once, then answers cheap colour queries
/// over pixel regions. Used to score the active-speaker highlight (a saturated
/// coloured border/ring) around participant tiles.
struct FrameSampler {
    let width: Int
    let height: Int
    private let pixels: [UInt8]   // RGBA8, row-major, top-left origin

    /// Draws `cgImage` into a freshly allocated RGBA8 buffer.
    ///
    /// Returns `nil` for an empty image or when the bitmap context cannot be created.
    init?(cgImage: CGImage) {
        let w = cgImage.width, h = cgImage.height
        guard w > 0, h > 0 else { return nil }
        var buffer = [UInt8](repeating: 0, count: w * h * 4)
        let colorSpace = CGColorSpaceCreateDeviceRGB()
        let info = CGImageAlphaInfo.premultipliedLast.rawValue
        // The pointer handed to the closure is only valid for the closure's duration,
        // so the context is created, drawn into and released inside it. Letting the
        // context escape and drawing afterwards would write through a dangling pointer.
        let drawn = buffer.withUnsafeMutableBytes { raw -> Bool in
            guard let ctx = CGContext(data: raw.baseAddress, width: w, height: h, bitsPerComponent: 8,
                                      bytesPerRow: w * 4, space: colorSpace, bitmapInfo: info) else {
                return false
            }
            ctx.draw(cgImage, in: CGRect(x: 0, y: 0, width: w, height: h))
            return true
        }
        guard drawn else { return nil }
        self.width = w
        self.height = h
        self.pixels = buffer
    }

    /// Mean HSV saturation (0…1) over a pixel rect (top-left origin), sampled on a grid.
    ///
    /// - Parameters:
    ///   - rect: Region in pixels; it is clipped to the frame. Null, infinite or NaN
    ///     rects, and rects that fall outside the frame, yield 0.
    ///   - samples: Approximate number of samples per axis. Values ≤ 0 sample every pixel.
    /// - Returns: The mean saturation of the sampled pixels, or 0 if none were sampled.
    func meanSaturation(inPixelRect rect: CGRect, samples: Int = 24) -> Double {
        guard let area = clippedBounds(of: rect) else { return 0 }
        // A non-positive `samples` would divide by zero (0) or is meaningless (< 0).
        let stepX = samples > 0 ? max(1, (area.x1 - area.x0) / samples) : 1
        let stepY = samples > 0 ? max(1, (area.y1 - area.y0) / samples) : 1
        var sum = 0.0, count = 0
        for y in stride(from: area.y0, to: area.y1, by: stepY) {
            for x in stride(from: area.x0, to: area.x1, by: stepX) {
                sum += sample(x: x, y: y).saturation
                count += 1
            }
        }
        return count > 0 ? sum / Double(count) : 0
    }

    /// Mean saturation of a ring just inside `rect`'s edges (the tile border),
    /// excluding the interior — that's where the speaking highlight lives.
    ///
    /// The four edge strips are scored separately and the highest score is returned.
    /// Strip thickness is `thicknessFraction` of the shorter side, at least 2 px.
    func borderSaturation(inPixelRect rect: CGRect, thicknessFraction: Double = 0.12) -> Double {
        let t = max(2.0, min(rect.width, rect.height) * thicknessFraction)
        let top = CGRect(x: rect.minX, y: rect.minY, width: rect.width, height: t)
        let bottom = CGRect(x: rect.minX, y: rect.maxY - t, width: rect.width, height: t)
        let left = CGRect(x: rect.minX, y: rect.minY, width: t, height: rect.height)
        let right = CGRect(x: rect.maxX - t, y: rect.minY, width: t, height: rect.height)
        return [top, bottom, left, right].map { meanSaturation(inPixelRect: $0) }.max() ?? 0
    }

    /// Grid-sampled pixel positions (top-left origin) that are strongly saturated
    /// AND bright enough to be a UI highlight — i.e. the speaking ring/border.
    ///
    /// - Parameters:
    ///   - threshold: Saturation (0…1) a pixel must exceed.
    ///   - minBrightness: Value (0…255) the brightest colour channel must exceed.
    ///   - gridStep: Sampling stride in pixels; values below 1 are treated as 1.
    func saturatedPoints(threshold: Double = 0.5, minBrightness: Double = 60, gridStep: Int = 6) -> [CGPoint] {
        // A zero stride would never advance and a negative one would index out of bounds.
        let step = max(1, gridStep)
        var points: [CGPoint] = []
        for y in stride(from: 0, to: height, by: step) {
            for x in stride(from: 0, to: width, by: step) {
                let px = sample(x: x, y: y)
                if px.saturation > threshold && px.maxChannel > minBrightness {
                    points.append(CGPoint(x: x, y: y))
                }
            }
        }
        return points
    }

    /// Saturation `(max − min) / max` and brightest channel of the pixel at `(x, y)`.
    private func sample(x: Int, y: Int) -> (saturation: Double, maxChannel: Double) {
        let i = (y * width + x) * 4
        let r = Double(pixels[i]), g = Double(pixels[i + 1]), b = Double(pixels[i + 2])
        let mx = max(r, g, b), mn = min(r, g, b)
        return (mx > 0 ? (mx - mn) / mx : 0, mx)
    }

    /// Clips `rect` to the frame and converts it to integer pixel bounds
    /// (`x0..<x1`, `y0..<y1`). Returns `nil` when nothing is left to sample.
    private func clippedBounds(of rect: CGRect) -> (x0: Int, x1: Int, y0: Int, y1: Int)? {
        let edges = [rect.minX, rect.maxX, rect.minY, rect.maxY]
        // `CGRect.null` has an infinite origin; Int(_:) traps on non-finite values.
        guard edges.allSatisfy({ $0.isFinite }) else { return nil }
        // Clamp in floating point first: Int(_:) also traps on values outside Int's range.
        let maxX = CGFloat(width), maxY = CGFloat(height)
        let x0 = Int(max(0, min(maxX, rect.minX)))
        let x1 = Int(max(0, min(maxX, rect.maxX)))
        let y0 = Int(max(0, min(maxY, rect.minY)))
        let y1 = Int(max(0, min(maxY, rect.maxY)))
        guard x1 > x0, y1 > y0 else { return nil }
        return (x0, x1, y0, y1)
    }
}
|
||||||
@@ -0,0 +1,94 @@
|
|||||||
|
import Foundation
|
||||||
|
import CoreGraphics
|
||||||
|
import CoreVideo
|
||||||
|
import CoreImage
|
||||||
|
|
||||||
|
/// Shared engine for tile-grid conferencing UIs (Signal/Zoom/Teams): OCR the
/// name/initials on each tile, then mark the active speaker(s) by the saturated
/// coloured highlight around their tile.
///
/// Geometry (`Config`) is a first pass; the exact tile expansion and saturation
/// threshold get calibrated per app against real screenshot fixtures. The
/// detection *logic* (read names; pick the highlighted tile) is validated with
/// synthetic frames.
struct GridCallAnalyzer {
    struct Config {
        var tileExpandX = 1.8         // grow text bbox → approx tile (for the reported bbox)
        var tileExpandY = 2.6
        var minTextConfidence: Float = 0.3
        var maxNameLength = 40
        /// Highlight detection: a name is "speaking" if enough strongly-saturated
        /// highlight pixels sit within `highlightRadiusFraction` of its label.
        var highlightRadiusFraction = 0.22   // of max(frame W,H)
        var minHighlightPoints = 6
        var highlightShareOfMax = 0.35       // must be ≥ this fraction of the busiest tile
    }

    var config = Config()
    var recognizer = TextRecognizer()

    /// Converts `pixelBuffer` to a `CGImage` and analyzes it; returns `[]` if the
    /// conversion fails.
    func analyze(pixelBuffer: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation] {
        guard let cg = Self.cgImage(from: pixelBuffer) else { return [] }
        return analyze(cgImage: cg, at: t)
    }

    /// Reads the name labels in `cgImage` and flags the ones with a highlight nearby.
    ///
    /// - Returns: One observation per accepted label, in recognizer order. Empty
    ///   when no label passes the confidence/length filters or the frame cannot
    ///   be sampled.
    func analyze(cgImage: CGImage, at t: TimeInterval) -> [SpeakerObservation] {
        // Clean each OCR string once; drop low-confidence, empty and over-long text.
        let labels = recognizer.recognize(in: cgImage).compactMap { r -> (name: String, box: CGRect, conf: Double)? in
            guard r.confidence >= config.minTextConfidence else { return nil }
            let name = cleaned(r.text)
            return name.isEmpty ? nil : (name, r.boundingBox, Double(r.confidence))
        }
        guard !labels.isEmpty, let sampler = FrameSampler(cgImage: cgImage) else { return [] }

        let w = cgImage.width, h = cgImage.height

        // Find highlight pixels once, then count for each label those lying within
        // `radius` of its centre. A pixel may count towards several nearby labels.
        let points = sampler.saturatedPoints()
        let radius = Double(max(w, h)) * config.highlightRadiusFraction
        let r2 = radius * radius
        let counts = labels.map { label -> Int in
            let center = labelCenter(label.box, imageW: w, imageH: h)
            return points.reduce(0) { acc, p in
                let dx = Double(p.x) - Double(center.x), dy = Double(p.y) - Double(center.y)
                return acc + (dx * dx + dy * dy <= r2 ? 1 : 0)
            }
        }
        let maxCount = counts.max() ?? 0
        // `need` is never below `minHighlightPoints`, so a label that reaches it also
        // proves the busiest label cleared the minimum — no separate check required.
        let need = max(config.minHighlightPoints, Int(Double(maxCount) * config.highlightShareOfMax))

        return zip(labels, counts).map { label, count in
            SpeakerObservation(name: label.name, speaking: count >= need,
                               bbox: tileRect(label.box, imageW: w, imageH: h),
                               confidence: label.conf, t: t)
        }
    }

    /// Centre of a Vision normalized bbox (bottom-left origin) in pixels (top-left origin).
    private func labelCenter(_ box: CGRect, imageW: Int, imageH: Int) -> CGPoint {
        CGPoint(x: box.midX * CGFloat(imageW),
                y: (1 - box.midY) * CGFloat(imageH))   // flip Y to top-left origin
    }

    /// Vision normalized bbox (bottom-left origin) → pixel tile rect (top-left),
    /// expanded around the text centre to approximate the whole tile and clipped
    /// to the image. Returns `.zero` if nothing of the tile lies inside the image.
    private func tileRect(_ box: CGRect, imageW: Int, imageH: Int) -> CGRect {
        let W = CGFloat(imageW), H = CGFloat(imageH)
        let center = labelCenter(box, imageW: imageW, imageH: imageH)
        let nw = box.width * W * CGFloat(config.tileExpandX)
        let nh = box.height * H * CGFloat(config.tileExpandY)
        let rect = CGRect(x: center.x - nw / 2, y: center.y - nh / 2, width: nw, height: nh)
        let clipped = rect.intersection(CGRect(x: 0, y: 0, width: W, height: H))
        // `intersection` yields `CGRect.null` (infinite origin) when the rects are
        // disjoint; never hand that to consumers of the observation.
        return clipped.isNull ? .zero : clipped
    }

    /// Trims whitespace; returns "" for text longer than `config.maxNameLength`.
    private func cleaned(_ s: String) -> String {
        let t = s.trimmingCharacters(in: .whitespacesAndNewlines)
        return t.count <= config.maxNameLength ? t : ""
    }

    private static let ciContext = CIContext()

    /// Renders `pixelBuffer` into a `CGImage` through the shared `CIContext`.
    static func cgImage(from pixelBuffer: CVPixelBuffer) -> CGImage? {
        let ci = CIImage(cvPixelBuffer: pixelBuffer)
        return ciContext.createCGImage(ci, from: ci.extent)   // reuse; allocating per frame is costly
    }
}
|
||||||
@@ -0,0 +1,36 @@
|
|||||||
|
import Foundation
|
||||||
|
import CoreGraphics
|
||||||
|
import CoreVideo
|
||||||
|
|
||||||
|
/// One per-frame observation from an app adapter: a participant tile, whether its
|
||||||
|
/// active-speaker cue is showing, and where it is. `name` may be a full name,
|
||||||
|
/// initials (Signal), or "" if unknown. `t` is seconds relative to the session t0.
|
||||||
|
struct SpeakerObservation: Equatable {
|
||||||
|
/// Participant label as read from the tile; "" when unknown.
let name: String
|
||||||
|
/// Whether the tile's active-speaker cue was showing in this frame.
let speaking: Bool
|
||||||
|
/// Where the tile is. `GridCallAnalyzer` reports a pixel rect with a top-left
/// origin; NOTE(review): confirm other adapters use the same convention.
let bbox: CGRect
|
||||||
|
let confidence: Double // 0…1
|
||||||
|
/// Frame time in seconds relative to the session t0.
let t: TimeInterval
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Per-app screen-reading strategy. Each conferencing app gets one implementation
|
||||||
|
/// that knows that app's tile layout, name placement, and active-speaker cue.
|
||||||
|
/// Adapters must be testable offline against still-image fixtures.
|
||||||
|
protocol AppAdapter {
|
||||||
|
/// Bundle identifiers of the app(s) this adapter handles.
static var bundleIDs: [String] { get }
|
||||||
|
/// Version string of the adapter. NOTE(review): presumably recorded alongside
/// the timeline so results can be traced to an adapter revision — confirm.
var adapterVersion: String { get }
|
||||||
|
/// Frame rate the adapter would like to be fed at; the protocol extension
/// supplies a default of 3.
var preferredFPS: Int { get }
|
||||||
|
|
||||||
|
/// Analyze one frame; return the speakers visible and whether each is speaking.
|
||||||
|
/// Must process in-memory and never persist the frame.
|
||||||
|
func analyze(frame: CVPixelBuffer, at t: TimeInterval) -> [SpeakerObservation]
|
||||||
|
|
||||||
|
/// Optional: participant names from the app's Accessibility tree (Electron
|
||||||
|
/// apps like Signal expose these), preferred over OCR when available.
|
||||||
|
/// The protocol extension's default returns `nil`.
func namesFromAccessibility() -> [String]?
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Defaults for the optional parts of `AppAdapter`.
extension AppAdapter {
|
||||||
|
/// Default: no Accessibility-derived names.
func namesFromAccessibility() -> [String]? { nil }
|
||||||
|
/// Default sampling rate: 3 frames per second.
var preferredFPS: Int { 3 }
|
||||||
|
}
|
||||||
@@ -0,0 +1,59 @@
|
|||||||
|
import Foundation
|
||||||
|
import Vision
|
||||||
|
import CoreVideo
|
||||||
|
import CoreGraphics
|
||||||
|
|
||||||
|
/// Thin wrapper over Vision's text recognition, used by adapters to read names /
/// initials off participant tiles. Works on any frame, so adapters can be
/// developed against still images.
struct TextRecognizer {
    struct Result {
        let text: String
        let confidence: Float
        /// Normalized Vision bounding box (origin bottom-left, 0…1).
        let boundingBox: CGRect
    }

    var recognitionLevel: VNRequestTextRecognitionLevel = .accurate
    var minimumTextHeight: Float = 0          // 0 = Vision default
    var usesLanguageCorrection = false        // names/initials aren't dictionary words

    /// Recognize text in `pixelBuffer`, optionally limited to a normalized region
    /// of interest (origin bottom-left, matching Vision's coordinate space).
    ///
    /// - Returns: The top candidate of every text observation; `[]` if Vision fails.
    func recognize(in pixelBuffer: CVPixelBuffer, regionOfInterest: CGRect? = nil) -> [Result] {
        run(VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:]),
            regionOfInterest: regionOfInterest)
    }

    /// Convenience for fixtures/tests: recognize text in a CGImage.
    ///
    /// - Returns: The top candidate of every text observation; `[]` if Vision fails.
    func recognize(in cgImage: CGImage, regionOfInterest: CGRect? = nil) -> [Result] {
        run(VNImageRequestHandler(cgImage: cgImage, options: [:]),
            regionOfInterest: regionOfInterest)
    }

    /// Shared path for both overloads: configures one text request from this
    /// recognizer's settings, performs it on `handler`, and maps the observations.
    private func run(_ handler: VNImageRequestHandler, regionOfInterest: CGRect?) -> [Result] {
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = recognitionLevel
        request.usesLanguageCorrection = usesLanguageCorrection
        if minimumTextHeight > 0 { request.minimumTextHeight = minimumTextHeight }
        if let roi = regionOfInterest { request.regionOfInterest = roi }

        do {
            try handler.perform([request])
        } catch {
            // A frame that cannot be read contributes no names; callers treat it as empty.
            return []
        }

        guard let observations = request.results else { return [] }
        return observations.compactMap { obs in
            guard let top = obs.topCandidates(1).first else { return nil }
            return Result(text: top.string, confidence: top.confidence, boundingBox: obs.boundingBox)
        }
    }
}
|
||||||
@@ -0,0 +1,127 @@
|
|||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// Turns noisy per-frame `SpeakerObservation`s into clean
/// `(start, end, name, confidence)` segments.
///
/// - Hysteresis: open a segment after `openFrames` consecutive speaking frames,
///   close after `closeFrames` quiet frames — rides out UI-cue lag/flicker.
/// - Overlaps allowed: each name is tracked independently (crosstalk).
/// - mic-VAD "self" spans are merged in as high-confidence segments.
/// - OCR name variants are normalized via an alias table.
///
/// Pure logic, no UI/capture deps — fully unit-testable offline.
final class TimelineBuilder {
    private let openFrames: Int
    private let closeFrames: Int
    private var aliases: [String: String] = [:]      // normalized variant -> canonical
    /// Per-name hysteresis state. Only names that are mid-run or have an open
    /// segment are kept, so one-off OCR misreads do not accumulate.
    private var states: [String: NameState] = [:]
    private(set) var segments: [VisualTimeline.Segment] = []

    /// - Parameters:
    ///   - openFrames: Consecutive speaking frames needed to open a segment (minimum 1).
    ///   - closeFrames: Consecutive quiet frames needed to close it (minimum 1).
    init(openFrames: Int = 2, closeFrames: Int = 2) {
        self.openFrames = max(1, openFrames)
        self.closeFrames = max(1, closeFrames)
    }

    /// Register that `variant` (e.g. "Sarah J") should map to `canonical`
    /// (e.g. "Sarah Jones"). Matching ignores case and surrounding whitespace.
    func addAlias(_ variant: String, canonical: String) {
        aliases[Self.normalize(variant)] = canonical
    }

    /// Ingest one frame's observations (all sharing time `t`). Names not present
    /// (or present but not speaking) count as a quiet frame for any open segment.
    func ingest(_ observations: [SpeakerObservation], at t: TimeInterval) {
        // Best confidence per canonical name that is speaking this frame.
        var speaking: [String: Double] = [:]
        for obs in observations where obs.speaking && !obs.name.isEmpty {
            let name = canonical(obs.name)
            // Whitespace-only labels canonicalize to ""; never emit a nameless segment.
            guard !name.isEmpty else { continue }
            speaking[name] = max(speaking[name] ?? 0, obs.confidence)
        }

        for name in Set(states.keys).union(speaking.keys) {
            var st = states[name] ?? NameState()
            if let conf = speaking[name] {
                if st.voiced == 0 { st.runStart = t }
                st.voiced += 1
                st.silent = 0
                st.lastVoicedT = t
                if !st.open && st.voiced >= openFrames {
                    // The segment starts where the speaking run began, not where it was confirmed.
                    st.open = true
                    st.segStart = st.runStart
                    st.confSum = 0
                    st.confN = 0
                }
                if st.open { st.confSum += conf; st.confN += 1 }
            } else {
                st.silent += 1
                st.voiced = 0
                if st.open && st.silent >= closeFrames {
                    closeSegment(name: name, state: st)
                    st.open = false
                }
            }

            if st.open || st.voiced > 0 {
                states[name] = st
            } else {
                // Closed and quiet: indistinguishable from a fresh state, so drop it
                // rather than re-scanning it on every future frame.
                states[name] = nil
            }
        }
    }

    /// Merge mic-VAD self spans (the user) as high-confidence segments.
    /// Zero-length and inverted spans are ignored.
    func mergeSelfSpans(_ spans: [VADSpan], selfName: String) {
        for span in spans where span.end > span.start {
            segments.append(.init(start: span.start, end: span.end,
                                  name: selfName, confidence: span.confidence, source: "mic_vad"))
        }
    }

    /// Force-close any open segments (used when a visual gap begins, so a
    /// segment isn't carried across the gap).
    ///
    /// Each segment ends at the last frame in which its name was seen speaking;
    /// `t` itself is not used as the end time.
    func closeOpenSegments(at t: TimeInterval) {
        // `states` is a value type: the loop iterates a snapshot, so removing is safe.
        for (name, st) in states where st.open {
            closeSegment(name: name, state: st)
            states[name] = nil   // same effect as resetting open/voiced/silent
        }
    }

    /// Close any still-open segments at end of capture and sort all segments by start.
    func finish() {
        for (name, st) in states where st.open {
            closeSegment(name: name, state: st)
            states[name]?.open = false
        }
        segments.sort { $0.start < $1.start }
    }

    // MARK: - Internal

    private struct NameState {
        var voiced = 0               // consecutive speaking frames
        var silent = 0               // consecutive quiet frames
        var open = false             // a segment is currently open
        var runStart: Double = 0     // time of the first frame of the current speaking run
        var segStart: Double = 0     // start time of the open segment
        var lastVoicedT: Double = 0  // time of the most recent speaking frame
        var confSum: Double = 0      // sum of confidences since the segment opened
        var confN = 0                // number of confidences in `confSum`
    }

    /// Appends a "vision" segment from `segStart` to the last voiced frame, with
    /// the mean confidence seen while open. Zero-length segments are dropped.
    private func closeSegment(name: String, state st: NameState) {
        guard st.lastVoicedT > st.segStart else { return }
        let confidence = st.confN > 0 ? st.confSum / Double(st.confN) : 0.8
        segments.append(.init(start: st.segStart, end: st.lastVoicedT,
                              name: name, confidence: confidence, source: "vision"))
    }

    /// Alias lookup; falls back to the trimmed raw name.
    private func canonical(_ raw: String) -> String {
        let key = Self.normalize(raw)
        return aliases[key] ?? raw.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    private static func normalize(_ s: String) -> String {
        s.lowercased().trimmingCharacters(in: .whitespacesAndNewlines)
    }
}
|
||||||
@@ -0,0 +1,131 @@
|
|||||||
|
import Foundation
|
||||||
|
import ScreenCaptureKit
|
||||||
|
import CoreMedia
|
||||||
|
import QuartzCore
|
||||||
|
import AppKit
|
||||||
|
|
||||||
|
/// Window-scoped visual capture: streams the call window's own rendered content
|
||||||
|
/// at ~`fps`, hands each frame to the app adapter, and **releases it immediately
|
||||||
|
/// — frames are never written to disk**. Builds the speaker timeline and records
|
||||||
|
/// `visual_gap`s when the window is minimized (SCK delivers non-live frames).
|
||||||
|
///
|
||||||
|
/// Window visibility/focus is NOT required — SCK captures a window even when it's
|
||||||
|
/// occluded or on another Space; only minimization freezes the backing buffer.
|
||||||
|
@available(macOS 13.0, *)
|
||||||
|
final class VisualObserver: NSObject, SCStreamDelegate, SCStreamOutput {
|
||||||
|
private let bundleID: String
|
||||||
|
private let adapter: any AppAdapter
|
||||||
|
private let t0Host: Double
|
||||||
|
private let fps: Int
|
||||||
|
private let queue = DispatchQueue(label: "xyz.ten31.visual")
|
||||||
|
|
||||||
|
private var stream: SCStream?
|
||||||
|
private let builder = TimelineBuilder()
|
||||||
|
private var gaps: [VisualTimeline.Gap] = []
|
||||||
|
private var gapStart: Double?
|
||||||
|
|
||||||
|
/// Optional live hook (e.g. for a debug HUD). Observations only; no frame.
|
||||||
|
var onObservations: (([SpeakerObservation], TimeInterval) -> Void)?
|
||||||
|
|
||||||
|
init(bundleID: String, adapter: any AppAdapter, t0Host: Double, fps: Int = 3) {
|
||||||
|
self.bundleID = bundleID
|
||||||
|
self.adapter = adapter
|
||||||
|
self.t0Host = t0Host
|
||||||
|
self.fps = max(1, fps)
|
||||||
|
}
|
||||||
|
|
||||||
|
func start() async throws {
|
||||||
|
let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
|
||||||
|
// The call window: the largest window owned by the target app.
|
||||||
|
let candidates = content.windows.filter { $0.owningApplication?.bundleIdentifier == bundleID }
|
||||||
|
guard let window = candidates.max(by: { $0.frame.width * $0.frame.height < $1.frame.width * $1.frame.height }) else {
|
||||||
|
throw NSError(domain: "Ten31", code: 2,
|
||||||
|
userInfo: [NSLocalizedDescriptionKey: "No \(bundleID) window to capture."])
|
||||||
|
}
|
||||||
|
|
||||||
|
let filter = SCContentFilter(desktopIndependentWindow: window)
|
||||||
|
let config = SCStreamConfiguration()
|
||||||
|
config.minimumFrameInterval = CMTime(value: 1, timescale: CMTimeScale(fps))
|
||||||
|
config.queueDepth = 3
|
||||||
|
config.showsCursor = false
|
||||||
|
config.pixelFormat = kCVPixelFormatType_32BGRA
|
||||||
|
// window.frame is in points; capture at native pixels so OCR can read small
|
||||||
|
// initials/names (a half-res Retina capture badly hurts recognition).
|
||||||
|
let scale = NSScreen.main?.backingScaleFactor ?? 2
|
||||||
|
config.width = max(2, Int(window.frame.width * scale))
|
||||||
|
config.height = max(2, Int(window.frame.height * scale))
|
||||||
|
|
||||||
|
let stream = SCStream(filter: filter, configuration: config, delegate: self)
|
||||||
|
try stream.addStreamOutput(self, type: .screen, sampleHandlerQueue: queue)
|
||||||
|
try await stream.startCapture()
|
||||||
|
self.stream = stream
|
||||||
|
}
|
||||||
|
|
||||||
|
func stop() async -> (segments: [VisualTimeline.Segment], gaps: [VisualTimeline.Gap]) {
|
||||||
|
if let stream { try? await stream.stopCapture() }
|
||||||
|
stream = nil
|
||||||
|
return queue.sync {
|
||||||
|
if let gs = gapStart {
|
||||||
|
gaps.append(.init(start: gs, end: CACurrentMediaTime() - t0Host, reason: "minimized"))
|
||||||
|
gapStart = nil
|
||||||
|
}
|
||||||
|
builder.finish()
|
||||||
|
return (builder.segments, gaps)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Merge mic-VAD self spans into the visual timeline (call before `stop`'s read,
|
||||||
|
/// or fold in afterwards in the packager).
|
||||||
|
func addSelfSpans(_ spans: [VADSpan], selfName: String) {
|
||||||
|
queue.sync { builder.mergeSelfSpans(spans, selfName: selfName) }
|
||||||
|
}
|
||||||
|
|
||||||
|
// MARK: - SCStreamOutput (on `queue`)
|
||||||
|
|
||||||
|
/// Handles one delivered sample buffer.
///
/// Only ready `.screen` buffers are considered. The frame is classified via
/// `frameKind(_:)` and then:
/// - live: any open gap is closed, the frame is analyzed by the adapter, and
///   the observations are fed to the builder and to `onObservations`;
/// - gap: a gap is opened once and open speaker segments are closed;
/// - idle: nothing happens.
func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
            of type: SCStreamOutputType) {
    guard type == .screen else { return }
    guard CMSampleBufferDataIsReady(sampleBuffer) else { return }

    // Session-relative timestamp for this frame.
    let timestamp = CACurrentMediaTime() - t0Host

    switch frameKind(sampleBuffer) {
    case .live:
        // Content is flowing again: close a gap that was opened while frozen.
        if let openGapStart = gapStart {
            let closedGap = VisualTimeline.Gap(start: openGapStart, end: timestamp, reason: "minimized")
            gaps.append(closedGap)
            gapStart = nil
        }
        guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
        // The frame is only used for this analysis; it is released after this scope.
        let observations = adapter.analyze(frame: pixelBuffer, at: timestamp)
        builder.ingest(observations, at: timestamp)
        onObservations?(observations, timestamp)

    case .gap:
        // Minimized/blanked: the backing buffer is frozen. Open a gap only once
        // and close open speaker segments so none is carried across it.
        guard gapStart == nil else { return }
        gapStart = timestamp
        builder.closeOpenSegments(at: timestamp)

    case .idle:
        // Window is live but static (no pixel change): no new info, not a gap.
        break
    }
}
|
||||||
|
|
||||||
|
/// Stream-stopped-with-error callback. Currently a no-op: the error is dropped
/// and no observer state is touched here.
// NOTE(review): an unexpected stop is neither logged nor recorded as a visual gap,
// and `stream` is left set — confirm this is intended.
func stream(_ stream: SCStream, didStopWithError error: Error) {}
|
||||||
|
|
||||||
|
/// How a delivered frame is treated by the sample-buffer handler.
private enum FrameKind {
    /// Fresh content: the frame is analyzed for speaker observations.
    case live
    /// Visible but unchanged window: skipped, and not counted as a gap.
    case idle
    /// Frozen or blank backing buffer: opens a visual gap.
    case gap
}
|
||||||
|
|
||||||
|
/// Classifies a sample buffer by its ScreenCaptureKit frame status.
///
/// - `.complete` → `.live` (SCK delivers it only when content changes).
/// - `.idle` → `.idle` (static but visible window).
/// - `.started` → `.idle`: this is the first frame after the stream starts, not
///   a frozen window, so it must not open a "minimized" gap at session start.
/// - `.blank` / `.suspended` / `.stopped` → `.gap` (frozen backing buffer).
/// - Statuses added by future SDKs are treated conservatively as `.gap`.
///
/// A buffer with no readable status attachment is treated as `.live`.
private func frameKind(_ sampleBuffer: CMSampleBuffer) -> FrameKind {
    guard let attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, createIfNecessary: false)
            as? [[SCStreamFrameInfo: Any]],
          let raw = attachments.first?[.status] as? Int,
          let status = SCFrameStatus(rawValue: raw) else { return .live }

    switch status {
    case .complete:
        return .live
    case .idle, .started:
        return .idle
    case .blank, .suspended, .stopped:
        return .gap
    @unknown default:
        return .gap
    }
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,72 @@
|
|||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// `visual_timeline.json` (schema 1.1) — the app's primary visual output. Times
/// are seconds relative to session t0. Segments may overlap (crosstalk).
struct VisualTimeline: Codable {
    var schemaVersion = "1.1"
    let sessionId: String
    let app: String
    let adapterVersion: String
    let t0Unix: Double
    let durationSec: Double
    let fpsSampled: Int
    let selfName: String?
    let participants: [Participant]
    let segments: [Segment]
    let visualGaps: [Gap]

    /// One call participant; `isSelf` and `aliases` are optional in the JSON.
    struct Participant: Codable {
        let name: String
        let isSelf: Bool?
        let aliases: [String]?

        enum CodingKeys: String, CodingKey {
            case name
            case isSelf = "is_self"
            case aliases
        }
    }

    /// A span during which `name` was observed speaking.
    struct Segment: Codable, Equatable {
        let start: Double
        let end: Double
        let name: String
        let confidence: Double
        let source: String // vision | accessibility | fused | mic_vad
    }

    /// A span with no usable visual signal.
    struct Gap: Codable, Equatable {
        let start: Double
        let end: Double
        let reason: String // minimized | tab_switched
    }

    enum CodingKeys: String, CodingKey {
        case schemaVersion = "schema_version"
        case sessionId = "session_id"
        case app
        case adapterVersion = "adapter_version"
        case t0Unix = "t0_unix"
        case durationSec = "duration_sec"
        case fpsSampled = "fps_sampled"
        case selfName = "self_name"
        case participants
        case segments
        case visualGaps = "visual_gaps"
    }

    /// Write the rich `visual_timeline.json` (pretty-printed, sorted keys).
    ///
    /// The file is written atomically so an interrupted write cannot leave a
    /// truncated timeline at `url`.
    /// - Throws: encoding errors from `JSONEncoder` or file-system errors from the write.
    func write(to url: URL) throws {
        let encoder = JSONEncoder()
        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
        try encoder.encode(self).write(to: url, options: .atomic)
    }

    /// The flat array `label-merge` wants: `[{start,end,name,confidence}]`,
    /// dropping `source`. Slice/rebase to chunk-local seconds happens in Phase 5.
    ///
    /// - Throws: `EncodingError.invalidValue` when a segment holds a value JSON
    ///   cannot represent (NaN or infinity), or any `JSONSerialization` error.
    func flatTimelineData() throws -> Data {
        let flat = segments.map { seg -> [String: Any] in
            ["start": seg.start, "end": seg.end, "name": seg.name, "confidence": seg.confidence]
        }
        // `JSONSerialization.data(withJSONObject:)` raises an Objective-C exception
        // (not a Swift error) for NaN/infinite numbers, so validate first and
        // surface the problem as a catchable error instead of crashing.
        guard JSONSerialization.isValidJSONObject(flat) else {
            let context = EncodingError.Context(
                codingPath: [],
                debugDescription: "Timeline segments contain a value that is not representable in JSON")
            throw EncodingError.invalidValue(flat, context)
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }
}
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
import XCTest
|
||||||
|
import CoreGraphics
|
||||||
|
import CoreText
|
||||||
|
@testable import Ten31Transcripts
|
||||||
|
|
||||||
|
/// Exercises the visual adapter on synthetic call frames, so no real
/// screenshots are required: OCR anchors the tiles, and the highlight must be
/// attributed to the right speaker and follow it when it moves.
final class GridCallAnalyzerTests: XCTestCase {

    /// Draws `s` in white Helvetica-Bold, positioned from the line bounds so it
    /// sits roughly centred on `center`.
    private func drawText(_ s: String, _ ctx: CGContext, center: CGPoint, size: CGFloat) {
        let boldFont = CTFontCreateWithName("Helvetica-Bold" as CFString, size, nil)
        let white = CGColor(red: 1, green: 1, blue: 1, alpha: 1)
        let attributes = [kCTFontAttributeName: boldFont,
                          kCTForegroundColorAttributeName: white] as CFDictionary
        let attributed = CFAttributedStringCreate(nil, s as CFString, attributes)!
        let line = CTLineCreateWithAttributedString(attributed)
        let bounds = CTLineGetBoundsWithOptions(line, [])
        let origin = CGPoint(x: center.x - bounds.width / 2, y: center.y - bounds.height / 2)
        ctx.textPosition = origin
        CTLineDraw(line, ctx)
    }

    /// Renders an 800×600 frame with a 2×2 grid of named tiles; the tile at
    /// `speakingIndex` gets a thick green ring.
    private func frame(speakingIndex: Int) -> CGImage {
        let width = 800
        let height = 600
        let ctx = CGContext(data: nil, width: width, height: height, bitsPerComponent: 8, bytesPerRow: 0,
                            space: CGColorSpaceCreateDeviceRGB(),
                            bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue)!

        // Dark window background.
        ctx.setFillColor(CGColor(red: 0.1, green: 0.1, blue: 0.12, alpha: 1))
        ctx.fill(CGRect(x: 0, y: 0, width: width, height: height))

        // CoreGraphics origin is bottom-left, so y = 320 is the upper row.
        let tiles: [(name: String, rect: CGRect)] = [
            (name: "GRANT", rect: CGRect(x: 40, y: 320, width: 340, height: 230)),
            (name: "SARAH", rect: CGRect(x: 420, y: 320, width: 340, height: 230)),
            (name: "DMITRI", rect: CGRect(x: 40, y: 50, width: 340, height: 230)),
            (name: "ALEX", rect: CGRect(x: 420, y: 50, width: 340, height: 230)),
        ]

        for (index, tile) in tiles.enumerated() {
            ctx.setFillColor(CGColor(red: 0.18, green: 0.18, blue: 0.2, alpha: 1))
            ctx.fill(tile.rect)

            if index == speakingIndex {
                // Saturated green ring marks the active speaker.
                ctx.setStrokeColor(CGColor(red: 0.1, green: 0.85, blue: 0.2, alpha: 1))
                ctx.setLineWidth(14)
                ctx.stroke(tile.rect.insetBy(dx: 7, dy: 7))
            }

            let tileCenter = CGPoint(x: tile.rect.midX, y: tile.rect.midY)
            drawText(tile.name, ctx, center: tileCenter, size: 54)
        }
        return ctx.makeImage()!
    }

    func testReadsNamesAndPicksHighlightedSpeaker() {
        let image = frame(speakingIndex: 1) // SARAH
        let observations = SignalAdapter().analyze(cgImage: image, at: 0)
        XCTAssertGreaterThanOrEqual(observations.count, 2)

        let active = observations.filter { $0.speaking }
        XCTAssertEqual(active.count, 1)

        // SARAH tile center in top-left pixels ≈ (590, 165)
        let box = active.first?.bbox
        XCTAssertEqual(box?.midX ?? 0, 590, accuracy: 160)
        XCTAssertEqual(box?.midY ?? 0, 165, accuracy: 160)
    }

    func testHighlightTracksToAnotherTile() {
        let image = frame(speakingIndex: 2) // DMITRI
        let observations = SignalAdapter().analyze(cgImage: image, at: 1)

        let active = observations.filter { $0.speaking }
        XCTAssertEqual(active.count, 1)

        let box = active.first?.bbox
        XCTAssertEqual(box?.midX ?? 0, 210, accuracy: 160)
        XCTAssertEqual(box?.midY ?? 0, 435, accuracy: 160)
    }
}
|
||||||
@@ -0,0 +1,47 @@
|
|||||||
|
import XCTest
|
||||||
|
@testable import Ten31Transcripts
|
||||||
|
|
||||||
|
/// Phase 5 (backend hand-off) unit tests: chunk planning, timeline
/// clip/rebase, and transcript assembly across chunks.
///
/// XCTest keeps running a test after a failed assertion, so every test guards
/// the collection size before indexing; a wrong count is then reported as a
/// failure instead of trapping the whole test process.
final class Phase5Tests: XCTestCase {

    /// A session shorter than one chunk yields a single chunk ending at the duration.
    func testPlanChunksShort() {
        let chunks = SessionPackager.planChunks(durationSec: 70)
        XCTAssertEqual(chunks.count, 1)
        guard let first = chunks.first else { return }
        XCTAssertEqual(first.end, 70, accuracy: 0.001)
    }

    /// 400 s at 150 s per chunk yields three chunks covering 0…400.
    func testPlanChunksLong() {
        let chunks = SessionPackager.planChunks(durationSec: 400, chunkSeconds: 150)
        XCTAssertEqual(chunks.count, 3)
        guard chunks.count >= 3 else { return }
        XCTAssertEqual(chunks[0].start, 0)
        XCTAssertEqual(chunks[0].end, 150)
        XCTAssertEqual(chunks[1].start, 150)
        XCTAssertEqual(chunks[2].end, 400)
    }

    /// Segments are clipped to the chunk window and shifted to chunk-local seconds.
    func testRebaseClipsAndRebases() throws {
        let segs = [
            VisualTimeline.Segment(start: 140, end: 160, name: "A", confidence: 0.9, source: "vision"),
            VisualTimeline.Segment(start: 200, end: 260, name: "B", confidence: 0.8, source: "vision"),
        ]
        let data = try SessionPackager.rebasedTimelineData(segs, start: 150, end: 300)
        let arr = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [[String: Any]])
        XCTAssertEqual(arr.count, 2)
        guard arr.count >= 2 else { return }
        XCTAssertEqual(arr[0]["start"] as? Double, 0)
        XCTAssertEqual(arr[0]["end"] as? Double, 10)
        XCTAssertEqual(arr[1]["start"] as? Double, 50)
        XCTAssertEqual(arr[1]["end"] as? Double, 110)
    }

    /// Chunk-local millisecond times become session seconds, and named
    /// fingerprints are collected while `Unknown_*` ones are not.
    func testAssembleOffsetsAndUnifies() throws {
        let resp0 = #"{"duration":150,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1,0.2]}],"segments":[{"start_ms":1000,"end_ms":2000,"speaker":"Grant","text":"hi"}],"fingerprints":{"Grant":[0.1,0.2]},"models":{"diarization":"x"}}"#
        let resp1 = #"{"duration":100,"speakers":[{"cluster":"Speaker_0","name":"Sarah","source":"voiceprint","match_similarity":0.7,"fingerprint":[0.3,0.4]},{"cluster":"Speaker_1","name":"Unknown_0","source":"unmatched"}],"segments":[{"start_ms":500,"end_ms":1500,"speaker":"Sarah","text":"hello"}],"fingerprints":{"Sarah":[0.3,0.4]},"models":{"diarization":"x"}}"#
        let r0 = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(resp0.utf8))
        let r1 = try JSONDecoder().decode(LabelMergeResponse.self, from: Data(resp1.utf8))
        let asm = TranscriptAssembler.assemble(sessionId: "s", app: "meet",
            chunks: [.init(chunkStart: 0, response: r0), .init(chunkStart: 150, response: r1)])

        let segments = asm.speakersFile.segments
        XCTAssertEqual(segments.count, 2)
        if segments.count >= 2 {
            XCTAssertEqual(segments[0].start, 1, accuracy: 0.001)
            XCTAssertEqual(segments[1].start, 150.5, accuracy: 0.001)
        }
        XCTAssertEqual(asm.speakersFile.speakers.count, 3)
        XCTAssertNotNil(asm.fingerprints["Grant"])
        XCTAssertNotNil(asm.fingerprints["Sarah"])
        XCTAssertNil(asm.fingerprints["Unknown_0"])
    }
}
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
import XCTest
|
||||||
|
@testable import Ten31Transcripts
|
||||||
|
|
||||||
|
/// Unit tests for `TimelineBuilder`: open/close hysteresis, flicker rejection,
/// overlapping speakers, self-span merging, and alias normalisation.
final class TimelineBuilderTests: XCTestCase {

    /// Shorthand for an observation with an empty bounding box.
    private func obs(_ name: String, _ speaking: Bool, _ t: Double, _ conf: Double = 0.9) -> SpeakerObservation {
        return SpeakerObservation(name: name, speaking: speaking, bbox: .zero, confidence: conf, t: t)
    }

    /// Three speaking frames followed by two quiet ones produce one segment 0…2.
    func testOpensAfterKFramesAndClosesAfterMQuiet() {
        let builder = TimelineBuilder(openFrames: 2, closeFrames: 2)
        builder.ingest([obs("A", true, 0)], at: 0)
        builder.ingest([obs("A", true, 1)], at: 1)
        builder.ingest([obs("A", true, 2)], at: 2)
        builder.ingest([], at: 3)
        builder.ingest([], at: 4)
        builder.finish()

        XCTAssertEqual(builder.segments.count, 1)
        XCTAssertEqual(builder.segments.first?.name, "A")
        XCTAssertEqual(builder.segments.first?.start ?? -1, 0, accuracy: 0.001)
        XCTAssertEqual(builder.segments.first?.end ?? -1, 2, accuracy: 0.001)
        XCTAssertEqual(builder.segments.first?.source, "vision")
    }

    /// A single speaking frame is below the open threshold and yields nothing.
    func testSingleFlickerDoesNotOpen() {
        let builder = TimelineBuilder(openFrames: 2, closeFrames: 2)
        builder.ingest([obs("A", true, 0)], at: 0)
        builder.ingest([], at: 1)
        builder.finish()

        XCTAssertTrue(builder.segments.isEmpty)
    }

    /// Two simultaneous speakers each get their own segment.
    func testAllowsOverlap() {
        let builder = TimelineBuilder(openFrames: 1, closeFrames: 1)
        builder.ingest([obs("A", true, 0), obs("B", true, 0)], at: 0)
        builder.ingest([obs("A", true, 1), obs("B", true, 1)], at: 1)
        builder.ingest([], at: 2)
        builder.finish()

        XCTAssertEqual(builder.segments.count, 2)
        let names = Set(builder.segments.map { $0.name })
        XCTAssertEqual(names, ["A", "B"])
    }

    /// A mic-VAD span is merged as a `mic_vad` segment under the self name.
    func testMergesSelfSpans() {
        let builder = TimelineBuilder()
        let span = VADSpan(start: 0, end: 4.5, confidence: 0.97)
        builder.mergeSelfSpans([span], selfName: "Grant")
        builder.finish()

        XCTAssertEqual(builder.segments.count, 1)
        XCTAssertEqual(builder.segments.first?.name, "Grant")
        XCTAssertEqual(builder.segments.first?.source, "mic_vad")
    }

    /// Observations under an alias are reported under the canonical name.
    func testNormalizesAlias() {
        let builder = TimelineBuilder(openFrames: 1, closeFrames: 1)
        builder.addAlias("Sarah J", canonical: "Sarah Jones")
        builder.ingest([obs("Sarah J", true, 0)], at: 0)
        builder.ingest([obs("Sarah J", true, 1)], at: 1)
        builder.ingest([], at: 2)
        builder.finish()

        XCTAssertEqual(builder.segments.first?.name, "Sarah Jones")
    }
}
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
import XCTest
|
||||||
|
@testable import Ten31Transcripts
|
||||||
|
|
||||||
|
/// Unit tests for `VoiceprintStore`: which speakers get stored, persistence
/// across instances, and rename/remove/reset.
final class VoiceprintStoreTests: XCTestCase {

    /// A unique JSON file path in the temporary directory.
    private func tempURL() -> URL {
        let fileName = "vp_\(UUID().uuidString).json"
        return FileManager.default.temporaryDirectory.appendingPathComponent(fileName)
    }

    /// A canned label-merge response with four speakers: Grant (visual, 0.99),
    /// Sarah (voiceprint match), Bob (visual, 0.5) and an unmatched Unknown_0.
    private func response() throws -> LabelMergeResponse {
        let json = #"{"duration":10,"speakers":[{"cluster":"Speaker_0","name":"Grant","source":"visual","overlap_confidence":0.99,"fingerprint":[0.1,0.2,0.3]},{"cluster":"Speaker_1","name":"Sarah","source":"voiceprint","match_similarity":0.7,"fingerprint":[0.4,0.5,0.6]},{"cluster":"Speaker_2","name":"Bob","source":"visual","overlap_confidence":0.5,"fingerprint":[0.7,0.8,0.9]},{"cluster":"Speaker_3","name":"Unknown_0","source":"unmatched"}],"segments":[],"fingerprints":{"Grant":[0.1,0.2,0.3],"Sarah":[0.4,0.5,0.6]},"models":{}}"#
        let payload = Data(json.utf8)
        return try JSONDecoder().decode(LabelMergeResponse.self, from: payload)
    }

    func testStoresOnlyConfidentNamedSpeakers() throws {
        let url = tempURL()
        defer { try? FileManager.default.removeItem(at: url) }

        let store = VoiceprintStore(fileURL: url)
        store.update(with: try response())

        XCTAssertNotNil(store.entries["Grant"])   // visual, high overlap
        XCTAssertNotNil(store.entries["Sarah"])   // voiceprint match
        XCTAssertNil(store.entries["Bob"])        // overlap 0.5 < 0.8
        XCTAssertNil(store.entries["Unknown_0"])
        XCTAssertEqual(store.knownVoiceprints()["Grant"], [0.1, 0.2, 0.3])
        XCTAssertEqual(store.entries["Grant"]?.calls, 1)
    }

    func testPersistsAcrossInstancesAndIncrementsCalls() throws {
        let url = tempURL()
        defer { try? FileManager.default.removeItem(at: url) }

        let store = VoiceprintStore(fileURL: url)
        store.update(with: try response())
        store.update(with: try response())
        XCTAssertEqual(store.entries["Grant"]?.calls, 2)

        // A second store on the same file sees the persisted entries.
        let reopened = VoiceprintStore(fileURL: url)
        XCTAssertEqual(reopened.knownVoiceprints().count, 2)
    }

    func testRenameRemoveReset() throws {
        let url = tempURL()
        defer { try? FileManager.default.removeItem(at: url) }

        let store = VoiceprintStore(fileURL: url)
        store.update(with: try response())

        store.rename("Sarah", to: "Sarah Jones")
        XCTAssertNotNil(store.entries["Sarah Jones"])
        XCTAssertNil(store.entries["Sarah"])

        store.remove("Grant")
        XCTAssertNil(store.entries["Grant"])

        store.reset()
        XCTAssertTrue(store.entries.isEmpty)
    }
}
|
||||||
Reference in New Issue
Block a user