Phases 2-6: detection, visual timeline, backend hand-off, voiceprints
Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
This commit is contained in:
@@ -0,0 +1,85 @@
|
||||
import Foundation
|
||||
import AVFoundation
|
||||
|
||||
/// Splits a long session into backend-sized chunks and produces, per chunk, the
|
||||
/// sliced audio and the timeline rebased to chunk-local seconds.
|
||||
///
|
||||
/// The diarizer caps at 4 speakers/chunk and has request limits, so calls > ~3
|
||||
/// min are chunked into ~2–3 min windows; names + voiceprints unify speakers
|
||||
/// across chunks (handled in the pipeline).
|
||||
enum SessionPackager {
    /// One contiguous window of the session, expressed in global (session-wide) seconds.
    struct PlannedChunk: Equatable {
        /// Zero-based position of this chunk within the plan.
        let index: Int
        /// Start of the window, in global seconds.
        let start: Double
        /// End of the window, in global seconds.
        let end: Double
    }

    /// Failures raised while packaging a session.
    enum PackagingError: Error {
        /// The intermediate PCM buffer used for copying audio could not be created.
        case bufferAllocationFailed
    }

    /// Plans the chunk windows for a session of `durationSec` seconds.
    ///
    /// Sessions no longer than `thresholdSec` yield a single chunk covering the
    /// whole duration. Longer sessions are cut into consecutive windows of
    /// `chunkSeconds`; the final window holds whatever remains and may be shorter.
    ///
    /// - Parameters:
    ///   - durationSec: Total session length in seconds.
    ///   - chunkSeconds: Length of each window. A non-positive or NaN value
    ///     cannot advance the plan, so it falls back to a single chunk.
    ///   - thresholdSec: Sessions at or below this length are never split.
    /// - Returns: The planned chunks in order, indexed from 0.
    static func planChunks(durationSec: Double,
                           chunkSeconds: Double = 150,
                           thresholdSec: Double = 180) -> [PlannedChunk] {
        // `chunkSeconds > 0` is also false for NaN. Without these checks the loop
        // below would never terminate for a zero/negative step or an infinite duration.
        guard durationSec > thresholdSec, durationSec.isFinite, chunkSeconds > 0 else {
            return [PlannedChunk(index: 0, start: 0, end: durationSec)]
        }

        var chunks: [PlannedChunk] = []
        var start = 0.0
        // The 1 ms tolerance avoids emitting a degenerate sliver as the last chunk.
        while start < durationSec - 0.001 {
            let next = start + chunkSeconds
            // If the step is too small to move `start` (floating-point absorption),
            // close the plan with one final chunk instead of spinning.
            let end = next > start ? min(next, durationSec) : durationSec
            chunks.append(PlannedChunk(index: chunks.count, start: start, end: end))
            start = end
        }
        return chunks
    }

    /// Clips `segments` to `[start, end)`, rebases them to chunk-local seconds and
    /// serializes the flat `label-merge` array `[{start,end,name,confidence}]`.
    ///
    /// Segments that do not overlap the window are dropped.
    ///
    /// - Parameters:
    ///   - segments: Timeline segments in global seconds.
    ///   - start: Window start in global seconds; becomes 0 in the output.
    ///   - end: Window end in global seconds.
    /// - Returns: The JSON-encoded array.
    /// - Throws: `EncodingError.invalidValue` when a value cannot be represented
    ///   in JSON (for example a NaN or infinite number), or any error thrown by
    ///   `JSONSerialization`.
    static func rebasedTimelineData(_ segments: [VisualTimeline.Segment],
                                    start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = segments.compactMap { seg -> [String: Any]? in
            let clippedStart = max(seg.start, start)
            let clippedEnd = min(seg.end, end)
            guard clippedEnd > clippedStart else { return nil }
            return ["start": clippedStart - start, "end": clippedEnd - start, "name": seg.name, "confidence": seg.confidence]
        }
        // `data(withJSONObject:)` raises an Objective-C exception (not a Swift
        // error) on invalid input, so validate first and fail in a catchable way.
        guard JSONSerialization.isValidJSONObject(flat) else {
            throw EncodingError.invalidValue(
                flat,
                EncodingError.Context(codingPath: [],
                                      debugDescription: "Timeline segments contain a value that cannot be encoded as JSON.")
            )
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }

    /// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
    ///
    /// Times are converted to frames at the source's sample rate and clamped to
    /// the file's bounds. When the resulting range is empty the function returns
    /// without creating `dest`.
    ///
    /// - Parameters:
    ///   - source: Audio file to read.
    ///   - startSec: Slice start, in seconds from the beginning of `source`.
    ///   - endSec: Slice end, in seconds from the beginning of `source`.
    ///   - dest: Location of the 16-bit little-endian PCM file to write.
    /// - Throws: Errors from opening, reading or writing the audio files, or
    ///   `PackagingError.bufferAllocationFailed`.
    static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
        let input = try AVAudioFile(forReading: source)
        let sr = input.fileFormat.sampleRate
        let totalFrames = input.length

        // Converts seconds to a frame position clamped to `0...totalFrames`.
        // NaN fails the first comparison and maps to 0, so the integer conversion
        // can never trap and the read position can never be negative.
        func framePosition(at seconds: Double) -> AVAudioFramePosition {
            let raw = (seconds * sr).rounded()
            guard raw > 0 else { return 0 }
            guard raw < Double(totalFrames) else { return totalFrames }
            return AVAudioFramePosition(raw)
        }

        let startFrame = framePosition(at: startSec)
        let endFrame = framePosition(at: endSec)
        guard endFrame > startFrame else { return }

        // Allocate the copy buffer once, before the destination is created, so an
        // allocation failure surfaces as an error and leaves no partial file behind.
        let blockFrames: AVAudioFrameCount = 16_000
        guard let buffer = AVAudioPCMBuffer(pcmFormat: input.processingFormat,
                                            frameCapacity: blockFrames) else {
            throw PackagingError.bufferAllocationFailed
        }

        // NOTE(review): the output is always declared mono; a non-mono source would
        // not match this format — confirm callers only pass mono recordings.
        let settings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: sr,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsBigEndianKey: false,
        ]
        let output = try AVAudioFile(forWriting: dest, settings: settings,
                                     commonFormat: .pcmFormatFloat32, interleaved: false)

        input.framePosition = startFrame
        // Tracked as a 64-bit frame count so very long ranges cannot overflow the
        // 32-bit `AVAudioFrameCount`.
        var remaining = endFrame - startFrame
        while remaining > 0 {
            let wanted = AVAudioFrameCount(min(AVAudioFramePosition(blockFrames), remaining))
            try input.read(into: buffer, frameCount: wanted)
            // A zero-length read means the source ended early; stop with what was copied.
            if buffer.frameLength == 0 { break }
            try output.write(from: buffer)
            remaining -= AVAudioFramePosition(buffer.frameLength)
        }
    }

    /// Duration (seconds) of a WAV.
    ///
    /// - Parameter url: Audio file to inspect.
    /// - Returns: Frame count divided by sample rate, or 0 when the file cannot
    ///   be opened or reports a non-positive sample rate.
    static func duration(of url: URL) -> Double {
        guard let file = try? AVAudioFile(forReading: url), file.fileFormat.sampleRate > 0 else { return 0 }
        return Double(file.length) / file.fileFormat.sampleRate
    }
}
|
||||
Reference in New Issue
Block a user