import Foundation
import AVFoundation

/// Splits a long session into backend-sized chunks and produces, per chunk, the
/// sliced audio and the timeline rebased to chunk-local seconds.
///
/// The diarizer caps at 4 speakers/chunk and has request limits, so calls > ~3
/// min are chunked into ~2–3 min windows; names + voiceprints unify speakers
/// across chunks (handled in the pipeline).
enum SessionPackager {
    /// Failures raised by the packager itself. Errors thrown by `AVAudioFile`
    /// and `JSONSerialization` are propagated unchanged.
    enum PackagingError: Error, Equatable {
        /// The scratch PCM buffer used to copy audio could not be allocated.
        case bufferAllocationFailed
    }

    /// One planned window of the session, in global (whole-session) seconds.
    struct PlannedChunk: Equatable {
        let index: Int
        let start: Double      // sliced window start (global seconds, incl. overlap margin)
        let end: Double        // sliced window end (incl. overlap margin)
        let bodyStart: Double  // the region this chunk OWNS (no overlap) — for stitch dedup
        let bodyEnd: Double
    }

    /// Plans how a session of `durationSec` seconds is cut into chunks.
    ///
    /// One chunk if short; otherwise ~`chunkSeconds` bodies, each sliced with an
    /// `overlapSeconds` margin on both sides. The margin gives the backend context at
    /// boundaries (so a sentence isn't cut and the diarizer attributes edge speech
    /// correctly and keeps a voice consistent across chunks); the stitcher keeps only
    /// each chunk's owned `body` region, deduping the overlap.
    ///
    /// - Parameters:
    ///   - durationSec: Total session length in seconds.
    ///   - chunkSeconds: Length of each owned body. A non-positive (or NaN) value
    ///     cannot advance the planner, so the session is planned as a single chunk.
    ///   - overlapSeconds: Margin added on each side of a body, clamped to the
    ///     session bounds. Negative values are treated as zero.
    ///   - thresholdSec: Sessions no longer than this are never split.
    /// - Returns: Chunks in timeline order; bodies are contiguous and cover
    ///   `[0, durationSec]`.
    static func planChunks(durationSec: Double,
                           chunkSeconds: Double = 150,
                           overlapSeconds: Double = 15,
                           thresholdSec: Double = 180) -> [PlannedChunk] {
        // `chunkSeconds > 0` is also false for NaN. Without this check a zero or
        // negative body length leaves `bodyStart` stuck and the loop below would
        // append chunks forever.
        guard durationSec > thresholdSec, chunkSeconds > 0 else {
            return [PlannedChunk(index: 0, start: 0, end: durationSec,
                                 bodyStart: 0, bodyEnd: durationSec)]
        }

        // A negative margin would shrink the sliced window inside the owned body.
        let margin = max(0, overlapSeconds)

        var chunks: [PlannedChunk] = []
        var bodyStart = 0.0
        // The 1 ms tolerance avoids emitting a degenerate trailing chunk caused by
        // floating-point residue at the very end of the session.
        while bodyStart < durationSec - 0.001 {
            let bodyEnd = min(bodyStart + chunkSeconds, durationSec)
            chunks.append(PlannedChunk(
                index: chunks.count,
                start: max(0, bodyStart - margin),
                end: min(durationSec, bodyEnd + margin),
                bodyStart: bodyStart,
                bodyEnd: bodyEnd))
            bodyStart = bodyEnd
        }
        return chunks
    }

    /// Intersects a span with the window `[start, end)` and rebases the result to
    /// window-local seconds.
    ///
    /// - Returns: The clipped, rebased range, or `nil` when the span does not
    ///   overlap the window (or overlaps it with zero length).
    private static func clippedLocalRange(spanStart: Double, spanEnd: Double,
                                          start: Double, end: Double) -> (start: Double, end: Double)? {
        let clippedStart = max(spanStart, start)
        let clippedEnd = min(spanEnd, end)
        guard clippedEnd > clippedStart else { return nil }
        return (clippedStart - start, clippedEnd - start)
    }

    /// Clip segments to `[start, end)` and rebase to chunk-local seconds, then
    /// emit the flat `label-merge` array `[{start,end,name,confidence}]`.
    ///
    /// Segments that fall entirely outside the window are dropped.
    ///
    /// - Throws: Whatever `JSONSerialization.data(withJSONObject:options:)` throws.
    static func rebasedTimelineData(_ segments: [VisualTimeline.Segment],
                                    start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = segments.compactMap { seg in
            guard let local = clippedLocalRange(spanStart: seg.start, spanEnd: seg.end,
                                                start: start, end: end) else { return nil }
            return ["start": local.start, "end": local.end,
                    "name": seg.name, "confidence": seg.confidence]
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }

    /// Clip self-VAD spans to `[start, end)` and rebase to chunk-local seconds, as
    /// the `self_vad` array `[{start,end}]` for dual-channel `label-merge`.
    ///
    /// Spans that fall entirely outside the window are dropped.
    ///
    /// - Throws: Whatever `JSONSerialization.data(withJSONObject:options:)` throws.
    static func rebasedSelfVadData(_ spans: [VADSpan], start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = spans.compactMap { span in
            guard let local = clippedLocalRange(spanStart: span.start, spanEnd: span.end,
                                                start: start, end: end) else { return nil }
            return ["start": local.start, "end": local.end]
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }

    /// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
    ///
    /// The output is written as 16-bit little-endian integer linear PCM, one
    /// channel, at the source file's sample rate. The window is clamped to the
    /// source's bounds; if the clamped window is empty the function returns
    /// without creating `dest`.
    ///
    /// - Throws: `PackagingError.bufferAllocationFailed` if the copy buffer cannot
    ///   be created, or any error from opening, reading or writing the audio files.
    static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
        let input = try AVAudioFile(forReading: source)
        let sr = input.fileFormat.sampleRate
        // Clamp both ends: a negative start is not a valid frame position, and the
        // end must not run past the last frame of the source.
        let startFrame = max(0, AVAudioFramePosition((startSec * sr).rounded()))
        let endFrame = min(input.length, AVAudioFramePosition((endSec * sr).rounded()))
        guard endFrame > startFrame else { return }

        // One reusable block-sized buffer, allocated before the output file is
        // created so that a failure here does not leave an empty file behind.
        let blockFrames: AVAudioFrameCount = 16_000
        guard let buffer = AVAudioPCMBuffer(pcmFormat: input.processingFormat,
                                            frameCapacity: blockFrames) else {
            throw PackagingError.bufferAllocationFailed
        }

        let settings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: sr,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsBigEndianKey: false,
        ]
        let output = try AVAudioFile(forWriting: dest, settings: settings,
                                     commonFormat: .pcmFormatFloat32, interleaved: false)

        input.framePosition = startFrame
        // Keep the running count in 64-bit frames; only the per-block count is
        // narrowed to `AVAudioFrameCount`, which cannot overflow because it is
        // bounded by `blockFrames`.
        var remaining = endFrame - startFrame
        while remaining > 0 {
            let frameCount = AVAudioFrameCount(min(AVAudioFramePosition(blockFrames), remaining))
            try input.read(into: buffer, frameCount: frameCount)
            // A zero-length read means the source ended early; stop rather than spin.
            if buffer.frameLength == 0 { break }
            try output.write(from: buffer)
            remaining -= AVAudioFramePosition(buffer.frameLength)
        }
    }

    /// Duration (seconds) of a WAV.
    ///
    /// - Returns: Frame count divided by sample rate, or `0` when the file cannot
    ///   be opened or reports a non-positive sample rate.
    static func duration(of url: URL) -> Double {
        guard let file = try? AVAudioFile(forReading: url),
              file.fileFormat.sampleRate > 0 else { return 0 }
        return Double(file.length) / file.fileFormat.sampleRate
    }
}