// ten31-transcripts/Ten31Transcripts/Session/SessionPackager.swift

import Foundation
import AVFoundation

/// Splits a long session into backend-sized chunks and produces, per chunk, the
/// sliced audio and the timeline rebased to chunk-local seconds.
///
/// The diarizer caps at 4 speakers/chunk and has request limits, so calls > ~3
/// min are chunked into ~2–3 min windows; names + voiceprints unify speakers
/// across chunks (handled in the pipeline).
enum SessionPackager {
    /// One backend-sized window of a session, expressed in global (whole-recording) seconds.
    ///
    /// `start`/`end` bound the audio that is actually sliced; `bodyStart`/`bodyEnd` bound
    /// the sub-range this chunk owns once the overlap margins are removed.
    struct PlannedChunk: Equatable {
        let index: Int
        let start: Double      // sliced window start (global seconds, incl. overlap margin)
        let end: Double        // sliced window end (incl. overlap margin)
        let bodyStart: Double  // the region this chunk OWNS (no overlap) — for stitch dedup
        let bodyEnd: Double
    }

    /// Packaging failures that the underlying audio APIs do not report themselves.
    enum PackagingError: Error {
        /// The scratch PCM buffer used to copy audio frames could not be allocated.
        case bufferAllocationFailed
    }

    /// One chunk if short; otherwise ~`chunkSeconds` bodies, each sliced with an
    /// `overlapSeconds` margin on both sides. The margin gives the backend context at
    /// boundaries (so a sentence isn't cut and the diarizer attributes edge speech
    /// correctly and keeps a voice consistent across chunks); the stitcher keeps only
    /// each chunk's owned `body` region, deduping the overlap.
    ///
    /// - Parameters:
    ///   - durationSec: Total session length in seconds.
    ///   - chunkSeconds: Length of each owned body. A value that is not strictly positive
    ///     (zero, negative, NaN) cannot advance the plan, so the session is returned as a
    ///     single chunk instead.
    ///   - overlapSeconds: Margin added on each side of a body, clamped to the session
    ///     bounds. Negative values are treated as `0` so a slice never shrinks inside its body.
    ///   - thresholdSec: Sessions no longer than this are never split.
    /// - Returns: Chunks in order, with contiguous bodies covering `0..<durationSec`.
    static func planChunks(durationSec: Double,
                           chunkSeconds: Double = 150,
                           overlapSeconds: Double = 15,
                           thresholdSec: Double = 180) -> [PlannedChunk] {
        // `chunkSeconds > 0` is false for zero, negatives and NaN — each of which would
        // leave `bodyStart` stuck in the loop below.
        guard durationSec > thresholdSec, chunkSeconds > 0 else {
            return [PlannedChunk(index: 0, start: 0, end: durationSec, bodyStart: 0, bodyEnd: durationSec)]
        }
        let margin = max(0, overlapSeconds)
        var chunks: [PlannedChunk] = []
        var bodyStart = 0.0
        var index = 0
        // The 1 ms tolerance stops a floating-point residue from producing a sliver chunk.
        while bodyStart < durationSec - 0.001 {
            let bodyEnd = min(bodyStart + chunkSeconds, durationSec)
            chunks.append(PlannedChunk(
                index: index,
                start: max(0, bodyStart - margin),
                end: min(durationSec, bodyEnd + margin),
                bodyStart: bodyStart, bodyEnd: bodyEnd))
            bodyStart = bodyEnd
            index += 1
        }
        return chunks
    }

    /// Clips `[spanStart, spanEnd)` to `[windowStart, windowEnd)` and rebases it to
    /// window-local seconds; returns `nil` when nothing of the span falls inside the window.
    private static func localSpan(_ spanStart: Double, _ spanEnd: Double,
                                  windowStart: Double, windowEnd: Double) -> (start: Double, end: Double)? {
        let clippedStart = max(spanStart, windowStart)
        let clippedEnd = min(spanEnd, windowEnd)
        guard clippedEnd > clippedStart else { return nil }
        return (start: clippedStart - windowStart, end: clippedEnd - windowStart)
    }

    /// Clip segments to `[start, end)` and rebase to chunk-local seconds, then
    /// emit the flat `label-merge` array `[{start,end,name,confidence}]`.
    ///
    /// Segments that do not intersect the window are dropped.
    /// - Throws: Whatever `JSONSerialization.data(withJSONObject:options:)` throws.
    static func rebasedTimelineData(_ segments: [VisualTimeline.Segment],
                                    start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = segments.compactMap { seg in
            guard let local = localSpan(seg.start, seg.end, windowStart: start, windowEnd: end) else {
                return nil
            }
            return ["start": local.start, "end": local.end, "name": seg.name, "confidence": seg.confidence]
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }

    /// Clip self-VAD spans to `[start, end)` and rebase to chunk-local seconds, as
    /// the `self_vad` array `[{start,end}]` for dual-channel `label-merge`.
    ///
    /// Spans that do not intersect the window are dropped.
    /// - Throws: Whatever `JSONSerialization.data(withJSONObject:options:)` throws.
    static func rebasedSelfVadData(_ spans: [VADSpan], start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = spans.compactMap { span in
            guard let local = localSpan(span.start, span.end, windowStart: start, windowEnd: end) else {
                return nil
            }
            return ["start": local.start, "end": local.end]
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }

    /// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
    ///
    /// Both bounds are converted to frames at the source file's sample rate and clamped
    /// to `[0, file length]`. When the clamped range is empty (or a bound is NaN) the
    /// function returns without creating `dest`.
    ///
    /// - Parameters:
    ///   - source: Audio file to read.
    ///   - startSec: Slice start in seconds from the beginning of `source`.
    ///   - endSec: Slice end (exclusive) in seconds from the beginning of `source`.
    ///   - dest: Destination written as 16-bit little-endian linear PCM, one channel.
    /// - Throws: Errors from opening, reading or writing the audio files, or
    ///   `PackagingError.bufferAllocationFailed` if the copy buffer cannot be created.
    static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
        let input = try AVAudioFile(forReading: source)
        let sr = input.fileFormat.sampleRate

        // Clamp in floating point first: converting a negative, infinite or NaN value
        // straight to a frame position would seek before the file or trap.
        let rawStart = (startSec * sr).rounded()
        let rawEnd = (endSec * sr).rounded()
        guard !rawStart.isNaN, !rawEnd.isNaN else { return }
        let totalFrames = Double(input.length)
        let startFrame = AVAudioFramePosition(min(max(rawStart, 0), totalFrames))
        let endFrame = AVAudioFramePosition(min(max(rawEnd, 0), totalFrames))
        guard endFrame > startFrame else { return }

        // One reusable scratch buffer, allocated before the output file is created so an
        // allocation failure cannot leave an empty or truncated file behind.
        let blockFrames: AVAudioFrameCount = 16_000
        guard let buffer = AVAudioPCMBuffer(pcmFormat: input.processingFormat,
                                            frameCapacity: blockFrames) else {
            throw PackagingError.bufferAllocationFailed
        }

        // NOTE(review): the output is fixed at one channel while buffers carry the input's
        // processing format; a non-mono source presumably fails at write — confirm callers
        // only pass mono audio.
        let settings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: sr,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsBigEndianKey: false,
        ]
        let output = try AVAudioFile(forWriting: dest, settings: settings,
                                     commonFormat: .pcmFormatFloat32, interleaved: false)

        input.framePosition = startFrame
        // Tracked as a 64-bit frame count so very long ranges cannot overflow the
        // 32-bit per-read frame count.
        var remaining = endFrame - startFrame
        while remaining > 0 {
            let wanted = AVAudioFrameCount(min(remaining, AVAudioFramePosition(blockFrames)))
            try input.read(into: buffer, frameCount: wanted)
            // A zero-length read means no more frames are available; stop instead of spinning.
            if buffer.frameLength == 0 { break }
            try output.write(from: buffer)
            remaining -= AVAudioFramePosition(buffer.frameLength)
        }
    }

    /// Duration (seconds) of a WAV.
    ///
    /// - Returns: Frame count divided by the file's sample rate, or `0` when the file
    ///   cannot be opened or reports a non-positive sample rate.
    static func duration(of url: URL) -> Double {
        guard let file = try? AVAudioFile(forReading: url) else { return 0 }
        let sampleRate = file.fileFormat.sampleRate
        guard sampleRate > 0 else { return 0 }
        return Double(file.length) / sampleRate
    }
}