// ten31-transcripts/Ten31Transcripts/Session/SessionPackager.swift

import Foundation
import AVFoundation

/// Splits a long session into backend-sized chunks and produces, per chunk, the
/// sliced audio and the timeline rebased to chunk-local seconds.
///
/// The diarizer caps at 4 speakers/chunk and has request limits, so calls > ~3
/// min are chunked into ~2–3 min windows; names + voiceprints unify speakers
/// across chunks (handled in the pipeline).
enum SessionPackager {
    /// One planned window of the session, expressed in global (session) seconds.
    struct PlannedChunk: Equatable {
        let index: Int         // zero-based position of the chunk in the plan
        let start: Double      // global seconds
        let end: Double        // global seconds
    }

    /// Plans the chunk windows for a session of `durationSec` seconds.
    ///
    /// A session no longer than `thresholdSec` stays as one chunk. A longer one
    /// is cut into back-to-back windows of `chunkSeconds`; the final window holds
    /// whatever remains, so it may be shorter than the others.
    ///
    /// - Parameters:
    ///   - durationSec: Total session length in seconds.
    ///   - chunkSeconds: Window length used when the session is chunked. A
    ///     non-positive or NaN value cannot advance the cursor, so the session is
    ///     returned as a single chunk instead of looping forever.
    ///   - thresholdSec: Sessions at or below this length are not chunked.
    /// - Returns: Consecutive, gap-free chunks starting at 0 and indexed from 0.
    static func planChunks(durationSec: Double,
                           chunkSeconds: Double = 150,
                           thresholdSec: Double = 180) -> [PlannedChunk] {
        // `chunkSeconds > 0` is also false for NaN; a non-finite duration would
        // never satisfy the loop's exit condition.
        guard durationSec > thresholdSec, chunkSeconds > 0, durationSec.isFinite else {
            return [PlannedChunk(index: 0, start: 0, end: durationSec)]
        }
        var chunks: [PlannedChunk] = []
        var start = 0.0
        // A remainder under 1 ms is treated as rounding noise, not as a chunk.
        while start < durationSec - 0.001 {
            let end = min(start + chunkSeconds, durationSec)
            // Stop if the window is too small to move `start` at Double precision.
            guard end > start else { break }
            chunks.append(PlannedChunk(index: chunks.count, start: start, end: end))
            start = end
        }
        return chunks
    }

    /// Clip segments to `[start, end)` and rebase to chunk-local seconds, then
    /// emit the flat `label-merge` array `[{start,end,name,confidence}]`.
    ///
    /// Segments that do not overlap the window are dropped.
    ///
    /// - Parameters:
    ///   - segments: Timeline segments in global seconds.
    ///   - start: Window start in global seconds; becomes 0 in the output.
    ///   - end: Window end in global seconds.
    /// - Returns: UTF-8 JSON data for the array of clipped, rebased segments.
    /// - Throws: `EncodingError.invalidValue` when a value cannot be represented
    ///   in JSON (for example a NaN or infinite number), or any error thrown by
    ///   `JSONSerialization`.
    static func rebasedTimelineData(_ segments: [VisualTimeline.Segment],
                                    start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = segments.compactMap { seg -> [String: Any]? in
            let s = max(seg.start, start)
            let e = min(seg.end, end)
            guard e > s else { return nil }
            return ["start": s - start, "end": e - start, "name": seg.name, "confidence": seg.confidence]
        }
        // `data(withJSONObject:)` raises an Objective-C exception (uncatchable
        // from Swift) on invalid input, so validate first and throw instead.
        guard JSONSerialization.isValidJSONObject(flat) else {
            throw EncodingError.invalidValue(
                flat,
                EncodingError.Context(codingPath: [],
                                      debugDescription: "Timeline contains a value that cannot be encoded as JSON."))
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }

    /// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
    ///
    /// The requested range is clamped to the bounds of the source file. If the
    /// clamped range is empty, nothing is written and `dest` is not created.
    /// The output is declared as 1-channel, 16-bit, little-endian integer PCM at
    /// the source's sample rate.
    ///
    /// NOTE(review): buffers are written in the source's processing format while
    /// the output is declared mono, so this presumably requires a mono source —
    /// confirm callers never pass multi-channel audio.
    ///
    /// - Parameters:
    ///   - source: Audio file to read from.
    ///   - startSec: Slice start in seconds from the beginning of `source`.
    ///   - endSec: Slice end in seconds from the beginning of `source`.
    ///   - dest: Location of the audio file to create.
    /// - Throws: Errors from opening, reading or writing the audio files, or
    ///   `POSIXError(.ENOMEM)` when the transfer buffer cannot be allocated.
    static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
        let input = try AVAudioFile(forReading: source)
        let sr = input.fileFormat.sampleRate
        // Clamp both ends to the file; a negative start would otherwise be
        // assigned to `framePosition` as an invalid read position.
        let startFrame = max(0, AVAudioFramePosition((startSec * sr).rounded()))
        let endFrame = min(input.length, AVAudioFramePosition((endSec * sr).rounded()))
        guard endFrame > startFrame else { return }

        // A single transfer buffer is reused for every block. It is allocated
        // before the output file is created so a failure leaves no partial file.
        let block: AVAudioFrameCount = 16_000
        guard let buffer = AVAudioPCMBuffer(pcmFormat: input.processingFormat, frameCapacity: block) else {
            throw POSIXError(.ENOMEM)
        }

        let settings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: sr,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsBigEndianKey: false,
        ]
        let output = try AVAudioFile(forWriting: dest, settings: settings,
                                     commonFormat: .pcmFormatFloat32, interleaved: false)
        input.framePosition = startFrame
        // Tracked as a 64-bit frame count so very long ranges cannot overflow
        // the 32-bit `AVAudioFrameCount`.
        var remaining = endFrame - startFrame
        while remaining > 0 {
            let n = AVAudioFrameCount(min(AVAudioFramePosition(block), remaining))
            try input.read(into: buffer, frameCount: n)
            // Defensive stop on an unexpected end of data.
            if buffer.frameLength == 0 { break }
            try output.write(from: buffer)
            remaining -= AVAudioFramePosition(buffer.frameLength)
        }
    }

    /// Duration (seconds) of a WAV.
    ///
    /// - Parameter url: Audio file to inspect.
    /// - Returns: Frame count divided by sample rate, or 0 when the file cannot
    ///   be opened or reports a non-positive sample rate.
    static func duration(of url: URL) -> Double {
        guard let file = try? AVAudioFile(forReading: url), file.fileFormat.sampleRate > 0 else { return 0 }
        return Double(file.length) / file.fileFormat.sampleRate
    }
}