ab910cf742
Chunks were contiguous (start = prev end) with a naïve offset-concat stitch — no overlap. That cut sentences at boundaries, denied the diarizer context at edges, and let one voice split across chunks (the MH/Unknown_0 problem). Now each ~150s body is sliced with a 15s margin on both sides ([bodyStart-15, bodyEnd+15]); the stitcher keeps a segment only in the chunk that owns its MIDPOINT (body region) and drops it from the neighbour's margin — so boundary-spanning speech is seen whole by the backend and kept exactly once.
- SessionPackager.PlannedChunk gains bodyStart/bodyEnd; planChunks adds overlapSeconds.
- TranscriptAssembler.ChunkResult carries body bounds (defaults keep-all → no-overlap behaviour preserved for existing callers); assemble dedups by midpoint-in-body.
- TranscriptPipeline passes body bounds through.
Complements (doesn't replace) the fragment-smoothing + reconciliation safety nets; this is the upstream fix. ~+20% backend audio per interior chunk. 63/63 XCTest (new: overlap window layout + boundary-segment dedup).
109 lines
5.1 KiB
Swift
109 lines
5.1 KiB
Swift
import Foundation
|
||
import AVFoundation
|
||
|
||
/// Splits a long session into backend-sized chunks and produces, per chunk, the
/// sliced audio and the timeline rebased to chunk-local seconds.
///
/// The diarizer caps at 4 speakers/chunk and has request limits, so calls > ~3
/// min are chunked into ~2–3 min windows; names + voiceprints unify speakers
/// across chunks (handled in the pipeline).
enum SessionPackager {
    /// One planned slice of the session. All times are global (session) seconds.
    struct PlannedChunk: Equatable {
        let index: Int
        let start: Double      // sliced window start (global seconds, incl. overlap margin)
        let end: Double        // sliced window end (incl. overlap margin)
        let bodyStart: Double  // the region this chunk OWNS (no overlap) — for stitch dedup
        let bodyEnd: Double
    }

    /// One chunk if short; otherwise ~`chunkSeconds` bodies, each sliced with an
    /// `overlapSeconds` margin on both sides. The margin gives the backend context at
    /// boundaries (so a sentence isn't cut and the diarizer attributes edge speech
    /// correctly and keeps a voice consistent across chunks); the stitcher keeps only
    /// each chunk's owned `body` region, deduping the overlap.
    ///
    /// - Parameters:
    ///   - durationSec: Total session length in seconds.
    ///   - chunkSeconds: Length of each owned body; the last body may be shorter.
    ///   - overlapSeconds: Margin added on each side of a body, clamped to the
    ///     session bounds. Negative values are treated as `0`.
    ///   - thresholdSec: Sessions no longer than this are returned as a single chunk.
    /// - Returns: Chunks in ascending order with contiguous bodies. A single
    ///   whole-session chunk is returned when the session is short, or when the
    ///   inputs cannot be chunked (`chunkSeconds` not positive, or a non-finite
    ///   `durationSec`).
    static func planChunks(durationSec: Double,
                           chunkSeconds: Double = 150,
                           overlapSeconds: Double = 15,
                           thresholdSec: Double = 180) -> [PlannedChunk] {
        // A non-positive (or NaN) chunk length would never advance `bodyStart`, and an
        // infinite duration would never terminate — both would grow the array without
        // bound, so fall back to the single whole-session chunk instead.
        guard durationSec > thresholdSec, durationSec.isFinite, chunkSeconds > 0 else {
            return [PlannedChunk(index: 0, start: 0, end: durationSec, bodyStart: 0, bodyEnd: durationSec)]
        }
        // A negative margin would shrink the window inside its own body and drop audio.
        let margin = max(0, overlapSeconds)

        var chunks: [PlannedChunk] = []
        var bodyStart = 0.0
        var index = 0
        // Stop once less than 1 ms of the session remains unowned.
        while bodyStart < durationSec - 0.001 {
            let bodyEnd = min(bodyStart + chunkSeconds, durationSec)
            chunks.append(PlannedChunk(
                index: index,
                start: max(0, bodyStart - margin),
                end: min(durationSec, bodyEnd + margin),
                bodyStart: bodyStart, bodyEnd: bodyEnd))
            bodyStart = bodyEnd
            index += 1
        }
        return chunks
    }

    /// Intersects `[lo, hi]` with the window `[windowStart, windowEnd]` and shifts the
    /// result so that `windowStart` becomes `0`. Returns `nil` when the intersection
    /// is empty.
    private static func clipAndRebase(_ lo: Double, _ hi: Double,
                                      windowStart: Double,
                                      windowEnd: Double) -> (start: Double, end: Double)? {
        let clippedStart = max(lo, windowStart)
        let clippedEnd = min(hi, windowEnd)
        guard clippedEnd > clippedStart else { return nil }
        return (clippedStart - windowStart, clippedEnd - windowStart)
    }

    /// Clip segments to `[start, end)` and rebase to chunk-local seconds, then
    /// emit the flat `label-merge` array `[{start,end,name,confidence}]`.
    ///
    /// Segments that do not intersect the window are omitted.
    /// - Throws: Whatever `JSONSerialization.data(withJSONObject:options:)` throws.
    static func rebasedTimelineData(_ segments: [VisualTimeline.Segment],
                                    start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = segments.compactMap { seg -> [String: Any]? in
            guard let local = clipAndRebase(seg.start, seg.end,
                                            windowStart: start, windowEnd: end) else { return nil }
            return ["start": local.start, "end": local.end, "name": seg.name, "confidence": seg.confidence]
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }

    /// Clip self-VAD spans to `[start, end)` and rebase to chunk-local seconds, as
    /// the `self_vad` array `[{start,end}]` for dual-channel `label-merge`.
    ///
    /// Spans that do not intersect the window are omitted.
    /// - Throws: Whatever `JSONSerialization.data(withJSONObject:options:)` throws.
    static func rebasedSelfVadData(_ spans: [VADSpan], start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = spans.compactMap { span -> [String: Any]? in
            guard let local = clipAndRebase(span.start, span.end,
                                            windowStart: start, windowEnd: end) else { return nil }
            return ["start": local.start, "end": local.end]
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }

    /// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
    ///
    /// The output is written as mono 16-bit little-endian integer linear PCM at the
    /// source file's sample rate. The requested range is clamped to the source file.
    /// If the clamped range is empty (or either bound is NaN) nothing is written and
    /// `dest` is not created. If the source yields fewer frames than requested, the
    /// frames read so far are kept and the function returns without error.
    ///
    /// - Throws: Errors from opening `source`, creating `dest`, or reading/writing frames.
    static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
        let input = try AVAudioFile(forReading: source)
        // NaN bounds compare false here, so they are rejected with the empty range.
        guard startSec < endSec else { return }

        let sr = input.fileFormat.sampleRate
        // Clamp in the Double domain first: converting an out-of-range Double to an
        // integer frame position traps, and a negative position is not a valid read offset.
        let fileFrames = Double(input.length)
        let startFrame = AVAudioFramePosition(min(max(0, startSec * sr), fileFrames).rounded())
        let endFrame = AVAudioFramePosition(min(max(0, endSec * sr), fileFrames).rounded())
        guard endFrame > startFrame else { return }

        let settings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: sr,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsBigEndianKey: false,
        ]
        let output = try AVAudioFile(forWriting: dest, settings: settings,
                                     commonFormat: .pcmFormatFloat32, interleaved: false)

        // One reusable buffer; each read overwrites it and resets `frameLength`.
        let blockFrames: AVAudioFrameCount = 16_000
        guard let buffer = AVAudioPCMBuffer(pcmFormat: input.processingFormat,
                                            frameCapacity: blockFrames) else { return }

        input.framePosition = startFrame
        // Kept as a 64-bit count so very long ranges cannot overflow the 32-bit frame count.
        var remaining = endFrame - startFrame
        while remaining > 0 {
            let wanted = AVAudioFrameCount(min(AVAudioFramePosition(blockFrames), remaining))
            try input.read(into: buffer, frameCount: wanted)
            if buffer.frameLength == 0 { break }   // source exhausted early
            try output.write(from: buffer)
            remaining -= AVAudioFramePosition(buffer.frameLength)
        }
    }

    /// Duration (seconds) of a WAV.
    ///
    /// Best-effort: returns `0` when the file cannot be opened or reports a
    /// non-positive sample rate.
    static func duration(of url: URL) -> Double {
        guard let file = try? AVAudioFile(forReading: url) else { return 0 }
        let sampleRate = file.fileFormat.sampleRate
        guard sampleRate > 0 else { return 0 }
        return Double(file.length) / sampleRate
    }
}
|