Files
Grant Gilliam ab910cf742 Chunk overlap + overlap-aware stitching
Chunks were contiguous (start = prev end) with a naïve offset-concat stitch — no
overlap. That cut sentences at boundaries, denied the diarizer context at edges, and
let one voice split across chunks (the MH/Unknown_0 problem). Now each ~150s body is
sliced with a 15s margin on both sides ([bodyStart-15, bodyEnd+15]); the stitcher
keeps a segment only in the chunk that owns its MIDPOINT (body region) and drops it
from the neighbour's margin — so boundary-spanning speech is seen whole by the
backend and kept exactly once.

- SessionPackager.PlannedChunk gains bodyStart/bodyEnd; planChunks adds overlapSeconds.
- TranscriptAssembler.ChunkResult carries body bounds (defaults keep-all → no-overlap
  behaviour preserved for existing callers); assemble dedups by midpoint-in-body.
- TranscriptPipeline passes body bounds through.

Complements (doesn't replace) the fragment-smoothing + reconciliation safety nets;
this is the upstream fix. ~+20% backend audio per interior chunk. 63/63 XCTest
(new: overlap window layout + boundary-segment dedup).
2026-06-08 13:03:56 -05:00

109 lines
5.1 KiB
Swift
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import Foundation
import AVFoundation
/// Splits a long session into backend-sized chunks and produces, per chunk, the
/// sliced audio and the timeline rebased to chunk-local seconds.
///
/// The diarizer caps at 4 speakers/chunk and has request limits, so calls > ~3
/// min are chunked into ~2–3 min windows; names + voiceprints unify speakers
/// across chunks (handled in the pipeline).
enum SessionPackager {
    /// One planned slice of the session. All values are global seconds.
    struct PlannedChunk: Equatable {
        let index: Int
        let start: Double     // sliced window start (global seconds, incl. overlap margin)
        let end: Double       // sliced window end (incl. overlap margin)
        let bodyStart: Double // the region this chunk OWNS (no overlap) for stitch dedup
        let bodyEnd: Double
    }

    /// Errors raised by `sliceAudio` itself (AVFoundation errors are rethrown as-is).
    enum SliceError: Error, Equatable {
        /// The PCM buffer used to copy frames could not be allocated.
        case bufferAllocationFailed
    }

    /// A remainder at or below this many seconds is folded into the final body
    /// instead of producing (or orphaning) a sliver.
    private static let tailEpsilon = 0.001

    /// One chunk if short; otherwise ~`chunkSeconds` bodies, each sliced with an
    /// `overlapSeconds` margin on both sides. The margin gives the backend context at
    /// boundaries (so a sentence isn't cut and the diarizer attributes edge speech
    /// correctly and keeps a voice consistent across chunks); the stitcher keeps only
    /// each chunk's owned `body` region, deduping the overlap.
    ///
    /// - Parameters:
    ///   - durationSec: Total session length in seconds.
    ///   - chunkSeconds: Target body length. A non-positive or NaN value cannot
    ///     advance the plan, so the whole session is returned as a single chunk.
    ///   - overlapSeconds: Margin added on each side of a body; negative or NaN
    ///     values are treated as 0 so a window never shrinks inside its own body.
    ///   - thresholdSec: Sessions at or below this length are never split.
    /// - Returns: Chunks in order; bodies are contiguous and together span
    ///   `[0, durationSec]`, windows are clamped to the same range.
    static func planChunks(durationSec: Double,
                           chunkSeconds: Double = 150,
                           overlapSeconds: Double = 15,
                           thresholdSec: Double = 180) -> [PlannedChunk] {
        // An infinite duration or a non-positive chunk size would keep the loop
        // below from ever reaching the end; fall back to one whole-session chunk.
        guard durationSec.isFinite, durationSec > thresholdSec, chunkSeconds > 0 else {
            return [PlannedChunk(index: 0, start: 0, end: durationSec, bodyStart: 0, bodyEnd: durationSec)]
        }
        let margin = max(0, overlapSeconds)
        var chunks: [PlannedChunk] = []
        var bodyStart = 0.0
        while bodyStart < durationSec - tailEpsilon {
            let candidate = bodyStart + chunkSeconds
            // Close the plan at `durationSec` when the candidate reaches it, leaves
            // only a sub-epsilon tail (which no body would otherwise own), or fails
            // to advance because `chunkSeconds` is too small to change `bodyStart`.
            let advances = candidate > bodyStart
            let leavesRealTail = durationSec - candidate > tailEpsilon
            let bodyEnd = (advances && leavesRealTail) ? candidate : durationSec
            chunks.append(PlannedChunk(
                index: chunks.count,
                start: max(0, bodyStart - margin),
                end: min(durationSec, bodyEnd + margin),
                bodyStart: bodyStart, bodyEnd: bodyEnd))
            bodyStart = bodyEnd
        }
        return chunks
    }

    /// Intersects `[lo, hi)` with the window `[windowStart, windowEnd)` and shifts
    /// the result to window-local seconds; `nil` when the intersection is empty.
    private static func clippedLocalRange(_ lo: Double, _ hi: Double,
                                          windowStart: Double,
                                          windowEnd: Double) -> (start: Double, end: Double)? {
        let clippedStart = max(lo, windowStart)
        let clippedEnd = min(hi, windowEnd)
        guard clippedEnd > clippedStart else { return nil }
        return (clippedStart - windowStart, clippedEnd - windowStart)
    }

    /// Clip segments to `[start, end)` and rebase to chunk-local seconds, then
    /// emit the flat `label-merge` array `[{start,end,name,confidence}]`.
    ///
    /// Segments that fall entirely outside the window are omitted.
    /// - Throws: Whatever `JSONSerialization.data(withJSONObject:options:)` throws.
    static func rebasedTimelineData(_ segments: [VisualTimeline.Segment],
                                    start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = segments.compactMap { seg in
            guard let local = clippedLocalRange(seg.start, seg.end,
                                                windowStart: start, windowEnd: end) else { return nil }
            return ["start": local.start, "end": local.end, "name": seg.name, "confidence": seg.confidence]
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }

    /// Clip self-VAD spans to `[start, end)` and rebase to chunk-local seconds, as
    /// the `self_vad` array `[{start,end}]` for dual-channel `label-merge`.
    ///
    /// Spans that fall entirely outside the window are omitted.
    /// - Throws: Whatever `JSONSerialization.data(withJSONObject:options:)` throws.
    static func rebasedSelfVadData(_ spans: [VADSpan], start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = spans.compactMap { span in
            guard let local = clippedLocalRange(span.start, span.end,
                                                windowStart: start, windowEnd: end) else { return nil }
            return ["start": local.start, "end": local.end]
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }

    /// Slice `[startSec, endSec)` of a WAV into `dest` as 16-bit little-endian
    /// linear PCM, 1 channel, at the source file's sample rate.
    ///
    /// The source is expected to be a 16 kHz mono WAV; the sample rate is read
    /// from the file, while the output channel count is fixed at 1.
    /// The requested range is clamped to the file. If the clamped range is empty
    /// the function returns without creating `dest`.
    /// - Throws: AVFoundation errors from opening, reading or writing the files,
    ///   or `SliceError.bufferAllocationFailed` if the copy buffer can't be created.
    static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
        let input = try AVAudioFile(forReading: source)
        let sr = input.fileFormat.sampleRate
        // Clamp the start so a negative `startSec` can't yield a negative seek position.
        let startFrame = max(0, AVAudioFramePosition((startSec * sr).rounded()))
        let endFrame = min(input.length, AVAudioFramePosition((endSec * sr).rounded()))
        guard endFrame > startFrame else { return }

        // One reusable buffer, allocated before the output file is created so an
        // allocation failure surfaces as an error rather than a truncated file.
        let block: AVAudioFrameCount = 16_000
        guard let buffer = AVAudioPCMBuffer(pcmFormat: input.processingFormat, frameCapacity: block) else {
            throw SliceError.bufferAllocationFailed
        }

        let settings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: sr,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsBigEndianKey: false,
        ]
        let output = try AVAudioFile(forWriting: dest, settings: settings,
                                     commonFormat: .pcmFormatFloat32, interleaved: false)
        input.framePosition = startFrame
        var remaining = AVAudioFrameCount(endFrame - startFrame)
        while remaining > 0 {
            let n = min(block, remaining)
            try input.read(into: buffer, frameCount: n)
            // A zero-length read means the file ended early; stop instead of spinning.
            if buffer.frameLength == 0 { break }
            try output.write(from: buffer)
            remaining -= buffer.frameLength
        }
    }

    /// Duration (seconds) of a WAV: frame count divided by the file's sample rate.
    ///
    /// Best-effort: returns 0 when the file can't be opened or reports a
    /// non-positive sample rate.
    static func duration(of url: URL) -> Double {
        guard let file = try? AVAudioFile(forReading: url), file.fileFormat.sampleRate > 0 else { return 0 }
        return Double(file.length) / file.fileFormat.sampleRate
    }
}