ab910cf742
Chunks were contiguous (start = prev end) with a naïve offset-concat stitch — no overlap. That cut sentences at boundaries, denied the diarizer context at edges, and let one voice split across chunks (the MH/Unknown_0 problem). Now each ~150s body is sliced with a 15s margin on both sides ([bodyStart-15, bodyEnd+15]); the stitcher keeps a segment only in the chunk that owns its MIDPOINT (body region) and drops it from the neighbour's margin — so boundary-spanning speech is seen whole by the backend and kept exactly once. - SessionPackager.PlannedChunk gains bodyStart/bodyEnd; planChunks adds overlapSeconds. - TranscriptAssembler.ChunkResult carries body bounds (defaults keep-all → no-overlap behaviour preserved for existing callers); assemble dedups by midpoint-in-body. - TranscriptPipeline passes body bounds through. Complements (doesn't replace) the fragment-smoothing + reconciliation safety nets; this is the upstream fix. ~+20% backend audio per interior chunk. 63/63 XCTest (new: overlap window layout + boundary-segment dedup).
92 lines
4.2 KiB
Swift
92 lines
4.2 KiB
Swift
import Foundation
|
|
|
|
/// Concatenates per-chunk `label-merge` results into one global `speakers.json`:
/// segment times offset back to global seconds, speakers unified across chunks by
/// name, and fingerprints collected for the voiceprint store.
///
/// Chunks may be sliced with an overlap margin. Each chunk declares the body
/// region it owns, and a segment is kept only by the chunk whose body contains
/// the segment's midpoint, so speech in an overlap is emitted at most once.
enum TranscriptAssembler {
    /// One chunk's backend response together with its position on the global timeline.
    struct ChunkResult {
        /// Global seconds at which the sliced window starts. Added to every
        /// chunk-relative segment time to map it back onto the session timeline.
        let chunkStart: Double
        /// The `label-merge` response produced for this chunk.
        let response: LabelMergeResponse
        // The region this chunk OWNS, in global seconds, as the half-open interval
        // [bodyStart, bodyEnd). Segments whose midpoint falls outside it are the
        // neighbour's (overlap margin) and are dropped here. The defaults keep
        // everything (no-overlap behaviour).
        //
        // These stay `var` on purpose: the synthesized memberwise initializer only
        // exposes defaulted properties as optional arguments when they are `var`.
        var bodyStart: Double = -.greatestFiniteMagnitude
        var bodyEnd: Double = .greatestFiniteMagnitude
    }

    /// The stitched result of `assemble(sessionId:app:chunks:)`.
    struct Assembled {
        /// The merged, globally-timed speakers file.
        let speakersFile: SpeakersFile
        /// Confidently-named speakers only, for VoiceprintStore.
        let fingerprints: [String: [Float]]
        /// EVERY cluster incl. Unknown — for editor voice-learning.
        let allFingerprints: [String: [Float]]
    }

    /// Source ranking when the same name appears across chunks with different sources.
    /// `mic_channel` (the local user's own microphone) is the most authoritative.
    ///
    /// - Parameter source: The `source` string reported for a speaker.
    /// - Returns: A higher number for a more authoritative source; any
    ///   unrecognised string ranks lowest.
    private static func rank(_ source: String) -> Int {
        switch source {
        case "mic_channel": return 4
        case "visual": return 3
        case "voiceprint": return 2
        default: return 1 // unmatched
        }
    }

    /// Whether `name` is an "unknown speaker" label, as decided by
    /// `LabelMergeResponse.isUnknownName`.
    private static func isUnknown(_ name: String) -> Bool {
        LabelMergeResponse.isUnknownName(name)
    }

    /// Stitches per-chunk results into a single `SpeakersFile` plus fingerprint maps.
    ///
    /// - Parameters:
    ///   - sessionId: Passed through unchanged to the resulting `SpeakersFile`.
    ///   - app: Passed through unchanged to the resulting `SpeakersFile`.
    ///   - chunks: Per-chunk results. Their order matters in three places: `models`
    ///     is taken from the last chunk, a later chunk's fingerprint overwrites an
    ///     earlier one for the same name, and when two chunks report the same
    ///     speaker with equally-ranked sources the first one seen is kept.
    /// - Returns: The assembled file (segments sorted by start time, speakers sorted
    ///   by name) together with the named-only and all-cluster fingerprint maps.
    ///   An empty `chunks` array yields a zero-duration file with no segments.
    static func assemble(sessionId: String, app: String, chunks: [ChunkResult]) -> Assembled {
        var segments: [SpeakersFile.Segment] = []
        var bestSpeaker: [String: SpeakersFile.Speaker] = [:]
        var fingerprints: [String: [Float]] = [:]
        var allFingerprints: [String: [Float]] = [:]
        var duration = 0.0

        for chunk in chunks {
            let offset = chunk.chunkStart
            // Body end bounds the real session length even on silent/all-unknown calls.
            duration = max(duration, min(chunk.bodyEnd, offset + chunk.response.duration))

            for seg in chunk.response.segments {
                let start = seg.startSeconds + offset
                let end = seg.endSeconds + offset
                // Overlap dedup: keep a segment only in the chunk that OWNS its midpoint;
                // the other chunk saw it only in its margin (for context) and drops it.
                let mid = (start + end) / 2
                guard mid >= chunk.bodyStart, mid < chunk.bodyEnd else { continue }
                segments.append(.init(start: start, end: end, speaker: seg.speaker, text: seg.text))
                // A kept segment may run past the body end; let it extend the duration.
                duration = max(duration, end)
            }

            // NOTE(review): speakers and fingerprints are merged from every chunk with no
            // ownership filter, so a speaker whose segments all fell in this chunk's
            // overlap margin (and were dropped above) is still listed — confirm intended.
            for sp in chunk.response.speakers {
                let candidate = SpeakersFile.Speaker(
                    name: sp.name, source: sp.source,
                    overlapConfidence: sp.overlapConfidence, matchSimilarity: sp.matchSimilarity)
                if let existing = bestSpeaker[sp.name] {
                    // Replace only on a strictly better source; ties keep the first seen.
                    if rank(sp.source) > rank(existing.source) { bestSpeaker[sp.name] = candidate }
                } else {
                    bestSpeaker[sp.name] = candidate
                }
                if let fp = sp.fingerprint, !fp.isEmpty {
                    allFingerprints[sp.name] = fp // every cluster, for the editor
                    if !isUnknown(sp.name) { fingerprints[sp.name] = fp } // named only, for the store
                }
            }
            // The response-level fingerprint map is applied after the per-speaker
            // fingerprints, so it wins when both carry an entry for the same name.
            for (name, fp) in chunk.response.fingerprints where !fp.isEmpty {
                allFingerprints[name] = fp
                if !isUnknown(name) { fingerprints[name] = fp }
            }
        }

        segments.sort { $0.start < $1.start }
        let speakers = bestSpeaker.values.sorted { $0.name < $1.name }
        // Models are reported from the last chunk only; empty when there are no chunks.
        let models: [String: String] = chunks.last?.response.models ?? [:]

        let file = SpeakersFile(
            sessionId: sessionId, app: app, durationSec: duration,
            speakers: speakers, segments: segments, models: models)
        return Assembled(speakersFile: file, fingerprints: fingerprints, allFingerprints: allFingerprints)
    }
}
|