Files
ten31-transcripts/Ten31Transcripts/Session/TranscriptPipeline.swift
T
Grant Gilliam ab910cf742 Chunk overlap + overlap-aware stitching
Chunks were contiguous (start = prev end) with a naïve offset-concat stitch — no
overlap. That cut sentences at boundaries, denied the diarizer context at edges, and
let one voice split across chunks (the MH/Unknown_0 problem). Now each ~150s body is
sliced with a 15s margin on both sides ([bodyStart-15, bodyEnd+15]); the stitcher
keeps a segment only in the chunk that owns its MIDPOINT (body region) and drops it
from the neighbour's margin — so boundary-spanning speech is seen whole by the
backend and kept exactly once.

- SessionPackager.PlannedChunk gains bodyStart/bodyEnd; planChunks adds overlapSeconds.
- TranscriptAssembler.ChunkResult carries body bounds (defaults keep-all → no-overlap
  behaviour preserved for existing callers); assemble dedups by midpoint-in-body.
- TranscriptPipeline passes body bounds through.

Complements (doesn't replace) the fragment-smoothing + reconciliation safety nets;
this is the upstream fix. ~+20% backend audio per interior chunk. 63/63 XCTest
(new: overlap window layout + boundary-segment dedup).
2026-06-08 13:03:56 -05:00

117 lines
6.7 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import Foundation
/// Drives a finished session through the backend: chunk → sequential
/// `label-merge` (accumulating voiceprints) → assemble → `speakers.json` → persist
/// fingerprints. Requests are sequential by construction (one chunk at a time).
final class TranscriptPipeline {
    private let client: SparkControlClient
    private let voiceprints: VoiceprintStore

    /// Creates a pipeline bound to one backend and one voiceprint store.
    /// - Parameters:
    ///   - baseURL: Backend base URL, forwarded to `SparkControlClient`.
    ///   - skipTLS: Forwarded to `SparkControlClient` (presumably disables TLS
    ///     verification — confirm against the client implementation).
    ///   - voiceprints: Store that seeds the known voiceprints for each run and is
    ///     updated with every chunk response.
    init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
        self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
        self.voiceprints = voiceprints
    }

    /// Process a finished session. **Dual-channel** when the system track is healthy
    /// and present: mic (the local user) + system (remote) go as separate files, the
    /// `timeline` names only the remote speakers, and `selfSpans` become `self_vad`.
    /// Otherwise falls back to the **mono** mixed file with self folded into the
    /// timeline. Writes `speakers.json` into `sessionFolder`. `progress(done,total)`
    /// is called per chunk.
    ///
    /// - Parameters:
    ///   - sessionFolder: Folder that receives `speakers.json`, the best-effort
    ///     `cluster_fingerprints.json`, and a temporary `chunks/` scratch directory.
    ///   - sessionId: Passed through to `TranscriptAssembler.assemble`.
    ///   - app: Passed through to `TranscriptAssembler.assemble`.
    ///   - micURL: Mic track; sliced per chunk in dual-channel mode.
    ///   - systemURL: System track; sliced per chunk in dual-channel mode.
    ///   - mixedURL: Mixed track; sliced per chunk in the mono fallback.
    ///   - timeline: Visual timeline segments, rebased to each chunk window.
    ///   - selfSpans: Mic-VAD spans for the local user.
    ///   - selfName: Display name attached to the local user's speech.
    ///   - systemHealthy: Caller's verdict on the system track; dual-channel mode
    ///     additionally requires both files to exist and a non-zero system duration.
    ///   - progress: Optional callback invoked with `(chunk.index, plan.count)` before
    ///     each chunk and `(plan.count, plan.count)` at the end (`(0, 0)` for an
    ///     empty session).
    /// - Returns: The assembled speakers file, which has also been written to disk.
    /// - Throws: `CancellationError` if the task is cancelled between chunks, plus any
    ///   error from creating the scratch directory, slicing audio, rebasing the
    ///   timeline, the backend request, or writing `speakers.json`.
    func process(sessionFolder: URL,
                 sessionId: String,
                 app: String,
                 micURL: URL,
                 systemURL: URL,
                 mixedURL: URL,
                 timeline: [VisualTimeline.Segment],
                 selfSpans: [VADSpan],
                 selfName: String,
                 systemHealthy: Bool,
                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
        let fm = FileManager.default
        let dual = systemHealthy
            && fm.fileExists(atPath: micURL.path) && fm.fileExists(atPath: systemURL.path)
            && SessionPackager.duration(of: systemURL) > 0
        let duration = dual
            ? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL))
            : SessionPackager.duration(of: mixedURL)
        let plan = SessionPackager.planChunks(durationSec: duration)

        // Zero-duration / empty session → a valid empty speakers.json, no backend call.
        if plan.isEmpty || duration <= 0 {
            let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
            try empty.speakersFile.write(to: sessionFolder.appendingPathComponent("speakers.json"))
            await progress?(0, 0)
            return empty.speakersFile
        }

        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
        // Propagate a failure here: if the scratch directory cannot be created, slices
        // may never materialise, every chunk would be skipped below, and an empty
        // transcript would be written as if the session had been silent.
        try fm.createDirectory(at: chunksDir, withIntermediateDirectories: true)
        defer { try? fm.removeItem(at: chunksDir) } // cleanup on success OR throw

        // Start from stored voiceprints; accumulate this call's prints across chunks
        // for within-call unification (the store only persists high-confidence ones).
        var known = voiceprints.knownVoiceprints()
        var results: [TranscriptAssembler.ChunkResult] = []

        // Mono fallback needs self folded into the timeline; dual sends it separately.
        let monoTimeline = dual ? timeline
            : timeline + Self.timeline(fromSelfSpans: selfSpans, selfName: selfName)

        for chunk in plan {
            try Task.checkCancellation()
            await progress?(chunk.index, plan.count)
            let pad = String(format: "%03d", chunk.index)
            let response: LabelMergeResponse

            if dual {
                let micChunk = chunksDir.appendingPathComponent("chunk_\(pad)_mic.wav")
                let sysChunk = chunksDir.appendingPathComponent("chunk_\(pad)_sys.wav")
                try SessionPackager.sliceAudio(from: micURL, startSec: chunk.start, endSec: chunk.end, to: micChunk)
                try SessionPackager.sliceAudio(from: systemURL, startSec: chunk.start, endSec: chunk.end, to: sysChunk)
                // Either slice missing → skip the chunk; leftovers go with `chunksDir`.
                guard fm.fileExists(atPath: micChunk.path), fm.fileExists(atPath: sysChunk.path) else { continue }
                let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
                let selfVadData = try SessionPackager.rebasedSelfVadData(selfSpans, start: chunk.start, end: chunk.end)
                response = try await client.labelMergeDual(
                    micURL: micChunk, systemURL: sysChunk, selfName: selfName, selfVad: selfVadData,
                    timeline: timelineData, knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
                try? fm.removeItem(at: micChunk); try? fm.removeItem(at: sysChunk)
            } else {
                let chunkURL = chunksDir.appendingPathComponent("chunk_\(pad).wav")
                try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
                guard fm.fileExists(atPath: chunkURL.path) else { continue } // empty slice → skip
                let timelineData = try SessionPackager.rebasedTimelineData(monoTimeline, start: chunk.start, end: chunk.end)
                response = try await client.labelMerge(
                    audioURL: chunkURL, timeline: timelineData,
                    knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
                try? fm.removeItem(at: chunkURL)
            }

            // Only named speakers feed the next chunk's known voiceprints.
            for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
                known[name] = fp
            }
            voiceprints.update(with: response)
            results.append(.init(chunkStart: chunk.start, response: response,
                                 bodyStart: chunk.bodyStart, bodyEnd: chunk.bodyEnd))
        }
        await progress?(plan.count, plan.count)

        let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
        try assembled.speakersFile.write(to: sessionFolder.appendingPathComponent("speakers.json"))

        // Persist every cluster's voiceprint (incl. Unknown) so the speaker editor can
        // teach the store a voice when the user renames an Unknown to a real name.
        // Best-effort: failures here never fail the session.
        if !assembled.allFingerprints.isEmpty {
            let payload: Any = assembled.allFingerprints.mapValues { $0.map(Double.init) }
            // `JSONSerialization.data(withJSONObject:)` raises an Objective-C exception
            // (not a Swift error) for non-finite numbers, which `try?` cannot catch, so
            // validate the payload first and skip the write if it is not encodable.
            if JSONSerialization.isValidJSONObject(payload),
               let data = try? JSONSerialization.data(withJSONObject: payload, options: [.sortedKeys]) {
                try? data.write(to: sessionFolder.appendingPathComponent("cluster_fingerprints.json"))
            }
        }
        return assembled.speakersFile
    }

    /// Build the `label-merge` timeline from mic-VAD self spans (Phase 1/2). Once
    /// the visual adapters land (Phase 3–4), their segments are merged in too.
    ///
    /// - Parameters:
    ///   - spans: Mic-VAD spans attributed to the local user.
    ///   - selfName: Name assigned to every produced segment.
    /// - Returns: One segment per span, keeping its start, end and confidence, tagged
    ///   with source `"mic_vad"`.
    static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
        spans.map { .init(start: $0.start, end: $0.end, name: selfName, confidence: $0.confidence, source: "mic_vad") }
    }
}