Files
ten31-transcripts/Ten31Transcripts/Session/TranscriptPipeline.swift
T
Grant Gilliam 1b6bb8ab67 Drop stuck whole-call visual spans at processing time
Defense-in-depth + salvage for sessions captured before the adapter fix: drop any
vision-source span whose single unbroken duration covers ≥60% of the call. No one
speaks that long without a break, so it's a stuck/false active-speaker cue that
would dominate backend name attribution. Self (mic_vad) spans are never dropped.
Applied to both the live and re-process paths. Test added; 66 pass.
2026-06-08 16:21:45 -05:00

134 lines
7.7 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import Foundation
/// Drives a finished session through the backend: chunk sequential
/// `label-merge` (accumulating voiceprints) assemble `speakers.json` persist
/// fingerprints. Requests are sequential by construction (one chunk at a time).
final class TranscriptPipeline {
private let client: SparkControlClient
private let voiceprints: VoiceprintStore
init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
self.voiceprints = voiceprints
}
/// Process a finished session. **Dual-channel** when the system track is healthy
/// and present: mic (the local user) + system (remote) go as separate files, the
/// `timeline` names only the remote speakers, and `selfSpans` become `self_vad`.
/// Otherwise falls back to the **mono** mixed file with self folded into the
/// timeline. Writes `speakers.json` into `sessionFolder`. `progress(done,total)`
/// is called per chunk.
func process(sessionFolder: URL,
sessionId: String,
app: String,
micURL: URL,
systemURL: URL,
mixedURL: URL,
timeline: [VisualTimeline.Segment],
selfSpans: [VADSpan],
selfName: String,
systemHealthy: Bool,
progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
let fm = FileManager.default
let dual = systemHealthy
&& fm.fileExists(atPath: micURL.path) && fm.fileExists(atPath: systemURL.path)
&& SessionPackager.duration(of: systemURL) > 0
let duration = dual
? max(SessionPackager.duration(of: micURL), SessionPackager.duration(of: systemURL))
: SessionPackager.duration(of: mixedURL)
let plan = SessionPackager.planChunks(durationSec: duration)
// Zero-duration / empty session a valid empty speakers.json, no backend call.
if plan.isEmpty || duration <= 0 {
let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
try empty.speakersFile.write(to: sessionFolder.appendingPathComponent("speakers.json"))
await progress?(0, 0)
return empty.speakersFile
}
let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
try? fm.createDirectory(at: chunksDir, withIntermediateDirectories: true)
defer { try? fm.removeItem(at: chunksDir) } // cleanup on success OR throw
// Defensive: drop any visual span covering most of the call in one unbroken
// segment the signature of a stuck/false active-speaker cue (e.g. a solid
// camera-off tile read as "speaking" the whole call). Such a span would
// dominate the backend's name attribution and collapse every voice onto one
// name. Also salvages sessions captured before the adapter fix landed.
let vis = Self.dropStuckSpans(timeline, duration: duration)
// Start from stored voiceprints; accumulate this call's prints across chunks
// for within-call unification (the store only persists high-confidence ones).
var known = voiceprints.knownVoiceprints()
var results: [TranscriptAssembler.ChunkResult] = []
// Mono fallback needs self folded into the timeline; dual sends it separately.
let monoTimeline = dual ? vis
: vis + Self.timeline(fromSelfSpans: selfSpans, selfName: selfName)
for chunk in plan {
try Task.checkCancellation()
await progress?(chunk.index, plan.count)
let pad = String(format: "%03d", chunk.index)
let response: LabelMergeResponse
if dual {
let micChunk = chunksDir.appendingPathComponent("chunk_\(pad)_mic.wav")
let sysChunk = chunksDir.appendingPathComponent("chunk_\(pad)_sys.wav")
try SessionPackager.sliceAudio(from: micURL, startSec: chunk.start, endSec: chunk.end, to: micChunk)
try SessionPackager.sliceAudio(from: systemURL, startSec: chunk.start, endSec: chunk.end, to: sysChunk)
guard fm.fileExists(atPath: micChunk.path), fm.fileExists(atPath: sysChunk.path) else { continue }
let timelineData = try SessionPackager.rebasedTimelineData(vis, start: chunk.start, end: chunk.end)
let selfVadData = try SessionPackager.rebasedSelfVadData(selfSpans, start: chunk.start, end: chunk.end)
response = try await client.labelMergeDual(
micURL: micChunk, systemURL: sysChunk, selfName: selfName, selfVad: selfVadData,
timeline: timelineData, knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
try? fm.removeItem(at: micChunk); try? fm.removeItem(at: sysChunk)
} else {
let chunkURL = chunksDir.appendingPathComponent("chunk_\(pad).wav")
try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
guard fm.fileExists(atPath: chunkURL.path) else { continue } // empty slice skip
let timelineData = try SessionPackager.rebasedTimelineData(monoTimeline, start: chunk.start, end: chunk.end)
response = try await client.labelMerge(
audioURL: chunkURL, timeline: timelineData,
knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)
try? fm.removeItem(at: chunkURL)
}
for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
known[name] = fp
}
voiceprints.update(with: response)
results.append(.init(chunkStart: chunk.start, response: response,
bodyStart: chunk.bodyStart, bodyEnd: chunk.bodyEnd))
}
await progress?(plan.count, plan.count)
let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
try assembled.speakersFile.write(to: sessionFolder.appendingPathComponent("speakers.json"))
// Persist every cluster's voiceprint (incl. Unknown) so the speaker editor can
// teach the store a voice when the user renames an Unknown to a real name.
if !assembled.allFingerprints.isEmpty,
let data = try? JSONSerialization.data(withJSONObject: assembled.allFingerprints.mapValues { $0.map(Double.init) },
options: [.sortedKeys]) {
try? data.write(to: sessionFolder.appendingPathComponent("cluster_fingerprints.json"))
}
return assembled.speakersFile
}
/// Build the `label-merge` timeline from mic-VAD self spans (Phase 1/2). Once
/// the visual adapters land (Phase 34), their segments are merged in too.
static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
spans.map { .init(start: $0.start, end: $0.end, name: selfName, confidence: $0.confidence, source: "mic_vad") }
}
/// Drop visual (vision-source) spans whose single unbroken duration covers at
/// least `maxFraction` of the whole call no one legitimately speaks that long
/// without a break, so it's a stuck/false cue. Self spans (mic_vad) are kept.
static func dropStuckSpans(_ timeline: [VisualTimeline.Segment], duration: Double,
maxFraction: Double = 0.6) -> [VisualTimeline.Segment] {
guard duration > 0 else { return timeline }
let limit = maxFraction * duration
return timeline.filter { $0.source != "vision" || ($0.end - $0.start) < limit }
}
}