Files
ten31-transcripts/Ten31Transcripts/Session/TranscriptPipeline.swift
T
Grant Gilliam 863136aeec Phases 2-6: detection, visual timeline, backend hand-off, voiceprints
Phase 2 (call detection): CallDetector using CoreAudio per-process mic
attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet,
ignoring our own recording; auto-record toggle. Built; pending live multi-app
confirmation by the user.

Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation,
TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema
1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR +
saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver
(window capture; frames released, never saved; minimized->visual_gap, idle != gap).
Synthetic-frame tested; adapter geometry pending real Signal fixtures + live
VisualObserver validation.

Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential,
TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline
slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated
END-TO-END against the live backend (chunk -> label-merge -> speakers.json).

Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named
fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status,
auto-send toggle (default off) + self-name setting.

All adversarial-review findings fixed. App + XCTest suite build; tests pass.
2026-06-06 00:15:49 -05:00

76 lines
3.9 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import Foundation
/// Drives a finished session through the backend: chunk -> sequential
/// `label-merge` (accumulating voiceprints) -> assemble `speakers.json` -> persist
/// fingerprints. Requests are sequential by construction (one chunk at a time).
final class TranscriptPipeline {
    /// Backend client used for every `label-merge` request.
    private let client: SparkControlClient
    /// Supplies the starting known voiceprints and receives every chunk response.
    private let voiceprints: VoiceprintStore

    /// Creates a pipeline for one backend.
    /// - Parameters:
    ///   - baseURL: Passed through to `SparkControlClient`.
    ///   - skipTLS: Passed through to `SparkControlClient`.
    ///   - voiceprints: Store read at the start of `process` and updated after each chunk.
    init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
        self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
        self.voiceprints = voiceprints
    }

    /// Process `mixedURL` against `timeline` (visual + self spans). Writes
    /// `speakers.json` into `sessionFolder` and returns it.
    ///
    /// - Parameters:
    ///   - sessionFolder: Folder that receives `speakers.json`; a temporary `chunks`
    ///     subfolder is created inside it and removed again before returning.
    ///   - sessionId: Passed to `TranscriptAssembler.assemble`.
    ///   - app: Passed to `TranscriptAssembler.assemble`.
    ///   - mixedURL: Audio file that is measured and sliced into chunks.
    ///   - timeline: Segments that are rebased per chunk and sent with each request.
    ///   - progress: Called as `(chunk.index, plan.count)` before each chunk and as
    ///     `(plan.count, plan.count)` when all chunks are done; `(0, 0)` for an
    ///     empty session.
    /// - Returns: The assembled speakers file that was written to disk.
    /// - Throws: `CancellationError` if the task is cancelled between chunks, plus any
    ///   error from creating the scratch directory, slicing audio, rebasing the
    ///   timeline, the backend request, or writing `speakers.json`.
    func process(sessionFolder: URL,
                 sessionId: String,
                 app: String,
                 mixedURL: URL,
                 timeline: [VisualTimeline.Segment],
                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
        let speakersURL = sessionFolder.appendingPathComponent("speakers.json")
        let duration = SessionPackager.duration(of: mixedURL)
        let plan = SessionPackager.planChunks(durationSec: duration)

        // Zero-duration / empty session -> a valid empty speakers.json, no backend call.
        if plan.isEmpty || duration <= 0 {
            let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
            try empty.speakersFile.write(to: speakersURL)
            await progress?(0, 0)
            return empty.speakersFile
        }

        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
        // Propagate a failure here instead of swallowing it: without the scratch
        // directory no chunk can be written, and a missing chunk file is otherwise
        // treated below as an "empty slice". An already-existing directory is not an
        // error because intermediate directories are allowed.
        try FileManager.default.createDirectory(at: chunksDir, withIntermediateDirectories: true)
        // Registered only after creation succeeded, so we never delete something at
        // this path that we failed to set up. Runs on success OR throw.
        defer { try? FileManager.default.removeItem(at: chunksDir) }

        // Start from stored voiceprints; accumulate this call's prints across chunks
        // for within-call unification (the store only persists high-confidence ones).
        var known = voiceprints.knownVoiceprints()
        var results: [TranscriptAssembler.ChunkResult] = []

        for chunk in plan {
            try Task.checkCancellation()
            await progress?(chunk.index, plan.count)

            let chunkURL = chunksDir.appendingPathComponent("chunk_\(String(format: "%03d", chunk.index)).wav")
            try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
            // Empty slice -> skip: no file was produced for this chunk.
            guard FileManager.default.fileExists(atPath: chunkURL.path) else { continue }

            let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
            let response = try await client.labelMerge(
                audioURL: chunkURL, timeline: timelineData,
                knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)

            // Carry named fingerprints forward so later chunks are matched against them.
            for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
                known[name] = fp
            }
            voiceprints.update(with: response)
            results.append(.init(chunkStart: chunk.start, response: response))

            // Best-effort early removal; the deferred cleanup removes the whole folder anyway.
            try? FileManager.default.removeItem(at: chunkURL)
        }

        await progress?(plan.count, plan.count)
        let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
        try assembled.speakersFile.write(to: speakersURL)
        return assembled.speakersFile
    }

    /// Build the `label-merge` timeline from mic-VAD self spans (Phase 1/2). Once
    /// the visual adapters land (Phase 3-4), their segments are merged in too.
    ///
    /// - Parameters:
    ///   - spans: Self-speech spans; each contributes its `start`, `end` and `confidence`.
    ///   - selfName: Name assigned to every produced segment.
    /// - Returns: One segment per span, in the same order, with source `"mic_vad"`.
    static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
        spans.map { span in
            .init(start: span.start, end: span.end, name: selfName, confidence: span.confidence, source: "mic_vad")
        }
    }
}