863136aeec
Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
76 lines
3.9 KiB
Swift
76 lines
3.9 KiB
Swift
import Foundation
|
||
|
||
/// Drives a finished session through the backend: chunk → sequential
/// `label-merge` (accumulating voiceprints) → assemble `speakers.json` → persist
/// fingerprints. Requests are sequential by construction (one chunk at a time).
final class TranscriptPipeline {
    /// Backend client that performs the per-chunk `labelMerge` requests.
    private let client: SparkControlClient

    /// Supplies the initial known voiceprints and receives every chunk response
    /// through `update(with:)`.
    private let voiceprints: VoiceprintStore

    /// Creates a pipeline talking to the backend at `baseURL`.
    ///
    /// - Parameters:
    ///   - baseURL: Forwarded unchanged to `SparkControlClient`.
    ///   - skipTLS: Forwarded unchanged to `SparkControlClient`.
    ///   - voiceprints: Store read before the first chunk and updated after each one.
    init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
        self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
        self.voiceprints = voiceprints
    }

    /// Process `mixedURL` against `timeline` (visual + self spans). Writes
    /// `speakers.json` into `sessionFolder` and returns it.
    ///
    /// - Parameters:
    ///   - sessionFolder: Folder that receives `speakers.json`; a temporary
    ///     `chunks` subfolder is created inside it and removed on exit.
    ///   - sessionId: Passed through to `TranscriptAssembler.assemble`.
    ///   - app: Passed through to `TranscriptAssembler.assemble`.
    ///   - mixedURL: Audio file that is measured and sliced into chunks.
    ///   - timeline: Segments that are rebased per chunk and sent with the audio.
    ///   - progress: Called with `(chunk.index, plan.count)` before each chunk and
    ///     with `(plan.count, plan.count)` after the last one; called once with
    ///     `(0, 0)` for an empty session.
    /// - Returns: The assembled speakers file that was written to disk.
    /// - Throws: `CancellationError` when the task is cancelled between chunks, and
    ///   any error raised while creating the scratch folder, slicing audio,
    ///   building the timeline payload, calling the backend, or writing the result.
    func process(sessionFolder: URL,
                 sessionId: String,
                 app: String,
                 mixedURL: URL,
                 timeline: [VisualTimeline.Segment],
                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
        let speakersURL = sessionFolder.appendingPathComponent("speakers.json")
        let duration = SessionPackager.duration(of: mixedURL)
        let plan = SessionPackager.planChunks(durationSec: duration)

        // Zero-duration / empty session → a valid empty speakers.json, no backend call.
        if plan.isEmpty || duration <= 0 {
            let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
            try empty.speakersFile.write(to: speakersURL)
            await progress?(0, 0)
            return empty.speakersFile
        }

        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
        // Propagate a creation failure instead of swallowing it: the caller sees the
        // real cause, and because the cleanup `defer` below is not yet registered we
        // never remove an item at this path that could not be set up as our folder.
        try FileManager.default.createDirectory(at: chunksDir, withIntermediateDirectories: true)
        defer { try? FileManager.default.removeItem(at: chunksDir) } // cleanup on success OR throw

        // Start from stored voiceprints; accumulate this call's prints across chunks
        // for within-call unification (the store only persists high-confidence ones).
        var known = voiceprints.knownVoiceprints()
        var results: [TranscriptAssembler.ChunkResult] = []

        for chunk in plan {
            try Task.checkCancellation()
            await progress?(chunk.index, plan.count)

            let chunkName = "chunk_\(String(format: "%03d", chunk.index)).wav"
            let chunkURL = chunksDir.appendingPathComponent(chunkName)
            try SessionPackager.sliceAudio(from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
            // No file on disk after slicing means an empty slice → skip this chunk.
            guard FileManager.default.fileExists(atPath: chunkURL.path) else { continue }

            let timelineData = try SessionPackager.rebasedTimelineData(timeline, start: chunk.start, end: chunk.end)
            let response = try await client.labelMerge(
                audioURL: chunkURL, timeline: timelineData,
                knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)

            // Carry every named fingerprint forward so later chunks can match it.
            for (name, fp) in response.fingerprints where !LabelMergeResponse.isUnknownName(name) {
                known[name] = fp
            }
            voiceprints.update(with: response)
            results.append(.init(chunkStart: chunk.start, response: response))

            // Drop the slice right away; the deferred folder removal is the backstop.
            try? FileManager.default.removeItem(at: chunkURL)
        }
        await progress?(plan.count, plan.count)

        let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
        try assembled.speakersFile.write(to: speakersURL)
        return assembled.speakersFile
    }

    /// Build the `label-merge` timeline from mic-VAD self spans (Phase 1/2). Once
    /// the visual adapters land (Phase 3–4), their segments are merged in too.
    ///
    /// - Parameters:
    ///   - spans: Self spans; each one becomes exactly one segment with the same
    ///     `start`, `end` and `confidence`.
    ///   - selfName: Name assigned to every produced segment.
    /// - Returns: One segment per span, in input order, tagged with source `"mic_vad"`.
    static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
        spans.map { .init(start: $0.start, end: $0.end, name: selfName, confidence: $0.confidence, source: "mic_vad") }
    }
}
|