Phases 2-6: detection, visual timeline, backend hand-off, voiceprints
Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
This commit is contained in:
@@ -0,0 +1,78 @@
|
||||
import Foundation
|
||||
|
||||
/// Concatenates per-chunk `label-merge` results into one global `speakers.json`:
/// segment times offset back to global seconds, speakers unified across chunks by
/// name, and fingerprints collected for the voiceprint store.
enum TranscriptAssembler {
    /// One chunk's backend response paired with the chunk's position in the session.
    struct ChunkResult {
        /// Start of the chunk in global (session) seconds; added to every segment
        /// time in `response` during assembly.
        let chunkStart: Double
        /// The `label-merge` response for this chunk.
        let response: LabelMergeResponse
    }

    /// Output of `assemble(sessionId:app:chunks:)`.
    struct Assembled {
        /// The merged, session-wide speakers file.
        let speakersFile: SpeakersFile
        /// Fingerprints of named (non-unknown) speakers, keyed by speaker name.
        let fingerprints: [String: [Float]] // name -> 192-dim, for VoiceprintStore
    }

    /// Source ranking when the same name appears across chunks with different sources.
    ///
    /// - Parameter source: The speaker's attribution source string.
    /// - Returns: `3` for `"visual"`, `2` for `"voiceprint"`, `1` for anything else.
    private static func rank(_ source: String) -> Int {
        switch source {
        case "visual": return 3
        case "voiceprint": return 2
        default: return 1 // unmatched
        }
    }

    /// Whether `name` is an unknown-speaker placeholder, as decided by
    /// `LabelMergeResponse.isUnknownName(_:)`.
    private static func isUnknown(_ name: String) -> Bool {
        LabelMergeResponse.isUnknownName(name)
    }

    /// Merges the per-chunk results into a single `SpeakersFile` plus the set of
    /// named fingerprints.
    ///
    /// - Parameters:
    ///   - sessionId: Passed through to the resulting `SpeakersFile`.
    ///   - app: Passed through to the resulting `SpeakersFile`.
    ///   - chunks: Per-chunk responses. Order matters for ties: for a given name
    ///     the first speaker entry seen wins unless a later one has a strictly
    ///     higher-ranked source, a later fingerprint for the same name replaces an
    ///     earlier one, and `models` is taken from the last chunk.
    /// - Returns: The assembled file (segments sorted by start time, speakers
    ///   sorted by name) and the collected fingerprints. An empty `chunks` array
    ///   yields a zero duration and empty collections.
    static func assemble(sessionId: String, app: String, chunks: [ChunkResult]) -> Assembled {
        var segments: [SpeakersFile.Segment] = []
        var bestSpeaker: [String: SpeakersFile.Speaker] = [:]
        var fingerprints: [String: [Float]] = [:]
        var duration = 0.0

        for chunk in chunks {
            let offset = chunk.chunkStart
            // Audio length from the chunk window, so silent/all-unknown calls still
            // report a real duration (not just the last segment's end).
            duration = max(duration, offset + chunk.response.duration)

            // Rebase chunk-relative segment times onto the global timeline.
            for seg in chunk.response.segments {
                let start = seg.startSeconds + offset
                let end = seg.endSeconds + offset
                segments.append(.init(start: start, end: end, speaker: seg.speaker, text: seg.text))
                duration = max(duration, end)
            }

            // NOTE(review): speakers are keyed by name only, so an unknown label that
            // recurs in different chunks collapses into one entry — confirm the
            // backend's unknown labels are meant to be unified across chunks.
            for sp in chunk.response.speakers {
                let candidate = SpeakersFile.Speaker(
                    name: sp.name, source: sp.source,
                    overlapConfidence: sp.overlapConfidence, matchSimilarity: sp.matchSimilarity)
                if let existing = bestSpeaker[sp.name] {
                    // Strict comparison: on equal rank the earlier chunk's entry is kept.
                    if rank(sp.source) > rank(existing.source) { bestSpeaker[sp.name] = candidate }
                } else {
                    bestSpeaker[sp.name] = candidate
                }
                // Collect named fingerprints only (never Unknown_N / Speaker_unknown).
                if !isUnknown(sp.name), let fp = sp.fingerprint, !fp.isEmpty {
                    fingerprints[sp.name] = fp
                }
            }
            // The response-level fingerprint map is applied after the per-speaker
            // fingerprints, so it takes precedence for the same name within a chunk.
            for (name, fp) in chunk.response.fingerprints where !isUnknown(name) && !fp.isEmpty {
                fingerprints[name] = fp
            }
        }

        segments.sort { $0.start < $1.start }
        let speakers = bestSpeaker.values.sorted { $0.name < $1.name }
        let models: [String: String] = chunks.last?.response.models ?? [:]

        let file = SpeakersFile(
            sessionId: sessionId, app: app, durationSec: duration,
            speakers: speakers, segments: segments, models: models)
        return Assembled(speakersFile: file, fingerprints: fingerprints)
    }
}
|
||||
Reference in New Issue
Block a user