Phases 2-6: detection, visual timeline, backend hand-off, voiceprints
Phase 2 (call detection): CallDetector using CoreAudio per-process mic attribution (anarlog technique) — robust start+stop for Zoom/Teams/Signal/Meet, ignoring our own recording; auto-record toggle. Built; pending live multi-app confirmation by the user. Phase 3 (visual timeline foundation): AppAdapter protocol + SpeakerObservation, TimelineBuilder (hysteresis/overlap/self-merge/aliases), VisualTimeline (schema 1.1), TextRecognizer (Vision OCR), FrameSampler + GridCallAnalyzer (name OCR + saturated-highlight active-speaker attribution), SignalAdapter, VisualObserver (window capture; frames released, never saved; minimized->visual_gap, idle != gap). Synthetic-frame tested; adapter geometry pending real Signal fixtures + live VisualObserver validation. Phase 5 (backend hand-off): SparkControlClient (multipart label-merge, sequential, TLS-skip, 503 Retry-After/413), SessionPackager (chunk plan + WAV slice + timeline slice/rebase), TranscriptAssembler + SpeakersFile, TranscriptPipeline. Validated END-TO-END against the live backend (chunk -> label-merge -> speakers.json). Phase 6 (voiceprints): VoiceprintStore (known_voiceprints, persist named fingerprints, skip Unknown). Wired: 'Send to backend' button + transcript status, auto-send toggle (default off) + self-name setting. All adversarial-review findings fixed. App + XCTest suite build; tests pass.
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import Foundation
|
||||
import Combine
|
||||
import AppKit
|
||||
import CoreGraphics
|
||||
|
||||
struct SessionInfo: Equatable {
|
||||
let folder: URL
|
||||
@@ -25,6 +26,14 @@ final class SessionController: ObservableObject {
|
||||
case error(String)
|
||||
}
|
||||
|
||||
/// Backend transcription status for the most recent session.
|
||||
enum TranscriptStatus: Equatable {
    /// Nothing is in flight for the current session; also the reset value
    /// after a session stops or a run is cancelled.
    case idle

    /// A backend run is in progress: chunks done so far, then total chunks.
    case processing(Int, Int)

    /// The run finished; counts come from the assembled `speakers.json`.
    case done(speakers: Int, segments: Int)

    /// The run failed; carries the error's localized description.
    case failed(String)
}
|
||||
|
||||
/// Set in init so `AppDelegate.applicationShouldTerminate` can finalize a
|
||||
/// recording in progress before the app quits.
|
||||
static weak var shared: SessionController?
|
||||
@@ -37,12 +46,34 @@ final class SessionController: ObservableObject {
|
||||
@Published private(set) var systemLevel: Float = 0
|
||||
/// Surfaced after a session if system audio stopped early.
|
||||
@Published private(set) var warning: String?
|
||||
/// Mirrored from `CallDetector` for the UI.
|
||||
@Published private(set) var detectionStatus: CallDetector.Status = .disabled
|
||||
/// Backend transcription status for the last session.
|
||||
@Published private(set) var transcriptStatus: TranscriptStatus = .idle
|
||||
|
||||
private let settings: AppSettings
|
||||
private var voiceprints: VoiceprintStore
|
||||
private let detector = CallDetector()
|
||||
private var cancellables = Set<AnyCancellable>()
|
||||
private var currentLabel = "manual"
|
||||
/// Inputs needed to (re)process the last finished session through the backend.
|
||||
private struct ProcessInputs {
    /// Session directory; `speakers.json` is written here by the pipeline.
    let folder: URL
    /// Identifier sent along with the session (the folder's last path component).
    let sessionId: String
    /// Label the session was started with ("manual" or the detected app's label).
    let app: String
    /// The mixed recording that gets chunked and uploaded.
    let mixedURL: URL
    /// Mic-VAD spans attributed to the local user; used to build the timeline.
    let selfSpans: [VADSpan]
}
|
||||
private var lastProcess: ProcessInputs?
|
||||
private var processTask: Task<Void, Never>?
|
||||
private var recorder: AudioRecorder?
|
||||
private var currentFolder: URL?
|
||||
private var startTime: Date?
|
||||
private var timer: Timer?
|
||||
/// True when the current session was started by call detection (not the user).
|
||||
private var autoStarted = false
|
||||
/// Set if a detected call ends while we're still in `.starting`.
|
||||
private var pendingAutoStop = false
|
||||
/// The in-flight start or stop Task, so `prepareForTermination` can await it.
|
||||
private var lifecycleTask: Task<Void, Never>?
|
||||
/// Bumped each time a start/stop Task is spawned (Task is a value type, so this
|
||||
@@ -51,7 +82,64 @@ final class SessionController: ObservableObject {
|
||||
|
||||
/// Creates the controller, registers it as `SessionController.shared`, and
/// subscribes to the call detector and to the settings that affect it.
///
/// - Parameter settings: App settings; the output folder decides where the
///   voiceprint database lives and `autoRecordOnDetection` drives the detector.
init(settings: AppSettings) {
    self.settings = settings
    let voiceprintURL = settings.outputFolderURL.appendingPathComponent("voiceprints.json")
    self.voiceprints = VoiceprintStore(fileURL: voiceprintURL)
    SessionController.shared = self

    // Detector callbacks and status mirroring for the UI.
    detector.onCallStart = { [weak self] detected in self?.handleCallStart(detected) }
    detector.onCallEnd = { [weak self] in self?.handleCallEnd() }
    detector.$status
        .sink { [weak self] newStatus in self?.detectionStatus = newStatus }
        .store(in: &cancellables)

    // Follow output-folder changes with a fresh voiceprint store. The initial
    // value is skipped (the store above already covers it). A pipeline that is
    // already running holds the store it captured, so swapping here is safe.
    settings.$outputFolderPath
        .dropFirst()
        .sink { [weak self] newPath in
            guard let self else { return }
            let expanded = (newPath as NSString).expandingTildeInPath
            let directory = URL(fileURLWithPath: expanded, isDirectory: true)
            self.voiceprints = VoiceprintStore(
                fileURL: directory.appendingPathComponent("voiceprints.json"))
        }
        .store(in: &cancellables)

    settings.$autoRecordOnDetection
        .sink { [weak self] enabled in
            guard let self else { return }
            guard !enabled else {
                self.detector.enable()
                return
            }
            self.detector.disable()
            // With the detector off nothing would ever end an auto-started
            // session, so end it now — or as soon as the in-flight start lands.
            guard self.autoStarted else { return }
            switch self.state {
            case .recording:
                self.stop()
            case .starting:
                self.pendingAutoStop = true
            case .idle, .error, .finishing:
                break
            }
        }
        .store(in: &cancellables)
}
|
||||
|
||||
// MARK: - Auto-detection
|
||||
|
||||
/// Detector callback: a call was detected in `app`. Starts an auto-labelled
/// session when auto-record is enabled and no session is currently active.
private func handleCallStart(_ app: CallDetector.DetectedApp) {
    guard settings.autoRecordOnDetection else { return }
    switch state {
    case .starting, .recording, .finishing:
        // A session is already active; leave it alone.
        return
    case .idle, .error:
        start(label: app.label, auto: true)
    }
}
|
||||
|
||||
/// Detector callback: the detected call ended. Only sessions that were started
/// by detection are affected; a manual recording is never stopped from here.
private func handleCallEnd() {
    guard autoStarted else { return }
    switch state {
    case .idle, .error, .finishing:
        return
    case .starting:
        // Too early to stop; the start completion checks this flag.
        pendingAutoStop = true
    case .recording:
        stop()
    }
}
|
||||
|
||||
var isBusy: Bool {
|
||||
@@ -68,15 +156,18 @@ final class SessionController: ObservableObject {
|
||||
|
||||
// MARK: - Start / Stop
|
||||
|
||||
private func start() {
|
||||
private func start(label: String = "manual", auto: Bool = false) {
|
||||
let folder: URL
|
||||
do {
|
||||
folder = try makeSessionFolder()
|
||||
folder = try makeSessionFolder(label: label)
|
||||
} catch {
|
||||
fail("Couldn't create session folder: \(error.localizedDescription)")
|
||||
return
|
||||
}
|
||||
currentFolder = folder
|
||||
currentLabel = label
|
||||
autoStarted = auto
|
||||
pendingAutoStop = false
|
||||
let recorder = AudioRecorder(
|
||||
micURL: folder.appendingPathComponent("mic.wav"),
|
||||
systemURL: folder.appendingPathComponent("system.wav"),
|
||||
@@ -92,12 +183,36 @@ final class SessionController: ObservableObject {
|
||||
self.state = .recording
|
||||
self.startTime = Date()
|
||||
self.startTimer()
|
||||
// A detected call may have ended while we were still starting.
|
||||
if self.pendingAutoStop {
|
||||
self.pendingAutoStop = false
|
||||
self.stop()
|
||||
}
|
||||
} catch {
|
||||
self.fail("Couldn't start recording: \(error.localizedDescription)")
|
||||
self.handleStartFailure(error)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Map a recorder start failure to an actionable message. The common case is
|
||||
/// Screen Recording getting re-checked after a rebuild (the SCStream auth
|
||||
/// check fails even though CGPreflight reports granted), so re-prompt and open
|
||||
/// the right Settings pane rather than show a cryptic TCC error.
|
||||
private func handleStartFailure(_ error: Error) {
    let description = error.localizedDescription
    let lowered = description.lowercased()

    // Substrings that mark the failure as a Screen Recording permission problem.
    let permissionHints = ["declined", "tcc", "screen", "permission"]
    let looksLikeScreenIssue = permissionHints.contains { lowered.contains($0) }

    guard looksLikeScreenIssue else {
        fail("Couldn't start recording: \(description)")
        return
    }

    // Re-prompt, then take the user straight to the Screen Recording pane.
    _ = CGRequestScreenCaptureAccess()
    if let pane = URL(string: "x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture") {
        NSWorkspace.shared.open(pane)
    }
    fail("Screen Recording needs re-approval for this build. Toggle Ten31Transcripts off then on in System Settings ▸ Screen Recording, then restart the app.")
}
|
||||
|
||||
private func stop() {
|
||||
guard let recorder else { return }
|
||||
state = .finishing
|
||||
@@ -114,20 +229,66 @@ final class SessionController: ObservableObject {
|
||||
micLevel = 0
|
||||
systemLevel = 0
|
||||
warning = result.systemNote.map { "System audio stopped early: \($0)" }
|
||||
transcriptStatus = .idle
|
||||
if let folder = currentFolder {
|
||||
writeSelfSpans(result, to: folder)
|
||||
lastSession = SessionInfo(
|
||||
folder: folder, mixedURL: result.mixedURL,
|
||||
duration: result.duration, selfSpanCount: result.selfSpans.count)
|
||||
lastProcess = ProcessInputs(
|
||||
folder: folder, sessionId: folder.lastPathComponent, app: currentLabel,
|
||||
mixedURL: result.mixedURL, selfSpans: result.selfSpans)
|
||||
}
|
||||
let autoSend = settings.autoSendOnStop
|
||||
currentFolder = nil
|
||||
autoStarted = false
|
||||
pendingAutoStop = false
|
||||
elapsed = 0
|
||||
state = .idle
|
||||
if autoSend { processLastSession() }
|
||||
}
|
||||
|
||||
// MARK: - Backend transcription
|
||||
|
||||
/// Send the last finished session to the backend → `speakers.json`. Uses the
|
||||
/// mic-VAD self spans as the timeline for now; visual segments (Phase 3–4) get
|
||||
/// merged in once the adapters land. Safe to call manually ("Send to backend")
|
||||
/// or automatically on stop.
|
||||
func processLastSession() {
    guard let inputs = lastProcess else { return }
    // One run at a time: ignore the request while a run is reporting progress.
    // NOTE(review): `stop()` resets `transcriptStatus` to `.idle`, so a run for
    // an earlier session may still be in flight when this guard passes; that
    // older task would then be replaced in `processTask` and keep publishing
    // status. Confirm whether it should be cancelled here.
    if case .processing = transcriptStatus { return }
    transcriptStatus = .processing(0, 1)

    // Capture value snapshots so a settings/voiceprint swap can't affect this run.
    let settings = self.settings
    let voiceprints = self.voiceprints
    processTask = Task {
        let pipeline = TranscriptPipeline(
            baseURL: settings.backendBaseURL,
            skipTLS: settings.skipTLSVerification,
            voiceprints: voiceprints)
        let timeline = TranscriptPipeline.timeline(
            fromSelfSpans: inputs.selfSpans, selfName: settings.selfName)
        do {
            let speakers = try await pipeline.process(
                sessionFolder: inputs.folder, sessionId: inputs.sessionId, app: inputs.app,
                mixedURL: inputs.mixedURL, timeline: timeline,
                progress: { done, total in
                    await MainActor.run { self.transcriptStatus = .processing(done, total) }
                })
            self.transcriptStatus = .done(
                speakers: speakers.speakers.count, segments: speakers.segments.count)
        } catch is CancellationError {
            self.transcriptStatus = .idle
        } catch {
            // Cancellation does not always arrive as `CancellationError`: a
            // cancelled URL load throws `URLError.cancelled`, and other work may
            // throw its own error once the task is cancelled. Treat all of these
            // as "cancelled", not as a failure shown to the user.
            let wasCancelled = Task.isCancelled || (error as? URLError)?.code == .cancelled
            if wasCancelled {
                self.transcriptStatus = .idle
            } else {
                self.transcriptStatus = .failed(error.localizedDescription)
            }
        }
    }
}
|
||||
|
||||
private func fail(_ message: String) {
|
||||
recorder = nil
|
||||
currentFolder = nil
|
||||
autoStarted = false
|
||||
pendingAutoStop = false
|
||||
stopTimer()
|
||||
micLevel = 0
|
||||
systemLevel = 0
|
||||
@@ -139,6 +300,9 @@ final class SessionController: ObservableObject {
|
||||
/// its WAV headers are finalized before the process exits. Handles quit while
|
||||
/// `.starting` and `.finishing`, not just `.recording`.
|
||||
func prepareForTermination() async {
|
||||
// Cancel any in-flight backend transcription (audio is already saved; the
|
||||
// user can resend). The pipeline's checkCancellation + defer clean up chunks.
|
||||
processTask?.cancel()
|
||||
// Drain whatever lifecycle Task is in flight until nothing is busy. A Stop
|
||||
// click landing in an await window can spawn a new stop Task, so loop
|
||||
// rather than awaiting a single captured task.
|
||||
@@ -178,9 +342,9 @@ final class SessionController: ObservableObject {
|
||||
|
||||
// MARK: - Files
|
||||
|
||||
private func makeSessionFolder() throws -> URL {
|
||||
private func makeSessionFolder(label: String) throws -> URL {
|
||||
let base = settings.outputFolderURL.appendingPathComponent("sessions", isDirectory: true)
|
||||
let folder = base.appendingPathComponent("\(Self.timestamp())_manual", isDirectory: true)
|
||||
let folder = base.appendingPathComponent("\(Self.timestamp())_\(label)", isDirectory: true)
|
||||
try FileManager.default.createDirectory(at: folder, withIntermediateDirectories: true)
|
||||
return folder
|
||||
}
|
||||
|
||||
@@ -0,0 +1,85 @@
|
||||
import Foundation
|
||||
import AVFoundation
|
||||
|
||||
/// Splits a long session into backend-sized chunks and produces, per chunk, the
|
||||
/// sliced audio and the timeline rebased to chunk-local seconds.
|
||||
///
|
||||
/// The diarizer caps at 4 speakers/chunk and has request limits, so calls > ~3
|
||||
/// min are chunked into ~2–3 min windows; names + voiceprints unify speakers
|
||||
/// across chunks (handled in the pipeline).
|
||||
enum SessionPackager {
    /// One planned window of the recording.
    struct PlannedChunk: Equatable {
        let index: Int
        let start: Double // global seconds
        let end: Double   // global seconds
    }

    /// Packaging failures that the underlying frameworks would otherwise report
    /// by crashing or by silently producing incomplete output.
    enum PackagingError: LocalizedError {
        /// The timeline holds a value JSON cannot represent (e.g. NaN/∞).
        case invalidTimeline
        /// The read buffer for slicing could not be allocated.
        case bufferAllocationFailed

        var errorDescription: String? {
            switch self {
            case .invalidTimeline:
                return "The timeline contains values that can't be encoded as JSON."
            case .bufferAllocationFailed:
                return "Couldn't allocate an audio buffer while slicing the recording."
            }
        }
    }

    /// Plans the chunk windows for a recording.
    ///
    /// Recordings up to `thresholdSec` are a single chunk. Longer ones are cut
    /// into consecutive `chunkSeconds` windows; the last window is whatever
    /// remains and may be shorter.
    ///
    /// - Parameters:
    ///   - durationSec: Total recording length in seconds.
    ///   - chunkSeconds: Window length in seconds.
    ///   - thresholdSec: Recordings at or below this length are not split.
    /// - Returns: At least one chunk, in order, covering `0...durationSec`.
    static func planChunks(durationSec: Double,
                           chunkSeconds: Double = 150,
                           thresholdSec: Double = 180) -> [PlannedChunk] {
        // A non-positive/NaN window never advances and an infinite duration
        // never ends; either would loop forever, so fall back to one chunk.
        let canSplit = chunkSeconds > 0 && durationSec.isFinite
        guard durationSec > thresholdSec, canSplit else {
            return [PlannedChunk(index: 0, start: 0, end: durationSec)]
        }
        var chunks: [PlannedChunk] = []
        var start = 0.0
        var index = 0
        // The 1 ms slack avoids emitting a sliver chunk from rounding error.
        while start < durationSec - 0.001 {
            let end = min(start + chunkSeconds, durationSec)
            chunks.append(PlannedChunk(index: index, start: start, end: end))
            start = end
            index += 1
        }
        return chunks
    }

    /// Clips `segments` to `[start, end)`, rebases them to chunk-local seconds,
    /// and encodes the flat `label-merge` array `[{start,end,name,confidence}]`.
    ///
    /// Segments that do not overlap the window are dropped.
    ///
    /// - Throws: `PackagingError.invalidTimeline` if a value cannot be encoded
    ///   as JSON, or any error from `JSONSerialization`.
    static func rebasedTimelineData(_ segments: [VisualTimeline.Segment],
                                    start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = segments.compactMap { seg in
            let clippedStart = max(seg.start, start)
            let clippedEnd = min(seg.end, end)
            guard clippedEnd > clippedStart else { return nil }
            return [
                "start": clippedStart - start,
                "end": clippedEnd - start,
                "name": seg.name,
                "confidence": seg.confidence,
            ]
        }
        // `data(withJSONObject:)` raises an Objective-C exception (not a Swift
        // error) on invalid input such as NaN, so validate first and throw.
        guard JSONSerialization.isValidJSONObject(flat) else {
            throw PackagingError.invalidTimeline
        }
        return try JSONSerialization.data(withJSONObject: flat, options: [])
    }

    /// Writes `[startSec, endSec)` of `source` to `dest` as 16-bit mono PCM at
    /// the source's sample rate.
    ///
    /// When the requested range is empty (or lies past the end of the file)
    /// nothing is written and `dest` is not created.
    /// NOTE(review): the output is declared mono while buffers use the input's
    /// processing format — assumes `source` is mono; confirm against the recorder.
    ///
    /// - Throws: Errors from opening, reading or writing the audio files, or
    ///   `PackagingError.bufferAllocationFailed`.
    static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
        let input = try AVAudioFile(forReading: source)
        let sr = input.fileFormat.sampleRate
        // Clamp to the file: a negative start is not a valid frame position.
        let startFrame = max(0, AVAudioFramePosition((startSec * sr).rounded()))
        let endFrame = min(input.length, AVAudioFramePosition((endSec * sr).rounded()))
        guard endFrame > startFrame else { return }

        // Allocate the reusable read buffer before creating `dest`, so a failure
        // here cannot leave an empty or truncated chunk behind.
        let block: AVAudioFrameCount = 16_000
        guard let buffer = AVAudioPCMBuffer(pcmFormat: input.processingFormat,
                                            frameCapacity: block) else {
            throw PackagingError.bufferAllocationFailed
        }

        let settings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: sr,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsBigEndianKey: false,
        ]
        let output = try AVAudioFile(forWriting: dest, settings: settings,
                                     commonFormat: .pcmFormatFloat32, interleaved: false)
        input.framePosition = startFrame

        // Track the remainder as a 64-bit frame count so very long ranges can't
        // overflow the 32-bit per-read count.
        var remaining = endFrame - startFrame
        while remaining > 0 {
            let n = AVAudioFrameCount(min(AVAudioFramePosition(block), remaining))
            try input.read(into: buffer, frameCount: n)
            if buffer.frameLength == 0 { break } // source ended early
            try output.write(from: buffer)
            remaining -= AVAudioFramePosition(buffer.frameLength)
        }
    }

    /// Duration (seconds) of the audio file at `url`, or 0 when the file cannot
    /// be opened or reports a non-positive sample rate.
    static func duration(of url: URL) -> Double {
        guard let file = try? AVAudioFile(forReading: url), file.fileFormat.sampleRate > 0 else { return 0 }
        return Double(file.length) / file.fileFormat.sampleRate
    }
}
|
||||
@@ -0,0 +1,45 @@
|
||||
import Foundation
|
||||
|
||||
/// `speakers.json` — the final stored output (docs §6): per-chunk `label-merge`
|
||||
/// results concatenated, timestamps offset back to global seconds, names unified.
|
||||
/// This is the hand-off to the downstream summarizer; the app stops here.
|
||||
struct SpeakersFile: Codable {
    let sessionId: String
    let app: String
    /// Total session length in global seconds.
    let durationSec: Double
    let speakers: [Speaker]
    let segments: [Segment]
    let models: [String: String]

    /// One unified speaker across all chunks.
    struct Speaker: Codable, Equatable {
        let name: String
        /// How the name was obtained, as reported by the backend.
        let source: String
        let overlapConfidence: Double?
        let matchSimilarity: Double?

        enum CodingKeys: String, CodingKey {
            case name, source
            case overlapConfidence = "overlap_confidence"
            case matchSimilarity = "match_similarity"
        }
    }

    /// One attributed span, in global seconds.
    struct Segment: Codable, Equatable {
        let start: Double
        let end: Double
        let speaker: String
        let text: String?
    }

    enum CodingKeys: String, CodingKey {
        case sessionId = "session_id"
        case app
        case durationSec = "duration_sec"
        case speakers, segments, models
    }

    /// Encodes the file as pretty-printed JSON with sorted keys and writes it
    /// to `url`, replacing any existing file.
    ///
    /// The write is atomic: the data goes to a temporary file that is then
    /// moved into place, so a reader of `speakers.json` never observes a
    /// partially written file and a failed write leaves the old one intact.
    ///
    /// - Throws: Encoding errors or file-system errors from the write.
    func write(to url: URL) throws {
        let encoder = JSONEncoder()
        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
        let data = try encoder.encode(self)
        try data.write(to: url, options: .atomic)
    }
}
|
||||
@@ -0,0 +1,78 @@
|
||||
import Foundation
|
||||
|
||||
/// Concatenates per-chunk `label-merge` results into one global `speakers.json`:
|
||||
/// segment times offset back to global seconds, speakers unified across chunks by
|
||||
/// name, and fingerprints collected for the voiceprint store.
|
||||
enum TranscriptAssembler {
    /// One chunk's backend response plus where that chunk begins in the session.
    struct ChunkResult {
        let chunkStart: Double // global seconds
        let response: LabelMergeResponse
    }

    /// The assembled output file and the fingerprints gathered along the way.
    struct Assembled {
        let speakersFile: SpeakersFile
        let fingerprints: [String: [Float]] // name -> 192-dim, for VoiceprintStore
    }

    /// Precedence of a speaker `source` when one name shows up in several
    /// chunks with different sources; higher wins.
    private static func rank(_ source: String) -> Int {
        switch source {
        case "visual":
            return 3
        case "voiceprint":
            return 2
        default:
            return 1 // unmatched
        }
    }

    /// Whether `name` is one of the backend's placeholder names.
    private static func isUnknown(_ name: String) -> Bool {
        LabelMergeResponse.isUnknownName(name)
    }

    /// Merges per-chunk results into a single `SpeakersFile`.
    ///
    /// Segment times are shifted by each chunk's start, speakers are unified by
    /// name (keeping the entry with the strongest source, first seen on ties),
    /// and non-empty fingerprints of named speakers are collected.
    ///
    /// - Parameters:
    ///   - sessionId: Identifier stored in the output.
    ///   - app: App label stored in the output.
    ///   - chunks: Chunk results in processing order; `models` is taken from the last.
    static func assemble(sessionId: String, app: String, chunks: [ChunkResult]) -> Assembled {
        var mergedSegments: [SpeakersFile.Segment] = []
        var speakerByName: [String: SpeakersFile.Speaker] = [:]
        var namedPrints: [String: [Float]] = [:]
        var totalDuration = 0.0

        for result in chunks {
            let base = result.chunkStart
            let response = result.response

            // Use the chunk's reported audio length so a silent or all-unknown
            // call still gets a real duration rather than the last segment's end.
            totalDuration = max(totalDuration, base + response.duration)

            for seg in response.segments {
                let shifted = SpeakersFile.Segment(
                    start: seg.startSeconds + base,
                    end: seg.endSeconds + base,
                    speaker: seg.speaker,
                    text: seg.text)
                mergedSegments.append(shifted)
                totalDuration = max(totalDuration, shifted.end)
            }

            for sp in response.speakers {
                let outranksExisting = speakerByName[sp.name]
                    .map { rank(sp.source) > rank($0.source) } ?? true
                if outranksExisting {
                    speakerByName[sp.name] = SpeakersFile.Speaker(
                        name: sp.name, source: sp.source,
                        overlapConfidence: sp.overlapConfidence,
                        matchSimilarity: sp.matchSimilarity)
                }
                // Named speakers only — never Unknown_N / Speaker_unknown.
                if !isUnknown(sp.name), let fp = sp.fingerprint, !fp.isEmpty {
                    namedPrints[sp.name] = fp
                }
            }

            for (name, fp) in response.fingerprints {
                if isUnknown(name) || fp.isEmpty { continue }
                namedPrints[name] = fp
            }
        }

        let orderedSegments = mergedSegments.sorted { $0.start < $1.start }
        let orderedSpeakers = speakerByName.values.sorted { $0.name < $1.name }
        let models = chunks.last?.response.models ?? [:]

        let file = SpeakersFile(
            sessionId: sessionId, app: app, durationSec: totalDuration,
            speakers: orderedSpeakers, segments: orderedSegments, models: models)
        return Assembled(speakersFile: file, fingerprints: namedPrints)
    }
}
|
||||
@@ -0,0 +1,75 @@
|
||||
import Foundation
|
||||
|
||||
/// Drives a finished session through the backend: chunk → sequential
|
||||
/// `label-merge` (accumulating voiceprints) → assemble `speakers.json` → persist
|
||||
/// fingerprints. Requests are sequential by construction (one chunk at a time).
|
||||
final class TranscriptPipeline {
    private let client: SparkControlClient
    private let voiceprints: VoiceprintStore

    /// - Parameters:
    ///   - baseURL: Backend base URL handed to the client.
    ///   - skipTLS: Passed through to the client's TLS-skip option.
    ///   - voiceprints: Store that supplies known voiceprints and receives updates.
    init(baseURL: String, skipTLS: Bool, voiceprints: VoiceprintStore) {
        self.client = SparkControlClient(baseURL: baseURL, skipTLS: skipTLS)
        self.voiceprints = voiceprints
    }

    /// Process `mixedURL` against `timeline` (visual + self spans). Writes
    /// `speakers.json` into `sessionFolder` and returns it.
    ///
    /// Chunks are sliced into a temporary `chunks` directory inside
    /// `sessionFolder`, which is removed when this method exits, whether it
    /// returns or throws.
    ///
    /// - Parameter progress: Called with `(done, total)` before each chunk and
    ///   once more on completion; `(0, 0)` for an empty session.
    /// - Throws: `CancellationError` when the task is cancelled between chunks,
    ///   plus any file, slicing or backend error.
    func process(sessionFolder: URL,
                 sessionId: String,
                 app: String,
                 mixedURL: URL,
                 timeline: [VisualTimeline.Segment],
                 progress: ((Int, Int) async -> Void)? = nil) async throws -> SpeakersFile {
        let duration = SessionPackager.duration(of: mixedURL)
        let plan = SessionPackager.planChunks(durationSec: duration)
        let outputURL = sessionFolder.appendingPathComponent("speakers.json")

        // Zero-duration / empty session → a valid empty speakers.json, no backend call.
        if plan.isEmpty || duration <= 0 {
            let empty = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: [])
            try empty.speakersFile.write(to: outputURL)
            await progress?(0, 0)
            return empty.speakersFile
        }

        let chunksDir = sessionFolder.appendingPathComponent("chunks", isDirectory: true)
        // Surface a failure here directly: otherwise it would only show up later
        // as a less specific audio-write error from the first slice.
        try FileManager.default.createDirectory(at: chunksDir, withIntermediateDirectories: true)
        defer { try? FileManager.default.removeItem(at: chunksDir) } // cleanup on success OR throw

        // Start from stored voiceprints; accumulate this call's prints across chunks
        // for within-call unification (the store only persists high-confidence ones).
        var known = voiceprints.knownVoiceprints()
        var results: [TranscriptAssembler.ChunkResult] = []

        for chunk in plan {
            try Task.checkCancellation()
            await progress?(chunk.index, plan.count)

            let chunkName = "chunk_\(String(format: "%03d", chunk.index)).wav"
            let chunkURL = chunksDir.appendingPathComponent(chunkName)
            try SessionPackager.sliceAudio(
                from: mixedURL, startSec: chunk.start, endSec: chunk.end, to: chunkURL)
            // An empty slice writes no file; there is nothing to send for it.
            guard FileManager.default.fileExists(atPath: chunkURL.path) else { continue }

            let timelineData = try SessionPackager.rebasedTimelineData(
                timeline, start: chunk.start, end: chunk.end)
            let response = try await client.labelMerge(
                audioURL: chunkURL, timeline: timelineData,
                knownVoiceprints: known.isEmpty ? nil : known, transcribe: true)

            // Only named, non-empty fingerprints may replace a known print — the
            // same filter the assembler applies — so an empty vector can never
            // overwrite a usable voiceprint for the following chunks.
            for (name, fp) in response.fingerprints
            where !LabelMergeResponse.isUnknownName(name) && !fp.isEmpty {
                known[name] = fp
            }
            voiceprints.update(with: response)
            results.append(.init(chunkStart: chunk.start, response: response))
            try? FileManager.default.removeItem(at: chunkURL) // best-effort; the defer removes the dir
        }
        await progress?(plan.count, plan.count)

        let assembled = TranscriptAssembler.assemble(sessionId: sessionId, app: app, chunks: results)
        try assembled.speakersFile.write(to: outputURL)
        return assembled.speakersFile
    }

    /// Build the `label-merge` timeline from mic-VAD self spans (Phase 1/2). Once
    /// the visual adapters land (Phase 3–4), their segments are merged in too.
    ///
    /// Every span becomes one segment named `selfName` with source `"mic_vad"`.
    static func timeline(fromSelfSpans spans: [VADSpan], selfName: String) -> [VisualTimeline.Segment] {
        spans.map { .init(start: $0.start, end: $0.end, name: selfName, confidence: $0.confidence, source: "mic_vad") }
    }
}
|
||||
Reference in New Issue
Block a user