Files
ten31-transcripts/Ten31Transcripts/Visual/VisualCapture.swift
T
Grant Gilliam 53d7fcdac0 Client: dual-channel label-merge (mic_file + system_file)
The backend shipped dual-channel mode; wire the client to it. We already capture
mic (you) and system (others) separately, so send them as two files instead of the
mono mix — fixing the misattribution at the source.

- SparkControlClient: labelMergeDual(mic_file, system_file, self_name, self_vad);
  multipart generalized to N files; shared POST/retry/decode extracted.
- SessionPackager.rebasedSelfVadData: chunk-local [{start,end}] for self_vad;
  sliceAudio reused for both tracks.
- TranscriptPipeline.process: dual-channel chunking (slice mic+system, rebase
  timeline + self_vad per chunk) when system audio is healthy; mono mixed-file
  fallback (self folded into the timeline) otherwise.
- VisualCapture.finish: write the full visual_timeline.json (remote + self merged)
  but return REMOTE (vision) segments only — self travels via the mic channel.
- TranscriptAssembler: rank mic_channel highest (the user's own track wins).
- VoiceprintStore: store the clean mic_channel self voiceprint.
- SessionController: pass mic/system URLs + remote timeline + channel self-spans +
  self_name + systemHealthy; self_vad.json now reflects the channel-verified spans.

Validated END-TO-END against the live backend on the real misattributing session:
'Go Bitcoin' (remote) is now attributed to Unknown_0, NOT the user; the user's own
lines come back source=mic_channel; per-channel ASR recovered fuller remote text.
36/36 XCTest (4 new: self_vad rebase, mic_channel ranking + voiceprint storage).
2026-06-06 13:15:29 -05:00

90 lines
4.4 KiB
Swift

import Foundation
import CoreGraphics
/// Owns the visual side of one recording session: picks the app's adapter, runs a
/// `VisualObserver` over the call window, and on stop writes `visual_timeline.json`
/// and returns the speaker segments for the backend hand-off.
///
/// Strictly best-effort: if there's no adapter for the app, or the window can't be
/// captured, the session simply records audio-only — visuals never block or break
/// the proven audio path. `init?` returns nil when the app has no visual adapter.
@available(macOS 13.0, *)
final class VisualCapture {
    /// The detected call app this capture was created for.
    let app: CallDetector.DetectedApp
    private let adapter: any AppAdapter
    private let observer: VisualObserver

    /// Looks up the adapter for `app` and builds the observer around it.
    ///
    /// - Parameters:
    ///   - app: The detected call app; used to look up its adapter in `AdapterRegistry`.
    ///   - bundleID: Forwarded to the `VisualObserver`.
    ///   - windowID: Forwarded to the `VisualObserver`; may be nil.
    ///   - t0Host: Forwarded to the `VisualObserver`.
    /// - Returns: nil when `AdapterRegistry` has no adapter for `app`.
    init?(app: CallDetector.DetectedApp, bundleID: String, windowID: CGWindowID?, t0Host: Double) {
        guard let adapter = AdapterRegistry.adapter(for: app) else { return nil }
        self.app = app
        self.adapter = adapter
        // The observer samples at the rate the adapter asks for.
        self.observer = VisualObserver(bundleID: bundleID, windowID: windowID, adapter: adapter,
                                       t0Host: t0Host, fps: adapter.preferredFPS)
    }

    /// Start window capture. Throws if the window isn't capturable (no window yet,
    /// Screen Recording denied) — the caller catches and falls back to audio-only.
    func start() async throws {
        try await observer.start()
    }

    /// Stop and discard capture without writing anything (used when the session
    /// ends before capture was fully adopted).
    func cancel() async {
        _ = await observer.stop()
    }

    /// Shared clamp rule for segments and gaps: cap `end` at `duration` and report
    /// nil when the clamped span is empty (its end is not after its start).
    private static func clampedEnd(start: Double, end: Double, to duration: Double) -> Double? {
        let clamped = min(end, duration)
        return clamped > start ? clamped : nil
    }

    /// Clamp segment ends to the audio duration; drop any that become empty. Keeps
    /// `visual_timeline.json` internally consistent and never sends the backend a
    /// segment longer than the audio. (`duration <= 0` → passthrough.)
    ///
    /// - Parameters:
    ///   - segs: Segments to clamp; input order is preserved.
    ///   - duration: Upper bound for every segment end.
    /// - Returns: The surviving segments, with only `end` possibly changed.
    static func clampSegments(_ segs: [VisualTimeline.Segment], to duration: Double) -> [VisualTimeline.Segment] {
        guard duration > 0 else { return segs }
        return segs.compactMap { s in
            guard let end = clampedEnd(start: s.start, end: s.end, to: duration) else { return nil }
            return .init(start: s.start, end: end, name: s.name, confidence: s.confidence, source: s.source)
        }
    }

    /// Clamp gap ends to the audio duration; drop any that become empty. Same rule
    /// as `clampSegments`, applied to gaps. (`duration <= 0` → passthrough.)
    ///
    /// - Parameters:
    ///   - gaps: Gaps to clamp; input order is preserved.
    ///   - duration: Upper bound for every gap end.
    /// - Returns: The surviving gaps, with only `end` possibly changed.
    static func clampGaps(_ gaps: [VisualTimeline.Gap], to duration: Double) -> [VisualTimeline.Gap] {
        guard duration > 0 else { return gaps }
        return gaps.compactMap { g in
            guard let end = clampedEnd(start: g.start, end: g.end, to: duration) else { return nil }
            return .init(start: g.start, end: end, reason: g.reason)
        }
    }

    /// Stop capture and write `visual_timeline.json` (the full human-readable picture:
    /// remote visual segments + the mic-VAD self spans, merged). Returns ONLY the
    /// remote (vision) segments in dual-channel mode — the backend names the system
    /// track from these, while self is handled by the mic channel + `self_vad`.
    ///
    /// Writing the artifact is best-effort: a failed write is logged and otherwise
    /// ignored, so the returned segments are unaffected.
    ///
    /// - Parameters:
    ///   - selfSpans: Self spans; each becomes a segment named `selfName` with
    ///     source `"mic_vad"` in the written timeline.
    ///   - selfName: Name given to the self segments; the participant with this name
    ///     is flagged `isSelf`.
    ///   - sessionId: Recorded in the timeline.
    ///   - t0Unix: Recorded in the timeline.
    ///   - durationSec: Audio duration; segment and gap ends are clamped to it.
    ///   - folder: Directory that receives `visual_timeline.json`.
    /// - Returns: The clamped vision segments, in the order the observer produced them.
    func finish(selfSpans: [VADSpan], selfName: String,
                sessionId: String, t0Unix: Double, durationSec: Double,
                folder: URL) async -> [VisualTimeline.Segment] {
        let (rawSegments, rawGaps) = await observer.stop()
        // The observer stops slightly after audio fixes `durationSec`, so a trailing
        // gap/segment can run past it. Clamp ends so the JSON is internally consistent
        // (and we never hand the backend a segment longer than the audio).
        let vision = Self.clampSegments(rawSegments, to: durationSec) // remote speakers
        let gaps = Self.clampGaps(rawGaps, to: durationSec)
        let selfSegs = Self.clampSegments(selfSpans.map {
            VisualTimeline.Segment(start: $0.start, end: $0.end, name: selfName,
                                   confidence: $0.confidence, source: "mic_vad")
        }, to: durationSec)

        // The on-disk artifact merges remote + self, ordered by start time.
        let artifact = (vision + selfSegs).sorted { $0.start < $1.start }
        let names = Set(artifact.map { $0.name })
        let participants = names.sorted().map {
            VisualTimeline.Participant(name: $0, isSelf: $0 == selfName ? true : nil, aliases: nil)
        }
        let timeline = VisualTimeline(
            sessionId: sessionId, app: app.label, adapterVersion: adapter.adapterVersion,
            t0Unix: t0Unix, durationSec: durationSec, fpsSampled: adapter.preferredFPS,
            selfName: selfName, participants: participants, segments: artifact, visualGaps: gaps)

        let destination = folder.appendingPathComponent("visual_timeline.json")
        do {
            try timeline.write(to: destination)
        } catch {
            // Still best-effort (never throw into the audio path), but leave a trace so
            // a missing artifact can be diagnosed instead of vanishing silently.
            NSLog("VisualCapture: failed to write %@: %@", destination.path, String(describing: error))
        }
        return vision
    }
}