Files
ten31-transcripts/Ten31Transcripts/Session/TranscriptAssembler.swift
T
Grant Gilliam 53d7fcdac0 Client: dual-channel label-merge (mic_file + system_file)
The backend shipped dual-channel mode; wire the client to it. We already capture
mic (you) and system (others) separately, so send them as two files instead of the
mono mix — fixing the misattribution at the source.

- SparkControlClient: labelMergeDual(mic_file, system_file, self_name, self_vad);
  multipart generalized to N files; shared POST/retry/decode extracted.
- SessionPackager.rebasedSelfVadData: chunk-local [{start,end}] for self_vad;
  sliceAudio reused for both tracks.
- TranscriptPipeline.process: dual-channel chunking (slice mic+system, rebase
  timeline + self_vad per chunk) when system audio is healthy; mono mixed-file
  fallback (self folded into the timeline) otherwise.
- VisualCapture.finish: write the full visual_timeline.json (remote + self merged)
  but return REMOTE (vision) segments only — self travels via the mic channel.
- TranscriptAssembler: rank mic_channel highest (the user's own track wins).
- VoiceprintStore: store the clean mic_channel self voiceprint.
- SessionController: pass mic/system URLs + remote timeline + channel self-spans +
  self_name + systemHealthy; self_vad.json now reflects the channel-verified spans.

Validated END-TO-END against the live backend on the real misattributing session:
'Go Bitcoin' (remote) is now attributed to Unknown_0, NOT the user; the user's own
lines come back source=mic_channel; per-channel ASR recovered fuller remote text.
36/36 XCTest (4 new: self_vad rebase, mic_channel ranking + voiceprint storage).
2026-06-06 13:15:29 -05:00

81 lines
3.3 KiB
Swift

import Foundation
/// Stitches the per-chunk `label-merge` responses of one session into a single
/// global `speakers.json` payload.
///
/// Segment times are shifted from chunk-local to global seconds, speakers are
/// unified across chunks by name, and named fingerprints are gathered for the
/// voiceprint store.
enum TranscriptAssembler {
    /// One chunk's backend response together with the chunk's position on the
    /// global timeline.
    struct ChunkResult {
        let chunkStart: Double  // global seconds
        let response: LabelMergeResponse
    }

    /// Result of `assemble(sessionId:app:chunks:)`.
    struct Assembled {
        let speakersFile: SpeakersFile
        let fingerprints: [String: [Float]]  // name -> 192-dim, for VoiceprintStore
    }

    /// Authority of each known attribution source; a larger value wins when the
    /// same name is reported by several chunks. `mic_channel` (the local user's
    /// own microphone) is the most authoritative.
    private static let sourceRanks: [String: Int] = [
        "mic_channel": 4,
        "visual": 3,
        "voiceprint": 2,
    ]

    /// Returns the authority of `source`; any source not in the table ranks lowest.
    private static func rank(_ source: String) -> Int {
        sourceRanks[source] ?? 1  // unmatched
    }

    /// Thin wrapper over `LabelMergeResponse.isUnknownName(_:)`.
    private static func isUnknown(_ name: String) -> Bool {
        LabelMergeResponse.isUnknownName(name)
    }

    /// Merges `chunks` into one `SpeakersFile` plus the fingerprints to persist.
    ///
    /// - Parameters:
    ///   - sessionId: Passed through unchanged to the resulting `SpeakersFile`.
    ///   - app: Passed through unchanged to the resulting `SpeakersFile`.
    ///   - chunks: Per-chunk responses; each `chunkStart` is added to that chunk's
    ///     segment times.
    /// - Returns: Segments ordered by global start time, one speaker entry per
    ///   name (sorted by name), the models map of the last chunk (empty when
    ///   there are no chunks), and the collected named fingerprints.
    static func assemble(sessionId: String, app: String, chunks: [ChunkResult]) -> Assembled {
        var timeline: [SpeakersFile.Segment] = []
        var strongest: [String: SpeakersFile.Speaker] = [:]
        var voiceprints: [String: [Float]] = [:]
        var totalSeconds = 0.0

        for chunk in chunks {
            let reply = chunk.response
            let shift = chunk.chunkStart

            // Take the audio length from the chunk window so that silent or
            // all-unknown calls still report a real duration, not merely the end
            // of the last segment.
            totalSeconds = max(totalSeconds, shift + reply.duration)

            for piece in reply.segments {
                let globalStart = piece.startSeconds + shift
                let globalEnd = piece.endSeconds + shift
                timeline.append(
                    SpeakersFile.Segment(
                        start: globalStart, end: globalEnd,
                        speaker: piece.speaker, text: piece.text))
                totalSeconds = max(totalSeconds, globalEnd)
            }

            for reported in reply.speakers {
                let candidate = SpeakersFile.Speaker(
                    name: reported.name,
                    source: reported.source,
                    overlapConfidence: reported.overlapConfidence,
                    matchSimilarity: reported.matchSimilarity)

                switch strongest[reported.name] {
                case let incumbent? where rank(reported.source) <= rank(incumbent.source):
                    // The entry already recorded is at least as authoritative; keep it.
                    break
                default:
                    // First sighting of this name, or a strictly higher-ranked source.
                    strongest[reported.name] = candidate
                }

                // Only named speakers contribute fingerprints (never Unknown_N /
                // Speaker_unknown); a later chunk overwrites an earlier one.
                guard !isUnknown(reported.name),
                      let vector = reported.fingerprint,
                      !vector.isEmpty
                else { continue }
                voiceprints[reported.name] = vector
            }

            // The response-level fingerprints are applied after the per-speaker
            // ones, so they take precedence for the same name within a chunk.
            for (name, vector) in reply.fingerprints {
                if isUnknown(name) || vector.isEmpty { continue }
                voiceprints[name] = vector
            }
        }

        let orderedSegments = timeline.sorted { $0.start < $1.start }
        let roster = strongest.values.sorted { $0.name < $1.name }
        let models: [String: String] = chunks.last?.response.models ?? [:]

        return Assembled(
            speakersFile: SpeakersFile(
                sessionId: sessionId, app: app, durationSec: totalSeconds,
                speakers: roster, segments: orderedSegments, models: models),
            fingerprints: voiceprints)
    }
}