Files
ten31-transcripts/Ten31Transcripts/Session/SessionPackager.swift
T
Grant Gilliam 53d7fcdac0 Client: dual-channel label-merge (mic_file + system_file)
The backend shipped dual-channel mode; wire the client to it. We already capture
mic (you) and system (others) separately, so send them as two files instead of the
mono mix — fixing the misattribution at the source.

- SparkControlClient: labelMergeDual(mic_file, system_file, self_name, self_vad);
  multipart generalized to N files; shared POST/retry/decode extracted.
- SessionPackager.rebasedSelfVadData: chunk-local [{start,end}] for self_vad;
  sliceAudio reused for both tracks.
- TranscriptPipeline.process: dual-channel chunking (slice mic+system, rebase
  timeline + self_vad per chunk) when system audio is healthy; mono mixed-file
  fallback (self folded into the timeline) otherwise.
- VisualCapture.finish: write the full visual_timeline.json (remote + self merged)
  but return REMOTE (vision) segments only — self travels via the mic channel.
- TranscriptAssembler: rank mic_channel highest (the user's own track wins).
- VoiceprintStore: store the clean mic_channel self voiceprint.
- SessionController: pass mic/system URLs + remote timeline + channel self-spans +
  self_name + systemHealthy; self_vad.json now reflects the channel-verified spans.

Validated END-TO-END against the live backend on the real misattributing session:
'Go Bitcoin' (remote) is now attributed to Unknown_0, NOT the user; the user's own
lines come back source=mic_channel; per-channel ASR recovered fuller remote text.
36/36 XCTest (4 new: self_vad rebase, mic_channel ranking + voiceprint storage).
2026-06-06 13:15:29 -05:00

98 lines
4.3 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import Foundation
import AVFoundation
/// Splits a long session into backend-sized chunks and produces, per chunk, the
/// sliced audio and the timeline rebased to chunk-local seconds.
///
/// The diarizer caps at 4 speakers/chunk and has request limits, so calls > ~3
/// min are chunked into ~2.5 min windows; names + voiceprints unify speakers
/// across chunks (handled in the pipeline).
enum SessionPackager {
    /// One planned window of the session.
    struct PlannedChunk: Equatable {
        /// Zero-based position of the chunk in the plan.
        let index: Int
        /// Window start, in global (whole-session) seconds.
        let start: Double // global seconds
        /// Window end, in global (whole-session) seconds.
        let end: Double
    }

    /// Plans the chunk windows for a session lasting `durationSec` seconds.
    ///
    /// - Parameters:
    ///   - durationSec: Total session length in seconds.
    ///   - chunkSeconds: Length of each window; the last window is truncated to
    ///     `durationSec` and may therefore be shorter.
    ///   - thresholdSec: Sessions no longer than this are returned as one chunk.
    /// - Returns: A single chunk `[0, durationSec]` when the session is at or
    ///   below the threshold (or `chunkSeconds` is not positive); otherwise
    ///   back-to-back windows covering the session, indexed from 0.
    static func planChunks(durationSec: Double,
                           chunkSeconds: Double = 150,
                           thresholdSec: Double = 180) -> [PlannedChunk] {
        // A non-positive (or NaN) window would never advance `start` in the loop
        // below, spinning forever while the array grows; fall back to one chunk.
        guard durationSec > thresholdSec, chunkSeconds > 0 else {
            return [PlannedChunk(index: 0, start: 0, end: durationSec)]
        }
        var chunks: [PlannedChunk] = []
        var start = 0.0
        // The 1 ms tolerance stops the loop from emitting a sub-millisecond
        // sliver at the tail.
        while start < durationSec - 0.001 {
            let end = min(start + chunkSeconds, durationSec)
            chunks.append(PlannedChunk(index: chunks.count, start: start, end: end))
            start = end
        }
        return chunks
    }

    /// Clip segments to `[start, end)` and rebase to chunk-local seconds, then
    /// emit the flat `label-merge` array `[{start,end,name,confidence}]`.
    ///
    /// Segments that do not overlap the window are dropped.
    ///
    /// - Throws: `EncodingError.invalidValue` if the array cannot be represented
    ///   as JSON, or any error thrown by `JSONSerialization`.
    static func rebasedTimelineData(_ segments: [VisualTimeline.Segment],
                                    start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = segments.compactMap { seg in
            let s = max(seg.start, start)
            let e = min(seg.end, end)
            guard e > s else { return nil }
            return ["start": s - start, "end": e - start, "name": seg.name, "confidence": seg.confidence]
        }
        return try jsonData(flat)
    }

    /// Clip self-VAD spans to `[start, end)` and rebase to chunk-local seconds, as
    /// the `self_vad` array `[{start,end}]` for dual-channel `label-merge`.
    ///
    /// Spans that do not overlap the window are dropped.
    ///
    /// - Throws: `EncodingError.invalidValue` if the array cannot be represented
    ///   as JSON, or any error thrown by `JSONSerialization`.
    static func rebasedSelfVadData(_ spans: [VADSpan], start: Double, end: Double) throws -> Data {
        let flat: [[String: Any]] = spans.compactMap { span in
            let s = max(span.start, start)
            let e = min(span.end, end)
            guard e > s else { return nil }
            return ["start": s - start, "end": e - start]
        }
        return try jsonData(flat)
    }

    /// Serializes a flat array of dictionaries to JSON.
    ///
    /// `JSONSerialization.data(withJSONObject:)` raises an Objective-C exception
    /// (not a Swift error) for input such as NaN or infinite numbers, which `try`
    /// cannot catch; validate first so callers get a regular thrown error.
    private static func jsonData(_ array: [[String: Any]]) throws -> Data {
        guard JSONSerialization.isValidJSONObject(array) else {
            throw EncodingError.invalidValue(
                array,
                EncodingError.Context(codingPath: [],
                                      debugDescription: "Array is not a valid JSON object (non-finite number or unsupported value)."))
        }
        return try JSONSerialization.data(withJSONObject: array, options: [])
    }

    /// Slice `[startSec, endSec)` of a 16 kHz mono WAV into `dest`.
    ///
    /// The requested range is clamped to the file's bounds. If the clamped range
    /// is empty (or a bound is NaN) the function returns without creating `dest`.
    /// The output is written as 16-bit little-endian integer linear PCM, one
    /// channel, at the source file's sample rate.
    ///
    /// NOTE(review): the output is declared mono while buffers are read in the
    /// input's processing format; a multi-channel source presumably fails at
    /// write time — confirm callers only pass mono files.
    ///
    /// - Throws: Any error from opening, reading, or writing the audio files.
    static func sliceAudio(from source: URL, startSec: Double, endSec: Double, to dest: URL) throws {
        let input = try AVAudioFile(forReading: source)
        let sr = input.fileFormat.sampleRate

        // Clamp in the Double domain before converting to integer frame positions:
        // a negative start would set an invalid `framePosition`, and non-finite or
        // huge values would trap in the integer conversion.
        let totalFrames = Double(input.length)
        let startPos = min(max((startSec * sr).rounded(), 0), totalFrames)
        let endPos = min(max((endSec * sr).rounded(), 0), totalFrames)
        // Comparison is false when either bound is NaN, so NaN also exits here.
        guard endPos > startPos else { return }
        let startFrame = AVAudioFramePosition(startPos)
        let endFrame = AVAudioFramePosition(endPos)

        let settings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: sr,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
            AVLinearPCMIsBigEndianKey: false,
        ]
        let output = try AVAudioFile(forWriting: dest, settings: settings,
                                     commonFormat: .pcmFormatFloat32, interleaved: false)
        input.framePosition = startFrame

        // One reusable buffer; each read overwrites it and sets `frameLength`.
        let block: AVAudioFrameCount = 16_000
        guard let buffer = AVAudioPCMBuffer(pcmFormat: input.processingFormat, frameCapacity: block) else { return }

        // Track the remaining count as a 64-bit frame position so very long ranges
        // cannot overflow the 32-bit `AVAudioFrameCount`.
        var remaining = endFrame - startFrame
        while remaining > 0 {
            let n = AVAudioFrameCount(min(AVAudioFramePosition(block), remaining))
            try input.read(into: buffer, frameCount: n)
            // A zero-length read means the file ended early; stop rather than spin.
            if buffer.frameLength == 0 { break }
            try output.write(from: buffer)
            remaining -= AVAudioFramePosition(buffer.frameLength)
        }
    }

    /// Duration (seconds) of a WAV.
    ///
    /// - Returns: Frame count divided by the file's sample rate, or `0` when the
    ///   file cannot be opened or reports a non-positive sample rate.
    static func duration(of url: URL) -> Double {
        guard let file = try? AVAudioFile(forReading: url), file.fileFormat.sampleRate > 0 else { return 0 }
        return Double(file.length) / file.fileFormat.sampleRate
    }
}