Files
ten31-transcripts/Ten31Transcripts/Backend/SparkControlClient.swift
T
Grant Gilliam 3629dbdaaa Default TLS validation on; scope skip-TLS bypass to the configured host
The app shipped with certificate validation bypassed globally and on by
default — InsecureTrustDelegate trusted any cert from any host. That was
the evaluation's P1: anyone on the LAN could MITM call audio, transcripts,
and voiceprints.

The backend's Start9 cert already validates under normal system trust when
the StartOS Root CA is installed in the keychain (confirmed: URLSession
default validation returns 200 against the backend and its fallback), so the
bypass is unnecessary:
- skip-TLS now defaults to off
- when explicitly enabled, the bypass is scoped to the configured host via
  InsecureTrustDelegate.allowsTrustOverride, never "trust any server"
- the host gate is pure and unit-tested (InsecureTrustDelegateTests)

Docs reconciled: AGENTS.md backend/TLS line and Current state.
2026-06-13 16:02:57 -05:00

222 lines
10 KiB
Swift

import Foundation
/// Payload returned by `POST /api/audio/label-merge` (shape verified against
/// the live backend). Segments arrive in one of two shapes depending on the
/// request: `transcribe=true` yields `start_ms`/`end_ms` plus `text`, while
/// `transcribe=false` yields `start_s`/`end_s` plus `confidence`.
struct LabelMergeResponse: Decodable {
    let duration: Double
    let speakers: [Speaker]
    let segments: [Segment]
    let fingerprints: [String: [Float]]
    let models: [String: String]?

    /// Reports whether `name` is one of the backend's "unmatched" labels,
    /// i.e. a label that was never persisted as a named voiceprint.
    static func isUnknownName(_ name: String) -> Bool {
        if name == "Speaker_unknown" { return true }
        return name.hasPrefix("Unknown_")
    }

    /// One diarized speaker cluster and how it was attributed.
    struct Speaker: Decodable {
        let cluster: String
        let name: String
        let source: String // visual | voiceprint | unmatched
        let overlapConfidence: Double?
        let matchSimilarity: Double?
        let fingerprint: [Float]?

        enum CodingKeys: String, CodingKey {
            case cluster
            case name
            case source
            case overlapConfidence = "overlap_confidence"
            case matchSimilarity = "match_similarity"
            case fingerprint
        }
    }

    /// One attributed span of audio. Exactly which optional fields are present
    /// depends on the segment shape the backend chose (see the type comment).
    struct Segment: Decodable {
        let startMs: Int?
        let endMs: Int?
        let startS: Double?
        let endS: Double?
        let speaker: String
        let text: String?
        let confidence: Double?

        enum CodingKeys: String, CodingKey {
            case startMs = "start_ms"
            case endMs = "end_ms"
            case startS = "start_s"
            case endS = "end_s"
            case speaker
            case text
            case confidence
        }

        /// Segment start in seconds, whichever shape the backend used.
        /// The seconds field wins; otherwise milliseconds are converted;
        /// with neither present the result is 0.
        var startSeconds: Double {
            if let seconds = startS { return seconds }
            if let millis = startMs { return Double(millis) / 1000 }
            return 0
        }

        /// Segment end in seconds, resolved the same way as `startSeconds`.
        var endSeconds: Double {
            if let seconds = endS { return seconds }
            if let millis = endMs { return Double(millis) / 1000 }
            return 0
        }
    }
}
/// Failures surfaced by the SparkControl client. Each case supplies a
/// human-readable message through `LocalizedError.errorDescription`.
enum SparkControlError: Error, LocalizedError {
    case invalidHost
    case tooLarge                 // 413
    case server(Int, String)      // other non-2xx with {"detail":...}
    case decode(String)
    case retriesExhausted

    /// User-facing description of the failure.
    var errorDescription: String? {
        let message: String
        switch self {
        case .invalidHost:
            message = "Invalid backend host URL."
        case .tooLarge:
            message = "Audio chunk exceeds the backend's 200 MB limit."
        case .retriesExhausted:
            message = "Backend stayed busy (503) after retries."
        case let .decode(reason):
            message = "Couldn't decode backend response: \(reason)"
        case let .server(status, body):
            message = "Backend error \(status): \(body)"
        }
        return message
    }
}
/// Talks to SparkControl's `label-merge` endpoint.
///
/// **Callers must invoke sequentially** (one audio request in flight):
/// concurrent audio requests trip a GPU race (503). The Phase-5 pipeline
/// drives one chunk at a time, satisfying this.
final class SparkControlClient {
    /// Fallback wait, in seconds, when a 503 carries no usable `Retry-After`.
    private static let defaultRetryDelay: Double = 5
    /// Ceiling, in seconds, for a server-supplied `Retry-After`. Matches the
    /// per-request timeout so one back-off never outlasts a single request.
    private static let maxRetryDelay: Double = 600

    private let baseURL: String
    private let urlSession: URLSession

    /// Creates a client for the backend at `baseURL`.
    ///
    /// - Parameters:
    ///   - baseURL: Backend origin, e.g. `https://host:port`. Surrounding
    ///     whitespace and a single trailing `/` are removed.
    ///   - skipTLS: When `true`, installs an `InsecureTrustDelegate` that is
    ///     given the configured host; when `false`, the session has no delegate
    ///     and uses default certificate validation.
    init(baseURL: String, skipTLS: Bool) {
        let trimmed = baseURL.trimmingCharacters(in: .whitespacesAndNewlines)
        self.baseURL = trimmed.hasSuffix("/") ? String(trimmed.dropLast()) : trimmed
        let config = URLSessionConfiguration.ephemeral
        config.timeoutIntervalForRequest = 600 // diarization can take up to ~600s
        config.timeoutIntervalForResource = 900
        config.waitsForConnectivity = false
        let delegate: URLSessionDelegate? = skipTLS
            ? InsecureTrustDelegate(allowedHost: URL(string: self.baseURL)?.host)
            : nil
        self.urlSession = URLSession(configuration: config, delegate: delegate, delegateQueue: nil)
    }

    // A URLSession keeps its delegate alive until it is invalidated.
    deinit { urlSession.finishTasksAndInvalidate() }

    /// Mono `label-merge`: one mixed-mono file + timeline. Retries on `503`.
    ///
    /// - Parameters:
    ///   - audioURL: Local audio file, read fully into memory and sent as `file`.
    ///   - timeline: UTF-8 encoded payload sent as the `timeline` form field.
    ///   - knownVoiceprints: Optional name → embedding map sent as JSON.
    ///   - transcribe: Sent as the `transcribe` form field (`true`/`false`).
    ///   - minOverlap: Optional `min_overlap` form field.
    ///   - voiceprintThreshold: Optional `voiceprint_threshold` form field.
    ///   - maxRetries: How many `503` responses are retried before giving up.
    /// - Throws: File-read errors, transport errors, or `SparkControlError`.
    func labelMerge(audioURL: URL,
                    timeline: Data,
                    knownVoiceprints: [String: [Float]]?,
                    transcribe: Bool,
                    minOverlap: Double? = nil,
                    voiceprintThreshold: Double? = nil,
                    maxRetries: Int = 3) async throws -> LabelMergeResponse {
        let fields = Self.commonFields(timeline: timeline, knownVoiceprints: knownVoiceprints,
                                       transcribe: transcribe, minOverlap: minOverlap,
                                       voiceprintThreshold: voiceprintThreshold)
        let files = [(field: "file", filename: audioURL.lastPathComponent, data: try Data(contentsOf: audioURL))]
        return try await perform(fields: fields, files: files, maxRetries: maxRetries)
    }

    /// Dual-channel `label-merge`: separate mic (local user) + system (remote)
    /// tracks. The mic channel is attributed as `self_name`; `timeline` names only
    /// the remote/system speakers; `selfVad` (optional) are the chunk-local windows
    /// where the mic is genuinely the user (active and louder than system).
    ///
    /// - Throws: File-read errors, transport errors, or `SparkControlError`.
    func labelMergeDual(micURL: URL,
                        systemURL: URL,
                        selfName: String,
                        selfVad: Data?,
                        timeline: Data,
                        knownVoiceprints: [String: [Float]]?,
                        transcribe: Bool,
                        minOverlap: Double? = nil,
                        voiceprintThreshold: Double? = nil,
                        maxRetries: Int = 3) async throws -> LabelMergeResponse {
        var fields = Self.commonFields(timeline: timeline, knownVoiceprints: knownVoiceprints,
                                       transcribe: transcribe, minOverlap: minOverlap,
                                       voiceprintThreshold: voiceprintThreshold)
        fields["self_name"] = selfName
        if let selfVad, let str = String(data: selfVad, encoding: .utf8) { fields["self_vad"] = str }
        let files = [
            (field: "mic_file", filename: micURL.lastPathComponent, data: try Data(contentsOf: micURL)),
            (field: "system_file", filename: systemURL.lastPathComponent, data: try Data(contentsOf: systemURL)),
        ]
        return try await perform(fields: fields, files: files, maxRetries: maxRetries)
    }

    // MARK: - Transport

    /// Builds the text form fields shared by the mono and dual requests.
    /// Optional inputs that are absent (or cannot be encoded) are omitted.
    private static func commonFields(timeline: Data, knownVoiceprints: [String: [Float]]?,
                                     transcribe: Bool, minOverlap: Double?,
                                     voiceprintThreshold: Double?) -> [String: String] {
        var fields: [String: String] = ["transcribe": transcribe ? "true" : "false"]
        if let timelineString = String(data: timeline, encoding: .utf8) { fields["timeline"] = timelineString }
        if let known = knownVoiceprints {
            // JSON cannot represent NaN/±infinity, and JSONSerialization raises an
            // Objective-C exception (not a Swift error, so `try?` cannot catch it)
            // when asked to. Drop any vector containing such a component instead
            // of crashing the process.
            let encodable = known
                .filter { $0.value.allSatisfy { $0.isFinite } }
                .mapValues { $0.map { Double($0) } }
            if !encodable.isEmpty,
               let data = try? JSONSerialization.data(withJSONObject: encodable),
               let str = String(data: data, encoding: .utf8) {
                fields["known_voiceprints"] = str
            }
        }
        if let minOverlap { fields["min_overlap"] = String(minOverlap) }
        if let voiceprintThreshold { fields["voiceprint_threshold"] = String(voiceprintThreshold) }
        return fields
    }

    /// Converts a `Retry-After` header value into a safe sleep duration in seconds.
    ///
    /// A missing, non-numeric, or non-finite value yields `defaultRetryDelay`.
    /// Numeric values are clamped to `1...maxRetryDelay`, so the later
    /// nanosecond conversion to `UInt64` can never trap on a hostile or buggy
    /// header such as `inf` or `1e30`.
    private static func retryDelay(fromHeader header: String?) -> Double {
        let parsed = header.flatMap { Double($0.trimmingCharacters(in: .whitespaces)) }
        guard let seconds = parsed, seconds.isFinite else { return defaultRetryDelay }
        return min(max(1, seconds), maxRetryDelay)
    }

    /// Shared POST + retry-on-503 + decode. Body is built once (constant across retries).
    ///
    /// - Throws: `SparkControlError.invalidHost` when the base URL does not
    ///   yield an absolute URL with a host, `.tooLarge` on 413,
    ///   `.retriesExhausted` after more than `maxRetries` 503 responses,
    ///   `.server` on any other non-2xx, `.decode` when the body cannot be
    ///   decoded; transport and cancellation errors propagate unchanged.
    private func perform(fields: [String: String],
                         files: [(field: String, filename: String, data: Data)],
                         maxRetries: Int) async throws -> LabelMergeResponse {
        // An empty or scheme-less base still parses as a *relative* URL; reject
        // it here so the caller gets `.invalidHost` rather than a URLError.
        guard let url = URL(string: baseURL + "/api/audio/label-merge"), url.host != nil else {
            throw SparkControlError.invalidHost
        }
        let (body, contentType) = Self.multipart(fields: fields, files: files)
        var attempt = 0
        while true {
            var request = URLRequest(url: url)
            request.httpMethod = "POST"
            request.setValue(contentType, forHTTPHeaderField: "Content-Type")
            request.httpBody = body
            let (data, response) = try await urlSession.data(for: request)
            guard let http = response as? HTTPURLResponse else {
                throw SparkControlError.decode("no HTTP response")
            }
            switch http.statusCode {
            case 200..<300:
                do {
                    return try JSONDecoder().decode(LabelMergeResponse.self, from: data)
                } catch {
                    throw SparkControlError.decode(error.localizedDescription)
                }
            case 503:
                attempt += 1
                if attempt > maxRetries { throw SparkControlError.retriesExhausted }
                let delay = Self.retryDelay(fromHeader: http.value(forHTTPHeaderField: "Retry-After"))
                try await Task.sleep(nanoseconds: UInt64(delay * 1_000_000_000))
            case 413:
                throw SparkControlError.tooLarge
            default:
                throw SparkControlError.server(http.statusCode, Self.detail(from: data))
            }
        }
    }

    // MARK: - Helpers

    /// Extracts the `detail` string from a JSON error body, falling back to the
    /// raw body text, then to `"unknown error"`.
    private static func detail(from data: Data) -> String {
        if let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
           let detail = obj["detail"] as? String { return detail }
        return String(data: data, encoding: .utf8) ?? "unknown error"
    }

    /// Encodes `fields` and `files` as a `multipart/form-data` body.
    ///
    /// - Returns: The body bytes and the matching `Content-Type` header value
    ///   (including the generated boundary).
    private static func multipart(fields: [String: String],
                                  files: [(field: String, filename: String, data: Data)]) -> (Data, String) {
        let boundary = "Boundary-\(UUID().uuidString)"
        var body = Data()
        // Size hint only: file payloads dominate the body.
        body.reserveCapacity(files.reduce(4096) { $0 + $1.data.count })

        func append(_ s: String) { body.append(Data(s.utf8)) }

        // `"`, CR and LF would terminate the quoted parameter or the header line,
        // so percent-escape them. Works on scalars because Swift treats "\r\n"
        // as a single Character.
        func headerSafe(_ raw: String) -> String {
            var out = ""
            for scalar in raw.unicodeScalars {
                switch scalar {
                case "\"": out += "%22"
                case "\r": out += "%0D"
                case "\n": out += "%0A"
                default: out.unicodeScalars.append(scalar)
                }
            }
            return out
        }

        // Sorted so the same inputs always produce the same part order.
        for (name, value) in fields.sorted(by: { $0.key < $1.key }) {
            append("--\(boundary)\r\n")
            append("Content-Disposition: form-data; name=\"\(headerSafe(name))\"\r\n\r\n")
            append("\(value)\r\n")
        }
        for file in files {
            append("--\(boundary)\r\n")
            append("Content-Disposition: form-data; name=\"\(headerSafe(file.field))\"; filename=\"\(headerSafe(file.filename))\"\r\n")
            append("Content-Type: audio/wav\r\n\r\n")
            body.append(file.data)
            append("\r\n")
        }
        append("--\(boundary)--\r\n")
        return (body, "multipart/form-data; boundary=\(boundary)")
    }
}