import Foundation

/// Decoded `POST /api/audio/label-merge` response (verified against the live
/// backend). Handles both `transcribe=true` (start_ms/end_ms + text) and
/// `transcribe=false` (start_s/end_s + confidence) segment shapes.
struct LabelMergeResponse: Decodable {
    let duration: Double
    let speakers: [Speaker]
    let segments: [Segment]
    let fingerprints: [String: [Float]]
    let models: [String: String]?

    /// The backend's "unmatched" labels — never persisted as a named voiceprint.
    ///
    /// - Parameter name: A speaker name as returned by the backend.
    /// - Returns: `true` when `name` starts with `Unknown_` or equals `Speaker_unknown`.
    static func isUnknownName(_ name: String) -> Bool {
        name.hasPrefix("Unknown_") || name == "Speaker_unknown"
    }

    /// One speaker entry of the response.
    struct Speaker: Decodable {
        let cluster: String
        let name: String
        let source: String  // visual | voiceprint | unmatched
        let overlapConfidence: Double?
        let matchSimilarity: Double?
        let fingerprint: [Float]?

        enum CodingKeys: String, CodingKey {
            case cluster, name, source, fingerprint
            case overlapConfidence = "overlap_confidence"
            case matchSimilarity = "match_similarity"
        }
    }

    /// One segment of the response. Exactly which timing fields are present
    /// depends on the shape the backend used, so all of them are optional.
    struct Segment: Decodable {
        let startMs: Int?
        let endMs: Int?
        let startS: Double?
        let endS: Double?
        let speaker: String
        let text: String?
        let confidence: Double?

        enum CodingKeys: String, CodingKey {
            case startMs = "start_ms"
            case endMs = "end_ms"
            case startS = "start_s"
            case endS = "end_s"
            case speaker, text, confidence
        }

        /// Start time in seconds regardless of which shape the backend used.
        /// Prefers `start_s`, then `start_ms / 1000`, then `0` when neither is present.
        var startSeconds: Double {
            startS ?? startMs.map { Double($0) / 1000 } ?? 0
        }

        /// End time in seconds regardless of which shape the backend used.
        /// Prefers `end_s`, then `end_ms / 1000`, then `0` when neither is present.
        var endSeconds: Double {
            endS ?? endMs.map { Double($0) / 1000 } ?? 0
        }
    }
}

/// Failures surfaced by `SparkControlClient`.
enum SparkControlError: Error, LocalizedError {
    case invalidHost
    case tooLarge               // 413
    case server(Int, String)    // other non-2xx with {"detail":...}
    case decode(String)
    case retriesExhausted

    var errorDescription: String? {
        switch self {
        case .invalidHost:
            return "Invalid backend host URL."
        case .tooLarge:
            return "Audio chunk exceeds the backend's 200 MB limit."
        case .server(let code, let detail):
            return "Backend error \(code): \(detail)"
        case .decode(let msg):
            return "Couldn't decode backend response: \(msg)"
        case .retriesExhausted:
            return "Backend stayed busy (503) after retries."
        }
    }
}

/// Talks to SparkControl's `label-merge`. **Callers must invoke sequentially**
/// (one audio request in flight) — concurrent audio requests trip a GPU race
/// (503). The Phase-5 pipeline drives one chunk at a time, satisfying this.
final class SparkControlClient {
    /// Delay used when a 503 carries no usable numeric `Retry-After` header.
    private static let defaultRetryDelay: TimeInterval = 5

    /// Bounds applied to a server-supplied `Retry-After`. The lower bound keeps
    /// the original "at least one second" behaviour; the upper bound stops an
    /// absurd or non-finite header value from overflowing the nanosecond
    /// conversion (which would trap) or stalling the pipeline indefinitely.
    private static let retryDelayRange: ClosedRange<TimeInterval> = 1...600

    private let baseURL: String
    private let urlSession: URLSession

    /// Creates a client for the backend at `baseURL`.
    ///
    /// - Parameters:
    ///   - baseURL: Backend root; surrounding whitespace and one trailing `/` are removed.
    ///   - skipTLS: When `true`, the session is created with an `InsecureTrustDelegate`.
    init(baseURL: String, skipTLS: Bool) {
        let trimmed = baseURL.trimmingCharacters(in: .whitespacesAndNewlines)
        self.baseURL = trimmed.hasSuffix("/") ? String(trimmed.dropLast()) : trimmed

        let config = URLSessionConfiguration.ephemeral
        config.timeoutIntervalForRequest = 600  // diarization can take up to ~600s
        config.timeoutIntervalForResource = 900
        config.waitsForConnectivity = false

        let delegate: URLSessionDelegate? = skipTLS ? InsecureTrustDelegate() : nil
        self.urlSession = URLSession(configuration: config, delegate: delegate, delegateQueue: nil)
    }

    deinit {
        urlSession.finishTasksAndInvalidate()
    }

    /// One `label-merge` call. `timeline` is the flat `[{start,end,name,confidence}]`
    /// JSON (chunk-local seconds). Retries on `503 + Retry-After`.
    ///
    /// - Parameters:
    ///   - audioURL: Local audio file; read fully into memory and uploaded as the `file` part.
    ///   - timeline: Timeline JSON bytes; sent as the `timeline` field when they are valid UTF-8.
    ///   - knownVoiceprints: Optional name → fingerprint map, sent as JSON in `known_voiceprints`.
    ///     Omitted when `nil`, empty, or not representable as JSON (e.g. NaN/Infinity values).
    ///   - transcribe: Sent as the `transcribe` field (`"true"` / `"false"`).
    ///   - minOverlap: Optional `min_overlap` field.
    ///   - voiceprintThreshold: Optional `voiceprint_threshold` field.
    ///   - maxRetries: Number of retries allowed after 503 responses.
    /// - Returns: The decoded response of the first 2xx reply.
    /// - Throws: `SparkControlError.invalidHost` when the base URL does not form an
    ///   http(s) URL with a host, `.tooLarge` on 413, `.retriesExhausted` after more than
    ///   `maxRetries` 503 replies, `.server` on any other non-2xx status, `.decode` when
    ///   the reply is not an HTTP response or its body cannot be decoded; also rethrows
    ///   errors from reading `audioURL`, from the URL session, and from `Task.sleep`.
    func labelMerge(audioURL: URL, timeline: Data, knownVoiceprints: [String: [Float]]?,
                    transcribe: Bool, minOverlap: Double? = nil,
                    voiceprintThreshold: Double? = nil, maxRetries: Int = 3) async throws -> LabelMergeResponse {
        // `URL(string:)` accepts an empty or scheme-less base (yielding a relative
        // URL), so check scheme and host explicitly to fail with `.invalidHost`
        // up front instead of an opaque URLSession error later.
        guard let url = URL(string: baseURL + "/api/audio/label-merge"),
              let scheme = url.scheme?.lowercased(),
              scheme == "http" || scheme == "https",
              let host = url.host, !host.isEmpty
        else {
            throw SparkControlError.invalidHost
        }

        var fields: [String: String] = ["transcribe": transcribe ? "true" : "false"]
        // The timeline field is left out when the bytes are not valid UTF-8.
        if let timelineString = String(data: timeline, encoding: .utf8) {
            fields["timeline"] = timelineString
        }
        if let known = knownVoiceprints, !known.isEmpty {
            let payload: [String: [Double]] = known.mapValues { $0.map { Double($0) } }
            // `JSONSerialization.data(withJSONObject:)` raises an Objective-C
            // exception (not catchable by `try?`) for NaN/Infinity, so validate
            // first and keep the existing best-effort: omit the field.
            if JSONSerialization.isValidJSONObject(payload),
               let data = try? JSONSerialization.data(withJSONObject: payload),
               let str = String(data: data, encoding: .utf8) {
                fields["known_voiceprints"] = str
            }
        }
        if let minOverlap { fields["min_overlap"] = String(minOverlap) }
        if let voiceprintThreshold { fields["voiceprint_threshold"] = String(voiceprintThreshold) }

        let audio = try Data(contentsOf: audioURL)
        // Body doesn't change between retries — build it once.
        let (body, contentType) = Self.multipart(fields: fields, fileField: "file",
                                                 filename: audioURL.lastPathComponent, fileData: audio)

        var attempt = 0
        while true {
            var request = URLRequest(url: url)
            request.httpMethod = "POST"
            request.setValue(contentType, forHTTPHeaderField: "Content-Type")
            request.httpBody = body

            let (data, response) = try await urlSession.data(for: request)
            guard let http = response as? HTTPURLResponse else {
                throw SparkControlError.decode("no HTTP response")
            }

            switch http.statusCode {
            case 200..<300:
                do {
                    return try JSONDecoder().decode(LabelMergeResponse.self, from: data)
                } catch {
                    throw SparkControlError.decode(error.localizedDescription)
                }
            case 503:
                attempt += 1
                if attempt > maxRetries { throw SparkControlError.retriesExhausted }
                let delay = Self.retryDelay(fromHeader: http.value(forHTTPHeaderField: "Retry-After"))
                // `delay` is finite and within `retryDelayRange`, so this conversion cannot trap.
                try await Task.sleep(nanoseconds: UInt64(delay * 1_000_000_000))
            case 413:
                throw SparkControlError.tooLarge
            default:
                throw SparkControlError.server(http.statusCode, Self.detail(from: data))
            }
        }
    }

    // MARK: - Helpers

    /// Converts a `Retry-After` header value into a bounded delay in seconds.
    /// Missing, non-numeric (e.g. HTTP-date) or non-finite values fall back to
    /// `defaultRetryDelay`; numeric values are clamped to `retryDelayRange`.
    private static func retryDelay(fromHeader value: String?) -> TimeInterval {
        guard let raw = value?.trimmingCharacters(in: .whitespaces),
              let seconds = Double(raw),
              seconds.isFinite
        else {
            return defaultRetryDelay
        }
        return min(max(seconds, retryDelayRange.lowerBound), retryDelayRange.upperBound)
    }

    /// Extracts a human-readable message from an error body: the string under
    /// the JSON key `detail` when present, otherwise the body as UTF-8 text,
    /// otherwise `"unknown error"`.
    private static func detail(from data: Data) -> String {
        if let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
           let detail = obj["detail"] as? String {
            return detail
        }
        return String(data: data, encoding: .utf8) ?? "unknown error"
    }

    /// Escapes a field name or filename for use inside a quoted
    /// `Content-Disposition` parameter: CR, LF and `"` are percent-escaped so
    /// they cannot terminate the quoted value or inject extra header lines.
    /// Works on Unicode scalars so a CR-LF pair is handled as two code points.
    private static func escapedDispositionValue(_ value: String) -> String {
        var escaped = String.UnicodeScalarView()
        for scalar in value.unicodeScalars {
            switch scalar {
            case "\r": escaped.append(contentsOf: "%0D".unicodeScalars)
            case "\n": escaped.append(contentsOf: "%0A".unicodeScalars)
            case "\"": escaped.append(contentsOf: "%22".unicodeScalars)
            default: escaped.append(scalar)
            }
        }
        return String(escaped)
    }

    /// Builds a `multipart/form-data` body with one part per text field followed
    /// by a single file part (sent with `Content-Type: audio/wav`).
    ///
    /// - Parameters:
    ///   - fields: Text fields; emitted in key order so the body is deterministic.
    ///   - fileField: Form name of the file part.
    ///   - filename: File name advertised in the file part's `Content-Disposition`.
    ///   - fileData: Raw bytes of the file part.
    /// - Returns: The encoded body and the matching `Content-Type` header value
    ///   (including the boundary).
    private static func multipart(fields: [String: String], fileField: String,
                                  filename: String, fileData: Data) -> (Data, String) {
        let boundary = "Boundary-\(UUID().uuidString)"
        var body = Data()
        // The file dominates the size; reserve once to avoid repeated regrowth.
        body.reserveCapacity(fileData.count + 1024)

        func append(_ s: String) {
            body.append(Data(s.utf8))
        }

        for (name, value) in fields.sorted(by: { $0.key < $1.key }) {
            append("--\(boundary)\r\n")
            append("Content-Disposition: form-data; name=\"\(escapedDispositionValue(name))\"\r\n\r\n")
            append("\(value)\r\n")
        }

        append("--\(boundary)\r\n")
        append("Content-Disposition: form-data; name=\"\(escapedDispositionValue(fileField))\"; "
               + "filename=\"\(escapedDispositionValue(filename))\"\r\n")
        append("Content-Type: audio/wav\r\n\r\n")
        body.append(fileData)
        append("\r\n--\(boundary)--\r\n")

        return (body, "multipart/form-data; boundary=\(boundary)")
    }
}