import Foundation

/// Errors surfaced by `GatewayLLMClient`.
enum GatewayLLMError: Error, LocalizedError {
    /// The configured base URL could not be turned into a request URL.
    case invalidHost
    case notReady // vLLM not loaded on the gateway
    /// Non-2xx, non-503 response: HTTP status code plus the server's detail text.
    case server(Int, String)
    /// The response was not HTTP, or its body could not be decoded.
    case decode(String)
    /// The gateway kept answering 503 until the retry budget ran out.
    case retriesExhausted

    var errorDescription: String? {
        switch self {
        case .invalidHost:
            return "Invalid backend host URL."
        case .notReady:
            return "The gateway's language model isn't ready."
        case .server(let code, let detail):
            return "LLM error \(code): \(detail)"
        case .decode(let msg):
            return "Couldn't decode the LLM response: \(msg)"
        case .retriesExhausted:
            return "Gateway stayed busy (503) after retries."
        }
    }
}

/// Talks to the Spark Control gateway's OpenAI-compatible `/v1/chat/completions`
/// (the same host + TLS as `label-merge`). Used for the recap analysis (topic
/// sections, summary polish, meeting extras). **Call sequentially** — like audio,
/// the gateway serializes GPU work; the recap pipeline issues one request at a time.
final class GatewayLLMClient {
    /// Upper bound, in seconds, on a single 503 back-off. `Retry-After` is
    /// server-controlled, so it is clamped before being converted to nanoseconds.
    private static let maxRetryDelaySeconds: Double = 600

    private let baseURL: String
    private let urlSession: URLSession

    /// Creates a client for the gateway at `baseURL`.
    ///
    /// - Parameters:
    ///   - baseURL: Gateway origin. Surrounding whitespace and one trailing `/`
    ///     are removed so endpoint paths can be appended directly.
    ///   - skipTLS: When true, the session is given an `InsecureTrustDelegate`
    ///     scoped to the base URL's host. That type is declared elsewhere;
    ///     presumably it relaxes certificate validation for that host — TODO confirm.
    init(baseURL: String, skipTLS: Bool) {
        let trimmed = baseURL.trimmingCharacters(in: .whitespacesAndNewlines)
        self.baseURL = trimmed.hasSuffix("/") ? String(trimmed.dropLast()) : trimmed

        let config = URLSessionConfiguration.ephemeral
        config.timeoutIntervalForRequest = 600
        config.timeoutIntervalForResource = 900
        config.waitsForConnectivity = false

        let delegate: URLSessionDelegate? = skipTLS
            ? InsecureTrustDelegate(allowedHost: URL(string: self.baseURL)?.host)
            : nil
        self.urlSession = URLSession(configuration: config, delegate: delegate, delegateQueue: nil)
    }

    deinit {
        // The session holds its delegate strongly until invalidated.
        urlSession.finishTasksAndInvalidate()
    }

    /// The ready chat model id from `/api/endpoints` (`vllm.model`), or nil if the
    /// gateway has no language model loaded.
    ///
    /// Any failure (bad URL, transport error, unexpected JSON shape, `ready` not
    /// true, empty model id) also yields nil; the HTTP status code is not inspected.
    func chatModelId() async -> String? {
        guard let url = URL(string: baseURL + "/api/endpoints") else { return nil }
        guard let (data, _) = try? await urlSession.data(from: url),
              let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
              let vllm = obj["vllm"] as? [String: Any],
              (vllm["ready"] as? Bool) == true,
              let model = vllm["model"] as? String,
              !model.isEmpty
        else { return nil }
        return model
    }

    /// One JSON-mode chat completion. Returns the raw `content` string (the caller
    /// parses it as JSON).
    ///
    /// A 503 response is retried up to `maxRetries` times. The wait between
    /// attempts is the numeric `Retry-After` header (5 seconds when the header is
    /// missing or not a number), clamped to 1...600 seconds.
    ///
    /// - Parameters:
    ///   - model: Model id sent as `model`.
    ///   - system: Optional system message; omitted from the request when nil.
    ///   - user: User message content.
    ///   - maxTokens: Value sent as `max_tokens`.
    ///   - maxRetries: Number of retries allowed after a 503.
    /// - Returns: `choices[0].message.content` of the response.
    /// - Throws: `GatewayLLMError.invalidHost`, `.retriesExhausted`, `.server`,
    ///   `.decode`; also rethrows transport errors from `URLSession` and
    ///   cancellation from `Task.sleep`.
    func completeJSON(model: String, system: String?, user: String,
                      maxTokens: Int = 4096, maxRetries: Int = 3) async throws -> String {
        guard let url = URL(string: baseURL + "/v1/chat/completions") else {
            throw GatewayLLMError.invalidHost
        }

        var messages: [[String: String]] = []
        if let system { messages.append(["role": "system", "content": system]) }
        messages.append(["role": "user", "content": user])

        let body: [String: Any] = [
            "model": model,
            "messages": messages,
            "max_tokens": maxTokens,
            "stream": false,
            "response_format": ["type": "json_object"],
            "chat_template_kwargs": ["enable_thinking": false],
        ]
        let bodyData = try JSONSerialization.data(withJSONObject: body)

        var attempt = 0
        while true {
            var request = URLRequest(url: url)
            request.httpMethod = "POST"
            request.setValue("application/json", forHTTPHeaderField: "Content-Type")
            request.httpBody = bodyData

            let (data, response) = try await urlSession.data(for: request)
            guard let http = response as? HTTPURLResponse else {
                throw GatewayLLMError.decode("no HTTP response")
            }

            switch http.statusCode {
            case 200..<300:
                return try Self.content(from: data)
            case 503:
                attempt += 1
                if attempt > maxRetries { throw GatewayLLMError.retriesExhausted }
                // A non-numeric Retry-After (e.g. an HTTP-date) falls back to 5 seconds.
                let retryAfter = http.value(forHTTPHeaderField: "Retry-After").flatMap(Double.init) ?? 5
                try await Task.sleep(nanoseconds: Self.retryDelayNanoseconds(forRetryAfter: retryAfter))
            default:
                throw GatewayLLMError.server(http.statusCode, Self.detail(from: data))
            }
        }
    }

    /// Converts a `Retry-After` value in seconds to a nanosecond delay.
    ///
    /// The result is clamped to 1...`maxRetryDelaySeconds`. NaN is treated as the
    /// 1-second minimum. Without the upper bound, `inf` or a very large value would
    /// trap in the `UInt64` conversion.
    private static func retryDelayNanoseconds(forRetryAfter seconds: Double) -> UInt64 {
        let atLeastOne = seconds.isNaN ? 1 : max(1, seconds)
        let clamped = min(atLeastOne, maxRetryDelaySeconds)
        return UInt64(clamped * 1_000_000_000)
    }

    // MARK: - Parsing

    /// Extracts `choices[0].message.content` from a chat-completions response body.
    ///
    /// - Throws: `GatewayLLMError.decode` when the body does not match the expected
    ///   shape or contains no choices.
    private static func content(from data: Data) throws -> String {
        struct ChatResponse: Decodable {
            struct Choice: Decodable {
                struct Message: Decodable { let content: String }
                let message: Message
            }
            let choices: [Choice]
        }

        let decoded: ChatResponse
        do {
            decoded = try JSONDecoder().decode(ChatResponse.self, from: data)
        } catch {
            throw GatewayLLMError.decode(error.localizedDescription)
        }

        // Kept outside the do/catch so this error is not caught and wrapped a
        // second time, which would duplicate the message prefix.
        guard let text = decoded.choices.first?.message.content else {
            throw GatewayLLMError.decode("no choices in response")
        }
        return text
    }

    /// Best-effort error text from a failure body: a top-level string `detail`,
    /// then `error.message`, then the body as UTF-8, else "unknown error".
    private static func detail(from data: Data) -> String {
        if let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any] {
            if let detail = obj["detail"] as? String { return detail }
            if let err = obj["error"] as? [String: Any], let msg = err["message"] as? String { return msg }
        }
        return String(data: data, encoding: .utf8) ?? "unknown error"
    }

    /// Strip a ```json … ``` code fence if the model wrapped its JSON (defensive;
    /// JSON mode usually prevents this).
    static func stripCodeFence(_ s: String) -> String {
        var t = s.trimmingCharacters(in: .whitespacesAndNewlines)
        if t.hasPrefix("```") {
            if let firstNewline = t.firstIndex(of: "\n") {
                t = String(t[t.index(after: firstNewline)...])
            }
            if let fenceRange = t.range(of: "```", options: .backwards) {
                t = String(t[..