ten31-transcripts/Ten31Transcripts/Backend/GatewayLLMClient.swift

import Foundation

/// Failures reported by `GatewayLLMClient`.
enum GatewayLLMError: Error {
    /// The configured base URL could not be turned into a request URL.
    case invalidHost
    /// vLLM not loaded on the gateway.
    case notReady
    /// Non-success HTTP status, with the status code and the server's detail text.
    case server(Int, String)
    /// The response was not HTTP or its body could not be decoded; carries the reason.
    case decode(String)
    /// The gateway kept answering 503 until the retry budget ran out.
    case retriesExhausted
}

// MARK: - LocalizedError

extension GatewayLLMError: LocalizedError {
    /// Human-readable message for each failure.
    var errorDescription: String? {
        let message: String
        switch self {
        case .invalidHost:
            message = "Invalid backend host URL."
        case .notReady:
            message = "The gateway's language model isn't ready."
        case let .server(code, detail):
            message = "LLM error \(code): \(detail)"
        case let .decode(msg):
            message = "Couldn't decode the LLM response: \(msg)"
        case .retriesExhausted:
            message = "Gateway stayed busy (503) after retries."
        }
        return message
    }
}

/// Talks to the Spark Control gateway's OpenAI-compatible `/v1/chat/completions`
/// (the same host + TLS as `label-merge`). Used for the recap analysis (topic
/// sections, summary polish, meeting extras). **Call sequentially** — like audio,
/// the gateway serializes GPU work; the recap pipeline issues one request at a time.
final class GatewayLLMClient {
    /// Pause (seconds) used when a 503 carries no usable `Retry-After` value.
    private static let defaultRetryDelay: Double = 5
    /// Shortest pause (seconds) between retries, whatever the server asks for.
    private static let minRetryDelay: Double = 1
    /// Longest pause (seconds) between retries. Also keeps the nanosecond
    /// conversion far away from `UInt64` overflow.
    private static let maxRetryDelay: Double = 300

    private let baseURL: String
    private let urlSession: URLSession

    /// Creates a client for the gateway at `baseURL`.
    /// - Parameters:
    ///   - baseURL: Gateway origin; surrounding whitespace and one trailing `/` are removed.
    ///   - skipTLS: When true, the session is given an `InsecureTrustDelegate`.
    init(baseURL: String, skipTLS: Bool) {
        let trimmed = baseURL.trimmingCharacters(in: .whitespacesAndNewlines)
        self.baseURL = trimmed.hasSuffix("/") ? String(trimmed.dropLast()) : trimmed
        let config = URLSessionConfiguration.ephemeral
        config.timeoutIntervalForRequest = 600
        config.timeoutIntervalForResource = 900
        config.waitsForConnectivity = false
        let delegate: URLSessionDelegate? = skipTLS ? InsecureTrustDelegate() : nil
        self.urlSession = URLSession(configuration: config, delegate: delegate, delegateQueue: nil)
    }

    // A URLSession keeps its delegate alive until it is invalidated.
    deinit { urlSession.finishTasksAndInvalidate() }

    /// The ready chat model id from `/api/endpoints` (`vllm.model`), or nil if the
    /// gateway has no language model loaded. Network and parse failures also
    /// yield nil.
    func chatModelId() async -> String? {
        guard let url = URL(string: baseURL + "/api/endpoints") else { return nil }
        guard let (data, _) = try? await urlSession.data(from: url),
              let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
              let vllm = obj["vllm"] as? [String: Any],
              (vllm["ready"] as? Bool) == true,
              let model = vllm["model"] as? String, !model.isEmpty else { return nil }
        return model
    }

    /// One JSON-mode chat completion. Returns the raw `content` string (the caller
    /// parses it as JSON). Retries on `503 + Retry-After`.
    /// - Parameters:
    ///   - model: Model id to send in the request.
    ///   - system: Optional system message, sent before the user message.
    ///   - user: User message.
    ///   - maxTokens: Value for the request's `max_tokens`.
    ///   - maxRetries: How many 503 responses are retried before giving up.
    /// - Throws: `GatewayLLMError.invalidHost` if the URL cannot be built,
    ///   `.retriesExhausted` after too many 503s, `.server` for any other
    ///   non-2xx status, `.decode` for an unusable response; transport and
    ///   cancellation errors are passed through.
    func completeJSON(model: String, system: String?, user: String,
                      maxTokens: Int = 4096, maxRetries: Int = 3) async throws -> String {
        guard let url = URL(string: baseURL + "/v1/chat/completions") else {
            throw GatewayLLMError.invalidHost
        }
        var messages: [[String: String]] = []
        if let system { messages.append(["role": "system", "content": system]) }
        messages.append(["role": "user", "content": user])
        let body: [String: Any] = [
            "model": model,
            "messages": messages,
            "max_tokens": maxTokens,
            "stream": false,
            "response_format": ["type": "json_object"],
            "chat_template_kwargs": ["enable_thinking": false],
        ]
        let bodyData = try JSONSerialization.data(withJSONObject: body)

        var attempt = 0
        while true {
            var request = URLRequest(url: url)
            request.httpMethod = "POST"
            request.setValue("application/json", forHTTPHeaderField: "Content-Type")
            request.httpBody = bodyData

            let (data, response) = try await urlSession.data(for: request)
            guard let http = response as? HTTPURLResponse else {
                throw GatewayLLMError.decode("no HTTP response")
            }
            switch http.statusCode {
            case 200..<300:
                return try Self.content(from: data)
            case 503:
                attempt += 1
                if attempt > maxRetries { throw GatewayLLMError.retriesExhausted }
                // The delay is clamped, so the nanosecond conversion cannot trap
                // on a non-finite or absurdly large header value.
                let delay = Self.retryDelay(fromHeader: http.value(forHTTPHeaderField: "Retry-After"))
                try await Task.sleep(nanoseconds: UInt64(delay * 1_000_000_000))
            default:
                throw GatewayLLMError.server(http.statusCode, Self.detail(from: data))
            }
        }
    }

    /// Turns a `Retry-After` header value into a pause in seconds.
    ///
    /// Only the delay-seconds form is understood. A missing, non-numeric
    /// (e.g. HTTP-date) or non-finite value falls back to the default delay.
    /// The result is always within `minRetryDelay...maxRetryDelay`.
    private static func retryDelay(fromHeader value: String?) -> Double {
        guard let raw = value?.trimmingCharacters(in: .whitespaces),
              let seconds = Double(raw),
              seconds.isFinite else {
            return defaultRetryDelay
        }
        return min(max(seconds, minRetryDelay), maxRetryDelay)
    }

    // MARK: - Parsing

    /// Extracts `choices[0].message.content` from a chat-completions response body.
    /// - Throws: `GatewayLLMError.decode` when the body is not the expected JSON,
    ///   has no choices, or the first choice carries no content.
    private static func content(from data: Data) throws -> String {
        struct ChatResponse: Decodable {
            struct Choice: Decodable {
                // Optional so a null or missing content reports a clear message.
                struct Message: Decodable { let content: String? }
                let message: Message
            }
            let choices: [Choice]
        }
        let decoded: ChatResponse
        do {
            decoded = try JSONDecoder().decode(ChatResponse.self, from: data)
        } catch {
            // Only decoder failures are wrapped here; the checks below throw
            // `.decode` directly so their message is not wrapped a second time.
            throw GatewayLLMError.decode(error.localizedDescription)
        }
        guard let first = decoded.choices.first else {
            throw GatewayLLMError.decode("no choices in response")
        }
        guard let text = first.message.content else {
            throw GatewayLLMError.decode("first choice has no content")
        }
        return text
    }

    /// Best-effort error text from a failed response body: a top-level `detail`
    /// string, else `error.message`, else the body as UTF-8, else "unknown error".
    private static func detail(from data: Data) -> String {
        if let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any] {
            if let detail = obj["detail"] as? String { return detail }
            if let err = obj["error"] as? [String: Any], let msg = err["message"] as? String { return msg }
        }
        return String(data: data, encoding: .utf8) ?? "unknown error"
    }

    /// Strip a ```json … ``` code fence if the model wrapped its JSON (defensive;
    /// JSON mode usually prevents this).
    ///
    /// When the opening fence is followed by a newline, that whole first line is
    /// dropped. When everything is on one line, the backticks and any language
    /// tag letters after them are dropped. The last ``` and anything after it is
    /// then removed. Input that does not start with a fence is only trimmed.
    static func stripCodeFence(_ s: String) -> String {
        var t = s.trimmingCharacters(in: .whitespacesAndNewlines)
        guard t.hasPrefix("```") else { return t }
        if let firstNewline = t.firstIndex(of: "\n") {
            t = String(t[t.index(after: firstNewline)...])
        } else {
            // Single-line fence: there is no first line to drop.
            t = String(t.dropFirst(3).drop(while: { $0.isLetter }))
        }
        if let fenceRange = t.range(of: "```", options: .backwards) {
            t = String(t[..<fenceRange.lowerBound])
        }
        return t.trimmingCharacters(in: .whitespacesAndNewlines)
    }
}