spark-control/image/app/health.py

from __future__ import annotations
import httpx
from .config import Settings


# Timeout, in seconds, handed to the httpx.AsyncClient that each probe below
# creates (passed as `timeout=_TIMEOUT`).
_TIMEOUT = 3.0


def _disabled(settings: Settings, key: str) -> dict | None:
    """Return a 'disabled' verdict for `key`, or None when it is still enabled.

    An adopter who does not run a particular support service can list it in
    DISABLED_SERVICES. Its probe is then skipped entirely, so nothing else
    listening on that port gets hit and the connectivity log does not show
    the service as permanently down.

    Args:
        settings: Only `settings.disabled_services` is read (membership test).
        key: Service name to look up, e.g. "parakeet".

    Returns:
        A fresh dict with `ok` False, `disabled` True, `error` "disabled" and
        `base_url` None when `key` is disabled; otherwise None.
    """
    if key not in settings.disabled_services:
        return None
    verdict = {"ok": False, "disabled": True, "error": "disabled", "base_url": None}
    return verdict


async def probe_vllm_endpoint(host: str, port: int) -> dict:
    """Probe any OpenAI-compatible vLLM at host:port via /v1/models.

    Shared by the primary (Spark 1) health check and any extra vLLM registered
    as a custom service (kind: vllm) to monitor a second Spark.

    Args:
        host: Hostname or IP of the vLLM server. A falsy value means "not
            configured" and returns immediately without any network I/O.
        port: TCP port of the vLLM server.

    Returns:
        On success, a dict with `ok` True, `all` (the `id` of every entry in
        the response's `data` list), `current_model` (the first of those ids,
        or None when the list is empty) and `base_url`.
        On failure, a dict with `ok` False, a non-empty `error` string and
        `base_url`. Exceptions raised while probing are reported in the
        result rather than propagated.
    """
    if not host:
        return {"ok": False, "error": "vllm host not configured", "base_url": None}
    base_url = f"http://{host}:{port}/v1"
    try:
        async with httpx.AsyncClient(timeout=_TIMEOUT) as c:
            r = await c.get(f"{base_url}/models")
            r.raise_for_status()
            ids = [m["id"] for m in r.json().get("data", [])]
            return {
                "ok": True,
                "current_model": ids[0] if ids else None,
                "all": ids,
                "base_url": base_url,
            }
    except Exception as e:
        # Some exceptions (httpx timeouts in particular) can stringify to an
        # empty message; fall back to the class name so "error" is never blank.
        return {"ok": False, "error": str(e) or type(e).__name__, "base_url": base_url}


async def check_vllm(settings: Settings) -> dict:
    """Health-check the primary vLLM (Spark 1).

    Args:
        settings: Reads `spark1_host` and `vllm_port`.

    Returns:
        The result of `probe_vllm_endpoint` for the configured host and port,
        or a "spark1 not configured" failure dict (with `base_url` None) when
        `spark1_host` is falsy.
    """
    host = settings.spark1_host
    if host:
        return await probe_vllm_endpoint(host, settings.vllm_port)
    return {"ok": False, "error": "spark1 not configured", "base_url": None}


async def check_parakeet(settings: Settings) -> dict:
    """Probe the Parakeet service's `/health` endpoint.

    Args:
        settings: Reads `parakeet_host`, `parakeet_port` and, through
            `_disabled`, `disabled_services`.

    Returns:
        The `_disabled` verdict when "parakeet" is switched off. Otherwise a
        dict with `ok` True, `detail` (the parsed JSON body) and `base_url`
        when the request succeeds, or `ok` False with a non-empty `error`
        string and `base_url` on any failure. `base_url` is None when no host
        is configured.
    """
    if d := _disabled(settings, "parakeet"):
        return d
    if not settings.parakeet_host:
        return {"ok": False, "error": "parakeet host not configured", "base_url": None}
    base_url = f"http://{settings.parakeet_host}:{settings.parakeet_port}"
    try:
        async with httpx.AsyncClient(timeout=_TIMEOUT) as c:
            r = await c.get(f"{base_url}/health")
            r.raise_for_status()
            # r.json() raises on a non-JSON body; the handler below then
            # reports the probe as failed.
            return {"ok": True, "detail": r.json(), "base_url": base_url}
    except Exception as e:
        # Some exceptions (httpx timeouts in particular) can stringify to an
        # empty message; fall back to the class name so "error" is never blank.
        return {"ok": False, "error": str(e) or type(e).__name__, "base_url": base_url}


async def check_kokoro(settings: Settings) -> dict:
    """Probe the Kokoro service's `/health` endpoint.

    Args:
        settings: Reads `kokoro_host`, `kokoro_port` and, through
            `_disabled`, `disabled_services`.

    Returns:
        The `_disabled` verdict when "kokoro" is switched off. Otherwise a
        dict with `ok` True, `detail` and `base_url` when the request
        succeeds, or `ok` False with a non-empty `error` string and
        `base_url` on any failure. `detail` is the parsed JSON body when the
        response's content-type starts with "application/json", else the raw
        response text. `base_url` is None when no host is configured.
    """
    if d := _disabled(settings, "kokoro"):
        return d
    if not settings.kokoro_host:
        return {"ok": False, "error": "kokoro host not configured", "base_url": None}
    base_url = f"http://{settings.kokoro_host}:{settings.kokoro_port}"
    try:
        async with httpx.AsyncClient(timeout=_TIMEOUT) as c:
            r = await c.get(f"{base_url}/health")
            r.raise_for_status()
            is_json = r.headers.get("content-type", "").startswith("application/json")
            return {
                "ok": True,
                "detail": r.json() if is_json else r.text,
                "base_url": base_url,
            }
    except Exception as e:
        # Some exceptions (httpx timeouts in particular) can stringify to an
        # empty message; fall back to the class name so "error" is never blank.
        return {"ok": False, "error": str(e) or type(e).__name__, "base_url": base_url}


async def check_embeddings(settings: Settings) -> dict:
    """Probe the embedding service's `/health` endpoint and check readiness.

    Args:
        settings: Reads `embed_host`, `embed_port` and, through `_disabled`,
            `disabled_services`.

    Returns:
        The `_disabled` verdict when "embeddings" is switched off. When the
        request succeeds, a dict with `ok`, `detail`, `base_url` and `model`:
        `detail` is the parsed JSON body when the content-type starts with
        "application/json", else the raw text; `ok` is True only when
        `detail` is a dict whose `status` equals "ready"; `model` is
        `detail["dense_model"]` when `detail` is a dict, else None.
        On any failure, a dict with `ok` False, a non-empty `error` string
        and `base_url`. `base_url` is None when no host is configured.
    """
    if d := _disabled(settings, "embeddings"):
        return d
    if not settings.embed_host:
        return {"ok": False, "error": "embedding host not configured", "base_url": None}
    base_url = f"http://{settings.embed_host}:{settings.embed_port}"
    try:
        async with httpx.AsyncClient(timeout=_TIMEOUT) as c:
            r = await c.get(f"{base_url}/health")
            r.raise_for_status()
            is_json = r.headers.get("content-type", "").startswith("application/json")
            detail = r.json() if is_json else r.text
            # spark-embed reports {"status":"ready"|"loading", ...} — only "ready" is healthy.
            ready = isinstance(detail, dict) and detail.get("status") == "ready"
            return {
                "ok": ready,
                "detail": detail,
                "base_url": base_url,
                "model": detail.get("dense_model") if isinstance(detail, dict) else None,
            }
    except Exception as e:
        # Some exceptions (httpx timeouts in particular) can stringify to an
        # empty message; fall back to the class name so "error" is never blank.
        return {"ok": False, "error": str(e) or type(e).__name__, "base_url": base_url}


async def check_qdrant(settings: Settings) -> dict:
    """Probe Qdrant's `/readyz` endpoint.

    Args:
        settings: Reads `qdrant_host`, `qdrant_port` and, through
            `_disabled`, `disabled_services`.

    Returns:
        The `_disabled` verdict when "qdrant" is switched off. Otherwise a
        dict with `ok` True, `detail` (the stripped response text, truncated
        to 120 characters) and `base_url` when the request succeeds, or `ok`
        False with a non-empty `error` string and `base_url` on any failure.
        `base_url` is None when no host is configured.
    """
    if d := _disabled(settings, "qdrant"):
        return d
    if not settings.qdrant_host:
        return {"ok": False, "error": "qdrant host not configured", "base_url": None}
    base_url = f"http://{settings.qdrant_host}:{settings.qdrant_port}"
    try:
        async with httpx.AsyncClient(timeout=_TIMEOUT) as c:
            # /readyz returns 200 "all shards are ready" when serving.
            r = await c.get(f"{base_url}/readyz")
            r.raise_for_status()
            return {"ok": True, "detail": r.text.strip()[:120], "base_url": base_url}
    except Exception as e:
        # Some exceptions (httpx timeouts in particular) can stringify to an
        # empty message; fall back to the class name so "error" is never blank.
        return {"ok": False, "error": str(e) or type(e).__name__, "base_url": base_url}