v0.24.0:0 - configurable cluster topology (vllm container name, hide services, second-vllm monitor)

Make the cluster topology configurable so an adopter wired differently (vLLM on both Sparks, port 8000, different container name, no Parakeet) can monitor without forking. Covers the OpenClaw report P4/P5/#6.

- VLLM_CONTAINER override (default vllm_node), validated at the boundary and quote_arg-quoted into the swap log-tail + pre-flight validator exec.
- DISABLED_SERVICES list: hidden services show no tile and are skipped by status/deep-health/connectivity probes (kills the Parakeet-on-8000 collision).
- kind: vllm custom service monitors a second Spark's vLLM via the shared probe_vllm_endpoint; /api/endpoints gains a disabled flag.

Swap mechanism intentionally not generalized to raw docker run (that's coordination, roadmap item 4).
2026-06-17 23:03:33 -05:00
parent 90394f891b
commit 26070eb191
17 changed files with 304 additions and 26 deletions
@@ -6,17 +6,28 @@ from .config import Settings
 _TIMEOUT = 3.0


-async def check_vllm(settings: Settings) -> dict:
-    base_url = (
-        f"http://{settings.spark1_host}:{settings.vllm_port}/v1"
-        if settings.spark1_host
-        else None
-    )
-    if not settings.spark1_host:
-        return {"ok": False, "error": "spark1 not configured", "base_url": base_url}
+def _disabled(settings: Settings, key: str) -> dict | None:
+    """A clean 'disabled' verdict if `key` is in DISABLED_SERVICES, else None.
+
+    Lets an adopter who doesn't run a given support service switch its probe off
+    entirely — so the probe never hits whatever else listens on that port, and
+    the connectivity log doesn't record it as perpetually down."""
+    if key in settings.disabled_services:
+        return {"ok": False, "disabled": True, "error": "disabled", "base_url": None}
+    return None
+
+
+async def probe_vllm_endpoint(host: str, port: int) -> dict:
+    """Probe any OpenAI-compatible vLLM at host:port via /v1/models.
+
+    Shared by the primary (Spark 1) health check and any extra vLLM registered
+    as a custom service (kind: vllm) to monitor a second Spark."""
+    base_url = f"http://{host}:{port}/v1" if host else None
+    if not host:
+        return {"ok": False, "error": "vllm host not configured", "base_url": base_url}
    try:
        async with httpx.AsyncClient(timeout=_TIMEOUT) as c:
-            r = await c.get(f"http://{settings.spark1_host}:{settings.vllm_port}/v1/models")
+            r = await c.get(f"http://{host}:{port}/v1/models")
            r.raise_for_status()
            ids = [m["id"] for m in r.json().get("data", [])]
            return {
@@ -29,7 +40,15 @@ async def check_vllm(settings: Settings) -> dict:
        return {"ok": False, "error": str(e), "base_url": base_url}


+async def check_vllm(settings: Settings) -> dict:
+    if not settings.spark1_host:
+        return {"ok": False, "error": "spark1 not configured", "base_url": None}
+    return await probe_vllm_endpoint(settings.spark1_host, settings.vllm_port)
+
+
 async def check_parakeet(settings: Settings) -> dict:
+    if d := _disabled(settings, "parakeet"):
+        return d
    base_url = (
        f"http://{settings.parakeet_host}:{settings.parakeet_port}"
        if settings.parakeet_host
@@ -47,6 +66,8 @@ async def check_parakeet(settings: Settings) -> dict:


 async def check_kokoro(settings: Settings) -> dict:
+    if d := _disabled(settings, "kokoro"):
+        return d
    base_url = (
        f"http://{settings.kokoro_host}:{settings.kokoro_port}"
        if settings.kokoro_host
@@ -68,6 +89,8 @@ async def check_kokoro(settings: Settings) -> dict:


 async def check_embeddings(settings: Settings) -> dict:
+    if d := _disabled(settings, "embeddings"):
+        return d
    base_url = (
        f"http://{settings.embed_host}:{settings.embed_port}"
        if settings.embed_host
@@ -89,6 +112,8 @@ async def check_embeddings(settings: Settings) -> dict:


 async def check_qdrant(settings: Settings) -> dict:
+    if d := _disabled(settings, "qdrant"):
+        return d
    base_url = (
        f"http://{settings.qdrant_host}:{settings.qdrant_port}"
        if settings.qdrant_host