v0.24.0:0 - configurable cluster topology (vllm container name, hide services, second-vllm monitor)

Make the cluster topology configurable so an adopter wired differently (vLLM on both Sparks, port 8000, different container name, no Parakeet) can monitor without forking. Covers the OpenClaw report P4/P5/#6.

- VLLM_CONTAINER override (default vllm_node), validated at the config boundary and quote_arg-quoted into the swap log-tail and the pre-flight validator's docker exec.
- DISABLED_SERVICES list: hidden services show no tile and are skipped by status/deep-health/connectivity probes (kills the Parakeet-on-8000 collision).
- `kind: vllm` custom service monitors a second Spark's vLLM via the shared probe_vllm_endpoint; /api/endpoints gains a disabled flag.

The swap mechanism is intentionally not generalized to raw docker run (that's coordination, roadmap item 4).
2026-06-17 23:03:33 -05:00
parent 90394f891b
commit 26070eb191
17 changed files with 304 additions and 26 deletions
@@ -1,13 +1,44 @@
 from __future__ import annotations
+import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path

+from .shellsafe import validate_container
+
+log = logging.getLogger(__name__)
+

 def _env(name: str, default: str = "") -> str:
    return os.environ.get(name, default)


+def _env_container(name: str, default: str) -> str:
+    """Resolve a container-name env var, validating it at the config boundary.
+
+    The value flows into `docker logs`/`docker exec` over SSH, so it's quoted at
+    the sink — but per the repo's two-layer convention it's also whitelist-checked
+    here, after stripping stray whitespace. A blank or malformed optional value
+    falls back to `default` rather than crashing startup (mirrors `_env_int`)."""
+    val = os.environ.get(name, "").strip() or default
+    try:
+        return validate_container(val)
+    except ValueError:
+        log.warning("ignoring invalid %s=%r; using %r", name, val, default)
+        return default
+
+
+def _env_set(name: str) -> frozenset[str]:
+    """Return the comma-separated env var `name` as a frozenset of lowercase keys.
+
+    Items are trimmed and lowercased; blanks (or an unset variable) add nothing.
+    DISABLED_SERVICES is read this way so a deployment that lacks a given support
+    service can turn off its tile and probes, rather than let the probe hit
+    whatever else listens on that port (e.g. a vLLM on Parakeet's default 8000)."""
+    items = (chunk.strip() for chunk in os.environ.get(name, "").split(","))
+    return frozenset(item.lower() for item in items if item)
+
+
 def _env_int(name: str, default: int) -> int:
    """Parse an int env var, falling back to `default` when unset, blank, or
    malformed. The StartOS Configure panel passes optional numeric fields as an
@@ -63,6 +94,8 @@ class Settings:
    ssh_known_hosts: str
    models_yaml: str
    vllm_port: int
+    vllm_container: str
+    disabled_services: frozenset[str]
    parakeet_port: int
    kokoro_port: int
    embed_port: int
@@ -116,6 +149,15 @@ class Settings:
            ssh_known_hosts=_env("SSH_KNOWN_HOSTS"),
            models_yaml=_resolve_models_yaml(),
            vllm_port=_env_int("VLLM_PORT", 8888),
+            # Container name for the swappable vLLM on Spark 1. Defaults to the
+            # bundled launch-cluster.sh container; override if you named yours
+            # something else (the swap log-tail runs `docker logs` on it and the
+            # pre-flight validator `docker exec`s into it, both by name).
+            vllm_container=_env_container("VLLM_CONTAINER", "vllm_node"),
+            # Built-in support-service keys (parakeet, kokoro, embeddings,
+            # qdrant) the deployment doesn't run — hidden from the dashboard and
+            # never probed.
+            disabled_services=_env_set("DISABLED_SERVICES"),
            parakeet_port=_env_int("PARAKEET_PORT", 8000),
            kokoro_port=_env_int("KOKORO_PORT", 8880),
            embed_port=_env_int("EMBED_PORT", 8088),
@@ -10,6 +10,17 @@ Format:
        port: 8001
        health_path: /health
        image: nvcr.io/nim/nvidia/riva-multilingual:latest
+
+A `kind: vllm` entry monitors an additional vLLM on another Spark. It is never a
+swap target (the swap machinery only drives the primary Spark 1 vLLM); it gets a
+health tile probed via /v1/models, plus container state and start/stop/restart:
+    custom:
+      - key: vllm-spark2
+        kind: vllm
+        host: <spark-2-ip>
+        user: <ssh-user>
+        container: vllm_node
+        port: 8000
 """
 from __future__ import annotations
 import os
@@ -377,6 +377,10 @@ class DeepHealth:
    async def run_all(self) -> dict[str, ProbeResult]:
        results = {}
        for name in self.PROBES:
+            # A service the deployment switched off gets no result entry at all:
+            # its port may be answered by something else (e.g. vLLM on Parakeet's 8000).
+            if name in self.settings.disabled_services:
+                continue
            results[name] = await self.run_one(name)
        return results

@@ -6,17 +6,28 @@ from .config import Settings
 _TIMEOUT = 3.0


-async def check_vllm(settings: Settings) -> dict:
-    base_url = (
-        f"http://{settings.spark1_host}:{settings.vllm_port}/v1"
-        if settings.spark1_host
-        else None
-    )
-    if not settings.spark1_host:
-        return {"ok": False, "error": "spark1 not configured", "base_url": base_url}
+def _disabled(settings: Settings, key: str) -> dict | None:
+    """Return a ready-made 'disabled' result when `key` is switched off, else None.
+
+    A key listed in DISABLED_SERVICES short-circuits its health check, so the
+    probe never reaches whatever else answers on that port and the connectivity
+    log doesn't record the service as perpetually down."""
+    if key not in settings.disabled_services:
+        return None
+    return {"ok": False, "disabled": True, "error": "disabled", "base_url": None}
+
+
+async def probe_vllm_endpoint(host: str, port: int) -> dict:
+    """Probe any OpenAI-compatible vLLM at host:port via /v1/models.
+
+    Shared by the primary (Spark 1) health check and any extra vLLM registered
+    as a custom service (kind: vllm) to monitor a second Spark."""
+    base_url = f"http://{host}:{port}/v1" if host else None
+    if not host:
+        return {"ok": False, "error": "vllm host not configured", "base_url": base_url}
    try:
        async with httpx.AsyncClient(timeout=_TIMEOUT) as c:
-            r = await c.get(f"http://{settings.spark1_host}:{settings.vllm_port}/v1/models")
+            r = await c.get(f"http://{host}:{port}/v1/models")
            r.raise_for_status()
            ids = [m["id"] for m in r.json().get("data", [])]
            return {
@@ -29,7 +40,15 @@ async def check_vllm(settings: Settings) -> dict:
        return {"ok": False, "error": str(e), "base_url": base_url}


+async def check_vllm(settings: Settings) -> dict:
+    if host := settings.spark1_host:  # primary (Spark 1) vLLM, via the shared probe
+        return await probe_vllm_endpoint(host, settings.vllm_port)
+    return {"ok": False, "error": "spark1 not configured", "base_url": None}
+
+
 async def check_parakeet(settings: Settings) -> dict:
+    if d := _disabled(settings, "parakeet"):
+        return d
    base_url = (
        f"http://{settings.parakeet_host}:{settings.parakeet_port}"
        if settings.parakeet_host
@@ -47,6 +66,8 @@ async def check_parakeet(settings: Settings) -> dict:


 async def check_kokoro(settings: Settings) -> dict:
+    if d := _disabled(settings, "kokoro"):
+        return d
    base_url = (
        f"http://{settings.kokoro_host}:{settings.kokoro_port}"
        if settings.kokoro_host
@@ -68,6 +89,8 @@ async def check_kokoro(settings: Settings) -> dict:


 async def check_embeddings(settings: Settings) -> dict:
+    if d := _disabled(settings, "embeddings"):
+        return d
    base_url = (
        f"http://{settings.embed_host}:{settings.embed_port}"
        if settings.embed_host
@@ -89,6 +112,8 @@ async def check_embeddings(settings: Settings) -> dict:


 async def check_qdrant(settings: Settings) -> dict:
+    if d := _disabled(settings, "qdrant"):
+        return d
    base_url = (
        f"http://{settings.qdrant_host}:{settings.qdrant_port}"
        if settings.qdrant_host
@@ -20,7 +20,7 @@ from .llm_proxy import build_router as build_llm_router
 from .embeddings_proxy import build_router as build_embeddings_router
 from .redaction_gateway import build_router as build_redaction_router, MapStore
 from .hardware import HardwareProbe
-from .health import check_kokoro, check_parakeet, check_vllm, check_embeddings, check_qdrant
+from .health import check_kokoro, check_parakeet, check_vllm, check_embeddings, check_qdrant, probe_vllm_endpoint
 from .matrix_bridge import MatrixBridgeManager
 from .models import ModelDef, load_catalog
 from .nim import SUGGESTED_NIMS, CATALOG_URL, NimManager
@@ -500,6 +500,10 @@ async def get_services() -> dict:
            http = await check_embeddings(settings)
        elif name == "qdrant":
            http = await check_qdrant(settings)
+        elif svc.kind == "vllm":
+            # An extra vLLM monitored on another Spark (registered as a custom
+            # service). Probe its own host/port, not the primary Spark 1 one.
+            http = await probe_vllm_endpoint(svc.host, svc.port)
        elif svc.kind == "bot":
            # No HTTP health endpoint (host networking, no port) — judged purely
            # by docker state. http_ready stays None so the badge isn't pinned
@@ -521,7 +525,7 @@ async def get_services() -> dict:
            # Prefer the check fn's own top-level model key (embeddings reports
            # it there); fall back to a model field inside detail for services
            # whose /health embeds it (parakeet).
-            "model": http.get("model") or ((http.get("detail") or {}).get("model") if isinstance(http.get("detail"), dict) else None),
+            "model": http.get("model") or http.get("current_model") or ((http.get("detail") or {}).get("model") if isinstance(http.get("detail"), dict) else None),
            "docker_state": docker.get("state"),
            "restart_count": docker.get("restart_count"),
            "started_at": docker.get("started_at"),
@@ -799,17 +803,20 @@ async def get_endpoints() -> dict:
            "base_url": vllm.get("base_url"),
            "model": vllm.get("current_model"),
            "openai_compat": True,
+            "disabled": bool(vllm.get("disabled")),
        },
        "parakeet": {
            "ready": bool(parakeet.get("ok")),
            "base_url": parakeet.get("base_url"),
            "kind": "stt",
            "model": (parakeet.get("detail") or {}).get("model") if isinstance(parakeet.get("detail"), dict) else None,
+            "disabled": bool(parakeet.get("disabled")),
        },
        "kokoro": {
            "ready": bool(kokoro.get("ok")),
            "base_url": kokoro.get("base_url"),
            "kind": "tts",
+            "disabled": bool(kokoro.get("disabled")),
        },
        "embeddings": {
            "ready": bool(embeddings.get("ok")),
@@ -818,12 +825,14 @@ async def get_endpoints() -> dict:
            "model": embeddings.get("model"),
            # The proxied OpenAI-compatible endpoints live on Spark Control itself.
            "openai_endpoints": ["/v1/embeddings", "/v1/rerank", "/api/search"],
+            "disabled": bool(embeddings.get("disabled")),
        },
        "qdrant": {
            "ready": bool(qdrant.get("ok")),
            "base_url": qdrant.get("base_url"),
            "kind": "vectordb",
            "collection": settings.qdrant_collection or None,
+            "disabled": bool(qdrant.get("disabled")),
        },
    }

@@ -837,12 +846,15 @@ async def get_status() -> dict:
        check_embeddings(settings),
        check_qdrant(settings),
    )
-    # Feed health into the connectivity log (deduped — only logs on transition)
-    record_state("vllm", bool(vllm.get("ok")))
-    record_state("parakeet", bool(parakeet.get("ok")))
-    record_state("kokoro", bool(kokoro.get("ok")))
-    record_state("embeddings", bool(embeddings.get("ok")))
-    record_state("qdrant", bool(qdrant.get("ok")))
+    # Feed health into the connectivity log (deduped — only logs on transition).
+    # Skip services switched off via DISABLED_SERVICES — they'd otherwise log as
+    # perpetually down.
+    for _name, _r in (
+        ("vllm", vllm), ("parakeet", parakeet), ("kokoro", kokoro),
+        ("embeddings", embeddings), ("qdrant", qdrant),
+    ):
+        if not _r.get("disabled"):
+            record_state(_name, bool(_r.get("ok")))
    current_key = _identify_current_model(vllm.get("current_model"))
    return {
        "configured": settings.configured,
@@ -5,6 +5,7 @@ machinery. We just run `docker start|stop|restart <container>` via SSH on the
 appropriate host.
 """
 from __future__ import annotations
+import logging
 import time
 from dataclasses import dataclass
 from typing import Literal, Optional
@@ -13,6 +14,8 @@ from .config import Settings
 from .shellsafe import quote_arg
 from .ssh import ssh_run

+log = logging.getLogger(__name__)
+

 # Cache the "unreachable" verdict per (host, user) for a short period so that a
 # repeated docker_state call doesn't re-pay the 6 s SSH connect timeout each time.
@@ -103,7 +106,13 @@ def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
    }
    for entry in load_custom_services():
        key = entry.get("key")
-        if not key or key in out:
+        if not key:
+            continue
+        if key in out:
+            # A custom entry can't shadow a built-in (parakeet/kokoro/…); warn so
+            # an adopter who picked a colliding key for, say, a second vLLM sees
+            # why no tile appeared instead of a silent no-op.
+            log.warning("custom service %r collides with a built-in name; ignoring", key)
            continue
        out[key] = ServiceDef(
            name=key,
@@ -113,7 +122,9 @@ def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
            container=entry.get("container", key),
            port=int(entry.get("port", 0)),
        )
-    return out
+    # Drop services the deployment has switched off (DISABLED_SERVICES) so they
+    # show no tile and are never probed/auto-restarted.
+    return {k: v for k, v in out.items() if k not in s.disabled_services}


 async def docker_state(settings: Settings, svc: ServiceDef) -> dict:
@@ -932,6 +932,10 @@ function renderHealth(status) {
  function setDot(id, ok, payload) {
    const item = el(id);
    if (!item) return;
+    // A service switched off via DISABLED_SERVICES isn't part of this
+    // deployment — hide its indicator entirely rather than show it as down.
+    if (payload && payload.disabled) { item.classList.add('hidden'); return; }
+    item.classList.remove('hidden');
    const dot = item.querySelector('.dot');
    dot.classList.remove('ok', 'bad', 'warn');
    if (ok === true) dot.classList.add('ok');
@@ -7,6 +7,7 @@ from typing import Optional

 from .config import Settings
 from .models import Catalog, build_launch_command
+from .shellsafe import quote_arg
 from .ssh import ssh_run, ssh_stream, StreamHandle


@@ -112,7 +113,7 @@ class SwapManager:

        # Step 3: tail logs until the ready marker (or timeout)
        job.state = "tailing"
-        tail_cmd = "docker logs -f --tail 50 vllm_node"
+        tail_cmd = f"docker logs -f --tail 50 {quote_arg(s.vllm_container)}"
        job.append(f"$ {tail_cmd}")
        timeout = max(model.expected_ready_seconds * 2, 600)
        handle = StreamHandle()
@@ -22,6 +22,7 @@ from typing import Any

 from .config import Settings
 from .models import Catalog, build_launch_command
+from .shellsafe import quote_arg
 from .ssh import ssh_run


@@ -114,7 +115,7 @@ async def validate_launch(key: str, catalog: Catalog, settings: Settings) -> dic
    # Pipe the JSON args list to a here-doc Python invocation. The validator
    # reads from stdin to avoid shell-escaping the args themselves.
    cmd = (
-        f"echo '{payload}' | docker exec -i vllm_node python3 -c "
+        f"echo '{payload}' | docker exec -i {quote_arg(settings.vllm_container)} python3 -c "
        + shlex.quote(_VALIDATOR_SCRIPT)
    )