v0.24.0:0 - configurable cluster topology (vllm container name, hide services, second-vllm monitor)

Make the cluster topology configurable so an adopter wired differently
(vLLM on both Sparks, port 8000, different container name, no Parakeet)
can monitor without forking. Covers the OpenClaw report P4/P5/#6.

- VLLM_CONTAINER override (default vllm_node), validated at the boundary
  and quote_arg-quoted into the swap log-tail + pre-flight validator exec.
- DISABLED_SERVICES list: hidden services show no tile and are skipped by
  status/deep-health/connectivity probes (kills the Parakeet-on-8000
  collision).
- kind: vllm custom service monitors a second Spark's vLLM via the shared
  probe_vllm_endpoint; /api/endpoints gains a disabled flag.

Swap mechanism intentionally not generalized to raw docker run (that's
coordination, roadmap item 4).
This commit is contained in:
Keysat
2026-06-17 23:03:33 -05:00
parent 90394f891b
commit 26070eb191
17 changed files with 304 additions and 26 deletions
+42
View File
@@ -1,13 +1,44 @@
from __future__ import annotations
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from .shellsafe import validate_container
log = logging.getLogger(__name__)
def _env(name: str, default: str = "") -> str:
return os.environ.get(name, default)
def _env_container(name: str, default: str) -> str:
"""Resolve a container-name env var, validating it at the config boundary.
The value flows into `docker logs`/`docker exec` over SSH, so it's quoted at
the sink — but per the repo's two-layer convention it's also whitelist-checked
here. A malformed optional value falls back to `default` rather than crashing
daemon startup (mirrors `_env_int` for VLLM_PORT)."""
val = os.environ.get(name, "") or default
try:
return validate_container(val)
except ValueError:
log.warning("ignoring invalid %s=%r; using %r", name, val, default)
return default
def _env_set(name: str) -> frozenset[str]:
"""Parse a comma-separated env var into a lowercased frozenset of keys.
Used by DISABLED_SERVICES so an adopter whose cluster doesn't run a given
support service can switch its tile + probes off entirely (rather than have
the probe hit whatever else listens on that port — e.g. a vLLM sharing
Parakeet's default 8000)."""
raw = os.environ.get(name, "")
return frozenset(part.strip().lower() for part in raw.split(",") if part.strip())
def _env_int(name: str, default: int) -> int:
"""Parse an int env var, falling back to `default` when unset, blank, or
malformed. The StartOS Configure panel passes optional numeric fields as an
@@ -63,6 +94,8 @@ class Settings:
ssh_known_hosts: str
models_yaml: str
vllm_port: int
vllm_container: str
disabled_services: frozenset[str]
parakeet_port: int
kokoro_port: int
embed_port: int
@@ -116,6 +149,15 @@ class Settings:
ssh_known_hosts=_env("SSH_KNOWN_HOSTS"),
models_yaml=_resolve_models_yaml(),
vllm_port=_env_int("VLLM_PORT", 8888),
# Container name for the swappable vLLM on Spark 1. Defaults to the
# bundled launch-cluster.sh container; override if you named yours
# something else (the swap log-tail and pre-flight validator exec
# into it by name).
vllm_container=_env_container("VLLM_CONTAINER", "vllm_node"),
# Built-in support-service keys (parakeet, kokoro, embeddings,
# qdrant) the deployment doesn't run — hidden from the dashboard and
# never probed.
disabled_services=_env_set("DISABLED_SERVICES"),
parakeet_port=_env_int("PARAKEET_PORT", 8000),
kokoro_port=_env_int("KOKORO_PORT", 8880),
embed_port=_env_int("EMBED_PORT", 8088),
+11
View File
@@ -10,6 +10,17 @@ Format:
port: 8001
health_path: /health
image: nvcr.io/nim/nvidia/riva-multilingual:latest
A `kind: vllm` entry monitors an additional vLLM on another Spark (read-only —
the swap machinery only drives the primary Spark 1 vLLM). It gets a health tile
probed via /v1/models plus container state and start/stop/restart:
custom:
- key: vllm-spark2
kind: vllm
host: <spark-2-ip>
user: <ssh-user>
container: vllm_node
port: 8000
"""
from __future__ import annotations
import os
+4
View File
@@ -377,6 +377,10 @@ class DeepHealth:
async def run_all(self) -> dict[str, ProbeResult]:
results = {}
for name in self.PROBES:
# Don't deep-probe a service the deployment switched off — its port
# may be answered by something else (e.g. a vLLM on Parakeet's 8000).
if name in self.settings.disabled_services:
continue
results[name] = await self.run_one(name)
return results
+34 -9
View File
@@ -6,17 +6,28 @@ from .config import Settings
_TIMEOUT = 3.0
async def check_vllm(settings: Settings) -> dict:
base_url = (
f"http://{settings.spark1_host}:{settings.vllm_port}/v1"
if settings.spark1_host
else None
)
if not settings.spark1_host:
return {"ok": False, "error": "spark1 not configured", "base_url": base_url}
def _disabled(settings: Settings, key: str) -> dict | None:
"""A clean 'disabled' verdict if `key` is in DISABLED_SERVICES, else None.
Lets an adopter who doesn't run a given support service switch its probe off
entirely — so the probe never hits whatever else listens on that port, and
the connectivity log doesn't record it as perpetually down."""
if key in settings.disabled_services:
return {"ok": False, "disabled": True, "error": "disabled", "base_url": None}
return None
async def probe_vllm_endpoint(host: str, port: int) -> dict:
"""Probe any OpenAI-compatible vLLM at host:port via /v1/models.
Shared by the primary (Spark 1) health check and any extra vLLM registered
as a custom service (kind: vllm) to monitor a second Spark."""
base_url = f"http://{host}:{port}/v1" if host else None
if not host:
return {"ok": False, "error": "vllm host not configured", "base_url": base_url}
try:
async with httpx.AsyncClient(timeout=_TIMEOUT) as c:
r = await c.get(f"http://{settings.spark1_host}:{settings.vllm_port}/v1/models")
r = await c.get(f"http://{host}:{port}/v1/models")
r.raise_for_status()
ids = [m["id"] for m in r.json().get("data", [])]
return {
@@ -29,7 +40,15 @@ async def check_vllm(settings: Settings) -> dict:
return {"ok": False, "error": str(e), "base_url": base_url}
async def check_vllm(settings: Settings) -> dict:
if not settings.spark1_host:
return {"ok": False, "error": "spark1 not configured", "base_url": None}
return await probe_vllm_endpoint(settings.spark1_host, settings.vllm_port)
async def check_parakeet(settings: Settings) -> dict:
if d := _disabled(settings, "parakeet"):
return d
base_url = (
f"http://{settings.parakeet_host}:{settings.parakeet_port}"
if settings.parakeet_host
@@ -47,6 +66,8 @@ async def check_parakeet(settings: Settings) -> dict:
async def check_kokoro(settings: Settings) -> dict:
if d := _disabled(settings, "kokoro"):
return d
base_url = (
f"http://{settings.kokoro_host}:{settings.kokoro_port}"
if settings.kokoro_host
@@ -68,6 +89,8 @@ async def check_kokoro(settings: Settings) -> dict:
async def check_embeddings(settings: Settings) -> dict:
if d := _disabled(settings, "embeddings"):
return d
base_url = (
f"http://{settings.embed_host}:{settings.embed_port}"
if settings.embed_host
@@ -89,6 +112,8 @@ async def check_embeddings(settings: Settings) -> dict:
async def check_qdrant(settings: Settings) -> dict:
if d := _disabled(settings, "qdrant"):
return d
base_url = (
f"http://{settings.qdrant_host}:{settings.qdrant_port}"
if settings.qdrant_host
+20 -8
View File
@@ -20,7 +20,7 @@ from .llm_proxy import build_router as build_llm_router
from .embeddings_proxy import build_router as build_embeddings_router
from .redaction_gateway import build_router as build_redaction_router, MapStore
from .hardware import HardwareProbe
from .health import check_kokoro, check_parakeet, check_vllm, check_embeddings, check_qdrant
from .health import check_kokoro, check_parakeet, check_vllm, check_embeddings, check_qdrant, probe_vllm_endpoint
from .matrix_bridge import MatrixBridgeManager
from .models import ModelDef, load_catalog
from .nim import SUGGESTED_NIMS, CATALOG_URL, NimManager
@@ -500,6 +500,10 @@ async def get_services() -> dict:
http = await check_embeddings(settings)
elif name == "qdrant":
http = await check_qdrant(settings)
elif svc.kind == "vllm":
# An extra vLLM monitored on another Spark (registered as a custom
# service). Probe its own host/port, not the primary Spark 1 one.
http = await probe_vllm_endpoint(svc.host, svc.port)
elif svc.kind == "bot":
# No HTTP health endpoint (host networking, no port) — judged purely
# by docker state. http_ready stays None so the badge isn't pinned
@@ -521,7 +525,7 @@ async def get_services() -> dict:
# Prefer the check fn's own top-level model key (embeddings reports
# it there); fall back to a model field inside detail for services
# whose /health embeds it (parakeet).
"model": http.get("model") or ((http.get("detail") or {}).get("model") if isinstance(http.get("detail"), dict) else None),
"model": http.get("model") or http.get("current_model") or ((http.get("detail") or {}).get("model") if isinstance(http.get("detail"), dict) else None),
"docker_state": docker.get("state"),
"restart_count": docker.get("restart_count"),
"started_at": docker.get("started_at"),
@@ -799,17 +803,20 @@ async def get_endpoints() -> dict:
"base_url": vllm.get("base_url"),
"model": vllm.get("current_model"),
"openai_compat": True,
"disabled": bool(vllm.get("disabled")),
},
"parakeet": {
"ready": bool(parakeet.get("ok")),
"base_url": parakeet.get("base_url"),
"kind": "stt",
"model": (parakeet.get("detail") or {}).get("model") if isinstance(parakeet.get("detail"), dict) else None,
"disabled": bool(parakeet.get("disabled")),
},
"kokoro": {
"ready": bool(kokoro.get("ok")),
"base_url": kokoro.get("base_url"),
"kind": "tts",
"disabled": bool(kokoro.get("disabled")),
},
"embeddings": {
"ready": bool(embeddings.get("ok")),
@@ -818,12 +825,14 @@ async def get_endpoints() -> dict:
"model": embeddings.get("model"),
# The proxied OpenAI-compatible endpoints live on Spark Control itself.
"openai_endpoints": ["/v1/embeddings", "/v1/rerank", "/api/search"],
"disabled": bool(embeddings.get("disabled")),
},
"qdrant": {
"ready": bool(qdrant.get("ok")),
"base_url": qdrant.get("base_url"),
"kind": "vectordb",
"collection": settings.qdrant_collection or None,
"disabled": bool(qdrant.get("disabled")),
},
}
@@ -837,12 +846,15 @@ async def get_status() -> dict:
check_embeddings(settings),
check_qdrant(settings),
)
# Feed health into the connectivity log (deduped — only logs on transition)
record_state("vllm", bool(vllm.get("ok")))
record_state("parakeet", bool(parakeet.get("ok")))
record_state("kokoro", bool(kokoro.get("ok")))
record_state("embeddings", bool(embeddings.get("ok")))
record_state("qdrant", bool(qdrant.get("ok")))
# Feed health into the connectivity log (deduped — only logs on transition).
# Skip services switched off via DISABLED_SERVICES — they'd otherwise log as
# perpetually down.
for _name, _r in (
("vllm", vllm), ("parakeet", parakeet), ("kokoro", kokoro),
("embeddings", embeddings), ("qdrant", qdrant),
):
if not _r.get("disabled"):
record_state(_name, bool(_r.get("ok")))
current_key = _identify_current_model(vllm.get("current_model"))
return {
"configured": settings.configured,
+13 -2
View File
@@ -5,6 +5,7 @@ machinery. We just run `docker start|stop|restart <container>` via SSH on the
appropriate host.
"""
from __future__ import annotations
import logging
import time
from dataclasses import dataclass
from typing import Literal, Optional
@@ -13,6 +14,8 @@ from .config import Settings
from .shellsafe import quote_arg
from .ssh import ssh_run
log = logging.getLogger(__name__)
# Cache the "unreachable" verdict per (host, user) for a short period so that a
# repeated docker_state call doesn't re-pay the 6 s SSH connect timeout each time.
@@ -103,7 +106,13 @@ def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
}
for entry in load_custom_services():
key = entry.get("key")
if not key or key in out:
if not key:
continue
if key in out:
# A custom entry can't shadow a built-in (parakeet/kokoro/…); warn so
# an adopter who picked a colliding key for, say, a second vLLM sees
# why no tile appeared instead of a silent no-op.
log.warning("custom service %r collides with a built-in name; ignoring", key)
continue
out[key] = ServiceDef(
name=key,
@@ -113,7 +122,9 @@ def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
container=entry.get("container", key),
port=int(entry.get("port", 0)),
)
return out
# Drop services the deployment has switched off (DISABLED_SERVICES) so they
# show no tile and are never probed/auto-restarted.
return {k: v for k, v in out.items() if k not in s.disabled_services}
async def docker_state(settings: Settings, svc: ServiceDef) -> dict:
+4
View File
@@ -932,6 +932,10 @@ function renderHealth(status) {
function setDot(id, ok, payload) {
const item = el(id);
if (!item) return;
// A service switched off via DISABLED_SERVICES isn't part of this
// deployment — hide its indicator entirely rather than show it as down.
if (payload && payload.disabled) { item.classList.add('hidden'); return; }
item.classList.remove('hidden');
const dot = item.querySelector('.dot');
dot.classList.remove('ok', 'bad', 'warn');
if (ok === true) dot.classList.add('ok');
+2 -1
View File
@@ -7,6 +7,7 @@ from typing import Optional
from .config import Settings
from .models import Catalog, build_launch_command
from .shellsafe import quote_arg
from .ssh import ssh_run, ssh_stream, StreamHandle
@@ -112,7 +113,7 @@ class SwapManager:
# Step 3: tail logs until the ready marker (or timeout)
job.state = "tailing"
tail_cmd = "docker logs -f --tail 50 vllm_node"
tail_cmd = f"docker logs -f --tail 50 {quote_arg(s.vllm_container)}"
job.append(f"$ {tail_cmd}")
timeout = max(model.expected_ready_seconds * 2, 600)
handle = StreamHandle()
+2 -1
View File
@@ -22,6 +22,7 @@ from typing import Any
from .config import Settings
from .models import Catalog, build_launch_command
from .shellsafe import quote_arg
from .ssh import ssh_run
@@ -114,7 +115,7 @@ async def validate_launch(key: str, catalog: Catalog, settings: Settings) -> dic
# Pipe the JSON args list to a here-doc Python invocation. The validator
# reads from stdin to avoid shell-escaping the args themselves.
cmd = (
f"echo '{payload}' | docker exec -i vllm_node python3 -c "
f"echo '{payload}' | docker exec -i {quote_arg(settings.vllm_container)} python3 -c "
+ shlex.quote(_VALIDATOR_SCRIPT)
)