v0.24.0:0 - configurable cluster topology (vllm container name, hide services, second-vllm monitor)
Make the cluster topology configurable so an adopter wired differently (vLLM on both Sparks, port 8000, different container name, no Parakeet) can monitor without forking. Covers the OpenClaw report P4/P5/#6. - VLLM_CONTAINER override (default vllm_node), validated at the boundary and quote_arg-quoted into the swap log-tail + pre-flight validator exec. - DISABLED_SERVICES list: hidden services show no tile and are skipped by status/deep-health/connectivity probes (kills the Parakeet-on-8000 collision). - kind: vllm custom service monitors a second Spark's vLLM via the shared probe_vllm_endpoint; /api/endpoints gains a disabled flag. Swap mechanism intentionally not generalized to raw docker run (that's coordination, roadmap item 4).
This commit is contained in:
+13
-2
@@ -5,6 +5,7 @@ machinery. We just run `docker start|stop|restart <container>` via SSH on the
|
||||
appropriate host.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, Optional
|
||||
@@ -13,6 +14,8 @@ from .config import Settings
|
||||
from .shellsafe import quote_arg
|
||||
from .ssh import ssh_run
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Cache the "unreachable" verdict per (host, user) for a short period so that a
|
||||
# repeated docker_state call doesn't re-pay the 6 s SSH connect timeout each time.
|
||||
@@ -103,7 +106,13 @@ def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
|
||||
}
|
||||
for entry in load_custom_services():
|
||||
key = entry.get("key")
|
||||
if not key or key in out:
|
||||
if not key:
|
||||
continue
|
||||
if key in out:
|
||||
# A custom entry can't shadow a built-in (parakeet/kokoro/…); warn so
|
||||
# an adopter who picked a colliding key for, say, a second vLLM sees
|
||||
# why no tile appeared instead of a silent no-op.
|
||||
log.warning("custom service %r collides with a built-in name; ignoring", key)
|
||||
continue
|
||||
out[key] = ServiceDef(
|
||||
name=key,
|
||||
@@ -113,7 +122,9 @@ def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
|
||||
container=entry.get("container", key),
|
||||
port=int(entry.get("port", 0)),
|
||||
)
|
||||
return out
|
||||
# Drop services the deployment has switched off (DISABLED_SERVICES) so they
|
||||
# show no tile and are never probed/auto-restarted.
|
||||
return {k: v for k, v in out.items() if k not in s.disabled_services}
|
||||
|
||||
|
||||
async def docker_state(settings: Settings, svc: ServiceDef) -> dict:
|
||||
|
||||
Reference in New Issue
Block a user