26070eb191
Make the cluster topology configurable so an adopter wired differently (vLLM on both Sparks, port 8000, different container name, no Parakeet) can monitor without forking. Covers the OpenClaw report P4/P5/#6. - VLLM_CONTAINER override (default vllm_node), validated at the boundary and quote_arg-quoted into the swap log-tail + pre-flight validator exec. - DISABLED_SERVICES list: hidden services show no tile and are skipped by status/deep-health/connectivity probes (kills the Parakeet-on-8000 collision). - kind: vllm custom service monitors a second Spark's vLLM via the shared probe_vllm_endpoint; /api/endpoints gains a disabled flag. Swap mechanism intentionally not generalized to raw docker run (that's coordination, roadmap item 4).
71 lines
1.9 KiB
Python
71 lines
1.9 KiB
Python
"""User-installed services persist in /data/services-overrides.yaml.
|
|
|
|
Format:
|
|
custom:
|
|
- key: my-riva
|
|
kind: stt
|
|
host: <spark-host-or-ip>
|
|
user: <ssh-user>
|
|
container: riva-asr
|
|
port: 8001
|
|
health_path: /health
|
|
image: nvcr.io/nim/nvidia/riva-multilingual:latest
|
|
|
|
A `kind: vllm` entry monitors an additional vLLM on another Spark (read-only —
|
|
the swap machinery only drives the primary Spark 1 vLLM). It gets a health tile
|
|
probed via /v1/models plus container state and start/stop/restart:
|
|
custom:
|
|
- key: vllm-spark2
|
|
kind: vllm
|
|
host: <spark-2-ip>
|
|
user: <ssh-user>
|
|
container: vllm_node
|
|
port: 8000
|
|
"""
|
|
from __future__ import annotations
|
|
import os
|
|
from pathlib import Path
|
|
import yaml
|
|
|
|
|
|
def _path() -> str:
    """Return the location of the services-overrides YAML file.

    The ``SERVICES_OVERRIDES`` environment variable takes precedence whenever
    it is set (an empty value is returned as-is); otherwise the default file
    under ``/data`` is used.
    """
    fallback = "/data/services-overrides.yaml"
    return os.getenv("SERVICES_OVERRIDES", fallback)
|
|
|
|
|
|
def load_custom_services() -> list[dict]:
    """Return the user-installed service entries from the overrides file.

    Reads the YAML file located by ``_path()`` and returns the mapping items
    of its top-level ``custom`` list.

    Returns:
        A new list of service dicts. The list is empty when the file does not
        exist, is empty, has a top level that is not a mapping, or has a
        ``custom`` value that is not a list. Items of ``custom`` that are not
        mappings are skipped so callers can rely on ``list[dict]``.

    Raises:
        yaml.YAMLError: If the file is not syntactically valid YAML.
    """
    try:
        # Decode as UTF-8 explicitly so the result does not depend on the
        # locale of the process.
        with open(_path(), encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        return []
    if not isinstance(data, dict):
        # Empty file (None) or a hand-edited file with a list/scalar on top.
        return []
    custom = data.get("custom")
    if not isinstance(custom, list):
        return []
    return [item for item in custom if isinstance(item, dict)]
|
|
|
|
|
|
def add_custom_service(entry: dict) -> None:
    """Insert or replace a custom service entry in the overrides file.

    Any existing entry whose ``key`` equals ``entry["key"]`` is removed and
    ``entry`` is appended to the end of the ``custom`` list. Other top-level
    keys already present in the file are written back unchanged. The parent
    directory of the file is created when missing.

    Args:
        entry: Service definition; must contain a ``"key"`` item.

    Raises:
        KeyError: If ``entry`` has no ``"key"`` item.
        ValueError: If the existing file's top level is not a mapping, or its
            ``custom`` value is not a list. The file is left untouched.
        yaml.YAMLError: If the existing file is not valid YAML.
    """
    key = entry["key"]  # fail before touching the filesystem
    p = _path()
    Path(p).parent.mkdir(parents=True, exist_ok=True)
    try:
        # Explicit UTF-8 so reads/writes do not depend on the process locale.
        with open(p, encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
    except FileNotFoundError:
        data = {}
    # Validate the shape up front: rewriting a malformed file would either
    # crash half-way with an obscure error or clobber hand-edited content.
    if not isinstance(data, dict):
        raise ValueError(f"{p}: top level must be a mapping")
    custom = data.get("custom") or []
    if not isinstance(custom, list):
        raise ValueError(f"{p}: 'custom' must be a list")
    # Items that are not mappings carry no key to collide with; keep them
    # rather than crash on `.get` or silently drop user content.
    custom = [
        c for c in custom
        if not (isinstance(c, dict) and c.get("key") == key)
    ]
    custom.append(entry)
    data["custom"] = custom
    with open(p, "w", encoding="utf-8") as f:
        yaml.safe_dump(data, f, sort_keys=False)
|
|
|
|
|
|
def delete_custom_service(key: str) -> None:
    """Remove the custom service entry identified by ``key``.

    Does nothing when the overrides file does not exist. Otherwise the file
    is rewritten with every ``custom`` entry whose ``key`` equals ``key``
    removed; the rewrite happens even when no entry matched. Other top-level
    keys in the file are written back unchanged.

    Args:
        key: The ``key`` value of the service to delete.

    Raises:
        ValueError: If the file's top level is not a mapping, or its
            ``custom`` value is not a list. The file is left untouched.
        yaml.YAMLError: If the file is not valid YAML.
    """
    p = _path()
    try:
        # Explicit UTF-8 so reads/writes do not depend on the process locale.
        with open(p, encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
    except FileNotFoundError:
        return
    # Refuse to rewrite a file whose structure we do not understand.
    if not isinstance(data, dict):
        raise ValueError(f"{p}: top level must be a mapping")
    custom = data.get("custom") or []
    if not isinstance(custom, list):
        raise ValueError(f"{p}: 'custom' must be a list")
    # Items that are not mappings cannot match a key; keep them instead of
    # crashing on `.get`.
    data["custom"] = [
        c for c in custom
        if not (isinstance(c, dict) and c.get("key") == key)
    ]
    with open(p, "w", encoding="utf-8") as f:
        yaml.safe_dump(data, f, sort_keys=False)
|