Files
spark-control/image/app/health.py
T
Keysat 26070eb191 v0.24.0:0 - configurable cluster topology (vllm container name, hide services, second-vllm monitor)
Make the cluster topology configurable so an adopter wired differently
(vLLM on both Sparks, port 8000, different container name, no Parakeet)
can monitor without forking. Covers the OpenClaw report P4/P5/#6.

- VLLM_CONTAINER override (default vllm_node), validated at the boundary
  and quote_arg-quoted into the swap log-tail + pre-flight validator exec.
- DISABLED_SERVICES list: hidden services show no tile and are skipped by
  status/deep-health/connectivity probes (kills the Parakeet-on-8000
  collision).
- kind: vllm custom service monitors a second Spark's vLLM via the shared
  probe_vllm_endpoint; /api/endpoints gains a disabled flag.

Swap mechanism intentionally not generalized to raw docker run (that's
coordination, roadmap item 4).
2026-06-17 23:03:33 -05:00

132 lines
5.2 KiB
Python

from __future__ import annotations
import httpx
from .config import Settings
# Timeout handed to each httpx.AsyncClient created by the probes below
# (httpx interprets a bare number as seconds).
_TIMEOUT = 3.0
def _disabled(settings: Settings, key: str) -> dict | None:
"""A clean 'disabled' verdict if `key` is in DISABLED_SERVICES, else None.
Lets an adopter who doesn't run a given support service switch its probe off
entirely — so the probe never hits whatever else listens on that port, and
the connectivity log doesn't record it as perpetually down."""
if key in settings.disabled_services:
return {"ok": False, "disabled": True, "error": "disabled", "base_url": None}
return None
async def probe_vllm_endpoint(host: str, port: int) -> dict:
    """Probe any OpenAI-compatible vLLM at host:port via GET /v1/models.

    Shared by the primary (Spark 1) health check and any extra vLLM registered
    as a custom service (kind: vllm) to monitor a second Spark.

    Args:
        host: Hostname or IP of the server; a falsy value means "not configured".
        port: Port the server listens on.

    Returns:
        On success, ``{"ok": True, "current_model": <first model id or None>,
        "all": [<model ids>], "base_url": "http://host:port/v1"}``.
        On failure, ``{"ok": False, "error": <message>, "base_url": ...}``,
        where ``base_url`` is None when no host is configured. Exceptions
        raised while probing (transport errors, HTTP error statuses, malformed
        payloads) are caught and reported through ``error``.
    """
    if not host:
        return {"ok": False, "error": "vllm host not configured", "base_url": None}
    base_url = f"http://{host}:{port}/v1"
    try:
        async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
            resp = await client.get(f"{base_url}/models")
            resp.raise_for_status()
            ids = [m["id"] for m in resp.json().get("data", [])]
            return {
                "ok": True,
                "current_model": ids[0] if ids else None,
                "all": ids,
                "base_url": base_url,
            }
    except Exception as e:
        # Some exceptions (notably httpx timeouts) stringify to "", which would
        # leave the caller with a blank error; fall back to the class name.
        return {"ok": False, "error": str(e) or type(e).__name__, "base_url": base_url}
async def check_vllm(settings: Settings) -> dict:
    """Health-check the primary vLLM on Spark 1.

    Returns a 'spark1 not configured' failure dict when `settings.spark1_host`
    is unset; otherwise delegates to `probe_vllm_endpoint` with
    `settings.spark1_host` and `settings.vllm_port` and returns its result.
    """
    host = settings.spark1_host
    if host:
        return await probe_vllm_endpoint(host, settings.vllm_port)
    return {"ok": False, "error": "spark1 not configured", "base_url": None}
async def check_parakeet(settings: Settings) -> dict:
    """Health-check the Parakeet service via GET <base_url>/health.

    Returns:
        - the 'disabled' verdict from `_disabled` when "parakeet" is disabled;
        - ``{"ok": False, "error": "parakeet host not configured",
          "base_url": None}`` when `settings.parakeet_host` is unset;
        - ``{"ok": True, "detail": <parsed JSON body>, "base_url": ...}`` when
          the endpoint answers without an HTTP error status and with a JSON body;
        - ``{"ok": False, "error": <message>, "base_url": ...}`` for any
          exception raised while probing.
    """
    if verdict := _disabled(settings, "parakeet"):
        return verdict
    if not settings.parakeet_host:
        return {"ok": False, "error": "parakeet host not configured", "base_url": None}
    base_url = f"http://{settings.parakeet_host}:{settings.parakeet_port}"
    try:
        async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
            resp = await client.get(f"{base_url}/health")
            resp.raise_for_status()
            # resp.json() raises on a non-JSON body; that lands in the except
            # below, so a non-JSON 200 response is reported as unhealthy.
            return {"ok": True, "detail": resp.json(), "base_url": base_url}
    except Exception as e:
        # Some exceptions (notably httpx timeouts) stringify to "", which would
        # leave the caller with a blank error; fall back to the class name.
        return {"ok": False, "error": str(e) or type(e).__name__, "base_url": base_url}
async def check_kokoro(settings: Settings) -> dict:
    """Health-check the Kokoro service via GET <base_url>/health.

    Returns:
        - the 'disabled' verdict from `_disabled` when "kokoro" is disabled;
        - ``{"ok": False, "error": "kokoro host not configured",
          "base_url": None}`` when `settings.kokoro_host` is unset;
        - ``{"ok": True, "detail": <body>, "base_url": ...}`` when the endpoint
          answers without an HTTP error status; ``detail`` is the parsed JSON
          when the content-type starts with application/json, else the raw text;
        - ``{"ok": False, "error": <message>, "base_url": ...}`` for any
          exception raised while probing.
    """
    if verdict := _disabled(settings, "kokoro"):
        return verdict
    if not settings.kokoro_host:
        return {"ok": False, "error": "kokoro host not configured", "base_url": None}
    base_url = f"http://{settings.kokoro_host}:{settings.kokoro_port}"
    try:
        async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
            resp = await client.get(f"{base_url}/health")
            resp.raise_for_status()
            is_json = resp.headers.get("content-type", "").startswith("application/json")
            return {
                "ok": True,
                "detail": resp.json() if is_json else resp.text,
                "base_url": base_url,
            }
    except Exception as e:
        # Some exceptions (notably httpx timeouts) stringify to "", which would
        # leave the caller with a blank error; fall back to the class name.
        return {"ok": False, "error": str(e) or type(e).__name__, "base_url": base_url}
async def check_embeddings(settings: Settings) -> dict:
    """Health-check the embedding service via GET <base_url>/health.

    Returns:
        - the 'disabled' verdict from `_disabled` when "embeddings" is disabled;
        - ``{"ok": False, "error": "embedding host not configured",
          "base_url": None}`` when `settings.embed_host` is unset;
        - ``{"ok": <ready>, "detail": <body>, "base_url": ..., "model": ...}``
          when the endpoint answers without an HTTP error status. ``detail`` is
          the parsed JSON when the content-type starts with application/json,
          else the raw text; ``ok`` is True only when ``detail`` is a dict whose
          "status" is "ready"; ``model`` is ``detail["dense_model"]`` when
          ``detail`` is a dict (None if absent or not a dict);
        - ``{"ok": False, "error": <message>, "base_url": ...}`` for any
          exception raised while probing.
    """
    if verdict := _disabled(settings, "embeddings"):
        return verdict
    if not settings.embed_host:
        return {"ok": False, "error": "embedding host not configured", "base_url": None}
    base_url = f"http://{settings.embed_host}:{settings.embed_port}"
    try:
        async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
            resp = await client.get(f"{base_url}/health")
            resp.raise_for_status()
            is_json = resp.headers.get("content-type", "").startswith("application/json")
            detail = resp.json() if is_json else resp.text
            is_mapping = isinstance(detail, dict)
            # spark-embed reports {"status":"ready"|"loading", ...} — only "ready" is healthy.
            ready = is_mapping and detail.get("status") == "ready"
            return {
                "ok": ready,
                "detail": detail,
                "base_url": base_url,
                "model": detail.get("dense_model") if is_mapping else None,
            }
    except Exception as e:
        # Some exceptions (notably httpx timeouts) stringify to "", which would
        # leave the caller with a blank error; fall back to the class name.
        return {"ok": False, "error": str(e) or type(e).__name__, "base_url": base_url}
async def check_qdrant(settings: Settings) -> dict:
    """Health-check Qdrant via GET <base_url>/readyz.

    Returns:
        - the 'disabled' verdict from `_disabled` when "qdrant" is disabled;
        - ``{"ok": False, "error": "qdrant host not configured",
          "base_url": None}`` when `settings.qdrant_host` is unset;
        - ``{"ok": True, "detail": <text>, "base_url": ...}`` when the endpoint
          answers without an HTTP error status; ``detail`` is the response
          text, stripped and capped at 120 characters;
        - ``{"ok": False, "error": <message>, "base_url": ...}`` for any
          exception raised while probing.
    """
    if verdict := _disabled(settings, "qdrant"):
        return verdict
    if not settings.qdrant_host:
        return {"ok": False, "error": "qdrant host not configured", "base_url": None}
    base_url = f"http://{settings.qdrant_host}:{settings.qdrant_port}"
    try:
        async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
            # /readyz returns 200 "all shards are ready" when serving.
            resp = await client.get(f"{base_url}/readyz")
            resp.raise_for_status()
            return {"ok": True, "detail": resp.text.strip()[:120], "base_url": base_url}
    except Exception as e:
        # Some exceptions (notably httpx timeouts) stringify to "", which would
        # leave the caller with a blank error; fall back to the class name.
        return {"ok": False, "error": str(e) or type(e).__name__, "base_url": base_url}