Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9ff7ee9c1e | |||
| 1602b3b3b4 | |||
| 8ac455f5f5 | |||
| 000c55febe | |||
| 6434b01a95 | |||
| 5827683a09 |
@@ -0,0 +1,363 @@
|
|||||||
|
"""Deep health probes for each service.
|
||||||
|
|
||||||
|
Why this exists: Triton's /health endpoint returns 200 as long as the HTTP
|
||||||
|
layer is alive and the model is registered. It does NOT verify that the CUDA
|
||||||
|
context inside the worker process is healthy. We've observed Parakeet getting
|
||||||
|
its CUDA context wedged after an OOM, where /health stays green but every
|
||||||
|
real transcription returns 500 cudaErrorUnknown.
|
||||||
|
|
||||||
|
So this module sends *real* but tiny synthetic inference requests:
|
||||||
|
- Parakeet: 1 second of digital silence (16 kHz mono PCM, in-memory WAV)
|
||||||
|
- Magpie: short text-to-speech, response audio discarded
|
||||||
|
- vLLM: 1-token chat completion against whatever model is loaded
|
||||||
|
|
||||||
|
All synthetic payloads are generated on demand into BytesIO, sent over HTTP,
|
||||||
|
and never touched the filesystem (on either spark-control's side or the
|
||||||
|
target service's side beyond normal Triton/Riva working memory).
|
||||||
|
|
||||||
|
When a probe fails with a signal that looks like a CUDA wedge, we
|
||||||
|
automatically issue `docker restart <container>`. Rate-limited to 3 restarts
|
||||||
|
per service per 30 minutes to avoid restart loops.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
|
import io
|
||||||
|
import time
|
||||||
|
import wave
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from .config import Settings
|
||||||
|
from .connectivity import record_report
|
||||||
|
from .services import ServiceDef, run_action, services_from_settings
|
||||||
|
|
||||||
|
|
||||||
|
# Default 5-minute interval, controllable via env. Sub-minute is silly for a
|
||||||
|
# heavy synthetic probe; we just want to catch wedges within a reasonable
|
||||||
|
# window — much faster than the user noticing on their next real call.
|
||||||
|
DEFAULT_INTERVAL_SEC = 300.0
|
||||||
|
PROBE_TIMEOUT_SEC = 20.0
|
||||||
|
RESTART_RATE_LIMIT = 3 # max auto-restarts per service
|
||||||
|
RESTART_RATE_WINDOW_SEC = 1800.0 # within a 30-min window
|
||||||
|
RESTART_COOLDOWN_SEC = 120.0 # don't restart again within this many seconds of the last one
|
||||||
|
STARTUP_GRACE_SEC = 60.0 # don't auto-restart for the first minute after this app boots
|
||||||
|
|
||||||
|
|
||||||
|
def _silence_wav(seconds: float = 1.0, sample_rate: int = 16000) -> io.BytesIO:
|
||||||
|
"""Return an in-memory WAV file containing `seconds` of digital silence."""
|
||||||
|
n_frames = int(seconds * sample_rate)
|
||||||
|
buf = io.BytesIO()
|
||||||
|
with wave.open(buf, "wb") as w:
|
||||||
|
w.setnchannels(1)
|
||||||
|
w.setsampwidth(2) # int16
|
||||||
|
w.setframerate(sample_rate)
|
||||||
|
w.writeframes(b"\x00\x00" * n_frames)
|
||||||
|
buf.seek(0)
|
||||||
|
return buf
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_wedge(error: str) -> bool:
|
||||||
|
"""Heuristic: does this error string look like a stuck CUDA context that
|
||||||
|
a container restart would clear? We want to be conservative — only act
|
||||||
|
on signals we're confident about, otherwise leave the user in charge."""
|
||||||
|
err = (error or "").lower()
|
||||||
|
needles = [
|
||||||
|
"cudaerrorunknown",
|
||||||
|
"cuda error: unknown",
|
||||||
|
"cuda kernel errors",
|
||||||
|
"internal server error",
|
||||||
|
"engine core initialization failed",
|
||||||
|
"503", # service unavailable from a dependency
|
||||||
|
"500", # generic 5xx with a body that may not parse
|
||||||
|
]
|
||||||
|
return any(n in err for n in needles)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ProbeResult:
|
||||||
|
ok: bool
|
||||||
|
at: str
|
||||||
|
latency_ms: Optional[int] = None
|
||||||
|
error: str = ""
|
||||||
|
note: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ServiceState:
|
||||||
|
last: Optional[ProbeResult] = None
|
||||||
|
last_ok_at: Optional[str] = None
|
||||||
|
restarts: list[float] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
class DeepHealth:
|
||||||
|
def __init__(self, settings: Settings, interval_sec: float = DEFAULT_INTERVAL_SEC) -> None:
|
||||||
|
self.settings = settings
|
||||||
|
self.interval_sec = interval_sec
|
||||||
|
self.state: dict[str, ServiceState] = {
|
||||||
|
"parakeet": ServiceState(),
|
||||||
|
"magpie": ServiceState(),
|
||||||
|
"vllm": ServiceState(),
|
||||||
|
}
|
||||||
|
self._stop = asyncio.Event()
|
||||||
|
self._boot_at = time.monotonic()
|
||||||
|
|
||||||
|
# ---- probes ---------------------------------------------------------
|
||||||
|
|
||||||
|
async def probe_parakeet(self) -> ProbeResult:
|
||||||
|
s = self.settings
|
||||||
|
now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||||
|
if not s.parakeet_host:
|
||||||
|
return ProbeResult(ok=False, at=now_iso, error="not configured")
|
||||||
|
url = f"http://{s.parakeet_host}:{s.parakeet_port}/v1/audio/transcriptions"
|
||||||
|
wav = _silence_wav(1.0)
|
||||||
|
t0 = time.monotonic()
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
|
||||||
|
r = await c.post(
|
||||||
|
url,
|
||||||
|
files={"file": ("probe.wav", wav, "audio/wav")},
|
||||||
|
data={"model": "parakeet-tdt-0.6b-v3"},
|
||||||
|
)
|
||||||
|
latency = round((time.monotonic() - t0) * 1000)
|
||||||
|
if 200 <= r.status_code < 300:
|
||||||
|
return ProbeResult(ok=True, at=now_iso, latency_ms=latency)
|
||||||
|
return ProbeResult(
|
||||||
|
ok=False,
|
||||||
|
at=now_iso,
|
||||||
|
latency_ms=latency,
|
||||||
|
error=f"HTTP {r.status_code}: {r.text[:240]}",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
return ProbeResult(ok=False, at=now_iso, error=f"{type(e).__name__}: {e}")
|
||||||
|
|
||||||
|
async def probe_magpie(self) -> ProbeResult:
|
||||||
|
s = self.settings
|
||||||
|
now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||||
|
if not s.magpie_host:
|
||||||
|
return ProbeResult(ok=False, at=now_iso, error="not configured")
|
||||||
|
# Magpie /v1/audio/synthesize expects multipart form-data, not JSON.
|
||||||
|
# The (None, value) tuple in httpx's `files=` produces a non-file form field.
|
||||||
|
url = f"http://{s.magpie_host}:{s.magpie_port}/v1/audio/synthesize"
|
||||||
|
form: dict = {"text": (None, "hi"), "language": (None, "en-US")}
|
||||||
|
t0 = time.monotonic()
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
|
||||||
|
r = await c.post(url, files=form)
|
||||||
|
latency = round((time.monotonic() - t0) * 1000)
|
||||||
|
if 200 <= r.status_code < 300:
|
||||||
|
return ProbeResult(ok=True, at=now_iso, latency_ms=latency)
|
||||||
|
# 4xx that aren't 5xx mean server is alive but our payload is off —
|
||||||
|
# don't classify as wedge.
|
||||||
|
if 400 <= r.status_code < 500:
|
||||||
|
return ProbeResult(
|
||||||
|
ok=True,
|
||||||
|
at=now_iso,
|
||||||
|
latency_ms=latency,
|
||||||
|
note=f"{r.status_code} — server alive (probe payload may need a voice name)",
|
||||||
|
)
|
||||||
|
return ProbeResult(
|
||||||
|
ok=False,
|
||||||
|
at=now_iso,
|
||||||
|
latency_ms=latency,
|
||||||
|
error=f"HTTP {r.status_code}: {r.text[:240]}",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
return ProbeResult(ok=False, at=now_iso, error=f"{type(e).__name__}: {e}")
|
||||||
|
|
||||||
|
async def probe_vllm(self) -> ProbeResult:
|
||||||
|
s = self.settings
|
||||||
|
now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||||
|
if not s.spark1_host:
|
||||||
|
return ProbeResult(ok=False, at=now_iso, error="not configured")
|
||||||
|
base = f"http://{s.spark1_host}:{s.vllm_port}"
|
||||||
|
# Step 1: is there a model loaded?
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(timeout=5.0) as c:
|
||||||
|
r = await c.get(f"{base}/v1/models")
|
||||||
|
if 200 <= r.status_code < 300:
|
||||||
|
models = r.json().get("data") or []
|
||||||
|
else:
|
||||||
|
# 5xx on /v1/models suggests something wedged after a model loaded
|
||||||
|
return ProbeResult(
|
||||||
|
ok=False,
|
||||||
|
at=now_iso,
|
||||||
|
error=f"list_models HTTP {r.status_code}: {r.text[:240]}",
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
# Connection refused / timeout: usually means no vLLM process listening
|
||||||
|
# (the vllm_node container is alive but no `vllm serve` is running yet).
|
||||||
|
# That's an idle state, not a wedge — don't trigger auto-restart.
|
||||||
|
return ProbeResult(
|
||||||
|
ok=True,
|
||||||
|
at=now_iso,
|
||||||
|
note="no model currently loaded (idle)",
|
||||||
|
)
|
||||||
|
|
||||||
|
if not models:
|
||||||
|
return ProbeResult(
|
||||||
|
ok=True,
|
||||||
|
at=now_iso,
|
||||||
|
note="no model currently loaded (idle)",
|
||||||
|
)
|
||||||
|
|
||||||
|
model_id = models[0]["id"]
|
||||||
|
# Step 2: model is loaded; verify it can actually complete a 1-token request.
|
||||||
|
t0 = time.monotonic()
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
|
||||||
|
r = await c.post(
|
||||||
|
f"{base}/v1/chat/completions",
|
||||||
|
json={
|
||||||
|
"model": model_id,
|
||||||
|
"messages": [{"role": "user", "content": "hi"}],
|
||||||
|
"max_tokens": 1,
|
||||||
|
"temperature": 0,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
latency = round((time.monotonic() - t0) * 1000)
|
||||||
|
if 200 <= r.status_code < 300:
|
||||||
|
return ProbeResult(ok=True, at=now_iso, latency_ms=latency, note=f"model={model_id}")
|
||||||
|
return ProbeResult(
|
||||||
|
ok=False,
|
||||||
|
at=now_iso,
|
||||||
|
latency_ms=latency,
|
||||||
|
error=f"HTTP {r.status_code}: {r.text[:240]}",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
return ProbeResult(ok=False, at=now_iso, error=f"{type(e).__name__}: {e}")
|
||||||
|
|
||||||
|
# ---- orchestration --------------------------------------------------
|
||||||
|
|
||||||
|
PROBES = {
|
||||||
|
"parakeet": "probe_parakeet",
|
||||||
|
"magpie": "probe_magpie",
|
||||||
|
"vllm": "probe_vllm",
|
||||||
|
}
|
||||||
|
|
||||||
|
async def run_one(self, service: str) -> ProbeResult:
|
||||||
|
fn = getattr(self, self.PROBES[service])
|
||||||
|
result: ProbeResult = await fn()
|
||||||
|
st = self.state[service]
|
||||||
|
prev_ok = st.last.ok if st.last else None
|
||||||
|
st.last = result
|
||||||
|
if result.ok:
|
||||||
|
st.last_ok_at = result.at
|
||||||
|
|
||||||
|
# Log to connectivity history: every failure, plus the first success
|
||||||
|
# after a failure (recovery), plus the first probe ever — but skip
|
||||||
|
# the "still ok" steady-state to keep the log readable.
|
||||||
|
if not result.ok:
|
||||||
|
record_report(
|
||||||
|
service,
|
||||||
|
ok=False,
|
||||||
|
source="deep-health",
|
||||||
|
detail=result.error[:240],
|
||||||
|
latency_ms=result.latency_ms,
|
||||||
|
)
|
||||||
|
elif prev_ok is False:
|
||||||
|
record_report(
|
||||||
|
service,
|
||||||
|
ok=True,
|
||||||
|
source="deep-health",
|
||||||
|
detail="recovered" + (f" — {result.note}" if result.note else ""),
|
||||||
|
latency_ms=result.latency_ms,
|
||||||
|
)
|
||||||
|
elif prev_ok is None:
|
||||||
|
record_report(
|
||||||
|
service,
|
||||||
|
ok=True,
|
||||||
|
source="deep-health",
|
||||||
|
detail="first probe ok" + (f" — {result.note}" if result.note else ""),
|
||||||
|
latency_ms=result.latency_ms,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Maybe auto-restart
|
||||||
|
if not result.ok and _looks_like_wedge(result.error):
|
||||||
|
await self._maybe_restart(service, result.error)
|
||||||
|
return result
|
||||||
|
|
||||||
|
async def _maybe_restart(self, service: str, error: str) -> None:
|
||||||
|
# No restarts during the boot grace period.
|
||||||
|
if time.monotonic() - self._boot_at < STARTUP_GRACE_SEC:
|
||||||
|
return
|
||||||
|
st = self.state[service]
|
||||||
|
now = time.monotonic()
|
||||||
|
st.restarts = [t for t in st.restarts if now - t < RESTART_RATE_WINDOW_SEC]
|
||||||
|
if st.restarts and now - st.restarts[-1] < RESTART_COOLDOWN_SEC:
|
||||||
|
return # already restarted recently, give it time
|
||||||
|
if len(st.restarts) >= RESTART_RATE_LIMIT:
|
||||||
|
record_report(
|
||||||
|
service,
|
||||||
|
ok=False,
|
||||||
|
source="deep-health",
|
||||||
|
detail=f"rate-limited; not auto-restarting (would be #{len(st.restarts)+1} in 30 min)",
|
||||||
|
)
|
||||||
|
return
|
||||||
|
services = services_from_settings(self.settings)
|
||||||
|
if service not in services:
|
||||||
|
return
|
||||||
|
svc = services[service]
|
||||||
|
if not svc.host or not svc.user:
|
||||||
|
return
|
||||||
|
result = await run_action(self.settings, svc, "restart")
|
||||||
|
st.restarts.append(now)
|
||||||
|
ok = result.get("ok", False)
|
||||||
|
record_report(
|
||||||
|
service,
|
||||||
|
ok=False,
|
||||||
|
source="deep-health",
|
||||||
|
detail=f"auto-restart triggered (wedge: {error[:120]}); restart {'OK' if ok else 'FAILED'}",
|
||||||
|
)
|
||||||
|
|
||||||
|
async def run_all(self) -> dict[str, ProbeResult]:
|
||||||
|
results = {}
|
||||||
|
for name in self.PROBES:
|
||||||
|
results[name] = await self.run_one(name)
|
||||||
|
return results
|
||||||
|
|
||||||
|
async def run_periodic(self) -> None:
|
||||||
|
"""Long-running loop. Cancel via .stop()."""
|
||||||
|
# Brief initial wait to let app finish startup
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(self._stop.wait(), timeout=10.0)
|
||||||
|
return
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
pass
|
||||||
|
while not self._stop.is_set():
|
||||||
|
try:
|
||||||
|
await self.run_all()
|
||||||
|
except Exception:
|
||||||
|
# Never let the loop die; the periodic check is best-effort
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(self._stop.wait(), timeout=self.interval_sec)
|
||||||
|
return
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
def stop(self) -> None:
|
||||||
|
self._stop.set()
|
||||||
|
|
||||||
|
def summary(self) -> dict:
|
||||||
|
out = {}
|
||||||
|
for name, st in self.state.items():
|
||||||
|
last = st.last
|
||||||
|
out[name] = {
|
||||||
|
"last_ok_at": st.last_ok_at,
|
||||||
|
"last": (
|
||||||
|
{
|
||||||
|
"ok": last.ok,
|
||||||
|
"at": last.at,
|
||||||
|
"latency_ms": last.latency_ms,
|
||||||
|
"error": last.error,
|
||||||
|
"note": last.note,
|
||||||
|
}
|
||||||
|
if last
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
"auto_restarts_window": len(st.restarts),
|
||||||
|
}
|
||||||
|
return out
|
||||||
@@ -0,0 +1,130 @@
|
|||||||
|
"""On-disk presence + deletion for Hugging Face model caches on the Sparks.
|
||||||
|
|
||||||
|
The HF cache layout for a repo `org/name` is:
|
||||||
|
|
||||||
|
~/.cache/huggingface/hub/models--org--name/
|
||||||
|
|
||||||
|
We use `du -sb` to measure size (bytes) and `rm -rf` to free it. All operations
|
||||||
|
are gated by the server endpoints, which refuse to delete a currently-loaded
|
||||||
|
model or one tied to an in-flight swap/download.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
|
import shlex
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from .config import Settings
|
||||||
|
from .ssh import ssh_run
|
||||||
|
|
||||||
|
|
||||||
|
def repo_to_cache_dirname(repo: str) -> str:
|
||||||
|
"""Convert 'org/name' to 'models--org--name' (the HF hub cache directory)."""
|
||||||
|
if "/" not in repo:
|
||||||
|
raise ValueError(f"repo must be in 'org/name' form: {repo!r}")
|
||||||
|
return "models--" + repo.replace("/", "--")
|
||||||
|
|
||||||
|
|
||||||
|
def _cache_path(repo: str) -> str:
|
||||||
|
"""Full remote path to the model's cache directory."""
|
||||||
|
# Use $HOME so it resolves correctly regardless of the SSH user's home.
|
||||||
|
return f"$HOME/.cache/huggingface/hub/{repo_to_cache_dirname(repo)}"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class HostDiskResult:
|
||||||
|
host: str
|
||||||
|
on_disk: bool
|
||||||
|
size_bytes: int = 0
|
||||||
|
error: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class DiskStatus:
|
||||||
|
repo: str
|
||||||
|
on_disk: bool # True if present on AT LEAST one host
|
||||||
|
total_bytes: int # sum across hosts
|
||||||
|
per_host: list[HostDiskResult]
|
||||||
|
|
||||||
|
|
||||||
|
async def probe_host(host: str, user: str, repo: str, settings: Settings) -> HostDiskResult:
|
||||||
|
"""Return whether the model's cache dir exists on this host and its size."""
|
||||||
|
if not host or not user:
|
||||||
|
return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
|
||||||
|
path = _cache_path(repo)
|
||||||
|
# `du -sb` prints bytes; if the dir doesn't exist, `du` returns non-zero.
|
||||||
|
# We test existence explicitly first so we can report on_disk=False cleanly.
|
||||||
|
cmd = (
|
||||||
|
f"if [ -d {shlex.quote(path)} ]; then "
|
||||||
|
f"du -sb {shlex.quote(path)} 2>/dev/null | awk '{{print $1}}'; "
|
||||||
|
f"else echo MISSING; fi"
|
||||||
|
)
|
||||||
|
rc, out, err = await ssh_run(host, user, cmd, settings, timeout=20.0)
|
||||||
|
if rc != 0:
|
||||||
|
return HostDiskResult(host=host, on_disk=False, error=(err or out).strip() or f"rc={rc}")
|
||||||
|
raw = out.strip()
|
||||||
|
if raw == "MISSING" or raw == "":
|
||||||
|
return HostDiskResult(host=host, on_disk=False)
|
||||||
|
try:
|
||||||
|
size = int(raw.splitlines()[-1])
|
||||||
|
except ValueError:
|
||||||
|
return HostDiskResult(host=host, on_disk=False, error=f"unparsable du output: {raw!r}")
|
||||||
|
return HostDiskResult(host=host, on_disk=True, size_bytes=size)
|
||||||
|
|
||||||
|
|
||||||
|
async def probe_disk(repo: str, mode: str, settings: Settings) -> DiskStatus:
|
||||||
|
"""Probe one model across the relevant Sparks based on its mode (solo|cluster)."""
|
||||||
|
hosts: list[tuple[str, str]] = [(settings.spark1_host, settings.spark1_user)]
|
||||||
|
if mode == "cluster" and settings.spark2_host:
|
||||||
|
hosts.append((settings.spark2_host, settings.spark2_user))
|
||||||
|
|
||||||
|
results = await asyncio.gather(*(probe_host(h, u, repo, settings) for h, u in hosts))
|
||||||
|
on_disk = any(r.on_disk for r in results)
|
||||||
|
total = sum(r.size_bytes for r in results)
|
||||||
|
return DiskStatus(repo=repo, on_disk=on_disk, total_bytes=total, per_host=list(results))
|
||||||
|
|
||||||
|
|
||||||
|
async def delete_host(host: str, user: str, repo: str, settings: Settings) -> HostDiskResult:
|
||||||
|
"""Probe + rm -rf on one host. Returns bytes freed (0 if the dir wasn't there)."""
|
||||||
|
if not host or not user:
|
||||||
|
return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
|
||||||
|
path = _cache_path(repo)
|
||||||
|
# Safety: hard-code the prefix in the command so a bad `repo` can never escape.
|
||||||
|
# Compute size first, then remove. If absent, still return success (idempotent).
|
||||||
|
cmd = (
|
||||||
|
f"set -e; "
|
||||||
|
f"P={shlex.quote(path)}; "
|
||||||
|
f"if [ -d \"$P\" ]; then "
|
||||||
|
f" SIZE=$(du -sb \"$P\" 2>/dev/null | awk '{{print $1}}'); "
|
||||||
|
f" rm -rf -- \"$P\"; "
|
||||||
|
f" echo FREED $SIZE; "
|
||||||
|
f"else "
|
||||||
|
f" echo FREED 0; "
|
||||||
|
f"fi"
|
||||||
|
)
|
||||||
|
rc, out, err = await ssh_run(host, user, cmd, settings, timeout=120.0)
|
||||||
|
if rc != 0:
|
||||||
|
return HostDiskResult(host=host, on_disk=False, error=(err or out).strip() or f"rc={rc}")
|
||||||
|
# Parse the "FREED N" line
|
||||||
|
freed = 0
|
||||||
|
for line in out.splitlines():
|
||||||
|
parts = line.strip().split()
|
||||||
|
if len(parts) == 2 and parts[0] == "FREED":
|
||||||
|
try:
|
||||||
|
freed = int(parts[1])
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
break
|
||||||
|
return HostDiskResult(host=host, on_disk=False, size_bytes=freed)
|
||||||
|
|
||||||
|
|
||||||
|
async def delete_from_disk(repo: str, mode: str, settings: Settings) -> DiskStatus:
|
||||||
|
"""rm -rf the model's cache dir on the relevant Sparks. Idempotent."""
|
||||||
|
hosts: list[tuple[str, str]] = [(settings.spark1_host, settings.spark1_user)]
|
||||||
|
if mode == "cluster" and settings.spark2_host:
|
||||||
|
hosts.append((settings.spark2_host, settings.spark2_user))
|
||||||
|
|
||||||
|
results = await asyncio.gather(*(delete_host(h, u, repo, settings) for h, u in hosts))
|
||||||
|
total_freed = sum(r.size_bytes for r in results)
|
||||||
|
# After deletion, on_disk should be False on all hosts.
|
||||||
|
return DiskStatus(repo=repo, on_disk=False, total_bytes=total_freed, per_host=list(results))
|
||||||
@@ -12,6 +12,8 @@ from typing import Literal
|
|||||||
from .config import Settings
|
from .config import Settings
|
||||||
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
||||||
from .custom_services import add_custom_service, delete_custom_service
|
from .custom_services import add_custom_service, delete_custom_service
|
||||||
|
from .deep_health import DeepHealth
|
||||||
|
from .disk import delete_from_disk, probe_disk
|
||||||
from .download import DownloadManager
|
from .download import DownloadManager
|
||||||
from .hardware import HardwareProbe
|
from .hardware import HardwareProbe
|
||||||
from .health import check_magpie, check_parakeet, check_vllm
|
from .health import check_magpie, check_parakeet, check_vllm
|
||||||
@@ -22,6 +24,7 @@ from .services import docker_state, run_action, services_from_settings
|
|||||||
from .ssh import ssh_run
|
from .ssh import ssh_run
|
||||||
from .swap import SwapManager
|
from .swap import SwapManager
|
||||||
from .updates import UpdateManager, get_update_status
|
from .updates import UpdateManager, get_update_status
|
||||||
|
from .validate import validate_launch
|
||||||
from .wol import send_local_broadcast, send_via_peer
|
from .wol import send_local_broadcast, send_via_peer
|
||||||
|
|
||||||
|
|
||||||
@@ -32,9 +35,22 @@ download_manager = DownloadManager(settings)
|
|||||||
update_manager = UpdateManager(settings)
|
update_manager = UpdateManager(settings)
|
||||||
hardware_probe = HardwareProbe(settings)
|
hardware_probe = HardwareProbe(settings)
|
||||||
nim_manager = NimManager(settings)
|
nim_manager = NimManager(settings)
|
||||||
|
deep_health = DeepHealth(settings)
|
||||||
|
|
||||||
app = FastAPI(title="spark-control", version="0.1.0")
|
app = FastAPI(title="spark-control", version="0.1.0")
|
||||||
|
|
||||||
|
|
||||||
|
@app.on_event("startup")
|
||||||
|
async def _start_deep_health() -> None:
|
||||||
|
# Fire-and-forget; the loop catches its own exceptions.
|
||||||
|
asyncio.create_task(deep_health.run_periodic())
|
||||||
|
|
||||||
|
|
||||||
|
@app.on_event("shutdown")
|
||||||
|
async def _stop_deep_health() -> None:
|
||||||
|
deep_health.stop()
|
||||||
|
|
||||||
|
|
||||||
_STATIC_DIR = Path(__file__).resolve().parent / "static"
|
_STATIC_DIR = Path(__file__).resolve().parent / "static"
|
||||||
app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")
|
app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")
|
||||||
|
|
||||||
@@ -124,6 +140,89 @@ async def del_model(key: str) -> dict:
|
|||||||
return {"ok": True, "key": key}
|
return {"ok": True, "key": key}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/models/disk-status")
|
||||||
|
async def get_models_disk_status() -> dict:
|
||||||
|
"""Probe each catalog model's HF cache on the appropriate Spark(s) in parallel.
|
||||||
|
|
||||||
|
Result is keyed by model key: {on_disk, total_bytes, per_host:[{host,on_disk,size_bytes,error?}]}.
|
||||||
|
Designed to be called once on dashboard load; takes ~1–3s depending on Spark count.
|
||||||
|
"""
|
||||||
|
if not settings.configured:
|
||||||
|
return {"configured": False, "models": {}}
|
||||||
|
keys = list(catalog.models.keys())
|
||||||
|
statuses = await asyncio.gather(*(
|
||||||
|
probe_disk(catalog.models[k].repo, catalog.models[k].mode, settings) for k in keys
|
||||||
|
), return_exceptions=True)
|
||||||
|
out: dict[str, dict] = {}
|
||||||
|
for k, s in zip(keys, statuses):
|
||||||
|
if isinstance(s, Exception):
|
||||||
|
out[k] = {"on_disk": False, "total_bytes": 0, "per_host": [], "error": str(s)}
|
||||||
|
continue
|
||||||
|
out[k] = {
|
||||||
|
"on_disk": s.on_disk,
|
||||||
|
"total_bytes": s.total_bytes,
|
||||||
|
"per_host": [
|
||||||
|
{"host": r.host, "on_disk": r.on_disk, "size_bytes": r.size_bytes, **({"error": r.error} if r.error else {})}
|
||||||
|
for r in s.per_host
|
||||||
|
],
|
||||||
|
}
|
||||||
|
return {"configured": True, "models": out}
|
||||||
|
|
||||||
|
|
||||||
|
@app.delete("/api/models/{key}/disk")
|
||||||
|
async def del_model_disk(key: str) -> dict:
|
||||||
|
"""Delete a model's weights from the Spark filesystem(s). The catalog entry stays.
|
||||||
|
|
||||||
|
Safety rails:
|
||||||
|
- Refuses if the model is currently loaded on vLLM.
|
||||||
|
- Refuses if a swap or download is in flight.
|
||||||
|
- Idempotent: if the cache dir is already gone on a host, that host reports 0 bytes freed.
|
||||||
|
"""
|
||||||
|
if key not in catalog.models:
|
||||||
|
raise HTTPException(404, f"unknown model: {key}")
|
||||||
|
m = catalog.models[key]
|
||||||
|
|
||||||
|
# Refuse if currently loaded
|
||||||
|
try:
|
||||||
|
vllm = await check_vllm(settings)
|
||||||
|
except Exception:
|
||||||
|
vllm = {}
|
||||||
|
if vllm.get("ok") and vllm.get("current_model") == m.repo:
|
||||||
|
raise HTTPException(
|
||||||
|
409,
|
||||||
|
f"'{m.display_name}' is the currently loaded model. Switch to a different model first, then try again."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Refuse if a swap is in flight
|
||||||
|
if swap_manager.current_job_id:
|
||||||
|
raise HTTPException(409, "a model swap is in progress; wait for it to finish")
|
||||||
|
|
||||||
|
# Refuse if a download is in flight for this same repo (a different model's download is fine)
|
||||||
|
if download_manager.current_job_id:
|
||||||
|
job = download_manager.get(download_manager.current_job_id)
|
||||||
|
if job and job.repo == m.repo:
|
||||||
|
raise HTTPException(409, "this model is currently downloading; cancel or wait for it to finish")
|
||||||
|
|
||||||
|
status = await delete_from_disk(m.repo, m.mode, settings)
|
||||||
|
# Audit log
|
||||||
|
record_report(
|
||||||
|
f"disk:{key}",
|
||||||
|
ok=True,
|
||||||
|
source="disk-delete",
|
||||||
|
detail=f"freed {status.total_bytes} bytes across {len(status.per_host)} host(s)",
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"ok": True,
|
||||||
|
"key": key,
|
||||||
|
"repo": m.repo,
|
||||||
|
"bytes_freed": status.total_bytes,
|
||||||
|
"per_host": [
|
||||||
|
{"host": r.host, "size_bytes": r.size_bytes, **({"error": r.error} if r.error else {})}
|
||||||
|
for r in status.per_host
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/hardware")
|
@app.get("/api/hardware")
|
||||||
async def get_hardware() -> dict:
|
async def get_hardware() -> dict:
|
||||||
"""Per-Spark hardware snapshot — RAM, disk, GPU mem + util, CPU load, uptime."""
|
"""Per-Spark hardware snapshot — RAM, disk, GPU mem + util, CPU load, uptime."""
|
||||||
@@ -136,6 +235,27 @@ async def get_connectivity() -> dict:
|
|||||||
return connectivity_summary()
|
return connectivity_summary()
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/deep-health")
|
||||||
|
async def get_deep_health() -> dict:
|
||||||
|
"""Last result + auto-restart counters for each service's synthetic probe."""
|
||||||
|
return deep_health.summary()
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/deep-health/{service}/run")
|
||||||
|
async def run_deep_health(service: str) -> dict:
|
||||||
|
"""Manually run a single service's deep-health probe right now."""
|
||||||
|
if service not in deep_health.PROBES:
|
||||||
|
raise HTTPException(404, f"unknown service: {service}")
|
||||||
|
result = await deep_health.run_one(service)
|
||||||
|
return {
|
||||||
|
"ok": result.ok,
|
||||||
|
"at": result.at,
|
||||||
|
"latency_ms": result.latency_ms,
|
||||||
|
"error": result.error,
|
||||||
|
"note": result.note,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class HealthEventBody(BaseModel):
|
class HealthEventBody(BaseModel):
|
||||||
service: str # e.g. "parakeet", "magpie", "vllm"
|
service: str # e.g. "parakeet", "magpie", "vllm"
|
||||||
ok: bool # true on success, false on failure
|
ok: bool # true on success, false on failure
|
||||||
@@ -434,6 +554,15 @@ class SwapRequest(BaseModel):
|
|||||||
dry_run: bool = False
|
dry_run: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/swap/{key}/validate")
|
||||||
|
async def validate_swap(key: str) -> dict:
|
||||||
|
"""Pre-flight check: run vLLM's argparse layer against the proposed launch
|
||||||
|
command WITHOUT starting an engine. Cheap (~5 s) and doesn't disturb the
|
||||||
|
currently-loaded model.
|
||||||
|
"""
|
||||||
|
return await validate_launch(key, catalog, settings)
|
||||||
|
|
||||||
|
|
||||||
@app.post("/api/swap")
|
@app.post("/api/swap")
|
||||||
async def post_swap(req: SwapRequest) -> dict:
|
async def post_swap(req: SwapRequest) -> dict:
|
||||||
if not settings.configured and not req.dry_run:
|
if not settings.configured and not req.dry_run:
|
||||||
|
|||||||
@@ -17,6 +17,9 @@ const state = {
|
|||||||
config: {},
|
config: {},
|
||||||
configured: true,
|
configured: true,
|
||||||
timer_handle: null,
|
timer_handle: null,
|
||||||
|
deep_health: {},
|
||||||
|
disk_status: {}, // keyed by model key: { on_disk, total_bytes, per_host }
|
||||||
|
disk_status_loaded: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
const el = (sel) => document.querySelector(sel);
|
const el = (sel) => document.querySelector(sel);
|
||||||
@@ -56,12 +59,36 @@ function renderCards() {
|
|||||||
? `<div class="desc">${escapeHtml(m.description)}</div>`
|
? `<div class="desc">${escapeHtml(m.description)}</div>`
|
||||||
: '';
|
: '';
|
||||||
const customPill = m.custom ? `<span class="tag custom-pill">custom</span>` : '';
|
const customPill = m.custom ? `<span class="tag custom-pill">custom</span>` : '';
|
||||||
|
// Disk-presence pill + trash button. Until /api/models/disk-status comes back,
|
||||||
|
// we don't know — render a neutral placeholder.
|
||||||
|
const disk = state.disk_status[key];
|
||||||
|
let diskPill = '';
|
||||||
|
if (state.disk_status_loaded) {
|
||||||
|
if (disk && disk.on_disk) {
|
||||||
|
const gb = (disk.total_bytes / 1e9);
|
||||||
|
diskPill = `<span class="tag on-disk" title="Weights present on disk">on disk · ${gb.toFixed(1)} GB</span>`;
|
||||||
|
} else {
|
||||||
|
diskPill = `<span class="tag not-on-disk" title="Weights not downloaded">not downloaded</span>`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Trash button — hidden if not on disk; disabled (with tooltip) if currently loaded.
|
||||||
|
let trashBtn = '';
|
||||||
|
if (state.disk_status_loaded && disk && disk.on_disk) {
|
||||||
|
const disabled = isActive || isSwapping;
|
||||||
|
const tip = isActive
|
||||||
|
? 'Currently loaded — switch to another model first'
|
||||||
|
: isSwapping
|
||||||
|
? 'A swap is in progress'
|
||||||
|
: 'Delete weights from disk';
|
||||||
|
trashBtn = `<button class="icon-btn danger" data-disk-del-key="${key}" title="${escapeHtml(tip)}" aria-label="Delete from disk" ${disabled ? 'disabled' : ''}>${trashIcon}</button>`;
|
||||||
|
}
|
||||||
card.innerHTML = `
|
card.innerHTML = `
|
||||||
<div class="name">${escapeHtml(m.display_name)}</div>
|
<div class="name">${escapeHtml(m.display_name)}</div>
|
||||||
<div class="meta">
|
<div class="meta">
|
||||||
<span class="tag mode-${m.mode}">${m.mode}</span>
|
<span class="tag mode-${m.mode}">${m.mode}</span>
|
||||||
<span class="tag">${m.size_gb} GB</span>
|
<span class="tag">${m.size_gb} GB</span>
|
||||||
${customPill}
|
${customPill}
|
||||||
|
${diskPill}
|
||||||
${(m.capabilities || []).map(c => `<span class="tag cap">${escapeHtml(c)}</span>`).join('')}
|
${(m.capabilities || []).map(c => `<span class="tag cap">${escapeHtml(c)}</span>`).join('')}
|
||||||
</div>
|
</div>
|
||||||
${desc}
|
${desc}
|
||||||
@@ -73,8 +100,11 @@ function renderCards() {
|
|||||||
<button class="btn ${isActive ? '' : 'primary'}" data-swap-key="${key}" ${isActive || isSwapping ? 'disabled' : ''}>
|
<button class="btn ${isActive ? '' : 'primary'}" data-swap-key="${key}" ${isActive || isSwapping ? 'disabled' : ''}>
|
||||||
${isActive ? 'Current' : 'Switch to this'}
|
${isActive ? 'Current' : 'Switch to this'}
|
||||||
</button>
|
</button>
|
||||||
|
<button class="btn test-btn" data-test-key="${key}" title="Pre-flight check the launch command without starting the engine">Test</button>
|
||||||
<button class="btn adv-btn" data-adv-key="${key}" title="Advanced settings">Advanced</button>
|
<button class="btn adv-btn" data-adv-key="${key}" title="Advanced settings">Advanced</button>
|
||||||
|
${trashBtn}
|
||||||
</div>
|
</div>
|
||||||
|
<div class="test-result hidden" data-test-result-for="${key}"></div>
|
||||||
`;
|
`;
|
||||||
root.appendChild(card);
|
root.appendChild(card);
|
||||||
}
|
}
|
||||||
@@ -84,6 +114,42 @@ function renderCards() {
|
|||||||
for (const btn of root.querySelectorAll('[data-adv-key]')) {
|
for (const btn of root.querySelectorAll('[data-adv-key]')) {
|
||||||
btn.addEventListener('click', () => openAdvanced(btn.dataset.advKey));
|
btn.addEventListener('click', () => openAdvanced(btn.dataset.advKey));
|
||||||
}
|
}
|
||||||
|
for (const btn of root.querySelectorAll('[data-test-key]')) {
|
||||||
|
btn.addEventListener('click', () => testLaunch(btn.dataset.testKey, btn));
|
||||||
|
}
|
||||||
|
for (const btn of root.querySelectorAll('[data-disk-del-key]')) {
|
||||||
|
btn.addEventListener('click', () => openDiskDeleteDialog(btn.dataset.diskDelKey));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const trashIcon = '<svg viewBox="0 0 24 24" width="14" height="14" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"><polyline points="3 6 5 6 21 6"></polyline><path d="M19 6l-1 14a2 2 0 0 1-2 2H8a2 2 0 0 1-2-2L5 6"></path><path d="M10 11v6"></path><path d="M14 11v6"></path><path d="M9 6V4a2 2 0 0 1 2-2h2a2 2 0 0 1 2 2v2"></path></svg>';
|
||||||
|
|
||||||
|
async function testLaunch(key, btn) {
|
||||||
|
const resultEl = document.querySelector(`[data-test-result-for="${key}"]`);
|
||||||
|
if (!resultEl) return;
|
||||||
|
const originalText = btn.textContent;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Testing…';
|
||||||
|
resultEl.classList.remove('hidden', 'ok', 'fail');
|
||||||
|
resultEl.innerHTML = '<span class="muted small">Checking launch args against vLLM\'s parser…</span>';
|
||||||
|
try {
|
||||||
|
const r = await fetchJSON(`/api/swap/${encodeURIComponent(key)}/validate`, { method: 'POST' });
|
||||||
|
if (r.ok) {
|
||||||
|
resultEl.classList.add('ok');
|
||||||
|
resultEl.innerHTML = `<span class="ok-mark">✓</span> Launch args parse OK. <span class="muted small">(Doesn't guarantee runtime success — only catches argparse-level issues.)</span>`;
|
||||||
|
} else {
|
||||||
|
resultEl.classList.add('fail');
|
||||||
|
const err = escapeHtml(r.error || 'unknown error');
|
||||||
|
const stage = r.stage ? ` <span class="muted small">(${escapeHtml(r.stage)})</span>` : '';
|
||||||
|
resultEl.innerHTML = `<span class="fail-mark">✗</span> Would fail: ${err}${stage}`;
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
resultEl.classList.add('fail');
|
||||||
|
resultEl.innerHTML = `<span class="fail-mark">✗</span> Test failed: ${escapeHtml(e.message)}`;
|
||||||
|
} finally {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = originalText;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function renderCurrent(status) {
|
function renderCurrent(status) {
|
||||||
@@ -380,6 +446,35 @@ async function renderServices() {
|
|||||||
const restartsRow = s.restart_count != null && s.restart_count > 1
|
const restartsRow = s.restart_count != null && s.restart_count > 1
|
||||||
? `<div class="row"><span class="k">Restarts</span><span class="v">${s.restart_count}</span></div>`
|
? `<div class="row"><span class="k">Restarts</span><span class="v">${s.restart_count}</span></div>`
|
||||||
: '';
|
: '';
|
||||||
|
const dh = state.deep_health?.[name];
|
||||||
|
let deepRow = '';
|
||||||
|
if (dh && dh.last) {
|
||||||
|
const last = dh.last;
|
||||||
|
const when = (last.at || '').slice(11, 19); // HH:MM:SS
|
||||||
|
const verdict = last.ok
|
||||||
|
? `<span class="dh-ok">deep check ok</span>`
|
||||||
|
: `<span class="dh-fail">deep check FAILED</span>`;
|
||||||
|
const lat = last.latency_ms != null ? ` <span class="muted">${last.latency_ms} ms</span>` : '';
|
||||||
|
const restarts = dh.auto_restarts_window > 0
|
||||||
|
? ` <span class="muted">· ${dh.auto_restarts_window} auto-restart${dh.auto_restarts_window === 1 ? '' : 's'} in 30 min</span>`
|
||||||
|
: '';
|
||||||
|
deepRow = `
|
||||||
|
<div class="row deep-row">
|
||||||
|
<span class="k">Deep</span>
|
||||||
|
<span class="v deep-v">${verdict} <span class="muted small">${escapeHtml(when)}</span>${lat}${restarts}</span>
|
||||||
|
<button class="icon-btn dh-run-btn" data-dh-run="${escapeHtml(name)}" title="Run deep check now">↻</button>
|
||||||
|
</div>
|
||||||
|
${last.ok ? '' : `<div class="deep-error muted small">${escapeHtml((last.error || last.note || '').slice(0, 200))}</div>`}
|
||||||
|
`;
|
||||||
|
} else if (dh) {
|
||||||
|
deepRow = `
|
||||||
|
<div class="row deep-row">
|
||||||
|
<span class="k">Deep</span>
|
||||||
|
<span class="v muted-v">no probe yet</span>
|
||||||
|
<button class="icon-btn dh-run-btn" data-dh-run="${escapeHtml(name)}" title="Run deep check now">↻</button>
|
||||||
|
</div>
|
||||||
|
`;
|
||||||
|
}
|
||||||
card.innerHTML = `
|
card.innerHTML = `
|
||||||
<div class="head">
|
<div class="head">
|
||||||
<span class="name">${escapeHtml(name)}</span>
|
<span class="name">${escapeHtml(name)}</span>
|
||||||
@@ -390,6 +485,7 @@ async function renderServices() {
|
|||||||
${urlRow}
|
${urlRow}
|
||||||
${modelRow}
|
${modelRow}
|
||||||
${restartsRow}
|
${restartsRow}
|
||||||
|
${deepRow}
|
||||||
<div class="service-actions">
|
<div class="service-actions">
|
||||||
<button class="btn" data-svc-action="${name}:start" ${disable('start') ? 'disabled' : ''}>Start</button>
|
<button class="btn" data-svc-action="${name}:start" ${disable('start') ? 'disabled' : ''}>Start</button>
|
||||||
<button class="btn" data-svc-action="${name}:restart" ${disable('restart') ? 'disabled' : ''}>Restart</button>
|
<button class="btn" data-svc-action="${name}:restart" ${disable('restart') ? 'disabled' : ''}>Restart</button>
|
||||||
@@ -401,6 +497,25 @@ async function renderServices() {
|
|||||||
for (const btn of grid.querySelectorAll('.btn[data-svc-action]')) {
|
for (const btn of grid.querySelectorAll('.btn[data-svc-action]')) {
|
||||||
btn.addEventListener('click', () => onServiceAction(btn.dataset.svcAction));
|
btn.addEventListener('click', () => onServiceAction(btn.dataset.svcAction));
|
||||||
}
|
}
|
||||||
|
for (const btn of grid.querySelectorAll('[data-dh-run]')) {
|
||||||
|
btn.addEventListener('click', () => onDeepHealthRun(btn.dataset.dhRun, btn));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function onDeepHealthRun(name, btn) {
|
||||||
|
btn.disabled = true;
|
||||||
|
const orig = btn.textContent;
|
||||||
|
btn.textContent = '…';
|
||||||
|
try {
|
||||||
|
await fetchJSON(`/api/deep-health/${encodeURIComponent(name)}/run`, { method: 'POST' });
|
||||||
|
} catch (e) {
|
||||||
|
console.warn('deep-health run failed', e);
|
||||||
|
} finally {
|
||||||
|
try { state.deep_health = await fetchJSON('/api/deep-health'); } catch {}
|
||||||
|
btn.textContent = orig;
|
||||||
|
btn.disabled = false;
|
||||||
|
renderServices();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function onServiceAction(key) {
|
async function onServiceAction(key) {
|
||||||
@@ -635,6 +750,7 @@ async function pollStatus() {
|
|||||||
// Refresh services state lazily — every 5s poll triggers this too.
|
// Refresh services state lazily — every 5s poll triggers this too.
|
||||||
try {
|
try {
|
||||||
state.services = await fetchJSON('/api/services');
|
state.services = await fetchJSON('/api/services');
|
||||||
|
try { state.deep_health = await fetchJSON('/api/deep-health'); } catch {}
|
||||||
renderServices();
|
renderServices();
|
||||||
} catch {}
|
} catch {}
|
||||||
if (status.current_swap_job && status.current_swap_job !== state.swap_job_id) {
|
if (status.current_swap_job && status.current_swap_job !== state.swap_job_id) {
|
||||||
@@ -655,6 +771,78 @@ async function loadModels() {
|
|||||||
state.models = data.models || {};
|
state.models = data.models || {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function loadDiskStatus() {
|
||||||
|
// Probes each catalog model's HF cache over SSH; takes a beat. Best-effort.
|
||||||
|
try {
|
||||||
|
const r = await fetchJSON('/api/models/disk-status');
|
||||||
|
if (r && r.models) {
|
||||||
|
state.disk_status = r.models;
|
||||||
|
state.disk_status_loaded = true;
|
||||||
|
renderCards();
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// Silent — pills just won't render. Don't block dashboard.
|
||||||
|
console.warn('disk-status probe failed:', e.message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function fmtBytesShort(n) {
|
||||||
|
if (!Number.isFinite(n) || n <= 0) return '0 B';
|
||||||
|
if (n >= 1e9) return `${(n / 1e9).toFixed(1)} GB`;
|
||||||
|
if (n >= 1e6) return `${(n / 1e6).toFixed(1)} MB`;
|
||||||
|
if (n >= 1e3) return `${(n / 1e3).toFixed(1)} KB`;
|
||||||
|
return `${n} B`;
|
||||||
|
}
|
||||||
|
|
||||||
|
function openDiskDeleteDialog(key) {
|
||||||
|
const m = state.models[key];
|
||||||
|
const disk = state.disk_status[key];
|
||||||
|
if (!m || !disk || !disk.on_disk) return;
|
||||||
|
const dlg = el('#disk-delete-dialog');
|
||||||
|
el('#dd-summary').innerHTML = `Free <strong>${fmtBytesShort(disk.total_bytes)}</strong> by removing <strong>${escapeHtml(m.display_name)}</strong> (<code>${escapeHtml(m.repo)}</code>) from disk.`;
|
||||||
|
const hostsEl = el('#dd-hosts');
|
||||||
|
hostsEl.innerHTML = '';
|
||||||
|
for (const h of (disk.per_host || [])) {
|
||||||
|
if (!h.on_disk) continue;
|
||||||
|
const li = document.createElement('li');
|
||||||
|
li.innerHTML = `<code>${escapeHtml(h.host)}</code> — ${fmtBytesShort(h.size_bytes)}`;
|
||||||
|
hostsEl.appendChild(li);
|
||||||
|
}
|
||||||
|
const errEl = el('#dd-error');
|
||||||
|
errEl.classList.add('hidden');
|
||||||
|
errEl.textContent = '';
|
||||||
|
|
||||||
|
const confirm = el('#dd-confirm');
|
||||||
|
const cancel = el('#dd-cancel');
|
||||||
|
const onCancel = () => dlg.close();
|
||||||
|
const onConfirm = async () => {
|
||||||
|
confirm.disabled = true;
|
||||||
|
cancel.disabled = true;
|
||||||
|
confirm.textContent = 'Deleting…';
|
||||||
|
try {
|
||||||
|
const r = await fetchJSON(`/api/models/${encodeURIComponent(key)}/disk`, { method: 'DELETE' });
|
||||||
|
dlg.close();
|
||||||
|
// Optimistically clear local disk state for this key, then refresh.
|
||||||
|
delete state.disk_status[key];
|
||||||
|
renderCards();
|
||||||
|
// Eagerly re-probe so size is accurate (and shows "not downloaded" pill).
|
||||||
|
loadDiskStatus();
|
||||||
|
const freed = r && typeof r.bytes_freed === 'number' ? fmtBytesShort(r.bytes_freed) : '';
|
||||||
|
console.log(`Deleted ${m.display_name} from disk${freed ? ` — freed ${freed}` : ''}.`);
|
||||||
|
} catch (e) {
|
||||||
|
errEl.textContent = e.message || 'Delete failed';
|
||||||
|
errEl.classList.remove('hidden');
|
||||||
|
} finally {
|
||||||
|
confirm.disabled = false;
|
||||||
|
cancel.disabled = false;
|
||||||
|
confirm.textContent = 'Delete from disk';
|
||||||
|
}
|
||||||
|
};
|
||||||
|
cancel.onclick = onCancel;
|
||||||
|
confirm.onclick = onConfirm;
|
||||||
|
dlg.showModal();
|
||||||
|
}
|
||||||
|
|
||||||
async function triggerSwap(modelKey) {
|
async function triggerSwap(modelKey) {
|
||||||
if (state.swap_job_id) return;
|
if (state.swap_job_id) return;
|
||||||
try {
|
try {
|
||||||
@@ -1439,9 +1627,12 @@ async function init() {
|
|||||||
await renderServices();
|
await renderServices();
|
||||||
pollHardware();
|
pollHardware();
|
||||||
pollUpdates();
|
pollUpdates();
|
||||||
|
// Disk-status probe runs after first paint — slow over SSH and not blocking.
|
||||||
|
loadDiskStatus();
|
||||||
setInterval(pollStatus, 5000);
|
setInterval(pollStatus, 5000);
|
||||||
setInterval(pollHardware, 8000); // every 8s
|
setInterval(pollHardware, 8000); // every 8s
|
||||||
setInterval(pollUpdates, 300000); // every 5 min
|
setInterval(pollUpdates, 300000); // every 5 min
|
||||||
|
setInterval(loadDiskStatus, 60000); // every 60s — disk state changes rarely
|
||||||
}
|
}
|
||||||
|
|
||||||
init();
|
init();
|
||||||
|
|||||||
@@ -188,6 +188,20 @@
|
|||||||
</form>
|
</form>
|
||||||
</dialog>
|
</dialog>
|
||||||
|
|
||||||
|
<dialog id="disk-delete-dialog" class="modal">
|
||||||
|
<form method="dialog" class="modal-form">
|
||||||
|
<h3>Delete model weights from disk?</h3>
|
||||||
|
<p id="dd-summary" class="muted small"></p>
|
||||||
|
<ul class="muted small dd-hosts" id="dd-hosts"></ul>
|
||||||
|
<p class="muted small">This is reversible — you can re-download from the catalog at any time. The catalog entry stays intact.</p>
|
||||||
|
<p id="dd-error" class="muted small dd-error hidden"></p>
|
||||||
|
<div class="modal-actions">
|
||||||
|
<button type="button" id="dd-cancel" class="btn">Cancel</button>
|
||||||
|
<button type="button" id="dd-confirm" class="btn danger">Delete from disk</button>
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
</dialog>
|
||||||
|
|
||||||
<dialog id="advanced-dialog" class="modal">
|
<dialog id="advanced-dialog" class="modal">
|
||||||
<form method="dialog" class="modal-form" id="advanced-form">
|
<form method="dialog" class="modal-form" id="advanced-form">
|
||||||
<h3 id="adv-title">Advanced settings</h3>
|
<h3 id="adv-title">Advanced settings</h3>
|
||||||
|
|||||||
@@ -622,6 +622,19 @@ main {
|
|||||||
.service-card .row .v.copyable.copied { outline: 1px solid var(--accent); background: rgba(74, 222, 128, 0.05); }
|
.service-card .row .v.copyable.copied { outline: 1px solid var(--accent); background: rgba(74, 222, 128, 0.05); }
|
||||||
.service-card .row .icon-btn { padding: 3px 6px; }
|
.service-card .row .icon-btn { padding: 3px 6px; }
|
||||||
.service-card .row .icon-btn svg { width: 12px; height: 12px; }
|
.service-card .row .icon-btn svg { width: 12px; height: 12px; }
|
||||||
|
.service-card .deep-row .deep-v { display: flex; align-items: center; gap: 6px; font-family: inherit; flex-wrap: wrap; }
|
||||||
|
.service-card .dh-ok { color: var(--accent); }
|
||||||
|
.service-card .dh-fail { color: var(--error); font-weight: 500; }
|
||||||
|
.service-card .dh-run-btn { font-family: inherit; }
|
||||||
|
.service-card .deep-error {
|
||||||
|
padding: 4px 8px;
|
||||||
|
background: rgba(239, 68, 68, 0.06);
|
||||||
|
border-left: 2px solid var(--error);
|
||||||
|
border-radius: 4px;
|
||||||
|
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
|
||||||
|
font-size: 11px;
|
||||||
|
word-break: break-word;
|
||||||
|
}
|
||||||
|
|
||||||
.service-actions {
|
.service-actions {
|
||||||
display: flex;
|
display: flex;
|
||||||
@@ -701,8 +714,31 @@ main {
|
|||||||
.card.active .btn { background: rgba(74, 222, 128, 0.12); color: var(--accent); border-color: rgba(74, 222, 128, 0.4); }
|
.card.active .btn { background: rgba(74, 222, 128, 0.12); color: var(--accent); border-color: rgba(74, 222, 128, 0.4); }
|
||||||
.card-actions { display: flex; gap: 6px; }
|
.card-actions { display: flex; gap: 6px; }
|
||||||
.card-actions .btn.primary { flex: 1; }
|
.card-actions .btn.primary { flex: 1; }
|
||||||
.card .adv-btn { padding: 8px 12px; font-size: 12px; }
|
.card .adv-btn,
|
||||||
|
.card .test-btn { padding: 8px 12px; font-size: 12px; }
|
||||||
.card .custom-pill { color: var(--info); border-color: rgba(96, 165, 250, 0.4); }
|
.card .custom-pill { color: var(--info); border-color: rgba(96, 165, 250, 0.4); }
|
||||||
|
.tag.on-disk { color: var(--accent); border-color: rgba(74, 222, 128, 0.4); }
|
||||||
|
.tag.not-on-disk { color: var(--muted); border-color: var(--border); opacity: 0.7; }
|
||||||
|
.card-actions .icon-btn.danger { color: var(--error); border-color: rgba(239, 68, 68, 0.3); margin-left: auto; }
|
||||||
|
.card-actions .icon-btn.danger:hover:not(:disabled) { background: rgba(239, 68, 68, 0.08); border-color: var(--error); color: var(--error); }
|
||||||
|
.card-actions .icon-btn.danger:disabled { opacity: 0.35; cursor: not-allowed; }
|
||||||
|
.dd-hosts { padding-left: 18px; margin: 4px 0 8px; }
|
||||||
|
.dd-hosts code { background: var(--surface-2); padding: 1px 5px; border-radius: 4px; }
|
||||||
|
.dd-error { color: var(--error); }
|
||||||
|
|
||||||
|
.test-result {
|
||||||
|
font-size: 12px;
|
||||||
|
line-height: 1.45;
|
||||||
|
padding: 8px 10px;
|
||||||
|
border-radius: 5px;
|
||||||
|
margin-top: 4px;
|
||||||
|
border: 1px solid var(--border);
|
||||||
|
background: var(--surface-2);
|
||||||
|
}
|
||||||
|
.test-result.ok { border-color: rgba(74, 222, 128, 0.4); background: rgba(74, 222, 128, 0.04); }
|
||||||
|
.test-result.fail { border-color: rgba(239, 68, 68, 0.45); background: rgba(239, 68, 68, 0.06); word-break: break-word; }
|
||||||
|
.test-result .ok-mark { color: var(--accent); font-weight: 600; }
|
||||||
|
.test-result .fail-mark { color: var(--error); font-weight: 600; }
|
||||||
|
|
||||||
.footer {
|
.footer {
|
||||||
margin-top: 28px;
|
margin-top: 28px;
|
||||||
|
|||||||
@@ -0,0 +1,137 @@
|
|||||||
|
"""Pre-flight validation of a proposed vLLM launch command.
|
||||||
|
|
||||||
|
Runs vLLM's own argparse layer (EngineArgs) inside the vllm_node container WITHOUT
|
||||||
|
starting the engine. Catches:
|
||||||
|
|
||||||
|
* unknown flag names (typos)
|
||||||
|
* bad types / values that argparse rejects
|
||||||
|
* deprecated flags removed in the installed vLLM version
|
||||||
|
|
||||||
|
Does NOT catch (these surface only during real engine init):
|
||||||
|
* model-architecture-specific constraints (e.g. Qwen3.6 Mamba block_size)
|
||||||
|
* OOM at weight-loading time
|
||||||
|
* Triton / CUDA-kernel compatibility errors
|
||||||
|
|
||||||
|
A pre-flight check that returns "ok" is therefore NOT a guarantee — but a
|
||||||
|
"failed" verdict is a definitive 'don't bother with the real swap'.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import json
|
||||||
|
import shlex
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from .config import Settings
|
||||||
|
from .models import Catalog, build_launch_command
|
||||||
|
from .ssh import ssh_run
|
||||||
|
|
||||||
|
|
||||||
|
# Validates the proposed args against the same combined parser vLLM uses for
|
||||||
|
# `vllm serve` (engine args + server args + frontend args). Returns one JSON
|
||||||
|
# line on stdout: {"ok": true, ...} or {"ok": false, ...}.
|
||||||
|
_VALIDATOR_SCRIPT = r"""
|
||||||
|
import argparse, json, sys
|
||||||
|
|
||||||
|
# Mirror what `vllm serve` does internally: FlexibleArgumentParser (which is
|
||||||
|
# more lenient about dashes vs underscores) wrapped with make_arg_parser
|
||||||
|
# (which adds engine + server + frontend args).
|
||||||
|
parser = None
|
||||||
|
try:
|
||||||
|
# Newer vLLM path
|
||||||
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
|
except Exception:
|
||||||
|
try:
|
||||||
|
# Older fallback
|
||||||
|
from vllm.engine.arg_utils import FlexibleArgumentParser
|
||||||
|
except Exception:
|
||||||
|
FlexibleArgumentParser = argparse.ArgumentParser # type: ignore
|
||||||
|
|
||||||
|
try:
|
||||||
|
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||||
|
parser = make_arg_parser(FlexibleArgumentParser(add_help=False))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
if parser is None:
|
||||||
|
try:
|
||||||
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
|
parser = FlexibleArgumentParser(add_help=False)
|
||||||
|
EngineArgs.add_cli_args(parser)
|
||||||
|
except Exception as e:
|
||||||
|
print(json.dumps({"ok": False, "stage": "import", "error": f"{type(e).__name__}: {e}"}))
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
class _ArgError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def _err(message):
|
||||||
|
raise _ArgError(message)
|
||||||
|
|
||||||
|
parser.error = _err # capture argparse errors instead of sys.exit(2)
|
||||||
|
|
||||||
|
try:
|
||||||
|
raw = sys.stdin.read()
|
||||||
|
arglist = json.loads(raw)
|
||||||
|
ns = parser.parse_args(arglist)
|
||||||
|
print(json.dumps({"ok": True, "model": getattr(ns, "model", None)}))
|
||||||
|
except _ArgError as e:
|
||||||
|
print(json.dumps({"ok": False, "stage": "parse", "error": str(e)}))
|
||||||
|
except SystemExit as e:
|
||||||
|
print(json.dumps({"ok": False, "stage": "parse", "error": f"argparse exit {e.code}"}))
|
||||||
|
except Exception as e:
|
||||||
|
print(json.dumps({"ok": False, "stage": "parse", "error": f"{type(e).__name__}: {e}"}))
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _vllm_arg_list(key: str, model_def, catalog: Catalog) -> list[str]:
|
||||||
|
"""Reconstruct the args list passed to `vllm serve` (without the positional model)."""
|
||||||
|
cmd = build_launch_command(key, model_def, catalog.defaults)
|
||||||
|
# build_launch_command yields:
|
||||||
|
# ./launch-cluster.sh [--solo] -d exec vllm serve <repo> <args...>
|
||||||
|
# We just want the bits after `vllm serve <repo>`.
|
||||||
|
tokens = shlex.split(cmd)
|
||||||
|
if "serve" not in tokens:
|
||||||
|
return []
|
||||||
|
i = tokens.index("serve")
|
||||||
|
after = tokens[i + 1 :] # repo, then args
|
||||||
|
if not after:
|
||||||
|
return []
|
||||||
|
args = after[1:] # drop the repo
|
||||||
|
# EngineArgs expects --model=REPO rather than positional, so prepend it.
|
||||||
|
return [f"--model={after[0]}", *args]
|
||||||
|
|
||||||
|
|
||||||
|
async def validate_launch(key: str, catalog: Catalog, settings: Settings) -> dict:
|
||||||
|
if key not in catalog.models:
|
||||||
|
return {"ok": False, "stage": "lookup", "error": f"unknown model: {key}"}
|
||||||
|
if not settings.spark1_host or not settings.spark1_user:
|
||||||
|
return {"ok": False, "stage": "config", "error": "spark1 not configured"}
|
||||||
|
|
||||||
|
model = catalog.models[key]
|
||||||
|
arg_list = _vllm_arg_list(key, model, catalog)
|
||||||
|
if not arg_list:
|
||||||
|
return {"ok": False, "stage": "build", "error": "failed to build args list"}
|
||||||
|
|
||||||
|
payload = json.dumps(arg_list).replace("'", "'\\''")
|
||||||
|
# Pipe the JSON args list to a here-doc Python invocation. The validator
|
||||||
|
# reads from stdin to avoid shell-escaping the args themselves.
|
||||||
|
cmd = (
|
||||||
|
f"echo '{payload}' | docker exec -i vllm_node python3 -c "
|
||||||
|
+ shlex.quote(_VALIDATOR_SCRIPT)
|
||||||
|
)
|
||||||
|
|
||||||
|
rc, out, err = await ssh_run(settings.spark1_host, settings.spark1_user, cmd, settings, timeout=20)
|
||||||
|
if rc != 0 and not out.strip():
|
||||||
|
return {
|
||||||
|
"ok": False,
|
||||||
|
"stage": "ssh",
|
||||||
|
"error": err.strip() or f"rc={rc}",
|
||||||
|
"cmd_args": arg_list,
|
||||||
|
"launch_cmd": build_launch_command(key, model, catalog.defaults),
|
||||||
|
}
|
||||||
|
last = out.strip().splitlines()[-1] if out.strip() else ""
|
||||||
|
try:
|
||||||
|
result: dict[str, Any] = json.loads(last)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
result = {"ok": False, "stage": "decode", "error": "validator did not return JSON", "raw": out[-500:]}
|
||||||
|
result["cmd_args"] = arg_list
|
||||||
|
result["launch_cmd"] = build_launch_command(key, model, catalog.defaults)
|
||||||
|
return result
|
||||||
@@ -30,6 +30,7 @@ models:
|
|||||||
- -tp=2
|
- -tp=2
|
||||||
- --distributed-executor-backend=ray
|
- --distributed-executor-backend=ray
|
||||||
- --max-model-len=32768
|
- --max-model-len=32768
|
||||||
|
- --max-num-batched-tokens=16384
|
||||||
|
|
||||||
gemma4:
|
gemma4:
|
||||||
display_name: "Gemma 4 31B"
|
display_name: "Gemma 4 31B"
|
||||||
@@ -45,6 +46,7 @@ models:
|
|||||||
vllm_args:
|
vllm_args:
|
||||||
- --gpu-memory-utilization=0.8
|
- --gpu-memory-utilization=0.8
|
||||||
- --max-model-len=32768
|
- --max-model-len=32768
|
||||||
|
- --max-num-batched-tokens=16384
|
||||||
- --reasoning-parser=gemma4
|
- --reasoning-parser=gemma4
|
||||||
- --tool-call-parser=gemma4
|
- --tool-call-parser=gemma4
|
||||||
- --enable-auto-tool-choice
|
- --enable-auto-tool-choice
|
||||||
@@ -66,6 +68,7 @@ models:
|
|||||||
vllm_args:
|
vllm_args:
|
||||||
- --gpu-memory-utilization=0.85
|
- --gpu-memory-utilization=0.85
|
||||||
- --max-model-len=65536
|
- --max-model-len=65536
|
||||||
|
- --max-num-batched-tokens=16384
|
||||||
- --reasoning-parser=qwen3
|
- --reasoning-parser=qwen3
|
||||||
- --moe_backend=flashinfer_cutlass
|
- --moe_backend=flashinfer_cutlass
|
||||||
- --load-format=fastsafetensors
|
- --load-format=fastsafetensors
|
||||||
|
|||||||
@@ -20,6 +20,14 @@ The trick is the `docker run --rm alpine chown` — it runs as root inside the t
|
|||||||
|
|
||||||
This flag is Blackwell-specific. If vLLM in the container reports `unrecognized arguments: --moe_backend` or similar, edit `models.yaml` for `qwen36` and drop that flag. The swap UI does NOT auto-fallback in v0.1 — failure surfaces in the log stream.
|
This flag is Blackwell-specific. If vLLM in the container reports `unrecognized arguments: --moe_backend` or similar, edit `models.yaml` for `qwen36` and drop that flag. The swap UI does NOT auto-fallback in v0.1 — failure surfaces in the log stream.
|
||||||
|
|
||||||
|
## Qwen3.6 Mamba block-size assertion (fixed in v0.6.0:1)
|
||||||
|
|
||||||
|
Qwen3.6 uses a Mamba-attention hybrid that requires `--max-num-batched-tokens >= 2096`. vLLM's default is 2048, which trips `AssertionError: In Mamba cache align mode, block_size (2096) must be <= max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into the bundled qwen36 entry — matches the upstream qwen3.5-35b-a3b-fp8 recipe.
|
||||||
|
|
||||||
|
## Multimodal token budget for vision models (fixed in v0.8.0:1)
|
||||||
|
|
||||||
|
After the eugr/spark-vllm-docker update, vLLM became stricter about multimodal token budgets. Vision-capable models like Gemma 4 31B and Qwen3-VL crash at engine init with `ValueError: Chunked MM input disabled but max_tokens_per_mm_item (2496) is larger than max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into every model that has the `vision` capability. Now applied to qwen3-vl, gemma4, and qwen36 (which was already set for the Mamba issue).
|
||||||
|
|
||||||
## Two SSH paths to Spark 1 from the laptop
|
## Two SSH paths to Spark 1 from the laptop
|
||||||
|
|
||||||
`ssh <spark-user>@<spark-1-ip>` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `<spark-1-host>.local`. Always use the `.local` hostname or `<spark-2-ip>`-style entries that ARE matched.
|
`ssh <spark-user>@<spark-1-ip>` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `<spark-1-host>.local`. Always use the `.local` hostname or `<spark-2-ip>`-style entries that ARE matched.
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
export const v0_1_0 = VersionInfo.of({
|
export const v0_1_0 = VersionInfo.of({
|
||||||
version: '0.6.0:0',
|
version: '0.8.1:0',
|
||||||
releaseNotes: {
|
releaseNotes: {
|
||||||
en_US:
|
en_US:
|
||||||
'v0.6: Service-level connectivity tracking and a passive failure-report endpoint. The connectivity log now records up/down transitions for Parakeet, Magpie, and vLLM in addition to the Spark hosts (driven by the existing /api/status and /api/services polling). A new POST /api/health-event endpoint lets external apps (e.g. Open WebUI) record failures they observed even when the failure was brief enough to slip between polls. The Connectivity log dialog shows hosts and services with separate badges, and reports appear inline with their source app + error detail.',
|
'v0.8.1: model weights can now be deleted from disk directly from the dashboard. Each model card shows whether the weights are present (with on-disk GB size) or not yet downloaded. When present and the model is NOT currently loaded, a small trash icon appears on the card; clicking it pops a confirmation showing how many GB will be freed and on which Spark(s), then runs `rm -rf` on the Hugging Face cache directory via SSH. Cluster-mode models are deleted from both Sparks; solo-mode from Spark 1 only. Safety rails: refuses to delete the currently-loaded model, refuses during an in-flight swap or download, and the catalog entry stays — you can always re-download. Disk status is probed once on dashboard load and re-checked every 60s.',
|
||||||
},
|
},
|
||||||
migrations: {
|
migrations: {
|
||||||
up: async ({ effects }) => {},
|
up: async ({ effects }) => {},
|
||||||
|
|||||||
Reference in New Issue
Block a user