Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9ff7ee9c1e | |||
| 1602b3b3b4 | |||
| 8ac455f5f5 | |||
| 000c55febe |
@@ -0,0 +1,363 @@
|
|||||||
|
"""Deep health probes for each service.
|
||||||
|
|
||||||
|
Why this exists: Triton's /health endpoint returns 200 as long as the HTTP
|
||||||
|
layer is alive and the model is registered. It does NOT verify that the CUDA
|
||||||
|
context inside the worker process is healthy. We've observed Parakeet getting
|
||||||
|
its CUDA context wedged after an OOM, where /health stays green but every
|
||||||
|
real transcription returns 500 cudaErrorUnknown.
|
||||||
|
|
||||||
|
So this module sends *real* but tiny synthetic inference requests:
|
||||||
|
- Parakeet: 1 second of digital silence (16 kHz mono PCM, in-memory WAV)
|
||||||
|
- Magpie: short text-to-speech, response audio discarded
|
||||||
|
- vLLM: 1-token chat completion against whatever model is loaded
|
||||||
|
|
||||||
|
All synthetic payloads are generated on demand into BytesIO, sent over HTTP,
|
||||||
|
and never touched the filesystem (on either spark-control's side or the
|
||||||
|
target service's side beyond normal Triton/Riva working memory).
|
||||||
|
|
||||||
|
When a probe fails with a signal that looks like a CUDA wedge, we
|
||||||
|
automatically issue `docker restart <container>`. Rate-limited to 3 restarts
|
||||||
|
per service per 30 minutes to avoid restart loops.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
|
import io
|
||||||
|
import time
|
||||||
|
import wave
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from .config import Settings
|
||||||
|
from .connectivity import record_report
|
||||||
|
from .services import ServiceDef, run_action, services_from_settings
|
||||||
|
|
||||||
|
|
||||||
|
# Default 5-minute interval, controllable via env. Sub-minute is silly for a
|
||||||
|
# heavy synthetic probe; we just want to catch wedges within a reasonable
|
||||||
|
# window — much faster than the user noticing on their next real call.
|
||||||
|
DEFAULT_INTERVAL_SEC = 300.0
|
||||||
|
PROBE_TIMEOUT_SEC = 20.0
|
||||||
|
RESTART_RATE_LIMIT = 3 # max auto-restarts per service
|
||||||
|
RESTART_RATE_WINDOW_SEC = 1800.0 # within a 30-min window
|
||||||
|
RESTART_COOLDOWN_SEC = 120.0 # don't restart again within this many seconds of the last one
|
||||||
|
STARTUP_GRACE_SEC = 60.0 # don't auto-restart for the first minute after this app boots
|
||||||
|
|
||||||
|
|
||||||
|
def _silence_wav(seconds: float = 1.0, sample_rate: int = 16000) -> io.BytesIO:
|
||||||
|
"""Return an in-memory WAV file containing `seconds` of digital silence."""
|
||||||
|
n_frames = int(seconds * sample_rate)
|
||||||
|
buf = io.BytesIO()
|
||||||
|
with wave.open(buf, "wb") as w:
|
||||||
|
w.setnchannels(1)
|
||||||
|
w.setsampwidth(2) # int16
|
||||||
|
w.setframerate(sample_rate)
|
||||||
|
w.writeframes(b"\x00\x00" * n_frames)
|
||||||
|
buf.seek(0)
|
||||||
|
return buf
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_wedge(error: str) -> bool:
|
||||||
|
"""Heuristic: does this error string look like a stuck CUDA context that
|
||||||
|
a container restart would clear? We want to be conservative — only act
|
||||||
|
on signals we're confident about, otherwise leave the user in charge."""
|
||||||
|
err = (error or "").lower()
|
||||||
|
needles = [
|
||||||
|
"cudaerrorunknown",
|
||||||
|
"cuda error: unknown",
|
||||||
|
"cuda kernel errors",
|
||||||
|
"internal server error",
|
||||||
|
"engine core initialization failed",
|
||||||
|
"503", # service unavailable from a dependency
|
||||||
|
"500", # generic 5xx with a body that may not parse
|
||||||
|
]
|
||||||
|
return any(n in err for n in needles)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ProbeResult:
|
||||||
|
ok: bool
|
||||||
|
at: str
|
||||||
|
latency_ms: Optional[int] = None
|
||||||
|
error: str = ""
|
||||||
|
note: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ServiceState:
|
||||||
|
last: Optional[ProbeResult] = None
|
||||||
|
last_ok_at: Optional[str] = None
|
||||||
|
restarts: list[float] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
class DeepHealth:
|
||||||
|
def __init__(self, settings: Settings, interval_sec: float = DEFAULT_INTERVAL_SEC) -> None:
|
||||||
|
self.settings = settings
|
||||||
|
self.interval_sec = interval_sec
|
||||||
|
self.state: dict[str, ServiceState] = {
|
||||||
|
"parakeet": ServiceState(),
|
||||||
|
"magpie": ServiceState(),
|
||||||
|
"vllm": ServiceState(),
|
||||||
|
}
|
||||||
|
self._stop = asyncio.Event()
|
||||||
|
self._boot_at = time.monotonic()
|
||||||
|
|
||||||
|
# ---- probes ---------------------------------------------------------
|
||||||
|
|
||||||
|
async def probe_parakeet(self) -> ProbeResult:
|
||||||
|
s = self.settings
|
||||||
|
now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||||
|
if not s.parakeet_host:
|
||||||
|
return ProbeResult(ok=False, at=now_iso, error="not configured")
|
||||||
|
url = f"http://{s.parakeet_host}:{s.parakeet_port}/v1/audio/transcriptions"
|
||||||
|
wav = _silence_wav(1.0)
|
||||||
|
t0 = time.monotonic()
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
|
||||||
|
r = await c.post(
|
||||||
|
url,
|
||||||
|
files={"file": ("probe.wav", wav, "audio/wav")},
|
||||||
|
data={"model": "parakeet-tdt-0.6b-v3"},
|
||||||
|
)
|
||||||
|
latency = round((time.monotonic() - t0) * 1000)
|
||||||
|
if 200 <= r.status_code < 300:
|
||||||
|
return ProbeResult(ok=True, at=now_iso, latency_ms=latency)
|
||||||
|
return ProbeResult(
|
||||||
|
ok=False,
|
||||||
|
at=now_iso,
|
||||||
|
latency_ms=latency,
|
||||||
|
error=f"HTTP {r.status_code}: {r.text[:240]}",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
return ProbeResult(ok=False, at=now_iso, error=f"{type(e).__name__}: {e}")
|
||||||
|
|
||||||
|
async def probe_magpie(self) -> ProbeResult:
|
||||||
|
s = self.settings
|
||||||
|
now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||||
|
if not s.magpie_host:
|
||||||
|
return ProbeResult(ok=False, at=now_iso, error="not configured")
|
||||||
|
# Magpie /v1/audio/synthesize expects multipart form-data, not JSON.
|
||||||
|
# The (None, value) tuple in httpx's `files=` produces a non-file form field.
|
||||||
|
url = f"http://{s.magpie_host}:{s.magpie_port}/v1/audio/synthesize"
|
||||||
|
form: dict = {"text": (None, "hi"), "language": (None, "en-US")}
|
||||||
|
t0 = time.monotonic()
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
|
||||||
|
r = await c.post(url, files=form)
|
||||||
|
latency = round((time.monotonic() - t0) * 1000)
|
||||||
|
if 200 <= r.status_code < 300:
|
||||||
|
return ProbeResult(ok=True, at=now_iso, latency_ms=latency)
|
||||||
|
# 4xx that aren't 5xx mean server is alive but our payload is off —
|
||||||
|
# don't classify as wedge.
|
||||||
|
if 400 <= r.status_code < 500:
|
||||||
|
return ProbeResult(
|
||||||
|
ok=True,
|
||||||
|
at=now_iso,
|
||||||
|
latency_ms=latency,
|
||||||
|
note=f"{r.status_code} — server alive (probe payload may need a voice name)",
|
||||||
|
)
|
||||||
|
return ProbeResult(
|
||||||
|
ok=False,
|
||||||
|
at=now_iso,
|
||||||
|
latency_ms=latency,
|
||||||
|
error=f"HTTP {r.status_code}: {r.text[:240]}",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
return ProbeResult(ok=False, at=now_iso, error=f"{type(e).__name__}: {e}")
|
||||||
|
|
||||||
|
async def probe_vllm(self) -> ProbeResult:
|
||||||
|
s = self.settings
|
||||||
|
now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||||
|
if not s.spark1_host:
|
||||||
|
return ProbeResult(ok=False, at=now_iso, error="not configured")
|
||||||
|
base = f"http://{s.spark1_host}:{s.vllm_port}"
|
||||||
|
# Step 1: is there a model loaded?
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(timeout=5.0) as c:
|
||||||
|
r = await c.get(f"{base}/v1/models")
|
||||||
|
if 200 <= r.status_code < 300:
|
||||||
|
models = r.json().get("data") or []
|
||||||
|
else:
|
||||||
|
# 5xx on /v1/models suggests something wedged after a model loaded
|
||||||
|
return ProbeResult(
|
||||||
|
ok=False,
|
||||||
|
at=now_iso,
|
||||||
|
error=f"list_models HTTP {r.status_code}: {r.text[:240]}",
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
# Connection refused / timeout: usually means no vLLM process listening
|
||||||
|
# (the vllm_node container is alive but no `vllm serve` is running yet).
|
||||||
|
# That's an idle state, not a wedge — don't trigger auto-restart.
|
||||||
|
return ProbeResult(
|
||||||
|
ok=True,
|
||||||
|
at=now_iso,
|
||||||
|
note="no model currently loaded (idle)",
|
||||||
|
)
|
||||||
|
|
||||||
|
if not models:
|
||||||
|
return ProbeResult(
|
||||||
|
ok=True,
|
||||||
|
at=now_iso,
|
||||||
|
note="no model currently loaded (idle)",
|
||||||
|
)
|
||||||
|
|
||||||
|
model_id = models[0]["id"]
|
||||||
|
# Step 2: model is loaded; verify it can actually complete a 1-token request.
|
||||||
|
t0 = time.monotonic()
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
|
||||||
|
r = await c.post(
|
||||||
|
f"{base}/v1/chat/completions",
|
||||||
|
json={
|
||||||
|
"model": model_id,
|
||||||
|
"messages": [{"role": "user", "content": "hi"}],
|
||||||
|
"max_tokens": 1,
|
||||||
|
"temperature": 0,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
latency = round((time.monotonic() - t0) * 1000)
|
||||||
|
if 200 <= r.status_code < 300:
|
||||||
|
return ProbeResult(ok=True, at=now_iso, latency_ms=latency, note=f"model={model_id}")
|
||||||
|
return ProbeResult(
|
||||||
|
ok=False,
|
||||||
|
at=now_iso,
|
||||||
|
latency_ms=latency,
|
||||||
|
error=f"HTTP {r.status_code}: {r.text[:240]}",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
return ProbeResult(ok=False, at=now_iso, error=f"{type(e).__name__}: {e}")
|
||||||
|
|
||||||
|
# ---- orchestration --------------------------------------------------
|
||||||
|
|
||||||
|
PROBES = {
|
||||||
|
"parakeet": "probe_parakeet",
|
||||||
|
"magpie": "probe_magpie",
|
||||||
|
"vllm": "probe_vllm",
|
||||||
|
}
|
||||||
|
|
||||||
|
async def run_one(self, service: str) -> ProbeResult:
|
||||||
|
fn = getattr(self, self.PROBES[service])
|
||||||
|
result: ProbeResult = await fn()
|
||||||
|
st = self.state[service]
|
||||||
|
prev_ok = st.last.ok if st.last else None
|
||||||
|
st.last = result
|
||||||
|
if result.ok:
|
||||||
|
st.last_ok_at = result.at
|
||||||
|
|
||||||
|
# Log to connectivity history: every failure, plus the first success
|
||||||
|
# after a failure (recovery), plus the first probe ever — but skip
|
||||||
|
# the "still ok" steady-state to keep the log readable.
|
||||||
|
if not result.ok:
|
||||||
|
record_report(
|
||||||
|
service,
|
||||||
|
ok=False,
|
||||||
|
source="deep-health",
|
||||||
|
detail=result.error[:240],
|
||||||
|
latency_ms=result.latency_ms,
|
||||||
|
)
|
||||||
|
elif prev_ok is False:
|
||||||
|
record_report(
|
||||||
|
service,
|
||||||
|
ok=True,
|
||||||
|
source="deep-health",
|
||||||
|
detail="recovered" + (f" — {result.note}" if result.note else ""),
|
||||||
|
latency_ms=result.latency_ms,
|
||||||
|
)
|
||||||
|
elif prev_ok is None:
|
||||||
|
record_report(
|
||||||
|
service,
|
||||||
|
ok=True,
|
||||||
|
source="deep-health",
|
||||||
|
detail="first probe ok" + (f" — {result.note}" if result.note else ""),
|
||||||
|
latency_ms=result.latency_ms,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Maybe auto-restart
|
||||||
|
if not result.ok and _looks_like_wedge(result.error):
|
||||||
|
await self._maybe_restart(service, result.error)
|
||||||
|
return result
|
||||||
|
|
||||||
|
async def _maybe_restart(self, service: str, error: str) -> None:
|
||||||
|
# No restarts during the boot grace period.
|
||||||
|
if time.monotonic() - self._boot_at < STARTUP_GRACE_SEC:
|
||||||
|
return
|
||||||
|
st = self.state[service]
|
||||||
|
now = time.monotonic()
|
||||||
|
st.restarts = [t for t in st.restarts if now - t < RESTART_RATE_WINDOW_SEC]
|
||||||
|
if st.restarts and now - st.restarts[-1] < RESTART_COOLDOWN_SEC:
|
||||||
|
return # already restarted recently, give it time
|
||||||
|
if len(st.restarts) >= RESTART_RATE_LIMIT:
|
||||||
|
record_report(
|
||||||
|
service,
|
||||||
|
ok=False,
|
||||||
|
source="deep-health",
|
||||||
|
detail=f"rate-limited; not auto-restarting (would be #{len(st.restarts)+1} in 30 min)",
|
||||||
|
)
|
||||||
|
return
|
||||||
|
services = services_from_settings(self.settings)
|
||||||
|
if service not in services:
|
||||||
|
return
|
||||||
|
svc = services[service]
|
||||||
|
if not svc.host or not svc.user:
|
||||||
|
return
|
||||||
|
result = await run_action(self.settings, svc, "restart")
|
||||||
|
st.restarts.append(now)
|
||||||
|
ok = result.get("ok", False)
|
||||||
|
record_report(
|
||||||
|
service,
|
||||||
|
ok=False,
|
||||||
|
source="deep-health",
|
||||||
|
detail=f"auto-restart triggered (wedge: {error[:120]}); restart {'OK' if ok else 'FAILED'}",
|
||||||
|
)
|
||||||
|
|
||||||
|
async def run_all(self) -> dict[str, ProbeResult]:
|
||||||
|
results = {}
|
||||||
|
for name in self.PROBES:
|
||||||
|
results[name] = await self.run_one(name)
|
||||||
|
return results
|
||||||
|
|
||||||
|
async def run_periodic(self) -> None:
|
||||||
|
"""Long-running loop. Cancel via .stop()."""
|
||||||
|
# Brief initial wait to let app finish startup
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(self._stop.wait(), timeout=10.0)
|
||||||
|
return
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
pass
|
||||||
|
while not self._stop.is_set():
|
||||||
|
try:
|
||||||
|
await self.run_all()
|
||||||
|
except Exception:
|
||||||
|
# Never let the loop die; the periodic check is best-effort
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(self._stop.wait(), timeout=self.interval_sec)
|
||||||
|
return
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
def stop(self) -> None:
|
||||||
|
self._stop.set()
|
||||||
|
|
||||||
|
def summary(self) -> dict:
|
||||||
|
out = {}
|
||||||
|
for name, st in self.state.items():
|
||||||
|
last = st.last
|
||||||
|
out[name] = {
|
||||||
|
"last_ok_at": st.last_ok_at,
|
||||||
|
"last": (
|
||||||
|
{
|
||||||
|
"ok": last.ok,
|
||||||
|
"at": last.at,
|
||||||
|
"latency_ms": last.latency_ms,
|
||||||
|
"error": last.error,
|
||||||
|
"note": last.note,
|
||||||
|
}
|
||||||
|
if last
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
"auto_restarts_window": len(st.restarts),
|
||||||
|
}
|
||||||
|
return out
|
||||||
@@ -0,0 +1,130 @@
|
|||||||
|
"""On-disk presence + deletion for Hugging Face model caches on the Sparks.
|
||||||
|
|
||||||
|
The HF cache layout for a repo `org/name` is:
|
||||||
|
|
||||||
|
~/.cache/huggingface/hub/models--org--name/
|
||||||
|
|
||||||
|
We use `du -sb` to measure size (bytes) and `rm -rf` to free it. All operations
|
||||||
|
are gated by the server endpoints, which refuse to delete a currently-loaded
|
||||||
|
model or one tied to an in-flight swap/download.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
|
import shlex
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from .config import Settings
|
||||||
|
from .ssh import ssh_run
|
||||||
|
|
||||||
|
|
||||||
|
def repo_to_cache_dirname(repo: str) -> str:
|
||||||
|
"""Convert 'org/name' to 'models--org--name' (the HF hub cache directory)."""
|
||||||
|
if "/" not in repo:
|
||||||
|
raise ValueError(f"repo must be in 'org/name' form: {repo!r}")
|
||||||
|
return "models--" + repo.replace("/", "--")
|
||||||
|
|
||||||
|
|
||||||
|
def _cache_path(repo: str) -> str:
|
||||||
|
"""Full remote path to the model's cache directory."""
|
||||||
|
# Use $HOME so it resolves correctly regardless of the SSH user's home.
|
||||||
|
return f"$HOME/.cache/huggingface/hub/{repo_to_cache_dirname(repo)}"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class HostDiskResult:
|
||||||
|
host: str
|
||||||
|
on_disk: bool
|
||||||
|
size_bytes: int = 0
|
||||||
|
error: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class DiskStatus:
|
||||||
|
repo: str
|
||||||
|
on_disk: bool # True if present on AT LEAST one host
|
||||||
|
total_bytes: int # sum across hosts
|
||||||
|
per_host: list[HostDiskResult]
|
||||||
|
|
||||||
|
|
||||||
|
async def probe_host(host: str, user: str, repo: str, settings: Settings) -> HostDiskResult:
|
||||||
|
"""Return whether the model's cache dir exists on this host and its size."""
|
||||||
|
if not host or not user:
|
||||||
|
return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
|
||||||
|
path = _cache_path(repo)
|
||||||
|
# `du -sb` prints bytes; if the dir doesn't exist, `du` returns non-zero.
|
||||||
|
# We test existence explicitly first so we can report on_disk=False cleanly.
|
||||||
|
cmd = (
|
||||||
|
f"if [ -d {shlex.quote(path)} ]; then "
|
||||||
|
f"du -sb {shlex.quote(path)} 2>/dev/null | awk '{{print $1}}'; "
|
||||||
|
f"else echo MISSING; fi"
|
||||||
|
)
|
||||||
|
rc, out, err = await ssh_run(host, user, cmd, settings, timeout=20.0)
|
||||||
|
if rc != 0:
|
||||||
|
return HostDiskResult(host=host, on_disk=False, error=(err or out).strip() or f"rc={rc}")
|
||||||
|
raw = out.strip()
|
||||||
|
if raw == "MISSING" or raw == "":
|
||||||
|
return HostDiskResult(host=host, on_disk=False)
|
||||||
|
try:
|
||||||
|
size = int(raw.splitlines()[-1])
|
||||||
|
except ValueError:
|
||||||
|
return HostDiskResult(host=host, on_disk=False, error=f"unparsable du output: {raw!r}")
|
||||||
|
return HostDiskResult(host=host, on_disk=True, size_bytes=size)
|
||||||
|
|
||||||
|
|
||||||
|
async def probe_disk(repo: str, mode: str, settings: Settings) -> DiskStatus:
|
||||||
|
"""Probe one model across the relevant Sparks based on its mode (solo|cluster)."""
|
||||||
|
hosts: list[tuple[str, str]] = [(settings.spark1_host, settings.spark1_user)]
|
||||||
|
if mode == "cluster" and settings.spark2_host:
|
||||||
|
hosts.append((settings.spark2_host, settings.spark2_user))
|
||||||
|
|
||||||
|
results = await asyncio.gather(*(probe_host(h, u, repo, settings) for h, u in hosts))
|
||||||
|
on_disk = any(r.on_disk for r in results)
|
||||||
|
total = sum(r.size_bytes for r in results)
|
||||||
|
return DiskStatus(repo=repo, on_disk=on_disk, total_bytes=total, per_host=list(results))
|
||||||
|
|
||||||
|
|
||||||
|
async def delete_host(host: str, user: str, repo: str, settings: Settings) -> HostDiskResult:
|
||||||
|
"""Probe + rm -rf on one host. Returns bytes freed (0 if the dir wasn't there)."""
|
||||||
|
if not host or not user:
|
||||||
|
return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
|
||||||
|
path = _cache_path(repo)
|
||||||
|
# Safety: hard-code the prefix in the command so a bad `repo` can never escape.
|
||||||
|
# Compute size first, then remove. If absent, still return success (idempotent).
|
||||||
|
cmd = (
|
||||||
|
f"set -e; "
|
||||||
|
f"P={shlex.quote(path)}; "
|
||||||
|
f"if [ -d \"$P\" ]; then "
|
||||||
|
f" SIZE=$(du -sb \"$P\" 2>/dev/null | awk '{{print $1}}'); "
|
||||||
|
f" rm -rf -- \"$P\"; "
|
||||||
|
f" echo FREED $SIZE; "
|
||||||
|
f"else "
|
||||||
|
f" echo FREED 0; "
|
||||||
|
f"fi"
|
||||||
|
)
|
||||||
|
rc, out, err = await ssh_run(host, user, cmd, settings, timeout=120.0)
|
||||||
|
if rc != 0:
|
||||||
|
return HostDiskResult(host=host, on_disk=False, error=(err or out).strip() or f"rc={rc}")
|
||||||
|
# Parse the "FREED N" line
|
||||||
|
freed = 0
|
||||||
|
for line in out.splitlines():
|
||||||
|
parts = line.strip().split()
|
||||||
|
if len(parts) == 2 and parts[0] == "FREED":
|
||||||
|
try:
|
||||||
|
freed = int(parts[1])
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
break
|
||||||
|
return HostDiskResult(host=host, on_disk=False, size_bytes=freed)
|
||||||
|
|
||||||
|
|
||||||
|
async def delete_from_disk(repo: str, mode: str, settings: Settings) -> DiskStatus:
|
||||||
|
"""rm -rf the model's cache dir on the relevant Sparks. Idempotent."""
|
||||||
|
hosts: list[tuple[str, str]] = [(settings.spark1_host, settings.spark1_user)]
|
||||||
|
if mode == "cluster" and settings.spark2_host:
|
||||||
|
hosts.append((settings.spark2_host, settings.spark2_user))
|
||||||
|
|
||||||
|
results = await asyncio.gather(*(delete_host(h, u, repo, settings) for h, u in hosts))
|
||||||
|
total_freed = sum(r.size_bytes for r in results)
|
||||||
|
# After deletion, on_disk should be False on all hosts.
|
||||||
|
return DiskStatus(repo=repo, on_disk=False, total_bytes=total_freed, per_host=list(results))
|
||||||
@@ -12,6 +12,8 @@ from typing import Literal
|
|||||||
from .config import Settings
|
from .config import Settings
|
||||||
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
||||||
from .custom_services import add_custom_service, delete_custom_service
|
from .custom_services import add_custom_service, delete_custom_service
|
||||||
|
from .deep_health import DeepHealth
|
||||||
|
from .disk import delete_from_disk, probe_disk
|
||||||
from .download import DownloadManager
|
from .download import DownloadManager
|
||||||
from .hardware import HardwareProbe
|
from .hardware import HardwareProbe
|
||||||
from .health import check_magpie, check_parakeet, check_vllm
|
from .health import check_magpie, check_parakeet, check_vllm
|
||||||
@@ -33,9 +35,22 @@ download_manager = DownloadManager(settings)
|
|||||||
update_manager = UpdateManager(settings)
|
update_manager = UpdateManager(settings)
|
||||||
hardware_probe = HardwareProbe(settings)
|
hardware_probe = HardwareProbe(settings)
|
||||||
nim_manager = NimManager(settings)
|
nim_manager = NimManager(settings)
|
||||||
|
deep_health = DeepHealth(settings)
|
||||||
|
|
||||||
app = FastAPI(title="spark-control", version="0.1.0")
|
app = FastAPI(title="spark-control", version="0.1.0")
|
||||||
|
|
||||||
|
|
||||||
|
@app.on_event("startup")
|
||||||
|
async def _start_deep_health() -> None:
|
||||||
|
# Fire-and-forget; the loop catches its own exceptions.
|
||||||
|
asyncio.create_task(deep_health.run_periodic())
|
||||||
|
|
||||||
|
|
||||||
|
@app.on_event("shutdown")
|
||||||
|
async def _stop_deep_health() -> None:
|
||||||
|
deep_health.stop()
|
||||||
|
|
||||||
|
|
||||||
_STATIC_DIR = Path(__file__).resolve().parent / "static"
|
_STATIC_DIR = Path(__file__).resolve().parent / "static"
|
||||||
app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")
|
app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")
|
||||||
|
|
||||||
@@ -125,6 +140,89 @@ async def del_model(key: str) -> dict:
|
|||||||
return {"ok": True, "key": key}
|
return {"ok": True, "key": key}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/models/disk-status")
|
||||||
|
async def get_models_disk_status() -> dict:
|
||||||
|
"""Probe each catalog model's HF cache on the appropriate Spark(s) in parallel.
|
||||||
|
|
||||||
|
Result is keyed by model key: {on_disk, total_bytes, per_host:[{host,on_disk,size_bytes,error?}]}.
|
||||||
|
Designed to be called once on dashboard load; takes ~1–3s depending on Spark count.
|
||||||
|
"""
|
||||||
|
if not settings.configured:
|
||||||
|
return {"configured": False, "models": {}}
|
||||||
|
keys = list(catalog.models.keys())
|
||||||
|
statuses = await asyncio.gather(*(
|
||||||
|
probe_disk(catalog.models[k].repo, catalog.models[k].mode, settings) for k in keys
|
||||||
|
), return_exceptions=True)
|
||||||
|
out: dict[str, dict] = {}
|
||||||
|
for k, s in zip(keys, statuses):
|
||||||
|
if isinstance(s, Exception):
|
||||||
|
out[k] = {"on_disk": False, "total_bytes": 0, "per_host": [], "error": str(s)}
|
||||||
|
continue
|
||||||
|
out[k] = {
|
||||||
|
"on_disk": s.on_disk,
|
||||||
|
"total_bytes": s.total_bytes,
|
||||||
|
"per_host": [
|
||||||
|
{"host": r.host, "on_disk": r.on_disk, "size_bytes": r.size_bytes, **({"error": r.error} if r.error else {})}
|
||||||
|
for r in s.per_host
|
||||||
|
],
|
||||||
|
}
|
||||||
|
return {"configured": True, "models": out}
|
||||||
|
|
||||||
|
|
||||||
|
@app.delete("/api/models/{key}/disk")
|
||||||
|
async def del_model_disk(key: str) -> dict:
|
||||||
|
"""Delete a model's weights from the Spark filesystem(s). The catalog entry stays.
|
||||||
|
|
||||||
|
Safety rails:
|
||||||
|
- Refuses if the model is currently loaded on vLLM.
|
||||||
|
- Refuses if a swap or download is in flight.
|
||||||
|
- Idempotent: if the cache dir is already gone on a host, that host reports 0 bytes freed.
|
||||||
|
"""
|
||||||
|
if key not in catalog.models:
|
||||||
|
raise HTTPException(404, f"unknown model: {key}")
|
||||||
|
m = catalog.models[key]
|
||||||
|
|
||||||
|
# Refuse if currently loaded
|
||||||
|
try:
|
||||||
|
vllm = await check_vllm(settings)
|
||||||
|
except Exception:
|
||||||
|
vllm = {}
|
||||||
|
if vllm.get("ok") and vllm.get("current_model") == m.repo:
|
||||||
|
raise HTTPException(
|
||||||
|
409,
|
||||||
|
f"'{m.display_name}' is the currently loaded model. Switch to a different model first, then try again."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Refuse if a swap is in flight
|
||||||
|
if swap_manager.current_job_id:
|
||||||
|
raise HTTPException(409, "a model swap is in progress; wait for it to finish")
|
||||||
|
|
||||||
|
# Refuse if a download is in flight for this same repo (a different model's download is fine)
|
||||||
|
if download_manager.current_job_id:
|
||||||
|
job = download_manager.get(download_manager.current_job_id)
|
||||||
|
if job and job.repo == m.repo:
|
||||||
|
raise HTTPException(409, "this model is currently downloading; cancel or wait for it to finish")
|
||||||
|
|
||||||
|
status = await delete_from_disk(m.repo, m.mode, settings)
|
||||||
|
# Audit log
|
||||||
|
record_report(
|
||||||
|
f"disk:{key}",
|
||||||
|
ok=True,
|
||||||
|
source="disk-delete",
|
||||||
|
detail=f"freed {status.total_bytes} bytes across {len(status.per_host)} host(s)",
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"ok": True,
|
||||||
|
"key": key,
|
||||||
|
"repo": m.repo,
|
||||||
|
"bytes_freed": status.total_bytes,
|
||||||
|
"per_host": [
|
||||||
|
{"host": r.host, "size_bytes": r.size_bytes, **({"error": r.error} if r.error else {})}
|
||||||
|
for r in status.per_host
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/hardware")
|
@app.get("/api/hardware")
|
||||||
async def get_hardware() -> dict:
|
async def get_hardware() -> dict:
|
||||||
"""Per-Spark hardware snapshot — RAM, disk, GPU mem + util, CPU load, uptime."""
|
"""Per-Spark hardware snapshot — RAM, disk, GPU mem + util, CPU load, uptime."""
|
||||||
@@ -137,6 +235,27 @@ async def get_connectivity() -> dict:
|
|||||||
return connectivity_summary()
|
return connectivity_summary()
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/deep-health")
|
||||||
|
async def get_deep_health() -> dict:
|
||||||
|
"""Last result + auto-restart counters for each service's synthetic probe."""
|
||||||
|
return deep_health.summary()
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/deep-health/{service}/run")
|
||||||
|
async def run_deep_health(service: str) -> dict:
|
||||||
|
"""Manually run a single service's deep-health probe right now."""
|
||||||
|
if service not in deep_health.PROBES:
|
||||||
|
raise HTTPException(404, f"unknown service: {service}")
|
||||||
|
result = await deep_health.run_one(service)
|
||||||
|
return {
|
||||||
|
"ok": result.ok,
|
||||||
|
"at": result.at,
|
||||||
|
"latency_ms": result.latency_ms,
|
||||||
|
"error": result.error,
|
||||||
|
"note": result.note,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class HealthEventBody(BaseModel):
|
class HealthEventBody(BaseModel):
|
||||||
service: str # e.g. "parakeet", "magpie", "vllm"
|
service: str # e.g. "parakeet", "magpie", "vllm"
|
||||||
ok: bool # true on success, false on failure
|
ok: bool # true on success, false on failure
|
||||||
|
|||||||
@@ -17,6 +17,9 @@ const state = {
|
|||||||
config: {},
|
config: {},
|
||||||
configured: true,
|
configured: true,
|
||||||
timer_handle: null,
|
timer_handle: null,
|
||||||
|
deep_health: {},
|
||||||
|
disk_status: {}, // keyed by model key: { on_disk, total_bytes, per_host }
|
||||||
|
disk_status_loaded: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
const el = (sel) => document.querySelector(sel);
|
const el = (sel) => document.querySelector(sel);
|
||||||
@@ -56,12 +59,36 @@ function renderCards() {
|
|||||||
? `<div class="desc">${escapeHtml(m.description)}</div>`
|
? `<div class="desc">${escapeHtml(m.description)}</div>`
|
||||||
: '';
|
: '';
|
||||||
const customPill = m.custom ? `<span class="tag custom-pill">custom</span>` : '';
|
const customPill = m.custom ? `<span class="tag custom-pill">custom</span>` : '';
|
||||||
|
// Disk-presence pill + trash button. Until /api/models/disk-status comes back,
|
||||||
|
// we don't know — render a neutral placeholder.
|
||||||
|
const disk = state.disk_status[key];
|
||||||
|
let diskPill = '';
|
||||||
|
if (state.disk_status_loaded) {
|
||||||
|
if (disk && disk.on_disk) {
|
||||||
|
const gb = (disk.total_bytes / 1e9);
|
||||||
|
diskPill = `<span class="tag on-disk" title="Weights present on disk">on disk · ${gb.toFixed(1)} GB</span>`;
|
||||||
|
} else {
|
||||||
|
diskPill = `<span class="tag not-on-disk" title="Weights not downloaded">not downloaded</span>`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Trash button — hidden if not on disk; disabled (with tooltip) if currently loaded.
|
||||||
|
let trashBtn = '';
|
||||||
|
if (state.disk_status_loaded && disk && disk.on_disk) {
|
||||||
|
const disabled = isActive || isSwapping;
|
||||||
|
const tip = isActive
|
||||||
|
? 'Currently loaded — switch to another model first'
|
||||||
|
: isSwapping
|
||||||
|
? 'A swap is in progress'
|
||||||
|
: 'Delete weights from disk';
|
||||||
|
trashBtn = `<button class="icon-btn danger" data-disk-del-key="${key}" title="${escapeHtml(tip)}" aria-label="Delete from disk" ${disabled ? 'disabled' : ''}>${trashIcon}</button>`;
|
||||||
|
}
|
||||||
card.innerHTML = `
|
card.innerHTML = `
|
||||||
<div class="name">${escapeHtml(m.display_name)}</div>
|
<div class="name">${escapeHtml(m.display_name)}</div>
|
||||||
<div class="meta">
|
<div class="meta">
|
||||||
<span class="tag mode-${m.mode}">${m.mode}</span>
|
<span class="tag mode-${m.mode}">${m.mode}</span>
|
||||||
<span class="tag">${m.size_gb} GB</span>
|
<span class="tag">${m.size_gb} GB</span>
|
||||||
${customPill}
|
${customPill}
|
||||||
|
${diskPill}
|
||||||
${(m.capabilities || []).map(c => `<span class="tag cap">${escapeHtml(c)}</span>`).join('')}
|
${(m.capabilities || []).map(c => `<span class="tag cap">${escapeHtml(c)}</span>`).join('')}
|
||||||
</div>
|
</div>
|
||||||
${desc}
|
${desc}
|
||||||
@@ -75,6 +102,7 @@ function renderCards() {
|
|||||||
</button>
|
</button>
|
||||||
<button class="btn test-btn" data-test-key="${key}" title="Pre-flight check the launch command without starting the engine">Test</button>
|
<button class="btn test-btn" data-test-key="${key}" title="Pre-flight check the launch command without starting the engine">Test</button>
|
||||||
<button class="btn adv-btn" data-adv-key="${key}" title="Advanced settings">Advanced</button>
|
<button class="btn adv-btn" data-adv-key="${key}" title="Advanced settings">Advanced</button>
|
||||||
|
${trashBtn}
|
||||||
</div>
|
</div>
|
||||||
<div class="test-result hidden" data-test-result-for="${key}"></div>
|
<div class="test-result hidden" data-test-result-for="${key}"></div>
|
||||||
`;
|
`;
|
||||||
@@ -89,8 +117,13 @@ function renderCards() {
|
|||||||
for (const btn of root.querySelectorAll('[data-test-key]')) {
|
for (const btn of root.querySelectorAll('[data-test-key]')) {
|
||||||
btn.addEventListener('click', () => testLaunch(btn.dataset.testKey, btn));
|
btn.addEventListener('click', () => testLaunch(btn.dataset.testKey, btn));
|
||||||
}
|
}
|
||||||
|
for (const btn of root.querySelectorAll('[data-disk-del-key]')) {
|
||||||
|
btn.addEventListener('click', () => openDiskDeleteDialog(btn.dataset.diskDelKey));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const trashIcon = '<svg viewBox="0 0 24 24" width="14" height="14" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"><polyline points="3 6 5 6 21 6"></polyline><path d="M19 6l-1 14a2 2 0 0 1-2 2H8a2 2 0 0 1-2-2L5 6"></path><path d="M10 11v6"></path><path d="M14 11v6"></path><path d="M9 6V4a2 2 0 0 1 2-2h2a2 2 0 0 1 2 2v2"></path></svg>';
|
||||||
|
|
||||||
async function testLaunch(key, btn) {
|
async function testLaunch(key, btn) {
|
||||||
const resultEl = document.querySelector(`[data-test-result-for="${key}"]`);
|
const resultEl = document.querySelector(`[data-test-result-for="${key}"]`);
|
||||||
if (!resultEl) return;
|
if (!resultEl) return;
|
||||||
@@ -413,6 +446,35 @@ async function renderServices() {
|
|||||||
const restartsRow = s.restart_count != null && s.restart_count > 1
|
const restartsRow = s.restart_count != null && s.restart_count > 1
|
||||||
? `<div class="row"><span class="k">Restarts</span><span class="v">${s.restart_count}</span></div>`
|
? `<div class="row"><span class="k">Restarts</span><span class="v">${s.restart_count}</span></div>`
|
||||||
: '';
|
: '';
|
||||||
|
const dh = state.deep_health?.[name];
|
||||||
|
let deepRow = '';
|
||||||
|
if (dh && dh.last) {
|
||||||
|
const last = dh.last;
|
||||||
|
const when = (last.at || '').slice(11, 19); // HH:MM:SS
|
||||||
|
const verdict = last.ok
|
||||||
|
? `<span class="dh-ok">deep check ok</span>`
|
||||||
|
: `<span class="dh-fail">deep check FAILED</span>`;
|
||||||
|
const lat = last.latency_ms != null ? ` <span class="muted">${last.latency_ms} ms</span>` : '';
|
||||||
|
const restarts = dh.auto_restarts_window > 0
|
||||||
|
? ` <span class="muted">· ${dh.auto_restarts_window} auto-restart${dh.auto_restarts_window === 1 ? '' : 's'} in 30 min</span>`
|
||||||
|
: '';
|
||||||
|
deepRow = `
|
||||||
|
<div class="row deep-row">
|
||||||
|
<span class="k">Deep</span>
|
||||||
|
<span class="v deep-v">${verdict} <span class="muted small">${escapeHtml(when)}</span>${lat}${restarts}</span>
|
||||||
|
<button class="icon-btn dh-run-btn" data-dh-run="${escapeHtml(name)}" title="Run deep check now">↻</button>
|
||||||
|
</div>
|
||||||
|
${last.ok ? '' : `<div class="deep-error muted small">${escapeHtml((last.error || last.note || '').slice(0, 200))}</div>`}
|
||||||
|
`;
|
||||||
|
} else if (dh) {
|
||||||
|
deepRow = `
|
||||||
|
<div class="row deep-row">
|
||||||
|
<span class="k">Deep</span>
|
||||||
|
<span class="v muted-v">no probe yet</span>
|
||||||
|
<button class="icon-btn dh-run-btn" data-dh-run="${escapeHtml(name)}" title="Run deep check now">↻</button>
|
||||||
|
</div>
|
||||||
|
`;
|
||||||
|
}
|
||||||
card.innerHTML = `
|
card.innerHTML = `
|
||||||
<div class="head">
|
<div class="head">
|
||||||
<span class="name">${escapeHtml(name)}</span>
|
<span class="name">${escapeHtml(name)}</span>
|
||||||
@@ -423,6 +485,7 @@ async function renderServices() {
|
|||||||
${urlRow}
|
${urlRow}
|
||||||
${modelRow}
|
${modelRow}
|
||||||
${restartsRow}
|
${restartsRow}
|
||||||
|
${deepRow}
|
||||||
<div class="service-actions">
|
<div class="service-actions">
|
||||||
<button class="btn" data-svc-action="${name}:start" ${disable('start') ? 'disabled' : ''}>Start</button>
|
<button class="btn" data-svc-action="${name}:start" ${disable('start') ? 'disabled' : ''}>Start</button>
|
||||||
<button class="btn" data-svc-action="${name}:restart" ${disable('restart') ? 'disabled' : ''}>Restart</button>
|
<button class="btn" data-svc-action="${name}:restart" ${disable('restart') ? 'disabled' : ''}>Restart</button>
|
||||||
@@ -434,6 +497,25 @@ async function renderServices() {
|
|||||||
for (const btn of grid.querySelectorAll('.btn[data-svc-action]')) {
|
for (const btn of grid.querySelectorAll('.btn[data-svc-action]')) {
|
||||||
btn.addEventListener('click', () => onServiceAction(btn.dataset.svcAction));
|
btn.addEventListener('click', () => onServiceAction(btn.dataset.svcAction));
|
||||||
}
|
}
|
||||||
|
for (const btn of grid.querySelectorAll('[data-dh-run]')) {
|
||||||
|
btn.addEventListener('click', () => onDeepHealthRun(btn.dataset.dhRun, btn));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function onDeepHealthRun(name, btn) {
|
||||||
|
btn.disabled = true;
|
||||||
|
const orig = btn.textContent;
|
||||||
|
btn.textContent = '…';
|
||||||
|
try {
|
||||||
|
await fetchJSON(`/api/deep-health/${encodeURIComponent(name)}/run`, { method: 'POST' });
|
||||||
|
} catch (e) {
|
||||||
|
console.warn('deep-health run failed', e);
|
||||||
|
} finally {
|
||||||
|
try { state.deep_health = await fetchJSON('/api/deep-health'); } catch {}
|
||||||
|
btn.textContent = orig;
|
||||||
|
btn.disabled = false;
|
||||||
|
renderServices();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function onServiceAction(key) {
|
async function onServiceAction(key) {
|
||||||
@@ -668,6 +750,7 @@ async function pollStatus() {
|
|||||||
// Refresh services state lazily — every 5s poll triggers this too.
|
// Refresh services state lazily — every 5s poll triggers this too.
|
||||||
try {
|
try {
|
||||||
state.services = await fetchJSON('/api/services');
|
state.services = await fetchJSON('/api/services');
|
||||||
|
try { state.deep_health = await fetchJSON('/api/deep-health'); } catch {}
|
||||||
renderServices();
|
renderServices();
|
||||||
} catch {}
|
} catch {}
|
||||||
if (status.current_swap_job && status.current_swap_job !== state.swap_job_id) {
|
if (status.current_swap_job && status.current_swap_job !== state.swap_job_id) {
|
||||||
@@ -688,6 +771,78 @@ async function loadModels() {
|
|||||||
state.models = data.models || {};
|
state.models = data.models || {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function loadDiskStatus() {
|
||||||
|
// Probes each catalog model's HF cache over SSH; takes a beat. Best-effort.
|
||||||
|
try {
|
||||||
|
const r = await fetchJSON('/api/models/disk-status');
|
||||||
|
if (r && r.models) {
|
||||||
|
state.disk_status = r.models;
|
||||||
|
state.disk_status_loaded = true;
|
||||||
|
renderCards();
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// Silent — pills just won't render. Don't block dashboard.
|
||||||
|
console.warn('disk-status probe failed:', e.message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function fmtBytesShort(n) {
|
||||||
|
if (!Number.isFinite(n) || n <= 0) return '0 B';
|
||||||
|
if (n >= 1e9) return `${(n / 1e9).toFixed(1)} GB`;
|
||||||
|
if (n >= 1e6) return `${(n / 1e6).toFixed(1)} MB`;
|
||||||
|
if (n >= 1e3) return `${(n / 1e3).toFixed(1)} KB`;
|
||||||
|
return `${n} B`;
|
||||||
|
}
|
||||||
|
|
||||||
|
function openDiskDeleteDialog(key) {
|
||||||
|
const m = state.models[key];
|
||||||
|
const disk = state.disk_status[key];
|
||||||
|
if (!m || !disk || !disk.on_disk) return;
|
||||||
|
const dlg = el('#disk-delete-dialog');
|
||||||
|
el('#dd-summary').innerHTML = `Free <strong>${fmtBytesShort(disk.total_bytes)}</strong> by removing <strong>${escapeHtml(m.display_name)}</strong> (<code>${escapeHtml(m.repo)}</code>) from disk.`;
|
||||||
|
const hostsEl = el('#dd-hosts');
|
||||||
|
hostsEl.innerHTML = '';
|
||||||
|
for (const h of (disk.per_host || [])) {
|
||||||
|
if (!h.on_disk) continue;
|
||||||
|
const li = document.createElement('li');
|
||||||
|
li.innerHTML = `<code>${escapeHtml(h.host)}</code> — ${fmtBytesShort(h.size_bytes)}`;
|
||||||
|
hostsEl.appendChild(li);
|
||||||
|
}
|
||||||
|
const errEl = el('#dd-error');
|
||||||
|
errEl.classList.add('hidden');
|
||||||
|
errEl.textContent = '';
|
||||||
|
|
||||||
|
const confirm = el('#dd-confirm');
|
||||||
|
const cancel = el('#dd-cancel');
|
||||||
|
const onCancel = () => dlg.close();
|
||||||
|
const onConfirm = async () => {
|
||||||
|
confirm.disabled = true;
|
||||||
|
cancel.disabled = true;
|
||||||
|
confirm.textContent = 'Deleting…';
|
||||||
|
try {
|
||||||
|
const r = await fetchJSON(`/api/models/${encodeURIComponent(key)}/disk`, { method: 'DELETE' });
|
||||||
|
dlg.close();
|
||||||
|
// Optimistically clear local disk state for this key, then refresh.
|
||||||
|
delete state.disk_status[key];
|
||||||
|
renderCards();
|
||||||
|
// Eagerly re-probe so size is accurate (and shows "not downloaded" pill).
|
||||||
|
loadDiskStatus();
|
||||||
|
const freed = r && typeof r.bytes_freed === 'number' ? fmtBytesShort(r.bytes_freed) : '';
|
||||||
|
console.log(`Deleted ${m.display_name} from disk${freed ? ` — freed ${freed}` : ''}.`);
|
||||||
|
} catch (e) {
|
||||||
|
errEl.textContent = e.message || 'Delete failed';
|
||||||
|
errEl.classList.remove('hidden');
|
||||||
|
} finally {
|
||||||
|
confirm.disabled = false;
|
||||||
|
cancel.disabled = false;
|
||||||
|
confirm.textContent = 'Delete from disk';
|
||||||
|
}
|
||||||
|
};
|
||||||
|
cancel.onclick = onCancel;
|
||||||
|
confirm.onclick = onConfirm;
|
||||||
|
dlg.showModal();
|
||||||
|
}
|
||||||
|
|
||||||
async function triggerSwap(modelKey) {
|
async function triggerSwap(modelKey) {
|
||||||
if (state.swap_job_id) return;
|
if (state.swap_job_id) return;
|
||||||
try {
|
try {
|
||||||
@@ -1472,9 +1627,12 @@ async function init() {
|
|||||||
await renderServices();
|
await renderServices();
|
||||||
pollHardware();
|
pollHardware();
|
||||||
pollUpdates();
|
pollUpdates();
|
||||||
|
// Disk-status probe runs after first paint — slow over SSH and not blocking.
|
||||||
|
loadDiskStatus();
|
||||||
setInterval(pollStatus, 5000);
|
setInterval(pollStatus, 5000);
|
||||||
setInterval(pollHardware, 8000); // every 8s
|
setInterval(pollHardware, 8000); // every 8s
|
||||||
setInterval(pollUpdates, 300000); // every 5 min
|
setInterval(pollUpdates, 300000); // every 5 min
|
||||||
|
setInterval(loadDiskStatus, 60000); // every 60s — disk state changes rarely
|
||||||
}
|
}
|
||||||
|
|
||||||
init();
|
init();
|
||||||
|
|||||||
@@ -188,6 +188,20 @@
|
|||||||
</form>
|
</form>
|
||||||
</dialog>
|
</dialog>
|
||||||
|
|
||||||
|
<dialog id="disk-delete-dialog" class="modal">
|
||||||
|
<form method="dialog" class="modal-form">
|
||||||
|
<h3>Delete model weights from disk?</h3>
|
||||||
|
<p id="dd-summary" class="muted small"></p>
|
||||||
|
<ul class="muted small dd-hosts" id="dd-hosts"></ul>
|
||||||
|
<p class="muted small">This is reversible — you can re-download from the catalog at any time. The catalog entry stays intact.</p>
|
||||||
|
<p id="dd-error" class="muted small dd-error hidden"></p>
|
||||||
|
<div class="modal-actions">
|
||||||
|
<button type="button" id="dd-cancel" class="btn">Cancel</button>
|
||||||
|
<button type="button" id="dd-confirm" class="btn danger">Delete from disk</button>
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
</dialog>
|
||||||
|
|
||||||
<dialog id="advanced-dialog" class="modal">
|
<dialog id="advanced-dialog" class="modal">
|
||||||
<form method="dialog" class="modal-form" id="advanced-form">
|
<form method="dialog" class="modal-form" id="advanced-form">
|
||||||
<h3 id="adv-title">Advanced settings</h3>
|
<h3 id="adv-title">Advanced settings</h3>
|
||||||
|
|||||||
@@ -622,6 +622,19 @@ main {
|
|||||||
.service-card .row .v.copyable.copied { outline: 1px solid var(--accent); background: rgba(74, 222, 128, 0.05); }
|
.service-card .row .v.copyable.copied { outline: 1px solid var(--accent); background: rgba(74, 222, 128, 0.05); }
|
||||||
.service-card .row .icon-btn { padding: 3px 6px; }
|
.service-card .row .icon-btn { padding: 3px 6px; }
|
||||||
.service-card .row .icon-btn svg { width: 12px; height: 12px; }
|
.service-card .row .icon-btn svg { width: 12px; height: 12px; }
|
||||||
|
.service-card .deep-row .deep-v { display: flex; align-items: center; gap: 6px; font-family: inherit; flex-wrap: wrap; }
|
||||||
|
.service-card .dh-ok { color: var(--accent); }
|
||||||
|
.service-card .dh-fail { color: var(--error); font-weight: 500; }
|
||||||
|
.service-card .dh-run-btn { font-family: inherit; }
|
||||||
|
.service-card .deep-error {
|
||||||
|
padding: 4px 8px;
|
||||||
|
background: rgba(239, 68, 68, 0.06);
|
||||||
|
border-left: 2px solid var(--error);
|
||||||
|
border-radius: 4px;
|
||||||
|
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
|
||||||
|
font-size: 11px;
|
||||||
|
word-break: break-word;
|
||||||
|
}
|
||||||
|
|
||||||
.service-actions {
|
.service-actions {
|
||||||
display: flex;
|
display: flex;
|
||||||
@@ -704,6 +717,14 @@ main {
|
|||||||
.card .adv-btn,
|
.card .adv-btn,
|
||||||
.card .test-btn { padding: 8px 12px; font-size: 12px; }
|
.card .test-btn { padding: 8px 12px; font-size: 12px; }
|
||||||
.card .custom-pill { color: var(--info); border-color: rgba(96, 165, 250, 0.4); }
|
.card .custom-pill { color: var(--info); border-color: rgba(96, 165, 250, 0.4); }
|
||||||
|
.tag.on-disk { color: var(--accent); border-color: rgba(74, 222, 128, 0.4); }
|
||||||
|
.tag.not-on-disk { color: var(--muted); border-color: var(--border); opacity: 0.7; }
|
||||||
|
.card-actions .icon-btn.danger { color: var(--error); border-color: rgba(239, 68, 68, 0.3); margin-left: auto; }
|
||||||
|
.card-actions .icon-btn.danger:hover:not(:disabled) { background: rgba(239, 68, 68, 0.08); border-color: var(--error); color: var(--error); }
|
||||||
|
.card-actions .icon-btn.danger:disabled { opacity: 0.35; cursor: not-allowed; }
|
||||||
|
.dd-hosts { padding-left: 18px; margin: 4px 0 8px; }
|
||||||
|
.dd-hosts code { background: var(--surface-2); padding: 1px 5px; border-radius: 4px; }
|
||||||
|
.dd-error { color: var(--error); }
|
||||||
|
|
||||||
.test-result {
|
.test-result {
|
||||||
font-size: 12px;
|
font-size: 12px;
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ models:
|
|||||||
- -tp=2
|
- -tp=2
|
||||||
- --distributed-executor-backend=ray
|
- --distributed-executor-backend=ray
|
||||||
- --max-model-len=32768
|
- --max-model-len=32768
|
||||||
|
- --max-num-batched-tokens=16384
|
||||||
|
|
||||||
gemma4:
|
gemma4:
|
||||||
display_name: "Gemma 4 31B"
|
display_name: "Gemma 4 31B"
|
||||||
@@ -45,6 +46,7 @@ models:
|
|||||||
vllm_args:
|
vllm_args:
|
||||||
- --gpu-memory-utilization=0.8
|
- --gpu-memory-utilization=0.8
|
||||||
- --max-model-len=32768
|
- --max-model-len=32768
|
||||||
|
- --max-num-batched-tokens=16384
|
||||||
- --reasoning-parser=gemma4
|
- --reasoning-parser=gemma4
|
||||||
- --tool-call-parser=gemma4
|
- --tool-call-parser=gemma4
|
||||||
- --enable-auto-tool-choice
|
- --enable-auto-tool-choice
|
||||||
|
|||||||
@@ -24,6 +24,10 @@ This flag is Blackwell-specific. If vLLM in the container reports `unrecognized
|
|||||||
|
|
||||||
Qwen3.6 uses a Mamba-attention hybrid that requires `--max-num-batched-tokens >= 2096`. vLLM's default is 2048, which trips `AssertionError: In Mamba cache align mode, block_size (2096) must be <= max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into the bundled qwen36 entry — matches the upstream qwen3.5-35b-a3b-fp8 recipe.
|
Qwen3.6 uses a Mamba-attention hybrid that requires `--max-num-batched-tokens >= 2096`. vLLM's default is 2048, which trips `AssertionError: In Mamba cache align mode, block_size (2096) must be <= max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into the bundled qwen36 entry — matches the upstream qwen3.5-35b-a3b-fp8 recipe.
|
||||||
|
|
||||||
|
## Multimodal token budget for vision models (fixed in v0.8.0:1)
|
||||||
|
|
||||||
|
After the eugr/spark-vllm-docker update, vLLM became stricter about multimodal token budgets. Vision-capable models like Gemma 4 31B and Qwen3-VL crash at engine init with `ValueError: Chunked MM input disabled but max_tokens_per_mm_item (2496) is larger than max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into every model that has the `vision` capability. Now applied to qwen3-vl, gemma4, and qwen36 (which was already set for the Mamba issue).
|
||||||
|
|
||||||
## Two SSH paths to Spark 1 from the laptop
|
## Two SSH paths to Spark 1 from the laptop
|
||||||
|
|
||||||
`ssh <spark-user>@<spark-1-ip>` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `<spark-1-host>.local`. Always use the `.local` hostname or `<spark-2-ip>`-style entries that ARE matched.
|
`ssh <spark-user>@<spark-1-ip>` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `<spark-1-host>.local`. Always use the `.local` hostname or `<spark-2-ip>`-style entries that ARE matched.
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
export const v0_1_0 = VersionInfo.of({
|
export const v0_1_0 = VersionInfo.of({
|
||||||
version: '0.7.0:2',
|
version: '0.8.1:0',
|
||||||
releaseNotes: {
|
releaseNotes: {
|
||||||
en_US:
|
en_US:
|
||||||
'v0.7: pre-flight launch validation. New "Test" button on every model card runs vLLM\'s argparse against the proposed launch command inside the running vllm_node container — without starting an engine. Catches unknown flags, bad types, and version-removed flags in about 5 seconds, before disrupting the currently-loaded model. (Runtime-only failures like the Qwen3.6 Mamba block-size assertion still only surface during a real swap, but argparse-stage bugs are now caught up front.)',
|
'v0.8.1: model weights can now be deleted from disk directly from the dashboard. Each model card shows whether the weights are present (with on-disk GB size) or not yet downloaded. When present and the model is NOT currently loaded, a small trash icon appears on the card; clicking it pops a confirmation showing how many GB will be freed and on which Spark(s), then runs `rm -rf` on the Hugging Face cache directory via SSH. Cluster-mode models are deleted from both Sparks; solo-mode from Spark 1 only. Safety rails: refuses to delete the currently-loaded model, refuses during an in-flight swap or download, and the catalog entry stays — you can always re-download. Disk status is probed once on dashboard load and re-checked every 60s.',
|
||||||
},
|
},
|
||||||
migrations: {
|
migrations: {
|
||||||
up: async ({ effects }) => {},
|
up: async ({ effects }) => {},
|
||||||
|
|||||||
Reference in New Issue
Block a user