Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9ff7ee9c1e | |||
| 1602b3b3b4 | |||
| 8ac455f5f5 | |||
| 000c55febe | |||
| 6434b01a95 | |||
| 5827683a09 | |||
| ee8c2406b8 | |||
| a02f4db850 | |||
| 1889ab45fb | |||
| e88fdcfde4 | |||
| 64ce0fca10 | |||
| c6da6b0784 |
@@ -84,6 +84,24 @@ Other services on your LAN can hit `GET /api/endpoints` to learn where the curre
|
||||
|
||||
`base_url` is filled in whenever Configure Sparks has been completed (even if the underlying service isn't currently up). Pair the URL with `ready: true` to safely route traffic.
|
||||
|
||||
## Reporting failures from external apps
|
||||
|
||||
Spark Control polls every 5 s, so a brief blip in Parakeet/Magpie/vLLM availability can slip between polls and never make it into the connectivity log. To capture short failures, an external app (e.g. Open WebUI) can POST whenever a call fails (or succeeds):
|
||||
|
||||
```bash
|
||||
curl -X POST http://<dashboard-url>/api/health-event \
|
||||
-H 'content-type: application/json' \
|
||||
-d '{
|
||||
"service": "parakeet",
|
||||
"ok": false,
|
||||
"source": "open-webui",
|
||||
"error": "HTTP 503",
|
||||
"ms": 420
|
||||
}'
|
||||
```
|
||||
|
||||
Fields: `service` (required), `ok` (required), `source` (optional, free-form), `error` (optional), `ms` (optional latency in milliseconds). Each POST appends a `report` event to the connectivity log alongside the polling-based transition events.
|
||||
|
||||
## Status
|
||||
|
||||
**v0.2.3** — installed and verified on a Start9 server. Five bundled LLMs in the catalog (qwen3-vl, gemma4, qwen36, qwen3-235b-fp8, qwen2.5-72b), plus any custom models added through the UI.
|
||||
|
||||
+6
-2
@@ -42,6 +42,8 @@ class Settings:
|
||||
parakeet_port: int
|
||||
magpie_port: int
|
||||
bind_port: int
|
||||
open_webui_url: str
|
||||
ngc_api_key: str
|
||||
|
||||
@classmethod
|
||||
def from_env(cls) -> "Settings":
|
||||
@@ -55,10 +57,10 @@ class Settings:
|
||||
spark2_user=spark2_user,
|
||||
parakeet_host=_env("PARAKEET_HOST") or spark2_host,
|
||||
parakeet_user=_env("PARAKEET_USER") or spark2_user,
|
||||
parakeet_container=_env("PARAKEET_CONTAINER", "parakeet-asr"),
|
||||
parakeet_container=_env("PARAKEET_CONTAINER") or "parakeet-asr",
|
||||
magpie_host=_env("MAGPIE_HOST") or spark2_host,
|
||||
magpie_user=_env("MAGPIE_USER") or spark2_user,
|
||||
magpie_container=_env("MAGPIE_CONTAINER", "magpie-tts"),
|
||||
magpie_container=_env("MAGPIE_CONTAINER") or "magpie-tts",
|
||||
ssh_key_path=_env("SSH_KEY_PATH"),
|
||||
ssh_known_hosts=_env("SSH_KNOWN_HOSTS"),
|
||||
models_yaml=_resolve_models_yaml(),
|
||||
@@ -66,6 +68,8 @@ class Settings:
|
||||
parakeet_port=int(_env("PARAKEET_PORT", "8000")),
|
||||
magpie_port=int(_env("MAGPIE_PORT", "9000")),
|
||||
bind_port=int(_env("BIND_PORT", "9999")),
|
||||
open_webui_url=_env("OPEN_WEBUI_URL", ""),
|
||||
ngc_api_key=_env("NGC_API_KEY", ""),
|
||||
)
|
||||
|
||||
@property
|
||||
|
||||
@@ -0,0 +1,190 @@
|
||||
"""Track up/down transitions for any subject (Sparks AND services) and cache MACs.
|
||||
|
||||
Persisted to /data/connectivity.json. Schema:
|
||||
|
||||
{
|
||||
"macs": { "spark1": "aa:bb:..", "spark2": "11:22:.." },
|
||||
"current": { "spark1": "up", "parakeet": "up", "magpie": "down", ... },
|
||||
"last_change": { ... },
|
||||
"events": [
|
||||
# Active-probe transition (logged when state flips during polling)
|
||||
{ "subject": "spark2", "at": "...", "kind": "transition",
|
||||
"transition": "down" },
|
||||
{ "subject": "spark2", "at": "...", "kind": "transition",
|
||||
"transition": "up", "down_seconds": 4500 },
|
||||
|
||||
# Passive report (logged whenever an external app POSTs to
|
||||
# /api/health-event regardless of state change)
|
||||
{ "subject": "parakeet", "at": "...", "kind": "report",
|
||||
"ok": false, "source": "open-webui",
|
||||
"detail": "Connection refused", "latency_ms": 320 },
|
||||
]
|
||||
}
|
||||
|
||||
Legacy events from v0.5 with `spark` instead of `subject` and no `kind` field
|
||||
are read transparently as kind="transition".
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import os
|
||||
import threading
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
MAX_EVENTS = 200 # rolling window — plenty for showing recent history
|
||||
|
||||
|
||||
def _path() -> str:
|
||||
return os.environ.get("CONNECTIVITY_LOG", "/data/connectivity.json")
|
||||
|
||||
|
||||
_lock = threading.Lock()
|
||||
|
||||
|
||||
def _read() -> dict:
    """Load the persisted connectivity document.

    Returns an empty dict when the file does not exist, cannot be decoded as
    JSON, or decodes to a falsy value.
    """
    try:
        handle = open(_path())
    except FileNotFoundError:
        return {}
    with handle:
        try:
            return json.load(handle) or {}
        except json.JSONDecodeError:
            return {}
|
||||
|
||||
|
||||
def _write(data: dict) -> None:
    """Persist ``data`` as indented JSON at the connectivity log path.

    The payload is written to a sibling ``.tmp`` file first and then moved
    over the real file with ``os.replace``, so the target is swapped in a
    single step instead of being rewritten in place.
    """
    destination = _path()
    scratch = destination + ".tmp"
    # Create the containing directory on first use.
    Path(destination).parent.mkdir(parents=True, exist_ok=True)
    with open(scratch, "w") as handle:
        json.dump(data, handle, indent=2, sort_keys=False)
    os.replace(scratch, destination)
|
||||
|
||||
|
||||
def load() -> dict:
    """Return the persisted document with all four top-level keys present.

    Missing ``macs``, ``current`` and ``last_change`` entries default to an
    empty dict, and ``events`` defaults to an empty list. Nothing is written
    back to disk.
    """
    with _lock:
        document = _read()
        for key in ("macs", "current", "last_change"):
            document.setdefault(key, {})
        document.setdefault("events", [])
        return document
|
||||
|
||||
|
||||
def record_mac(subject: str, mac: Optional[str]) -> None:
    """Cache ``mac`` for ``subject``; a falsy ``mac`` is ignored.

    The file is only rewritten when the stored value actually changes.
    """
    if not mac:
        return
    with _lock:
        document = _read()
        document.setdefault("macs", {})
        known = document["macs"]
        if known.get(subject) == mac:
            return  # unchanged, skip the disk write
        known[subject] = mac
        _write(document)
|
||||
|
||||
|
||||
def record_state(subject: str, reachable: bool) -> Optional[dict]:
    """Update the current state for ``subject`` and log a transition if it flipped.

    ``subject`` can be a Spark host key (spark1/spark2) or a service name
    (parakeet/magpie/vllm).

    Args:
        subject: Key under which the state is tracked.
        reachable: ``True`` maps to ``"up"``, ``False`` to ``"down"``.

    Returns:
        The appended event dict when the state differs from the last one
        seen (including the very first observation), otherwise ``None``.
        A down->up event carries ``down_seconds`` and an up->down event
        carries ``up_seconds`` when the previous change time is usable.
    """
    new_state = "up" if reachable else "down"
    # Read the clock once so the event timestamp and the computed duration
    # refer to the same instant.
    now_dt = datetime.now(timezone.utc)
    now = now_dt.isoformat().replace("+00:00", "Z")
    with _lock:
        d = _read()
        d.setdefault("macs", {})
        d.setdefault("current", {})
        d.setdefault("last_change", {})
        d.setdefault("events", [])
        prev = d["current"].get(subject)
        if prev == new_state:
            return None
        event: dict = {
            "subject": subject,
            "at": now,
            "kind": "transition",
            "transition": new_state,
        }
        # When we have a previous state and timestamp, compute how long the
        # subject stayed in that previous state.
        last_change = d["last_change"].get(subject)
        if prev and last_change:
            try:
                prev_dt = datetime.fromisoformat(last_change.replace("Z", "+00:00"))
                duration = (now_dt - prev_dt).total_seconds()
            except (AttributeError, TypeError, ValueError):
                # Non-string, unparsable or timezone-less stored timestamp:
                # still record the transition, just without a duration.
                duration = None
            if duration is not None:
                if prev == "down" and new_state == "up":
                    event["down_seconds"] = round(duration)
                elif prev == "up" and new_state == "down":
                    event["up_seconds"] = round(duration)
        d["current"][subject] = new_state
        d["last_change"][subject] = now
        d["events"].append(event)
        # Keep only the newest MAX_EVENTS entries (rolling window).
        if len(d["events"]) > MAX_EVENTS:
            d["events"] = d["events"][-MAX_EVENTS:]
        _write(d)
        return event
|
||||
|
||||
|
||||
def record_report(
    subject: str,
    *,
    ok: bool,
    source: str = "external",
    detail: str = "",
    latency_ms: Optional[int] = None,
) -> dict:
    """Append a passive ``report`` event for ``subject`` and return it.

    Reports come from callers other than the polling probe (for example an
    app that got a 503 from Parakeet). They are always appended to the event
    list and never touch the ``current`` / ``last_change`` maps. ``detail``
    and ``latency_ms`` are only stored when provided; an empty ``source``
    falls back to ``"external"``.
    """
    stamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    with _lock:
        document = _read()
        document.setdefault("events", [])
        report: dict = {
            "subject": subject,
            "at": stamp,
            "kind": "report",
            "ok": bool(ok),
            "source": source or "external",
        }
        if detail:
            report["detail"] = detail
        if latency_ms is not None:
            report["latency_ms"] = int(latency_ms)
        history = document["events"]
        history.append(report)
        # Rolling window: drop the oldest entries beyond MAX_EVENTS.
        if len(history) > MAX_EVENTS:
            document["events"] = history[-MAX_EVENTS:]
        _write(document)
        return report
|
||||
|
||||
|
||||
def get_mac(subject: str) -> Optional[str]:
    """Return the cached MAC address for ``subject``, or ``None`` if unknown."""
    return load().get("macs", {}).get(subject)
|
||||
|
||||
|
||||
def _normalize_event(e: dict) -> dict:
|
||||
"""Promote legacy v0.5 events to the v0.6 shape so the UI sees one schema."""
|
||||
if "subject" in e:
|
||||
e.setdefault("kind", "transition")
|
||||
return e
|
||||
# Legacy: had "spark" + "transition" only
|
||||
if "spark" in e:
|
||||
e["subject"] = e.pop("spark")
|
||||
e.setdefault("kind", "transition")
|
||||
return e
|
||||
|
||||
|
||||
def summary() -> dict:
    """Build the compact UI payload: MACs, current state and recent events.

    Every stored event is copied and normalised to the current schema; only
    the newest 80 are included.
    """
    snapshot = load()
    normalized = []
    for stored in snapshot.get("events", []):
        # Copy first so normalisation never mutates the loaded document.
        normalized.append(_normalize_event(dict(stored)))
    payload = {
        "macs": snapshot.get("macs", {}),
        "current": snapshot.get("current", {}),
        "last_change": snapshot.get("last_change", {}),
        "events": normalized[-80:],
    }
    return payload
|
||||
@@ -0,0 +1,59 @@
|
||||
"""User-installed services persist in /data/services-overrides.yaml.
|
||||
|
||||
Format:
|
||||
custom:
|
||||
- key: my-riva
|
||||
kind: stt
|
||||
host: <spark-2-ip>
|
||||
user: <spark-user>
|
||||
container: riva-asr
|
||||
port: 8001
|
||||
health_path: /health
|
||||
image: nvcr.io/nim/nvidia/riva-multilingual:latest
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os
|
||||
from pathlib import Path
|
||||
import yaml
|
||||
|
||||
|
||||
def _path() -> str:
|
||||
return os.environ.get("SERVICES_OVERRIDES", "/data/services-overrides.yaml")
|
||||
|
||||
|
||||
def load_custom_services() -> list[dict]:
    """Return the ``custom`` list from the overrides file.

    A missing file, an empty document, or a missing/empty ``custom`` key all
    yield an empty list.
    """
    try:
        handle = open(_path())
    except FileNotFoundError:
        return []
    with handle:
        document = yaml.safe_load(handle) or {}
    return document.get("custom") or []
|
||||
|
||||
|
||||
def add_custom_service(entry: dict) -> None:
    """Add ``entry`` to the overrides file, replacing any entry with the same key.

    Args:
        entry: Service definition; must contain a ``"key"`` item.

    Raises:
        KeyError: If ``entry`` has no ``"key"``.
    """
    p = _path()
    Path(p).parent.mkdir(parents=True, exist_ok=True)
    data: dict = {}
    try:
        with open(p) as f:
            data = yaml.safe_load(f) or {}
    except FileNotFoundError:
        pass  # first custom service: start from an empty document
    # Drop any existing entry with this key so the new one replaces it.
    custom = [c for c in (data.get("custom") or []) if c.get("key") != entry["key"]]
    custom.append(entry)
    data["custom"] = custom
    # Write to a sibling temp file and swap it in, so a failure while dumping
    # cannot leave the real file truncated and lose the existing services.
    tmp = p + ".tmp"
    with open(tmp, "w") as f:
        yaml.safe_dump(data, f, sort_keys=False)
    os.replace(tmp, p)
|
||||
|
||||
|
||||
def delete_custom_service(key: str) -> None:
    """Remove the custom service identified by ``key`` from the overrides file.

    Does nothing when the file does not exist. When the file exists it is
    rewritten even if no entry matched.
    """
    p = _path()
    try:
        with open(p) as f:
            data = yaml.safe_load(f) or {}
    except FileNotFoundError:
        return
    data["custom"] = [c for c in (data.get("custom") or []) if c.get("key") != key]
    # Write to a sibling temp file and swap it in, so a failure while dumping
    # cannot leave the real file truncated and lose the remaining services.
    tmp = p + ".tmp"
    with open(tmp, "w") as f:
        yaml.safe_dump(data, f, sort_keys=False)
    os.replace(tmp, p)
|
||||
@@ -0,0 +1,363 @@
|
||||
"""Deep health probes for each service.
|
||||
|
||||
Why this exists: Triton's /health endpoint returns 200 as long as the HTTP
|
||||
layer is alive and the model is registered. It does NOT verify that the CUDA
|
||||
context inside the worker process is healthy. We've observed Parakeet getting
|
||||
its CUDA context wedged after an OOM, where /health stays green but every
|
||||
real transcription returns 500 cudaErrorUnknown.
|
||||
|
||||
So this module sends *real* but tiny synthetic inference requests:
|
||||
- Parakeet: 1 second of digital silence (16 kHz mono PCM, in-memory WAV)
|
||||
- Magpie: short text-to-speech, response audio discarded
|
||||
- vLLM: 1-token chat completion against whatever model is loaded
|
||||
|
||||
All synthetic payloads are generated on demand into BytesIO, sent over HTTP,
|
||||
and never touch the filesystem (on either spark-control's side or the
|
||||
target service's side beyond normal Triton/Riva working memory).
|
||||
|
||||
When a probe fails with a signal that looks like a CUDA wedge, we
|
||||
automatically issue `docker restart <container>`. Rate-limited to 3 restarts
|
||||
per service per 30 minutes to avoid restart loops.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import io
|
||||
import time
|
||||
import wave
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from .config import Settings
|
||||
from .connectivity import record_report
|
||||
from .services import ServiceDef, run_action, services_from_settings
|
||||
|
||||
|
||||
# Default 5-minute interval, controllable via env. Sub-minute is silly for a
|
||||
# heavy synthetic probe; we just want to catch wedges within a reasonable
|
||||
# window — much faster than the user noticing on their next real call.
|
||||
DEFAULT_INTERVAL_SEC = 300.0
|
||||
PROBE_TIMEOUT_SEC = 20.0
|
||||
RESTART_RATE_LIMIT = 3 # max auto-restarts per service
|
||||
RESTART_RATE_WINDOW_SEC = 1800.0 # within a 30-min window
|
||||
RESTART_COOLDOWN_SEC = 120.0 # don't restart again within this many seconds of the last one
|
||||
STARTUP_GRACE_SEC = 60.0 # don't auto-restart for the first minute after this app boots
|
||||
|
||||
|
||||
def _silence_wav(seconds: float = 1.0, sample_rate: int = 16000) -> io.BytesIO:
|
||||
"""Return an in-memory WAV file containing `seconds` of digital silence."""
|
||||
n_frames = int(seconds * sample_rate)
|
||||
buf = io.BytesIO()
|
||||
with wave.open(buf, "wb") as w:
|
||||
w.setnchannels(1)
|
||||
w.setsampwidth(2) # int16
|
||||
w.setframerate(sample_rate)
|
||||
w.writeframes(b"\x00\x00" * n_frames)
|
||||
buf.seek(0)
|
||||
return buf
|
||||
|
||||
|
||||
def _looks_like_wedge(error: str) -> bool:
|
||||
"""Heuristic: does this error string look like a stuck CUDA context that
|
||||
a container restart would clear? We want to be conservative — only act
|
||||
on signals we're confident about, otherwise leave the user in charge."""
|
||||
err = (error or "").lower()
|
||||
needles = [
|
||||
"cudaerrorunknown",
|
||||
"cuda error: unknown",
|
||||
"cuda kernel errors",
|
||||
"internal server error",
|
||||
"engine core initialization failed",
|
||||
"503", # service unavailable from a dependency
|
||||
"500", # generic 5xx with a body that may not parse
|
||||
]
|
||||
return any(n in err for n in needles)
|
||||
|
||||
|
||||
@dataclass
class ProbeResult:
    """Outcome of a single synthetic probe against one service."""

    # True when the probe counted the service as healthy.
    ok: bool
    # Timestamp string the probe assigns to this result.
    at: str
    # Request round-trip time in milliseconds, when one was measured.
    latency_ms: Optional[int] = None
    # Failure description; empty when there is nothing to report.
    error: str = ""
    # Free-form extra context attached to the result.
    note: str = ""
|
||||
|
||||
|
||||
@dataclass
class ServiceState:
    """Per-service bookkeeping kept between probe runs."""

    # Most recent probe result, or None before the first probe.
    last: Optional[ProbeResult] = None
    # Timestamp string of the most recent successful probe, if any.
    last_ok_at: Optional[str] = None
    # Float timestamps of recent automatic restarts.
    restarts: list[float] = field(default_factory=list)
|
||||
|
||||
|
||||
class DeepHealth:
    """Deep-health prober for the Parakeet, Magpie and vLLM services.

    Each ``probe_*`` coroutine sends a tiny real request and returns a
    ``ProbeResult``. ``run_one`` stores that result in ``self.state``, logs
    noteworthy outcomes through ``record_report`` and, when the error text
    matches ``_looks_like_wedge``, hands over to ``_maybe_restart``, which
    applies a boot grace period, a cooldown and a rate limit before
    restarting the service.
    """

    # Service name -> name of the coroutine method that probes it.
    PROBES = {
        "parakeet": "probe_parakeet",
        "magpie": "probe_magpie",
        "vllm": "probe_vllm",
    }

    def __init__(self, settings: Settings, interval_sec: float = DEFAULT_INTERVAL_SEC) -> None:
        """Create the prober.

        Args:
            settings: Application settings (hosts, ports, users).
            interval_sec: Pause between probe rounds in ``run_periodic``.
        """
        self.settings = settings
        self.interval_sec = interval_sec
        self.state: dict[str, ServiceState] = {
            "parakeet": ServiceState(),
            "magpie": ServiceState(),
            "vllm": ServiceState(),
        }
        self._stop = asyncio.Event()
        # Monotonic reference point for the startup grace period.
        self._boot_at = time.monotonic()

    @staticmethod
    def _utc_now_iso() -> str:
        """Return the current UTC time as an ISO-8601 string ending in ``Z``."""
        return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # ---- probes ---------------------------------------------------------

    async def probe_parakeet(self) -> ProbeResult:
        """POST one second of silence to Parakeet's transcription endpoint.

        Returns a failed result when the host is not configured, on any
        non-2xx status, or when the request raises.
        """
        s = self.settings
        now_iso = self._utc_now_iso()
        if not s.parakeet_host:
            return ProbeResult(ok=False, at=now_iso, error="not configured")
        url = f"http://{s.parakeet_host}:{s.parakeet_port}/v1/audio/transcriptions"
        wav = _silence_wav(1.0)
        t0 = time.monotonic()
        try:
            async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
                r = await c.post(
                    url,
                    files={"file": ("probe.wav", wav, "audio/wav")},
                    data={"model": "parakeet-tdt-0.6b-v3"},
                )
            latency = round((time.monotonic() - t0) * 1000)
            if 200 <= r.status_code < 300:
                return ProbeResult(ok=True, at=now_iso, latency_ms=latency)
            return ProbeResult(
                ok=False,
                at=now_iso,
                latency_ms=latency,
                error=f"HTTP {r.status_code}: {r.text[:240]}",
            )
        except Exception as e:
            return ProbeResult(ok=False, at=now_iso, error=f"{type(e).__name__}: {e}")

    async def probe_magpie(self) -> ProbeResult:
        """POST a short text-to-speech request to Magpie.

        A 4xx response is reported as ``ok=True`` with an explanatory note;
        other non-2xx statuses and raised exceptions produce a failed result.
        """
        s = self.settings
        now_iso = self._utc_now_iso()
        if not s.magpie_host:
            return ProbeResult(ok=False, at=now_iso, error="not configured")
        # Magpie /v1/audio/synthesize expects multipart form-data, not JSON.
        # The (None, value) tuple in httpx's `files=` produces a non-file form field.
        url = f"http://{s.magpie_host}:{s.magpie_port}/v1/audio/synthesize"
        form: dict = {"text": (None, "hi"), "language": (None, "en-US")}
        t0 = time.monotonic()
        try:
            async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
                r = await c.post(url, files=form)
            latency = round((time.monotonic() - t0) * 1000)
            if 200 <= r.status_code < 300:
                return ProbeResult(ok=True, at=now_iso, latency_ms=latency)
            # A 4xx means the server is alive but our payload is off, so
            # don't classify it as a wedge.
            if 400 <= r.status_code < 500:
                return ProbeResult(
                    ok=True,
                    at=now_iso,
                    latency_ms=latency,
                    note=f"{r.status_code} — server alive (probe payload may need a voice name)",
                )
            return ProbeResult(
                ok=False,
                at=now_iso,
                latency_ms=latency,
                error=f"HTTP {r.status_code}: {r.text[:240]}",
            )
        except Exception as e:
            return ProbeResult(ok=False, at=now_iso, error=f"{type(e).__name__}: {e}")

    async def probe_vllm(self) -> ProbeResult:
        """List vLLM's models and, if one is loaded, run a 1-token completion.

        An unreachable server or an empty model list is reported as
        ``ok=True`` with an "idle" note. A non-2xx model listing, a model
        entry without an id, or a failed completion produce a failed result.
        """
        s = self.settings
        now_iso = self._utc_now_iso()
        if not s.spark1_host:
            return ProbeResult(ok=False, at=now_iso, error="not configured")
        base = f"http://{s.spark1_host}:{s.vllm_port}"
        # Step 1: is there a model loaded?
        try:
            async with httpx.AsyncClient(timeout=5.0) as c:
                r = await c.get(f"{base}/v1/models")
            if 200 <= r.status_code < 300:
                models = r.json().get("data") or []
            else:
                # A non-2xx on /v1/models suggests something wedged after a
                # model loaded.
                return ProbeResult(
                    ok=False,
                    at=now_iso,
                    error=f"list_models HTTP {r.status_code}: {r.text[:240]}",
                )
        except Exception:
            # Connection refused / timeout: usually means no vLLM process listening
            # (the vllm_node container is alive but no `vllm serve` is running yet).
            # That's an idle state, not a wedge, so don't trigger auto-restart.
            return ProbeResult(
                ok=True,
                at=now_iso,
                note="no model currently loaded (idle)",
            )

        if not models:
            return ProbeResult(
                ok=True,
                at=now_iso,
                note="no model currently loaded (idle)",
            )

        # Guard the payload shape so a malformed listing becomes a failed
        # probe instead of an exception escaping from the probe.
        first = models[0] if isinstance(models, list) else None
        model_id = first.get("id") if isinstance(first, dict) else None
        if not model_id:
            return ProbeResult(
                ok=False,
                at=now_iso,
                error="list_models returned an unexpected payload",
            )

        # Step 2: model is loaded; verify it can actually complete a 1-token request.
        t0 = time.monotonic()
        try:
            async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
                r = await c.post(
                    f"{base}/v1/chat/completions",
                    json={
                        "model": model_id,
                        "messages": [{"role": "user", "content": "hi"}],
                        "max_tokens": 1,
                        "temperature": 0,
                    },
                )
            latency = round((time.monotonic() - t0) * 1000)
            if 200 <= r.status_code < 300:
                return ProbeResult(ok=True, at=now_iso, latency_ms=latency, note=f"model={model_id}")
            return ProbeResult(
                ok=False,
                at=now_iso,
                latency_ms=latency,
                error=f"HTTP {r.status_code}: {r.text[:240]}",
            )
        except Exception as e:
            return ProbeResult(ok=False, at=now_iso, error=f"{type(e).__name__}: {e}")

    # ---- orchestration --------------------------------------------------

    async def run_one(self, service: str) -> ProbeResult:
        """Probe ``service``, update its state, log it and maybe auto-restart.

        Raises:
            KeyError: If ``service`` is not a key of ``PROBES``.
        """
        fn = getattr(self, self.PROBES[service])
        result: ProbeResult = await fn()
        st = self.state[service]
        prev_ok = st.last.ok if st.last else None
        st.last = result
        if result.ok:
            st.last_ok_at = result.at

        # Log to connectivity history: every failure, plus the first success
        # after a failure (recovery), plus the first probe ever. The
        # "still ok" steady state is skipped to keep the log readable.
        detail: Optional[str]
        if not result.ok:
            detail = result.error[:240]
        elif prev_ok is False:
            detail = "recovered"
        elif prev_ok is None:
            detail = "first probe ok"
        else:
            detail = None
        if detail is not None:
            if result.ok and result.note:
                detail += f" — {result.note}"
            record_report(
                service,
                ok=result.ok,
                source="deep-health",
                detail=detail,
                latency_ms=result.latency_ms,
            )

        # Maybe auto-restart
        if not result.ok and _looks_like_wedge(result.error):
            await self._maybe_restart(service, result.error)
        return result

    async def _maybe_restart(self, service: str, error: str) -> None:
        """Restart ``service`` unless a guard forbids it.

        Guards, in order: the startup grace period, the cooldown since the
        previous restart, the per-window rate limit (which is logged), and
        the service being unknown or lacking a host/user.
        """
        # No restarts during the boot grace period.
        if time.monotonic() - self._boot_at < STARTUP_GRACE_SEC:
            return
        st = self.state[service]
        now = time.monotonic()
        # Forget restarts that have aged out of the rate-limit window.
        st.restarts = [t for t in st.restarts if now - t < RESTART_RATE_WINDOW_SEC]
        if st.restarts and now - st.restarts[-1] < RESTART_COOLDOWN_SEC:
            return  # already restarted recently, give it time
        if len(st.restarts) >= RESTART_RATE_LIMIT:
            # Derive the minutes from the constant so the message stays
            # correct if the window is retuned.
            window_min = round(RESTART_RATE_WINDOW_SEC / 60)
            record_report(
                service,
                ok=False,
                source="deep-health",
                detail=f"rate-limited; not auto-restarting (would be #{len(st.restarts)+1} in {window_min} min)",
            )
            return
        services = services_from_settings(self.settings)
        if service not in services:
            return
        svc = services[service]
        if not svc.host or not svc.user:
            return
        result = await run_action(self.settings, svc, "restart")
        st.restarts.append(now)
        ok = result.get("ok", False)
        record_report(
            service,
            ok=False,
            source="deep-health",
            detail=f"auto-restart triggered (wedge: {error[:120]}); restart {'OK' if ok else 'FAILED'}",
        )

    async def run_all(self) -> dict[str, ProbeResult]:
        """Run every probe in ``PROBES`` sequentially; return results by name."""
        results = {}
        for name in self.PROBES:
            results[name] = await self.run_one(name)
        return results

    async def _wait_for_stop(self, timeout: float) -> bool:
        """Wait up to ``timeout`` seconds; return True if stop was requested."""
        try:
            await asyncio.wait_for(self._stop.wait(), timeout=timeout)
        except asyncio.TimeoutError:
            return False
        return True

    async def run_periodic(self) -> None:
        """Long-running loop. Cancel via .stop()."""
        import logging  # local import: only this method needs it

        # Brief initial wait to let app finish startup
        if await self._wait_for_stop(10.0):
            return
        while not self._stop.is_set():
            try:
                await self.run_all()
            except Exception:
                # Never let the loop die; the periodic check is best-effort.
                # Log it so a persistently failing round is not invisible.
                logging.getLogger(__name__).exception("deep-health probe round failed")
            if await self._wait_for_stop(self.interval_sec):
                return

    def stop(self) -> None:
        """Ask ``run_periodic`` to exit at its next wait."""
        self._stop.set()

    def summary(self) -> dict:
        """Return a plain-dict snapshot of every service's probe state."""
        out = {}
        for name, st in self.state.items():
            last = st.last
            out[name] = {
                "last_ok_at": st.last_ok_at,
                "last": (
                    {
                        "ok": last.ok,
                        "at": last.at,
                        "latency_ms": last.latency_ms,
                        "error": last.error,
                        "note": last.note,
                    }
                    if last
                    else None
                ),
                "auto_restarts_window": len(st.restarts),
            }
        return out
|
||||
@@ -0,0 +1,130 @@
|
||||
"""On-disk presence + deletion for Hugging Face model caches on the Sparks.
|
||||
|
||||
The HF cache layout for a repo `org/name` is:
|
||||
|
||||
~/.cache/huggingface/hub/models--org--name/
|
||||
|
||||
We use `du -sb` to measure size (bytes) and `rm -rf` to free it. All operations
|
||||
are gated by the server endpoints, which refuse to delete a currently-loaded
|
||||
model or one tied to an in-flight swap/download.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import shlex
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
from .config import Settings
|
||||
from .ssh import ssh_run
|
||||
|
||||
|
||||
def repo_to_cache_dirname(repo: str) -> str:
    """Map a Hugging Face repo id to its hub cache directory name.

    ``'org/name'`` becomes ``'models--org--name'``; every ``/`` is turned
    into ``--``.

    Raises:
        ValueError: If ``repo`` contains no ``/``.
    """
    segments = repo.split("/")
    if len(segments) < 2:
        raise ValueError(f"repo must be in 'org/name' form: {repo!r}")
    return "models--" + "--".join(segments)
|
||||
|
||||
|
||||
def _cache_path(repo: str) -> str:
    """Return the remote cache directory for ``repo`` as a shell path.

    The result starts with a literal ``$HOME`` so that the remote shell
    resolves it for whichever user the SSH session runs as. It only expands
    if it reaches the shell outside single quotes.
    """
    hub_root = "$HOME/.cache/huggingface/hub"
    return "/".join((hub_root, repo_to_cache_dirname(repo)))
|
||||
|
||||
|
||||
@dataclass
class HostDiskResult:
    """Result of a disk probe or delete for one model on one host."""

    # Host the operation targeted ("?" when no host was configured).
    host: str
    # Whether the model's cache directory is present on this host.
    on_disk: bool
    # Size in bytes: measured size for a probe, bytes freed for a delete.
    size_bytes: int = 0
    # Error text when the operation could not be carried out.
    error: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class DiskStatus:
    """Aggregated disk state for one model across the relevant hosts."""

    # Hugging Face repo id the status refers to.
    repo: str
    on_disk: bool  # True if present on AT LEAST one host
    total_bytes: int  # sum across hosts
    # Individual per-host results the aggregate was built from.
    per_host: list[HostDiskResult]
|
||||
|
||||
|
||||
async def probe_host(host: str, user: str, repo: str, settings: Settings) -> HostDiskResult:
    """Return whether the model's cache dir exists on this host and its size.

    Args:
        host: SSH host to inspect.
        user: SSH user on that host.
        repo: Hugging Face repo id in ``org/name`` form.
        settings: Application settings forwarded to ``ssh_run``.

    Returns:
        A ``HostDiskResult``; ``error`` is set when the host is not
        configured, the remote command fails, or its output is unparsable.
    """
    if not host or not user:
        return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
    path = _cache_path(repo)
    # shlex.quote() wraps anything containing "$" in single quotes, which
    # would stop the remote shell from expanding $HOME and make the
    # directory test fail every time. Keep $HOME expandable (double-quoted)
    # and quote only the remainder, which carries the repo-derived name.
    home_prefix = "$HOME/"
    if path.startswith(home_prefix):
        quoted = '"$HOME"/' + shlex.quote(path[len(home_prefix):])
    else:
        quoted = shlex.quote(path)
    # `du -sb` prints bytes; if the dir doesn't exist, `du` returns non-zero.
    # We test existence explicitly first so we can report on_disk=False cleanly.
    cmd = (
        f"if [ -d {quoted} ]; then "
        f"du -sb {quoted} 2>/dev/null | awk '{{print $1}}'; "
        f"else echo MISSING; fi"
    )
    rc, out, err = await ssh_run(host, user, cmd, settings, timeout=20.0)
    if rc != 0:
        return HostDiskResult(host=host, on_disk=False, error=(err or out).strip() or f"rc={rc}")
    raw = out.strip()
    if raw == "MISSING" or raw == "":
        return HostDiskResult(host=host, on_disk=False)
    try:
        # The size is taken from the last line of output.
        size = int(raw.splitlines()[-1])
    except ValueError:
        return HostDiskResult(host=host, on_disk=False, error=f"unparsable du output: {raw!r}")
    return HostDiskResult(host=host, on_disk=True, size_bytes=size)
|
||||
|
||||
|
||||
async def probe_disk(repo: str, mode: str, settings: Settings) -> DiskStatus:
    """Probe one model on the Sparks relevant to ``mode`` and aggregate.

    Spark 1 is always probed; Spark 2 is added when ``mode`` is
    ``"cluster"`` and a Spark 2 host is configured. The probes run
    concurrently.
    """
    targets: list[tuple[str, str]] = [(settings.spark1_host, settings.spark1_user)]
    if mode == "cluster" and settings.spark2_host:
        targets.append((settings.spark2_host, settings.spark2_user))

    pending = [probe_host(h, u, repo, settings) for h, u in targets]
    per_host = list(await asyncio.gather(*pending))
    return DiskStatus(
        repo=repo,
        on_disk=any(entry.on_disk for entry in per_host),
        total_bytes=sum(entry.size_bytes for entry in per_host),
        per_host=per_host,
    )
|
||||
|
||||
|
||||
async def delete_host(host: str, user: str, repo: str, settings: Settings) -> HostDiskResult:
    """Probe + rm -rf on one host. Returns bytes freed (0 if the dir wasn't there).

    Args:
        host: SSH host to clean.
        user: SSH user on that host.
        repo: Hugging Face repo id in ``org/name`` form.
        settings: Application settings forwarded to ``ssh_run``.

    Returns:
        A ``HostDiskResult`` with ``on_disk=False``; ``size_bytes`` holds the
        bytes freed, and ``error`` is set when the host is not configured or
        the remote command fails.
    """
    if not host or not user:
        return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
    path = _cache_path(repo)
    # shlex.quote() wraps anything containing "$" in single quotes, which
    # would stop the remote shell from expanding $HOME, so the directory
    # would never be found and nothing would ever be removed. Keep $HOME
    # expandable (double-quoted) and quote only the remainder, which carries
    # the repo-derived name.
    home_prefix = "$HOME/"
    if path.startswith(home_prefix):
        quoted = '"$HOME"/' + shlex.quote(path[len(home_prefix):])
    else:
        quoted = shlex.quote(path)
    # Compute size first, then remove. If absent, still return success (idempotent).
    cmd = (
        f"set -e; "
        f"P={quoted}; "
        f"if [ -d \"$P\" ]; then "
        f"  SIZE=$(du -sb \"$P\" 2>/dev/null | awk '{{print $1}}'); "
        f"  rm -rf -- \"$P\"; "
        f"  echo FREED $SIZE; "
        f"else "
        f"  echo FREED 0; "
        f"fi"
    )
    rc, out, err = await ssh_run(host, user, cmd, settings, timeout=120.0)
    if rc != 0:
        return HostDiskResult(host=host, on_disk=False, error=(err or out).strip() or f"rc={rc}")
    # Parse the "FREED N" line; an unparsable size counts as 0 bytes freed.
    freed = 0
    for line in out.splitlines():
        parts = line.strip().split()
        if len(parts) == 2 and parts[0] == "FREED":
            try:
                freed = int(parts[1])
            except ValueError:
                pass
            break
    return HostDiskResult(host=host, on_disk=False, size_bytes=freed)
|
||||
|
||||
|
||||
async def delete_from_disk(repo: str, mode: str, settings: Settings) -> DiskStatus:
    """Remove the model's cache dir on the Sparks relevant to ``mode``.

    Spark 1 is always targeted; Spark 2 is added when ``mode`` is
    ``"cluster"`` and a Spark 2 host is configured. The returned status has
    ``on_disk=False`` and ``total_bytes`` set to the bytes freed across
    hosts.
    """
    targets: list[tuple[str, str]] = [(settings.spark1_host, settings.spark1_user)]
    if mode == "cluster" and settings.spark2_host:
        targets.append((settings.spark2_host, settings.spark2_user))

    pending = [delete_host(h, u, repo, settings) for h, u in targets]
    per_host = list(await asyncio.gather(*pending))
    freed_total = sum(entry.size_bytes for entry in per_host)
    # The aggregate is reported as absent regardless of per-host outcomes.
    return DiskStatus(repo=repo, on_disk=False, total_bytes=freed_total, per_host=per_host)
|
||||
+14
-5
@@ -19,7 +19,7 @@ from .config import Settings
|
||||
from .ssh import ssh_stream, StreamHandle
|
||||
|
||||
|
||||
Mode = Literal["solo", "cluster"]
|
||||
Mode = Literal["spark1", "spark2", "cluster"]
|
||||
|
||||
|
||||
_TQDM_RE = re.compile(
|
||||
@@ -113,17 +113,26 @@ class DownloadManager:
|
||||
|
||||
async def _do(self, job: DownloadJob) -> None:
|
||||
s = self.settings
|
||||
if not s.spark1_host or not s.spark1_user:
|
||||
raise RuntimeError("spark1 not configured")
|
||||
# Pick the SSH target and hf-download flags from the mode.
|
||||
if job.mode == "spark2":
|
||||
target_host, target_user = s.spark2_host, s.spark2_user
|
||||
flags = ""
|
||||
elif job.mode == "cluster":
|
||||
target_host, target_user = s.spark1_host, s.spark1_user
|
||||
flags = "-c --copy-parallel"
|
||||
else: # spark1
|
||||
target_host, target_user = s.spark1_host, s.spark1_user
|
||||
flags = ""
|
||||
if not target_host or not target_user:
|
||||
raise RuntimeError(f"{job.mode} host not configured")
|
||||
|
||||
flags = "-c --copy-parallel" if job.mode == "cluster" else ""
|
||||
cmd = f"cd ~/spark-vllm-docker && ./hf-download.sh {job.repo} {flags}".strip()
|
||||
job.append(f"$ {cmd}")
|
||||
job.state = "downloading"
|
||||
job.progress.phase = "Connecting to Hugging Face…"
|
||||
|
||||
handle = StreamHandle()
|
||||
async for line in ssh_stream(s.spark1_host, s.spark1_user, cmd, s, handle=handle):
|
||||
async for line in ssh_stream(target_host, target_user, cmd, s, handle=handle):
|
||||
job.append(line)
|
||||
self._update_progress(job, line)
|
||||
|
||||
|
||||
@@ -0,0 +1,137 @@
|
||||
"""Per-Spark hardware snapshots: RAM, disk, GPU memory + utilization, CPU load, uptime.
|
||||
|
||||
Drives via a single SSH command per Spark that runs `free`, `df`, `nvidia-smi`,
|
||||
`/proc/loadavg`, and `uptime -p` and prints labeled lines back. We parse those
|
||||
labels in `_parse`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
from .config import Settings
|
||||
from .connectivity import record_mac, record_state
|
||||
from .ssh import ssh_run
|
||||
|
||||
|
||||
_PROBE = r"""
|
||||
set -e
|
||||
echo HOSTNAME=$(hostname)
|
||||
echo UPTIME=$(uptime -p 2>/dev/null || uptime)
|
||||
echo LOAD=$(awk '{print $1, $2, $3}' /proc/loadavg)
|
||||
echo CORES=$(nproc 2>/dev/null || echo 0)
|
||||
echo MEMORY=$(free -b 2>/dev/null | awk '/^Mem:/ {print $2, $3}')
|
||||
echo DISK=$(df -B1 / 2>/dev/null | awk 'NR==2 {print $2, $3}')
|
||||
echo GPU=$(nvidia-smi --query-gpu=name,utilization.gpu,temperature.gpu,power.draw,memory.total --format=csv,noheader,nounits 2>/dev/null | head -1)
|
||||
echo GPU_MEM_USED_MIB=$(nvidia-smi --query-compute-apps=used_gpu_memory --format=csv,noheader,nounits 2>/dev/null | awk '{s+=$1} END {print s+0}')
|
||||
DEFIF=$(ip route show default 2>/dev/null | awk '{print $5; exit}')
|
||||
echo MAC=$(cat /sys/class/net/$DEFIF/address 2>/dev/null)
|
||||
""".strip()
|
||||
|
||||
|
||||
def _parse_int(s: str) -> int | None:
    """Return *s* converted to an int, or None when it is not a valid integer."""
    try:
        value = int(s)
    except (TypeError, ValueError):
        # e.g. "", "[N/A]", "3.5" or None
        return None
    return value
|
||||
|
||||
|
||||
def _parse(out: str) -> dict:
    """Convert the probe script's `LABEL=value` output into a snapshot dict.

    `hostname`, `uptime` and `cores` are always present (possibly None). The
    remaining keys are only added when the matching label carried a usable
    value; numeric fields that fail to parse are stored as None.
    """
    # Raw label -> value map. Lines without "=" are ignored; labels are
    # lower-cased and a repeated label keeps its last value.
    fields: dict[str, str] = {}
    for line in out.splitlines():
        label, sep, value = line.partition("=")
        if not sep:
            continue
        fields[label.strip().lower()] = value.strip()

    snapshot: dict[str, Any] = {
        "hostname": fields.get("hostname"),
        "uptime": fields.get("uptime"),
        "cores": _parse_int(fields.get("cores", "")),
    }

    # Load average -> [1m, 5m, 15m]
    load_raw = fields.get("load")
    if load_raw:
        try:
            snapshot["load"] = [float(tok) for tok in load_raw.split()[:3]]
        except ValueError:
            snapshot["load"] = None

    # Memory: "<total> <used>" in bytes
    mem_tokens = fields.get("memory", "").split()
    if len(mem_tokens) == 2:
        snapshot["ram_total_bytes"] = _parse_int(mem_tokens[0])
        snapshot["ram_used_bytes"] = _parse_int(mem_tokens[1])

    # Disk: "<total> <used>" in bytes
    disk_tokens = fields.get("disk", "").split()
    if len(disk_tokens) == 2:
        snapshot["disk_total_bytes"] = _parse_int(disk_tokens[0])
        snapshot["disk_used_bytes"] = _parse_int(disk_tokens[1])

    # GPU: "name, util_gpu, temp_C, power_W, memory_total_MiB"
    gpu_raw = fields.get("gpu")
    if gpu_raw:
        cols = [col.strip() for col in gpu_raw.split(",")]
        if len(cols) >= 5:
            name, util, temp, power, mem_total = cols[:5]
            snapshot["gpu_name"] = name
            snapshot["gpu_util_pct"] = _parse_int(util)
            snapshot["gpu_temp_c"] = _parse_int(temp)
            try:
                snapshot["gpu_power_w"] = float(power)
            except ValueError:
                snapshot["gpu_power_w"] = None
            # memory.total may be "[N/A]" on unified-memory systems (DGX Spark);
            # an unparseable total is what flags the GPU as unified-memory.
            total_mib = _parse_int(mem_total)
            snapshot["gpu_mem_total_mib"] = total_mib
            snapshot["gpu_unified_memory"] = total_mib is None

    # Per-process compute memory, already summed by the probe script.
    used_raw = fields.get("gpu_mem_used_mib")
    if used_raw:
        snapshot["gpu_mem_used_mib"] = _parse_int(used_raw)

    # MAC of the default-route interface (used for Wake-on-LAN).
    mac_raw = fields.get("mac")
    if mac_raw:
        snapshot["mac"] = mac_raw.lower()

    return snapshot
|
||||
|
||||
|
||||
class HardwareProbe:
    """Fetch hardware snapshots from both Sparks, caching each result briefly.

    A reachable result is reused for `ttl_sec` seconds and an unreachable one
    for `fail_ttl_sec` seconds, so repeated calls do not hammer the Sparks.
    """

    def __init__(self, settings: Settings, ttl_sec: float = 4.0, fail_ttl_sec: float = 25.0) -> None:
        self.settings = settings
        self.ttl_sec = ttl_sec
        self.fail_ttl_sec = fail_ttl_sec
        # key -> (time.monotonic() stamp taken before the probe ran, result)
        self._cache: dict[str, tuple[float, dict]] = {}
        # One lock per key so concurrent callers share a single SSH probe.
        self._locks: dict[str, asyncio.Lock] = {}

    def _ttl_for(self, value: dict) -> float:
        """Return how long *value* may be served from the cache, in seconds."""
        if value.get("reachable"):
            return self.ttl_sec
        return self.fail_ttl_sec

    def _lock(self, key: str) -> asyncio.Lock:
        """Return the lock for *key*, creating it on first use."""
        lock = self._locks.get(key)
        if lock is None:
            lock = self._locks[key] = asyncio.Lock()
        return lock

    async def fetch(self) -> dict:
        """Probe both Sparks concurrently; returns {"spark1": ..., "spark2": ...}."""
        cfg = self.settings
        first, second = await asyncio.gather(
            self._one("spark1", cfg.spark1_host, cfg.spark1_user),
            self._one("spark2", cfg.spark2_host, cfg.spark2_user),
        )
        return {"spark1": first, "spark2": second}

    async def _one(self, key: str, host: str, user: str) -> dict:
        """Return the (possibly cached) snapshot for one Spark.

        Also records the reachability verdict, and the MAC when one was
        reported, in the connectivity log.
        """
        if not (host and user):
            return {"reachable": False, "configured": False}

        async with self._lock(key):
            started = time.monotonic()
            entry = self._cache.get(key)
            if entry:
                stamp, value = entry
                # A fresh entry (success or failure) is served without SSH.
                if started - stamp < self._ttl_for(value):
                    return value

            # The probe is capped at 6 s.
            rc, out, err = await ssh_run(host, user, _PROBE, self.settings, timeout=6)
            if rc != 0:
                message = err.strip() or out.strip() or f"rc={rc}"
                failure = {"reachable": False, "configured": True, "host": host, "error": message}
                self._cache[key] = (started, failure)
                record_state(key, False)
                return failure

            parsed = _parse(out)
            snapshot = {"reachable": True, "configured": True, "host": host, **parsed}
            self._cache[key] = (started, snapshot)
            record_state(key, True)
            mac = parsed.get("mac")
            if mac:
                record_mac(key, parsed["mac"])
            return snapshot
|
||||
@@ -0,0 +1,202 @@
|
||||
"""NVIDIA NIM container install / lifecycle.
|
||||
|
||||
Two pieces:
|
||||
* A small curated catalog of NIM images (so users don't have to copy/paste
|
||||
huge nvcr.io URLs).
|
||||
* An installer that SSHes into the target Spark, runs `docker pull` then
|
||||
`docker run -d --gpus all -p PORT:PORT -v VOLUME:/opt/nim/.cache
|
||||
-e NGC_API_KEY=... IMAGE` and streams output.
|
||||
|
||||
Custom services also persist via `overrides.add_custom_service()` so the
|
||||
Services panel can show them.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
from .config import Settings
|
||||
from .ssh import ssh_stream, StreamHandle
|
||||
|
||||
|
||||
# Link to the full NGC container catalog for anything not listed below.
CATALOG_URL = "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers"

# Curated list: the NIM containers most useful for a dual-Spark audio-and-LLM
# setup, so users don't have to copy/paste long nvcr.io image URLs. Each entry
# carries the image reference plus a suggested container name and port.
SUGGESTED_NIMS: list[dict] = [
    {
        "key": "parakeet-tdt-0.6b-v3",
        "name": "Parakeet TDT 0.6B v3",
        "image": "nvcr.io/nim/nvidia/parakeet-tdt-0-6b-v3:latest",
        "default_container": "parakeet-asr",
        "default_port": 8000,
        "kind": "stt",
        "description": "Streaming speech-to-text (English). Used by Open WebUI for voice input. ~1 GB.",
        "homepage": "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/parakeet-tdt-0-6b-v3",
    },
    {
        "key": "magpie-tts-multilingual",
        "name": "Magpie TTS Multilingual",
        "image": "nvcr.io/nim/nvidia/magpie-tts-multilingual:latest",
        "default_container": "magpie-tts",
        "default_port": 9000,
        "kind": "tts",
        "description": "Multilingual text-to-speech. Counterpart to Parakeet for 'read aloud'. ~3 GB.",
        "homepage": "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/magpie-tts-multilingual",
    },
    {
        "key": "riva-multilingual",
        "name": "Riva Multilingual ASR",
        "image": "nvcr.io/nim/nvidia/riva-multilingual:latest",
        "default_container": "riva-asr",
        "default_port": 8001,
        "kind": "stt",
        "description": "NVIDIA Riva speech-recognition multi-language model. Larger and more accurate than Parakeet.",
        "homepage": "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia",
    },
]
|
||||
|
||||
|
||||
@dataclass
class NimInstallJob:
    """State and captured output of a single NIM install run."""

    id: str
    image: str
    container: str
    port: int
    host: str
    user: str
    volume: Optional[str]
    started_at: str
    # Lifecycle: starting | pulling | running | done | failed
    state: str = "starting"
    # Human-readable progress text.
    phase: str = "Starting…"
    # Output lines kept for this job (newest 1000 only, see `append`).
    lines: list[str] = field(default_factory=list)
    returncode: Optional[int] = None
    finished_at: Optional[str] = None

    def append(self, line: str) -> None:
        """Record one output line, discarding the oldest beyond 1000."""
        self.lines.append(line)
        overflow = len(self.lines) - 1000
        if overflow > 0:
            del self.lines[:overflow]
|
||||
|
||||
|
||||
class NimManager:
|
||||
def __init__(self, settings: Settings) -> None:
|
||||
self.settings = settings
|
||||
self.lock = asyncio.Lock()
|
||||
self.jobs: dict[str, NimInstallJob] = {}
|
||||
self.current_job_id: Optional[str] = None
|
||||
|
||||
def get(self, job_id: str) -> NimInstallJob | None:
|
||||
return self.jobs.get(job_id)
|
||||
|
||||
async def trigger(
|
||||
self,
|
||||
*,
|
||||
image: str,
|
||||
container: str,
|
||||
port: int,
|
||||
host: str,
|
||||
user: str,
|
||||
volume: str | None = None,
|
||||
extra_env: dict[str, str] | None = None,
|
||||
) -> NimInstallJob:
|
||||
if self.lock.locked():
|
||||
raise RuntimeError("Another NIM install is already in progress")
|
||||
if not host or not user:
|
||||
raise RuntimeError("target host not configured")
|
||||
if not self.settings.ngc_api_key:
|
||||
raise RuntimeError(
|
||||
"NGC_API_KEY is not set. Open Configure Sparks in StartOS and paste your NGC personal API key (free at https://ngc.nvidia.com/setup/personal-key)."
|
||||
)
|
||||
|
||||
job = NimInstallJob(
|
||||
id=uuid.uuid4().hex[:8],
|
||||
image=image,
|
||||
container=container,
|
||||
port=port,
|
||||
host=host,
|
||||
user=user,
|
||||
volume=volume or f"{container}-cache",
|
||||
started_at=datetime.now(timezone.utc).isoformat(),
|
||||
)
|
||||
self.jobs[job.id] = job
|
||||
self.current_job_id = job.id
|
||||
asyncio.create_task(self._run(job, extra_env or {}))
|
||||
return job
|
||||
|
||||
async def _run(self, job: NimInstallJob, extra_env: dict[str, str]) -> None:
|
||||
async with self.lock:
|
||||
try:
|
||||
await self._do(job, extra_env)
|
||||
if job.state != "failed":
|
||||
job.state = "done"
|
||||
job.returncode = 0
|
||||
job.phase = "Done"
|
||||
except Exception as e:
|
||||
job.append(f"[error] {type(e).__name__}: {e}")
|
||||
job.state = "failed"
|
||||
if job.returncode is None:
|
||||
job.returncode = 1
|
||||
finally:
|
||||
job.finished_at = datetime.now(timezone.utc).isoformat()
|
||||
if self.current_job_id == job.id:
|
||||
self.current_job_id = None
|
||||
|
||||
async def _do(self, job: NimInstallJob, extra_env: dict[str, str]) -> None:
|
||||
# Build the bash one-liner. We use docker login non-interactively with the NGC API key.
|
||||
env_parts = [f'-e NGC_API_KEY=$NGC_API_KEY']
|
||||
for k, v in extra_env.items():
|
||||
env_parts.append(f"-e {k}={v}")
|
||||
env_str = " ".join(env_parts)
|
||||
cmd = (
|
||||
f"set -e; "
|
||||
f"export NGC_API_KEY='{self.settings.ngc_api_key}'; "
|
||||
f"echo '=== docker login nvcr.io ==='; "
|
||||
f"echo \"$NGC_API_KEY\" | docker login nvcr.io -u '$oauthtoken' --password-stdin; "
|
||||
f"echo '=== docker pull {job.image} (this can be 1-10 GB) ==='; "
|
||||
f"docker pull {job.image}; "
|
||||
f"echo '=== remove any prior container with the same name ==='; "
|
||||
f"docker rm -f {job.container} 2>/dev/null || true; "
|
||||
f"echo '=== docker run -d --gpus all -p {job.port}:{job.port} -v {job.volume}:/opt/nim/.cache {env_str} --name {job.container} --restart unless-stopped {job.image} ==='; "
|
||||
f"docker run -d --gpus all "
|
||||
f"-p {job.port}:{job.port} "
|
||||
f"-v {job.volume}:/opt/nim/.cache "
|
||||
f"{env_str} "
|
||||
f"--name {job.container} "
|
||||
f"--restart unless-stopped "
|
||||
f"{job.image}; "
|
||||
f"echo '=== ensuring cache volume is writable by uid 1000 (riva-server) ==='; "
|
||||
f"docker run --rm -v {job.volume}:/cache alpine chown -R 1000:1000 /cache && "
|
||||
f"docker restart {job.container}; "
|
||||
f"echo '=== install complete; container is starting up and will download its model on first boot ==='"
|
||||
)
|
||||
job.append(f"$ <install command for {job.image} on {job.host}>")
|
||||
job.state = "pulling"
|
||||
job.phase = "Pulling image from nvcr.io (this can take a few minutes)…"
|
||||
|
||||
handle = StreamHandle()
|
||||
async for line in ssh_stream(job.host, job.user, cmd, self.settings, handle=handle):
|
||||
# Don't log lines containing the api key
|
||||
if self.settings.ngc_api_key and self.settings.ngc_api_key in line:
|
||||
continue
|
||||
job.append(line)
|
||||
if "docker pull" in line:
|
||||
job.phase = "Pulling image from nvcr.io…"
|
||||
elif "Login Succeeded" in line:
|
||||
job.phase = "Logged in to NGC; pulling image…"
|
||||
elif "Pull complete" in line:
|
||||
job.phase = "Pulling layers…"
|
||||
elif "Status: Downloaded newer image" in line or "Image is up to date" in line:
|
||||
job.phase = "Image ready; starting container…"
|
||||
elif "docker run -d" in line:
|
||||
job.state = "running"
|
||||
job.phase = "Container starting; downloading model on first boot…"
|
||||
|
||||
rc = handle.returncode or 0
|
||||
if rc != 0:
|
||||
job.state = "failed"
|
||||
job.returncode = rc
|
||||
+402
-1
@@ -10,14 +10,22 @@ from pydantic import BaseModel
|
||||
from typing import Literal
|
||||
|
||||
from .config import Settings
|
||||
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
||||
from .custom_services import add_custom_service, delete_custom_service
|
||||
from .deep_health import DeepHealth
|
||||
from .disk import delete_from_disk, probe_disk
|
||||
from .download import DownloadManager
|
||||
from .hardware import HardwareProbe
|
||||
from .health import check_magpie, check_parakeet, check_vllm
|
||||
from .models import load_catalog
|
||||
from .nim import SUGGESTED_NIMS, CATALOG_URL, NimManager
|
||||
from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs
|
||||
from .services import docker_state, run_action, services_from_settings
|
||||
from .ssh import ssh_run
|
||||
from .swap import SwapManager
|
||||
from .updates import UpdateManager, get_update_status
|
||||
from .validate import validate_launch
|
||||
from .wol import send_local_broadcast, send_via_peer
|
||||
|
||||
|
||||
settings = Settings.from_env()
|
||||
@@ -25,9 +33,24 @@ catalog = load_catalog(settings.models_yaml)
|
||||
swap_manager = SwapManager(settings, catalog)
|
||||
download_manager = DownloadManager(settings)
|
||||
update_manager = UpdateManager(settings)
|
||||
hardware_probe = HardwareProbe(settings)
|
||||
nim_manager = NimManager(settings)
|
||||
deep_health = DeepHealth(settings)
|
||||
|
||||
app = FastAPI(title="spark-control", version="0.1.0")
|
||||
|
||||
|
||||
@app.on_event("startup")
async def _start_deep_health() -> None:
    """Start the periodic deep-health loop as a background task."""
    # Keep a strong reference on app.state: the event loop holds tasks only
    # weakly, so a task nobody references may be garbage-collected mid-run.
    # The loop catches its own exceptions, so the task is never awaited here.
    app.state.deep_health_task = asyncio.create_task(deep_health.run_periodic())
|
||||
|
||||
|
||||
@app.on_event("shutdown")
async def _stop_deep_health() -> None:
    """Ask the deep-health loop to stop when the app shuts down."""
    stop = deep_health.stop
    stop()
|
||||
|
||||
|
||||
_STATIC_DIR = Path(__file__).resolve().parent / "static"
|
||||
app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")
|
||||
|
||||
@@ -44,6 +67,7 @@ async def get_config() -> dict:
|
||||
"spark1_host": settings.spark1_host,
|
||||
"spark2_host": settings.spark2_host,
|
||||
"vllm_port": settings.vllm_port,
|
||||
"open_webui_url": settings.open_webui_url or None,
|
||||
}
|
||||
|
||||
|
||||
@@ -116,6 +140,191 @@ async def del_model(key: str) -> dict:
|
||||
return {"ok": True, "key": key}
|
||||
|
||||
|
||||
@app.get("/api/models/disk-status")
async def get_models_disk_status() -> dict:
    """Report, per catalog model, whether its weights are on disk on the Spark(s).

    All models are probed in parallel. The response maps each model key to
    {on_disk, total_bytes, per_host:[{host,on_disk,size_bytes,error?}]}; a
    probe that raised is reported as not on disk with the error text.
    Intended to be called once on dashboard load.
    """
    if not settings.configured:
        return {"configured": False, "models": {}}

    def _host_row(r) -> dict:
        row = {"host": r.host, "on_disk": r.on_disk, "size_bytes": r.size_bytes}
        if r.error:
            row["error"] = r.error
        return row

    keys = list(catalog.models.keys())
    probes = [
        probe_disk(catalog.models[key].repo, catalog.models[key].mode, settings)
        for key in keys
    ]
    # return_exceptions=True: one failing probe must not sink the others.
    results = await asyncio.gather(*probes, return_exceptions=True)

    models: dict[str, dict] = {}
    for key, res in zip(keys, results):
        if isinstance(res, Exception):
            models[key] = {"on_disk": False, "total_bytes": 0, "per_host": [], "error": str(res)}
        else:
            models[key] = {
                "on_disk": res.on_disk,
                "total_bytes": res.total_bytes,
                "per_host": [_host_row(r) for r in res.per_host],
            }
    return {"configured": True, "models": models}
|
||||
|
||||
|
||||
@app.delete("/api/models/{key}/disk")
async def del_model_disk(key: str) -> dict:
    """Remove a model's weights from the Spark filesystem(s); the catalog entry stays.

    Responds 404 for an unknown key and 409 when the model is the one vLLM
    currently serves, when a swap is in progress, or when this same repo is
    being downloaded. A failed vLLM health check is treated as "nothing
    loaded". The deletion is written to the connectivity log as an audit
    entry.
    """
    if key not in catalog.models:
        raise HTTPException(404, f"unknown model: {key}")
    m = catalog.models[key]

    # Guard 1: the model must not be the one currently loaded.
    try:
        vllm = await check_vllm(settings)
    except Exception:
        vllm = {}
    is_loaded = vllm.get("ok") and vllm.get("current_model") == m.repo
    if is_loaded:
        raise HTTPException(
            409,
            f"'{m.display_name}' is the currently loaded model. Switch to a different model first, then try again."
        )

    # Guard 2: no swap may be in flight.
    if swap_manager.current_job_id:
        raise HTTPException(409, "a model swap is in progress; wait for it to finish")

    # Guard 3: this repo must not be mid-download (another model's download is fine).
    if download_manager.current_job_id:
        active = download_manager.get(download_manager.current_job_id)
        if active and active.repo == m.repo:
            raise HTTPException(409, "this model is currently downloading; cancel or wait for it to finish")

    status = await delete_from_disk(m.repo, m.mode, settings)

    # Audit log
    host_count = len(status.per_host)
    record_report(
        f"disk:{key}",
        ok=True,
        source="disk-delete",
        detail=f"freed {status.total_bytes} bytes across {host_count} host(s)",
    )

    per_host = []
    for r in status.per_host:
        row = {"host": r.host, "size_bytes": r.size_bytes}
        if r.error:
            row["error"] = r.error
        per_host.append(row)

    return {
        "ok": True,
        "key": key,
        "repo": m.repo,
        "bytes_freed": status.total_bytes,
        "per_host": per_host,
    }
|
||||
|
||||
|
||||
@app.get("/api/hardware")
async def get_hardware() -> dict:
    """Return the per-Spark hardware snapshot produced by the hardware probe."""
    snapshot = await hardware_probe.fetch()
    return snapshot
|
||||
|
||||
|
||||
@app.get("/api/connectivity")
async def get_connectivity() -> dict:
    """Return the connectivity summary (up/down transition log and cached MACs)."""
    report = connectivity_summary()
    return report
|
||||
|
||||
|
||||
@app.get("/api/deep-health")
async def get_deep_health() -> dict:
    """Return the deep-health summary for each service's synthetic probe."""
    report = deep_health.summary()
    return report
|
||||
|
||||
|
||||
@app.post("/api/deep-health/{service}/run")
async def run_deep_health(service: str) -> dict:
    """Run one service's deep-health probe immediately and return its result.

    Responds 404 when *service* has no registered probe.
    """
    if service not in deep_health.PROBES:
        raise HTTPException(404, f"unknown service: {service}")
    result = await deep_health.run_one(service)
    # Response keys mirror the result's attribute names.
    exposed = ("ok", "at", "latency_ms", "error", "note")
    return {name: getattr(result, name) for name in exposed}
|
||||
|
||||
|
||||
class HealthEventBody(BaseModel):
    """Request body for POST /api/health-event."""

    # Service the report is about, e.g. "parakeet", "magpie", "vllm".
    service: str
    # True on success, False on failure.
    ok: bool
    # Which app reported it (e.g. "open-webui").
    source: str | None = None
    # Optional failure detail.
    error: str | None = None
    # Optional latency.
    ms: int | None = None
|
||||
|
||||
|
||||
@app.post("/api/health-event")
async def post_health_event(body: HealthEventBody) -> dict:
    """Record a success/failure report pushed by an external LAN app.

    Polling can miss a brief outage; an app that calls one of our services
    can POST here so the blip still lands in the connectivity history.
    Responds 400 when `service` is blank. `source` defaults to "external".

    Example:
        curl -X POST http://<dashboard>/api/health-event \\
            -H 'content-type: application/json' \\
            -d '{"service":"parakeet","ok":false,"error":"503","source":"open-webui","ms":420}'
    """
    service = body.service.strip()
    if not service:
        raise HTTPException(400, "service is required")

    source = (body.source or "external").strip()
    detail = (body.error or "").strip()
    event = record_report(
        service,
        ok=body.ok,
        source=source,
        detail=detail,
        latency_ms=body.ms,
    )
    return {"ok": True, "recorded": event}
|
||||
|
||||
|
||||
@app.post("/api/spark/{name}/wake")
async def wake_spark(name: str) -> dict:
    """Send a Wake-on-LAN magic packet to the named Spark.

    The other Spark is tried first (when configured), because the packet has
    to originate on the target's LAN segment to be reliable; otherwise a UDP
    broadcast is sent directly from this container. Responds 404 for an
    unknown name, 400 when no MAC has been recorded yet, and 500 when the
    fallback broadcast raises.
    """
    if name not in ("spark1", "spark2"):
        raise HTTPException(404, f"unknown spark: {name}")
    mac = get_mac(name)
    if not mac:
        raise HTTPException(400, f"MAC for {name} not yet known; bring it up once so we can probe it, then this will work next time it sleeps")

    # Work out which Spark is the peer and how to reach it.
    if name == "spark1":
        other, other_host, other_user = "spark2", settings.spark2_host, settings.spark2_user
    else:
        other, other_host, other_user = "spark1", settings.spark1_host, settings.spark1_user

    delivered_via = None
    via_peer_ok, via_peer_err = False, ""
    if other_host and other_user:
        via_peer_ok, via_peer_err = await send_via_peer(other_host, other_user, mac, settings)

    if via_peer_ok:
        delivered_via = other
    else:
        # Fall back to direct from this container
        try:
            send_local_broadcast(mac)
        except Exception as e:
            raise HTTPException(500, f"WoL failed: peer={via_peer_err!r} container={e!r}")
        delivered_via = "container"

    return {"ok": True, "spark": name, "mac": mac, "delivered_via": delivered_via}
|
||||
|
||||
|
||||
@app.get("/api/services")
|
||||
async def get_services() -> dict:
|
||||
"""Lifecycle state of always-on support services (Parakeet, Magpie, …).
|
||||
@@ -158,9 +367,113 @@ async def get_services() -> dict:
|
||||
results = await asyncio.gather(*[one(n) for n in services.keys()])
|
||||
for name, info in results:
|
||||
out[name] = info
|
||||
# Feed http reachability into the connectivity log (transition-only)
|
||||
record_state(name, bool(info.get("http_ready")))
|
||||
return out
|
||||
|
||||
|
||||
@app.get("/api/nim/catalog")
async def get_nim_catalog() -> dict:
    """Return the suggested NIM images, the catalog link, and whether an NGC key is set."""
    has_key = bool(settings.ngc_api_key)
    return {
        "catalog_url": CATALOG_URL,
        "ngc_key_configured": has_key,
        "suggested": SUGGESTED_NIMS,
    }
|
||||
|
||||
|
||||
class NimInstallBody(BaseModel):
    """Request body for POST /api/nim/install."""

    image: str
    container: str
    port: int
    # Which Spark to install on.
    host: Literal["spark1", "spark2"] = "spark2"
    kind: str = ""
    # When true, the container is also written to the custom services overrides.
    register: bool = True
|
||||
|
||||
|
||||
@app.post("/api/nim/install")
async def post_nim_install(body: NimInstallBody) -> dict:
    """Queue a NIM container install on the chosen Spark and return the job handle.

    Responds 409 when another install is already in progress and 400 for any
    other precondition failure reported by the manager. When `register` is
    set, the container is written to the custom services list as soon as the
    job is queued (not after the install completes), so the Services panel
    can show it.
    """
    if body.host == "spark1":
        target_host, target_user = settings.spark1_host, settings.spark1_user
    else:
        target_host, target_user = settings.spark2_host, settings.spark2_user

    try:
        job = await nim_manager.trigger(
            image=body.image,
            container=body.container,
            port=body.port,
            host=target_host,
            user=target_user,
        )
    except RuntimeError as e:
        message = str(e)
        status = 409 if "in progress" in message else 400
        raise HTTPException(status, message)

    if body.register:
        service_entry = {
            "key": body.container,
            "kind": body.kind or "nim",
            "host": target_host,
            "user": target_user,
            "container": body.container,
            "port": body.port,
            "image": body.image,
        }
        add_custom_service(service_entry)

    return {"job_id": job.id, "image": job.image, "container": job.container, "state": job.state}
|
||||
|
||||
|
||||
@app.get("/api/nim/install/{job_id}")
async def get_nim_install(job_id: str) -> dict:
    """Return the full state of one NIM install job, including its output lines.

    Responds 404 when the job id is unknown.
    """
    job = nim_manager.get(job_id)
    if job is None:
        raise HTTPException(404, "no such job")
    # Response keys mirror the job's attribute names.
    exposed = (
        "id", "image", "container", "port", "host", "state", "phase",
        "started_at", "finished_at", "returncode", "lines",
    )
    return {name: getattr(job, name) for name in exposed}
|
||||
|
||||
|
||||
@app.get("/api/nim/install/{job_id}/stream")
async def stream_nim_install(job_id: str):
    """Stream a NIM install job's output as server-sent events.

    Emits a plain `data:` event per output line, a `phase` event whenever the
    phase text changes, and a final `done` event once the job has a return
    code and every buffered line has been sent. Responds 404 for an unknown
    job id.
    """
    job = nim_manager.get(job_id)
    if job is None:
        raise HTTPException(404, "no such job")

    async def gen():
        # NOTE(review): the cursor is an index into job.lines, but
        # NimInstallJob.append drops the oldest entries beyond 1000, so once
        # the cap is hit the length stops growing and later lines are not
        # streamed.
        cursor = 0
        reported_phase = None
        while True:
            available = len(job.lines)
            if available > cursor:
                for text in job.lines[cursor:available]:
                    yield f"data: {json.dumps({'line': text})}\n\n"
                cursor = available
            if job.phase != reported_phase:
                yield f"event: phase\ndata: {json.dumps({'state': job.state, 'phase': job.phase})}\n\n"
                reported_phase = job.phase
            finished = job.returncode is not None
            if finished and cursor >= len(job.lines):
                yield f"event: done\ndata: {json.dumps({'state': job.state, 'returncode': job.returncode})}\n\n"
                return
            await asyncio.sleep(0.5)

    return StreamingResponse(gen(), media_type="text/event-stream")
|
||||
|
||||
|
||||
@app.delete("/api/services/{name}")
async def del_service(name: str) -> dict:
    """Remove a custom service entry; the bundled services are refused with 400."""
    # Only custom services may be deleted (not the bundled parakeet/magpie keys).
    builtin = ("parakeet", "magpie")
    if name in builtin:
        raise HTTPException(400, "built-in service; cannot delete (use Configure Sparks to point at a different host)")
    delete_custom_service(name)
    return {"ok": True, "name": name}
|
||||
|
||||
|
||||
@app.post("/api/services/{name}/{action}")
|
||||
async def service_action(name: str, action: str) -> dict:
|
||||
services = services_from_settings(settings)
|
||||
@@ -212,6 +525,10 @@ async def get_status() -> dict:
|
||||
check_parakeet(settings),
|
||||
check_magpie(settings),
|
||||
)
|
||||
# Feed health into the connectivity log (deduped — only logs on transition)
|
||||
record_state("vllm", bool(vllm.get("ok")))
|
||||
record_state("parakeet", bool(parakeet.get("ok")))
|
||||
record_state("magpie", bool(magpie.get("ok")))
|
||||
current_key = _identify_current_model(vllm.get("current_model"))
|
||||
return {
|
||||
"configured": settings.configured,
|
||||
@@ -237,6 +554,15 @@ class SwapRequest(BaseModel):
|
||||
dry_run: bool = False
|
||||
|
||||
|
||||
@app.post("/api/swap/{key}/validate")
async def validate_swap(key: str) -> dict:
    """Pre-flight check of the proposed launch command for model *key*.

    Runs vLLM's argparse layer against the command WITHOUT starting an
    engine, so it is cheap (~5 s) and leaves the currently-loaded model alone.
    """
    verdict = await validate_launch(key, catalog, settings)
    return verdict
|
||||
|
||||
|
||||
@app.post("/api/swap")
|
||||
async def post_swap(req: SwapRequest) -> dict:
|
||||
if not settings.configured and not req.dry_run:
|
||||
@@ -297,7 +623,7 @@ async def stream_swap(job_id: str):
|
||||
|
||||
class DownloadRequest(BaseModel):
|
||||
repo: str
|
||||
mode: Literal["solo", "cluster"] = "solo"
|
||||
mode: Literal["spark1", "spark2", "cluster"] = "spark1"
|
||||
|
||||
|
||||
@app.post("/api/download")
|
||||
@@ -376,6 +702,81 @@ async def get_updates() -> dict:
|
||||
return await get_update_status(settings)
|
||||
|
||||
|
||||
@app.get("/api/explain-updates")
async def explain_updates():
    """Stream a plain-language explanation of the pending upstream commits.

    The currently loaded vLLM model writes the explanation. The response is a
    server-sent-event stream: `data:` events carry `content` or `reasoning`
    text chunks (or an `error`), followed by one `done` event. When a
    precondition fails (update status unavailable, no model loaded, no
    pending commits) the stream is a single `done` event carrying `error`.
    """
    import httpx

    def _sse_done(payload: dict) -> StreamingResponse:
        """Build a one-shot SSE response containing a single `done` event."""
        async def once():
            yield f"event: done\ndata: {json.dumps(payload)}\n\n"
        return StreamingResponse(once(), media_type="text/event-stream")

    info = await get_update_status(settings)
    if not info.get("ok"):
        return _sse_done({'error': info.get('error', 'unknown')})

    vllm = await check_vllm(settings)
    if not vllm.get("ok") or not vllm.get("current_model"):
        return _sse_done({'error': 'no vLLM model loaded — swap to a model first'})

    commits = "\n".join(info.get("log", []))
    if not commits.strip():
        return _sse_done({'error': 'no pending commits'})

    prompt = (
        "You are reviewing pending git commits to `eugr/spark-vllm-docker`, an upstream community project that "
        "orchestrates vLLM on dual NVIDIA DGX Spark hardware (Blackwell GPUs, cluster via Ray, recipes per model). "
        "The reader has a setup running models like Qwen3.6-35B-A3B-NVFP4 (daily driver, solo), Qwen3-VL 235B (cluster), "
        "and Gemma 4 31B. The reader is technically literate but is NOT a vLLM expert.\n\n"
        "For the commit list below: give a short overall verdict (Apply / Optional / Skip and why), then a brief "
        "bullet per commit grouping similar ones. Call out anything that would break a working setup or that "
        "requires re-downloading models. Avoid jargon. ~250 words max.\n\n"
        f"Pending commits:\n{commits}"
    )

    async def gen():
        failed = False
        try:
            async with httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=5.0)) as c:
                async with c.stream(
                    "POST",
                    f"{vllm['base_url']}/chat/completions",
                    json={
                        "model": vllm["current_model"],
                        "stream": True,
                        "messages": [{"role": "user", "content": prompt}],
                        "max_tokens": 600,
                        "temperature": 0.4,
                    },
                ) as r:
                    r.raise_for_status()
                    async for line in r.aiter_lines():
                        if not line.startswith("data: "):
                            continue
                        data = line[6:].strip()
                        if data == "[DONE]":
                            break
                        try:
                            chunk = json.loads(data)
                        except json.JSONDecodeError:
                            # Skip a malformed chunk rather than abort the stream.
                            continue
                        choices = chunk.get("choices") or []
                        if not choices:
                            continue
                        delta = choices[0].get("delta") or {}
                        text = delta.get("content")
                        reasoning = delta.get("reasoning")
                        if text:
                            yield f"data: {json.dumps({'content': text})}\n\n"
                        elif reasoning:
                            yield f"data: {json.dumps({'reasoning': reasoning})}\n\n"
        except Exception as e:
            failed = True
            yield f"data: {json.dumps({'error': f'{type(e).__name__}: {e}'})}\n\n"
        # Report the real outcome: `ok` is false when the stream above failed.
        yield f"event: done\ndata: {json.dumps({'ok': not failed})}\n\n"

    return StreamingResponse(gen(), media_type="text/event-stream")
|
||||
|
||||
|
||||
class UpdateRequest(BaseModel):
|
||||
mode: Literal["solo", "cluster"] = "cluster"
|
||||
|
||||
|
||||
+42
-2
@@ -5,6 +5,7 @@ machinery. We just run `docker start|stop|restart <container>` via SSH on the
|
||||
appropriate host.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, Optional
|
||||
|
||||
@@ -12,6 +13,25 @@ from .config import Settings
|
||||
from .ssh import ssh_run
|
||||
|
||||
|
||||
# Cache the "unreachable" verdict per (host, user) for a short period so that a
# repeated docker_state call doesn't re-pay the 6 s SSH connect timeout each time.
_UNREACHABLE_TTL = 25.0  # seconds a verdict stays valid
# Maps (host, user) -> time.monotonic() reading taken when the host was marked.
_unreachable_cache: dict[tuple[str, str], float] = {}


def _is_recently_unreachable(host: str, user: str) -> bool:
    """Return True if (host, user) was marked unreachable within the TTL."""
    ts = _unreachable_cache.get((host, user))
    # Compare against None explicitly: a stored timestamp of 0.0 is a valid
    # monotonic reading and must not be mistaken for "no entry".
    if ts is None:
        return False
    return time.monotonic() - ts < _UNREACHABLE_TTL


def _mark_unreachable(host: str, user: str) -> None:
    """Record that (host, user) was found unreachable just now."""
    _unreachable_cache[(host, user)] = time.monotonic()


def _clear_unreachable(host: str, user: str) -> None:
    """Forget any unreachable verdict for (host, user); no-op if none exists."""
    _unreachable_cache.pop((host, user), None)
|
||||
|
||||
|
||||
ServiceName = Literal["parakeet", "magpie"]
|
||||
ServiceAction = Literal["start", "stop", "restart"]
|
||||
|
||||
@@ -27,7 +47,8 @@ class ServiceDef:
|
||||
|
||||
|
||||
def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
|
||||
return {
|
||||
from .custom_services import load_custom_services
|
||||
out: dict[str, ServiceDef] = {
|
||||
"parakeet": ServiceDef(
|
||||
name="parakeet",
|
||||
kind="stt",
|
||||
@@ -45,19 +66,38 @@ def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
|
||||
port=s.magpie_port,
|
||||
),
|
||||
}
|
||||
for entry in load_custom_services():
|
||||
key = entry.get("key")
|
||||
if not key or key in out:
|
||||
continue
|
||||
out[key] = ServiceDef(
|
||||
name=key,
|
||||
kind=entry.get("kind", ""),
|
||||
host=entry.get("host", ""),
|
||||
user=entry.get("user", ""),
|
||||
container=entry.get("container", key),
|
||||
port=int(entry.get("port", 0)),
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
async def docker_state(settings: Settings, svc: ServiceDef) -> dict:
|
||||
"""Get docker state (running, exited, restarting, etc.) + restart count."""
|
||||
if not svc.host or not svc.user:
|
||||
return {"state": "unconfigured", "restart_count": None, "uptime": None}
|
||||
if _is_recently_unreachable(svc.host, svc.user):
|
||||
return {"state": "unreachable", "host_unreachable": True, "restart_count": None, "uptime": None}
|
||||
cmd = (
|
||||
f"docker inspect {svc.container} "
|
||||
f"--format '{{{{.State.Status}}}}|{{{{.State.StartedAt}}}}|{{{{.RestartCount}}}}|{{{{.State.ExitCode}}}}|{{{{.State.Error}}}}' "
|
||||
f"2>&1 || echo 'NOT_FOUND'"
|
||||
)
|
||||
rc, out, _ = await ssh_run(svc.host, svc.user, cmd, settings, timeout=10)
|
||||
rc, out, _ = await ssh_run(svc.host, svc.user, cmd, settings, timeout=6)
|
||||
out = out.strip()
|
||||
if rc == 124 or "timeout after" in out.lower():
|
||||
_mark_unreachable(svc.host, svc.user)
|
||||
return {"state": "unreachable", "host_unreachable": True, "restart_count": None, "uptime": None}
|
||||
_clear_unreachable(svc.host, svc.user)
|
||||
if rc != 0 or out.startswith("NOT_FOUND") or "Error" in out and "no such object" in out.lower():
|
||||
return {"state": "missing", "restart_count": None, "uptime": None, "raw": out}
|
||||
parts = out.split("|")
|
||||
|
||||
+753
-21
@@ -13,8 +13,13 @@ const state = {
|
||||
swap_progress: 0, // 0–1
|
||||
services: {},
|
||||
service_action_in_flight: null, // e.g. "parakeet:restart"
|
||||
hardware: {},
|
||||
config: {},
|
||||
configured: true,
|
||||
timer_handle: null,
|
||||
deep_health: {},
|
||||
disk_status: {}, // keyed by model key: { on_disk, total_bytes, per_host }
|
||||
disk_status_loaded: false,
|
||||
};
|
||||
|
||||
const el = (sel) => document.querySelector(sel);
|
||||
@@ -54,23 +59,52 @@ function renderCards() {
|
||||
? `<div class="desc">${escapeHtml(m.description)}</div>`
|
||||
: '';
|
||||
const customPill = m.custom ? `<span class="tag custom-pill">custom</span>` : '';
|
||||
// Disk-presence pill + trash button. Until /api/models/disk-status comes back,
|
||||
// we don't know — render a neutral placeholder.
|
||||
const disk = state.disk_status[key];
|
||||
let diskPill = '';
|
||||
if (state.disk_status_loaded) {
|
||||
if (disk && disk.on_disk) {
|
||||
const gb = (disk.total_bytes / 1e9);
|
||||
diskPill = `<span class="tag on-disk" title="Weights present on disk">on disk · ${gb.toFixed(1)} GB</span>`;
|
||||
} else {
|
||||
diskPill = `<span class="tag not-on-disk" title="Weights not downloaded">not downloaded</span>`;
|
||||
}
|
||||
}
|
||||
// Trash button — hidden if not on disk; disabled (with tooltip) if currently loaded.
|
||||
let trashBtn = '';
|
||||
if (state.disk_status_loaded && disk && disk.on_disk) {
|
||||
const disabled = isActive || isSwapping;
|
||||
const tip = isActive
|
||||
? 'Currently loaded — switch to another model first'
|
||||
: isSwapping
|
||||
? 'A swap is in progress'
|
||||
: 'Delete weights from disk';
|
||||
trashBtn = `<button class="icon-btn danger" data-disk-del-key="${key}" title="${escapeHtml(tip)}" aria-label="Delete from disk" ${disabled ? 'disabled' : ''}>${trashIcon}</button>`;
|
||||
}
|
||||
card.innerHTML = `
|
||||
<div class="name">${escapeHtml(m.display_name)}</div>
|
||||
<div class="meta">
|
||||
<span class="tag mode-${m.mode}">${m.mode}</span>
|
||||
<span class="tag">${m.size_gb} GB</span>
|
||||
${customPill}
|
||||
${diskPill}
|
||||
${(m.capabilities || []).map(c => `<span class="tag cap">${escapeHtml(c)}</span>`).join('')}
|
||||
</div>
|
||||
${desc}
|
||||
<div class="muted small repo">${escapeHtml(m.repo)}</div>
|
||||
<div class="muted small repo">
|
||||
<a href="https://huggingface.co/${encodeURIComponent(m.repo)}" target="_blank" rel="noopener" title="View on Hugging Face">${escapeHtml(m.repo)} <span class="hf-icon">↗</span></a>
|
||||
</div>
|
||||
<div class="spacer"></div>
|
||||
<div class="card-actions">
|
||||
<button class="btn ${isActive ? '' : 'primary'}" data-swap-key="${key}" ${isActive || isSwapping ? 'disabled' : ''}>
|
||||
${isActive ? 'Current' : 'Switch to this'}
|
||||
</button>
|
||||
<button class="btn test-btn" data-test-key="${key}" title="Pre-flight check the launch command without starting the engine">Test</button>
|
||||
<button class="btn adv-btn" data-adv-key="${key}" title="Advanced settings">Advanced</button>
|
||||
${trashBtn}
|
||||
</div>
|
||||
<div class="test-result hidden" data-test-result-for="${key}"></div>
|
||||
`;
|
||||
root.appendChild(card);
|
||||
}
|
||||
@@ -80,6 +114,42 @@ function renderCards() {
|
||||
for (const btn of root.querySelectorAll('[data-adv-key]')) {
|
||||
btn.addEventListener('click', () => openAdvanced(btn.dataset.advKey));
|
||||
}
|
||||
for (const btn of root.querySelectorAll('[data-test-key]')) {
|
||||
btn.addEventListener('click', () => testLaunch(btn.dataset.testKey, btn));
|
||||
}
|
||||
for (const btn of root.querySelectorAll('[data-disk-del-key]')) {
|
||||
btn.addEventListener('click', () => openDiskDeleteDialog(btn.dataset.diskDelKey));
|
||||
}
|
||||
}
|
||||
|
||||
// Inline SVG markup (14x14, stroke follows currentColor) for the trash-can glyph.
// Marked aria-hidden, so a button embedding it must supply its own accessible label.
const trashIcon = '<svg viewBox="0 0 24 24" width="14" height="14" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"><polyline points="3 6 5 6 21 6"></polyline><path d="M19 6l-1 14a2 2 0 0 1-2 2H8a2 2 0 0 1-2-2L5 6"></path><path d="M10 11v6"></path><path d="M14 11v6"></path><path d="M9 6V4a2 2 0 0 1 2-2h2a2 2 0 0 1 2 2v2"></path></svg>';
|
||||
|
||||
// Ask the backend to validate the launch command for model `key` and show the
// verdict in that card's result strip. `btn` is the clicked button; it is
// disabled and relabelled while the request is in flight, then restored.
async function testLaunch(key, btn) {
  const output = document.querySelector(`[data-test-result-for="${key}"]`);
  if (!output) return;

  const idleLabel = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Testing…';
  output.classList.remove('hidden', 'ok', 'fail');
  output.innerHTML = '<span class="muted small">Checking launch args against vLLM\'s parser…</span>';

  try {
    const verdict = await fetchJSON(`/api/swap/${encodeURIComponent(key)}/validate`, { method: 'POST' });
    if (!verdict.ok) {
      output.classList.add('fail');
      const err = escapeHtml(verdict.error || 'unknown error');
      const stage = verdict.stage ? ` <span class="muted small">(${escapeHtml(verdict.stage)})</span>` : '';
      output.innerHTML = `<span class="fail-mark">✗</span> Would fail: ${err}${stage}`;
    } else {
      output.classList.add('ok');
      output.innerHTML = `<span class="ok-mark">✓</span> Launch args parse OK. <span class="muted small">(Doesn't guarantee runtime success — only catches argparse-level issues.)</span>`;
    }
  } catch (failure) {
    // The request itself failed (network error, non-2xx, …) rather than the validation.
    output.classList.add('fail');
    output.innerHTML = `<span class="fail-mark">✗</span> Test failed: ${escapeHtml(failure.message)}`;
  } finally {
    btn.disabled = false;
    btn.textContent = idleLabel;
  }
}
|
||||
|
||||
function renderCurrent(status) {
|
||||
@@ -93,6 +163,225 @@ function renderCurrent(status) {
|
||||
c.innerHTML = `<strong>${label}</strong>`;
|
||||
}
|
||||
|
||||
// ===================== hardware dashboard =====================
|
||||
|
||||
// Format a byte count with binary (1024-based) steps and B/KB/MB/GB/TB labels.
// Values below 10 in the chosen unit keep one decimal; larger ones are rounded.
// Returns an em dash for null/undefined/NaN (anything falsy except 0).
function fmtBytes(n) {
  if (!n && n !== 0) return '—';
  const units = ['B', 'KB', 'MB', 'GB', 'TB'];
  let amount = n;
  let idx = 0;
  for (; amount >= 1024 && idx < units.length - 1; idx++) {
    amount /= 1024;
  }
  if (amount < 10) {
    return `${amount.toFixed(1)} ${units[idx]}`;
  }
  return `${Math.round(amount)} ${units[idx]}`;
}
|
||||
// Convert a MiB count to a bare number string (no unit suffix) after dividing
// by 1024: one decimal below 10, whole numbers otherwise.
// Returns null for null/undefined/NaN so callers can substitute a placeholder.
function fmtMiB(n) {
  if (!n && n !== 0) return null;
  const scaled = n / 1024;
  if (scaled < 10) {
    return scaled.toFixed(1);
  }
  return Math.round(scaled).toString();
}
|
||||
|
||||
// Build the HTML for a horizontal usage bar. The fill width is `usedPct`
// clamped to the range 2–100 (so an empty bar still shows a sliver);
// `warn` adds the "warn" class.
function bar(usedPct, warn) {
  const capped = Math.min(100, usedPct);
  const width = Math.max(2, capped);
  const warnClass = warn ? 'warn' : '';
  return `<div class="bar ${warnClass}"><span style="width:${width}%"></span></div>`;
}
|
||||
|
||||
// Refresh state.hardware (and, best-effort, state.connectivity) from the API,
// then redraw the hardware panel. Failures are logged, never thrown.
async function pollHardware() {
  try {
    const hardware = await fetchJSON('/api/hardware');
    state.hardware = hardware;
    try {
      const connectivity = await fetchJSON('/api/connectivity');
      state.connectivity = connectivity;
    } catch {
      // Connectivity is optional: keep the previous value and still render.
    }
    renderHardware();
  } catch (err) {
    console.warn('hardware poll failed', err);
  }
}
|
||||
|
||||
// Format a duration given in seconds as a compact label:
// "45s", "12m", "3h 5m" or "2d 4h". Returns '' for null/undefined.
//
// The value is rounded to the smaller unit of each form *before* it is split
// into parts, so rounding can never yield "60s", "60m", "1h 60m" or "1d 24h";
// such values roll over to the next unit instead ("1m", "1h", "2h", "2d").
function fmtDuration(sec) {
  if (sec == null) return '';

  const seconds = Math.round(sec);
  if (seconds < 60) return `${seconds}s`;

  const totalMinutes = Math.round(sec / 60);
  if (totalMinutes < 60) return `${totalMinutes}m`;

  if (totalMinutes < 24 * 60) {
    const h = Math.floor(totalMinutes / 60);
    const m = totalMinutes % 60;
    return m ? `${h}h ${m}m` : `${h}h`;
  }

  const totalHours = Math.round(sec / 3600);
  const d = Math.floor(totalHours / 24);
  const h = totalHours % 24;
  return h ? `${d}d ${h}h` : `${d}d`;
}
|
||||
|
||||
// Fill and open the connectivity dialog from state.connectivity.
// Events are grouped by subject (a host or a service), hosts listed first,
// and each group shows a one-line summary plus its 30 most recent events,
// newest first. With no events at all, an explanatory placeholder is shown.
function openConnectivityDialog() {
  const dlg = el('#connectivity-dialog');
  const content = el('#connectivity-content');
  const c = state.connectivity || {};
  const events = c.events || [];
  if (events.length === 0) {
    content.innerHTML = '<div class="muted small">No events recorded yet. Once a Spark or service goes down and back up (or an external app reports a failure), entries appear here.</div>';
    dlg.showModal();
    return;
  }

  // Subject names can come from external reports, so group in a
  // null-prototype object: a subject called "constructor" or "__proto__"
  // must be an ordinary key, not a hit on something inherited.
  const bySubject = Object.create(null);
  for (const e of events) {
    const subj = e.subject || e.spark || 'unknown'; // legacy fallback
    (bySubject[subj] = bySubject[subj] || []).push(e);
  }

  // Sort subjects: hosts first, then services, alphabetical
  const hostOrder = ['spark1', 'spark2'];
  const subjects = Object.keys(bySubject).sort((a, b) => {
    const ia = hostOrder.indexOf(a);
    const ib = hostOrder.indexOf(b);
    if (ia >= 0 && ib >= 0) return ia - ib;
    if (ia >= 0) return -1;
    if (ib >= 0) return 1;
    return a.localeCompare(b);
  });

  const macs = c.macs || {};
  const html = subjects.map((subj) => {
    const evs = bySubject[subj];
    // Events without a `kind` predate reports and count as transitions.
    const transitions = evs.filter(e => (e.kind || 'transition') === 'transition');
    const reports = evs.filter(e => e.kind === 'report');
    const downs = transitions.filter(e => e.transition === 'down').length;
    const failedReports = reports.filter(e => !e.ok).length;
    // Own-property check only, for the same reason as the grouping above.
    const mac = Object.prototype.hasOwnProperty.call(macs, subj) ? macs[subj] : undefined;
    const summaryParts = [];
    if (transitions.length) summaryParts.push(`${transitions.length} probe transition${transitions.length===1?'':'s'} (${downs} down)`);
    if (reports.length) summaryParts.push(`${reports.length} app report${reports.length===1?'':'s'} (${failedReports} failed)`);
    const isHost = hostOrder.includes(subj);
    return `
      <div class="conn-spark">
        <h4>${escapeHtml(subj)}${isHost ? ' <span class="muted small">[host]</span>' : ' <span class="muted small">[service]</span>'}${mac ? ` <span class="muted small">${escapeHtml(mac)}</span>` : ''}</h4>
        <div class="conn-summary">${summaryParts.join(' · ') || 'no events'}</div>
        ${evs.slice(-30).reverse().map(e => renderConnEvent(e)).join('')}
      </div>
    `;
  }).join('');
  content.innerHTML = html;
  dlg.showModal();
}
|
||||
|
||||
// Return the HTML row for one connectivity-log event.
// `kind: 'report'` events (posted by external apps) show ok/failed, the
// reporting source, optional detail and latency. Everything else is treated
// as a probe transition and shows up/down plus how long the previous state lasted.
// Every event field that reaches the markup is HTML-escaped, because report
// events carry values supplied by whoever posted them.
function renderConnEvent(e) {
  const when = escapeHtml((e.at || '').replace('T', ' ').replace('Z', ''));
  const kind = e.kind || 'transition';
  if (kind === 'report') {
    const ok = !!e.ok;
    const source = escapeHtml(e.source || 'external');
    const detail = e.detail ? ` — ${escapeHtml(e.detail)}` : '';
    // latency_ms originates from the reporter's payload; do not trust it to be a number.
    const latency = e.latency_ms != null ? ` (${escapeHtml(String(e.latency_ms))} ms)` : '';
    return `
      <div class="conn-event ${ok ? 'up' : 'down'} report">
        <span class="when">${when}</span>
        <span class="what">${ok ? '◷ report: ok' : '◷ report: failed'} <span class="muted">from</span> ${source}${detail}</span>
        <span class="dur">${latency}</span>
      </div>
    `;
  }
  const down = e.down_seconds != null ? `was down ${fmtDuration(e.down_seconds)}` : '';
  const up = e.up_seconds != null ? `was up ${fmtDuration(e.up_seconds)}` : '';
  return `
    <div class="conn-event ${escapeHtml(String(e.transition))}">
      <span class="when">${when}</span>
      <span class="what">${e.transition === 'up' ? '↑ came back online' : '↓ dropped offline'}</span>
      <span class="dur">${down}${up}</span>
    </div>
  `;
}
|
||||
|
||||
// Ask the backend to send a Wake-on-LAN packet to the host called `name`
// and report the outcome to the user in an alert.
async function wakeSpark(name) {
  try {
    // Encode the path segment, matching the other /api/<thing>/<key>/… calls in this file.
    const r = await fetchJSON(`/api/spark/${encodeURIComponent(name)}/wake`, { method: 'POST' });
    alert(`Wake-on-LAN sent to ${name} (MAC ${r.mac}, via ${r.delivered_via}). Give it ~30 seconds to wake; the card will go green when it comes back.`);
  } catch (e) {
    alert(`Wake failed: ${e.message}`);
  }
}
|
||||
|
||||
// Redraw the hardware panel from state.hardware.
// Entries that are falsy or have `configured === false` are skipped; if none
// remain the whole panel is hidden. Each remaining host gets one card:
//   - unreachable hosts: error text, a Wake-on-LAN row (only when a MAC is
//     known from state.connectivity.macs) and manual recovery steps;
//   - reachable hosts: CPU / RAM / GPU memory / GPU util / disk usage bars.
// NOTE(review): the Wake button only carries a data-wake attribute; no click
// handler is attached in this function — presumably wired up elsewhere, confirm.
function renderHardware() {
  const panel = el('#hardware-panel');
  const grid = el('#hardware-grid');
  const hw = state.hardware || {};
  const keys = Object.keys(hw).filter(k => hw[k] && (hw[k].configured !== false));
  if (keys.length === 0) { panel.classList.add('hidden'); return; }
  panel.classList.remove('hidden');
  // Cards are rebuilt from scratch on every call.
  grid.innerHTML = '';
  for (const key of keys) {
    const s = hw[key];
    const card = document.createElement('div');
    if (!s.reachable) {
      card.className = 'hw-card unreachable';
      const mac = state.connectivity?.macs?.[key];
      const wolRow = mac
        ? `<div class="wol-row">
            <span class="mac-display">${escapeHtml(mac)}</span>
            <span class="spacer"></span>
            <button class="btn" data-wake="${escapeHtml(key)}">Wake (WoL)</button>
          </div>`
        : `<div class="muted small">MAC not yet known — once it's been up once with this dashboard installed, "Wake" will appear here.</div>`;
      card.innerHTML = `
        <div class="head">
          <span class="name">${escapeHtml(key)}</span>
          <span class="meta">unreachable</span>
        </div>
        <div class="muted small">${escapeHtml(s.host || '')} — ${escapeHtml(s.error || 'no response')}</div>
        ${wolRow}
        <div class="muted small" style="line-height:1.5">
          If Wake-on-LAN doesn't bring it back, manual steps:
          <ol style="margin: 6px 0 0 18px; padding: 0;">
            <li>Verify it's powered on (check the front LED).</li>
            <li>Ping it from another LAN device.</li>
            <li>Power-cycle it physically.</li>
            <li>If it boots, this card will go green again automatically.</li>
          </ol>
        </div>
      `;
      grid.appendChild(card);
      continue;
    }
    // Percentages default to 0 when either operand is missing or zero.
    const ramPct = s.ram_used_bytes && s.ram_total_bytes ? (s.ram_used_bytes / s.ram_total_bytes) * 100 : 0;
    const diskPct = s.disk_used_bytes && s.disk_total_bytes ? (s.disk_used_bytes / s.disk_total_bytes) * 100 : 0;
    // CPU bar is the first load figure divided by core count, capped at 100%.
    const loadPct = (s.load && s.cores) ? Math.min(100, (s.load[0] / s.cores) * 100) : 0;
    // GPU memory: on unified-memory systems (DGX Spark) total is N/A, so use system RAM as the pool.
    const gpuMemTotalMiB = s.gpu_mem_total_mib || (s.gpu_unified_memory ? (s.ram_total_bytes / (1024 * 1024)) : null);
    const gpuMemUsedMiB = s.gpu_mem_used_mib ?? null;
    const gpuMemPct = (gpuMemTotalMiB && gpuMemUsedMiB != null) ? (gpuMemUsedMiB / gpuMemTotalMiB) * 100 : 0;
    const gpuMemNote = s.gpu_unified_memory ? ' <span class="muted">(unified)</span>' : '';
    // Optional temperature / power readings appended after the GPU util figure.
    const gpuExtras = [];
    if (s.gpu_temp_c != null) gpuExtras.push(`${s.gpu_temp_c}°C`);
    if (s.gpu_power_w != null) gpuExtras.push(`${s.gpu_power_w.toFixed(0)}W`);
    const gpuExtrasStr = gpuExtras.length ? ` · ${gpuExtras.join(' · ')}` : '';
    card.className = 'hw-card';
    card.innerHTML = `
      <div class="head">
        <span class="name">${escapeHtml(s.hostname || key)}</span>
        <span class="meta">${escapeHtml(key)} · ${escapeHtml(s.gpu_name || '')} · ${escapeHtml(s.uptime || '')}</span>
      </div>
      <div class="hw-metric">
        <span class="label">CPU</span>
        ${bar(loadPct, loadPct > 80)}
        <span class="val">${s.load ? s.load[0].toFixed(2) : '—'} / ${s.cores || '?'} cores</span>
      </div>
      <div class="hw-metric">
        <span class="label">RAM</span>
        ${bar(ramPct, ramPct > 85)}
        <span class="val">${fmtBytes(s.ram_used_bytes)} / ${fmtBytes(s.ram_total_bytes)}</span>
      </div>
      <div class="hw-metric">
        <span class="label">GPU mem${gpuMemNote}</span>
        ${bar(gpuMemPct, gpuMemPct > 90)}
        <span class="val">${fmtMiB(gpuMemUsedMiB) || '—'} / ${fmtMiB(gpuMemTotalMiB) || '?'} GB</span>
      </div>
      <div class="hw-metric">
        <span class="label">GPU util</span>
        ${bar(s.gpu_util_pct || 0, (s.gpu_util_pct || 0) > 90)}
        <span class="val">${s.gpu_util_pct ?? 0}%${gpuExtrasStr}</span>
      </div>
      <div class="hw-metric">
        <span class="label">Disk</span>
        ${bar(diskPct, diskPct > 85)}
        <span class="val">${fmtBytes(s.disk_used_bytes)} / ${fmtBytes(s.disk_total_bytes)}</span>
      </div>
    `;
    grid.appendChild(card);
  }
}
|
||||
|
||||
// ===================== service classification =====================
|
||||
|
||||
function classifyService(s) {
|
||||
// returns one of: running | unhealthy | missing | unconfigured | starting
|
||||
if (!s.host) return 'unconfigured';
|
||||
@@ -143,15 +432,49 @@ async function renderServices() {
|
||||
if (action === 'stop' && cls !== 'running' && cls !== 'starting' && cls !== 'unhealthy') return true;
|
||||
return false;
|
||||
};
|
||||
const copyIcon = `<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect x="9" y="9" width="13" height="13" rx="2"/><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/></svg>`;
|
||||
const hostStr = s.host ? `${s.host}:${s.port}` : '';
|
||||
const hostRow = s.host
|
||||
? `<div class="row"><span class="k">Host</span><span class="v">${escapeHtml(s.host)}:${s.port}</span></div>`
|
||||
? `<div class="row"><span class="k">Host</span><span class="v copyable" data-copy-self title="Click to copy">${escapeHtml(hostStr)}</span><button class="icon-btn" data-copy-text="${escapeHtml(hostStr)}" title="Copy host" aria-label="Copy">${copyIcon}</button></div>`
|
||||
: `<div class="row"><span class="k">Host</span><span class="v muted-v">not configured</span></div>`;
|
||||
const urlRow = s.base_url
|
||||
? `<div class="row"><span class="k">URL</span><span class="v copyable" data-copy-self title="Click to copy">${escapeHtml(s.base_url)}</span><button class="icon-btn" data-copy-text="${escapeHtml(s.base_url)}" title="Copy URL" aria-label="Copy">${copyIcon}</button></div>`
|
||||
: '';
|
||||
const modelRow = s.model
|
||||
? `<div class="row"><span class="k">Model</span><span class="v">${escapeHtml(s.model)}</span></div>`
|
||||
? `<div class="row"><span class="k">Model</span><span class="v copyable" data-copy-self title="Click to copy">${escapeHtml(s.model)}</span><button class="icon-btn" data-copy-text="${escapeHtml(s.model)}" title="Copy model" aria-label="Copy">${copyIcon}</button></div>`
|
||||
: '';
|
||||
const restartsRow = s.restart_count != null && s.restart_count > 1
|
||||
? `<div class="row"><span class="k">Restarts</span><span class="v">${s.restart_count}</span></div>`
|
||||
: '';
|
||||
const dh = state.deep_health?.[name];
|
||||
let deepRow = '';
|
||||
if (dh && dh.last) {
|
||||
const last = dh.last;
|
||||
const when = (last.at || '').slice(11, 19); // HH:MM:SS
|
||||
const verdict = last.ok
|
||||
? `<span class="dh-ok">deep check ok</span>`
|
||||
: `<span class="dh-fail">deep check FAILED</span>`;
|
||||
const lat = last.latency_ms != null ? ` <span class="muted">${last.latency_ms} ms</span>` : '';
|
||||
const restarts = dh.auto_restarts_window > 0
|
||||
? ` <span class="muted">· ${dh.auto_restarts_window} auto-restart${dh.auto_restarts_window === 1 ? '' : 's'} in 30 min</span>`
|
||||
: '';
|
||||
deepRow = `
|
||||
<div class="row deep-row">
|
||||
<span class="k">Deep</span>
|
||||
<span class="v deep-v">${verdict} <span class="muted small">${escapeHtml(when)}</span>${lat}${restarts}</span>
|
||||
<button class="icon-btn dh-run-btn" data-dh-run="${escapeHtml(name)}" title="Run deep check now">↻</button>
|
||||
</div>
|
||||
${last.ok ? '' : `<div class="deep-error muted small">${escapeHtml((last.error || last.note || '').slice(0, 200))}</div>`}
|
||||
`;
|
||||
} else if (dh) {
|
||||
deepRow = `
|
||||
<div class="row deep-row">
|
||||
<span class="k">Deep</span>
|
||||
<span class="v muted-v">no probe yet</span>
|
||||
<button class="icon-btn dh-run-btn" data-dh-run="${escapeHtml(name)}" title="Run deep check now">↻</button>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
card.innerHTML = `
|
||||
<div class="head">
|
||||
<span class="name">${escapeHtml(name)}</span>
|
||||
@@ -159,8 +482,10 @@ async function renderServices() {
|
||||
<span class="status">${statusLabel(cls)}</span>
|
||||
</div>
|
||||
${hostRow}
|
||||
${urlRow}
|
||||
${modelRow}
|
||||
${restartsRow}
|
||||
${deepRow}
|
||||
<div class="service-actions">
|
||||
<button class="btn" data-svc-action="${name}:start" ${disable('start') ? 'disabled' : ''}>Start</button>
|
||||
<button class="btn" data-svc-action="${name}:restart" ${disable('restart') ? 'disabled' : ''}>Restart</button>
|
||||
@@ -172,6 +497,25 @@ async function renderServices() {
|
||||
for (const btn of grid.querySelectorAll('.btn[data-svc-action]')) {
|
||||
btn.addEventListener('click', () => onServiceAction(btn.dataset.svcAction));
|
||||
}
|
||||
for (const btn of grid.querySelectorAll('[data-dh-run]')) {
|
||||
btn.addEventListener('click', () => onDeepHealthRun(btn.dataset.dhRun, btn));
|
||||
}
|
||||
}
|
||||
|
||||
// Trigger an on-demand deep health check for service `name`, then reload the
// deep-health summary and redraw the service cards. `btn` is the clicked
// button; it shows "…" and is disabled while the request runs.
async function onDeepHealthRun(name, btn) {
  const idleLabel = btn.textContent;
  btn.disabled = true;
  btn.textContent = '…';

  const runUrl = `/api/deep-health/${encodeURIComponent(name)}/run`;
  try {
    await fetchJSON(runUrl, { method: 'POST' });
  } catch (err) {
    console.warn('deep-health run failed', err);
  } finally {
    // Refresh regardless of the outcome so the card reflects the latest probe.
    try {
      const summary = await fetchJSON('/api/deep-health');
      state.deep_health = summary;
    } catch {
      // Best-effort: keep the previous summary.
    }
    btn.textContent = idleLabel;
    btn.disabled = false;
    renderServices();
  }
}
|
||||
|
||||
async function onServiceAction(key) {
|
||||
@@ -212,31 +556,50 @@ function renderEndpoint(status) {
|
||||
el('#ep-curl-snippet').textContent = snippet;
|
||||
}
|
||||
|
||||
function setupCopyButtons() {
|
||||
document.body.addEventListener('click', async (e) => {
|
||||
const btn = e.target.closest('.copy-btn');
|
||||
if (!btn) return;
|
||||
const targetSel = btn.dataset.copy;
|
||||
if (!targetSel) return;
|
||||
const target = el(targetSel);
|
||||
if (!target) return;
|
||||
const text = target.textContent;
|
||||
async function copyText(text, indicatorEl) {
|
||||
try {
|
||||
await navigator.clipboard.writeText(text);
|
||||
const original = btn.textContent;
|
||||
btn.classList.add('copied');
|
||||
btn.textContent = 'Copied';
|
||||
setTimeout(() => {
|
||||
btn.classList.remove('copied');
|
||||
btn.textContent = original;
|
||||
}, 1400);
|
||||
if (indicatorEl) {
|
||||
indicatorEl.classList.add('copied');
|
||||
setTimeout(() => indicatorEl.classList.remove('copied'), 1200);
|
||||
}
|
||||
return true;
|
||||
} catch {
|
||||
// Clipboard API may fail over plain HTTP; fall back to selection
|
||||
// Plain HTTP fallback: select the text so the user can ⌘C
|
||||
if (indicatorEl) {
|
||||
const range = document.createRange();
|
||||
range.selectNode(target);
|
||||
range.selectNode(indicatorEl);
|
||||
window.getSelection().removeAllRanges();
|
||||
window.getSelection().addRange(range);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Install one delegated click handler on <body> that serves three kinds of
// copy affordance, checked in this order:
//   1. [data-copy-text]  — copy the literal attribute value;
//   2. [data-copy]       — copy the text of the element matched by the selector
//                          stored in the attribute;
//   3. [data-copy-self]  — copy the clicked element's own text.
function setupCopyButtons() {
  document.body.addEventListener('click', async (e) => {
    // Inline icon copy with literal text (used for dynamically-rendered service rows)
    const litBtn = e.target.closest('[data-copy-text]');
    if (litBtn) {
      await copyText(litBtn.dataset.copyText, litBtn);
      return;
    }
    // Copy buttons (with svg icon) referenced by data-copy="selector"
    const btn = e.target.closest('[data-copy]');
    if (btn) {
      const target = el(btn.dataset.copy);
      if (target) {
        const copied = await copyText(target.textContent, btn);
        // Only flash the source text when the clipboard write actually
        // succeeded; otherwise the highlight would claim a copy that never happened.
        if (copied) {
          target.classList.add('copied');
          setTimeout(() => target.classList.remove('copied'), 1200);
        }
      }
      return;
    }
    // Self-copy: clicking the text itself
    const selfCopy = e.target.closest('[data-copy-self]');
    if (selfCopy) {
      await copyText(selfCopy.textContent, selfCopy);
    }
  });
}
|
||||
|
||||
@@ -380,9 +743,14 @@ async function pollStatus() {
|
||||
renderCurrent(status);
|
||||
renderEndpoint(status);
|
||||
renderHealth(status);
|
||||
// If models hasn't loaded yet (init may have hit a transient proxy timeout), retry.
|
||||
if (!state.models || Object.keys(state.models).length === 0) {
|
||||
try { await loadModels(); } catch {}
|
||||
}
|
||||
// Refresh services state lazily — every 5s poll triggers this too.
|
||||
try {
|
||||
state.services = await fetchJSON('/api/services');
|
||||
try { state.deep_health = await fetchJSON('/api/deep-health'); } catch {}
|
||||
renderServices();
|
||||
} catch {}
|
||||
if (status.current_swap_job && status.current_swap_job !== state.swap_job_id) {
|
||||
@@ -403,6 +771,78 @@ async function loadModels() {
|
||||
state.models = data.models || {};
|
||||
}
|
||||
|
||||
// Fetch which catalog models have weights on disk and, when the response
// carries a `models` map, store it in state and redraw the cards.
// Best-effort: any failure is logged and otherwise ignored.
async function loadDiskStatus() {
  // Probes each catalog model's HF cache over SSH; takes a beat.
  try {
    const resp = await fetchJSON('/api/models/disk-status');
    if (!resp || !resp.models) return;
    state.disk_status = resp.models;
    state.disk_status_loaded = true;
    renderCards();
  } catch (err) {
    // Stay quiet in the UI — the disk pills simply don't render.
    console.warn('disk-status probe failed:', err.message);
  }
}
|
||||
|
||||
// Format a byte count with decimal (1000-based) steps and one decimal place:
// "1.5 KB", "2.5 GB". Non-finite, non-numeric, zero or negative input gives "0 B".
function fmtBytesShort(n) {
  if (!Number.isFinite(n) || n <= 0) return '0 B';
  const scales = [
    [1e9, 'GB'],
    [1e6, 'MB'],
    [1e3, 'KB'],
  ];
  for (const [divisor, unit] of scales) {
    if (n >= divisor) return `${(n / divisor).toFixed(1)} ${unit}`;
  }
  return `${n} B`;
}
|
||||
|
||||
// Open the "delete weights from disk" confirmation dialog for model `key`.
// Does nothing unless the model is known and recorded as on disk. The dialog
// lists the hosts that hold a copy; confirming issues the DELETE request,
// and a failure is shown inside the dialog, which stays open.
function openDiskDeleteDialog(key) {
  const model = state.models[key];
  const disk = state.disk_status[key];
  if (!model || !disk || !disk.on_disk) return;

  const dialog = el('#disk-delete-dialog');
  el('#dd-summary').innerHTML = `Free <strong>${fmtBytesShort(disk.total_bytes)}</strong> by removing <strong>${escapeHtml(model.display_name)}</strong> (<code>${escapeHtml(model.repo)}</code>) from disk.`;

  // One list item per host that actually holds the weights.
  const hostList = el('#dd-hosts');
  hostList.innerHTML = '';
  for (const entry of (disk.per_host || [])) {
    if (entry.on_disk) {
      const item = document.createElement('li');
      item.innerHTML = `<code>${escapeHtml(entry.host)}</code> — ${fmtBytesShort(entry.size_bytes)}`;
      hostList.appendChild(item);
    }
  }

  // Clear any error left over from a previous attempt.
  const errorBox = el('#dd-error');
  errorBox.classList.add('hidden');
  errorBox.textContent = '';

  const confirmBtn = el('#dd-confirm');
  const cancelBtn = el('#dd-cancel');

  // Assigning onclick (rather than addEventListener) replaces the handler
  // bound by any earlier opening of the dialog.
  cancelBtn.onclick = () => dialog.close();
  confirmBtn.onclick = async () => {
    confirmBtn.disabled = true;
    cancelBtn.disabled = true;
    confirmBtn.textContent = 'Deleting…';
    try {
      const result = await fetchJSON(`/api/models/${encodeURIComponent(key)}/disk`, { method: 'DELETE' });
      dialog.close();
      // Optimistically drop the local record for this key, then refresh.
      delete state.disk_status[key];
      renderCards();
      // Re-probe right away so the card shows the "not downloaded" pill.
      loadDiskStatus();
      const freed = result && typeof result.bytes_freed === 'number' ? fmtBytesShort(result.bytes_freed) : '';
      console.log(`Deleted ${model.display_name} from disk${freed ? ` — freed ${freed}` : ''}.`);
    } catch (err) {
      errorBox.textContent = err.message || 'Delete failed';
      errorBox.classList.remove('hidden');
    } finally {
      confirmBtn.disabled = false;
      cancelBtn.disabled = false;
      confirmBtn.textContent = 'Delete from disk';
    }
  };

  dialog.showModal();
}
|
||||
|
||||
async function triggerSwap(modelKey) {
|
||||
if (state.swap_job_id) return;
|
||||
try {
|
||||
@@ -518,6 +958,18 @@ function openDownloadForm() {
|
||||
el('#download-form').classList.remove('hidden');
|
||||
el('#download-progress').classList.add('hidden');
|
||||
el('#dl-repo').focus();
|
||||
updateDlHfLink();
|
||||
}
|
||||
|
||||
// Show or hide the "view on Hugging Face" link next to the repo input.
// The link is shown only once the input looks like "org/name" (contains a slash).
function updateDlHfLink() {
  const repo = el('#dl-repo').value.trim();
  const link = el('#dl-hf-link');
  if (repo.includes('/')) {
    // Encode each path segment separately: encoding the whole string would
    // turn the "/" between org and name into "%2F" and break the URL path.
    const repoPath = repo.split('/').map((part) => encodeURIComponent(part)).join('/');
    link.href = `https://huggingface.co/${repoPath}`;
    link.classList.remove('hidden');
  } else {
    link.classList.add('hidden');
  }
}
|
||||
|
||||
function closeDownloadPanel() {
|
||||
@@ -647,6 +1099,47 @@ function handleDownloadDone(d) {
|
||||
|
||||
// ===================== Advanced / Add to catalog =====================
|
||||
|
||||
// Return the GPU memory pool size in GB relevant to `modelMode`, or null when
// state.hardware has no usable figure.
//   - 'cluster' with both hosts known: the smaller of the two (the bottleneck);
//   - otherwise: spark1's figure, falling back to spark2's.
// When a host reports no gpu_mem_total_mib but is flagged gpu_unified_memory,
// its system RAM is used as the pool — the same fallback the hardware panel applies.
function gpuTotalGB(modelMode) {
  const poolGB = (host) => {
    if (!host) return null;
    const unifiedMiB = host.gpu_unified_memory && host.ram_total_bytes
      ? host.ram_total_bytes / (1024 * 1024)
      : null;
    const totalMiB = host.gpu_mem_total_mib || unifiedMiB;
    return totalMiB ? totalMiB / 1024 : null;
  };
  // Solo uses Spark 1's GPU only. Cluster shares across both — but loading is per-Spark.
  const g1 = poolGB(state.hardware?.spark1);
  const g2 = poolGB(state.hardware?.spark2);
  if (modelMode === 'cluster' && g1 && g2) return Math.min(g1, g2); // bottleneck
  return g1 || g2 || null;
}
|
||||
|
||||
/**
 * Build the one-line explanatory hint shown under an "Advanced settings" knob.
 *
 * @param {string} field  Knob identifier: 'gpu_memory_utilization',
 *                        'max_model_len', 'fastsafetensors', 'prefix_caching'
 *                        or 'kv_cache_dtype'.
 * @param {*} value       Current knob value (number, boolean or string,
 *                        depending on the knob).
 * @param {string} mode   Model run mode, forwarded to gpuTotalGB().
 * @returns {string} Hint text, or '' when there is nothing useful to say
 *                   (unknown field, missing hardware data, empty/NaN value).
 */
function knobContextHint(field, value, mode) {
  switch (field) {
    case 'gpu_memory_utilization': {
      const gb = gpuTotalGB(mode);
      const fraction = Number(value);
      // An empty or half-typed input parses to NaN; show no hint rather than
      // "~NaN GB".
      if (!gb || !Number.isFinite(fraction)) return '';
      const used = (fraction * gb).toFixed(0);
      const free = (gb - fraction * gb).toFixed(0);
      return `~${used} GB allocated to model + KV cache · ~${free} GB left for OS, buffers, other GPU workloads.`;
    }
    case 'max_model_len': {
      if (!value) return '';
      const pages = Math.round(value / 350); // ~350 tokens per page
      return `~${pages.toLocaleString()} pages of text (very rough). Larger context = more GPU memory reserved for KV cache.`;
    }
    case 'fastsafetensors':
      return value ? 'Faster cold-start weight loading.' : 'Standard safetensors loading.';
    case 'prefix_caching':
      return value
        ? 'Reuses GPU state for repeated prompt prefixes (e.g. long system prompts).'
        : 'Off — every request re-processes the full prompt.';
    case 'kv_cache_dtype':
      return value === 'fp8'
        ? 'Halves KV cache memory (fits ~2× more context). Quality cost is usually imperceptible.'
        : 'Default precision.';
    default:
      return '';
  }
}
|
||||
|
||||
/**
 * Return the hint element for knob `id` inside `rowEl`, creating and
 * appending it on first use.
 *
 * @param {Element} rowEl  Row element that hosts the knob's input.
 * @param {string} id      Short knob id stored in the hint's `data-for`.
 * @returns {Element} the existing or newly created `.knob-hint` element.
 */
function ensureKnobHint(rowEl, id) {
  const existing = rowEl.querySelector(`.knob-hint[data-for="${id}"]`);
  if (existing) return existing;

  const hint = document.createElement('div');
  hint.className = 'knob-hint muted small';
  hint.dataset.for = id;
  rowEl.appendChild(hint);
  return hint;
}
|
||||
|
||||
function openAdvanced(key) {
|
||||
const m = state.models[key];
|
||||
if (!m) return;
|
||||
@@ -659,6 +1152,23 @@ function openAdvanced(key) {
|
||||
el('#adv-fst').checked = !!k.fastsafetensors;
|
||||
el('#adv-pcache').checked = !!k.prefix_caching;
|
||||
el('#adv-fp8').checked = k.kv_cache_dtype === 'fp8';
|
||||
|
||||
// Wire up live knob hints
|
||||
const updateHints = () => {
|
||||
const mml = parseInt(el('#adv-mml').value, 10);
|
||||
const gmu = parseFloat(el('#adv-gmu').value);
|
||||
ensureKnobHint(el('#adv-mml').parentElement, 'mml').textContent = knobContextHint('max_model_len', mml, m.mode);
|
||||
ensureKnobHint(el('#adv-gmu').parentElement, 'gmu').textContent = knobContextHint('gpu_memory_utilization', gmu, m.mode);
|
||||
ensureKnobHint(el('#adv-fst').parentElement, 'fst').textContent = knobContextHint('fastsafetensors', el('#adv-fst').checked, m.mode);
|
||||
ensureKnobHint(el('#adv-pcache').parentElement, 'pcache').textContent = knobContextHint('prefix_caching', el('#adv-pcache').checked, m.mode);
|
||||
ensureKnobHint(el('#adv-fp8').parentElement, 'fp8').textContent = knobContextHint('kv_cache_dtype', el('#adv-fp8').checked ? 'fp8' : 'auto', m.mode);
|
||||
};
|
||||
updateHints();
|
||||
el('#adv-mml').oninput = updateHints;
|
||||
el('#adv-gmu').oninput = (e) => { el('#adv-gmu-out').value = parseFloat(e.target.value).toFixed(2); updateHints(); };
|
||||
el('#adv-fst').onchange = updateHints;
|
||||
el('#adv-pcache').onchange = updateHints;
|
||||
el('#adv-fp8').onchange = updateHints;
|
||||
const del = el('#adv-delete');
|
||||
del.classList.toggle('hidden', !m.custom);
|
||||
del.onclick = async () => {
|
||||
@@ -753,6 +1263,197 @@ function setupAdvancedDialog() {
|
||||
el('#adv-gmu').addEventListener('input', (e) => { el('#adv-gmu-out').value = parseFloat(e.target.value).toFixed(2); });
|
||||
}
|
||||
|
||||
// ===================== NIM installer =====================
|
||||
|
||||
// Mutable UI state for the NIM installer dialogs.
const nimState = {
  catalog: null,      // last response from /api/nim/catalog
  job_id: null,       // id of the install job currently attached to the progress dialog
  eventsource: null,  // open EventSource streaming that job's log, if any
  timer: null,        // setInterval handle driving the elapsed-time display
  started_at: null,   // epoch milliseconds the elapsed-time display counts from
};
|
||||
|
||||
/**
 * Fetch the NIM catalog and render it into the install dialog:
 * the NGC catalog link, the "API key missing" warning, and one card per
 * suggested container with a "Pick" button that pre-fills the form.
 *
 * Best effort: any failure is logged with console.warn and otherwise
 * ignored, leaving the dialog usable for a manually typed image.
 */
async function loadNimCatalog() {
  try {
    nimState.catalog = await fetchJSON('/api/nim/catalog');
    el('#nim-catalog-link').href = nimState.catalog.catalog_url;

    const warn = el('#nim-key-warn');
    if (!nimState.catalog.ngc_key_configured) {
      warn.classList.add('nim-key-warn');
      warn.innerHTML = '⚠️ NGC API key not set. Open <strong>Configure Sparks</strong> in StartOS and paste your NGC personal API key, otherwise installs will fail. <a href="https://ngc.nvidia.com/setup/personal-key" target="_blank" rel="noopener">Get a key</a>';
    } else {
      warn.classList.remove('nim-key-warn');
      warn.textContent = '';
    }

    const grid = el('#nim-suggested');
    grid.innerHTML = '';
    for (const s of nimState.catalog.suggested || []) {
      // Every interpolated value goes through escapeHtml, and optional fields
      // default to '' so a sparse catalog entry cannot inject markup or put
      // the literal text "undefined" into the form.
      const port = escapeHtml(String(s.default_port ?? ''));
      const container = escapeHtml(s.default_container || '');
      const kind = escapeHtml(s.kind || '');
      const card = document.createElement('div');
      card.className = 'nim-card';
      card.innerHTML = `
        <div class="info">
          <div class="name">${escapeHtml(s.name)} <span class="muted small">· ${escapeHtml(s.kind || 'nim')}</span></div>
          <div class="desc">${escapeHtml(s.description || '')}</div>
          <div class="img">${escapeHtml(s.image)}</div>
          <div class="links">${s.homepage ? `<a href="${escapeHtml(s.homepage)}" target="_blank" rel="noopener">View on NGC ↗</a>` : ''}</div>
        </div>
        <button type="button" class="btn primary nim-pick" data-image="${escapeHtml(s.image)}" data-container="${container}" data-port="${port}" data-kind="${kind}">Pick</button>
      `;
      grid.appendChild(card);
    }

    // "Pick" copies the card's defaults into the custom-image form fields.
    grid.querySelectorAll('.nim-pick').forEach(btn => {
      btn.addEventListener('click', () => {
        el('#nim-image').value = btn.dataset.image;
        el('#nim-container').value = btn.dataset.container;
        el('#nim-port').value = btn.dataset.port;
        el('#nim-kind').value = btn.dataset.kind || 'nim';
      });
    });
  } catch (e) { console.warn('nim catalog failed', e); }
}
|
||||
|
||||
/**
 * Open the "Install NIM" dialog.
 *
 * The catalog refresh is started but not awaited, so the dialog appears
 * immediately and the suggested cards fill in when the fetch completes.
 */
function openNimDialog() {
  loadNimCatalog();
  const dialog = el('#nim-dialog');
  dialog.showModal();
}
|
||||
|
||||
/**
 * Submit handler for the NIM install form.
 *
 * Reads the form fields, requires image, container name and port, POSTs the
 * request to /api/nim/install and, on success, closes the form dialog and
 * attaches the progress dialog to the returned job id. Failures are
 * reported with alert() and leave the form open.
 *
 * @param {Event} e  the form's submit event (default action is suppressed).
 */
async function submitNim(e) {
  e.preventDefault();

  const image = el('#nim-image').value.trim();
  const container = el('#nim-container').value.trim();
  const port = parseInt(el('#nim-port').value, 10); // NaN when the field is empty
  const host = el('#nim-host').value;
  const kind = el('#nim-kind').value;

  const complete = image && container && port;
  if (!complete) {
    alert('Image, container name, and port are required.');
    return;
  }

  const payload = { image, container, port, host, kind };
  try {
    const reply = await fetchJSON('/api/nim/install', {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify(payload),
    });
    el('#nim-dialog').close();
    attachNimProgress(reply.job_id);
  } catch (err) {
    alert('Install failed: ' + err.message);
  }
}
|
||||
|
||||
/**
 * (Re)start the elapsed-time display in the NIM progress dialog.
 *
 * @param {number} at  Epoch milliseconds the install started at. If this is
 *                     not a finite number (e.g. Date.parse() returned NaN for
 *                     a missing or malformed timestamp) the clock starts from
 *                     now instead of silently never ticking.
 */
function nimTimerStart(at) {
  nimState.started_at = Number.isFinite(at) ? at : Date.now();
  // Only one interval may drive the display at a time.
  if (nimState.timer) clearInterval(nimState.timer);

  const tick = () => {
    if (!nimState.started_at) return;
    // Clamp at zero in case the reported start time is ahead of the local clock.
    const sec = Math.max(0, Math.floor((Date.now() - nimState.started_at) / 1000));
    const m = Math.floor(sec / 60);
    const s = sec % 60;
    el('#nim-prog-elapsed').textContent = `${m}:${s.toString().padStart(2, '0')}`;
  };

  tick(); // paint immediately rather than after the first interval
  nimState.timer = setInterval(tick, 500);
}
|
||||
|
||||
/**
 * Show the NIM progress dialog for `jobId` and follow the job until it ends.
 *
 * First loads a snapshot of the job (to seed the timer, phase and log, and to
 * finish immediately if the job is already done), then subscribes to the
 * job's event stream for live log lines, phase changes and completion.
 *
 * @param {string} jobId  install job id returned by /api/nim/install.
 */
async function attachNimProgress(jobId) {
  // Close a stream left over from a previous install; otherwise it would stay
  // open and keep appending its lines to the log we are about to reuse.
  if (nimState.eventsource) {
    nimState.eventsource.close();
    nimState.eventsource = null;
  }

  nimState.job_id = jobId;
  el('#nim-prog-log').textContent = '';
  el('#nim-prog-title').textContent = 'Installing…';
  el('#nim-progress-dialog').showModal();

  try {
    const snap = await fetchJSON(`/api/nim/install/${jobId}`);
    nimTimerStart(Date.parse(snap.started_at));
    el('#nim-prog-phase').textContent = snap.phase || 'Working…';
    el('#nim-prog-log').textContent = (snap.lines || []).join('\n');
    // A non-null return code means the job already finished; nothing to stream.
    if (snap.returncode !== null) { onNimDone(snap); return; }
  } catch {
    // Snapshot unavailable: still show a running clock and rely on the stream.
    nimTimerStart(Date.now());
  }

  const es = new EventSource(`/api/nim/install/${jobId}/stream`);
  nimState.eventsource = es;

  // Default (unnamed) events carry log lines.
  es.onmessage = ev => {
    try {
      const d = JSON.parse(ev.data);
      if (d.line !== undefined) {
        const log = el('#nim-prog-log');
        log.textContent += d.line + '\n';
        log.scrollTop = log.scrollHeight; // keep the newest line in view
      }
    } catch {}
  };

  es.addEventListener('phase', ev => {
    try { el('#nim-prog-phase').textContent = JSON.parse(ev.data).phase; } catch {}
  });

  es.addEventListener('done', ev => {
    let d = {}; try { d = JSON.parse(ev.data); } catch {}
    onNimDone(d);
  });

  es.onerror = () => {
    // Close explicitly so the browser does not keep reconnecting.
    es.close();
    // Only forget the reference if it is still ours; a newer attach may have
    // replaced it in the meantime.
    if (nimState.eventsource === es) nimState.eventsource = null;
  };
}
|
||||
|
||||
/**
 * Finish the NIM progress dialog: stop streaming and the elapsed timer, show
 * the outcome, and refresh the dashboard status.
 *
 * @param {object} d  Job result; `d.state === 'failed'` selects the failure
 *                    text (with `d.returncode`), anything else is shown as
 *                    installed.
 */
function onNimDone(d) {
  const { eventsource, timer } = nimState;
  if (eventsource) {
    eventsource.close();
    nimState.eventsource = null;
  }
  if (timer) {
    clearInterval(timer);
    nimState.timer = null;
  }

  const failed = d.state === 'failed';
  el('#nim-prog-title').textContent = failed ? `Failed (rc=${d.returncode})` : 'Installed';
  el('#nim-prog-phase').textContent = failed
    ? 'Failed'
    : 'Done ✓ — service will appear when the container reports healthy.';

  pollStatus();
}
|
||||
|
||||
// ===================== Explain context (LLM commit summary) =====================
|
||||
|
||||
// The EventSource of the explanation currently streaming, if any.
let explainEventSource = null;

/**
 * Stream an explanation of the pending upstream commits from
 * /api/explain-updates into the update banner.
 *
 * Any explanation already in flight is cancelled first. Answer text replaces
 * the panel content as it grows; reasoning tokens are appended to a separate,
 * de-emphasised block. Stream errors simply close the connection.
 */
async function explainContext() {
  if (explainEventSource) {
    explainEventSource.close();
    explainEventSource = null;
  }

  const section = el('#ub-explain-section');
  const content = el('#ub-explain-content');
  section.classList.remove('hidden');
  section.open = true;
  content.innerHTML = '<span class="muted">Asking the loaded model…</span>';

  const stream = new EventSource('/api/explain-updates');
  explainEventSource = stream;

  let answer = '';
  let placeholderShown = true;

  // Find the reasoning block currently being written, creating it if needed.
  const currentReasoning = () => {
    const found = content.querySelector('.reasoning-current');
    if (found) return found;
    const node = document.createElement('div');
    node.className = 'reasoning reasoning-current';
    content.appendChild(node);
    return node;
  };

  stream.onmessage = (ev) => {
    try {
      const msg = JSON.parse(ev.data);
      if (msg.error) {
        content.innerHTML = `<span class="muted">Couldn't get explanation: ${escapeHtml(msg.error)}</span>`;
        return;
      }
      // The first real chunk replaces the "Asking…" placeholder.
      if (placeholderShown) {
        content.innerHTML = '';
        placeholderShown = false;
      }
      if (msg.content) {
        answer += msg.content;
        content.textContent = answer;
        content.scrollTop = content.scrollHeight;
        return;
      }
      if (msg.reasoning) {
        // Show reasoning tokens but de-emphasized
        currentReasoning().textContent += msg.reasoning;
      }
    } catch {}
  };

  stream.addEventListener('done', () => {
    stream.close();
    explainEventSource = null;
    // The reasoning block is complete; drop the "current" marker.
    const finished = content.querySelector('.reasoning-current');
    if (finished) finished.classList.remove('reasoning-current');
  });

  stream.onerror = () => {
    stream.close();
    explainEventSource = null;
  };
}
|
||||
|
||||
// ===================== updates (spark-vllm-docker) =====================
|
||||
|
||||
const updState = {
|
||||
@@ -792,19 +1493,23 @@ function renderUpdateBanner() {
|
||||
banner.classList.toggle('up-to-date', behind === 0 && !dirty);
|
||||
banner.classList.toggle('warn', !!dirty);
|
||||
|
||||
const explain = el('#ub-explain');
|
||||
if (dirty > 0) {
|
||||
text.textContent = `${dirty} local change${dirty === 1 ? '' : 's'} in ~/spark-vllm-docker. Resolve before updating.`;
|
||||
details.classList.add('hidden');
|
||||
apply.classList.add('hidden');
|
||||
explain.classList.add('hidden');
|
||||
} else if (behind === 0) {
|
||||
text.textContent = `spark-vllm-docker is up to date (${info.current || ''})`;
|
||||
details.classList.add('hidden');
|
||||
apply.classList.add('hidden');
|
||||
list.classList.add('hidden');
|
||||
explain.classList.add('hidden');
|
||||
} else {
|
||||
text.textContent = `${behind} commit${behind === 1 ? '' : 's'} behind upstream`;
|
||||
details.classList.remove('hidden');
|
||||
apply.classList.remove('hidden');
|
||||
explain.classList.remove('hidden');
|
||||
log.textContent = (info.log || []).join('\n') || '(no log)';
|
||||
}
|
||||
}
|
||||
@@ -893,14 +1598,41 @@ async function init() {
|
||||
list.open = !list.open;
|
||||
});
|
||||
el('#ub-apply').addEventListener('click', applyUpdate);
|
||||
el('#ub-explain').addEventListener('click', explainContext);
|
||||
el('#dl-repo').addEventListener('input', updateDlHfLink);
|
||||
el('#open-nim').addEventListener('click', openNimDialog);
|
||||
el('#nim-cancel').addEventListener('click', () => el('#nim-dialog').close());
|
||||
el('#nim-form').addEventListener('submit', submitNim);
|
||||
el('#nim-prog-close').addEventListener('click', () => el('#nim-progress-dialog').close());
|
||||
el('#open-connectivity').addEventListener('click', openConnectivityDialog);
|
||||
el('#connectivity-close').addEventListener('click', () => el('#connectivity-dialog').close());
|
||||
// Wake-on-LAN buttons live on unreachable hardware cards; delegate.
|
||||
el('#hardware-grid').addEventListener('click', (e) => {
|
||||
const btn = e.target.closest('[data-wake]');
|
||||
if (btn) wakeSpark(btn.dataset.wake);
|
||||
});
|
||||
setupCatalogDialog();
|
||||
setupAdvancedDialog();
|
||||
// Open WebUI link from /api/config
|
||||
try {
|
||||
state.config = await fetchJSON('/api/config');
|
||||
if (state.config.open_webui_url) {
|
||||
const a = el('#open-webui-link');
|
||||
a.href = state.config.open_webui_url;
|
||||
a.classList.remove('hidden');
|
||||
}
|
||||
} catch {}
|
||||
await loadModels();
|
||||
await pollStatus();
|
||||
await renderServices();
|
||||
pollHardware();
|
||||
pollUpdates();
|
||||
// Disk-status probe runs after first paint — slow over SSH and not blocking.
|
||||
loadDiskStatus();
|
||||
setInterval(pollStatus, 5000);
|
||||
setInterval(pollHardware, 8000); // every 8s
|
||||
setInterval(pollUpdates, 300000); // every 5 min
|
||||
setInterval(loadDiskStatus, 60000); // every 60s — disk state changes rarely
|
||||
}
|
||||
|
||||
init();
|
||||
|
||||
+124
-8
@@ -16,6 +16,7 @@
|
||||
<div class="current" id="current">
|
||||
<span class="muted">connecting…</span>
|
||||
</div>
|
||||
<a id="open-webui-link" class="topbar-btn hidden" href="#" target="_blank" rel="noopener" title="Open Open WebUI">Open chat ↗</a>
|
||||
</header>
|
||||
|
||||
<main>
|
||||
@@ -24,22 +25,47 @@
|
||||
<span>Run the <em>Configure Sparks</em> action in StartOS to set hostnames, then run <em>Test Connection</em>.</span>
|
||||
</section>
|
||||
|
||||
<section id="hardware-panel" class="hardware-panel hidden">
|
||||
<div class="section-header">
|
||||
<h2 class="section-title">Spark hardware</h2>
|
||||
<button id="open-connectivity" class="btn small-btn">Connectivity log</button>
|
||||
</div>
|
||||
<div id="hardware-grid" class="hardware-grid"></div>
|
||||
|
||||
<dialog id="connectivity-dialog" class="modal">
|
||||
<form method="dialog" class="modal-form">
|
||||
<h3>Spark connectivity history</h3>
|
||||
<p class="muted small">Most recent up/down transitions per Spark. Tracked since this dashboard was installed.</p>
|
||||
<div id="connectivity-content" class="connectivity-content"></div>
|
||||
<div class="modal-actions">
|
||||
<button type="button" id="connectivity-close" class="btn">Close</button>
|
||||
</div>
|
||||
</form>
|
||||
</dialog>
|
||||
</section>
|
||||
|
||||
<section id="endpoint-panel" class="endpoint-panel hidden">
|
||||
<div class="ep-title muted small">OpenAI-compatible endpoint</div>
|
||||
<div class="ep-row">
|
||||
<span class="ep-label">Base URL</span>
|
||||
<code class="ep-value" id="ep-url">—</code>
|
||||
<button class="copy-btn" data-copy="#ep-url" title="Copy base URL">Copy</button>
|
||||
<code class="ep-value copyable" id="ep-url" data-copy-self title="Click to copy">—</code>
|
||||
<button class="icon-btn" data-copy="#ep-url" title="Copy base URL" aria-label="Copy">
|
||||
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect x="9" y="9" width="13" height="13" rx="2"/><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/></svg>
|
||||
</button>
|
||||
</div>
|
||||
<div class="ep-row">
|
||||
<span class="ep-label">Model ID</span>
|
||||
<code class="ep-value" id="ep-model">—</code>
|
||||
<button class="copy-btn" data-copy="#ep-model" title="Copy model ID">Copy</button>
|
||||
<code class="ep-value copyable" id="ep-model" data-copy-self title="Click to copy">—</code>
|
||||
<button class="icon-btn" data-copy="#ep-model" title="Copy model ID" aria-label="Copy">
|
||||
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect x="9" y="9" width="13" height="13" rx="2"/><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/></svg>
|
||||
</button>
|
||||
</div>
|
||||
<details class="ep-curl">
|
||||
<summary class="muted small">curl example</summary>
|
||||
<pre id="ep-curl-snippet" class="snippet"></pre>
|
||||
<button class="copy-btn small" data-copy="#ep-curl-snippet">Copy snippet</button>
|
||||
<pre id="ep-curl-snippet" class="snippet copyable" data-copy-self title="Click to copy"></pre>
|
||||
<button class="icon-btn" data-copy="#ep-curl-snippet" title="Copy snippet" aria-label="Copy">
|
||||
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect x="9" y="9" width="13" height="13" rx="2"/><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/></svg>
|
||||
</button>
|
||||
</details>
|
||||
</section>
|
||||
|
||||
@@ -64,8 +90,66 @@
|
||||
</section>
|
||||
|
||||
<section id="services-panel" class="services hidden">
|
||||
<div class="section-header">
|
||||
<h2 class="section-title">Always-on services</h2>
|
||||
<button id="open-nim" class="btn small-btn">+ Install NIM</button>
|
||||
</div>
|
||||
<div id="services-grid" class="services-grid"></div>
|
||||
|
||||
<dialog id="nim-dialog" class="modal">
|
||||
<form method="dialog" class="modal-form" id="nim-form">
|
||||
<h3>Install a NVIDIA NIM container</h3>
|
||||
<p class="muted small" id="nim-key-warn"></p>
|
||||
<p class="muted small">Pick a curated container below or paste any image from <a href="#" id="nim-catalog-link" target="_blank" rel="noopener">the NGC NIM catalog</a>. Spark Control will <code>docker pull</code> and <code>docker run</code> it on the target Spark.</p>
|
||||
|
||||
<div id="nim-suggested" class="nim-grid"></div>
|
||||
|
||||
<fieldset class="modal-fieldset">
|
||||
<legend>Custom image</legend>
|
||||
<label class="modal-row"><span>Image (nvcr.io/...)</span><input type="text" id="nim-image" placeholder="nvcr.io/nim/nvidia/<name>:latest"></label>
|
||||
<label class="modal-row"><span>Container name</span><input type="text" id="nim-container" placeholder="my-service"></label>
|
||||
<label class="modal-row"><span>Port</span><input type="number" id="nim-port" min="1" max="65535"></label>
|
||||
<label class="modal-row"><span>Kind</span>
|
||||
<select id="nim-kind">
|
||||
<option value="nim">NIM (other)</option>
|
||||
<option value="stt">STT (speech-to-text)</option>
|
||||
<option value="tts">TTS (text-to-speech)</option>
|
||||
<option value="vision">Vision</option>
|
||||
<option value="embedding">Embedding</option>
|
||||
</select>
|
||||
</label>
|
||||
<label class="modal-row"><span>Target Spark</span>
|
||||
<select id="nim-host">
|
||||
<option value="spark2">Spark 2 (default for support services)</option>
|
||||
<option value="spark1">Spark 1 (head node)</option>
|
||||
</select>
|
||||
</label>
|
||||
</fieldset>
|
||||
|
||||
<div class="modal-actions">
|
||||
<button type="button" id="nim-cancel" class="btn">Cancel</button>
|
||||
<button type="submit" class="btn primary" id="nim-start">Install</button>
|
||||
</div>
|
||||
</form>
|
||||
</dialog>
|
||||
|
||||
<dialog id="nim-progress-dialog" class="modal">
|
||||
<form method="dialog" class="modal-form">
|
||||
<h3 id="nim-prog-title">Installing…</h3>
|
||||
<div class="phase-row">
|
||||
<div class="phase" id="nim-prog-phase">Starting…</div>
|
||||
<span class="spacer"></span>
|
||||
<span class="timer" id="nim-prog-elapsed">0:00</span>
|
||||
</div>
|
||||
<details open>
|
||||
<summary class="muted small">Log</summary>
|
||||
<pre id="nim-prog-log" class="log"></pre>
|
||||
</details>
|
||||
<div class="modal-actions">
|
||||
<button type="button" id="nim-prog-close" class="btn">Close</button>
|
||||
</div>
|
||||
</form>
|
||||
</dialog>
|
||||
</section>
|
||||
|
||||
<section id="models-section">
|
||||
@@ -104,6 +188,20 @@
|
||||
</form>
|
||||
</dialog>
|
||||
|
||||
<dialog id="disk-delete-dialog" class="modal">
|
||||
<form method="dialog" class="modal-form">
|
||||
<h3>Delete model weights from disk?</h3>
|
||||
<p id="dd-summary" class="muted small"></p>
|
||||
<ul class="muted small dd-hosts" id="dd-hosts"></ul>
|
||||
<p class="muted small">This is reversible — you can re-download from the catalog at any time. The catalog entry stays intact.</p>
|
||||
<p id="dd-error" class="muted small dd-error hidden"></p>
|
||||
<div class="modal-actions">
|
||||
<button type="button" id="dd-cancel" class="btn">Cancel</button>
|
||||
<button type="button" id="dd-confirm" class="btn danger">Delete from disk</button>
|
||||
</div>
|
||||
</form>
|
||||
</dialog>
|
||||
|
||||
<dialog id="advanced-dialog" class="modal">
|
||||
<form method="dialog" class="modal-form" id="advanced-form">
|
||||
<h3 id="adv-title">Advanced settings</h3>
|
||||
@@ -127,11 +225,20 @@
|
||||
<label class="dl-row">
|
||||
<span class="dl-label">HuggingFace repo</span>
|
||||
<input type="text" id="dl-repo" placeholder="e.g. RedHatAI/Qwen3.6-35B-A3B-NVFP4" autocomplete="off">
|
||||
<a id="dl-hf-link" class="dl-hf-link hidden" href="#" target="_blank" rel="noopener" title="Open on Hugging Face">↗</a>
|
||||
</label>
|
||||
<div class="dl-help muted small">
|
||||
<a href="https://huggingface.co/models?other=vllm" target="_blank" rel="noopener">Browse vLLM-compatible models</a>
|
||||
· NVFP4-quantized models (e.g. <code>RedHatAI/...</code>) are best for Blackwell hardware
|
||||
</div>
|
||||
<div class="dl-row">
|
||||
<span class="dl-label">Where</span>
|
||||
<label class="radio"><input type="radio" name="dl-mode" value="solo" checked> Spark 1 only (solo)</label>
|
||||
<label class="radio"><input type="radio" name="dl-mode" value="cluster"> Both Sparks (cluster, copy in parallel)</label>
|
||||
<label class="radio"><input type="radio" name="dl-mode" value="spark1" checked> Spark 1 only</label>
|
||||
<label class="radio"><input type="radio" name="dl-mode" value="spark2"> Spark 2 only</label>
|
||||
<label class="radio"><input type="radio" name="dl-mode" value="cluster"> Both Sparks (for cluster models)</label>
|
||||
</div>
|
||||
<div class="dl-help muted small">
|
||||
For <strong>solo</strong> models, download to wherever you'll run them. For <strong>cluster</strong> models (-tp 2), both Sparks need the weights — "Both" downloads to one Spark and rsyncs to the other in parallel.
|
||||
</div>
|
||||
<div class="dl-actions">
|
||||
<button id="dl-cancel" class="btn">Cancel</button>
|
||||
@@ -165,9 +272,14 @@
|
||||
</section>
|
||||
|
||||
<section id="update-banner" class="update-banner hidden">
|
||||
<div class="ub-context muted small">
|
||||
Updates to <strong><a href="https://github.com/eugr/spark-vllm-docker" target="_blank" rel="noopener">eugr/spark-vllm-docker</a></strong>
|
||||
— the upstream project that orchestrates vLLM on your Sparks (launch-cluster.sh, recipes, mods). These are <em>not</em> firmware, OS, or model updates.
|
||||
</div>
|
||||
<div class="ub-row">
|
||||
<span id="ub-text">Checking for updates…</span>
|
||||
<span class="spacer"></span>
|
||||
<button id="ub-explain" class="btn small-btn hidden">✨ Explain context</button>
|
||||
<button id="ub-details" class="btn small-btn hidden">Show details</button>
|
||||
<button id="ub-apply" class="btn small-btn primary hidden">Apply update</button>
|
||||
</div>
|
||||
@@ -175,6 +287,10 @@
|
||||
<summary class="muted small">Pending commits</summary>
|
||||
<pre id="ub-log" class="snippet"></pre>
|
||||
</details>
|
||||
<details id="ub-explain-section" class="hidden">
|
||||
<summary class="muted small">Explained by the loaded LLM</summary>
|
||||
<div id="ub-explain-content" class="explain-content"></div>
|
||||
</details>
|
||||
<div id="ub-progress" class="hidden">
|
||||
<div class="phase-row">
|
||||
<div class="phase" id="ub-phase">Applying update…</div>
|
||||
|
||||
+251
-6
@@ -45,6 +45,17 @@ body {
|
||||
.logo-dot { width: 10px; height: 10px; border-radius: 50%; background: var(--accent); box-shadow: 0 0 12px var(--accent); }
|
||||
.current { flex: 1; text-align: right; font-size: 14px; }
|
||||
.current strong { color: var(--accent); }
|
||||
.topbar-btn {
|
||||
background: var(--surface-2);
|
||||
border: 1px solid var(--border);
|
||||
color: var(--text);
|
||||
padding: 5px 10px;
|
||||
border-radius: 6px;
|
||||
font-size: 12px;
|
||||
text-decoration: none;
|
||||
transition: border-color 0.15s, background 0.15s;
|
||||
}
|
||||
.topbar-btn:hover { background: #24242c; border-color: var(--accent); color: var(--accent); }
|
||||
|
||||
main {
|
||||
max-width: 880px;
|
||||
@@ -97,7 +108,8 @@ main {
|
||||
overflow-x: auto;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.copy-btn {
|
||||
.copy-btn,
|
||||
.icon-btn {
|
||||
appearance: none;
|
||||
background: var(--surface-2);
|
||||
border: 1px solid var(--border);
|
||||
@@ -108,15 +120,27 @@ main {
|
||||
cursor: pointer;
|
||||
transition: color 0.15s, border-color 0.15s, background 0.15s;
|
||||
flex-shrink: 0;
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
}
|
||||
.copy-btn:hover { color: var(--text); border-color: #34343c; }
|
||||
.copy-btn.copied {
|
||||
.icon-btn { padding: 5px 7px; }
|
||||
.icon-btn svg { width: 14px; height: 14px; display: block; }
|
||||
.copy-btn:hover,
|
||||
.icon-btn:hover { color: var(--text); border-color: #34343c; }
|
||||
.copy-btn.copied,
|
||||
.icon-btn.copied {
|
||||
color: var(--accent);
|
||||
border-color: rgba(74, 222, 128, 0.4);
|
||||
background: rgba(74, 222, 128, 0.08);
|
||||
}
|
||||
.icon-btn.copied svg { color: var(--accent); }
|
||||
.copy-btn.small { padding: 3px 8px; font-size: 11px; }
|
||||
|
||||
.copyable { cursor: pointer; }
|
||||
.copyable:hover { outline: 1px solid rgba(96, 165, 250, 0.5); }
|
||||
.copyable.copied { outline: 1px solid var(--accent); background: rgba(74, 222, 128, 0.05); }
|
||||
|
||||
.ep-curl { margin-top: 8px; }
|
||||
.ep-curl summary { cursor: pointer; padding: 4px 0; }
|
||||
.ep-curl[open] summary { margin-bottom: 6px; }
|
||||
@@ -255,6 +279,14 @@ main {
|
||||
font: 13px ui-monospace, SFMono-Regular, "SF Mono", Menlo, monospace;
|
||||
}
|
||||
.modal-row textarea { font-family: inherit; resize: vertical; }
|
||||
.modal-row .knob-hint {
|
||||
color: var(--muted);
|
||||
font-size: 11px;
|
||||
line-height: 1.5;
|
||||
margin-top: 2px;
|
||||
padding-left: 2px;
|
||||
}
|
||||
.modal-row.inline .knob-hint { width: 100%; margin-left: 22px; margin-top: 0; }
|
||||
.modal-row input:focus, .modal-row textarea:focus, .modal-row select:focus { outline: 1px solid var(--info); border-color: var(--info); }
|
||||
.modal-row input[type='range'] { padding: 0; flex: 1; }
|
||||
.modal-fieldset {
|
||||
@@ -274,10 +306,39 @@ main {
|
||||
background: var(--surface);
|
||||
border: 1px solid rgba(96, 165, 250, 0.4);
|
||||
border-radius: var(--radius);
|
||||
padding: 10px 14px;
|
||||
padding: 12px 14px;
|
||||
margin-top: 18px;
|
||||
font-size: 13px;
|
||||
}
|
||||
.ub-context { margin-bottom: 8px; line-height: 1.5; }
|
||||
.ub-context a { color: var(--info); text-decoration: none; }
|
||||
.ub-context a:hover { text-decoration: underline; }
|
||||
.ub-context em { font-style: normal; color: var(--text); font-weight: 500; }
|
||||
|
||||
#ub-explain-section { margin-top: 8px; }
|
||||
#ub-explain-section summary { cursor: pointer; padding: 4px 0; }
|
||||
.explain-content {
|
||||
background: #08080b;
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 6px;
|
||||
padding: 12px 14px;
|
||||
margin-top: 8px;
|
||||
font-size: 13px;
|
||||
line-height: 1.6;
|
||||
color: #c7c7d1;
|
||||
white-space: pre-wrap;
|
||||
word-break: break-word;
|
||||
max-height: 320px;
|
||||
overflow: auto;
|
||||
}
|
||||
.explain-content .reasoning {
|
||||
color: var(--muted);
|
||||
font-style: italic;
|
||||
font-size: 11px;
|
||||
border-left: 2px solid var(--border);
|
||||
padding-left: 10px;
|
||||
margin: 4px 0;
|
||||
}
|
||||
.update-banner.up-to-date {
|
||||
border-color: var(--border);
|
||||
color: var(--muted);
|
||||
@@ -289,6 +350,90 @@ main {
|
||||
#ub-list summary { cursor: pointer; padding: 4px 0; }
|
||||
#ub-progress { margin-top: 10px; }
|
||||
|
||||
/* ===== Hardware dashboard ===== */
|
||||
|
||||
.hardware-grid {
|
||||
display: grid;
|
||||
gap: 14px;
|
||||
grid-template-columns: repeat(auto-fill, minmax(320px, 1fr));
|
||||
}
|
||||
.hw-card {
|
||||
background: var(--surface);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: var(--radius);
|
||||
padding: 14px 16px;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 8px;
|
||||
}
|
||||
.hw-card .head {
|
||||
display: flex;
|
||||
align-items: baseline;
|
||||
gap: 8px;
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
.hw-card .head .name { font-weight: 600; font-size: 15px; }
|
||||
.hw-card .head .meta { color: var(--muted); font-size: 12px; margin-left: auto; }
|
||||
.hw-card.unreachable { border-color: rgba(239, 68, 68, 0.4); }
|
||||
.hw-card.unreachable .name { color: var(--error); }
|
||||
.hw-card.unreachable ol { color: var(--muted); }
|
||||
.hw-card .wol-row {
|
||||
margin-top: 8px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
font-size: 12px;
|
||||
color: var(--muted);
|
||||
}
|
||||
.hw-card .wol-row .btn { padding: 5px 10px; font-size: 12px; }
|
||||
.hw-card .mac-display { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; }
|
||||
|
||||
.connectivity-content {
|
||||
max-height: 360px;
|
||||
overflow-y: auto;
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 6px;
|
||||
padding: 10px;
|
||||
background: var(--surface-2);
|
||||
}
|
||||
.conn-spark { margin-bottom: 16px; }
|
||||
.conn-spark h4 { font-size: 13px; margin: 0 0 8px; color: var(--text); }
|
||||
.conn-event {
|
||||
font-size: 12px;
|
||||
display: flex;
|
||||
gap: 10px;
|
||||
padding: 4px 0;
|
||||
border-bottom: 1px solid rgba(255,255,255,0.04);
|
||||
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
|
||||
}
|
||||
.conn-event:last-child { border-bottom: 0; }
|
||||
.conn-event .when { color: var(--muted); flex-shrink: 0; }
|
||||
.conn-event .what { flex: 1; }
|
||||
.conn-event.up .what { color: var(--accent); }
|
||||
.conn-event.down .what { color: var(--error); }
|
||||
.conn-event.report .what { font-style: italic; }
|
||||
.conn-event .muted { color: var(--muted); font-style: normal; }
|
||||
.conn-event .dur { color: var(--muted); }
|
||||
.conn-summary { color: var(--muted); font-size: 11px; padding: 4px 0 10px; }
|
||||
.hw-metric { display: flex; align-items: center; gap: 10px; font-size: 12px; }
|
||||
.hw-metric .label { color: var(--muted); width: 56px; flex-shrink: 0; text-transform: uppercase; letter-spacing: 0.05em; font-size: 11px; }
|
||||
.hw-metric .bar { flex: 1; height: 8px; background: var(--surface-2); border-radius: 4px; overflow: hidden; position: relative; }
|
||||
.hw-metric .bar > span {
|
||||
display: block;
|
||||
height: 100%;
|
||||
background: linear-gradient(90deg, var(--info), var(--accent));
|
||||
border-radius: 4px;
|
||||
transition: width 0.4s ease-out;
|
||||
}
|
||||
.hw-metric .bar.warn > span { background: linear-gradient(90deg, var(--warn), var(--error)); }
|
||||
.hw-metric .val {
|
||||
font-family: ui-monospace, SFMono-Regular, "SF Mono", Menlo, monospace;
|
||||
font-size: 12px;
|
||||
color: var(--text);
|
||||
min-width: 110px;
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
/* ===== Section header (title + action button) ===== */
|
||||
|
||||
.section-header {
|
||||
@@ -341,6 +486,24 @@ main {
|
||||
min-width: 200px;
|
||||
}
|
||||
.dl-row input[type='text']:focus { outline: 1px solid var(--info); border-color: var(--info); }
|
||||
.dl-hf-link {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
background: var(--surface-2);
|
||||
border: 1px solid var(--border);
|
||||
color: var(--info);
|
||||
padding: 7px 10px;
|
||||
border-radius: 6px;
|
||||
text-decoration: none;
|
||||
font-size: 14px;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.dl-hf-link:hover { background: rgba(96, 165, 250, 0.08); border-color: var(--info); }
|
||||
.dl-help { padding-left: 122px; line-height: 1.6; }
|
||||
.dl-help a { color: var(--info); text-decoration: none; }
|
||||
.dl-help a:hover { text-decoration: underline; }
|
||||
.dl-help code { background: var(--surface-2); padding: 1px 5px; border-radius: 3px; font-size: 11px; }
|
||||
.radio { display: inline-flex; align-items: center; gap: 6px; font-size: 13px; color: var(--text); cursor: pointer; }
|
||||
.radio input { accent-color: var(--accent); }
|
||||
.dl-actions { display: flex; gap: 8px; justify-content: flex-end; margin-top: 10px; }
|
||||
@@ -353,6 +516,37 @@ main {
|
||||
#dl-log-details { margin-top: 12px; }
|
||||
#dl-log-details summary { cursor: pointer; padding: 4px 0; }
|
||||
|
||||
/* ===== NIM install dialog ===== */
|
||||
|
||||
.modal#nim-dialog,
|
||||
.modal#nim-progress-dialog { max-width: 640px; }
|
||||
.nim-grid {
|
||||
display: grid;
|
||||
gap: 8px;
|
||||
grid-template-columns: 1fr;
|
||||
max-height: 240px;
|
||||
overflow-y: auto;
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
.nim-card {
|
||||
background: var(--surface-2);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 6px;
|
||||
padding: 10px 12px;
|
||||
display: flex;
|
||||
gap: 10px;
|
||||
align-items: flex-start;
|
||||
}
|
||||
.nim-card .info { flex: 1; }
|
||||
.nim-card .name { font-weight: 600; font-size: 13px; }
|
||||
.nim-card .desc { color: var(--muted); font-size: 12px; margin-top: 4px; }
|
||||
.nim-card .img { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; color: #6b6b75; font-size: 11px; margin-top: 4px; word-break: break-all; }
|
||||
.nim-card .btn { padding: 6px 12px; font-size: 12px; flex-shrink: 0; }
|
||||
.nim-card .links { font-size: 11px; margin-top: 4px; }
|
||||
.nim-card .links a { color: var(--info); text-decoration: none; }
|
||||
.nim-card .links a:hover { text-decoration: underline; }
|
||||
.nim-key-warn { color: var(--warn); }
|
||||
|
||||
/* ===== Section titles ===== */
|
||||
|
||||
.section-title {
|
||||
@@ -409,13 +603,38 @@ main {
|
||||
|
||||
.service-card .row {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
font-size: 12px;
|
||||
color: var(--muted);
|
||||
gap: 6px;
|
||||
}
|
||||
.service-card .row .k { width: 60px; flex-shrink: 0; }
|
||||
.service-card .row .v { color: var(--text); font-family: ui-monospace, SFMono-Regular, "SF Mono", Menlo, monospace; word-break: break-all; }
|
||||
.service-card .row .v {
|
||||
color: var(--text);
|
||||
font-family: ui-monospace, SFMono-Regular, "SF Mono", Menlo, monospace;
|
||||
word-break: break-all;
|
||||
flex: 1;
|
||||
padding: 2px 4px;
|
||||
border-radius: 4px;
|
||||
}
|
||||
.service-card .row .v.muted-v { color: var(--muted); font-family: inherit; }
|
||||
.service-card .row .v.copyable:hover { outline: 1px solid rgba(96, 165, 250, 0.5); }
|
||||
.service-card .row .v.copyable.copied { outline: 1px solid var(--accent); background: rgba(74, 222, 128, 0.05); }
|
||||
.service-card .row .icon-btn { padding: 3px 6px; }
|
||||
.service-card .row .icon-btn svg { width: 12px; height: 12px; }
|
||||
.service-card .deep-row .deep-v { display: flex; align-items: center; gap: 6px; font-family: inherit; flex-wrap: wrap; }
|
||||
.service-card .dh-ok { color: var(--accent); }
|
||||
.service-card .dh-fail { color: var(--error); font-weight: 500; }
|
||||
.service-card .dh-run-btn { font-family: inherit; }
|
||||
.service-card .deep-error {
|
||||
padding: 4px 8px;
|
||||
background: rgba(239, 68, 68, 0.06);
|
||||
border-left: 2px solid var(--error);
|
||||
border-radius: 4px;
|
||||
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
|
||||
font-size: 11px;
|
||||
word-break: break-word;
|
||||
}
|
||||
|
||||
.service-actions {
|
||||
display: flex;
|
||||
@@ -460,6 +679,9 @@ main {
|
||||
font-size: 11px;
|
||||
color: #5c5c66;
|
||||
}
|
||||
.card .repo a { color: inherit; text-decoration: none; }
|
||||
.card .repo a:hover { color: var(--info); text-decoration: underline; }
|
||||
.card .repo .hf-icon { font-size: 13px; opacity: 0.7; }
|
||||
.tag {
|
||||
background: var(--surface-2);
|
||||
border: 1px solid var(--border);
|
||||
@@ -492,8 +714,31 @@ main {
|
||||
.card.active .btn { background: rgba(74, 222, 128, 0.12); color: var(--accent); border-color: rgba(74, 222, 128, 0.4); }
|
||||
.card-actions { display: flex; gap: 6px; }
|
||||
.card-actions .btn.primary { flex: 1; }
|
||||
.card .adv-btn { padding: 8px 12px; font-size: 12px; }
|
||||
.card .adv-btn,
|
||||
.card .test-btn { padding: 8px 12px; font-size: 12px; }
|
||||
.card .custom-pill { color: var(--info); border-color: rgba(96, 165, 250, 0.4); }
|
||||
.tag.on-disk { color: var(--accent); border-color: rgba(74, 222, 128, 0.4); }
|
||||
.tag.not-on-disk { color: var(--muted); border-color: var(--border); opacity: 0.7; }
|
||||
.card-actions .icon-btn.danger { color: var(--error); border-color: rgba(239, 68, 68, 0.3); margin-left: auto; }
|
||||
.card-actions .icon-btn.danger:hover:not(:disabled) { background: rgba(239, 68, 68, 0.08); border-color: var(--error); color: var(--error); }
|
||||
.card-actions .icon-btn.danger:disabled { opacity: 0.35; cursor: not-allowed; }
|
||||
.dd-hosts { padding-left: 18px; margin: 4px 0 8px; }
|
||||
.dd-hosts code { background: var(--surface-2); padding: 1px 5px; border-radius: 4px; }
|
||||
.dd-error { color: var(--error); }
|
||||
|
||||
.test-result {
|
||||
font-size: 12px;
|
||||
line-height: 1.45;
|
||||
padding: 8px 10px;
|
||||
border-radius: 5px;
|
||||
margin-top: 4px;
|
||||
border: 1px solid var(--border);
|
||||
background: var(--surface-2);
|
||||
}
|
||||
.test-result.ok { border-color: rgba(74, 222, 128, 0.4); background: rgba(74, 222, 128, 0.04); }
|
||||
.test-result.fail { border-color: rgba(239, 68, 68, 0.45); background: rgba(239, 68, 68, 0.06); word-break: break-word; }
|
||||
.test-result .ok-mark { color: var(--accent); font-weight: 600; }
|
||||
.test-result .fail-mark { color: var(--error); font-weight: 600; }
|
||||
|
||||
.footer {
|
||||
margin-top: 28px;
|
||||
|
||||
@@ -0,0 +1,137 @@
|
||||
"""Pre-flight validation of a proposed vLLM launch command.
|
||||
|
||||
Runs vLLM's own argparse layer (EngineArgs) inside the vllm_node container WITHOUT
|
||||
starting the engine. Catches:
|
||||
|
||||
* unknown flag names (typos)
|
||||
* bad types / values that argparse rejects
|
||||
* deprecated flags removed in the installed vLLM version
|
||||
|
||||
Does NOT catch (these surface only during real engine init):
|
||||
* model-architecture-specific constraints (e.g. Qwen3.6 Mamba block_size)
|
||||
* OOM at weight-loading time
|
||||
* Triton / CUDA-kernel compatibility errors
|
||||
|
||||
A pre-flight check that returns "ok" is therefore NOT a guarantee — but a
|
||||
"failed" verdict is a definitive 'don't bother with the real swap'.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import shlex
|
||||
from typing import Any
|
||||
|
||||
from .config import Settings
|
||||
from .models import Catalog, build_launch_command
|
||||
from .ssh import ssh_run
|
||||
|
||||
|
||||
# Source of the validator program that validate_launch() runs with
# `python3 -c` inside the vllm_node container.
#
# Validates the proposed args against the same combined parser vLLM uses for
# `vllm serve` (engine args + server args + frontend args), falling back to an
# EngineArgs-only parser when make_arg_parser cannot be imported.
#
# Input:  a JSON list of CLI argument strings on stdin.
# Output: one JSON line on stdout: {"ok": true, "model": ...} on success, or
#         {"ok": false, "stage": "import" | "parse", "error": ...} on failure.
#
# parser.error is replaced so argparse failures are reported in that JSON line
# instead of terminating the process with sys.exit(2).
_VALIDATOR_SCRIPT = r"""
import argparse, json, sys

# Mirror what `vllm serve` does internally: FlexibleArgumentParser (which is
# more lenient about dashes vs underscores) wrapped with make_arg_parser
# (which adds engine + server + frontend args).
parser = None
try:
    # Newer vLLM path
    from vllm.utils.argparse_utils import FlexibleArgumentParser
except Exception:
    try:
        # Older fallback
        from vllm.engine.arg_utils import FlexibleArgumentParser
    except Exception:
        FlexibleArgumentParser = argparse.ArgumentParser  # type: ignore

try:
    from vllm.entrypoints.openai.cli_args import make_arg_parser
    parser = make_arg_parser(FlexibleArgumentParser(add_help=False))
except Exception:
    pass
if parser is None:
    try:
        from vllm.engine.arg_utils import EngineArgs
        parser = FlexibleArgumentParser(add_help=False)
        EngineArgs.add_cli_args(parser)
    except Exception as e:
        print(json.dumps({"ok": False, "stage": "import", "error": f"{type(e).__name__}: {e}"}))
        sys.exit(0)

class _ArgError(Exception):
    pass

def _err(message):
    raise _ArgError(message)

parser.error = _err  # capture argparse errors instead of sys.exit(2)

try:
    raw = sys.stdin.read()
    arglist = json.loads(raw)
    ns = parser.parse_args(arglist)
    print(json.dumps({"ok": True, "model": getattr(ns, "model", None)}))
except _ArgError as e:
    print(json.dumps({"ok": False, "stage": "parse", "error": str(e)}))
except SystemExit as e:
    print(json.dumps({"ok": False, "stage": "parse", "error": f"argparse exit {e.code}"}))
except Exception as e:
    print(json.dumps({"ok": False, "stage": "parse", "error": f"{type(e).__name__}: {e}"}))
"""
|
||||
|
||||
|
||||
def _vllm_arg_list(key: str, model_def, catalog: Catalog) -> list[str]:
    """Return the `vllm serve` arguments for a model, with the repo as ``--model=``.

    The launch command produced by ``build_launch_command`` has the shape
    ``./launch-cluster.sh [--solo] -d exec vllm serve <repo> <args...>``; only
    the part after ``serve`` is kept. An empty list is returned when the
    command has no ``serve`` token or nothing follows it.
    """
    words = shlex.split(build_launch_command(key, model_def, catalog.defaults))
    try:
        serve_at = words.index("serve")
    except ValueError:
        return []

    tail = words[serve_at + 1:]
    if not tail:
        return []

    # EngineArgs expects --model=REPO rather than a positional repo.
    repo, *flags = tail
    return ["--model=" + repo, *flags]
|
||||
|
||||
|
||||
async def validate_launch(key: str, catalog: Catalog, settings: Settings) -> dict:
    """Pre-flight a model's vLLM launch arguments on Spark 1 without starting the engine.

    Builds the argument list for ``key``, sends it as JSON on stdin to the
    validator script running via ``docker exec`` in the ``vllm_node``
    container over SSH, and returns the validator's verdict.

    Returns a dict that always has ``ok`` (bool) and, on failure, ``stage``
    and ``error``. ``stage`` is one of:

    * ``lookup`` - ``key`` is not in the catalog
    * ``config`` - Spark 1 host/user are not configured
    * ``build``  - no argument list could be derived from the launch command
    * ``ssh``    - the remote command failed and produced no stdout
    * ``decode`` - the last stdout line was not a JSON object
    * anything the validator script itself reports

    Once the argument list has been built, ``cmd_args`` and ``launch_cmd``
    are attached to every result.
    """
    if key not in catalog.models:
        return {"ok": False, "stage": "lookup", "error": f"unknown model: {key}"}
    if not settings.spark1_host or not settings.spark1_user:
        return {"ok": False, "stage": "config", "error": "spark1 not configured"}

    model = catalog.models[key]
    arg_list = _vllm_arg_list(key, model, catalog)
    if not arg_list:
        return {"ok": False, "stage": "build", "error": "failed to build args list"}

    context = {
        "cmd_args": arg_list,
        "launch_cmd": build_launch_command(key, model, catalog.defaults),
    }

    # The validator reads the JSON args list from stdin. Both the payload and
    # the script are quoted with shlex.quote, and printf '%s' is used instead
    # of echo because some shells' echo rewrites backslash sequences, which
    # would corrupt the JSON escapes.
    cmd = (
        f"printf '%s' {shlex.quote(json.dumps(arg_list))} | "
        f"docker exec -i vllm_node python3 -c {shlex.quote(_VALIDATOR_SCRIPT)}"
    )

    rc, out, err = await ssh_run(settings.spark1_host, settings.spark1_user, cmd, settings, timeout=20)
    stdout = out.strip()
    if rc != 0 and not stdout:
        return {"ok": False, "stage": "ssh", "error": err.strip() or f"rc={rc}", **context}

    # Only the last stdout line is the verdict; anything before it is noise
    # printed while importing vLLM.
    last = stdout.splitlines()[-1] if stdout else ""
    try:
        result: Any = json.loads(last)
    except json.JSONDecodeError:
        result = None
    if not isinstance(result, dict):
        # Covers both unparsable output and valid JSON that is not an object.
        result = {"ok": False, "stage": "decode", "error": "validator did not return JSON", "raw": out[-500:]}

    result.update(context)
    return result
|
||||
@@ -0,0 +1,69 @@
|
||||
"""Wake-on-LAN.
|
||||
|
||||
Two delivery paths, tried in order:
|
||||
|
||||
1. SSH into the other Spark and have IT broadcast — most reliable because the
|
||||
packet originates from the same LAN subnet as the sleeping Spark.
|
||||
2. Direct UDP broadcast from this container. May or may not work depending
|
||||
on the StartOS container's network namespace.
|
||||
|
||||
The DGX Spark's NIC must have WoL enabled in firmware/OS for either path to
|
||||
actually wake the box; this module just delivers the magic packet correctly.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import re
|
||||
import socket
|
||||
|
||||
from .config import Settings
|
||||
from .ssh import ssh_run
|
||||
|
||||
|
||||
# Six hex byte pairs, each optionally separated from the previous one by ":" or "-".
_MAC_RE = re.compile(r"^[0-9a-fA-F]{2}([:-]?[0-9a-fA-F]{2}){5}$")


def normalize_mac(mac: str) -> str:
    """Return ``mac`` as lowercase, colon-separated byte pairs (``aa:bb:cc:dd:ee:ff``).

    Surrounding whitespace is ignored. Colon-separated, dash-separated, bare
    (``aabbccddeeff``) and mixed spellings are accepted and all map to the
    same canonical form.

    Raises:
        ValueError: if ``mac`` is not six hex byte pairs.
    """
    mac = mac.strip().lower()
    if not _MAC_RE.match(mac):
        raise ValueError(f"invalid MAC address: {mac!r}")
    # Drop whatever separators were supplied and rebuild the canonical form, so
    # inputs without (or with only some) separators are normalised as well.
    digits = mac.replace(":", "").replace("-", "")
    return ":".join(digits[i:i + 2] for i in range(0, len(digits), 2))
|
||||
|
||||
|
||||
def build_magic_packet(mac: str) -> bytes:
    """Build the Wake-on-LAN magic packet for ``mac``.

    The packet is six ``0xFF`` bytes followed by the six-byte MAC address
    repeated sixteen times. ``mac`` is validated by ``normalize_mac``, which
    raises ``ValueError`` for a malformed address.
    """
    hex_digits = normalize_mac(mac).replace(":", "")
    target = bytes.fromhex(hex_digits)
    sync_stream = b"\xff" * 6
    return sync_stream + target * 16
|
||||
|
||||
|
||||
def send_local_broadcast(mac: str, broadcast: str = "255.255.255.255", port: int = 9) -> None:
    """Broadcast the magic packet for ``mac`` from THIS container.

    The packet is sent over UDP to ``broadcast`` on ``port`` and then again on
    port 7 (the alternate WoL convention). May not reach the LAN in some
    topologies.
    """
    packet = build_magic_packet(mac)
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
        # Requested port first, then port 7 for safety.
        for dest_port in (port, 7):
            sock.sendto(packet, (broadcast, dest_port))
|
||||
|
||||
|
||||
async def send_via_peer(host: str, user: str, mac: str, settings: Settings) -> tuple[bool, str]:
    """Ask a different (reachable) Spark to broadcast the WoL packet for its peer.

    Runs a Python 3 one-liner on ``host`` over SSH (Python 3 is always present
    on the Sparks for vLLM), so neither wakeonlan nor etherwake has to be
    installed. The one-liner broadcasts the magic packet on UDP ports 9 and 7
    and prints ``sent``.

    Returns ``(ok, detail)``: ``ok`` is true when the remote command exited 0
    and printed ``sent``; ``detail`` is the stripped stderr, else the stripped
    stdout, else ``rc=<code>``.
    """
    mac_hex = normalize_mac(mac).replace(":", "")
    statements = [
        "import socket",
        f"m=bytes.fromhex('{mac_hex}')",
        "s=socket.socket(socket.AF_INET, socket.SOCK_DGRAM)",
        "s.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)",
        "s.sendto(b'\\xff'*6 + m*16, ('255.255.255.255', 9))",
        "s.sendto(b'\\xff'*6 + m*16, ('255.255.255.255', 7))",
        "print('sent')",
    ]
    remote_cmd = 'python3 -c "' + "; ".join(statements) + '"'

    rc, out, err = await ssh_run(host, user, remote_cmd, settings, timeout=8)
    delivered = rc == 0 and "sent" in out
    detail = err.strip() or out.strip() or f"rc={rc}"
    return delivered, detail
|
||||
@@ -30,6 +30,7 @@ models:
|
||||
- -tp=2
|
||||
- --distributed-executor-backend=ray
|
||||
- --max-model-len=32768
|
||||
- --max-num-batched-tokens=16384
|
||||
|
||||
gemma4:
|
||||
display_name: "Gemma 4 31B"
|
||||
@@ -45,6 +46,7 @@ models:
|
||||
vllm_args:
|
||||
- --gpu-memory-utilization=0.8
|
||||
- --max-model-len=32768
|
||||
- --max-num-batched-tokens=16384
|
||||
- --reasoning-parser=gemma4
|
||||
- --tool-call-parser=gemma4
|
||||
- --enable-auto-tool-choice
|
||||
@@ -66,6 +68,7 @@ models:
|
||||
vllm_args:
|
||||
- --gpu-memory-utilization=0.85
|
||||
- --max-model-len=65536
|
||||
- --max-num-batched-tokens=16384
|
||||
- --reasoning-parser=qwen3
|
||||
- --moe_backend=flashinfer_cutlass
|
||||
- --load-format=fastsafetensors
|
||||
|
||||
@@ -20,6 +20,14 @@ The trick is the `docker run --rm alpine chown` — it runs as root inside the t
|
||||
|
||||
This flag is Blackwell-specific. If vLLM in the container reports `unrecognized arguments: --moe_backend` or similar, edit `models.yaml` for `qwen36` and drop that flag. The swap UI does NOT auto-fallback in v0.1 — failure surfaces in the log stream.
|
||||
|
||||
## Qwen3.6 Mamba block-size assertion (fixed in v0.6.0:1)
|
||||
|
||||
Qwen3.6 uses a Mamba-attention hybrid that requires `--max-num-batched-tokens >= 2096`. vLLM's default is 2048, which trips `AssertionError: In Mamba cache align mode, block_size (2096) must be <= max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into the bundled qwen36 entry — matches the upstream qwen3.5-35b-a3b-fp8 recipe.
|
||||
|
||||
## Multimodal token budget for vision models (fixed in v0.8.0:1)
|
||||
|
||||
After the eugr/spark-vllm-docker update, vLLM became stricter about multimodal token budgets. Vision-capable models like Gemma 4 31B and Qwen3-VL crash at engine init with `ValueError: Chunked MM input disabled but max_tokens_per_mm_item (2496) is larger than max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into every model that has the `vision` capability. Now applied to qwen3-vl, gemma4, and qwen36 (which was already set for the Mamba issue).
|
||||
|
||||
## Two SSH paths to Spark 1 from the laptop
|
||||
|
||||
`ssh <spark-user>@<spark-1-ip>` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `<spark-1-host>.local`. Always use the `.local` hostname or `<spark-2-ip>`-style entries that ARE matched.
|
||||
|
||||
@@ -76,6 +76,24 @@ const inputSpec = InputSpec.of({
|
||||
placeholder: 'magpie-tts',
|
||||
masked: false,
|
||||
}),
|
||||
open_webui_url: Value.text({
|
||||
name: 'Open WebUI URL (optional)',
|
||||
description:
|
||||
'If you also run Open WebUI on your LAN, paste its URL here. Spark Control will then show a one-click "Open chat" button next to the current model so you can jump straight to it.',
|
||||
required: false,
|
||||
default: null,
|
||||
placeholder: 'e.g. https://open-webui.yourserver.local',
|
||||
masked: false,
|
||||
}),
|
||||
ngc_api_key: Value.text({
|
||||
name: 'NGC API key (optional)',
|
||||
description:
|
||||
'NVIDIA NGC personal API key — needed to install NIM containers (Parakeet, Magpie, etc.) from nvcr.io. Get one free at https://ngc.nvidia.com/setup/personal-key. Stored only on this Start9 server; passed to docker as the NGC_API_KEY env var when installing NIM services.',
|
||||
required: false,
|
||||
default: null,
|
||||
placeholder: 'starts with "nvapi-..."',
|
||||
masked: true,
|
||||
}),
|
||||
})
|
||||
|
||||
export const configureSparks = sdk.Action.withInput(
|
||||
|
||||
@@ -14,6 +14,10 @@ export const sparkConfigSchema = z.object({
|
||||
magpie_host: z.string().catch(''),
|
||||
magpie_user: z.string().catch(''),
|
||||
magpie_container: z.string().catch(''),
|
||||
// Optional Open WebUI deep-link
|
||||
open_webui_url: z.string().catch(''),
|
||||
// Optional NGC API key for pulling NIM containers from nvcr.io/nim/...
|
||||
ngc_api_key: z.string().catch(''),
|
||||
})
|
||||
|
||||
export type SparkConfig = z.infer<typeof sparkConfigSchema>
|
||||
|
||||
@@ -19,6 +19,8 @@ export const main = sdk.setupMain(async ({ effects }) => {
|
||||
magpie_host: '',
|
||||
magpie_user: '',
|
||||
magpie_container: '',
|
||||
open_webui_url: '',
|
||||
ngc_api_key: '',
|
||||
}
|
||||
|
||||
return sdk.Daemons.of(effects).addDaemon('primary', {
|
||||
@@ -47,6 +49,10 @@ export const main = sdk.setupMain(async ({ effects }) => {
|
||||
MAGPIE_USER: cfg.magpie_user,
|
||||
MAGPIE_CONTAINER: cfg.magpie_container,
|
||||
MODELS_OVERRIDES: '/data/models-overrides.yaml',
|
||||
SERVICES_OVERRIDES: '/data/services-overrides.yaml',
|
||||
CONNECTIVITY_LOG: '/data/connectivity.json',
|
||||
OPEN_WEBUI_URL: cfg.open_webui_url,
|
||||
NGC_API_KEY: cfg.ngc_api_key,
|
||||
BIND_PORT: String(uiPort),
|
||||
},
|
||||
},
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||
|
||||
export const v0_1_0 = VersionInfo.of({
|
||||
version: '0.2.3:0',
|
||||
version: '0.8.1:0',
|
||||
releaseNotes: {
|
||||
en_US:
|
||||
'Per-model Advanced settings + downloaded-model catalog flow. Each card now has an Advanced button: max context tokens, GPU memory %, and optimization toggles (fastsafetensors, prefix caching, FP8 KV cache). After a download finishes, a dialog appears to add the model to the catalog with those same knobs as launch defaults. Custom models can be deleted. Overrides persist in /data/models-overrides.yaml and survive package updates.',
|
||||
'v0.8.1: model weights can now be deleted from disk directly from the dashboard. Each model card shows whether the weights are present (with on-disk GB size) or not yet downloaded. When present and the model is NOT currently loaded, a small trash icon appears on the card; clicking it pops a confirmation showing how many GB will be freed and on which Spark(s), then runs `rm -rf` on the Hugging Face cache directory via SSH. Cluster-mode models are deleted from both Sparks; solo-mode from Spark 1 only. Safety rails: refuses to delete the currently-loaded model, refuses during an in-flight swap or download, and the catalog entry stays — you can always re-download. Disk status is probed once on dashboard load and re-checked every 60s.',
|
||||
},
|
||||
migrations: {
|
||||
up: async ({ effects }) => {},
|
||||
|
||||
Reference in New Issue
Block a user