v0.4.0 - NIM installer + dashboard resilience
Hotfix (was v0.3.1):
- services.py: cache the 'unreachable' verdict per (host, user) for 25 s so that a dead Spark doesn't make every /api/services call hang behind the 6 s SSH timeout
- ssh_run timeout reduced 10 -> 6s for docker_state probes
- hardware probe: shorter SSH timeout (6s), longer cache TTL for failures (25s)
- JS pollStatus retries loadModels() if state.models is empty (recovers from cold-start proxy timeout)
- Unreachable hardware card now includes troubleshooting steps (Spark Control cannot SSH into an unreachable Spark to restart it)
v0.4 NIM installer:
- nim.py module: curated SUGGESTED_NIMS list (Parakeet, Magpie, Riva) plus a NimManager that runs, in order: docker login nvcr.io; docker pull; docker run -d --gpus all -p PORT:PORT -v VOL:/opt/nim/.cache -e NGC_API_KEY -e ... --restart=unless-stopped; a chown of the volume to uid 1000; and a restart. All output is streamed via SSE, and the API key is redacted from log lines.
- custom_services.py: persists installed NIMs to /data/services-overrides.yaml so they appear in the services panel after install
- services.py: merges custom services into the panel
- /api/nim/catalog GET, /api/nim/install POST + GET/SSE
- /api/services/{name} DELETE for custom services
- UI: '+ Install NIM' button next to 'Always-on services'; modal lists curated images each with a 'Pick' button + a custom-image form; installation runs in a second dialog with phase + elapsed timer + collapsible log
- NGC API key field added to Configure Sparks (masked); injected as NGC_API_KEY env var into the container
Package: bump 0.4.0:0; main.ts adds SERVICES_OVERRIDES + NGC_API_KEY env vars
This commit is contained in:
+42
-2
@@ -5,6 +5,7 @@ machinery. We just run `docker start|stop|restart <container>` via SSH on the
|
||||
appropriate host.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, Optional
|
||||
|
||||
@@ -12,6 +13,25 @@ from .config import Settings
|
||||
from .ssh import ssh_run
|
||||
|
||||
|
||||
# Cache the "unreachable" verdict per (host, user) for a short period so that a
|
||||
# repeated docker_state call doesn't re-pay the 6 s SSH connect timeout each time.
|
||||
# Seconds a cached "unreachable" verdict stays valid (compared against time.monotonic() deltas).
_UNREACHABLE_TTL = 25.0
|
||||
# Maps (host, user) -> time.monotonic() reading taken when that pair was last marked unreachable.
_unreachable_cache: dict[tuple[str, str], float] = {}
|
||||
|
||||
|
||||
def _is_recently_unreachable(host: str, user: str) -> bool:
    """Return True if (host, user) was marked unreachable within the TTL.

    Args:
        host: Host part of the cache key.
        user: SSH user part of the cache key.

    Returns:
        True when an entry exists for ``(host, user)`` and it is younger than
        ``_UNREACHABLE_TTL`` seconds; False when there is no entry or the
        entry has aged out.
    """
    marked_at = _unreachable_cache.get((host, user))
    # Test for None explicitly rather than relying on truthiness: the
    # reference point of time.monotonic() is undefined, so 0.0 is a valid
    # timestamp and must not be mistaken for "no entry".
    if marked_at is None:
        return False
    return time.monotonic() - marked_at < _UNREACHABLE_TTL
|
||||
|
||||
|
||||
def _mark_unreachable(host: str, user: str) -> None:
    """Stamp (host, user) as unreachable as of the current monotonic time."""
    cache_key = (host, user)
    stamp = time.monotonic()
    _unreachable_cache[cache_key] = stamp
|
||||
|
||||
|
||||
def _clear_unreachable(host: str, user: str) -> None:
    """Drop any cached unreachable verdict for (host, user); no-op if absent."""
    cache_key = (host, user)
    if cache_key in _unreachable_cache:
        del _unreachable_cache[cache_key]
|
||||
|
||||
|
||||
# Names of the built-in services.
# NOTE(review): services_from_settings also registers custom services under
# arbitrary keys, which this Literal does not cover — confirm whether any
# caller still annotates with ServiceName.
ServiceName = Literal["parakeet", "magpie"]
|
||||
# Docker lifecycle verbs (`docker start|stop|restart <container>`).
ServiceAction = Literal["start", "stop", "restart"]
|
||||
|
||||
@@ -27,7 +47,8 @@ class ServiceDef:
|
||||
|
||||
|
||||
def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
|
||||
return {
|
||||
from .custom_services import load_custom_services
|
||||
out: dict[str, ServiceDef] = {
|
||||
"parakeet": ServiceDef(
|
||||
name="parakeet",
|
||||
kind="stt",
|
||||
@@ -45,19 +66,38 @@ def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
|
||||
port=s.magpie_port,
|
||||
),
|
||||
}
|
||||
for entry in load_custom_services():
|
||||
key = entry.get("key")
|
||||
if not key or key in out:
|
||||
continue
|
||||
out[key] = ServiceDef(
|
||||
name=key,
|
||||
kind=entry.get("kind", ""),
|
||||
host=entry.get("host", ""),
|
||||
user=entry.get("user", ""),
|
||||
container=entry.get("container", key),
|
||||
port=int(entry.get("port", 0)),
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
async def docker_state(settings: Settings, svc: ServiceDef) -> dict:
|
||||
"""Get docker state (running, exited, restarting, etc.) + restart count."""
|
||||
if not svc.host or not svc.user:
|
||||
return {"state": "unconfigured", "restart_count": None, "uptime": None}
|
||||
if _is_recently_unreachable(svc.host, svc.user):
|
||||
return {"state": "unreachable", "host_unreachable": True, "restart_count": None, "uptime": None}
|
||||
cmd = (
|
||||
f"docker inspect {svc.container} "
|
||||
f"--format '{{{{.State.Status}}}}|{{{{.State.StartedAt}}}}|{{{{.RestartCount}}}}|{{{{.State.ExitCode}}}}|{{{{.State.Error}}}}' "
|
||||
f"2>&1 || echo 'NOT_FOUND'"
|
||||
)
|
||||
rc, out, _ = await ssh_run(svc.host, svc.user, cmd, settings, timeout=10)
|
||||
rc, out, _ = await ssh_run(svc.host, svc.user, cmd, settings, timeout=6)
|
||||
out = out.strip()
|
||||
if rc == 124 or "timeout after" in out.lower():
|
||||
_mark_unreachable(svc.host, svc.user)
|
||||
return {"state": "unreachable", "host_unreachable": True, "restart_count": None, "uptime": None}
|
||||
_clear_unreachable(svc.host, svc.user)
|
||||
if rc != 0 or out.startswith("NOT_FOUND") or "Error" in out and "no such object" in out.lower():
|
||||
return {"state": "missing", "restart_count": None, "uptime": None, "raw": out}
|
||||
parts = out.split("|")
|
||||
|
||||
Reference in New Issue
Block a user