v0.4.0 - NIM installer + dashboard resilience

Hotfix (was v0.3.1):
- services.py: cache 'unreachable' per (host, user) for 25s so a dead Spark doesn't hang every /api/services call behind a 6s SSH timeout
- ssh_run timeout reduced 10 -> 6s for docker_state probes
- hardware probe: shorter SSH timeout (6s), longer cache TTL for failures (25s)
- JS pollStatus retries loadModels() if state.models is empty (recovers from cold-start proxy timeout)
- Unreachable hardware card now includes troubleshooting steps (Spark Control cannot SSH into an unreachable Spark to restart it)

v0.4 NIM installer:
- nim.py module: curated SUGGESTED_NIMS list (Parakeet, Magpie, Riva) + NimManager that runs docker login nvcr.io + docker pull + docker run -d --gpus all -p PORT:PORT -v VOL:/opt/nim/.cache -e NGC_API_KEY -e ... --restart=unless-stopped + chown the volume to uid 1000 + restart. Streams all output via SSE; redacts the API key from log lines.
- custom_services.py: persists installed NIMs to /data/services-overrides.yaml so they appear in the services panel after install
- services.py: merges custom services into the panel
- /api/nim/catalog GET, /api/nim/install POST + GET/SSE
- /api/services/{name} DELETE for custom services
- UI: '+ Install NIM' button next to 'Always-on services'; modal lists curated images each with a 'Pick' button + a custom-image form; installation runs in a second dialog with phase + elapsed timer + collapsible log
- NGC API key field added to Configure Sparks (masked); injected as NGC_API_KEY env var into the container

Package: bump 0.4.0:0; main.ts adds SERVICES_OVERRIDES + NGC_API_KEY env vars
This commit is contained in:
Grant
2026-05-12 12:32:29 -05:00
parent e88fdcfde4
commit 1889ab45fb
13 changed files with 690 additions and 10 deletions
+105
View File
@@ -10,10 +10,12 @@ from pydantic import BaseModel
from typing import Literal
from .config import Settings
from .custom_services import add_custom_service, delete_custom_service
from .download import DownloadManager
from .hardware import HardwareProbe
from .health import check_magpie, check_parakeet, check_vllm
from .models import load_catalog
from .nim import SUGGESTED_NIMS, CATALOG_URL, NimManager
from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs
from .services import docker_state, run_action, services_from_settings
from .ssh import ssh_run
@@ -27,6 +29,7 @@ swap_manager = SwapManager(settings, catalog)
# Module-level manager/probe instances, each constructed from the shared settings.
download_manager = DownloadManager(settings)
update_manager = UpdateManager(settings)
hardware_probe = HardwareProbe(settings)
# Used by the /api/nim/install endpoints to trigger install jobs and look them up.
nim_manager = NimManager(settings)
# NOTE(review): the API version is still "0.1.0"; confirm whether it should
# track the package version.
app = FastAPI(title="spark-control", version="0.1.0")
@@ -170,6 +173,108 @@ async def get_services() -> dict:
return out
@app.get("/api/nim/catalog")
async def get_nim_catalog() -> dict:
    """Return the NIM catalog payload.

    The response contains the catalog URL, a boolean telling whether an NGC
    API key is set in the settings (the key itself is not returned), and the
    list of suggested NIMs.
    """
    has_ngc_key = bool(settings.ngc_api_key)
    payload = {
        "catalog_url": CATALOG_URL,
        "ngc_key_configured": has_ngc_key,
        "suggested": SUGGESTED_NIMS,
    }
    return payload
# Request body accepted by POST /api/nim/install.
class NimInstallBody(BaseModel):
    # Image reference passed to the NIM manager.
    image: str
    # Container name; also used as the custom-service key when registering.
    container: str
    # Port passed to the NIM manager and stored with the custom service.
    port: int
    # Which Spark to target; resolved to a host/user pair from the settings.
    host: Literal["spark1", "spark2"] = "spark2"
    # Optional service kind; the install handler substitutes "nim" when empty.
    kind: str = ""
    # When true, the install handler also writes a custom-service entry.
    register: bool = True
@app.post("/api/nim/install")
async def post_nim_install(body: NimInstallBody) -> dict:
    """Trigger a NIM install job on the selected Spark.

    Resolves ``body.host`` to the matching host/user pair from the settings,
    asks the NIM manager to start the job and, when ``body.register`` is set,
    records the container as a custom service.

    Returns:
        A dict with the job's ``job_id``, ``image``, ``container`` and
        ``state``.

    Raises:
        HTTPException: 409 when the manager's ``RuntimeError`` message
            contains "in progress", 400 for any other ``RuntimeError``.
    """
    if body.host == "spark1":
        target_host, target_user = settings.spark1_host, settings.spark1_user
    else:
        target_host, target_user = settings.spark2_host, settings.spark2_user

    try:
        job = await nim_manager.trigger(
            image=body.image,
            container=body.container,
            port=body.port,
            host=target_host,
            user=target_user,
        )
    except RuntimeError as e:
        status = 409 if "in progress" in str(e) else 400
        # Chain the cause so the original traceback is kept in server logs.
        raise HTTPException(status, str(e)) from e

    if body.register:
        # Persist in custom services so the panel shows the NIM.
        # NOTE(review): this runs as soon as trigger() returns; if that is
        # before the install finishes, a failed install still leaves the
        # entry in place. Confirm this is intended.
        add_custom_service({
            "key": body.container,
            "kind": body.kind or "nim",
            "host": target_host,
            "user": target_user,
            "container": body.container,
            "port": body.port,
            "image": body.image,
        })

    return {
        "job_id": job.id,
        "image": job.image,
        "container": job.container,
        "state": job.state,
    }
@app.get("/api/nim/install/{job_id}")
async def get_nim_install(job_id: str) -> dict:
    """Return a snapshot of one NIM install job.

    Raises:
        HTTPException: 404 when the manager has no job with this id.
    """
    job = nim_manager.get(job_id)
    if job is None:
        raise HTTPException(404, "no such job")

    # Each response key is the name of the job attribute it mirrors.
    exposed = (
        "id",
        "image",
        "container",
        "port",
        "host",
        "state",
        "phase",
        "started_at",
        "finished_at",
        "returncode",
        "lines",
    )
    return {attr: getattr(job, attr) for attr in exposed}
@app.get("/api/nim/install/{job_id}/stream")
async def stream_nim_install(job_id: str):
    """Stream an install job's log lines and phase changes as server-sent events.

    Emits unnamed ``data:`` events for each new log line, a ``phase`` event
    whenever the job's phase differs from the last one sent, and a final
    ``done`` event once the job has a return code and every line has been
    delivered. The job is polled every 0.5 seconds.

    Raises:
        HTTPException: 404 when the manager has no job with this id.
    """
    job = nim_manager.get(job_id)
    if job is None:
        raise HTTPException(404, "no such job")

    def frame(payload: dict, event: str = "") -> str:
        # Build one SSE frame; unnamed frames omit the "event:" field.
        header = f"event: {event}\n" if event else ""
        return f"{header}data: {json.dumps(payload)}\n\n"

    async def gen():
        delivered = 0
        phase_sent = None
        while True:
            # Copy the unsent tail once so lines appended while we are
            # suspended at a yield are picked up on the next pass.
            fresh = job.lines[delivered:]
            for text in fresh:
                yield frame({'line': text})
            delivered += len(fresh)

            current_phase = job.phase
            if current_phase != phase_sent:
                yield frame({'state': job.state, 'phase': current_phase}, "phase")
                phase_sent = current_phase

            finished = job.returncode is not None
            if finished and delivered >= len(job.lines):
                yield frame({'state': job.state, 'returncode': job.returncode}, "done")
                return

            await asyncio.sleep(0.5)

    return StreamingResponse(gen(), media_type="text/event-stream")
@app.delete("/api/services/{name}")
async def del_service(name: str) -> dict:
    """Delete a custom service entry by name.

    The bundled ``parakeet`` and ``magpie`` keys are rejected; every other
    name is passed to ``delete_custom_service``.

    Raises:
        HTTPException: 400 when ``name`` is one of the built-in keys.
    """
    builtin_keys = ("parakeet", "magpie")
    if name not in builtin_keys:
        delete_custom_service(name)
        return {"ok": True, "name": name}
    raise HTTPException(400, "built-in service; cannot delete (use Configure Sparks to point at a different host)")
@app.post("/api/services/{name}/{action}")
async def service_action(name: str, action: str) -> dict:
services = services_from_settings(settings)