v0.4.0 - NIM installer + dashboard resilience
Hotfix (was v0.3.1):
- services.py: cache the 'unreachable' result per (host, user) for 25s so a dead Spark doesn't make every /api/services call hang behind a 6s SSH timeout
- ssh_run timeout reduced 10 -> 6s for docker_state probes
- hardware probe: shorter SSH timeout (6s), longer cache TTL for failures (25s)
- JS pollStatus retries loadModels() if state.models is empty (recovers from cold-start proxy timeout)
- Unreachable hardware card now includes troubleshooting steps (Spark Control cannot SSH into an unreachable Spark to restart it)
v0.4 NIM installer:
- nim.py module: curated SUGGESTED_NIMS list (Parakeet, Magpie, Riva) plus a NimManager that runs, in order: docker login nvcr.io; docker pull; docker run -d --gpus all -p PORT:PORT -v VOL:/opt/nim/.cache -e NGC_API_KEY -e ... --restart=unless-stopped; chown of the volume to uid 1000; then a restart. All output is streamed via SSE, and the API key is redacted from log lines.
- custom_services.py: persists installed NIMs to /data/services-overrides.yaml so they appear in the services panel after install
- services.py: merges custom services into the panel
- /api/nim/catalog GET; /api/nim/install POST; /api/nim/install/{job_id} GET (status snapshot) and /api/nim/install/{job_id}/stream GET (SSE)
- /api/services/{name} DELETE for custom services
- UI: '+ Install NIM' button next to 'Always-on services'; modal lists curated images each with a 'Pick' button + a custom-image form; installation runs in a second dialog with phase + elapsed timer + collapsible log
- NGC API key field added to Configure Sparks (masked); injected as NGC_API_KEY env var into the container
Package: bump 0.4.0:0; main.ts adds SERVICES_OVERRIDES + NGC_API_KEY env vars
This commit is contained in:
@@ -10,10 +10,12 @@ from pydantic import BaseModel
|
||||
from typing import Literal
|
||||
|
||||
from .config import Settings
|
||||
from .custom_services import add_custom_service, delete_custom_service
|
||||
from .download import DownloadManager
|
||||
from .hardware import HardwareProbe
|
||||
from .health import check_magpie, check_parakeet, check_vllm
|
||||
from .models import load_catalog
|
||||
from .nim import SUGGESTED_NIMS, CATALOG_URL, NimManager
|
||||
from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs
|
||||
from .services import docker_state, run_action, services_from_settings
|
||||
from .ssh import ssh_run
|
||||
@@ -27,6 +29,7 @@ swap_manager = SwapManager(settings, catalog)
|
||||
download_manager = DownloadManager(settings)
|
||||
update_manager = UpdateManager(settings)
|
||||
hardware_probe = HardwareProbe(settings)
|
||||
nim_manager = NimManager(settings)
|
||||
|
||||
app = FastAPI(title="spark-control", version="0.1.0")
|
||||
|
||||
@@ -170,6 +173,108 @@ async def get_services() -> dict:
|
||||
return out
|
||||
|
||||
|
||||
@app.get("/api/nim/catalog")
async def get_nim_catalog() -> dict:
    """Report the NIM catalog URL, NGC key status and curated suggestions.

    Returns:
        A dict with ``catalog_url``, ``ngc_key_configured`` (true when the
        settings hold a non-empty NGC API key) and ``suggested``.
    """
    # Only expose whether a key exists, never the key itself.
    has_ngc_key = bool(settings.ngc_api_key)
    payload = dict(
        catalog_url=CATALOG_URL,
        ngc_key_configured=has_ngc_key,
        suggested=SUGGESTED_NIMS,
    )
    return payload
|
||||
|
||||
|
||||
# Request body for POST /api/nim/install.
class NimInstallBody(BaseModel):
    # Image reference handed to the NIM manager and stored on registration.
    image: str
    # Container name; also used as the key of the registered custom service.
    container: str
    # Port handed to the NIM manager and stored on registration.
    port: int
    # Which configured Spark to target; resolved to host/user from settings.
    host: Literal["spark1", "spark2"] = "spark2"
    # Service kind stored on registration; empty falls back to "nim".
    kind: str = ""
    # Write the service to the custom services overrides after install.
    register: bool = True
|
||||
|
||||
|
||||
@app.post("/api/nim/install")
async def post_nim_install(body: NimInstallBody) -> dict:
    """Start a NIM install job on the selected Spark and optionally register it.

    Args:
        body: Install request: image, container name, port, target Spark,
            service kind, and whether to register the service.

    Returns:
        A dict with the new job's ``job_id``, ``image``, ``container`` and
        ``state``.

    Raises:
        HTTPException: 400 when the port is not a valid TCP port or the NIM
            manager rejects the request; 409 when the manager reports that
            an install is already in progress.
    """
    # Reject invalid ports before anything is started or persisted.
    if not 1 <= body.port <= 65535:
        raise HTTPException(400, f"port must be between 1 and 65535, got {body.port}")

    # Resolve the symbolic Spark name to its configured SSH host and user.
    if body.host == "spark1":
        target_host, target_user = settings.spark1_host, settings.spark1_user
    else:
        target_host, target_user = settings.spark2_host, settings.spark2_user

    try:
        job = await nim_manager.trigger(
            image=body.image,
            container=body.container,
            port=body.port,
            host=target_host,
            user=target_user,
        )
    except RuntimeError as e:
        # The manager's message text distinguishes "busy" from other failures.
        status = 409 if "in progress" in str(e) else 400
        # Chain the cause so the original traceback is kept for debugging.
        raise HTTPException(status, str(e)) from e

    if body.register:
        # Persist in custom services so the panel shows it after install.
        # NOTE(review): this runs as soon as the job is triggered, not when it
        # finishes; a failed install presumably leaves the entry behind --
        # confirm that is intended.
        add_custom_service({
            "key": body.container,
            "kind": body.kind or "nim",
            "host": target_host,
            "user": target_user,
            "container": body.container,
            "port": body.port,
            "image": body.image,
        })
    return {"job_id": job.id, "image": job.image, "container": job.container, "state": job.state}
|
||||
|
||||
|
||||
@app.get("/api/nim/install/{job_id}")
async def get_nim_install(job_id: str) -> dict:
    """Return a snapshot of one install job, including its log lines.

    Args:
        job_id: Identifier of the job to look up in the NIM manager.

    Returns:
        A dict of the job's attributes, keyed by attribute name.

    Raises:
        HTTPException: 404 when the NIM manager has no job with ``job_id``.
    """
    job = nim_manager.get(job_id)
    if job is None:
        raise HTTPException(404, "no such job")

    # Response keys match the job attribute names one-to-one, in this order.
    exposed = (
        "id",
        "image",
        "container",
        "port",
        "host",
        "state",
        "phase",
        "started_at",
        "finished_at",
        "returncode",
        "lines",
    )
    return {field: getattr(job, field) for field in exposed}
|
||||
|
||||
|
||||
@app.get("/api/nim/install/{job_id}/stream")
async def stream_nim_install(job_id: str):
    """Stream an install job's log lines and phase changes as server-sent events.

    Events emitted:
        unnamed (``data:`` only): ``{"line": ...}`` for each new log line.
        ``phase``: ``{"state", "phase"}`` whenever the job's phase changes.
        ``done``: ``{"state", "returncode"}`` once the job has a return code
            and every log line has been sent; the stream then ends.

    Args:
        job_id: Identifier of the job to look up in the NIM manager.

    Raises:
        HTTPException: 404 when the NIM manager has no job with ``job_id``.
    """
    job = nim_manager.get(job_id)
    if job is None:
        raise HTTPException(404, "no such job")

    async def gen():
        sent = 0  # number of log lines already delivered to the client
        last_phase = None
        while True:
            # Snapshot the length: lines appended while suspended in a yield
            # are picked up on the next pass instead of being skipped.
            n = len(job.lines)
            if n > sent:
                for line in job.lines[sent:n]:
                    yield f"data: {json.dumps({'line': line})}\n\n"
                sent = n

            # Read the phase once and record it BEFORE yielding. Re-reading
            # job.phase after the yield could mark a newer phase as sent
            # without ever emitting it.
            phase = job.phase
            if phase != last_phase:
                last_phase = phase
                phase_payload = json.dumps({'state': job.state, 'phase': phase})
                yield f"event: phase\ndata: {phase_payload}\n\n"

            # Finish only after the job has ended and all output is flushed.
            if job.returncode is not None and sent >= len(job.lines):
                done_payload = json.dumps({'state': job.state, 'returncode': job.returncode})
                yield f"event: done\ndata: {done_payload}\n\n"
                return
            await asyncio.sleep(0.5)

    return StreamingResponse(gen(), media_type="text/event-stream")
|
||||
|
||||
|
||||
@app.delete("/api/services/{name}")
async def del_service(name: str) -> dict:
    """Delete a custom service entry; the bundled services are refused.

    Args:
        name: Key of the service to delete.

    Returns:
        ``{"ok": True, "name": name}`` after the delete call returns.

    Raises:
        HTTPException: 400 when ``name`` is one of the bundled keys.
    """
    # The bundled parakeet/magpie keys are not custom entries.
    builtin_keys = ("parakeet", "magpie")
    is_builtin = name in builtin_keys
    if is_builtin:
        raise HTTPException(400, "built-in service; cannot delete (use Configure Sparks to point at a different host)")

    delete_custom_service(name)
    return dict(ok=True, name=name)
|
||||
|
||||
|
||||
@app.post("/api/services/{name}/{action}")
|
||||
async def service_action(name: str, action: str) -> dict:
|
||||
services = services_from_settings(settings)
|
||||
|
||||
Reference in New Issue
Block a user