Files
spark-control/image/app/nim.py
T
Grant 1889ab45fb v0.4.0 - NIM installer + dashboard resilience
Hotfix (was v0.3.1):
- services.py: cache 'unreachable' per (host,user) for 25s so a dead Spark doesn't hang every /api/services call behind 6s ssh timeout
- ssh_run timeout reduced 10 -> 6s for docker_state probes
- hardware probe: shorter SSH timeout (6s), longer cache TTL for failures (25s)
- JS pollStatus retries loadModels() if state.models is empty (recovers from cold-start proxy timeout)
- Unreachable hardware card now includes troubleshooting steps (Spark Control cannot SSH into an unreachable Spark to restart it)

v0.4 NIM installer:
- nim.py module: curated SUGGESTED_NIMS list (Parakeet, Magpie, Riva) + NimManager that runs docker login nvcr.io + docker pull + docker run -d --gpus all -p PORT:PORT -v VOL:/opt/nim/.cache -e NGC_API_KEY -e ... --restart=unless-stopped + chown the volume to uid 1000 + restart. Streams all output via SSE; redacts the API key from log lines.
- custom_services.py: persists installed NIMs to /data/services-overrides.yaml so they appear in the services panel after install
- services.py: merges custom services into the panel
- /api/nim/catalog GET, /api/nim/install POST + GET/SSE
- /api/services/{name} DELETE for custom services
- UI: '+ Install NIM' button next to 'Always-on services'; modal lists curated images each with a 'Pick' button + a custom-image form; installation runs in a second dialog with phase + elapsed timer + collapsible log
- NGC API key field added to Configure Sparks (masked); injected as NGC_API_KEY env var into the container

Package: bump 0.4.0:0; main.ts adds SERVICES_OVERRIDES + NGC_API_KEY env vars
2026-05-12 12:32:29 -05:00

203 lines
7.9 KiB
Python

"""NVIDIA NIM container install / lifecycle.

Two pieces:

* A small curated catalog of NIM images (so users don't have to copy/paste
  huge nvcr.io URLs).
* An installer that SSHes into the target Spark, runs `docker pull` then
  `docker run -d --gpus all -p PORT:PORT -v VOLUME:/opt/nim/.cache
  -e NGC_API_KEY=... IMAGE` and streams output.

Custom services also persist via `overrides.add_custom_service()` so the
Services panel can show them.
"""
from __future__ import annotations

import asyncio
import shlex
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional

from .config import Settings
from .ssh import ssh_stream, StreamHandle
# Hand-picked NIM containers that are most useful on a dual-Spark
# audio-and-LLM setup. The complete NGC catalog lives at
# https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia
CATALOG_URL = "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers"

# One dict per suggested image. Every entry carries the same keys:
#   key               - stable identifier for the entry
#   name              - human-readable title
#   image             - full nvcr.io image reference
#   default_container - suggested docker container name
#   default_port      - suggested port
#   kind              - "stt" or "tts"
#   description       - short blurb
#   homepage          - NGC catalog page
SUGGESTED_NIMS: list[dict] = [
    {
        "key": "parakeet-tdt-0.6b-v3",
        "name": "Parakeet TDT 0.6B v3",
        "image": "nvcr.io/nim/nvidia/parakeet-tdt-0-6b-v3:latest",
        "default_container": "parakeet-asr",
        "default_port": 8000,
        "kind": "stt",
        "description": "Streaming speech-to-text (English). Used by Open WebUI for voice input. ~1 GB.",
        "homepage": "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/parakeet-tdt-0-6b-v3",
    },
    {
        "key": "magpie-tts-multilingual",
        "name": "Magpie TTS Multilingual",
        "image": "nvcr.io/nim/nvidia/magpie-tts-multilingual:latest",
        "default_container": "magpie-tts",
        "default_port": 9000,
        "kind": "tts",
        "description": "Multilingual text-to-speech. Counterpart to Parakeet for 'read aloud'. ~3 GB.",
        "homepage": "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/magpie-tts-multilingual",
    },
    {
        "key": "riva-multilingual",
        "name": "Riva Multilingual ASR",
        "image": "nvcr.io/nim/nvidia/riva-multilingual:latest",
        "default_container": "riva-asr",
        "default_port": 8001,
        "kind": "stt",
        "description": "NVIDIA Riva speech-recognition multi-language model. Larger and more accurate than Parakeet.",
        "homepage": "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia",
    },
]
@dataclass
class NimInstallJob:
    """Progress record and captured output of a single NIM install run.

    The first eight fields describe the request and are required; the
    remaining fields start at their defaults and are mutated while the
    install progresses.
    """

    # Maximum number of log lines retained in `lines`; the oldest are dropped
    # first. Deliberately un-annotated so @dataclass does not treat it as a
    # field (the generated __init__ signature is unchanged).
    MAX_LOG_LINES = 1000

    id: str
    image: str
    container: str
    port: int
    host: str
    user: str
    volume: Optional[str]
    started_at: str
    state: str = "starting"  # starting | pulling | running | done | failed
    phase: str = "Starting…"  # human-readable description of the current step
    lines: list[str] = field(default_factory=list)
    returncode: Optional[int] = None
    finished_at: Optional[str] = None

    def append(self, line: str) -> None:
        """Add *line* to the log, keeping only the newest MAX_LOG_LINES entries."""
        self.lines.append(line)
        overflow = len(self.lines) - self.MAX_LOG_LINES
        if overflow > 0:
            # Trim in place so existing references to `lines` stay valid.
            del self.lines[:overflow]
class NimManager:
    """Install NIM containers on a remote host over SSH, one job at a time.

    `trigger()` validates the request, registers a `NimInstallJob` and starts
    a background task. The task runs a single shell script on the target via
    `ssh_stream`, copies its output into the job's log and updates
    `job.state` / `job.phase` as recognisable lines go by.
    """

    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self.lock = asyncio.Lock()
        self.jobs: dict[str, NimInstallJob] = {}
        self.current_job_id: Optional[str] = None
        # Strong references to in-flight install tasks. The event loop only
        # keeps weak references, so an unreferenced task may be collected
        # before it finishes.
        self._tasks: set[asyncio.Task] = set()

    def get(self, job_id: str) -> NimInstallJob | None:
        """Return the job registered under *job_id*, or None if unknown."""
        return self.jobs.get(job_id)

    async def trigger(
        self,
        *,
        image: str,
        container: str,
        port: int,
        host: str,
        user: str,
        volume: str | None = None,
        extra_env: dict[str, str] | None = None,
    ) -> NimInstallJob:
        """Register a new install job and start it in the background.

        Args:
            image: Image reference to pull and run.
            container: Name given to the created container.
            port: Port published as PORT:PORT.
            host: SSH target host.
            user: SSH user on the target host.
            volume: Docker volume mounted at /opt/nim/.cache; defaults to
                "<container>-cache".
            extra_env: Additional KEY=VALUE pairs passed with `-e`.

        Returns:
            The newly created job (its state is updated by the background task).

        Raises:
            RuntimeError: if an install is already in progress, the target
                host/user is missing, or no NGC API key is configured.
        """
        # `current_job_id` is set synchronously below, whereas the lock is
        # only taken once the background task gets scheduled; checking both
        # closes the window in which a second trigger could slip through.
        if self.lock.locked() or self.current_job_id is not None:
            raise RuntimeError("Another NIM install is already in progress")
        if not host or not user:
            raise RuntimeError("target host not configured")
        if not self.settings.ngc_api_key:
            raise RuntimeError(
                "NGC_API_KEY is not set. Open Configure Sparks in StartOS and paste your NGC personal API key (free at https://ngc.nvidia.com/setup/personal-key)."
            )
        job = NimInstallJob(
            id=uuid.uuid4().hex[:8],
            image=image,
            container=container,
            port=port,
            host=host,
            user=user,
            volume=volume or f"{container}-cache",
            started_at=datetime.now(timezone.utc).isoformat(),
        )
        self.jobs[job.id] = job
        self.current_job_id = job.id
        task = asyncio.create_task(self._run(job, extra_env or {}))
        self._tasks.add(task)
        task.add_done_callback(self._tasks.discard)
        return job

    async def _run(self, job: NimInstallJob, extra_env: dict[str, str]) -> None:
        """Execute *job* under the install lock and record its final outcome."""
        try:
            async with self.lock:
                try:
                    await self._do(job, extra_env)
                    if job.state != "failed":
                        job.state = "done"
                        job.returncode = 0
                        job.phase = "Done"
                except Exception as e:
                    # The exception text may echo the remote command, which
                    # embeds the API key, so redact before logging.
                    job.append(self._redact(f"[error] {type(e).__name__}: {e}"))
                    job.state = "failed"
                    if job.returncode is None:
                        job.returncode = 1
                finally:
                    job.finished_at = datetime.now(timezone.utc).isoformat()
        finally:
            # Outside the `async with` so the slot is also released when the
            # task is cancelled while still waiting for the lock.
            if self.current_job_id == job.id:
                self.current_job_id = None

    def _redact(self, text: str) -> str:
        """Return *text* with any occurrence of the NGC API key masked."""
        key = self.settings.ngc_api_key
        return text.replace(key, "***") if key else text

    def _build_command(self, job: NimInstallJob, extra_env: dict[str, str]) -> str:
        """Build the remote shell script: login, pull, (re)create, fix volume owner.

        Every caller-supplied value is passed through `shlex.quote`, so names
        containing spaces or shell metacharacters cannot alter the script.

        Raises:
            ValueError / TypeError: if `job.port` is not integer-like.
        """
        q = shlex.quote
        image = q(job.image)
        container = q(job.container)
        port = int(job.port)
        cache_mount = q(f"{job.volume}:/opt/nim/.cache")
        chown_mount = q(f"{job.volume}:/cache")

        # The key is exported once and referenced by name afterwards.
        env_parts = ['-e NGC_API_KEY="$NGC_API_KEY"']
        for k, v in extra_env.items():
            env_parts.append("-e " + q(f"{k}={v}"))
        env_str = " ".join(env_parts)

        run_cmd = (
            f"docker run -d --gpus all "
            f"-p {port}:{port} "
            f"-v {cache_mount} "
            f"{env_str} "
            f"--name {container} "
            f"--restart unless-stopped "
            f"{image}"
        )
        pull_banner = q(f"=== docker pull {job.image} (this can be 1-10 GB) ===")
        run_banner = q(f"=== {run_cmd} ===")

        steps = [
            "set -e",
            f"export NGC_API_KEY={q(self.settings.ngc_api_key)}",
            "echo '=== docker login nvcr.io ==='",
            # '$oauthtoken' is single-quoted on purpose: it is the literal
            # user name sent to the registry, not a shell variable.
            "echo \"$NGC_API_KEY\" | docker login nvcr.io -u '$oauthtoken' --password-stdin",
            f"echo {pull_banner}",
            f"docker pull {image}",
            "echo '=== remove any prior container with the same name ==='",
            f"docker rm -f {container} 2>/dev/null || true",
            f"echo {run_banner}",
            run_cmd,
            "echo '=== ensuring cache volume is writable by uid 1000 (riva-server) ==='",
            f"docker run --rm -v {chown_mount} alpine chown -R 1000:1000 /cache && "
            f"docker restart {container}",
            "echo '=== install complete; container is starting up and will download its model on first boot ==='",
        ]
        return "; ".join(steps)

    async def _do(self, job: NimInstallJob, extra_env: dict[str, str]) -> None:
        """Run the install script over SSH, mirroring its output into *job*.

        Sets `job.state` to "failed" and stores the return code when the
        remote command exits non-zero; exceptions propagate to the caller.
        """
        cmd = self._build_command(job, extra_env)
        # Log a placeholder rather than the real command, which embeds the key.
        job.append(f"$ <install command for {job.image} on {job.host}>")
        job.state = "pulling"
        job.phase = "Pulling image from nvcr.io (this can take a few minutes)…"
        handle = StreamHandle()
        api_key = self.settings.ngc_api_key
        async for line in ssh_stream(job.host, job.user, cmd, self.settings, handle=handle):
            # Don't log lines containing the api key
            if api_key and api_key in line:
                continue
            job.append(line)
            # Phase tracking keys off the echoed banners and docker's own output.
            if "docker pull" in line:
                job.phase = "Pulling image from nvcr.io…"
            elif "Login Succeeded" in line:
                job.phase = "Logged in to NGC; pulling image…"
            elif "Pull complete" in line:
                job.phase = "Pulling layers…"
            elif "Status: Downloaded newer image" in line or "Image is up to date" in line:
                job.phase = "Image ready; starting container…"
            elif "docker run -d" in line:
                job.state = "running"
                job.phase = "Container starting; downloading model on first boot…"
        # NOTE(review): a missing return code (None) is treated as success here;
        # confirm StreamHandle always populates `returncode` when the stream ends.
        rc = handle.returncode or 0
        if rc != 0:
            job.state = "failed"
            job.returncode = rc