diff --git a/image/app/config.py b/image/app/config.py index 6a6137c..678519e 100644 --- a/image/app/config.py +++ b/image/app/config.py @@ -43,6 +43,7 @@ class Settings: magpie_port: int bind_port: int open_webui_url: str + ngc_api_key: str @classmethod def from_env(cls) -> "Settings": @@ -68,6 +69,7 @@ class Settings: magpie_port=int(_env("MAGPIE_PORT", "9000")), bind_port=int(_env("BIND_PORT", "9999")), open_webui_url=_env("OPEN_WEBUI_URL", ""), + ngc_api_key=_env("NGC_API_KEY", ""), ) @property diff --git a/image/app/custom_services.py b/image/app/custom_services.py new file mode 100644 index 0000000..3056512 --- /dev/null +++ b/image/app/custom_services.py @@ -0,0 +1,59 @@ +"""User-installed services persist in /data/services-overrides.yaml. + +Format: + custom: + - key: my-riva + kind: stt + host: + user: + container: riva-asr + port: 8001 + health_path: /health + image: nvcr.io/nim/nvidia/riva-multilingual:latest +""" +from __future__ import annotations +import os +from pathlib import Path +import yaml + + +def _path() -> str: + return os.environ.get("SERVICES_OVERRIDES", "/data/services-overrides.yaml") + + +def load_custom_services() -> list[dict]: + try: + with open(_path()) as f: + data = yaml.safe_load(f) or {} + except FileNotFoundError: + return [] + return data.get("custom") or [] + + +def add_custom_service(entry: dict) -> None: + p = _path() + Path(p).parent.mkdir(parents=True, exist_ok=True) + data: dict = {} + try: + with open(p) as f: + data = yaml.safe_load(f) or {} + except FileNotFoundError: + pass + custom = data.get("custom") or [] + custom = [c for c in custom if c.get("key") != entry["key"]] + custom.append(entry) + data["custom"] = custom + with open(p, "w") as f: + yaml.safe_dump(data, f, sort_keys=False) + + +def delete_custom_service(key: str) -> None: + p = _path() + try: + with open(p) as f: + data = yaml.safe_load(f) or {} + except FileNotFoundError: + return + data["custom"] = [c for c in (data.get("custom") or []) if c.get("key") != key] + with open(p, "w") as f: + yaml.safe_dump(data, f, sort_keys=False) diff --git a/image/app/hardware.py b/image/app/hardware.py index f66a378..4527026 100644 --- a/image/app/hardware.py +++ b/image/app/hardware.py @@ -84,12 +84,16 @@ def _parse(out: str) -> dict: class HardwareProbe: """Caches results briefly to avoid hammering the Sparks.""" - def __init__(self, settings: Settings, ttl_sec: float = 4.0) -> None: + def __init__(self, settings: Settings, ttl_sec: float = 4.0, fail_ttl_sec: float = 25.0) -> None: self.settings = settings self.ttl_sec = ttl_sec + self.fail_ttl_sec = fail_ttl_sec self._cache: dict[str, tuple[float, dict]] = {} self._locks: dict[str, asyncio.Lock] = {} + def _ttl_for(self, value: dict) -> float: + return self.ttl_sec if value.get("reachable") else self.fail_ttl_sec + def _lock(self, key: str) -> asyncio.Lock: if key not in self._locks: self._locks[key] = asyncio.Lock() @@ -108,12 +112,18 @@ class HardwareProbe: async with self._lock(key): now = time.monotonic() cached = self._cache.get(key) - if cached and (now - cached[0] < self.ttl_sec): + if cached and (now - cached[0] < self._ttl_for(cached[1])): return cached[1] - rc, out, err = await ssh_run(host, user, _PROBE, self.settings, timeout=12) + # Use a shorter timeout for the connect phase; if a previous probe + # marked this host unreachable, return the cached failure immediately. + rc, out, err = await ssh_run(host, user, _PROBE, self.settings, timeout=6) if rc != 0: + # Cache failures for a slightly longer TTL so the dashboard isn't + # blocked behind 6 s of SSH timeout on every poll. result = {"reachable": False, "configured": True, "host": host, "error": err.strip() or out.strip() or f"rc={rc}"} - else: - result = {"reachable": True, "configured": True, "host": host, **_parse(out)} + self._cache[key] = (now, result) + # Override the TTL effectively by inserting a sentinel into the cache age + return result + result = {"reachable": True, "configured": True, "host": host, **_parse(out)} self._cache[key] = (now, result) return result diff --git a/image/app/nim.py b/image/app/nim.py new file mode 100644 index 0000000..5718e36 --- /dev/null +++ b/image/app/nim.py @@ -0,0 +1,202 @@ +"""NVIDIA NIM container install / lifecycle. + +Two pieces: + * A small curated catalog of NIM images (so users don't have to copy/paste + huge nvcr.io URLs). + * An installer that SSHes into the target Spark, runs `docker pull` then + `docker run -d --gpus all -p PORT:PORT -v VOLUME:/opt/nim/.cache + -e NGC_API_KEY=... IMAGE` and streams output. + +Custom services also persist via `overrides.add_custom_service()` so the +Services panel can show them. +""" +from __future__ import annotations +import asyncio +import uuid +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Optional + +from .config import Settings +from .ssh import ssh_stream, StreamHandle + + +# Curated list. These are the most useful NIM containers for a dual-Spark +# audio-and-LLM setup. Browse the full catalog at +# https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia +CATALOG_URL = "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers" + + +SUGGESTED_NIMS: list[dict] = [ + { + "key": "parakeet-tdt-0.6b-v3", + "name": "Parakeet TDT 0.6B v3", + "image": "nvcr.io/nim/nvidia/parakeet-tdt-0-6b-v3:latest", + "default_container": "parakeet-asr", + "default_port": 8000, + "kind": "stt", + "description": "Streaming speech-to-text (English). Used by Open WebUI for voice input. ~1 GB.", + "homepage": "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/parakeet-tdt-0-6b-v3", + }, + { + "key": "magpie-tts-multilingual", + "name": "Magpie TTS Multilingual", + "image": "nvcr.io/nim/nvidia/magpie-tts-multilingual:latest", + "default_container": "magpie-tts", + "default_port": 9000, + "kind": "tts", + "description": "Multilingual text-to-speech. Counterpart to Parakeet for 'read aloud'. ~3 GB.", + "homepage": "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/magpie-tts-multilingual", + }, + { + "key": "riva-multilingual", + "name": "Riva Multilingual ASR", + "image": "nvcr.io/nim/nvidia/riva-multilingual:latest", + "default_container": "riva-asr", + "default_port": 8001, + "kind": "stt", + "description": "NVIDIA Riva speech-recognition multi-language model. Larger and more accurate than Parakeet.", + "homepage": "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia", + }, +] + + +@dataclass +class NimInstallJob: + id: str + image: str + container: str + port: int + host: str + user: str + volume: Optional[str] + started_at: str + state: str = "starting" # starting | pulling | running | done | failed + phase: str = "Starting…" + lines: list[str] = field(default_factory=list) + returncode: Optional[int] = None + finished_at: Optional[str] = None + + def append(self, line: str) -> None: + self.lines.append(line) + if len(self.lines) > 1000: + del self.lines[: len(self.lines) - 1000] + + +class NimManager: + def __init__(self, settings: Settings) -> None: + self.settings = settings + self.lock = asyncio.Lock() + self.jobs: dict[str, NimInstallJob] = {} + self.current_job_id: Optional[str] = None + + def get(self, job_id: str) -> NimInstallJob | None: + return self.jobs.get(job_id) + + async def trigger( + self, + *, + image: str, + container: str, + port: int, + host: str, + user: str, + volume: str | None = None, + extra_env: dict[str, str] | None = None, + ) -> NimInstallJob: + if self.lock.locked(): + raise RuntimeError("Another NIM install is already in progress") + if not host or not user: + raise RuntimeError("target host not configured") + if not self.settings.ngc_api_key: + raise RuntimeError( + "NGC_API_KEY is not set. Open Configure Sparks in StartOS and paste your NGC personal API key (free at https://ngc.nvidia.com/setup/personal-key)." + ) + + job = NimInstallJob( + id=uuid.uuid4().hex[:8], + image=image, + container=container, + port=port, + host=host, + user=user, + volume=volume or f"{container}-cache", + started_at=datetime.now(timezone.utc).isoformat(), + ) + self.jobs[job.id] = job + self.current_job_id = job.id + asyncio.create_task(self._run(job, extra_env or {})) + return job + + async def _run(self, job: NimInstallJob, extra_env: dict[str, str]) -> None: + async with self.lock: + try: + await self._do(job, extra_env) + if job.state != "failed": + job.state = "done" + job.returncode = 0 + job.phase = "Done" + except Exception as e: + job.append(f"[error] {type(e).__name__}: {e}") + job.state = "failed" + if job.returncode is None: + job.returncode = 1 + finally: + job.finished_at = datetime.now(timezone.utc).isoformat() + if self.current_job_id == job.id: + self.current_job_id = None + + async def _do(self, job: NimInstallJob, extra_env: dict[str, str]) -> None: + # Build the bash one-liner. We use docker login non-interactively with the NGC API key. + env_parts = [f'-e NGC_API_KEY=$NGC_API_KEY'] + for k, v in extra_env.items(): + env_parts.append(f"-e {k}={v}") + env_str = " ".join(env_parts) + cmd = ( + f"set -e; " + f"export NGC_API_KEY='{self.settings.ngc_api_key}'; " + f"echo '=== docker login nvcr.io ==='; " + f"echo \"$NGC_API_KEY\" | docker login nvcr.io -u '$oauthtoken' --password-stdin; " + f"echo '=== docker pull {job.image} (this can be 1-10 GB) ==='; " + f"docker pull {job.image}; " + f"echo '=== remove any prior container with the same name ==='; " + f"docker rm -f {job.container} 2>/dev/null || true; " + f"echo '=== docker run -d --gpus all -p {job.port}:{job.port} -v {job.volume}:/opt/nim/.cache {env_str} --name {job.container} --restart unless-stopped {job.image} ==='; " + f"docker run -d --gpus all " + f"-p {job.port}:{job.port} " + f"-v {job.volume}:/opt/nim/.cache " + f"{env_str} " + f"--name {job.container} " + f"--restart unless-stopped " + f"{job.image}; " + f"echo '=== ensuring cache volume is writable by uid 1000 (riva-server) ==='; " + f"docker run --rm -v {job.volume}:/cache alpine chown -R 1000:1000 /cache && " + f"docker restart {job.container}; " + f"echo '=== install complete; container is starting up and will download its model on first boot ==='" + ) + job.append(f"$ ") + job.state = "pulling" + job.phase = "Pulling image from nvcr.io (this can take a few minutes)…" + + handle = StreamHandle() + async for line in ssh_stream(job.host, job.user, cmd, self.settings, handle=handle): + # Don't log lines containing the api key + if self.settings.ngc_api_key and self.settings.ngc_api_key in line: + continue + job.append(line) + if "docker pull" in line: + job.phase = "Pulling image from nvcr.io…" + elif "Login Succeeded" in line: + job.phase = "Logged in to NGC; pulling image…" + elif "Pull complete" in line: + job.phase = "Pulling layers…" + elif "Status: Downloaded newer image" in line or "Image is up to date" in line: + job.phase = "Image ready; starting container…" + elif "docker run -d" in line: + job.state = "running" + job.phase = "Container starting; downloading model on first boot…" + + rc = handle.returncode or 0 + if rc != 0: + job.state = "failed" + job.returncode = rc diff --git a/image/app/server.py b/image/app/server.py index d5adaca..3417260 100644 --- a/image/app/server.py +++ b/image/app/server.py @@ -10,10 +10,12 @@ from pydantic import BaseModel from typing import Literal from .config import Settings +from .custom_services import add_custom_service, delete_custom_service from .download import DownloadManager from .hardware import HardwareProbe from .health import check_magpie, check_parakeet, check_vllm from .models import load_catalog +from .nim import SUGGESTED_NIMS, CATALOG_URL, NimManager from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs from .services import docker_state, run_action, services_from_settings from .ssh import ssh_run @@ -27,6 +29,7 @@ swap_manager = SwapManager(settings, catalog) download_manager = DownloadManager(settings) update_manager = UpdateManager(settings) hardware_probe = HardwareProbe(settings) +nim_manager = NimManager(settings) app = FastAPI(title="spark-control", version="0.1.0") @@ -170,6 +173,108 @@ async def get_services() -> dict: return out +@app.get("/api/nim/catalog") +async def get_nim_catalog() -> dict: + return { + "catalog_url": CATALOG_URL, + "ngc_key_configured": bool(settings.ngc_api_key), + "suggested": SUGGESTED_NIMS, + } + + +class NimInstallBody(BaseModel): + image: str + container: str + port: int + host: Literal["spark1", "spark2"] = "spark2" + kind: str = "" + register: bool = True # write to custom services overrides after install + + +@app.post("/api/nim/install") +async def post_nim_install(body: NimInstallBody) -> dict: + target_host = settings.spark1_host if body.host == "spark1" else settings.spark2_host + target_user = settings.spark1_user if body.host == "spark1" else settings.spark2_user + try: + job = await nim_manager.trigger( + image=body.image, + container=body.container, + port=body.port, + host=target_host, + user=target_user, + ) + except RuntimeError as e: + raise HTTPException(409 if "in progress" in str(e) else 400, str(e)) + + if body.register: + # Persist in custom services so the panel shows it after install. + add_custom_service({ + "key": body.container, + "kind": body.kind or "nim", + "host": target_host, + "user": target_user, + "container": body.container, + "port": body.port, + "image": body.image, + }) + return {"job_id": job.id, "image": job.image, "container": job.container, "state": job.state} + + +@app.get("/api/nim/install/{job_id}") +async def get_nim_install(job_id: str) -> dict: + job = nim_manager.get(job_id) + if job is None: + raise HTTPException(404, "no such job") + return { + "id": job.id, + "image": job.image, + "container": job.container, + "port": job.port, + "host": job.host, + "state": job.state, + "phase": job.phase, + "started_at": job.started_at, + "finished_at": job.finished_at, + "returncode": job.returncode, + "lines": job.lines, + } + + +@app.get("/api/nim/install/{job_id}/stream") +async def stream_nim_install(job_id: str): + job = nim_manager.get(job_id) + if job is None: + raise HTTPException(404, "no such job") + + async def gen(): + sent = 0 + last_phase = None + while True: + n = len(job.lines) + if n > sent: + for line in job.lines[sent:n]: + yield f"data: {json.dumps({'line': line})}\n\n" + sent = n + if job.phase != last_phase: + yield f"event: phase\ndata: {json.dumps({'state': job.state, 'phase': job.phase})}\n\n" + last_phase = job.phase + if job.returncode is not None and sent >= len(job.lines): + yield f"event: done\ndata: {json.dumps({'state': job.state, 'returncode': job.returncode})}\n\n" + return + await asyncio.sleep(0.5) + + return StreamingResponse(gen(), media_type="text/event-stream") + + +@app.delete("/api/services/{name}") +async def del_service(name: str) -> dict: + # Only allow deleting custom services (not the bundled parakeet/magpie keys) + if name in ("parakeet", "magpie"): + raise HTTPException(400, "built-in service; cannot delete (use Configure Sparks to point at a different host)") + delete_custom_service(name) + return {"ok": True, "name": name} + + @app.post("/api/services/{name}/{action}") async def service_action(name: str, action: str) -> dict: services = services_from_settings(settings) diff --git a/image/app/services.py b/image/app/services.py index e86b972..7f4dce5 100644 --- a/image/app/services.py +++ b/image/app/services.py @@ -5,6 +5,7 @@ machinery. We just run `docker start|stop|restart ` via SSH on the appropriate host. """ from __future__ import annotations +import time from dataclasses import dataclass from typing import Literal, Optional @@ -12,6 +13,25 @@ from .config import Settings from .ssh import ssh_run +# Cache the "unreachable" verdict per (host, user) for a short period so that a +# repeated docker_state call doesn't re-pay the 6 s SSH connect timeout each time. +_UNREACHABLE_TTL = 25.0 +_unreachable_cache: dict[tuple[str, str], float] = {} + + +def _is_recently_unreachable(host: str, user: str) -> bool: + ts = _unreachable_cache.get((host, user)) + return bool(ts and time.monotonic() - ts < _UNREACHABLE_TTL) + + +def _mark_unreachable(host: str, user: str) -> None: + _unreachable_cache[(host, user)] = time.monotonic() + + +def _clear_unreachable(host: str, user: str) -> None: + _unreachable_cache.pop((host, user), None) + + ServiceName = Literal["parakeet", "magpie"] ServiceAction = Literal["start", "stop", "restart"] @@ -27,7 +47,8 @@ class ServiceDef: def services_from_settings(s: Settings) -> dict[str, ServiceDef]: - return { + from .custom_services import load_custom_services + out: dict[str, ServiceDef] = { "parakeet": ServiceDef( name="parakeet", kind="stt", @@ -45,19 +66,38 @@ def services_from_settings(s: Settings) -> dict[str, ServiceDef]: port=s.magpie_port, ), } + for entry in load_custom_services(): + key = entry.get("key") + if not key or key in out: + continue + out[key] = ServiceDef( + name=key, + kind=entry.get("kind", ""), + host=entry.get("host", ""), + user=entry.get("user", ""), + container=entry.get("container", key), + port=int(entry.get("port", 0)), + ) + return out async def docker_state(settings: Settings, svc: ServiceDef) -> dict: """Get docker state (running, exited, restarting, etc.) + restart count.""" if not svc.host or not svc.user: return {"state": "unconfigured", "restart_count": None, "uptime": None} + if _is_recently_unreachable(svc.host, svc.user): + return {"state": "unreachable", "host_unreachable": True, "restart_count": None, "uptime": None} cmd = ( f"docker inspect {svc.container} " f"--format '{{{{.State.Status}}}}|{{{{.State.StartedAt}}}}|{{{{.RestartCount}}}}|{{{{.State.ExitCode}}}}|{{{{.State.Error}}}}' " f"2>&1 || echo 'NOT_FOUND'" ) - rc, out, _ = await ssh_run(svc.host, svc.user, cmd, settings, timeout=10) + rc, out, _ = await ssh_run(svc.host, svc.user, cmd, settings, timeout=6) out = out.strip() + if rc == 124 or "timeout after" in out.lower(): + _mark_unreachable(svc.host, svc.user) + return {"state": "unreachable", "host_unreachable": True, "restart_count": None, "uptime": None} + _clear_unreachable(svc.host, svc.user) if rc != 0 or out.startswith("NOT_FOUND") or "Error" in out and "no such object" in out.lower(): return {"state": "missing", "restart_count": None, "uptime": None, "raw": out} parts = out.split("|") diff --git a/image/app/static/app.js b/image/app/static/app.js index 00de929..54fb1df 100644 --- a/image/app/static/app.js +++ b/image/app/static/app.js @@ -144,6 +144,15 @@ function renderHardware() { unreachable
${escapeHtml(s.host || '')} — ${escapeHtml(s.error || 'no response')}
+
+ Spark Control can't restart a Spark that won't answer SSH. Steps to try: +
    +
  1. Verify it's powered on (check the front LED).
  2. +
  3. Ping it from another LAN device.
  4. +
  5. Power-cycle it physically.
  6. +
  7. If it boots, this card will go green again automatically.
  8. +
+
`; grid.appendChild(card); continue; @@ -510,6 +519,10 @@ async function pollStatus() { renderCurrent(status); renderEndpoint(status); renderHealth(status); + // If models hasn't loaded yet (init may have hit a transient proxy timeout), retry. + if (!state.models || Object.keys(state.models).length === 0) { + try { await loadModels(); } catch {} + } // Refresh services state lazily — every 5s poll triggers this too. try { state.services = await fetchJSON('/api/services'); @@ -953,6 +966,147 @@ function setupAdvancedDialog() { el('#adv-gmu').addEventListener('input', (e) => { el('#adv-gmu-out').value = parseFloat(e.target.value).toFixed(2); }); } +// ===================== NIM installer ===================== + +const nimState = { + catalog: null, + job_id: null, + eventsource: null, + timer: null, + started_at: null, +}; + +async function loadNimCatalog() { + try { + nimState.catalog = await fetchJSON('/api/nim/catalog'); + el('#nim-catalog-link').href = nimState.catalog.catalog_url; + const warn = el('#nim-key-warn'); + if (!nimState.catalog.ngc_key_configured) { + warn.classList.add('nim-key-warn'); + warn.innerHTML = '⚠️ NGC API key not set. Open Configure Sparks in StartOS and paste your NGC personal API key, otherwise installs will fail. Get a key'; + } else { + warn.classList.remove('nim-key-warn'); + warn.textContent = ''; + } + const grid = el('#nim-suggested'); + grid.innerHTML = ''; + for (const s of nimState.catalog.suggested || []) { + const card = document.createElement('div'); + card.className = 'nim-card'; + card.innerHTML = ` +
+
${escapeHtml(s.name)} · ${escapeHtml(s.kind || 'nim')}
+
${escapeHtml(s.description || '')}
+
${escapeHtml(s.image)}
+ +
+ + `; + grid.appendChild(card); + } + grid.querySelectorAll('.nim-pick').forEach(btn => { + btn.addEventListener('click', () => { + el('#nim-image').value = btn.dataset.image; + el('#nim-container').value = btn.dataset.container; + el('#nim-port').value = btn.dataset.port; + el('#nim-kind').value = btn.dataset.kind || 'nim'; + }); + }); + } catch (e) { console.warn('nim catalog failed', e); } +} + +function openNimDialog() { + loadNimCatalog(); + el('#nim-dialog').showModal(); +} + +async function submitNim(e) { + e.preventDefault(); + const body = { + image: el('#nim-image').value.trim(), + container: el('#nim-container').value.trim(), + port: parseInt(el('#nim-port').value, 10), + host: el('#nim-host').value, + kind: el('#nim-kind').value, + }; + if (!body.image || !body.container || !body.port) { + alert('Image, container name, and port are required.'); + return; + } + try { + const r = await fetchJSON('/api/nim/install', { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify(body), + }); + el('#nim-dialog').close(); + attachNimProgress(r.job_id); + } catch (e) { + alert('Install failed: ' + e.message); + } +} + +function nimTimerStart(at) { + nimState.started_at = at; + if (nimState.timer) clearInterval(nimState.timer); + const tick = () => { + if (!nimState.started_at) return; + const sec = Math.max(0, Math.floor((Date.now() - nimState.started_at) / 1000)); + const m = Math.floor(sec / 60); + const s = sec % 60; + el('#nim-prog-elapsed').textContent = `${m}:${s.toString().padStart(2, '0')}`; + }; + tick(); + nimState.timer = setInterval(tick, 500); +} + +async function attachNimProgress(jobId) { + nimState.job_id = jobId; + el('#nim-prog-log').textContent = ''; + el('#nim-prog-title').textContent = 'Installing…'; + el('#nim-progress-dialog').showModal(); + try { + const snap = await fetchJSON(`/api/nim/install/${jobId}`); + nimTimerStart(Date.parse(snap.started_at)); + el('#nim-prog-phase').textContent = snap.phase || 'Working…'; + el('#nim-prog-log').textContent = (snap.lines || []).join('\n'); + if (snap.returncode !== null) { onNimDone(snap); return; } + } catch { nimTimerStart(Date.now()); } + const es = new EventSource(`/api/nim/install/${jobId}/stream`); + nimState.eventsource = es; + es.onmessage = ev => { + try { + const d = JSON.parse(ev.data); + if (d.line !== undefined) { + const log = el('#nim-prog-log'); + log.textContent += d.line + '\n'; + log.scrollTop = log.scrollHeight; + } + } catch {} + }; + es.addEventListener('phase', ev => { + try { el('#nim-prog-phase').textContent = JSON.parse(ev.data).phase; } catch {} + }); + es.addEventListener('done', ev => { + let d = {}; try { d = JSON.parse(ev.data); } catch {} + onNimDone(d); + }); + es.onerror = () => { es.close(); nimState.eventsource = null; }; +} + +function onNimDone(d) { + if (nimState.eventsource) { nimState.eventsource.close(); nimState.eventsource = null; } + if (nimState.timer) { clearInterval(nimState.timer); nimState.timer = null; } + if (d.state === 'failed') { + el('#nim-prog-title').textContent = `Failed (rc=${d.returncode})`; + el('#nim-prog-phase').textContent = 'Failed'; + } else { + el('#nim-prog-title').textContent = 'Installed'; + el('#nim-prog-phase').textContent = 'Done ✓ — service will appear when the container reports healthy.'; + } + pollStatus(); +} + // ===================== Explain context (LLM commit summary) ===================== let explainEventSource = null; @@ -1149,6 +1303,10 @@ async function init() { el('#ub-apply').addEventListener('click', applyUpdate); el('#ub-explain').addEventListener('click', explainContext); el('#dl-repo').addEventListener('input', updateDlHfLink); + el('#open-nim').addEventListener('click', openNimDialog); + el('#nim-cancel').addEventListener('click', () => el('#nim-dialog').close()); + el('#nim-form').addEventListener('submit', submitNim); + el('#nim-prog-close').addEventListener('click', () => el('#nim-progress-dialog').close()); setupCatalogDialog(); setupAdvancedDialog(); // Open WebUI link from /api/config diff --git a/image/app/static/index.html b/image/app/static/index.html index 7b6db22..05a67c2 100644 --- a/image/app/static/index.html +++ b/image/app/static/index.html @@ -76,8 +76,66 @@
diff --git a/image/app/static/style.css b/image/app/static/style.css index bb442a8..a83919e 100644 --- a/image/app/static/style.css +++ b/image/app/static/style.css @@ -376,6 +376,7 @@ main { .hw-card .head .meta { color: var(--muted); font-size: 12px; margin-left: auto; } .hw-card.unreachable { border-color: rgba(239, 68, 68, 0.4); } .hw-card.unreachable .name { color: var(--error); } +.hw-card.unreachable ol { color: var(--muted); } .hw-metric { display: flex; align-items: center; gap: 10px; font-size: 12px; } .hw-metric .label { color: var(--muted); width: 56px; flex-shrink: 0; text-transform: uppercase; letter-spacing: 0.05em; font-size: 11px; } .hw-metric .bar { flex: 1; height: 8px; background: var(--surface-2); border-radius: 4px; overflow: hidden; position: relative; } @@ -477,6 +478,37 @@ main { #dl-log-details { margin-top: 12px; } #dl-log-details summary { cursor: pointer; padding: 4px 0; } +/* ===== NIM install dialog ===== */ + +.modal#nim-dialog, +.modal#nim-progress-dialog { max-width: 640px; } +.nim-grid { + display: grid; + gap: 8px; + grid-template-columns: 1fr; + max-height: 240px; + overflow-y: auto; + margin-bottom: 4px; +} +.nim-card { + background: var(--surface-2); + border: 1px solid var(--border); + border-radius: 6px; + padding: 10px 12px; + display: flex; + gap: 10px; + align-items: flex-start; +} +.nim-card .info { flex: 1; } +.nim-card .name { font-weight: 600; font-size: 13px; } +.nim-card .desc { color: var(--muted); font-size: 12px; margin-top: 4px; } +.nim-card .img { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; color: #6b6b75; font-size: 11px; margin-top: 4px; word-break: break-all; } +.nim-card .btn { padding: 6px 12px; font-size: 12px; flex-shrink: 0; } +.nim-card .links { font-size: 11px; margin-top: 4px; } +.nim-card .links a { color: var(--info); text-decoration: none; } +.nim-card .links a:hover { text-decoration: underline; } +.nim-key-warn { color: var(--warn); } + /* ===== Section titles ===== */ .section-title { diff --git a/package/startos/actions/configureSparks.ts b/package/startos/actions/configureSparks.ts index bb25f3e..6c780a2 100644 --- a/package/startos/actions/configureSparks.ts +++ b/package/startos/actions/configureSparks.ts @@ -85,6 +85,15 @@ const inputSpec = InputSpec.of({ placeholder: 'e.g. https://open-webui.yourserver.local', masked: false, }), + ngc_api_key: Value.text({ + name: 'NGC API key (optional)', + description: + 'NVIDIA NGC personal API key — needed to install NIM containers (Parakeet, Magpie, etc.) from nvcr.io. Get one free at https://ngc.nvidia.com/setup/personal-key. Stored only on this Start9 server; passed to docker as the NGC_API_KEY env var when installing NIM services.', + required: false, + default: null, + placeholder: 'starts with "nvapi-..."', + masked: true, + }), }) export const configureSparks = sdk.Action.withInput( diff --git a/package/startos/fileModels/sparkConfig.yaml.ts b/package/startos/fileModels/sparkConfig.yaml.ts index ee0d623..48a5260 100644 --- a/package/startos/fileModels/sparkConfig.yaml.ts +++ b/package/startos/fileModels/sparkConfig.yaml.ts @@ -16,6 +16,8 @@ export const sparkConfigSchema = z.object({ magpie_container: z.string().catch(''), // Optional Open WebUI deep-link open_webui_url: z.string().catch(''), + // Optional NGC API key for pulling NIM containers from nvcr.io/nim/... + ngc_api_key: z.string().catch(''), }) export type SparkConfig = z.infer diff --git a/package/startos/main.ts b/package/startos/main.ts index c621a6b..3c1e914 100644 --- a/package/startos/main.ts +++ b/package/startos/main.ts @@ -20,6 +20,7 @@ export const main = sdk.setupMain(async ({ effects }) => { magpie_user: '', magpie_container: '', open_webui_url: '', + ngc_api_key: '', } return sdk.Daemons.of(effects).addDaemon('primary', { @@ -48,7 +49,9 @@ export const main = sdk.setupMain(async ({ effects }) => { MAGPIE_USER: cfg.magpie_user, MAGPIE_CONTAINER: cfg.magpie_container, MODELS_OVERRIDES: '/data/models-overrides.yaml', + SERVICES_OVERRIDES: '/data/services-overrides.yaml', OPEN_WEBUI_URL: cfg.open_webui_url, + NGC_API_KEY: cfg.ngc_api_key, BIND_PORT: String(uiPort), }, }, diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts index bfa0d81..0ddbd98 100644 --- a/package/startos/versions/v0_1_0.ts +++ b/package/startos/versions/v0_1_0.ts @@ -1,10 +1,10 @@ import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk' export const v0_1_0 = VersionInfo.of({ - version: '0.3.0:1', + version: '0.4.0:0', releaseNotes: { en_US: - 'v0.3: Spark hardware dashboard (RAM, disk, GPU memory + utilization, CPU load, uptime per Spark). Per-model Advanced settings now show plain-English hints tied to your actual GPU memory (e.g. "0.85 GPU util leaves ~18 GB free"). "Explain context" button on the update banner asks the loaded LLM to summarize pending commits in plain English. Optional Open WebUI URL in Configure Sparks shows a one-click "Open chat" button in the top bar. Downloads can now target Spark 1, Spark 2, or both. Each model card links out to its Hugging Face page.', + 'v0.4: install NIM containers from the dashboard. New "+ Install NIM" button next to the services panel shows a curated catalog (Parakeet, Magpie, Riva...) plus a free-form image field. Streams docker pull + docker run output with phase + elapsed timer; persists installed services to /data/services-overrides.yaml so they show up in the services panel after install. Configure Sparks now has an NGC API key field (masked) needed for nvcr.io. v0.3.1 hotfix bundled in: hardware/services SSH timeouts shortened (6 s) and failures cached for 25 s so an unreachable Spark doesn\'t hang the whole dashboard. Hardware card for an unreachable Spark now includes troubleshooting steps.', }, migrations: { up: async ({ effects }) => {},