v0.4.0 - NIM installer + dashboard resilience
Hotfix (was v0.3.1):
- services.py: cache the 'unreachable' verdict per (host, user) for 25s so a dead Spark doesn't make every /api/services call wait out a 6s SSH timeout
- ssh_run timeout reduced 10 -> 6s for docker_state probes
- hardware probe: shorter SSH timeout (6s), longer cache TTL for failures (25s)
- JS pollStatus retries loadModels() if state.models is empty (recovers from cold-start proxy timeout)
- Unreachable hardware card now includes troubleshooting steps (Spark Control cannot SSH into an unreachable Spark to restart it)
v0.4 NIM installer:
- nim.py module: curated SUGGESTED_NIMS list (Parakeet, Magpie, Riva) + NimManager that runs docker login nvcr.io + docker pull + docker run -d --gpus all -p PORT:PORT -v VOL:/opt/nim/.cache -e NGC_API_KEY -e ... --restart=unless-stopped + chown the volume to uid 1000 + restart. Streams all output via SSE; redacts the API key from log lines.
- custom_services.py: persists installed NIMs to /data/services-overrides.yaml so they appear in the services panel after install
- services.py: merges custom services into the panel
- /api/nim/catalog GET, /api/nim/install POST + GET/SSE
- /api/services/{name} DELETE for custom services
- UI: '+ Install NIM' button next to 'Always-on services'; modal lists curated images each with a 'Pick' button + a custom-image form; installation runs in a second dialog with phase + elapsed timer + collapsible log
- NGC API key field added to Configure Sparks (masked); injected as NGC_API_KEY env var into the container
Package: bump 0.4.0:0; main.ts adds SERVICES_OVERRIDES + NGC_API_KEY env vars
This commit is contained in:
@@ -43,6 +43,7 @@ class Settings:
|
|||||||
magpie_port: int
|
magpie_port: int
|
||||||
bind_port: int
|
bind_port: int
|
||||||
open_webui_url: str
|
open_webui_url: str
|
||||||
|
ngc_api_key: str
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_env(cls) -> "Settings":
|
def from_env(cls) -> "Settings":
|
||||||
@@ -68,6 +69,7 @@ class Settings:
|
|||||||
magpie_port=int(_env("MAGPIE_PORT", "9000")),
|
magpie_port=int(_env("MAGPIE_PORT", "9000")),
|
||||||
bind_port=int(_env("BIND_PORT", "9999")),
|
bind_port=int(_env("BIND_PORT", "9999")),
|
||||||
open_webui_url=_env("OPEN_WEBUI_URL", ""),
|
open_webui_url=_env("OPEN_WEBUI_URL", ""),
|
||||||
|
ngc_api_key=_env("NGC_API_KEY", ""),
|
||||||
)
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -0,0 +1,59 @@
|
|||||||
|
"""User-installed services persist in /data/services-overrides.yaml.
|
||||||
|
|
||||||
|
Format:
|
||||||
|
custom:
|
||||||
|
- key: my-riva
|
||||||
|
kind: stt
|
||||||
|
host: <spark-2-ip>
|
||||||
|
user: <spark-user>
|
||||||
|
container: riva-asr
|
||||||
|
port: 8001
|
||||||
|
health_path: /health
|
||||||
|
image: nvcr.io/nim/nvidia/riva-multilingual:latest
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
|
def _path() -> str:
    """Return the location of the services overrides file.

    The ``SERVICES_OVERRIDES`` environment variable wins when it is set;
    otherwise the default ``/data/services-overrides.yaml`` is used.
    """
    default_location = "/data/services-overrides.yaml"
    return os.getenv("SERVICES_OVERRIDES", default_location)
|
||||||
|
|
||||||
|
|
||||||
|
def load_custom_services() -> list[dict]:
    """Return the user-installed service entries from the overrides file.

    Reads the YAML document at ``_path()`` and returns the mapping items of
    its top-level ``custom`` list.

    Returns:
        A list of dict entries. A missing file, an empty document, a document
        whose root is not a mapping, or a ``custom`` value that is not a list
        all yield ``[]``. Items of ``custom`` that are not mappings are left
        out, so callers can call ``.get()`` on every returned entry.

    Errors raised by ``yaml.safe_load`` for unparsable content are not caught
    here and propagate to the caller.
    """
    try:
        with open(_path(), encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        return []
    # A hand-edited file may hold a list or scalar at the root; treat that as
    # "no custom services" instead of failing on ``.get``.
    if not isinstance(data, dict):
        return []
    custom = data.get("custom")
    if not isinstance(custom, list):
        return []
    return [entry for entry in custom if isinstance(entry, dict)]
|
||||||
|
|
||||||
|
|
||||||
|
def add_custom_service(entry: dict) -> None:
    """Insert or replace one custom service entry in the overrides file.

    Any existing entry with the same ``key`` is replaced; the new entry is
    appended at the end of the ``custom`` list. Other top-level keys already
    in the document are written back unchanged.

    Args:
        entry: Service description; must contain a ``"key"`` item.

    Raises:
        KeyError: If ``entry`` has no ``"key"``.
    """
    p = _path()
    Path(p).parent.mkdir(parents=True, exist_ok=True)
    try:
        with open(p, encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        data = None
    # An empty file or a non-mapping root is treated as an empty document.
    if not isinstance(data, dict):
        data = {}
    existing = data.get("custom")
    if not isinstance(existing, list):
        existing = []

    key = entry["key"]
    # Non-mapping items cannot carry a key, so they are kept as they are.
    custom = [
        c for c in existing
        if not (isinstance(c, dict) and c.get("key") == key)
    ]
    custom.append(entry)
    data["custom"] = custom

    # Write to a sibling temp file and rename it over the target so a crash
    # mid-write cannot leave a truncated overrides file behind.
    tmp = f"{p}.tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        yaml.safe_dump(data, f, sort_keys=False)
    os.replace(tmp, p)
|
||||||
|
|
||||||
|
|
||||||
|
def delete_custom_service(key: str) -> None:
    """Remove the custom service entry with this ``key`` from the overrides file.

    Does nothing when the file is missing, when its root is not a mapping,
    or when no entry carries ``key`` (the file is then left untouched).

    Args:
        key: The ``key`` value of the entry to remove.
    """
    p = _path()
    try:
        with open(p, encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        return
    if not isinstance(data, dict):
        return
    existing = data.get("custom")
    if not isinstance(existing, list):
        return

    remaining = [
        c for c in existing
        if not (isinstance(c, dict) and c.get("key") == key)
    ]
    if len(remaining) == len(existing):
        # Nothing matched; skip the rewrite.
        return
    data["custom"] = remaining

    # Write to a sibling temp file and rename it over the target so a crash
    # mid-write cannot leave a truncated overrides file behind.
    tmp = f"{p}.tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        yaml.safe_dump(data, f, sort_keys=False)
    os.replace(tmp, p)
|
||||||
+14
-4
@@ -84,12 +84,16 @@ def _parse(out: str) -> dict:
|
|||||||
class HardwareProbe:
|
class HardwareProbe:
|
||||||
"""Caches results briefly to avoid hammering the Sparks."""
|
"""Caches results briefly to avoid hammering the Sparks."""
|
||||||
|
|
||||||
def __init__(self, settings: Settings, ttl_sec: float = 4.0) -> None:
|
def __init__(self, settings: Settings, ttl_sec: float = 4.0, fail_ttl_sec: float = 25.0) -> None:
|
||||||
self.settings = settings
|
self.settings = settings
|
||||||
self.ttl_sec = ttl_sec
|
self.ttl_sec = ttl_sec
|
||||||
|
self.fail_ttl_sec = fail_ttl_sec
|
||||||
self._cache: dict[str, tuple[float, dict]] = {}
|
self._cache: dict[str, tuple[float, dict]] = {}
|
||||||
self._locks: dict[str, asyncio.Lock] = {}
|
self._locks: dict[str, asyncio.Lock] = {}
|
||||||
|
|
||||||
|
def _ttl_for(self, value: dict) -> float:
|
||||||
|
return self.ttl_sec if value.get("reachable") else self.fail_ttl_sec
|
||||||
|
|
||||||
def _lock(self, key: str) -> asyncio.Lock:
|
def _lock(self, key: str) -> asyncio.Lock:
|
||||||
if key not in self._locks:
|
if key not in self._locks:
|
||||||
self._locks[key] = asyncio.Lock()
|
self._locks[key] = asyncio.Lock()
|
||||||
@@ -108,12 +112,18 @@ class HardwareProbe:
|
|||||||
async with self._lock(key):
|
async with self._lock(key):
|
||||||
now = time.monotonic()
|
now = time.monotonic()
|
||||||
cached = self._cache.get(key)
|
cached = self._cache.get(key)
|
||||||
if cached and (now - cached[0] < self.ttl_sec):
|
if cached and (now - cached[0] < self._ttl_for(cached[1])):
|
||||||
return cached[1]
|
return cached[1]
|
||||||
rc, out, err = await ssh_run(host, user, _PROBE, self.settings, timeout=12)
|
# Use a shorter timeout for the connect phase; if a previous probe
|
||||||
|
# marked this host unreachable, return the cached failure immediately.
|
||||||
|
rc, out, err = await ssh_run(host, user, _PROBE, self.settings, timeout=6)
|
||||||
if rc != 0:
|
if rc != 0:
|
||||||
|
# Cache failures for a slightly longer TTL so the dashboard isn't
|
||||||
|
# blocked behind 6 s of SSH timeout on every poll.
|
||||||
result = {"reachable": False, "configured": True, "host": host, "error": err.strip() or out.strip() or f"rc={rc}"}
|
result = {"reachable": False, "configured": True, "host": host, "error": err.strip() or out.strip() or f"rc={rc}"}
|
||||||
else:
|
self._cache[key] = (now, result)
|
||||||
|
# The longer failure TTL is applied when the entry is read back (see _ttl_for); the entry itself stores the plain probe timestamp.
|
||||||
|
return result
|
||||||
result = {"reachable": True, "configured": True, "host": host, **_parse(out)}
|
result = {"reachable": True, "configured": True, "host": host, **_parse(out)}
|
||||||
self._cache[key] = (now, result)
|
self._cache[key] = (now, result)
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -0,0 +1,202 @@
|
|||||||
|
"""NVIDIA NIM container install / lifecycle.
|
||||||
|
|
||||||
|
Two pieces:
|
||||||
|
* A small curated catalog of NIM images (so users don't have to copy/paste
|
||||||
|
huge nvcr.io URLs).
|
||||||
|
* An installer that SSHes into the target Spark, runs `docker pull` then
|
||||||
|
`docker run -d --gpus all -p PORT:PORT -v VOLUME:/opt/nim/.cache
|
||||||
|
-e NGC_API_KEY=... IMAGE` and streams output.
|
||||||
|
|
||||||
|
Custom services also persist via `custom_services.add_custom_service()` so the
|
||||||
|
Services panel can show them.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
|
import uuid
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from .config import Settings
|
||||||
|
from .ssh import ssh_stream, StreamHandle
|
||||||
|
|
||||||
|
|
||||||
|
# Curated list. These are the most useful NIM containers for a dual-Spark
|
||||||
|
# audio-and-LLM setup. Browse the full catalog at
|
||||||
|
# https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia
|
||||||
|
CATALOG_URL = "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers"
|
||||||
|
|
||||||
|
|
||||||
|
SUGGESTED_NIMS: list[dict] = [
|
||||||
|
{
|
||||||
|
"key": "parakeet-tdt-0.6b-v3",
|
||||||
|
"name": "Parakeet TDT 0.6B v3",
|
||||||
|
"image": "nvcr.io/nim/nvidia/parakeet-tdt-0-6b-v3:latest",
|
||||||
|
"default_container": "parakeet-asr",
|
||||||
|
"default_port": 8000,
|
||||||
|
"kind": "stt",
|
||||||
|
"description": "Streaming speech-to-text (English). Used by Open WebUI for voice input. ~1 GB.",
|
||||||
|
"homepage": "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/parakeet-tdt-0-6b-v3",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"key": "magpie-tts-multilingual",
|
||||||
|
"name": "Magpie TTS Multilingual",
|
||||||
|
"image": "nvcr.io/nim/nvidia/magpie-tts-multilingual:latest",
|
||||||
|
"default_container": "magpie-tts",
|
||||||
|
"default_port": 9000,
|
||||||
|
"kind": "tts",
|
||||||
|
"description": "Multilingual text-to-speech. Counterpart to Parakeet for 'read aloud'. ~3 GB.",
|
||||||
|
"homepage": "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/magpie-tts-multilingual",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"key": "riva-multilingual",
|
||||||
|
"name": "Riva Multilingual ASR",
|
||||||
|
"image": "nvcr.io/nim/nvidia/riva-multilingual:latest",
|
||||||
|
"default_container": "riva-asr",
|
||||||
|
"default_port": 8001,
|
||||||
|
"kind": "stt",
|
||||||
|
"description": "NVIDIA Riva speech-recognition multi-language model. Larger and more accurate than Parakeet.",
|
||||||
|
"homepage": "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class NimInstallJob:
    """State of a single NIM install run, including its captured log lines."""

    id: str
    image: str
    container: str
    port: int
    host: str
    user: str
    volume: Optional[str]
    started_at: str
    state: str = "starting"  # starting | pulling | running | done | failed
    phase: str = "Starting…"
    lines: list[str] = field(default_factory=list)
    returncode: Optional[int] = None
    finished_at: Optional[str] = None

    def append(self, line: str) -> None:
        """Record one log line, keeping only the newest 1000 lines."""
        log = self.lines
        log.append(line)
        overflow = len(log) - 1000
        if overflow > 0:
            # Drop the oldest entries in place so the list object is reused.
            del log[:overflow]
|
||||||
|
|
||||||
|
|
||||||
|
class NimManager:
|
||||||
|
def __init__(self, settings: Settings) -> None:
|
||||||
|
self.settings = settings
|
||||||
|
self.lock = asyncio.Lock()
|
||||||
|
self.jobs: dict[str, NimInstallJob] = {}
|
||||||
|
self.current_job_id: Optional[str] = None
|
||||||
|
|
||||||
|
def get(self, job_id: str) -> NimInstallJob | None:
|
||||||
|
return self.jobs.get(job_id)
|
||||||
|
|
||||||
|
async def trigger(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
image: str,
|
||||||
|
container: str,
|
||||||
|
port: int,
|
||||||
|
host: str,
|
||||||
|
user: str,
|
||||||
|
volume: str | None = None,
|
||||||
|
extra_env: dict[str, str] | None = None,
|
||||||
|
) -> NimInstallJob:
|
||||||
|
if self.lock.locked():
|
||||||
|
raise RuntimeError("Another NIM install is already in progress")
|
||||||
|
if not host or not user:
|
||||||
|
raise RuntimeError("target host not configured")
|
||||||
|
if not self.settings.ngc_api_key:
|
||||||
|
raise RuntimeError(
|
||||||
|
"NGC_API_KEY is not set. Open Configure Sparks in StartOS and paste your NGC personal API key (free at https://ngc.nvidia.com/setup/personal-key)."
|
||||||
|
)
|
||||||
|
|
||||||
|
job = NimInstallJob(
|
||||||
|
id=uuid.uuid4().hex[:8],
|
||||||
|
image=image,
|
||||||
|
container=container,
|
||||||
|
port=port,
|
||||||
|
host=host,
|
||||||
|
user=user,
|
||||||
|
volume=volume or f"{container}-cache",
|
||||||
|
started_at=datetime.now(timezone.utc).isoformat(),
|
||||||
|
)
|
||||||
|
self.jobs[job.id] = job
|
||||||
|
self.current_job_id = job.id
|
||||||
|
asyncio.create_task(self._run(job, extra_env or {}))
|
||||||
|
return job
|
||||||
|
|
||||||
|
async def _run(self, job: NimInstallJob, extra_env: dict[str, str]) -> None:
|
||||||
|
async with self.lock:
|
||||||
|
try:
|
||||||
|
await self._do(job, extra_env)
|
||||||
|
if job.state != "failed":
|
||||||
|
job.state = "done"
|
||||||
|
job.returncode = 0
|
||||||
|
job.phase = "Done"
|
||||||
|
except Exception as e:
|
||||||
|
job.append(f"[error] {type(e).__name__}: {e}")
|
||||||
|
job.state = "failed"
|
||||||
|
if job.returncode is None:
|
||||||
|
job.returncode = 1
|
||||||
|
finally:
|
||||||
|
job.finished_at = datetime.now(timezone.utc).isoformat()
|
||||||
|
if self.current_job_id == job.id:
|
||||||
|
self.current_job_id = None
|
||||||
|
|
||||||
|
async def _do(self, job: NimInstallJob, extra_env: dict[str, str]) -> None:
|
||||||
|
# Build the bash one-liner. We use docker login non-interactively with the NGC API key.
|
||||||
|
env_parts = [f'-e NGC_API_KEY=$NGC_API_KEY']
|
||||||
|
for k, v in extra_env.items():
|
||||||
|
env_parts.append(f"-e {k}={v}")
|
||||||
|
env_str = " ".join(env_parts)
|
||||||
|
cmd = (
|
||||||
|
f"set -e; "
|
||||||
|
f"export NGC_API_KEY='{self.settings.ngc_api_key}'; "
|
||||||
|
f"echo '=== docker login nvcr.io ==='; "
|
||||||
|
f"echo \"$NGC_API_KEY\" | docker login nvcr.io -u '$oauthtoken' --password-stdin; "
|
||||||
|
f"echo '=== docker pull {job.image} (this can be 1-10 GB) ==='; "
|
||||||
|
f"docker pull {job.image}; "
|
||||||
|
f"echo '=== remove any prior container with the same name ==='; "
|
||||||
|
f"docker rm -f {job.container} 2>/dev/null || true; "
|
||||||
|
f"echo '=== docker run -d --gpus all -p {job.port}:{job.port} -v {job.volume}:/opt/nim/.cache {env_str} --name {job.container} --restart unless-stopped {job.image} ==='; "
|
||||||
|
f"docker run -d --gpus all "
|
||||||
|
f"-p {job.port}:{job.port} "
|
||||||
|
f"-v {job.volume}:/opt/nim/.cache "
|
||||||
|
f"{env_str} "
|
||||||
|
f"--name {job.container} "
|
||||||
|
f"--restart unless-stopped "
|
||||||
|
f"{job.image}; "
|
||||||
|
f"echo '=== ensuring cache volume is writable by uid 1000 (riva-server) ==='; "
|
||||||
|
f"docker run --rm -v {job.volume}:/cache alpine chown -R 1000:1000 /cache && "
|
||||||
|
f"docker restart {job.container}; "
|
||||||
|
f"echo '=== install complete; container is starting up and will download its model on first boot ==='"
|
||||||
|
)
|
||||||
|
job.append(f"$ <install command for {job.image} on {job.host}>")
|
||||||
|
job.state = "pulling"
|
||||||
|
job.phase = "Pulling image from nvcr.io (this can take a few minutes)…"
|
||||||
|
|
||||||
|
handle = StreamHandle()
|
||||||
|
async for line in ssh_stream(job.host, job.user, cmd, self.settings, handle=handle):
|
||||||
|
# Don't log lines containing the api key
|
||||||
|
if self.settings.ngc_api_key and self.settings.ngc_api_key in line:
|
||||||
|
continue
|
||||||
|
job.append(line)
|
||||||
|
if "docker pull" in line:
|
||||||
|
job.phase = "Pulling image from nvcr.io…"
|
||||||
|
elif "Login Succeeded" in line:
|
||||||
|
job.phase = "Logged in to NGC; pulling image…"
|
||||||
|
elif "Pull complete" in line:
|
||||||
|
job.phase = "Pulling layers…"
|
||||||
|
elif "Status: Downloaded newer image" in line or "Image is up to date" in line:
|
||||||
|
job.phase = "Image ready; starting container…"
|
||||||
|
elif "docker run -d" in line:
|
||||||
|
job.state = "running"
|
||||||
|
job.phase = "Container starting; downloading model on first boot…"
|
||||||
|
|
||||||
|
rc = handle.returncode or 0
|
||||||
|
if rc != 0:
|
||||||
|
job.state = "failed"
|
||||||
|
job.returncode = rc
|
||||||
@@ -10,10 +10,12 @@ from pydantic import BaseModel
|
|||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
||||||
from .config import Settings
|
from .config import Settings
|
||||||
|
from .custom_services import add_custom_service, delete_custom_service
|
||||||
from .download import DownloadManager
|
from .download import DownloadManager
|
||||||
from .hardware import HardwareProbe
|
from .hardware import HardwareProbe
|
||||||
from .health import check_magpie, check_parakeet, check_vllm
|
from .health import check_magpie, check_parakeet, check_vllm
|
||||||
from .models import load_catalog
|
from .models import load_catalog
|
||||||
|
from .nim import SUGGESTED_NIMS, CATALOG_URL, NimManager
|
||||||
from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs
|
from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs
|
||||||
from .services import docker_state, run_action, services_from_settings
|
from .services import docker_state, run_action, services_from_settings
|
||||||
from .ssh import ssh_run
|
from .ssh import ssh_run
|
||||||
@@ -27,6 +29,7 @@ swap_manager = SwapManager(settings, catalog)
|
|||||||
download_manager = DownloadManager(settings)
|
download_manager = DownloadManager(settings)
|
||||||
update_manager = UpdateManager(settings)
|
update_manager = UpdateManager(settings)
|
||||||
hardware_probe = HardwareProbe(settings)
|
hardware_probe = HardwareProbe(settings)
|
||||||
|
nim_manager = NimManager(settings)
|
||||||
|
|
||||||
app = FastAPI(title="spark-control", version="0.1.0")
|
app = FastAPI(title="spark-control", version="0.1.0")
|
||||||
|
|
||||||
@@ -170,6 +173,108 @@ async def get_services() -> dict:
|
|||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/nim/catalog")
async def get_nim_catalog() -> dict:
    """Return the curated NIM list, the catalog link and whether an NGC key is set."""
    has_key = bool(settings.ngc_api_key)
    return dict(
        catalog_url=CATALOG_URL,
        ngc_key_configured=has_key,
        suggested=SUGGESTED_NIMS,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class NimInstallBody(BaseModel):
    """JSON body accepted by ``POST /api/nim/install``."""

    # Image reference handed to the installer.
    image: str
    # Container name; also used as the service key when the install is registered.
    container: str
    # Port passed to the installer and stored with the registered service.
    port: int
    # Which configured Spark receives the install.
    host: Literal["spark1", "spark2"] = "spark2"
    # Service kind stored on registration; an empty value is stored as "nim".
    kind: str = ""
    # When true, the handler also writes the service to the custom services overrides.
    register: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/nim/install")
async def post_nim_install(body: NimInstallBody) -> dict:
    """Start a NIM install on the selected Spark and optionally register it.

    Responds with the new job's id, image, container and state. A
    ``RuntimeError`` from the manager becomes HTTP 409 when its message
    contains "in progress" and HTTP 400 otherwise.
    """
    if body.host == "spark1":
        target_host = settings.spark1_host
        target_user = settings.spark1_user
    else:
        target_host = settings.spark2_host
        target_user = settings.spark2_user

    try:
        job = await nim_manager.trigger(
            image=body.image,
            container=body.container,
            port=body.port,
            host=target_host,
            user=target_user,
        )
    except RuntimeError as exc:
        message = str(exc)
        status = 409 if "in progress" in message else 400
        raise HTTPException(status, message)

    if body.register:
        # Persist in custom services so the panel lists the new container.
        # NOTE(review): this runs as soon as the job is started, before the
        # install outcome is known — a failed install stays registered.
        service_entry = {
            "key": body.container,
            "kind": body.kind or "nim",
            "host": target_host,
            "user": target_user,
            "container": body.container,
            "port": body.port,
            "image": body.image,
        }
        add_custom_service(service_entry)

    return {
        "job_id": job.id,
        "image": job.image,
        "container": job.container,
        "state": job.state,
    }
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/nim/install/{job_id}")
async def get_nim_install(job_id: str) -> dict:
    """Return a JSON snapshot of one install job; 404 when the id is unknown."""
    job = nim_manager.get(job_id)
    if job is None:
        raise HTTPException(404, "no such job")
    # Job attributes exposed to the client, in response order.
    exposed = (
        "id",
        "image",
        "container",
        "port",
        "host",
        "state",
        "phase",
        "started_at",
        "finished_at",
        "returncode",
        "lines",
    )
    return {name: getattr(job, name) for name in exposed}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/nim/install/{job_id}/stream")
async def stream_nim_install(job_id: str):
    """Stream an install job's log lines and phase changes as Server-Sent Events.

    Emits default ``data:`` events carrying ``{"line": ...}``, ``phase`` events
    carrying ``{"state", "phase"}`` whenever the phase text changes, and one
    final ``done`` event carrying ``{"state", "returncode"}`` once the job has
    a return code and every line seen so far has been sent. The job is polled
    every 0.5 s. Responds 404 when the job id is unknown.
    """
    job = nim_manager.get(job_id)
    if job is None:
        raise HTTPException(404, "no such job")

    def _sse(payload, event=None):
        # One SSE frame; the "event:" header is omitted for default events.
        header = f"event: {event}\n" if event else ""
        return f"{header}data: {json.dumps(payload)}\n\n"

    async def gen():
        sent = 0
        last_phase = None
        while True:
            # NOTE(review): NimInstallJob.append trims the log to its newest
            # 1000 lines, after which len(job.lines) stops growing and this
            # index cursor no longer sees new lines. Fixing that needs a
            # running line counter on the job.
            batch = job.lines[sent:]
            for line in batch:
                yield _sse({"line": line})
            sent += len(batch)

            # Snapshot before yielding: the job can move on while this
            # generator is suspended, and remembering a phase that was never
            # emitted would hide that change from the client.
            state, phase = job.state, job.phase
            if phase != last_phase:
                last_phase = phase
                yield _sse({"state": state, "phase": phase}, "phase")

            if job.returncode is not None and sent >= len(job.lines):
                yield _sse({"state": job.state, "returncode": job.returncode}, "done")
                return
            await asyncio.sleep(0.5)

    return StreamingResponse(gen(), media_type="text/event-stream")
|
||||||
|
|
||||||
|
|
||||||
|
@app.delete("/api/services/{name}")
async def del_service(name: str) -> dict:
    """Remove a custom service from the overrides file.

    The bundled ``parakeet`` and ``magpie`` keys are refused with HTTP 400;
    any other name is passed to ``delete_custom_service``.
    """
    is_builtin = name == "parakeet" or name == "magpie"
    if is_builtin:
        raise HTTPException(400, "built-in service; cannot delete (use Configure Sparks to point at a different host)")
    delete_custom_service(name)
    return dict(ok=True, name=name)
|
||||||
|
|
||||||
|
|
||||||
@app.post("/api/services/{name}/{action}")
|
@app.post("/api/services/{name}/{action}")
|
||||||
async def service_action(name: str, action: str) -> dict:
|
async def service_action(name: str, action: str) -> dict:
|
||||||
services = services_from_settings(settings)
|
services = services_from_settings(settings)
|
||||||
|
|||||||
+42
-2
@@ -5,6 +5,7 @@ machinery. We just run `docker start|stop|restart <container>` via SSH on the
|
|||||||
appropriate host.
|
appropriate host.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import time
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Literal, Optional
|
from typing import Literal, Optional
|
||||||
|
|
||||||
@@ -12,6 +13,25 @@ from .config import Settings
|
|||||||
from .ssh import ssh_run
|
from .ssh import ssh_run
|
||||||
|
|
||||||
|
|
||||||
|
# Cache the "unreachable" verdict per (host, user) for a short period so that a
|
||||||
|
# repeated docker_state call doesn't re-pay the 6 s SSH connect timeout each time.
|
||||||
|
_UNREACHABLE_TTL = 25.0
|
||||||
|
_unreachable_cache: dict[tuple[str, str], float] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _is_recently_unreachable(host: str, user: str) -> bool:
    """Tell whether ``(host, user)`` was marked unreachable within the TTL.

    Returns ``True`` only when a mark exists and is younger than
    ``_UNREACHABLE_TTL`` seconds on the monotonic clock.
    """
    ts = _unreachable_cache.get((host, user))
    # Compare against None explicitly: a stored timestamp of 0.0 is a valid
    # monotonic reading and must not be mistaken for "no entry".
    if ts is None:
        return False
    return time.monotonic() - ts < _UNREACHABLE_TTL
|
||||||
|
|
||||||
|
|
||||||
|
def _mark_unreachable(host: str, user: str) -> None:
    """Stamp ``(host, user)`` as unreachable at the current monotonic time."""
    cache_key = (host, user)
    stamped_at = time.monotonic()
    _unreachable_cache[cache_key] = stamped_at
|
||||||
|
|
||||||
|
|
||||||
|
def _clear_unreachable(host: str, user: str) -> None:
    """Forget any unreachable mark for ``(host, user)``; a no-op when none exists."""
    cache_key = (host, user)
    if cache_key in _unreachable_cache:
        del _unreachable_cache[cache_key]
|
||||||
|
|
||||||
|
|
||||||
ServiceName = Literal["parakeet", "magpie"]
|
ServiceName = Literal["parakeet", "magpie"]
|
||||||
ServiceAction = Literal["start", "stop", "restart"]
|
ServiceAction = Literal["start", "stop", "restart"]
|
||||||
|
|
||||||
@@ -27,7 +47,8 @@ class ServiceDef:
|
|||||||
|
|
||||||
|
|
||||||
def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
|
def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
|
||||||
return {
|
from .custom_services import load_custom_services
|
||||||
|
out: dict[str, ServiceDef] = {
|
||||||
"parakeet": ServiceDef(
|
"parakeet": ServiceDef(
|
||||||
name="parakeet",
|
name="parakeet",
|
||||||
kind="stt",
|
kind="stt",
|
||||||
@@ -45,19 +66,38 @@ def services_from_settings(s: Settings) -> dict[str, ServiceDef]:
|
|||||||
port=s.magpie_port,
|
port=s.magpie_port,
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
for entry in load_custom_services():
|
||||||
|
key = entry.get("key")
|
||||||
|
if not key or key in out:
|
||||||
|
continue
|
||||||
|
out[key] = ServiceDef(
|
||||||
|
name=key,
|
||||||
|
kind=entry.get("kind", ""),
|
||||||
|
host=entry.get("host", ""),
|
||||||
|
user=entry.get("user", ""),
|
||||||
|
container=entry.get("container", key),
|
||||||
|
port=int(entry.get("port", 0)),
|
||||||
|
)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
async def docker_state(settings: Settings, svc: ServiceDef) -> dict:
|
async def docker_state(settings: Settings, svc: ServiceDef) -> dict:
|
||||||
"""Get docker state (running, exited, restarting, etc.) + restart count."""
|
"""Get docker state (running, exited, restarting, etc.) + restart count."""
|
||||||
if not svc.host or not svc.user:
|
if not svc.host or not svc.user:
|
||||||
return {"state": "unconfigured", "restart_count": None, "uptime": None}
|
return {"state": "unconfigured", "restart_count": None, "uptime": None}
|
||||||
|
if _is_recently_unreachable(svc.host, svc.user):
|
||||||
|
return {"state": "unreachable", "host_unreachable": True, "restart_count": None, "uptime": None}
|
||||||
cmd = (
|
cmd = (
|
||||||
f"docker inspect {svc.container} "
|
f"docker inspect {svc.container} "
|
||||||
f"--format '{{{{.State.Status}}}}|{{{{.State.StartedAt}}}}|{{{{.RestartCount}}}}|{{{{.State.ExitCode}}}}|{{{{.State.Error}}}}' "
|
f"--format '{{{{.State.Status}}}}|{{{{.State.StartedAt}}}}|{{{{.RestartCount}}}}|{{{{.State.ExitCode}}}}|{{{{.State.Error}}}}' "
|
||||||
f"2>&1 || echo 'NOT_FOUND'"
|
f"2>&1 || echo 'NOT_FOUND'"
|
||||||
)
|
)
|
||||||
rc, out, _ = await ssh_run(svc.host, svc.user, cmd, settings, timeout=10)
|
rc, out, _ = await ssh_run(svc.host, svc.user, cmd, settings, timeout=6)
|
||||||
out = out.strip()
|
out = out.strip()
|
||||||
|
if rc == 124 or "timeout after" in out.lower():
|
||||||
|
_mark_unreachable(svc.host, svc.user)
|
||||||
|
return {"state": "unreachable", "host_unreachable": True, "restart_count": None, "uptime": None}
|
||||||
|
_clear_unreachable(svc.host, svc.user)
|
||||||
if rc != 0 or out.startswith("NOT_FOUND") or "Error" in out and "no such object" in out.lower():
|
if rc != 0 or out.startswith("NOT_FOUND") or "Error" in out and "no such object" in out.lower():
|
||||||
return {"state": "missing", "restart_count": None, "uptime": None, "raw": out}
|
return {"state": "missing", "restart_count": None, "uptime": None, "raw": out}
|
||||||
parts = out.split("|")
|
parts = out.split("|")
|
||||||
|
|||||||
@@ -144,6 +144,15 @@ function renderHardware() {
|
|||||||
<span class="meta">unreachable</span>
|
<span class="meta">unreachable</span>
|
||||||
</div>
|
</div>
|
||||||
<div class="muted small">${escapeHtml(s.host || '')} — ${escapeHtml(s.error || 'no response')}</div>
|
<div class="muted small">${escapeHtml(s.host || '')} — ${escapeHtml(s.error || 'no response')}</div>
|
||||||
|
<div class="muted small" style="line-height:1.5">
|
||||||
|
Spark Control can't restart a Spark that won't answer SSH. Steps to try:
|
||||||
|
<ol style="margin: 6px 0 0 18px; padding: 0;">
|
||||||
|
<li>Verify it's powered on (check the front LED).</li>
|
||||||
|
<li>Ping it from another LAN device.</li>
|
||||||
|
<li>Power-cycle it physically.</li>
|
||||||
|
<li>If it boots, this card will go green again automatically.</li>
|
||||||
|
</ol>
|
||||||
|
</div>
|
||||||
`;
|
`;
|
||||||
grid.appendChild(card);
|
grid.appendChild(card);
|
||||||
continue;
|
continue;
|
||||||
@@ -510,6 +519,10 @@ async function pollStatus() {
|
|||||||
renderCurrent(status);
|
renderCurrent(status);
|
||||||
renderEndpoint(status);
|
renderEndpoint(status);
|
||||||
renderHealth(status);
|
renderHealth(status);
|
||||||
|
// If models hasn't loaded yet (init may have hit a transient proxy timeout), retry.
|
||||||
|
if (!state.models || Object.keys(state.models).length === 0) {
|
||||||
|
try { await loadModels(); } catch {}
|
||||||
|
}
|
||||||
// Refresh services state lazily — every 5s poll triggers this too.
|
// Refresh services state lazily — every 5s poll triggers this too.
|
||||||
try {
|
try {
|
||||||
state.services = await fetchJSON('/api/services');
|
state.services = await fetchJSON('/api/services');
|
||||||
@@ -953,6 +966,147 @@ function setupAdvancedDialog() {
|
|||||||
el('#adv-gmu').addEventListener('input', (e) => { el('#adv-gmu-out').value = parseFloat(e.target.value).toFixed(2); });
|
el('#adv-gmu').addEventListener('input', (e) => { el('#adv-gmu-out').value = parseFloat(e.target.value).toFixed(2); });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ===================== NIM installer =====================
|
||||||
|
|
||||||
|
const nimState = {
|
||||||
|
catalog: null,
|
||||||
|
job_id: null,
|
||||||
|
eventsource: null,
|
||||||
|
timer: null,
|
||||||
|
started_at: null,
|
||||||
|
};
|
||||||
|
|
||||||
|
async function loadNimCatalog() {
  // Fetch /api/nim/catalog and rebuild the install dialog from it: the catalog
  // link, the missing-key warning, and one card per suggested image whose
  // "Pick" button copies that image's defaults into the install form.
  // Failures are logged to the console and otherwise ignored.
  try {
    const catalog = await fetchJSON('/api/nim/catalog');
    nimState.catalog = catalog;
    el('#nim-catalog-link').href = catalog.catalog_url;

    const warn = el('#nim-key-warn');
    if (catalog.ngc_key_configured) {
      warn.classList.remove('nim-key-warn');
      warn.textContent = '';
    } else {
      warn.classList.add('nim-key-warn');
      warn.innerHTML = '⚠️ NGC API key not set. Open <strong>Configure Sparks</strong> in StartOS and paste your NGC personal API key, otherwise installs will fail. <a href="https://ngc.nvidia.com/setup/personal-key" target="_blank" rel="noopener">Get a key</a>';
    }

    const grid = el('#nim-suggested');
    grid.innerHTML = '';
    for (const s of catalog.suggested || []) {
      const card = document.createElement('div');
      card.className = 'nim-card';
      card.innerHTML = `
<div class="info">
<div class="name">${escapeHtml(s.name)} <span class="muted small">· ${escapeHtml(s.kind || 'nim')}</span></div>
<div class="desc">${escapeHtml(s.description || '')}</div>
<div class="img">${escapeHtml(s.image)}</div>
<div class="links">${s.homepage ? `<a href="${escapeHtml(s.homepage)}" target="_blank" rel="noopener">View on NGC ↗</a>` : ''}</div>
</div>
<button type="button" class="btn primary nim-pick" data-image="${escapeHtml(s.image)}" data-container="${escapeHtml(s.default_container)}" data-port="${s.default_port}" data-kind="${escapeHtml(s.kind)}">Pick</button>
`;
      grid.appendChild(card);
    }

    // Wire every "Pick" button to prefill the custom-image form.
    for (const btn of grid.querySelectorAll('.nim-pick')) {
      btn.addEventListener('click', () => {
        const picked = btn.dataset;
        el('#nim-image').value = picked.image;
        el('#nim-container').value = picked.container;
        el('#nim-port').value = picked.port;
        el('#nim-kind').value = picked.kind || 'nim';
      });
    }
  } catch (e) {
    console.warn('nim catalog failed', e);
  }
}
|
||||||
|
|
||||||
|
function openNimDialog() {
  // Start refreshing the catalog (not awaited) and show the dialog right away.
  loadNimCatalog();
  const dialog = el('#nim-dialog');
  dialog.showModal();
}
|
||||||
|
|
||||||
|
async function submitNim(e) {
  // Validate the install form, POST it to /api/nim/install, then close the
  // picker dialog and hand the returned job id to the progress dialog.
  e.preventDefault();
  const valueOf = (selector) => el(selector).value;
  const body = {
    image: valueOf('#nim-image').trim(),
    container: valueOf('#nim-container').trim(),
    port: parseInt(valueOf('#nim-port'), 10),
    host: valueOf('#nim-host'),
    kind: valueOf('#nim-kind'),
  };
  // A NaN or 0 port is falsy, so it is rejected together with empty text fields.
  const complete = body.image && body.container && body.port;
  if (!complete) {
    alert('Image, container name, and port are required.');
    return;
  }
  try {
    const started = await fetchJSON('/api/nim/install', {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify(body),
    });
    el('#nim-dialog').close();
    attachNimProgress(started.job_id);
  } catch (err) {
    alert('Install failed: ' + err.message);
  }
}
|
||||||
|
|
||||||
|
/**
 * (Re)start the elapsed-time display of the install progress dialog.
 *
 * Stores the start time in nimState.started_at, replaces any interval that
 * is already running, renders once immediately and then every 500 ms into
 * #nim-prog-elapsed as "m:ss".
 *
 * @param {number} at  Start time in epoch milliseconds. A non-finite or
 *                     non-positive value (e.g. NaN from Date.parse on a
 *                     missing timestamp) falls back to the current time so
 *                     the display never stays frozen.
 */
function nimTimerStart(at) {
  nimState.started_at = Number.isFinite(at) && at > 0 ? at : Date.now();

  if (nimState.timer) clearInterval(nimState.timer);

  const tick = () => {
    if (!nimState.started_at) return;
    // Clamp at zero in case the start time is ahead of the local clock.
    const sec = Math.max(0, Math.floor((Date.now() - nimState.started_at) / 1000));
    const m = Math.floor(sec / 60);
    const s = sec % 60;
    el('#nim-prog-elapsed').textContent = `${m}:${s.toString().padStart(2, '0')}`;
  };

  tick();
  nimState.timer = setInterval(tick, 500);
}
|
||||||
|
|
||||||
|
/**
 * Open the progress dialog for an install job and follow it until it ends.
 *
 * First fetches a snapshot from /api/nim/install/{jobId} to seed the timer,
 * phase and log. If the snapshot already carries a return code the job is
 * finished and onNimDone() is called directly. Otherwise an EventSource on
 * /api/nim/install/{jobId}/stream is attached: default messages carrying a
 * `line` field are appended to the log, `phase` events update the phase
 * label, and a `done` event is forwarded to onNimDone().
 *
 * @param {string} jobId  Job id returned by the install POST.
 */
async function attachNimProgress(jobId) {
  // Close a stream left over from an earlier job so it cannot keep writing
  // into the log of this one.
  if (nimState.eventsource) {
    nimState.eventsource.close();
    nimState.eventsource = null;
  }
  nimState.job_id = jobId;

  const logEl = el('#nim-prog-log');
  const phaseEl = el('#nim-prog-phase');
  const snapshotUrl = `/api/nim/install/${jobId}`;

  logEl.textContent = '';
  // Reset the phase too; otherwise the previous job's final phase text is
  // shown under the fresh "Installing…" title until the snapshot arrives.
  phaseEl.textContent = 'Starting…';
  el('#nim-prog-title').textContent = 'Installing…';
  el('#nim-progress-dialog').showModal();

  // Render a snapshot: phase label plus the full log so far. The trailing
  // newline keeps the first streamed line from being glued onto the last
  // snapshot line.
  const renderSnapshot = snap => {
    phaseEl.textContent = snap.phase || 'Working…';
    const lines = snap.lines || [];
    logEl.textContent = lines.length ? lines.join('\n') + '\n' : '';
  };

  try {
    const snap = await fetchJSON(snapshotUrl);
    const startedAt = Date.parse(snap.started_at);
    // Date.parse gives NaN for a missing/unparsable timestamp.
    nimTimerStart(Number.isFinite(startedAt) ? startedAt : Date.now());
    renderSnapshot(snap);
    // `!= null` treats both null and a missing field as "still running".
    if (snap.returncode != null) { onNimDone(snap); return; }
  } catch {
    // Snapshot is best-effort; start timing from now and rely on the stream.
    nimTimerStart(Date.now());
  }

  const es = new EventSource(`${snapshotUrl}/stream`);
  nimState.eventsource = es;

  es.onmessage = ev => {
    let d;
    try { d = JSON.parse(ev.data); } catch { return; }
    if (d && d.line !== undefined) {
      logEl.textContent += d.line + '\n';
      logEl.scrollTop = logEl.scrollHeight;
    }
  };

  es.addEventListener('phase', ev => {
    try { phaseEl.textContent = JSON.parse(ev.data).phase; } catch {}
  });

  es.addEventListener('done', ev => {
    let d = {};
    try { d = JSON.parse(ev.data); } catch {}
    onNimDone(d);
  });

  es.onerror = async () => {
    // Ignore errors from a stream that was already finished or replaced.
    if (nimState.eventsource !== es) return;
    // Close explicitly so the browser does not auto-reconnect.
    es.close();
    nimState.eventsource = null;

    // The stream died without a `done` event. Ask for one more snapshot so
    // a job that finished in the meantime is still reported.
    try {
      const latest = await fetchJSON(snapshotUrl);
      if (nimState.job_id !== jobId) return;
      renderSnapshot(latest);
      if (latest.returncode != null) { onNimDone(latest); return; }
    } catch {
      if (nimState.job_id !== jobId) return;
    }
    phaseEl.textContent = 'Lost connection to the install stream; the install may still be running on the Spark.';
  };
}
|
||||||
|
|
||||||
|
/**
 * Finalize the progress dialog once an install job has ended.
 *
 * Closes the event stream, stops the elapsed timer, sets the title and
 * phase text to the outcome, and triggers pollStatus().
 *
 * @param {{state?: string, returncode?: number|null}} d  Final job
 *   snapshot or the parsed payload of the `done` event. May be an empty
 *   object when that payload could not be parsed.
 */
function onNimDone(d) {
  if (nimState.eventsource) {
    nimState.eventsource.close();
    nimState.eventsource = null;
  }
  if (nimState.timer) {
    clearInterval(nimState.timer);
    nimState.timer = null;
  }

  const result = d || {};
  const rc = result.returncode;
  // A non-zero exit code is a failure even when `state` is absent or has
  // some other value; do not report it as "Installed".
  const failed = result.state === 'failed' || (typeof rc === 'number' && rc !== 0);

  if (failed) {
    el('#nim-prog-title').textContent = `Failed (rc=${rc == null ? 'unknown' : rc})`;
    el('#nim-prog-phase').textContent = 'Failed';
  } else {
    el('#nim-prog-title').textContent = 'Installed';
    el('#nim-prog-phase').textContent = 'Done ✓ — service will appear when the container reports healthy.';
  }

  pollStatus();
}
|
||||||
|
|
||||||
// ===================== Explain context (LLM commit summary) =====================
|
// ===================== Explain context (LLM commit summary) =====================
|
||||||
|
|
||||||
let explainEventSource = null;
|
let explainEventSource = null;
|
||||||
@@ -1149,6 +1303,10 @@ async function init() {
|
|||||||
el('#ub-apply').addEventListener('click', applyUpdate);
|
el('#ub-apply').addEventListener('click', applyUpdate);
|
||||||
el('#ub-explain').addEventListener('click', explainContext);
|
el('#ub-explain').addEventListener('click', explainContext);
|
||||||
el('#dl-repo').addEventListener('input', updateDlHfLink);
|
el('#dl-repo').addEventListener('input', updateDlHfLink);
|
||||||
|
el('#open-nim').addEventListener('click', openNimDialog);
|
||||||
|
el('#nim-cancel').addEventListener('click', () => el('#nim-dialog').close());
|
||||||
|
el('#nim-form').addEventListener('submit', submitNim);
|
||||||
|
el('#nim-prog-close').addEventListener('click', () => el('#nim-progress-dialog').close());
|
||||||
setupCatalogDialog();
|
setupCatalogDialog();
|
||||||
setupAdvancedDialog();
|
setupAdvancedDialog();
|
||||||
// Open WebUI link from /api/config
|
// Open WebUI link from /api/config
|
||||||
|
|||||||
@@ -76,8 +76,66 @@
|
|||||||
</section>
|
</section>
|
||||||
|
|
||||||
<section id="services-panel" class="services hidden">
|
<section id="services-panel" class="services hidden">
|
||||||
|
<div class="section-header">
|
||||||
<h2 class="section-title">Always-on services</h2>
|
<h2 class="section-title">Always-on services</h2>
|
||||||
|
<button id="open-nim" class="btn small-btn">+ Install NIM</button>
|
||||||
|
</div>
|
||||||
<div id="services-grid" class="services-grid"></div>
|
<div id="services-grid" class="services-grid"></div>
|
||||||
|
|
||||||
|
<dialog id="nim-dialog" class="modal">
|
||||||
|
<form method="dialog" class="modal-form" id="nim-form">
|
||||||
|
<h3>Install a NVIDIA NIM container</h3>
|
||||||
|
<p class="muted small" id="nim-key-warn"></p>
|
||||||
|
<p class="muted small">Pick a curated container below or paste any image from <a href="#" id="nim-catalog-link" target="_blank" rel="noopener">the NGC NIM catalog</a>. Spark Control will <code>docker pull</code> and <code>docker run</code> it on the target Spark.</p>
|
||||||
|
|
||||||
|
<div id="nim-suggested" class="nim-grid"></div>
|
||||||
|
|
||||||
|
<fieldset class="modal-fieldset">
|
||||||
|
<legend>Custom image</legend>
|
||||||
|
<label class="modal-row"><span>Image (nvcr.io/...)</span><input type="text" id="nim-image" placeholder="nvcr.io/nim/nvidia/<name>:latest"></label>
|
||||||
|
<label class="modal-row"><span>Container name</span><input type="text" id="nim-container" placeholder="my-service"></label>
|
||||||
|
<label class="modal-row"><span>Port</span><input type="number" id="nim-port" min="1" max="65535"></label>
|
||||||
|
<label class="modal-row"><span>Kind</span>
|
||||||
|
<select id="nim-kind">
|
||||||
|
<option value="nim">NIM (other)</option>
|
||||||
|
<option value="stt">STT (speech-to-text)</option>
|
||||||
|
<option value="tts">TTS (text-to-speech)</option>
|
||||||
|
<option value="vision">Vision</option>
|
||||||
|
<option value="embedding">Embedding</option>
|
||||||
|
</select>
|
||||||
|
</label>
|
||||||
|
<label class="modal-row"><span>Target Spark</span>
|
||||||
|
<select id="nim-host">
|
||||||
|
<option value="spark2">Spark 2 (default for support services)</option>
|
||||||
|
<option value="spark1">Spark 1 (head node)</option>
|
||||||
|
</select>
|
||||||
|
</label>
|
||||||
|
</fieldset>
|
||||||
|
|
||||||
|
<div class="modal-actions">
|
||||||
|
<button type="button" id="nim-cancel" class="btn">Cancel</button>
|
||||||
|
<button type="submit" class="btn primary" id="nim-start">Install</button>
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
</dialog>
|
||||||
|
|
||||||
|
<dialog id="nim-progress-dialog" class="modal">
|
||||||
|
<form method="dialog" class="modal-form">
|
||||||
|
<h3 id="nim-prog-title">Installing…</h3>
|
||||||
|
<div class="phase-row">
|
||||||
|
<div class="phase" id="nim-prog-phase">Starting…</div>
|
||||||
|
<span class="spacer"></span>
|
||||||
|
<span class="timer" id="nim-prog-elapsed">0:00</span>
|
||||||
|
</div>
|
||||||
|
<details open>
|
||||||
|
<summary class="muted small">Log</summary>
|
||||||
|
<pre id="nim-prog-log" class="log"></pre>
|
||||||
|
</details>
|
||||||
|
<div class="modal-actions">
|
||||||
|
<button type="button" id="nim-prog-close" class="btn">Close</button>
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
</dialog>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
<section id="models-section">
|
<section id="models-section">
|
||||||
|
|||||||
@@ -376,6 +376,7 @@ main {
|
|||||||
.hw-card .head .meta { color: var(--muted); font-size: 12px; margin-left: auto; }
|
.hw-card .head .meta { color: var(--muted); font-size: 12px; margin-left: auto; }
|
||||||
.hw-card.unreachable { border-color: rgba(239, 68, 68, 0.4); }
|
.hw-card.unreachable { border-color: rgba(239, 68, 68, 0.4); }
|
||||||
.hw-card.unreachable .name { color: var(--error); }
|
.hw-card.unreachable .name { color: var(--error); }
|
||||||
|
.hw-card.unreachable ol { color: var(--muted); }
|
||||||
.hw-metric { display: flex; align-items: center; gap: 10px; font-size: 12px; }
|
.hw-metric { display: flex; align-items: center; gap: 10px; font-size: 12px; }
|
||||||
.hw-metric .label { color: var(--muted); width: 56px; flex-shrink: 0; text-transform: uppercase; letter-spacing: 0.05em; font-size: 11px; }
|
.hw-metric .label { color: var(--muted); width: 56px; flex-shrink: 0; text-transform: uppercase; letter-spacing: 0.05em; font-size: 11px; }
|
||||||
.hw-metric .bar { flex: 1; height: 8px; background: var(--surface-2); border-radius: 4px; overflow: hidden; position: relative; }
|
.hw-metric .bar { flex: 1; height: 8px; background: var(--surface-2); border-radius: 4px; overflow: hidden; position: relative; }
|
||||||
@@ -477,6 +478,37 @@ main {
|
|||||||
#dl-log-details { margin-top: 12px; }
|
#dl-log-details { margin-top: 12px; }
|
||||||
#dl-log-details summary { cursor: pointer; padding: 4px 0; }
|
#dl-log-details summary { cursor: pointer; padding: 4px 0; }
|
||||||
|
|
||||||
|
/* ===== NIM install dialog ===== */
|
||||||
|
|
||||||
|
.modal#nim-dialog,
|
||||||
|
.modal#nim-progress-dialog { max-width: 640px; }
|
||||||
|
.nim-grid {
|
||||||
|
display: grid;
|
||||||
|
gap: 8px;
|
||||||
|
grid-template-columns: 1fr;
|
||||||
|
max-height: 240px;
|
||||||
|
overflow-y: auto;
|
||||||
|
margin-bottom: 4px;
|
||||||
|
}
|
||||||
|
.nim-card {
|
||||||
|
background: var(--surface-2);
|
||||||
|
border: 1px solid var(--border);
|
||||||
|
border-radius: 6px;
|
||||||
|
padding: 10px 12px;
|
||||||
|
display: flex;
|
||||||
|
gap: 10px;
|
||||||
|
align-items: flex-start;
|
||||||
|
}
|
||||||
|
.nim-card .info { flex: 1; }
|
||||||
|
.nim-card .name { font-weight: 600; font-size: 13px; }
|
||||||
|
.nim-card .desc { color: var(--muted); font-size: 12px; margin-top: 4px; }
|
||||||
|
.nim-card .img { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; color: #6b6b75; font-size: 11px; margin-top: 4px; word-break: break-all; }
|
||||||
|
.nim-card .btn { padding: 6px 12px; font-size: 12px; flex-shrink: 0; }
|
||||||
|
.nim-card .links { font-size: 11px; margin-top: 4px; }
|
||||||
|
.nim-card .links a { color: var(--info); text-decoration: none; }
|
||||||
|
.nim-card .links a:hover { text-decoration: underline; }
|
||||||
|
.nim-key-warn { color: var(--warn); }
|
||||||
|
|
||||||
/* ===== Section titles ===== */
|
/* ===== Section titles ===== */
|
||||||
|
|
||||||
.section-title {
|
.section-title {
|
||||||
|
|||||||
@@ -85,6 +85,15 @@ const inputSpec = InputSpec.of({
|
|||||||
placeholder: 'e.g. https://open-webui.yourserver.local',
|
placeholder: 'e.g. https://open-webui.yourserver.local',
|
||||||
masked: false,
|
masked: false,
|
||||||
}),
|
}),
|
||||||
|
ngc_api_key: Value.text({
|
||||||
|
name: 'NGC API key (optional)',
|
||||||
|
description:
|
||||||
|
'NVIDIA NGC personal API key — needed to install NIM containers (Parakeet, Magpie, etc.) from nvcr.io. Get one free at https://ngc.nvidia.com/setup/personal-key. Stored only on this Start9 server; passed to docker as the NGC_API_KEY env var when installing NIM services.',
|
||||||
|
required: false,
|
||||||
|
default: null,
|
||||||
|
placeholder: 'starts with "nvapi-..."',
|
||||||
|
masked: true,
|
||||||
|
}),
|
||||||
})
|
})
|
||||||
|
|
||||||
export const configureSparks = sdk.Action.withInput(
|
export const configureSparks = sdk.Action.withInput(
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ export const sparkConfigSchema = z.object({
|
|||||||
magpie_container: z.string().catch(''),
|
magpie_container: z.string().catch(''),
|
||||||
// Optional Open WebUI deep-link
|
// Optional Open WebUI deep-link
|
||||||
open_webui_url: z.string().catch(''),
|
open_webui_url: z.string().catch(''),
|
||||||
|
// Optional NGC API key for pulling NIM containers from nvcr.io/nim/...
|
||||||
|
ngc_api_key: z.string().catch(''),
|
||||||
})
|
})
|
||||||
|
|
||||||
export type SparkConfig = z.infer<typeof sparkConfigSchema>
|
export type SparkConfig = z.infer<typeof sparkConfigSchema>
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ export const main = sdk.setupMain(async ({ effects }) => {
|
|||||||
magpie_user: '',
|
magpie_user: '',
|
||||||
magpie_container: '',
|
magpie_container: '',
|
||||||
open_webui_url: '',
|
open_webui_url: '',
|
||||||
|
ngc_api_key: '',
|
||||||
}
|
}
|
||||||
|
|
||||||
return sdk.Daemons.of(effects).addDaemon('primary', {
|
return sdk.Daemons.of(effects).addDaemon('primary', {
|
||||||
@@ -48,7 +49,9 @@ export const main = sdk.setupMain(async ({ effects }) => {
|
|||||||
MAGPIE_USER: cfg.magpie_user,
|
MAGPIE_USER: cfg.magpie_user,
|
||||||
MAGPIE_CONTAINER: cfg.magpie_container,
|
MAGPIE_CONTAINER: cfg.magpie_container,
|
||||||
MODELS_OVERRIDES: '/data/models-overrides.yaml',
|
MODELS_OVERRIDES: '/data/models-overrides.yaml',
|
||||||
|
SERVICES_OVERRIDES: '/data/services-overrides.yaml',
|
||||||
OPEN_WEBUI_URL: cfg.open_webui_url,
|
OPEN_WEBUI_URL: cfg.open_webui_url,
|
||||||
|
NGC_API_KEY: cfg.ngc_api_key,
|
||||||
BIND_PORT: String(uiPort),
|
BIND_PORT: String(uiPort),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
export const v0_1_0 = VersionInfo.of({
|
export const v0_1_0 = VersionInfo.of({
|
||||||
version: '0.3.0:1',
|
version: '0.4.0:0',
|
||||||
releaseNotes: {
|
releaseNotes: {
|
||||||
en_US:
|
en_US:
|
||||||
'v0.3: Spark hardware dashboard (RAM, disk, GPU memory + utilization, CPU load, uptime per Spark). Per-model Advanced settings now show plain-English hints tied to your actual GPU memory (e.g. "0.85 GPU util leaves ~18 GB free"). "Explain context" button on the update banner asks the loaded LLM to summarize pending commits in plain English. Optional Open WebUI URL in Configure Sparks shows a one-click "Open chat" button in the top bar. Downloads can now target Spark 1, Spark 2, or both. Each model card links out to its Hugging Face page.',
|
'v0.4: install NIM containers from the dashboard. New "+ Install NIM" button next to the services panel shows a curated catalog (Parakeet, Magpie, Riva...) plus a free-form image field. Streams docker pull + docker run output with phase + elapsed timer; persists installed services to /data/services-overrides.yaml so they show up in the services panel after install. Configure Sparks now has an NGC API key field (masked) needed for nvcr.io. v0.3.1 hotfix bundled in: hardware/services SSH timeouts shortened (6 s) and failures cached for 25 s so an unreachable Spark doesn\'t hang the whole dashboard. Hardware card for an unreachable Spark now includes troubleshooting steps.',
|
||||||
},
|
},
|
||||||
migrations: {
|
migrations: {
|
||||||
up: async ({ effects }) => {},
|
up: async ({ effects }) => {},
|
||||||
|
|||||||
Reference in New Issue
Block a user