1c4e861783
Triaged from a full independent evaluation (EVALUATION.md). Addresses the three P0/P1 code findings; the proxy/data APIs that downstream apps consume are deliberately untouched. - ssh command injection (P0): new shellsafe.py validates + shlex.quotes every user-supplied value crossing into an SSH command on the Sparks (model repo, vllm args/knobs, NIM image/container/volume/port/env, service names). Boundary validation on POST /api/models and POST /api/nim/install; quoting at every sink in models/download/nim/services. NGC key now quoted too. - qdrant path injection (P1): /api/search validates the collection name against a metacharacter-free whitelist and URL-encodes the path segment. - csrf (P1): csrf_guard middleware enforces same-origin on state-changing control endpoints; /v1/*, /scrub, /rehydrate, /api/search, /api/audio/* and /api/health-event are exempt so external consumers are unaffected. Verified: injection survives only as a single quoted token, vLLM preflight shlex.split round-trip intact, CSRF behaviors covered via TestClient, both offline redaction suites still pass, tsc clean, s9pk rebuilt.
204 lines
8.1 KiB
Python
204 lines
8.1 KiB
Python
"""NVIDIA NIM container install / lifecycle.
|
|
|
|
Two pieces:
|
|
* A small curated catalog of NIM images (so users don't have to copy/paste
|
|
huge nvcr.io URLs).
|
|
* An installer that SSHes into the target Spark, runs `docker pull` then
|
|
`docker run -d --gpus all -p PORT:PORT -v VOLUME:/opt/nim/.cache
|
|
-e NGC_API_KEY=... IMAGE` and streams output.
|
|
|
|
Custom services also persist via `overrides.add_custom_service()` so the
|
|
Services panel can show them.
|
|
"""
|
|
from __future__ import annotations

import asyncio
import shlex
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional

from .config import Settings
from .shellsafe import quote_arg
from .ssh import ssh_stream, StreamHandle
|
|
|
|
|
|
# Link to NVIDIA's NIM container listing on NGC. The full catalog can be
# browsed at https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia
CATALOG_URL = "https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers"


# Curated shortlist: the NIM containers most useful for a dual-Spark
# audio-and-LLM setup. Each entry carries the image reference plus the
# default container name and port offered for it.
SUGGESTED_NIMS: list[dict] = [
    dict(
        key="parakeet-tdt-0.6b-v3",
        name="Parakeet TDT 0.6B v3",
        image="nvcr.io/nim/nvidia/parakeet-tdt-0-6b-v3:latest",
        default_container="parakeet-asr",
        default_port=8000,
        kind="stt",
        description="Streaming speech-to-text (English). Used by Open WebUI for voice input. ~1 GB.",
        homepage="https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/parakeet-tdt-0-6b-v3",
    ),
    dict(
        key="riva-multilingual",
        name="Riva Multilingual ASR",
        image="nvcr.io/nim/nvidia/riva-multilingual:latest",
        default_container="riva-asr",
        default_port=8001,
        kind="stt",
        description="NVIDIA Riva speech-recognition multi-language model. Larger and more accurate than Parakeet.",
        homepage="https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia",
    ),
]
|
|
|
|
|
|
@dataclass
class NimInstallJob:
    """State record for one NIM install attempt.

    Holds the install parameters, a coarse ``state`` plus a human-readable
    ``phase``, and a bounded tail of the output lines collected so far.
    """

    # Install parameters, fixed when the job is created.
    id: str
    image: str
    container: str
    port: int
    host: str
    user: str
    volume: Optional[str]
    started_at: str
    # Progress tracking, updated while the install runs.
    state: str = "starting"  # starting | pulling | running | done | failed
    phase: str = "Starting…"
    lines: list[str] = field(default_factory=list)
    returncode: Optional[int] = None
    finished_at: Optional[str] = None

    def append(self, line: str) -> None:
        """Record one output line, keeping only the newest 1000 lines."""
        self.lines.append(line)
        overflow = len(self.lines) - 1000
        if overflow > 0:
            # Trim from the front so the most recent output survives.
            del self.lines[:overflow]
|
|
|
|
|
|
class NimManager:
|
|
def __init__(self, settings: Settings) -> None:
|
|
self.settings = settings
|
|
self.lock = asyncio.Lock()
|
|
self.jobs: dict[str, NimInstallJob] = {}
|
|
self.current_job_id: Optional[str] = None
|
|
|
|
def get(self, job_id: str) -> NimInstallJob | None:
|
|
return self.jobs.get(job_id)
|
|
|
|
async def trigger(
|
|
self,
|
|
*,
|
|
image: str,
|
|
container: str,
|
|
port: int,
|
|
host: str,
|
|
user: str,
|
|
volume: str | None = None,
|
|
extra_env: dict[str, str] | None = None,
|
|
) -> NimInstallJob:
|
|
if self.lock.locked():
|
|
raise RuntimeError("Another NIM install is already in progress")
|
|
if not host or not user:
|
|
raise RuntimeError("target host not configured")
|
|
if not self.settings.ngc_api_key:
|
|
raise RuntimeError(
|
|
"NGC_API_KEY is not set. Open Configure Sparks in StartOS and paste your NGC personal API key (free at https://ngc.nvidia.com/setup/personal-key)."
|
|
)
|
|
|
|
job = NimInstallJob(
|
|
id=uuid.uuid4().hex[:8],
|
|
image=image,
|
|
container=container,
|
|
port=port,
|
|
host=host,
|
|
user=user,
|
|
volume=volume or f"{container}-cache",
|
|
started_at=datetime.now(timezone.utc).isoformat(),
|
|
)
|
|
self.jobs[job.id] = job
|
|
self.current_job_id = job.id
|
|
asyncio.create_task(self._run(job, extra_env or {}))
|
|
return job
|
|
|
|
async def _run(self, job: NimInstallJob, extra_env: dict[str, str]) -> None:
|
|
async with self.lock:
|
|
try:
|
|
await self._do(job, extra_env)
|
|
if job.state != "failed":
|
|
job.state = "done"
|
|
job.returncode = 0
|
|
job.phase = "Done"
|
|
except Exception as e:
|
|
job.append(f"[error] {type(e).__name__}: {e}")
|
|
job.state = "failed"
|
|
if job.returncode is None:
|
|
job.returncode = 1
|
|
finally:
|
|
job.finished_at = datetime.now(timezone.utc).isoformat()
|
|
if self.current_job_id == job.id:
|
|
self.current_job_id = None
|
|
|
|
async def _do(self, job: NimInstallJob, extra_env: dict[str, str]) -> None:
|
|
# Build the bash one-liner. We use docker login non-interactively with the NGC API key.
|
|
# The real docker commands use shlex.quote'd values (img/ctr/vol) so nothing
|
|
# user-controlled can break out of the SSH shell. The cosmetic `echo` log lines
|
|
# embed the *raw* values inside single quotes — safe because image/container are
|
|
# validated against a metacharacter-free whitelist at the API boundary, and
|
|
# volume/port derive from them. (Embedding shlex.quote output inside another
|
|
# quoted echo string would be wrong — it can re-expose $() / $VAR.)
|
|
img = quote_arg(job.image)
|
|
ctr = quote_arg(job.container)
|
|
vol = quote_arg(job.volume)
|
|
port = int(job.port) # int can't inject; coerce defensively
|
|
env_parts = ['-e NGC_API_KEY=$NGC_API_KEY']
|
|
for k, v in extra_env.items():
|
|
env_parts.append(f"-e {quote_arg(k)}={quote_arg(v)}")
|
|
env_str = " ".join(env_parts)
|
|
cmd = (
|
|
f"set -e; "
|
|
f"export NGC_API_KEY={quote_arg(self.settings.ngc_api_key or '')}; "
|
|
f"echo '=== docker login nvcr.io ==='; "
|
|
f"echo \"$NGC_API_KEY\" | docker login nvcr.io -u '$oauthtoken' --password-stdin; "
|
|
f"echo '=== docker pull {job.image} (this can be 1-10 GB) ==='; "
|
|
f"docker pull {img}; "
|
|
f"echo '=== remove any prior container with the same name ==='; "
|
|
f"docker rm -f {ctr} 2>/dev/null || true; "
|
|
f"echo '=== docker run -d --gpus all -p {job.port}:{job.port} -v {job.volume}:/opt/nim/.cache --name {job.container} --restart unless-stopped {job.image} ==='; "
|
|
f"docker run -d --gpus all "
|
|
f"-p {port}:{port} "
|
|
f"-v {vol}:/opt/nim/.cache "
|
|
f"{env_str} "
|
|
f"--name {ctr} "
|
|
f"--restart unless-stopped "
|
|
f"{img}; "
|
|
f"echo '=== ensuring cache volume is writable by uid 1000 (riva-server) ==='; "
|
|
f"docker run --rm -v {vol}:/cache alpine chown -R 1000:1000 /cache && "
|
|
f"docker restart {ctr}; "
|
|
f"echo '=== install complete; container is starting up and will download its model on first boot ==='"
|
|
)
|
|
job.append(f"$ <install command for {job.image} on {job.host}>")
|
|
job.state = "pulling"
|
|
job.phase = "Pulling image from nvcr.io (this can take a few minutes)…"
|
|
|
|
handle = StreamHandle()
|
|
async for line in ssh_stream(job.host, job.user, cmd, self.settings, handle=handle):
|
|
# Don't log lines containing the api key
|
|
if self.settings.ngc_api_key and self.settings.ngc_api_key in line:
|
|
continue
|
|
job.append(line)
|
|
if "docker pull" in line:
|
|
job.phase = "Pulling image from nvcr.io…"
|
|
elif "Login Succeeded" in line:
|
|
job.phase = "Logged in to NGC; pulling image…"
|
|
elif "Pull complete" in line:
|
|
job.phase = "Pulling layers…"
|
|
elif "Status: Downloaded newer image" in line or "Image is up to date" in line:
|
|
job.phase = "Image ready; starting container…"
|
|
elif "docker run -d" in line:
|
|
job.state = "running"
|
|
job.phase = "Container starting; downloading model on first boot…"
|
|
|
|
rc = handle.returncode or 0
|
|
if rc != 0:
|
|
job.state = "failed"
|
|
job.returncode = rc
|