From ae8efa1754e53a9968d0d3118c60bb403cca72b2 Mon Sep 17 00:00:00 2001 From: Grant Date: Tue, 12 May 2026 09:29:13 -0500 Subject: [PATCH] Initial scaffold: image/ FastAPI app, models.yaml, docs - image/ FastAPI app: /api/status, /api/swap, /api/swap/{id}/stream, /api/test-connection - models.yaml: 5-model catalog (qwen3-vl, gemma4, qwen36, qwen3-235b-fp8, qwen25-72b) - README, runbook, known-issues - Dry-run swap verified against live Spark 1 (gemma4 currently loaded) --- .gitignore | 14 ++ README.md | 55 ++++++++ claude-code-starter-prompt.md | 244 ++++++++++++++++++++++++++++++++++ image/Dockerfile | 21 +++ image/app/__init__.py | 0 image/app/config.py | 58 ++++++++ image/app/health.py | 43 ++++++ image/app/models.py | 40 ++++++ image/app/server.py | 155 +++++++++++++++++++++ image/app/ssh.py | 91 +++++++++++++ image/app/static/app.js | 195 +++++++++++++++++++++++++++ image/app/static/index.html | 51 +++++++ image/app/static/style.css | 170 +++++++++++++++++++++++ image/app/swap.py | 140 +++++++++++++++++++ image/entrypoint.sh | 20 +++ image/pyproject.toml | 22 +++ known-issues.md | 40 ++++++ models.yaml | 80 +++++++++++ runbook.md | 61 +++++++++ 19 files changed, 1500 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 claude-code-starter-prompt.md create mode 100644 image/Dockerfile create mode 100644 image/app/__init__.py create mode 100644 image/app/config.py create mode 100644 image/app/health.py create mode 100644 image/app/models.py create mode 100644 image/app/server.py create mode 100644 image/app/ssh.py create mode 100644 image/app/static/app.js create mode 100644 image/app/static/index.html create mode 100644 image/app/static/style.css create mode 100644 image/app/swap.py create mode 100644 image/entrypoint.sh create mode 100644 image/pyproject.toml create mode 100644 known-issues.md create mode 100644 models.yaml create mode 100644 runbook.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 
0000000..ce81f0c --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +__pycache__/ +*.py[cod] +*.egg-info/ +.venv/ +venv/ +.env +.env.* +!.env.example +node_modules/ +*.s9pk +dist/ +build/ +.DS_Store +.claude/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..d3252a4 --- /dev/null +++ b/README.md @@ -0,0 +1,55 @@ +# spark-control + +A browser-based control panel for a dual-DGX-Spark vLLM cluster. Designed to run as a [StartOS 0.4](https://docs.start9.com/packaging/0.4.0.x/) package on a Start9 server on the same LAN as the Sparks. + +## What it does + +- Shows which LLM is currently loaded on the cluster (`:8888/v1/models`). +- Click to swap to a different model — stops the current one, launches the new one, streams logs to the UI until `Application startup complete.` appears. +- Surfaces health for Parakeet (STT, `:8000`) and Magpie (TTS, `:9000`) on Spark 2. + +## Architecture + +``` +[Browser/phone] ──► [StartOS reverse proxy] ──► [spark-control container] + │ (SSH over LAN) + ▼ + [Spark 1] ──► launch-cluster.sh + │ + ▼ + [Spark 2] +``` + +Two layers in this repo: + +- `image/` — a self-contained FastAPI app + static UI. Runs anywhere with `uvicorn` and an SSH client. Useful for development. +- `package/` — a thin StartOS 0.4 wrapper that packages the image, exposes the UI on the LAN, and gives the user actions to configure SSH access to the Sparks. + +## Quick start (local dev, no StartOS yet) + +```bash +cd image +python -m venv .venv && source .venv/bin/activate +pip install -e . +export SPARK1_HOST=your-spark1-hostname.local +export SPARK1_USER=your-spark1-username +export SPARK2_HOST=your-spark2-hostname-or-ip +export SPARK2_USER=your-spark2-username +export SSH_KEY_PATH="$HOME/Library/Application Support/NVIDIA/Sync/config/nvsync.key" +uvicorn app.server:app --host 0.0.0.0 --port 9999 --reload +``` + +Open http://localhost:9999 in a browser. 
+ +## Repo layout + +- `models.yaml` — model catalog (also bundled into the image) +- `image/` — Docker image source +- `package/` — StartOS 0.4 package source +- `scripts/build-s9pk.sh` — convenience wrapper around the StartOS build +- `runbook.md` — operating notes +- `known-issues.md` — known quirks and workarounds + +## Status + +v0.1 — local-only, single-cluster, no auth (trusts LAN). diff --git a/claude-code-starter-prompt.md b/claude-code-starter-prompt.md new file mode 100644 index 0000000..a4c002c --- /dev/null +++ b/claude-code-starter-prompt.md @@ -0,0 +1,244 @@ +# Project: spark-control — Model switcher web UI for dual DGX Spark cluster + +## Goal + +I want to build a small web service that gives me a browser-based interface to: + +1. See which LLM is currently loaded on my DGX Spark cluster +2. Click a button to swap to a different model +3. See real-time status as the swap progresses (stop → launch → ready) +4. See basic health info about supporting services (Parakeet STT, eventually Magpie TTS) + +The UI should live at a stable URL on my LAN so I can bookmark it. I'll likely access it from my laptop and phone. + +## Where this project lives + +This repo lives on **my laptop** (macOS). The Sparks are servers — we control them remotely over SSH. Claude Code runs on my laptop, makes edits in the local repo, and executes commands on the Sparks via SSH. + +The web UI itself, when deployed, will run on **Spark 1** (where it can directly invoke `launch-cluster.sh`), but development happens on my laptop. We'll deploy the code to Spark 1 via `rsync` or `scp` or `git pull` as needed. + +## SSH setup + +From my laptop I can SSH to either Spark directly: + +```bash +ssh @ # Spark 1 +ssh @ # Spark 2 +``` + +(I can also use SSH key auth — set up earlier.) 
+ +When you need to run a command on a Spark, use this pattern: + +```bash +ssh @ 'cd ~/spark-vllm-docker && ./launch-cluster.sh status' +``` + +For multi-line commands or scripts, you can pipe a heredoc or just SSH in directly and run them interactively. Either works — but always tell me what you're about to run so I can review. + +For file transfers between my laptop and the Sparks, use `rsync`: + +```bash +rsync -avz ~/Projects/spark-control/ @:~/spark-control/ +``` + +## My hardware and what's running + +**Two NVIDIA DGX Spark units** networked together: + +- **Spark 1** — hostname ``, LAN IP ``, QSFP IP ``. Head node for the vLLM cluster. +- **Spark 2** — hostname ``, LAN IP ``, QSFP IP ``. Worker node for vLLM cluster, also hosts standalone services. + +Both run Ubuntu 24.04, NVIDIA driver 580.x, CUDA 13.0, Docker, and have 128 GB unified memory each. They share a QSFP cable for high-speed (200 Gb/s) inter-node networking. + +Passwordless SSH works in both directions via `~/.ssh/` key. My Linux username on both machines is ``. + +**Currently running:** +- One LLM at a time on the cluster (via the `eugr/spark-vllm-docker` project — see below) +- `parakeet-asr` Docker container on Spark 2 (port 8000) — running 24/7 for speech-to-text, healthy for weeks +- `magpie-tts` Docker container on Spark 2 (port 9000) — was being set up; I'm not 100% sure of its current state; first task is to verify +- Open WebUI runs on a separate Start9 server on the LAN (not on the Sparks), accessing the LLM via HTTP + +## The LLM cluster: how it works + +I use the **`eugr/spark-vllm-docker`** community project (cloned to `~/spark-vllm-docker` on Spark 1). It manages a Ray-based vLLM cluster across both Sparks, with a wrapper script called `launch-cluster.sh` that handles starting/stopping Docker containers on both nodes. 
+ +Key commands (all run from `~/spark-vllm-docker` on Spark 1): +- `./launch-cluster.sh status` — see what's running on both nodes +- `./launch-cluster.sh stop` — stop the cluster +- `./launch-cluster.sh -d exec vllm serve ...` — launch in daemon mode with vLLM args +- `./launch-cluster.sh --solo -d exec vllm serve ...` — same but only on Spark 1 (for smaller models) +- `docker logs -f vllm_node` — tail vLLM logs + +Container names: `vllm_node` (the main vLLM container), `ray_head` and `ray_worker` (Ray cluster), plus support containers. + +The vLLM server binds to port **8888** and exposes an OpenAI-compatible API at `http://:8888/v1`. + +## Models I have on disk (both Sparks) + +All weights live in `~/.cache/huggingface/hub/` on each Spark: + +1. **`RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4`** (~135 GB) — flagship MoE, runs across both Sparks (-tp 2), has vision capability. Use for: maximum quality, vision input, multilingual. + +2. **`RedHatAI/gemma-4-31B-it-NVFP4`** (~23 GB) — runs solo on Spark 1, has vision, has thinking-mode reasoning. Use for: math/reasoning-heavy tasks. Has a known vLLM Triton-attention slowdown bug (~15-20 tok/s vs theoretical 30-40). + +3. **`RedHatAI/Qwen3.6-35B-A3B-NVFP4`** (~20 GB) — newer-generation Qwen MoE (35B total / 3B active), runs solo on Spark 1, expected to be the fastest (~70-100 tok/s) and my new daily driver. **Note: this may still be downloading or may not be downloaded yet — first task is to verify and download if needed.** + +## Exact launch commands for each model + +These are the commands my system needs to run when I click a swap button. + +### Qwen3-VL-235B (uses both Sparks) +```bash +cd ~/spark-vllm-docker +./launch-cluster.sh stop +./launch-cluster.sh -d exec vllm serve \ + RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4 \ + --port 8888 --host 0.0.0.0 \ + --gpu-memory-utilization 0.7 \ + -tp 2 \ + --distributed-executor-backend ray \ + --max-model-len 32768 +``` +Expected ready time: ~3-5 min after stop completes. 
+ +### Gemma 4 31B (solo on Spark 1) +```bash +cd ~/spark-vllm-docker +./launch-cluster.sh stop +./launch-cluster.sh --solo -d exec vllm serve \ + RedHatAI/gemma-4-31B-it-NVFP4 \ + --port 8888 --host 0.0.0.0 \ + --gpu-memory-utilization 0.8 \ + --max-model-len 32768 \ + --reasoning-parser gemma4 \ + --tool-call-parser gemma4 \ + --enable-auto-tool-choice +``` +Expected ready time: ~3-4 min. + +### Qwen3.6-35B-A3B (solo on Spark 1) — new daily driver +```bash +cd ~/spark-vllm-docker +./launch-cluster.sh stop +./launch-cluster.sh --solo -d exec vllm serve \ + RedHatAI/Qwen3.6-35B-A3B-NVFP4 \ + --port 8888 --host 0.0.0.0 \ + --gpu-memory-utilization 0.85 \ + --max-model-len 65536 \ + --reasoning-parser qwen3 \ + --moe_backend flashinfer_cutlass +``` +Expected ready time: ~3-5 min. + +Note: the `--moe_backend flashinfer_cutlass` flag is Blackwell-specific. If it errors on launch, fallback is to remove that flag. + +### Common operations +- Stop everything: `./launch-cluster.sh stop` +- Status check: `./launch-cluster.sh status` +- See vLLM logs: `docker logs vllm_node` (add `-f` to follow) +- Hard reset if stuck: `./launch-cluster.sh stop && docker ps -aq | xargs -r docker rm -f` +- Health check (is API responding?): `curl -s http://:8888/v1/models` + +### "Ready" signal +The model is ready to serve when `docker logs vllm_node` contains the line `Application startup complete.` Until then, it's still loading weights or compiling CUDA graphs. + +## Supporting services on Spark 2 (always-on, separate from cluster) + +These don't get touched by model swaps: + +- **`parakeet-asr`** — STT on port 8000. Already running 24/7. Verify with `curl http://:8000/health` which should return `{"status":"ready",...}`. +- **`magpie-tts`** — TTS on port 9000. May or may not be running; verify with `docker ps` on Spark 2 and `curl http://:9000/v1/health/ready`. 
+ +## What I want you to build + +### Phase 1: Set up the project repo (start here) + +Create a Git repo at `~/Projects/spark-control/` on **my laptop**. Initial structure: + +``` +spark-control/ +├── README.md +├── models.yaml # Declarative config for each model +├── scripts/ +│ ├── swap-model.sh # Universal swap script +│ ├── status.sh # Cluster + service status +│ └── health.sh # Health checks for everything +├── web-ui/ +│ ├── server.py # FastAPI backend +│ ├── static/ +│ │ ├── index.html # Toggle UI +│ │ ├── style.css +│ │ └── app.js # State management, polling +│ └── requirements.txt +├── runbook.md # Operating notes +└── known-issues.md # Gotchas, troubleshooting +``` + +### Phase 2: Build the universal swap script + +`scripts/swap-model.sh ` should: +1. Read the launch command from `models.yaml` by key (e.g. `qwen3-vl`, `gemma4`, `qwen36`) +2. Stop the current cluster (via SSH to Spark 1) +3. Run the new launch command (via SSH to Spark 1) +4. Tail logs until "Application startup complete" appears or a timeout (~10 min) hits +5. Return exit code 0 on success, non-zero on failure + +Two versions might be useful: +- The version that runs on **my laptop** — wraps everything in `ssh @ ...` +- A simpler version that lives on **Spark 1** — runs commands directly without SSH (used by the deployed web UI) + +You can either share one script with a `--remote` flag, or make them two distinct files. Your call — propose the cleaner option. 
+ +### Phase 3: Build the web UI + +FastAPI backend that: +- `GET /api/status` → JSON with `{current_model, ready, parakeet_health, magpie_health, last_swap_time}` +- `POST /api/swap` with `{model_key}` → starts swap, returns swap job ID +- `GET /api/swap/{job_id}/stream` → Server-Sent Events streaming swap progress +- `GET /` → serves the HTML UI + +Frontend should: +- Show a card per model with a "Switch to this" button +- Highlight which model is currently loaded +- During a swap, show streaming log output and a spinner +- Show a green/red indicator for Parakeet and Magpie health +- Auto-refresh every 5 seconds + +Keep the UI simple, clean, dark-themed. No frameworks needed — vanilla HTML/JS is fine. + +### Phase 4: Deploy and make it persistent + +The web UI runs on **Spark 1** so it can directly invoke `launch-cluster.sh` without SSH overhead. To deploy: + +1. `rsync` the project code from my laptop to `~/spark-control/` on Spark 1 +2. Set up a Python virtual environment on Spark 1 and install requirements +3. Create a systemd service file that starts the FastAPI server on boot +4. Service should listen on `0.0.0.0:9999` so I can hit it from any device on my LAN +5. Add a simple deploy script (`scripts/deploy.sh`) on my laptop that does the rsync + restart in one command for future iteration + +## Working style + +- Before making changes that affect the running cluster, please ask me first. +- When you write commands you want me to run, give them in clearly marked code blocks. +- Distinguish clearly when a command is meant to run on my laptop vs. on a Spark (which means via SSH). +- If you need information about the current state of the Sparks, ask me to run a diagnostic SSH command and paste the output — or run it yourself if you have shell access. +- Test things incrementally. Don't build the whole UI before validating the swap script works. +- I'm a layman — explain technical decisions briefly in plain English when they involve trade-offs. 
+- When making changes that modify files on a Spark, do them by editing in my laptop's repo first and then deploying — not by editing on the Spark directly. That keeps my laptop as the source of truth. + +## First task + +1. First, **verify SSH access to both Sparks** from my laptop: + - `ssh @ hostname` should return `` + - `ssh @ hostname` should return `` +2. Then **verify the current state of the cluster** via SSH: + - Confirm `~/spark-vllm-docker` exists on Spark 1 and `launch-cluster.sh` is there: `ssh @ 'ls ~/spark-vllm-docker/launch-cluster.sh'` + - Check which LLM (if any) is currently loaded: `ssh @ 'cd ~/spark-vllm-docker && ./launch-cluster.sh status'` and `ssh @ 'curl -s http://localhost:8888/v1/models'` + - Verify which models are downloaded: `ssh @ 'ls ~/.cache/huggingface/hub/ | grep -iE "qwen|gemma"'` + - Specifically check if `Qwen3.6-35B-A3B-NVFP4` is downloaded; if not, that's the prerequisite step (run the `hf-download.sh` command on Spark 1) + - Check what's running on Spark 2: `ssh @ 'docker ps'` (looking for parakeet-asr and possibly magpie-tts) +3. Then create the repo structure on my laptop at `~/Projects/spark-control/` +4. Then propose the design for `models.yaml` and the swap script before implementing + +Ask me anything that's unclear before starting. diff --git a/image/Dockerfile b/image/Dockerfile new file mode 100644 index 0000000..0a2501b --- /dev/null +++ b/image/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.12-slim + +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssh-client curl ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app +COPY pyproject.toml /app/ +COPY app /app/app +COPY entrypoint.sh /app/entrypoint.sh +RUN chmod +x /app/entrypoint.sh + +# models.yaml is mounted in or copied at build via build-context root. +# For Docker, build from the repo root: `docker build -f image/Dockerfile .` +COPY models.yaml /app/models.yaml + +RUN pip install --no-cache-dir -e . 
+ +ENV BIND_PORT=9999 +EXPOSE 9999 +ENTRYPOINT ["/app/entrypoint.sh"] diff --git a/image/app/__init__.py b/image/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/image/app/config.py b/image/app/config.py new file mode 100644 index 0000000..eebd6a1 --- /dev/null +++ b/image/app/config.py @@ -0,0 +1,58 @@ +from __future__ import annotations +import os +from dataclasses import dataclass +from pathlib import Path + + +def _env(name: str, default: str = "") -> str: + return os.environ.get(name, default) + + +def _resolve_models_yaml() -> str: + if env := os.environ.get("MODELS_YAML"): + return env + here = Path(__file__).resolve().parent # app/ + candidates = [ + here.parent / "models.yaml", # image/models.yaml (Docker) + here.parent.parent / "models.yaml", # /models.yaml (dev) + Path("/app/models.yaml"), # explicit container path + ] + for p in candidates: + if p.exists(): + return str(p) + return str(candidates[0]) # let load fail with a clear path + + +@dataclass(frozen=True) +class Settings: + spark1_host: str + spark1_user: str + spark2_host: str + spark2_user: str + ssh_key_path: str + ssh_known_hosts: str + models_yaml: str + vllm_port: int + parakeet_port: int + magpie_port: int + bind_port: int + + @classmethod + def from_env(cls) -> "Settings": + return cls( + spark1_host=_env("SPARK1_HOST"), + spark1_user=_env("SPARK1_USER", ""), + spark2_host=_env("SPARK2_HOST"), + spark2_user=_env("SPARK2_USER", ""), + ssh_key_path=_env("SSH_KEY_PATH"), + ssh_known_hosts=_env("SSH_KNOWN_HOSTS"), + models_yaml=_resolve_models_yaml(), + vllm_port=int(_env("VLLM_PORT", "8888")), + parakeet_port=int(_env("PARAKEET_PORT", "8000")), + magpie_port=int(_env("MAGPIE_PORT", "9000")), + bind_port=int(_env("BIND_PORT", "9999")), + ) + + @property + def configured(self) -> bool: + return bool(self.spark1_host) diff --git a/image/app/health.py b/image/app/health.py new file mode 100644 index 0000000..16ab92a --- /dev/null +++ b/image/app/health.py @@ -0,0 +1,43 @@ 
+from __future__ import annotations +import httpx +from .config import Settings + + +_TIMEOUT = 3.0 + + +async def check_vllm(settings: Settings) -> dict: + if not settings.spark1_host: + return {"ok": False, "error": "spark1 not configured"} + try: + async with httpx.AsyncClient(timeout=_TIMEOUT) as c: + r = await c.get(f"http://{settings.spark1_host}:{settings.vllm_port}/v1/models") + r.raise_for_status() + ids = [m["id"] for m in r.json().get("data", [])] + return {"ok": True, "current_model": ids[0] if ids else None, "all": ids} + except Exception as e: + return {"ok": False, "error": str(e)} + + +async def check_parakeet(settings: Settings) -> dict: + if not settings.spark2_host: + return {"ok": False, "error": "spark2 not configured"} + try: + async with httpx.AsyncClient(timeout=_TIMEOUT) as c: + r = await c.get(f"http://{settings.spark2_host}:{settings.parakeet_port}/health") + r.raise_for_status() + return {"ok": True, "detail": r.json()} + except Exception as e: + return {"ok": False, "error": str(e)} + + +async def check_magpie(settings: Settings) -> dict: + if not settings.spark2_host: + return {"ok": False, "error": "spark2 not configured"} + try: + async with httpx.AsyncClient(timeout=_TIMEOUT) as c: + r = await c.get(f"http://{settings.spark2_host}:{settings.magpie_port}/v1/health/ready") + r.raise_for_status() + return {"ok": True, "detail": r.json() if r.headers.get("content-type", "").startswith("application/json") else r.text} + except Exception as e: + return {"ok": False, "error": str(e)} diff --git a/image/app/models.py b/image/app/models.py new file mode 100644 index 0000000..22bd393 --- /dev/null +++ b/image/app/models.py @@ -0,0 +1,40 @@ +from __future__ import annotations +from typing import Literal +import yaml +from pydantic import BaseModel, Field + + +class ModelDef(BaseModel): + display_name: str + repo: str + size_gb: float + mode: Literal["solo", "cluster"] + capabilities: list[str] = Field(default_factory=list) + 
expected_ready_seconds: int = 300 + vllm_args: list[str] = Field(default_factory=list) + + +class Defaults(BaseModel): + port: int = 8888 + host: str = "0.0.0.0" + + +class Catalog(BaseModel): + defaults: Defaults = Field(default_factory=Defaults) + models: dict[str, ModelDef] + + +def load_catalog(path: str) -> Catalog: + with open(path) as f: + data = yaml.safe_load(f) + return Catalog.model_validate(data) + + +def build_launch_command(key: str, model: ModelDef, defaults: Defaults) -> str: + """Return the shell command to launch `model` on Spark 1. + + Assumes cwd will be `~/spark-vllm-docker` (we cd in the SSH wrapper). + """ + solo = "--solo " if model.mode == "solo" else "" + args = [f"--port={defaults.port}", f"--host={defaults.host}", *model.vllm_args] + return f"./launch-cluster.sh {solo}-d exec vllm serve {model.repo} {' '.join(args)}" diff --git a/image/app/server.py b/image/app/server.py new file mode 100644 index 0000000..dfc077d --- /dev/null +++ b/image/app/server.py @@ -0,0 +1,155 @@ +from __future__ import annotations +import asyncio +import json +from pathlib import Path + +from fastapi import FastAPI, HTTPException +from fastapi.responses import FileResponse, JSONResponse, StreamingResponse +from fastapi.staticfiles import StaticFiles +from pydantic import BaseModel + +from .config import Settings +from .health import check_magpie, check_parakeet, check_vllm +from .models import load_catalog +from .ssh import ssh_run +from .swap import SwapManager + + +settings = Settings.from_env() +catalog = load_catalog(settings.models_yaml) +swap_manager = SwapManager(settings, catalog) + +app = FastAPI(title="spark-control", version="0.1.0") + +_STATIC_DIR = Path(__file__).resolve().parent / "static" +app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static") + + +@app.get("/", include_in_schema=False) +async def index() -> FileResponse: + return FileResponse(_STATIC_DIR / "index.html") + + +@app.get("/api/config") +async def get_config() -> 
dict: + return { + "configured": settings.configured, + "spark1_host": settings.spark1_host, + "spark2_host": settings.spark2_host, + "vllm_port": settings.vllm_port, + } + + +@app.get("/api/models") +async def get_models() -> dict: + return { + "defaults": catalog.defaults.model_dump(), + "models": {k: v.model_dump() for k, v in catalog.models.items()}, + } + + +@app.get("/api/status") +async def get_status() -> dict: + vllm, parakeet, magpie = await asyncio.gather( + check_vllm(settings), + check_parakeet(settings), + check_magpie(settings), + ) + current_key = _identify_current_model(vllm.get("current_model")) + return { + "configured": settings.configured, + "vllm": vllm, + "parakeet": parakeet, + "magpie": magpie, + "current_model_key": current_key, + "current_swap_job": swap_manager.current_job_id, + } + + +def _identify_current_model(repo: str | None) -> str | None: + if not repo: + return None + for key, m in catalog.models.items(): + if m.repo == repo: + return key + return None + + +class SwapRequest(BaseModel): + model_key: str + dry_run: bool = False + + +@app.post("/api/swap") +async def post_swap(req: SwapRequest) -> dict: + if not settings.configured and not req.dry_run: + raise HTTPException(503, "spark1 not configured") + try: + job = await swap_manager.trigger(req.model_key, dry_run=req.dry_run) + except KeyError: + raise HTTPException(404, f"unknown model: {req.model_key}") + except RuntimeError as e: + raise HTTPException(409, str(e)) + return {"job_id": job.id, "model_key": job.model_key, "state": job.state} + + +@app.get("/api/swap/{job_id}") +async def get_swap(job_id: str) -> dict: + job = swap_manager.get(job_id) + if job is None: + raise HTTPException(404, "no such job") + return { + "id": job.id, + "model_key": job.model_key, + "state": job.state, + "started_at": job.started_at, + "finished_at": job.finished_at, + "returncode": job.returncode, + "dry_run": job.dry_run, + "lines": job.lines, + } + + +@app.get("/api/swap/{job_id}/stream") 
+async def stream_swap(job_id: str): + job = swap_manager.get(job_id) + if job is None: + raise HTTPException(404, "no such job") + + async def gen(): + sent = 0 + while True: + n = len(job.lines) + if n > sent: + for line in job.lines[sent:n]: + payload = json.dumps({"line": line, "state": job.state}) + yield f"data: {payload}\n\n" + sent = n + if job.returncode is not None and sent >= len(job.lines): + payload = json.dumps({ + "state": job.state, + "returncode": job.returncode, + "finished_at": job.finished_at, + }) + yield f"event: done\ndata: {payload}\n\n" + return + await asyncio.sleep(0.4) + + return StreamingResponse(gen(), media_type="text/event-stream") + + +@app.post("/api/test-connection") +async def test_connection() -> dict: + """Probe both Sparks with a `hostname` command. Useful for the StartOS setup flow.""" + results: dict[str, dict] = {} + if settings.spark1_host: + rc, out, err = await ssh_run(settings.spark1_host, settings.spark1_user, "hostname && docker ps --format '{{.Names}}'", settings, timeout=10) + results["spark1"] = {"ok": rc == 0, "rc": rc, "stdout": out.strip(), "stderr": err.strip()} + else: + results["spark1"] = {"ok": False, "error": "not configured"} + if settings.spark2_host: + rc, out, err = await ssh_run(settings.spark2_host, settings.spark2_user, "hostname && docker ps --format '{{.Names}}'", settings, timeout=10) + results["spark2"] = {"ok": rc == 0, "rc": rc, "stdout": out.strip(), "stderr": err.strip()} + else: + results["spark2"] = {"ok": False, "error": "not configured"} + return results diff --git a/image/app/ssh.py b/image/app/ssh.py new file mode 100644 index 0000000..05b1162 --- /dev/null +++ b/image/app/ssh.py @@ -0,0 +1,91 @@ +"""Async wrappers around the system `ssh` client. + +We shell out rather than use Paramiko/asyncssh so that: + - Host key + auth behavior is identical to what a user would see at the shell. + - The same ssh config file (`~/.ssh/config`) and key files work in dev. 
+ - We don't pull in a heavy crypto dependency for the container image. +""" +from __future__ import annotations +import asyncio +from typing import AsyncIterator +from .config import Settings + + +def _base_args(settings: Settings) -> list[str]: + args = [ + "ssh", + "-o", "BatchMode=yes", + "-o", "StrictHostKeyChecking=accept-new", + "-o", "ServerAliveInterval=15", + "-o", "ServerAliveCountMax=4", + ] + if settings.ssh_key_path: + args += ["-i", settings.ssh_key_path] + if settings.ssh_known_hosts: + args += ["-o", f"UserKnownHostsFile={settings.ssh_known_hosts}"] + return args + + +async def ssh_run( + host: str, + user: str, + command: str, + settings: Settings, + timeout: float = 30.0, +) -> tuple[int, str, str]: + """Run a one-shot SSH command. Returns (rc, stdout, stderr).""" + args = _base_args(settings) + [f"{user}@{host}", command] + proc = await asyncio.create_subprocess_exec( + *args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout_b, stderr_b = await asyncio.wait_for(proc.communicate(), timeout=timeout) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + return 124, "", f"timeout after {timeout}s" + assert proc.returncode is not None + return proc.returncode, stdout_b.decode(errors="replace"), stderr_b.decode(errors="replace") + + +class StreamHandle: + """Holds the final returncode once an `ssh_stream()` generator completes.""" + + def __init__(self) -> None: + self.returncode: int | None = None + + +async def ssh_stream( + host: str, + user: str, + command: str, + settings: Settings, + handle: StreamHandle | None = None, +) -> AsyncIterator[str]: + """Yield stdout (and merged stderr) lines from a long-running SSH command. + + The generator may be aborted by closing it (e.g. `break` in `async for`); + the child SSH process is terminated and waited on in the `finally` block. 
+ """ + args = _base_args(settings) + [f"{user}@{host}", command] + proc = await asyncio.create_subprocess_exec( + *args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + ) + assert proc.stdout is not None + try: + async for raw in proc.stdout: + yield raw.decode(errors="replace").rstrip("\r\n") + finally: + if proc.returncode is None: + proc.terminate() + try: + await asyncio.wait_for(proc.wait(), timeout=5) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + if handle is not None: + handle.returncode = proc.returncode diff --git a/image/app/static/app.js b/image/app/static/app.js new file mode 100644 index 0000000..6fa0af5 --- /dev/null +++ b/image/app/static/app.js @@ -0,0 +1,195 @@ +// spark-control front-end +// - polls /api/status every 5s for current model + health +// - lists models from /api/models as cards +// - POST /api/swap to start a swap, then opens SSE /api/swap/{id}/stream + +const state = { + models: {}, + defaults: {}, + current_model_key: null, + swap_job_id: null, + swap_eventsource: null, + configured: true, +}; + +function el(sel) { return document.querySelector(sel); } +function $(sel) { return document.querySelectorAll(sel); } + +async function fetchJSON(url, opts) { + const r = await fetch(url, opts); + if (!r.ok) { + const text = await r.text().catch(() => ""); + throw new Error(`${r.status} ${r.statusText}: ${text}`); + } + return r.json(); +} + +function renderCards() { + const root = el("#cards"); + root.innerHTML = ""; + const keys = Object.keys(state.models); + for (const key of keys) { + const m = state.models[key]; + const isActive = key === state.current_model_key; + const isSwapping = !!state.swap_job_id; + const card = document.createElement("div"); + card.className = "card" + (isActive ? " active" : ""); + card.innerHTML = ` +
${m.display_name}
+
+ ${m.mode} + ${m.size_gb} GB + ${(m.capabilities || []).map(c => `${c}`).join("")} +
+
${m.repo}
+
+ + `; + root.appendChild(card); + } + for (const btn of $(".card .btn")) { + btn.addEventListener("click", () => triggerSwap(btn.dataset.key)); + } +} + +function renderCurrent(status) { + const c = el("#current"); + if (!status.configured) { + c.innerHTML = `not configured`; + return; + } + if (status.current_swap_job) { + c.innerHTML = `swap in progress`; + return; + } + const v = status.vllm || {}; + if (!v.ok) { + c.innerHTML = `vLLM unreachable`; + return; + } + const key = status.current_model_key; + const m = key ? state.models[key] : null; + const label = m ? m.display_name : (v.current_model || "(unknown)"); + c.innerHTML = `${label}`; +} + +function renderHealth(status) { + function setDot(id, ok) { + const item = el(id); + if (!item) return; + const dot = item.querySelector(".dot"); + dot.classList.remove("ok", "bad", "warn"); + if (ok === true) dot.classList.add("ok"); + else if (ok === false) dot.classList.add("bad"); + else dot.classList.add("warn"); + item.title = JSON.stringify(status[id.replace("#h-", "")] || {}, null, 2); + } + setDot("#h-vllm", status.vllm && status.vllm.ok); + setDot("#h-parakeet", status.parakeet && status.parakeet.ok); + setDot("#h-magpie", status.magpie && status.magpie.ok); + el("#updated").textContent = `updated ${new Date().toLocaleTimeString()}`; +} + +function renderBanner(status) { + el("#setup-banner").classList.toggle("hidden", !!status.configured); +} + +async function pollStatus() { + try { + const status = await fetchJSON("/api/status"); + state.current_model_key = status.current_model_key; + state.configured = status.configured; + renderBanner(status); + renderCurrent(status); + renderHealth(status); + if (status.current_swap_job && status.current_swap_job !== state.swap_job_id) { + attachToSwap(status.current_swap_job); + } else if (!status.current_swap_job && state.swap_job_id && !state.swap_eventsource) { + // someone else's swap finished; clear local + state.swap_job_id = null; + 
el("#swap-panel").classList.add("hidden"); + } + renderCards(); + } catch (e) { + console.error("status poll failed", e); + } +} + +async function loadModels() { + const data = await fetchJSON("/api/models"); + state.defaults = data.defaults || {}; + state.models = data.models || {}; +} + +async function triggerSwap(modelKey) { + if (state.swap_job_id) return; + try { + const r = await fetchJSON("/api/swap", { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ model_key: modelKey }), + }); + attachToSwap(r.job_id); + } catch (e) { + alert("Failed to start swap: " + e.message); + } +} + +function attachToSwap(jobId) { + if (state.swap_eventsource) { + state.swap_eventsource.close(); + state.swap_eventsource = null; + } + state.swap_job_id = jobId; + el("#swap-panel").classList.remove("hidden"); + el("#swap-log").textContent = ""; + el("#swap-state").textContent = "starting"; + + const es = new EventSource(`/api/swap/${jobId}/stream`); + state.swap_eventsource = es; + + es.onmessage = (ev) => { + try { + const d = JSON.parse(ev.data); + if (d.state) el("#swap-state").textContent = d.state; + if (d.line) appendLog(d.line); + } catch {} + }; + es.addEventListener("done", (ev) => { + try { + const d = JSON.parse(ev.data); + el("#swap-state").textContent = d.state + ` (rc=${d.returncode})`; + } catch {} + es.close(); + state.swap_eventsource = null; + state.swap_job_id = null; + setTimeout(() => { + el("#swap-panel").classList.add("hidden"); + pollStatus(); + }, 4000); + pollStatus(); + }); + es.onerror = () => { + // SSE drops happen on tab background; reconnect on next poll + es.close(); + state.swap_eventsource = null; + }; + + renderCards(); +} + +function appendLog(line) { + const log = el("#swap-log"); + log.textContent += line + "\n"; + log.scrollTop = log.scrollHeight; +} + +async function init() { + await loadModels(); + await pollStatus(); + setInterval(pollStatus, 5000); +} + +init(); diff --git 
a/image/app/static/index.html b/image/app/static/index.html new file mode 100644 index 0000000..91360ed --- /dev/null +++ b/image/app/static/index.html @@ -0,0 +1,51 @@ + + + + + + + spark-control + + + +
+
+ + spark-control +
+
+ connecting… +
+
+ +
+ + + + +
+ +
+
+ vLLM + Parakeet + Magpie +
+
+
+
+ + + + diff --git a/image/app/static/style.css b/image/app/static/style.css new file mode 100644 index 0000000..04f8ca5 --- /dev/null +++ b/image/app/static/style.css @@ -0,0 +1,170 @@ +:root { + --bg: #0a0a0d; + --surface: #15151a; + --surface-2: #1c1c22; + --border: #25252c; + --text: #e6e6ea; + --muted: #7e7e8a; + --accent: #4ade80; + --warn: #f59e0b; + --error: #ef4444; + --info: #60a5fa; + --radius: 10px; +} + +* { box-sizing: border-box; } + +html, body { margin: 0; padding: 0; } + +body { + background: var(--bg); + color: var(--text); + font: 15px/1.5 -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; + min-height: 100vh; + -webkit-font-smoothing: antialiased; +} + +.muted { color: var(--muted); } +.small { font-size: 13px; } +.hidden { display: none !important; } +.spacer { flex: 1; } + +.topbar { + position: sticky; + top: 0; + background: rgba(10, 10, 13, 0.85); + backdrop-filter: saturate(160%) blur(10px); + -webkit-backdrop-filter: saturate(160%) blur(10px); + border-bottom: 1px solid var(--border); + display: flex; + align-items: center; + gap: 16px; + padding: 12px 20px; + z-index: 10; +} +.brand { display: flex; align-items: center; gap: 10px; font-weight: 600; } +.logo-dot { width: 10px; height: 10px; border-radius: 50%; background: var(--accent); box-shadow: 0 0 12px var(--accent); } +.current { flex: 1; text-align: right; font-size: 14px; } +.current strong { color: var(--accent); } + +main { + max-width: 880px; + margin: 0 auto; + padding: 24px 20px 80px; +} + +.banner { + background: var(--surface); + border: 1px solid var(--warn); + color: var(--warn); + padding: 12px 16px; + border-radius: var(--radius); + margin-bottom: 16px; + font-size: 14px; +} +.banner em { font-style: normal; background: rgba(245, 158, 11, 0.15); padding: 2px 6px; border-radius: 4px; } + +.swap-panel { + background: var(--surface); + border: 1px solid var(--info); + border-radius: var(--radius); + padding: 14px 16px; + margin-bottom: 20px; +} 
+.swap-header { display: flex; align-items: center; gap: 10px; } +.swap-header #swap-title { font-weight: 600; color: var(--info); } +.spinner { + width: 14px; height: 14px; + border: 2px solid var(--info); + border-right-color: transparent; + border-radius: 50%; + animation: spin 0.8s linear infinite; +} +@keyframes spin { to { transform: rotate(360deg); } } + +.log { + background: #08080b; + border: 1px solid var(--border); + border-radius: 6px; + padding: 10px 12px; + margin: 10px 0 0; + font: 12px/1.55 ui-monospace, SFMono-Regular, "SF Mono", Menlo, monospace; + color: #c7c7d1; + max-height: 280px; + overflow: auto; + white-space: pre-wrap; + word-break: break-word; +} + +.cards { + display: grid; + gap: 14px; + grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); +} + +.card { + background: var(--surface); + border: 1px solid var(--border); + border-radius: var(--radius); + padding: 16px; + display: flex; + flex-direction: column; + gap: 12px; + transition: border-color 0.15s, transform 0.15s; +} +.card.active { + border-color: var(--accent); + box-shadow: 0 0 0 1px var(--accent) inset, 0 0 24px rgba(74, 222, 128, 0.08); +} +.card .name { font-weight: 600; font-size: 15px; } +.card .meta { display: flex; flex-wrap: wrap; gap: 6px; font-size: 12px; color: var(--muted); } +.tag { + background: var(--surface-2); + border: 1px solid var(--border); + padding: 2px 8px; + border-radius: 999px; + font-size: 11px; +} +.tag.mode-cluster { color: var(--info); border-color: rgba(96, 165, 250, 0.4); } +.tag.mode-solo { color: var(--accent); border-color: rgba(74, 222, 128, 0.4); } +.tag.cap { color: var(--muted); } + +.btn { + appearance: none; + border: 1px solid var(--border); + background: var(--surface-2); + color: var(--text); + padding: 8px 14px; + border-radius: 8px; + cursor: pointer; + font: inherit; + font-weight: 500; + transition: background 0.15s, border-color 0.15s, opacity 0.15s; +} +.btn:hover:not(:disabled) { background: #24242c; border-color: 
#34343c; } +.btn.primary { background: var(--accent); color: #052e16; border-color: var(--accent); } +.btn.primary:hover:not(:disabled) { background: #6ee19a; } +.btn:disabled { opacity: 0.45; cursor: not-allowed; } +.card.active .btn { background: rgba(74, 222, 128, 0.12); color: var(--accent); border-color: rgba(74, 222, 128, 0.4); } + +.footer { + margin-top: 28px; + padding-top: 16px; + border-top: 1px solid var(--border); + display: flex; + align-items: center; + gap: 14px; + flex-wrap: wrap; +} +.health { display: flex; gap: 14px; flex-wrap: wrap; } +.health-item { display: inline-flex; align-items: center; gap: 6px; font-size: 13px; color: var(--muted); } +.dot { width: 9px; height: 9px; border-radius: 50%; background: var(--muted); display: inline-block; } +.dot.ok { background: var(--accent); box-shadow: 0 0 8px rgba(74, 222, 128, 0.7); } +.dot.bad { background: var(--error); box-shadow: 0 0 8px rgba(239, 68, 68, 0.7); } +.dot.warn { background: var(--warn); } + +@media (max-width: 640px) { + .topbar { padding: 10px 14px; } + main { padding: 16px 14px 80px; } + .cards { grid-template-columns: 1fr; } +} diff --git a/image/app/swap.py b/image/app/swap.py new file mode 100644 index 0000000..07d400a --- /dev/null +++ b/image/app/swap.py @@ -0,0 +1,140 @@ +from __future__ import annotations +import asyncio +import uuid +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Optional + +from .config import Settings +from .models import Catalog, build_launch_command +from .ssh import ssh_run, ssh_stream, StreamHandle + + +READY_MARKER = "Application startup complete." 
+MAX_LINES = 500 + + +@dataclass +class SwapJob: + id: str + model_key: str + started_at: str + state: str = "starting" # starting|stopping|launching|tailing|ready|failed + lines: list[str] = field(default_factory=list) + returncode: Optional[int] = None + finished_at: Optional[str] = None + dry_run: bool = False + + def append(self, line: str) -> None: + self.lines.append(line) + if len(self.lines) > MAX_LINES: + del self.lines[: len(self.lines) - MAX_LINES] + + +class SwapManager: + def __init__(self, settings: Settings, catalog: Catalog) -> None: + self.settings = settings + self.catalog = catalog + self.lock = asyncio.Lock() + self.jobs: dict[str, SwapJob] = {} + self.current_job_id: Optional[str] = None + + def get(self, job_id: str) -> SwapJob | None: + return self.jobs.get(job_id) + + def reload_catalog(self, catalog: Catalog) -> None: + self.catalog = catalog + + async def trigger(self, model_key: str, *, dry_run: bool = False) -> SwapJob: + if model_key not in self.catalog.models: + raise KeyError(model_key) + if self.lock.locked(): + raise RuntimeError("A swap is already in progress") + job = SwapJob( + id=uuid.uuid4().hex[:8], + model_key=model_key, + started_at=datetime.now(timezone.utc).isoformat(), + dry_run=dry_run, + ) + self.jobs[job.id] = job + self.current_job_id = job.id + asyncio.create_task(self._run(job)) + return job + + async def _run(self, job: SwapJob) -> None: + async with self.lock: + try: + await self._do(job) + if job.state != "failed": + job.state = "ready" + job.returncode = 0 + except Exception as e: + job.append(f"[error] {type(e).__name__}: {e}") + job.state = "failed" + if job.returncode is None: + job.returncode = 1 + finally: + job.finished_at = datetime.now(timezone.utc).isoformat() + if self.current_job_id == job.id: + self.current_job_id = None + + async def _do(self, job: SwapJob) -> None: + model = self.catalog.models[job.model_key] + s = self.settings + + # Step 1: stop + job.state = "stopping" + stop_cmd = "cd 
~/spark-vllm-docker && ./launch-cluster.sh stop" + job.append(f"$ {stop_cmd}") + if not job.dry_run: + rc, out, err = await ssh_run(s.spark1_host, s.spark1_user, stop_cmd, s, timeout=180) + for line in (out + err).splitlines(): + job.append(line) + if rc != 0: + job.returncode = rc + job.state = "failed" + return + + # Step 2: launch + job.state = "launching" + launch = build_launch_command(job.model_key, model, self.catalog.defaults) + launch_cmd = f"cd ~/spark-vllm-docker && {launch}" + job.append(f"$ {launch_cmd}") + if job.dry_run: + return + rc, out, err = await ssh_run(s.spark1_host, s.spark1_user, launch_cmd, s, timeout=60) + for line in (out + err).splitlines(): + job.append(line) + if rc != 0: + job.returncode = rc + job.state = "failed" + return + + # Step 3: tail logs until the ready marker (or timeout) + job.state = "tailing" + tail_cmd = "docker logs -f --tail 50 vllm_node" + job.append(f"$ {tail_cmd}") + timeout = max(model.expected_ready_seconds * 2, 600) + handle = StreamHandle() + loop = asyncio.get_event_loop() + deadline = loop.time() + timeout + ready = False + + async def _tail() -> bool: + async for line in ssh_stream(s.spark1_host, s.spark1_user, tail_cmd, s, handle=handle): + job.append(line) + if READY_MARKER in line: + return True + if loop.time() > deadline: + return False + return False + + try: + ready = await asyncio.wait_for(_tail(), timeout=timeout + 30) + except asyncio.TimeoutError: + ready = False + + if not ready: + job.append(f"[error] did not see '{READY_MARKER}' within {timeout}s") + job.state = "failed" + job.returncode = 124 diff --git a/image/entrypoint.sh b/image/entrypoint.sh new file mode 100644 index 0000000..673c8e6 --- /dev/null +++ b/image/entrypoint.sh @@ -0,0 +1,20 @@ +#!/bin/sh +set -eu + +# Persist ssh state on the StartOS volume (mounted at /data when packaged). +SSH_DIR="${SSH_DIR:-/data/ssh}" +mkdir -p "$SSH_DIR" +chmod 700 "$SSH_DIR" + +if [ ! 
-f "$SSH_DIR/id_ed25519" ]; then + echo "[entrypoint] Generating ed25519 keypair for SSH to Sparks..." + ssh-keygen -t ed25519 -N "" -f "$SSH_DIR/id_ed25519" -C "spark-control@start9" >/dev/null +fi +chmod 600 "$SSH_DIR/id_ed25519" +chmod 644 "$SSH_DIR/id_ed25519.pub" +touch "$SSH_DIR/known_hosts" && chmod 600 "$SSH_DIR/known_hosts" + +export SSH_KEY_PATH="${SSH_KEY_PATH:-$SSH_DIR/id_ed25519}" +export SSH_KNOWN_HOSTS="${SSH_KNOWN_HOSTS:-$SSH_DIR/known_hosts}" + +exec uvicorn app.server:app --host 0.0.0.0 --port "${BIND_PORT:-9999}" diff --git a/image/pyproject.toml b/image/pyproject.toml new file mode 100644 index 0000000..6afbc0c --- /dev/null +++ b/image/pyproject.toml @@ -0,0 +1,22 @@ +[project] +name = "spark-control" +version = "0.1.0" +description = "Web UI to swap vLLM models on a DGX Spark cluster" +requires-python = ">=3.11" +dependencies = [ + "fastapi>=0.115", + "uvicorn[standard]>=0.32", + "pydantic>=2.9", + "pyyaml>=6.0", + "httpx>=0.27", +] + +[build-system] +requires = ["setuptools>=68"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +packages = ["app"] + +[tool.setuptools.package-data] +app = ["static/*", "../models.yaml"] diff --git a/known-issues.md b/known-issues.md new file mode 100644 index 0000000..d860f58 --- /dev/null +++ b/known-issues.md @@ -0,0 +1,40 @@ +# Known issues + +## magpie-tts crash loop (Spark 2) + +The `magpie-tts` container at `nvcr.io/nim/nvidia/magpie-tts-multilingual:latest` is in a restart loop and `:9000` is not reachable. **Status as of 2026-05-12: unfixed. UI surfaces a red dot.** + +**Root cause (from `docker logs magpie-tts`):** + +``` +nimlib.exceptions.ManifestDownloadError: Error downloading manifest: + I/O error Permission denied (os error 13) +``` + +The container exits 1 from `nimutils.download_models()` when fetching `nim/nvidia/magpie-tts-multilingual` model files from NGC. The "permission denied" is a local filesystem error — the container can't write the model cache where it expects to. 
+ +**To diagnose further:** + +```bash +ssh <spark2-user>@<spark2-host> +docker inspect magpie-tts | jq '.[].HostConfig.Mounts, .[].Config.Env' +# Look for: the mount path for the model cache, and whether NGC_API_KEY is set. +``` + +**Likely fixes (untried):** + +1. Chown the bind-mounted cache directory on Spark 2 to the UID the container runs as. +2. Set an `NGC_API_KEY` env var (NIM containers need this for non-public artifacts). +3. Confirm there's free disk space. + +## Qwen3.6-35B-A3B `--moe_backend=flashinfer_cutlass` may fail on launch + +This flag is Blackwell-specific. If vLLM in the container reports `unrecognized arguments: --moe_backend` or similar, edit `models.yaml` for `qwen36` and drop that flag. The swap UI does NOT auto-fallback in v0.1 — failure surfaces in the log stream. + +## Two SSH paths to Spark 1 from the laptop + +`ssh <user>@<spark1-host>` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `<spark1-host>.local`. Always use the `<spark1-host>.local` hostname or `<ssh-config-alias>`-style entries that ARE matched. + +## Older models in `models.yaml` + +The `qwen3-235b-fp8` and `qwen25-72b` catalog entries are conservative guesses for vLLM flags — they're on disk but were never the focus of this project. First launch of either may fail or be suboptimal; capture working flags here. diff --git a/models.yaml b/models.yaml new file mode 100644 index 0000000..c815c60 --- /dev/null +++ b/models.yaml @@ -0,0 +1,80 @@ +# spark-control model catalog +# +# Edit this file (or override at runtime via the StartOS "Edit Model Catalog" +# action) to add or change available models. 
+# +# Each model entry produces this command on Spark 1: +# cd ~/spark-vllm-docker +# ./launch-cluster.sh [--solo] -d exec vllm serve \ +# --port= --host= + +defaults: + port: 8888 + host: 0.0.0.0 + +models: + qwen3-vl: + display_name: "Qwen3-VL 235B (vision)" + repo: RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4 + size_gb: 135 + mode: cluster + capabilities: [vision, multilingual] + expected_ready_seconds: 300 + vllm_args: + - --gpu-memory-utilization=0.7 + - -tp=2 + - --distributed-executor-backend=ray + - --max-model-len=32768 + + gemma4: + display_name: "Gemma 4 31B" + repo: RedHatAI/gemma-4-31B-it-NVFP4 + size_gb: 23 + mode: solo + capabilities: [vision, reasoning, tools] + expected_ready_seconds: 240 + vllm_args: + - --gpu-memory-utilization=0.8 + - --max-model-len=32768 + - --reasoning-parser=gemma4 + - --tool-call-parser=gemma4 + - --enable-auto-tool-choice + + qwen36: + display_name: "Qwen3.6 35B-A3B (daily driver)" + repo: RedHatAI/Qwen3.6-35B-A3B-NVFP4 + size_gb: 20 + mode: solo + capabilities: [reasoning] + expected_ready_seconds: 300 + vllm_args: + - --gpu-memory-utilization=0.85 + - --max-model-len=65536 + - --reasoning-parser=qwen3 + - --moe_backend=flashinfer_cutlass + + qwen3-235b-fp8: + display_name: "Qwen3 235B-A22B FP8 (legacy)" + repo: Qwen/Qwen3-235B-A22B-FP8 + size_gb: 220 + mode: cluster + capabilities: [] + expected_ready_seconds: 360 + vllm_args: + - --gpu-memory-utilization=0.7 + - -tp=2 + - --distributed-executor-backend=ray + - --max-model-len=32768 + + qwen25-72b: + display_name: "Qwen2.5 72B (legacy)" + repo: Qwen/Qwen2.5-72B-Instruct + size_gb: 145 + mode: cluster + capabilities: [] + expected_ready_seconds: 360 + vllm_args: + - --gpu-memory-utilization=0.7 + - -tp=2 + - --distributed-executor-backend=ray + - --max-model-len=32768 diff --git a/runbook.md b/runbook.md new file mode 100644 index 0000000..9875b8c --- /dev/null +++ b/runbook.md @@ -0,0 +1,61 @@ +# spark-control runbook + +Operating notes for running and maintaining the 
cluster via spark-control. + +## Day-to-day + +- The UI lives at `http://<start9-hostname>.local:9999` once the StartOS package is installed and configured. +- Status auto-refreshes every 5 s. +- A swap takes 3–6 minutes depending on the model. Don't close the tab — but if you do, the swap continues; reopen and you'll re-attach to the log stream. + +## Adding a new model + +1. Add an entry to `models.yaml` (in the image source) or, post-install, via the "Edit Model Catalog" action in StartOS. +2. Confirm the weights are on the Spark: `ssh <spark1-user>@<spark1-hostname>.local 'ls ~/.cache/huggingface/hub/'`. If not, download with `./hf-download.sh <repo>` on Spark 1. +3. The new model appears in the UI on next refresh. + +## Manual swap fallback + +If the UI is unavailable and you need to swap by hand: + +```bash +ssh <spark1-user>@<spark1-hostname>.local +cd ~/spark-vllm-docker +./launch-cluster.sh stop +./launch-cluster.sh --solo -d exec vllm serve RedHatAI/gemma-4-31B-it-NVFP4 \ + --port 8888 --host 0.0.0.0 --gpu-memory-utilization 0.8 \ + --max-model-len 32768 --reasoning-parser gemma4 \ + --tool-call-parser gemma4 --enable-auto-tool-choice +docker logs -f vllm_node # wait for "Application startup complete." +``` + +## Diagnostics + +```bash +# Is vLLM serving? +curl -s http://<spark1-host>:8888/v1/models | jq . + +# Cluster status (containers up?) +ssh <spark1-user>@<spark1-hostname>.local 'cd ~/spark-vllm-docker && ./launch-cluster.sh status' + +# Tail current model's logs +ssh <spark1-user>@<spark1-hostname>.local 'docker logs --tail 200 -f vllm_node' + +# Parakeet +curl -s http://<spark2-host>:8000/health + +# Magpie (see known-issues.md) +curl -s http://<spark2-host>:9000/v1/health/ready +``` + +## Hard reset + +If launch-cluster.sh gets stuck: + +```bash +ssh <spark1-user>@<spark1-hostname>.local +cd ~/spark-vllm-docker +./launch-cluster.sh stop +docker ps -aq | xargs -r docker rm -f +# then relaunch your preferred model +```