v0.3.0 - Hardware dashboard + knob context + Explain context + Open WebUI link

Hardware dashboard:
- New hardware.py module: probes each Spark over SSH for hostname, uptime, load + cores, RAM, disk, GPU (name, util, temp, power), plus the sum of per-process GPU memory
- DGX Spark uses unified memory (nvidia-smi memory.total returns N/A), so the probe falls back to summing per-process compute memory and computes the fraction against system RAM. Such snapshots are marked with gpu_unified_memory=true.
- 4s TTL cache in HardwareProbe to avoid hammering the Sparks
- /api/hardware returns a per-Spark snapshot
- UI: 'Spark hardware' section at the top with per-Spark cards (CPU load, RAM, GPU mem (unified), GPU util + temp + power, disk) — bars with warn-threshold styling
- Polls every 8s

Knob context (tied to live hardware):
- Each Advanced knob now shows plain-English help text
- 'GPU memory %' shows '~N GB allocated · ~M GB left for OS/buffers', computed from actual Spark RAM
- 'Max context' shows '~N pages of text'
- Toggles show tradeoff descriptions

Explain context:
- '✨ Explain context' button on the update banner
- GET /api/explain-updates: forwards the pending commits to the loaded vLLM model and streams its response back as SSE
- Renders into an expandable 'Explained by the loaded LLM' section under Pending commits
- Reasoning tokens are shown italicized when the model emits them

Open WebUI integration:
- New optional 'Open WebUI URL' field in Configure Sparks
- /api/config exposes it; the UI shows an 'Open chat ↗' button in the top bar if it is set

Downloads:
- Third radio option: Spark 1 only / Spark 2 only / Both Sparks
- Backend picks the SSH target based on mode
- HF repo link icon next to the input
- Helper line about NVFP4 for Blackwell

Model cards:
- Repo name is now a clickable link to its Hugging Face page

Package: bump 0.3.0:0
2026-05-12 12:00:15 -05:00
parent c6da6b0784
commit 64ce0fca10
11 changed files with 609 additions and 11 deletions
@@ -11,6 +11,7 @@ from typing import Literal

 from .config import Settings
 from .download import DownloadManager
+from .hardware import HardwareProbe
 from .health import check_magpie, check_parakeet, check_vllm
 from .models import load_catalog
 from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs
@@ -25,6 +26,7 @@ catalog = load_catalog(settings.models_yaml)
 swap_manager = SwapManager(settings, catalog)
 download_manager = DownloadManager(settings)
 update_manager = UpdateManager(settings)
+hardware_probe = HardwareProbe(settings)

 app = FastAPI(title="spark-control", version="0.1.0")

@@ -44,6 +46,7 @@ async def get_config() -> dict:
        "spark1_host": settings.spark1_host,
        "spark2_host": settings.spark2_host,
        "vllm_port": settings.vllm_port,
+        "open_webui_url": settings.open_webui_url or None,
    }


@@ -116,6 +119,12 @@ async def del_model(key: str) -> dict:
    return {"ok": True, "key": key}


+@app.get("/api/hardware")
+async def get_hardware() -> dict:
+    """Return the current per-Spark hardware snapshot.
+
+    The snapshot (RAM, disk, GPU memory + utilisation, CPU load, uptime) is
+    produced by the module-level ``hardware_probe``; this route only awaits it
+    and passes the result through.
+    """
+    snapshot = await hardware_probe.fetch()
+    return snapshot
+
+
@app.get("/api/services")
 async def get_services() -> dict:
    """Lifecycle state of always-on support services (Parakeet, Magpie, …).
@@ -297,7 +306,7 @@ async def stream_swap(job_id: str):

 class DownloadRequest(BaseModel):
+    # Presumably the request body for POST /api/download (the route that follows).
+    # `mode` selects the download target: "spark1", "spark2" or "cluster"; the
+    # commit notes describe these as Spark 1 only / Spark 2 only / Both Sparks.
+    # NOTE(review): "solo" is no longer an accepted value here, while
+    # UpdateRequest still uses "solo"/"cluster" — confirm no caller of
+    # /api/download still sends "solo".
    repo: str
-    mode: Literal["solo", "cluster"] = "solo"
+    mode: Literal["spark1", "spark2", "cluster"] = "spark1"


@app.post("/api/download")
@@ -376,6 +385,81 @@ async def get_updates() -> dict:
    return await get_update_status(settings)


+@app.get("/api/explain-updates")
+async def explain_updates():
+    """Stream a layman's explanation of the pending commits from the currently-loaded vLLM model.
+
+    The response is always a ``text/event-stream`` made of these events:
+
+    * ``data: {"content": ...}``   - a piece of the model's answer.
+    * ``data: {"reasoning": ...}`` - a piece of the model's reasoning, when it emits any.
+    * ``data: {"error": ...}``     - the request to the model failed.
+    * ``event: done``              - terminal event. Carries ``{"error": ...}`` when a
+      precondition failed (update status unavailable, no model loaded, nothing
+      pending) and ``{"ok": true}`` once the upstream stream has ended.
+    """
+    import httpx
+
+    def error_stream(message):
+        # Precondition failures are reported as a single terminal `done` event.
+        async def one_shot():
+            yield f"event: done\ndata: {json.dumps({'error': message})}\n\n"
+        return StreamingResponse(one_shot(), media_type="text/event-stream")
+
+    info = await get_update_status(settings)
+    if not info.get("ok"):
+        return error_stream(info.get("error", "unknown"))
+
+    vllm = await check_vllm(settings)
+    if not vllm.get("ok") or not vllm.get("current_model"):
+        return error_stream("no vLLM model loaded — swap to a model first")
+
+    # `or []` also covers a "log" key that is present but None.
+    commits = "\n".join(info.get("log") or [])
+    if not commits.strip():
+        return error_stream("no pending commits")
+
+    prompt = (
+        "You are reviewing pending git commits to `eugr/spark-vllm-docker`, an upstream community project that "
+        "orchestrates vLLM on dual NVIDIA DGX Spark hardware (Blackwell GPUs, cluster via Ray, recipes per model). "
+        "The reader has a setup running models like Qwen3.6-35B-A3B-NVFP4 (daily driver, solo), Qwen3-VL 235B (cluster), "
+        "and Gemma 4 31B. The reader is technically literate but is NOT a vLLM expert.\n\n"
+        "For the commit list below: give a short overall verdict (Apply / Optional / Skip and why), then a brief "
+        "bullet per commit grouping similar ones. Call out anything that would break a working setup or that "
+        "requires re-downloading models. Avoid jargon. ~250 words max.\n\n"
+        f"Pending commits:\n{commits}"
+    )
+    payload = {
+        "model": vllm["current_model"],
+        "stream": True,
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": 600,
+        "temperature": 0.4,
+    }
+
+    async def gen():
+        try:
+            # Generous total timeout for a slow generation, short connect timeout.
+            async with httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=5.0)) as c:
+                async with c.stream("POST", f"{vllm['base_url']}/chat/completions", json=payload) as r:
+                    r.raise_for_status()
+                    async for line in r.aiter_lines():
+                        if not line.startswith("data: "):
+                            continue
+                        data = line[6:].strip()
+                        if data == "[DONE]":
+                            break
+                        # Only the parse is guarded; a malformed chunk is skipped.
+                        try:
+                            chunk = json.loads(data)
+                        except json.JSONDecodeError:
+                            continue
+                        if not isinstance(chunk, dict):
+                            continue
+                        choices = chunk.get("choices") or []
+                        if not choices:
+                            continue
+                        delta = choices[0].get("delta") or {}
+                        # Some vLLM releases stream reasoning as `reasoning_content`
+                        # rather than `reasoning`; accept either name.
+                        reasoning = delta.get("reasoning") or delta.get("reasoning_content")
+                        text = delta.get("content")
+                        # A delta may carry both fields; forward both, reasoning first.
+                        if reasoning:
+                            yield f"data: {json.dumps({'reasoning': reasoning})}\n\n"
+                        if text:
+                            yield f"data: {json.dumps({'content': text})}\n\n"
+        except Exception as e:
+            # Top-level boundary of the stream: report the failure to the client
+            # as an event instead of dropping the connection.
+            yield f"data: {json.dumps({'error': f'{type(e).__name__}: {e}'})}\n\n"
+        # NOTE(review): `done` still reports ok=true after an error event above;
+        # kept as-is because the UI's handling of this payload is not visible here.
+        yield f"event: done\ndata: {json.dumps({'ok': True})}\n\n"
+
+    return StreamingResponse(gen(), media_type="text/event-stream")
+
+
 class UpdateRequest(BaseModel):
    mode: Literal["solo", "cluster"] = "cluster"