v0.3.0 - Hardware dashboard + knob context + Explain context + Open WebUI link
Hardware dashboard:
- New hardware.py module: SSH probes each Spark for hostname, uptime, load+cores, RAM, disk, GPU (name, util, temp, power) + per-process GPU memory sum
- DGX Spark uses unified memory (nvidia-smi memory.total returns N/A), so fall back to the per-process compute memory sum and compute the usage fraction against system RAM. Such readings are marked with gpu_unified_memory=true.
- 4s TTL cache in HardwareProbe to avoid hammering the Sparks with SSH probes
- /api/hardware returns per-Spark snapshot
- UI: 'Spark hardware' section at the top with per-Spark cards (CPU load, RAM, GPU mem (unified), GPU util + temp + power, disk) — bars with warn threshold styling
- Polls every 8s
Knob context (tied to live hardware):
- Each Advanced knob now shows plain-English help text
- 'GPU memory %' shows '~N GB allocated · ~M GB left for OS/buffers' computed from actual Spark RAM
- 'Max context' shows '~N pages of text'
- Toggles show tradeoff descriptions
Explain context:
- '✨ Explain context' button on the update banner
- /api/explain-updates GET: forwards pending commits to the loaded vLLM model and streams its response back as SSE
- Renders into an expandable 'Explained by the loaded LLM' section under Pending commits
- Reasoning tokens shown italicized when the model emits them
Open WebUI integration:
- New 'Open WebUI URL' optional field in Configure Sparks
- /api/config exposes it; UI shows 'Open chat ↗' button in the top bar if set
Downloads:
- Third radio option: Spark 1 only / Spark 2 only / Both Sparks
- Backend picks SSH target based on mode
- HF repo link icon next to the input
- Helper line about NVFP4 for Blackwell
Model cards:
- Repo name is now a clickable link to its Hugging Face page
Package: bump 0.3.0:0
This commit is contained in:
+85
-1
@@ -11,6 +11,7 @@ from typing import Literal
|
||||
|
||||
from .config import Settings
|
||||
from .download import DownloadManager
|
||||
from .hardware import HardwareProbe
|
||||
from .health import check_magpie, check_parakeet, check_vllm
|
||||
from .models import load_catalog
|
||||
from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs
|
||||
@@ -25,6 +26,7 @@ catalog = load_catalog(settings.models_yaml)
|
||||
swap_manager = SwapManager(settings, catalog)
|
||||
download_manager = DownloadManager(settings)
|
||||
update_manager = UpdateManager(settings)
|
||||
hardware_probe = HardwareProbe(settings)
|
||||
|
||||
app = FastAPI(title="spark-control", version="0.1.0")
|
||||
|
||||
@@ -44,6 +46,7 @@ async def get_config() -> dict:
|
||||
"spark1_host": settings.spark1_host,
|
||||
"spark2_host": settings.spark2_host,
|
||||
"vllm_port": settings.vllm_port,
|
||||
"open_webui_url": settings.open_webui_url or None,
|
||||
}
|
||||
|
||||
|
||||
@@ -116,6 +119,12 @@ async def del_model(key: str) -> dict:
|
||||
return {"ok": True, "key": key}
|
||||
|
||||
|
||||
@app.get("/api/hardware")
async def get_hardware() -> dict:
    """Return the per-Spark hardware snapshot (RAM, disk, GPU mem + util, CPU load, uptime).

    The work is delegated entirely to the module-level ``hardware_probe``;
    this handler only awaits its ``fetch()`` result and hands it back.
    """
    snapshot = await hardware_probe.fetch()
    return snapshot
|
||||
|
||||
|
||||
@app.get("/api/services")
|
||||
async def get_services() -> dict:
|
||||
"""Lifecycle state of always-on support services (Parakeet, Magpie, …).
|
||||
@@ -297,7 +306,7 @@ async def stream_swap(job_id: str):
|
||||
|
||||
class DownloadRequest(BaseModel):
    # Request body for POST /api/download.

    # Repository to download (presumably a Hugging Face repo id — confirm
    # against DownloadManager).
    repo: str
    # Where the download runs: Spark 1 only, Spark 2 only, or both Sparks
    # ("cluster"). The field used to be declared twice; the earlier
    # Literal["solo", "cluster"] declaration was dead because the later one
    # overrode it, so only the effective declaration is kept.
    mode: Literal["spark1", "spark2", "cluster"] = "spark1"
|
||||
|
||||
|
||||
@app.post("/api/download")
|
||||
@@ -376,6 +385,81 @@ async def get_updates() -> dict:
|
||||
return await get_update_status(settings)
|
||||
|
||||
|
||||
def _explain_sse_frame(payload: dict, event: str = "") -> str:
    """Serialize *payload* as one Server-Sent Events frame.

    When *event* is non-empty the frame is a named event (``event: <name>``);
    otherwise it is a plain ``data:`` message.
    """
    header = f"event: {event}\n" if event else ""
    return f"{header}data: {json.dumps(payload)}\n\n"


def _explain_error_response(message: str) -> StreamingResponse:
    """Build an SSE response that emits a single ``done`` event carrying *message* as the error."""

    async def one_shot():
        yield _explain_sse_frame({"error": message}, event="done")

    return StreamingResponse(one_shot(), media_type="text/event-stream")


def _explain_delta_frames(raw: str) -> list:
    """Translate one upstream chat-completion chunk (JSON text) into SSE frames.

    Returns an empty list for chunks that are not valid JSON, are not JSON
    objects, or carry no choices. Reasoning text is emitted before content so
    that a delta carrying both loses neither.
    """
    try:
        chunk = json.loads(raw)
    except json.JSONDecodeError:
        return []
    if not isinstance(chunk, dict):
        return []
    choices = chunk.get("choices") or []
    if not choices:
        return []
    delta = choices[0].get("delta") or {}

    frames = []
    # The delta key for reasoning tokens differs between vLLM versions
    # ("reasoning" vs "reasoning_content") — accept either.
    reasoning = delta.get("reasoning") or delta.get("reasoning_content")
    if reasoning:
        frames.append(_explain_sse_frame({"reasoning": reasoning}))
    text = delta.get("content")
    if text:
        frames.append(_explain_sse_frame({"content": text}))
    return frames


@app.get("/api/explain-updates")
async def explain_updates():
    """Stream a layman's explanation of the pending commits from the currently-loaded vLLM model.

    The response is always ``text/event-stream``:

    * plain ``data:`` frames carry ``{"content": ...}`` or ``{"reasoning": ...}``
      pieces as the model produces them, or ``{"error": ...}`` if the upstream
      request fails mid-stream;
    * a final ``done`` event closes the stream. It carries ``{"error": ...}``
      when nothing could be started (update status failed, no model loaded,
      no pending commits) and ``{"ok": <bool>}`` otherwise, where ``ok`` is
      false if the upstream request failed.
    """
    import httpx

    info = await get_update_status(settings)
    if not info.get("ok"):
        return _explain_error_response(info.get("error", "unknown"))

    vllm = await check_vllm(settings)
    if not vllm.get("ok") or not vllm.get("current_model"):
        return _explain_error_response("no vLLM model loaded — swap to a model first")

    # Tolerate a missing or null "log" entry instead of crashing on join.
    commits = "\n".join(info.get("log") or [])
    if not commits.strip():
        return _explain_error_response("no pending commits")

    prompt = (
        "You are reviewing pending git commits to `eugr/spark-vllm-docker`, an upstream community project that "
        "orchestrates vLLM on dual NVIDIA DGX Spark hardware (Blackwell GPUs, cluster via Ray, recipes per model). "
        "The reader has a setup running models like Qwen3.6-35B-A3B-NVFP4 (daily driver, solo), Qwen3-VL 235B (cluster), "
        "and Gemma 4 31B. The reader is technically literate but is NOT a vLLM expert.\n\n"
        "For the commit list below: give a short overall verdict (Apply / Optional / Skip and why), then a brief "
        "bullet per commit grouping similar ones. Call out anything that would break a working setup or that "
        "requires re-downloading models. Avoid jargon. ~250 words max.\n\n"
        f"Pending commits:\n{commits}"
    )

    async def gen():
        failed = False
        try:
            request_body = {
                "model": vllm["current_model"],
                "stream": True,
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": 600,
                "temperature": 0.4,
            }
            # Generous overall timeout for slow generation, short connect timeout.
            timeout = httpx.Timeout(300.0, connect=5.0)
            async with httpx.AsyncClient(timeout=timeout) as client:
                async with client.stream(
                    "POST",
                    f"{vllm['base_url']}/chat/completions",
                    json=request_body,
                ) as resp:
                    resp.raise_for_status()
                    async for line in resp.aiter_lines():
                        # SSE allows "data:" with or without a following space.
                        if not line.startswith("data:"):
                            continue
                        data = line[5:].strip()
                        if data == "[DONE]":
                            break
                        for frame in _explain_delta_frames(data):
                            yield frame
        except Exception as e:
            # Top-level boundary of the stream: report the failure to the
            # client rather than dropping the connection without explanation.
            failed = True
            yield _explain_sse_frame({"error": f"{type(e).__name__}: {e}"})
        # Always terminate with a "done" event, but do not claim success
        # after an upstream failure.
        yield _explain_sse_frame({"ok": not failed}, event="done")

    return StreamingResponse(gen(), media_type="text/event-stream")
|
||||
|
||||
|
||||
class UpdateRequest(BaseModel):
    # Request body for applying an update.

    # Either "solo" or "cluster"; "cluster" is used when the caller omits it.
    mode: Literal["solo", "cluster"] = "cluster"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user