v0.7.0 - Pre-flight launch validation (Test button on every model card)

validate.py:
- Builds the same args list a real swap would pass to 'vllm serve'
- SSHes into Spark 1 and runs vLLM's own argparse layer inside the running vllm_node container, WITHOUT initializing the engine
- Uses FlexibleArgumentParser (from vllm.utils.argparse_utils, with fallback to engine.arg_utils) + make_arg_parser — the exact same parser the 'vllm serve' CLI uses. An earlier attempt with a bare argparse.ArgumentParser was too strict: it rejected '--moe_backend' (underscore form), which the real CLI accepts via FlexibleArgumentParser's normalization
- Returns structured {ok, stage, error, cmd_args, launch_cmd} so the UI can surface the exact failure cause

Endpoint: POST /api/swap/{key}/validate. Cheap (~5s), no engine init, no disruption to the currently-loaded model.

Frontend: 'Test' button on every model card, inline result below the action row (green check or red detailed error). Result stays visible until the user reloads or clicks Test again.

Catches: typos in flag names, deprecated/removed flags after a vLLM upgrade, type mismatches. Does NOT catch runtime-only failures (Mamba block-size assertion, OOM at load, kernel-compat). ok=true is necessary but not sufficient for a successful launch; ok=false is a definitive 'don't bother running it'.
This commit is contained in:
Grant
2026-05-12 13:37:37 -05:00
parent 5827683a09
commit 6434b01a95
5 changed files with 198 additions and 3 deletions
+10
View File
@@ -22,6 +22,7 @@ from .services import docker_state, run_action, services_from_settings
from .ssh import ssh_run
from .swap import SwapManager
from .updates import UpdateManager, get_update_status
from .validate import validate_launch
from .wol import send_local_broadcast, send_via_peer
@@ -434,6 +435,15 @@ class SwapRequest(BaseModel):
dry_run: bool = False
@app.post("/api/swap/{key}/validate")
async def validate_swap(key: str) -> dict:
    """Dry-parse the launch command for model ``key`` with vLLM's argparse layer.

    No engine is started, so the check is cheap (~5 s) and leaves the
    currently-loaded model untouched. The verdict produced by
    ``validate_launch`` is returned to the client unchanged.
    """
    verdict = await validate_launch(key, catalog, settings)
    return verdict
@app.post("/api/swap")
async def post_swap(req: SwapRequest) -> dict:
if not settings.configured and not req.dry_run:
+33
View File
@@ -73,8 +73,10 @@ function renderCards() {
<button class="btn ${isActive ? '' : 'primary'}" data-swap-key="${key}" ${isActive || isSwapping ? 'disabled' : ''}>
${isActive ? 'Current' : 'Switch to this'}
</button>
<button class="btn test-btn" data-test-key="${key}" title="Pre-flight check the launch command without starting the engine">Test</button>
<button class="btn adv-btn" data-adv-key="${key}" title="Advanced settings">Advanced</button>
</div>
<div class="test-result hidden" data-test-result-for="${key}"></div>
`;
root.appendChild(card);
}
@@ -84,6 +86,37 @@ function renderCards() {
for (const btn of root.querySelectorAll('[data-adv-key]')) {
btn.addEventListener('click', () => openAdvanced(btn.dataset.advKey));
}
for (const btn of root.querySelectorAll('[data-test-key]')) {
btn.addEventListener('click', () => testLaunch(btn.dataset.testKey, btn));
}
}
/**
 * Run the pre-flight validation for one model card and show the verdict inline.
 *
 * POSTs to /api/swap/{key}/validate and renders the outcome into the card's
 * `[data-test-result-for]` element: a success mark when the response has `ok`
 * set, otherwise the reported `error` (plus `stage`, when present). The button
 * is disabled and relabelled while the request is in flight and is always
 * restored afterwards, whether the request succeeded or threw.
 *
 * @param {string} key - Model key of the card being tested.
 * @param {HTMLButtonElement} btn - The Test button that was clicked.
 * @returns {Promise<void>}
 */
async function testLaunch(key, btn) {
  // The key ends up inside a quoted attribute selector; escape it so a key
  // containing a quote or backslash cannot make querySelector throw.
  const resultEl = document.querySelector(`[data-test-result-for="${CSS.escape(key)}"]`);
  if (!resultEl) return;

  const originalText = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Testing…';

  // Reset any previous verdict before showing the in-progress message.
  resultEl.classList.remove('hidden', 'ok', 'fail');
  resultEl.innerHTML = '<span class="muted small">Checking launch args against vLLM\'s parser…</span>';

  try {
    const r = await fetchJSON(`/api/swap/${encodeURIComponent(key)}/validate`, { method: 'POST' });
    if (r.ok) {
      resultEl.classList.add('ok');
      resultEl.innerHTML = `<span class="ok-mark">✓</span> Launch args parse OK. <span class="muted small">(Doesn't guarantee runtime success — only catches argparse-level issues.)</span>`;
    } else {
      resultEl.classList.add('fail');
      const err = escapeHtml(r.error || 'unknown error');
      const stage = r.stage ? ` <span class="muted small">(${escapeHtml(r.stage)})</span>` : '';
      resultEl.innerHTML = `<span class="fail-mark">✗</span> Would fail: ${err}${stage}`;
    }
  } catch (e) {
    // A rejection is not guaranteed to be an Error; fall back to its string
    // form so the user never sees "undefined".
    const detail = (e && e.message) ? e.message : String(e);
    resultEl.classList.add('fail');
    resultEl.innerHTML = `<span class="fail-mark">✗</span> Test failed: ${escapeHtml(detail)}`;
  } finally {
    btn.disabled = false;
    btn.textContent = originalText;
  }
}
function renderCurrent(status) {
+16 -1
View File
@@ -701,9 +701,24 @@ main {
.card.active .btn { background: rgba(74, 222, 128, 0.12); color: var(--accent); border-color: rgba(74, 222, 128, 0.4); }
.card-actions { display: flex; gap: 6px; }
.card-actions .btn.primary { flex: 1; }
.card .adv-btn { padding: 8px 12px; font-size: 12px; }
.card .adv-btn,
.card .test-btn { padding: 8px 12px; font-size: 12px; }
.card .custom-pill { color: var(--info); border-color: rgba(96, 165, 250, 0.4); }
/* Inline pre-flight result panel rendered beneath a card's action row.
   This base rule is the neutral look used while a check is in progress. */
.test-result {
font-size: 12px;
line-height: 1.45;
padding: 8px 10px;
border-radius: 5px;
margin-top: 4px;
border: 1px solid var(--border);
background: var(--surface-2);
}
/* Verdict variants: testLaunch() adds .ok or .fail once the check returns. */
.test-result.ok { border-color: rgba(74, 222, 128, 0.4); background: rgba(74, 222, 128, 0.04); }
/* word-break keeps long unbroken error text from overflowing the card. */
.test-result.fail { border-color: rgba(239, 68, 68, 0.45); background: rgba(239, 68, 68, 0.06); word-break: break-word; }
/* Leading check / cross glyphs inside the verdict text. */
.test-result .ok-mark { color: var(--accent); font-weight: 600; }
.test-result .fail-mark { color: var(--error); font-weight: 600; }
.footer {
margin-top: 28px;
padding-top: 16px;
+137
View File
@@ -0,0 +1,137 @@
"""Pre-flight validation of a proposed vLLM launch command.
Runs vLLM's own argparse layer (the combined `vllm serve` parser, falling back to
EngineArgs' CLI args alone) inside the vllm_node container WITHOUT starting the
engine. Catches:
* unknown flag names (typos)
* bad types / values that argparse rejects
* deprecated flags removed in the installed vLLM version
Does NOT catch (these surface only during real engine init):
* model-architecture-specific constraints (e.g. Qwen3.6 Mamba block_size)
* OOM at weight-loading time
* Triton / CUDA-kernel compatibility errors
A pre-flight check that returns "ok" is therefore NOT a guarantee — but a
"failed" verdict is a definitive 'don't bother with the real swap'.
"""
from __future__ import annotations
import json
import shlex
from typing import Any
from .config import Settings
from .models import Catalog, build_launch_command
from .ssh import ssh_run
# Source of a standalone Python program that validate_launch() hands to
# `python3 -c` inside the vllm_node container. It reads a JSON list of CLI args
# from stdin and parses them with the same combined parser vLLM uses for
# `vllm serve` (engine args + server args + frontend args); if that parser
# cannot be built it falls back to EngineArgs' CLI args alone. It prints one
# JSON line on stdout: {"ok": true, ...} or {"ok": false, "stage": ..., "error": ...}.
# The text below is runtime data, not module code: every character is executed
# remotely as written.
_VALIDATOR_SCRIPT = r"""
import argparse, json, sys
# Mirror what `vllm serve` does internally: FlexibleArgumentParser (which is
# more lenient about dashes vs underscores) wrapped with make_arg_parser
# (which adds engine + server + frontend args).
parser = None
try:
# Newer vLLM path
from vllm.utils.argparse_utils import FlexibleArgumentParser
except Exception:
try:
# Older fallback
from vllm.engine.arg_utils import FlexibleArgumentParser
except Exception:
FlexibleArgumentParser = argparse.ArgumentParser # type: ignore
try:
from vllm.entrypoints.openai.cli_args import make_arg_parser
parser = make_arg_parser(FlexibleArgumentParser(add_help=False))
except Exception:
pass
if parser is None:
try:
from vllm.engine.arg_utils import EngineArgs
parser = FlexibleArgumentParser(add_help=False)
EngineArgs.add_cli_args(parser)
except Exception as e:
print(json.dumps({"ok": False, "stage": "import", "error": f"{type(e).__name__}: {e}"}))
sys.exit(0)
class _ArgError(Exception):
pass
def _err(message):
raise _ArgError(message)
parser.error = _err # capture argparse errors instead of sys.exit(2)
try:
raw = sys.stdin.read()
arglist = json.loads(raw)
ns = parser.parse_args(arglist)
print(json.dumps({"ok": True, "model": getattr(ns, "model", None)}))
except _ArgError as e:
print(json.dumps({"ok": False, "stage": "parse", "error": str(e)}))
except SystemExit as e:
print(json.dumps({"ok": False, "stage": "parse", "error": f"argparse exit {e.code}"}))
except Exception as e:
print(json.dumps({"ok": False, "stage": "parse", "error": f"{type(e).__name__}: {e}"}))
"""
def _vllm_arg_list(key: str, model_def, catalog: Catalog) -> list[str]:
    """Derive the argv that the validator should parse for model *key*.

    The launch command produced by ``build_launch_command`` has the shape
    ``./launch-cluster.sh [--solo] -d exec vllm serve <repo> <args...>``.
    Everything up to and including ``serve`` is discarded, and the positional
    repo is re-expressed as ``--model=<repo>`` ahead of the remaining args.

    Returns an empty list when the command has no ``serve`` token or nothing
    follows it.
    """
    launch = build_launch_command(key, model_def, catalog.defaults)
    words = shlex.split(launch)
    try:
        serve_at = words.index("serve")
    except ValueError:
        # No `serve` token at all: nothing we know how to validate.
        return []
    tail = words[serve_at + 1 :]
    if not tail:
        return []
    # First token after `serve` is the repo; the rest are the flags.
    repo, *flags = tail
    return [f"--model={repo}", *flags]
async def validate_launch(key: str, catalog: Catalog, settings: Settings) -> dict:
    """Pre-flight check the launch command for model *key* with vLLM's parser.

    Builds the args a real swap would pass to ``vllm serve``, pipes them as a
    JSON list over SSH into ``python3 -c <_VALIDATOR_SCRIPT>`` inside the
    ``vllm_node`` container, and returns the validator's verdict.

    Args:
        key: Model key to look up in ``catalog.models``.
        catalog: Model catalog; ``models`` and ``defaults`` are read.
        settings: Provides ``spark1_host`` / ``spark1_user`` and is forwarded
            to ``ssh_run``.

    Returns:
        A dict with at least ``ok``. Failures also carry ``stage`` (one of
        ``lookup``, ``config``, ``build``, ``ssh``, ``decode``, or a stage
        reported by the validator itself) and ``error``. Once the args list
        has been built, ``cmd_args`` and ``launch_cmd`` are attached too so
        the UI can show exactly what was checked.
    """
    if key not in catalog.models:
        return {"ok": False, "stage": "lookup", "error": f"unknown model: {key}"}
    if not settings.spark1_host or not settings.spark1_user:
        return {"ok": False, "stage": "config", "error": "spark1 not configured"}

    model = catalog.models[key]
    arg_list = _vllm_arg_list(key, model, catalog)
    if not arg_list:
        return {"ok": False, "stage": "build", "error": "failed to build args list"}

    # Diagnostic context attached to every verdict from here on.
    launch_cmd = build_launch_command(key, model, catalog.defaults)

    # The args travel as JSON on the validator's stdin, so only the JSON blob
    # and the script need shell quoting. printf '%s' is used instead of echo
    # because echo's treatment of backslashes differs between shells and could
    # corrupt JSON escapes such as \" or \n.
    cmd = (
        f"printf '%s' {shlex.quote(json.dumps(arg_list))} | "
        f"docker exec -i vllm_node python3 -c {shlex.quote(_VALIDATOR_SCRIPT)}"
    )
    rc, out, err = await ssh_run(
        settings.spark1_host, settings.spark1_user, cmd, settings, timeout=20
    )

    stdout = out.strip()
    if rc != 0 and not stdout:
        return {
            "ok": False,
            "stage": "ssh",
            "error": err.strip() or f"rc={rc}",
            "cmd_args": arg_list,
            "launch_cmd": launch_cmd,
        }

    # Only the last stdout line is the verdict; anything printed before it is
    # ignored.
    last = stdout.splitlines()[-1] if stdout else ""
    try:
        parsed: Any = json.loads(last)
    except json.JSONDecodeError:
        parsed = None

    if isinstance(parsed, dict):
        result: dict[str, Any] = parsed
    else:
        # Covers both unparseable output and valid JSON that is not an object
        # (e.g. a bare number), which could not carry the fields added below.
        result = {
            "ok": False,
            "stage": "decode",
            "error": "validator did not return JSON",
            "raw": out[-500:],
        }

    result["cmd_args"] = arg_list
    result["launch_cmd"] = launch_cmd
    return result