v0.7.0 - Pre-flight launch validation (Test button on every model card)
validate.py:
- Builds the same args list a real swap would pass to 'vllm serve'
- SSHes into Spark 1 and runs vLLM's own argparse layer inside the running vllm_node container, WITHOUT initializing the engine
- Uses FlexibleArgumentParser (from vllm.utils.argparse_utils, with fallback to engine.arg_utils) + make_arg_parser — the exact same parser the 'vllm serve' CLI uses. Earlier attempt with bare argparse.ArgumentParser was too strict (rejected '--moe_backend' with underscore that the real CLI accepts via FlexibleArgumentParser's normalization)
- Returns structured {ok, stage, error, cmd_args, launch_cmd} so the UI can surface the exact failure cause
Endpoint: POST /api/swap/{key}/validate. Cheap (~5s), no engine init, no disruption to the currently-loaded model.
Frontend: 'Test' button on every model card, inline result below the action row (green check or red detailed error). Result stays visible until the user reloads or clicks Test again.
Catches: typos in flag names, deprecated/removed flags after a vLLM upgrade, type mismatches. Does NOT catch runtime-only failures (Mamba block-size assertion, OOM at load, kernel-compat). ok=true is necessary-but-not-sufficient; ok=false is definitive 'don't bother running it'.
This commit is contained in:
@@ -22,6 +22,7 @@ from .services import docker_state, run_action, services_from_settings
|
||||
from .ssh import ssh_run
|
||||
from .swap import SwapManager
|
||||
from .updates import UpdateManager, get_update_status
|
||||
from .validate import validate_launch
|
||||
from .wol import send_local_broadcast, send_via_peer
|
||||
|
||||
|
||||
@@ -434,6 +435,15 @@ class SwapRequest(BaseModel):
|
||||
dry_run: bool = False
|
||||
|
||||
|
||||
@app.post("/api/swap/{key}/validate")
async def validate_swap(key: str) -> dict:
    """Dry-run the launch command for model ``key`` through vLLM's argparse layer.

    No engine is started, so the check is cheap (~5 s) and leaves the
    currently-loaded model untouched. The structured verdict produced by
    ``validate_launch`` is returned to the client as-is.
    """
    verdict = await validate_launch(key, catalog, settings)
    return verdict
|
||||
|
||||
|
||||
@app.post("/api/swap")
|
||||
async def post_swap(req: SwapRequest) -> dict:
|
||||
if not settings.configured and not req.dry_run:
|
||||
|
||||
@@ -73,8 +73,10 @@ function renderCards() {
|
||||
<button class="btn ${isActive ? '' : 'primary'}" data-swap-key="${key}" ${isActive || isSwapping ? 'disabled' : ''}>
|
||||
${isActive ? 'Current' : 'Switch to this'}
|
||||
</button>
|
||||
<button class="btn test-btn" data-test-key="${key}" title="Pre-flight check the launch command without starting the engine">Test</button>
|
||||
<button class="btn adv-btn" data-adv-key="${key}" title="Advanced settings">Advanced</button>
|
||||
</div>
|
||||
<div class="test-result hidden" data-test-result-for="${key}"></div>
|
||||
`;
|
||||
root.appendChild(card);
|
||||
}
|
||||
@@ -84,6 +86,37 @@ function renderCards() {
|
||||
for (const btn of root.querySelectorAll('[data-adv-key]')) {
|
||||
btn.addEventListener('click', () => openAdvanced(btn.dataset.advKey));
|
||||
}
|
||||
for (const btn of root.querySelectorAll('[data-test-key]')) {
|
||||
btn.addEventListener('click', () => testLaunch(btn.dataset.testKey, btn));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Run the pre-flight launch validation for one model card and render the
 * verdict inline in that card's result element.
 *
 * @param {string} key - Catalog key of the model whose launch args are checked.
 * @param {HTMLButtonElement} btn - The clicked "Test" button; it is disabled
 *   and relabelled while the request is in flight, then restored.
 * @returns {Promise<void>}
 */
async function testLaunch(key, btn) {
  // Escape the key before embedding it in the attribute selector: a key that
  // contains a quote or backslash would otherwise make querySelector throw
  // (or match the wrong element).
  const resultEl = document.querySelector(`[data-test-result-for="${CSS.escape(key)}"]`);
  if (!resultEl) return;

  const originalText = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Testing…';
  resultEl.classList.remove('hidden', 'ok', 'fail');
  resultEl.innerHTML = '<span class="muted small">Checking launch args against vLLM\'s parser…</span>';

  try {
    const r = await fetchJSON(`/api/swap/${encodeURIComponent(key)}/validate`, { method: 'POST' });
    if (r.ok) {
      resultEl.classList.add('ok');
      resultEl.innerHTML = `<span class="ok-mark">✓</span> Launch args parse OK. <span class="muted small">(Doesn't guarantee runtime success — only catches argparse-level issues.)</span>`;
    } else {
      resultEl.classList.add('fail');
      const err = escapeHtml(r.error || 'unknown error');
      const stage = r.stage ? ` <span class="muted small">(${escapeHtml(r.stage)})</span>` : '';
      resultEl.innerHTML = `<span class="fail-mark">✗</span> Would fail: ${err}${stage}`;
    }
  } catch (e) {
    // The request itself failed. Rejections are not guaranteed to be Error
    // instances, so fall back to the stringified value when .message is absent.
    const reason = (e && e.message) || String(e);
    resultEl.classList.add('fail');
    resultEl.innerHTML = `<span class="fail-mark">✗</span> Test failed: ${escapeHtml(reason)}`;
  } finally {
    // Always hand the button back, whatever the outcome.
    btn.disabled = false;
    btn.textContent = originalText;
  }
}
|
||||
|
||||
function renderCurrent(status) {
|
||||
|
||||
@@ -701,9 +701,24 @@ main {
|
||||
.card.active .btn { background: rgba(74, 222, 128, 0.12); color: var(--accent); border-color: rgba(74, 222, 128, 0.4); }
|
||||
.card-actions { display: flex; gap: 6px; }
|
||||
.card-actions .btn.primary { flex: 1; }
|
||||
.card .adv-btn { padding: 8px 12px; font-size: 12px; }
|
||||
.card .adv-btn,
|
||||
.card .test-btn { padding: 8px 12px; font-size: 12px; }
|
||||
.card .custom-pill { color: var(--info); border-color: rgba(96, 165, 250, 0.4); }
|
||||
|
||||
/* Inline panel that shows the pre-flight ("Test") verdict on a model card. */
.test-result {
  margin-top: 4px;
  padding: 8px 10px;
  border: 1px solid var(--border);
  border-radius: 5px;
  background: var(--surface-2);
  font-size: 12px;
  line-height: 1.45;
}

/* Verdict tinting: green for a clean parse, red for a rejected command. */
.test-result.ok {
  border-color: rgba(74, 222, 128, 0.4);
  background: rgba(74, 222, 128, 0.04);
}

.test-result.fail {
  border-color: rgba(239, 68, 68, 0.45);
  background: rgba(239, 68, 68, 0.06);
  /* Error text can contain long unbroken flag strings. */
  word-break: break-word;
}

/* The leading check / cross glyphs. */
.test-result .ok-mark {
  color: var(--accent);
  font-weight: 600;
}

.test-result .fail-mark {
  color: var(--error);
  font-weight: 600;
}
|
||||
|
||||
.footer {
|
||||
margin-top: 28px;
|
||||
padding-top: 16px;
|
||||
|
||||
@@ -0,0 +1,137 @@
|
||||
"""Pre-flight validation of a proposed vLLM launch command.
|
||||
|
||||
Runs vLLM's own argparse layer (the `vllm serve` parser, falling back to EngineArgs) inside the vllm_node container WITHOUT
|
||||
starting the engine. Catches:
|
||||
|
||||
* unknown flag names (typos)
|
||||
* bad types / values that argparse rejects
|
||||
* deprecated flags removed in the installed vLLM version
|
||||
|
||||
Does NOT catch (these surface only during real engine init):
|
||||
* model-architecture-specific constraints (e.g. Qwen3.6 Mamba block_size)
|
||||
* OOM at weight-loading time
|
||||
* Triton / CUDA-kernel compatibility errors
|
||||
|
||||
A pre-flight check that returns "ok" is therefore NOT a guarantee — but a
|
||||
"failed" verdict is a definitive 'don't bother with the real swap'.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import shlex
|
||||
from typing import Any
|
||||
|
||||
from .config import Settings
|
||||
from .models import Catalog, build_launch_command
|
||||
from .ssh import ssh_run
|
||||
|
||||
|
||||
# Script executed with `python3 -c` inside the vllm_node container. It reads a
# JSON list of CLI args on stdin, parses it with the same combined parser vLLM
# uses for `vllm serve` (engine args + server args + frontend args) and prints
# one JSON object on stdout: {"ok": true, ...} or {"ok": false, ...}.
#
# Every result produced after a parser was built carries "parser": "serve" or
# "engine". When the full serve parser could not be constructed, the reason is
# reported in "parser_note" so that a rejection coming from the reduced,
# engine-only fallback parser (which does not know server/frontend flags) is
# not mistaken for a definitive failure.
_VALIDATOR_SCRIPT = r"""
import argparse, json, sys

# Mirror what `vllm serve` does internally: FlexibleArgumentParser (which is
# more lenient about dashes vs underscores) wrapped with make_arg_parser
# (which adds engine + server + frontend args).
try:
    # Newer vLLM path
    from vllm.utils.argparse_utils import FlexibleArgumentParser
except Exception:
    try:
        # Older fallback
        from vllm.engine.arg_utils import FlexibleArgumentParser
    except Exception:
        FlexibleArgumentParser = argparse.ArgumentParser  # type: ignore

parser = None
parser_kind = "serve"
parser_note = None
try:
    from vllm.entrypoints.openai.cli_args import make_arg_parser
    parser = make_arg_parser(FlexibleArgumentParser(add_help=False))
except Exception as e:
    # Keep the reason instead of discarding it: it explains why the reduced
    # parser below was used.
    parser = None
    parser_note = f"serve parser unavailable ({type(e).__name__}: {e}); used engine-only parser"

if parser is None:
    parser_kind = "engine"
    try:
        from vllm.engine.arg_utils import EngineArgs
        parser = FlexibleArgumentParser(add_help=False)
        EngineArgs.add_cli_args(parser)
    except Exception as e:
        print(json.dumps({"ok": False, "stage": "import", "error": f"{type(e).__name__}: {e}"}))
        sys.exit(0)


class _ArgError(Exception):
    pass


def _err(message):
    raise _ArgError(message)


def _emit(result):
    # Tag the verdict with the parser that produced it.
    result["parser"] = parser_kind
    if parser_note:
        result["parser_note"] = parser_note
    print(json.dumps(result))


parser.error = _err  # capture argparse errors instead of sys.exit(2)

try:
    raw = sys.stdin.read()
    arglist = json.loads(raw)
    ns = parser.parse_args(arglist)
    _emit({"ok": True, "model": getattr(ns, "model", None)})
except _ArgError as e:
    _emit({"ok": False, "stage": "parse", "error": str(e)})
except SystemExit as e:
    _emit({"ok": False, "stage": "parse", "error": f"argparse exit {e.code}"})
except Exception as e:
    _emit({"ok": False, "stage": "parse", "error": f"{type(e).__name__}: {e}"})
"""
|
||||
|
||||
|
||||
def _vllm_arg_list(key: str, model_def, catalog: Catalog) -> list[str]:
    """Reconstruct the args list passed to `vllm serve` for model ``key``.

    ``build_launch_command`` yields a shell string of the form::

        ./launch-cluster.sh [--solo] -d exec vllm serve <repo> <args...>

    Only the part after ``serve`` is kept. The positional ``<repo>`` is
    rewritten as ``--model=<repo>`` because EngineArgs expects the option
    form rather than a positional.

    Returns:
        ``["--model=<repo>", *args]``, or ``[]`` when the command cannot be
        tokenized or has no ``serve <repo>`` part.
    """
    cmd = build_launch_command(key, model_def, catalog.defaults)
    try:
        tokens = shlex.split(cmd)
        serve_at = tokens.index("serve")
    except ValueError:
        # shlex.split raises ValueError on unbalanced quotes and list.index
        # raises it when "serve" is absent. Either way there are no usable
        # args, so signal that with an empty list instead of propagating.
        return []

    after = tokens[serve_at + 1:]  # repo, then args
    if not after:
        return []
    repo, *flags = after
    return [f"--model={repo}", *flags]
|
||||
|
||||
|
||||
async def validate_launch(key: str, catalog: Catalog, settings: Settings) -> dict:
    """Pre-flight check the launch command for model ``key``.

    Builds the arg list for the model, sends it over SSH to the Spark 1 host
    and pipes it into the validator script running inside the ``vllm_node``
    container via ``docker exec``.

    Returns:
        A dict with ``ok`` (bool). Failures carry ``stage`` (one of
        ``lookup``, ``config``, ``build``, ``ssh``, ``decode``, or whatever
        stage the validator script reports) and ``error``. Once the arg list
        has been built, ``cmd_args`` and ``launch_cmd`` are included as well.
    """
    if key not in catalog.models:
        return {"ok": False, "stage": "lookup", "error": f"unknown model: {key}"}
    if not settings.spark1_host or not settings.spark1_user:
        return {"ok": False, "stage": "config", "error": "spark1 not configured"}

    model = catalog.models[key]
    arg_list = _vllm_arg_list(key, model, catalog)
    if not arg_list:
        return {"ok": False, "stage": "build", "error": "failed to build args list"}
    launch_cmd = build_launch_command(key, model, catalog.defaults)

    # The args travel as one JSON document on the validator's stdin, so only
    # that single string needs shell quoting. printf '%s' is used rather than
    # echo because some shells' echo rewrites backslash sequences, which would
    # corrupt JSON escapes.
    cmd = (
        f"printf '%s' {shlex.quote(json.dumps(arg_list))}"
        " | docker exec -i vllm_node python3 -c "
        + shlex.quote(_VALIDATOR_SCRIPT)
    )

    rc, out, err = await ssh_run(settings.spark1_host, settings.spark1_user, cmd, settings, timeout=20)
    stdout = out.strip()
    if rc != 0 and not stdout:
        return {
            "ok": False,
            "stage": "ssh",
            "error": err.strip() or f"rc={rc}",
            "cmd_args": arg_list,
            "launch_cmd": launch_cmd,
        }

    # Only the last stdout line is treated as the verdict; anything printed
    # before it is ignored.
    last = stdout.splitlines()[-1] if stdout else ""
    try:
        parsed = json.loads(last)
    except json.JSONDecodeError:
        parsed = None

    result: dict[str, Any]
    if isinstance(parsed, dict):
        result = parsed
    else:
        # Covers both non-JSON output and JSON that is not an object (a bare
        # number or string), which could not take the keys added below.
        result = {"ok": False, "stage": "decode", "error": "validator did not return JSON", "raw": out[-500:]}
    result["cmd_args"] = arg_list
    result["launch_cmd"] = launch_cmd
    return result
|
||||
Reference in New Issue
Block a user