v0.7.0 - Pre-flight launch validation (Test button on every model card)

validate.py: - Builds the same args list a real swap would pass to 'vllm serve' - SSHes into Spark 1 and runs vLLM's own argparse layer inside the running vllm_node container, WITHOUT initializing the engine - Uses FlexibleArgumentParser (from vllm.utils.argparse_utils, with fallback to engine.arg_utils) + make_arg_parser — the exact same parser the 'vllm serve' CLI uses. Earlier attempt with bare argparse.ArgumentParser was too strict (rejected '--moe_backend' with underscore that the real CLI accepts via FlexibleArgumentParser's normalization) - Returns structured {ok, stage, error, cmd_args, launch_cmd} so the UI can surface the exact failure cause Endpoint: POST /api/swap/{key}/validate. Cheap (~5s), no engine init, no disruption to the currently-loaded model. Frontend: 'Test' button on every model card, inline result below the action row (green check or red detailed error). Result stays visible until the user reloads or clicks Test again. Catches: typos in flag names, deprecated/removed flags after a vLLM upgrade, type mismatches. Does NOT catch runtime-only failures (Mamba block-size assertion, OOM at load, kernel-compat). Ok=true is necessary-but-not-sufficient; ok=false is definitive 'don't bother running it'.
2026-05-12 13:37:37 -05:00
parent 5827683a09
commit 6434b01a95
5 changed files with 198 additions and 3 deletions
@@ -0,0 +1,137 @@
+"""Pre-flight validation of a proposed vLLM launch command.
+
+Runs vLLM's own argparse layer (EngineArgs) inside the vllm_node container WITHOUT
+starting the engine. Catches:
+
+  * unknown flag names (typos)
+  * bad types / values that argparse rejects
+  * deprecated flags removed in the installed vLLM version
+
+Does NOT catch (these surface only during real engine init):
+  * model-architecture-specific constraints (e.g. Qwen3.6 Mamba block_size)
+  * OOM at weight-loading time
+  * Triton / CUDA-kernel compatibility errors
+
+A pre-flight check that returns "ok" is therefore NOT a guarantee — but a
+"failed" verdict is a definitive 'don't bother with the real swap'.
+"""
+from __future__ import annotations
+import json
+import shlex
+from typing import Any
+
+from .config import Settings
+from .models import Catalog, build_launch_command
+from .ssh import ssh_run
+
+
+# Validates the proposed args against the same combined parser vLLM uses for
+# `vllm serve` (engine args + server args + frontend args). Returns one JSON
+# line on stdout: {"ok": true, ...} or {"ok": false, ...}.
+_VALIDATOR_SCRIPT = r"""
+import argparse, json, sys
+
+# Mirror what `vllm serve` does internally: FlexibleArgumentParser (which is
+# more lenient about dashes vs underscores) wrapped with make_arg_parser
+# (which adds engine + server + frontend args).
+parser = None
+try:
+    # Newer vLLM path
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
+except Exception:
+    try:
+        # Older fallback
+        from vllm.engine.arg_utils import FlexibleArgumentParser
+    except Exception:
+        FlexibleArgumentParser = argparse.ArgumentParser  # type: ignore
+
+try:
+    from vllm.entrypoints.openai.cli_args import make_arg_parser
+    parser = make_arg_parser(FlexibleArgumentParser(add_help=False))
+except Exception:
+    pass
+if parser is None:
+    try:
+        from vllm.engine.arg_utils import EngineArgs
+        parser = FlexibleArgumentParser(add_help=False)
+        EngineArgs.add_cli_args(parser)
+    except Exception as e:
+        print(json.dumps({"ok": False, "stage": "import", "error": f"{type(e).__name__}: {e}"}))
+        sys.exit(0)
+
+class _ArgError(Exception):
+    pass
+
+def _err(message):
+    raise _ArgError(message)
+
+parser.error = _err  # capture argparse errors instead of sys.exit(2)
+
+try:
+    raw = sys.stdin.read()
+    arglist = json.loads(raw)
+    ns = parser.parse_args(arglist)
+    print(json.dumps({"ok": True, "model": getattr(ns, "model", None)}))
+except _ArgError as e:
+    print(json.dumps({"ok": False, "stage": "parse", "error": str(e)}))
+except SystemExit as e:
+    print(json.dumps({"ok": False, "stage": "parse", "error": f"argparse exit {e.code}"}))
+except Exception as e:
+    print(json.dumps({"ok": False, "stage": "parse", "error": f"{type(e).__name__}: {e}"}))
+"""
+
+
+def _vllm_arg_list(key: str, model_def, catalog: Catalog) -> list[str]:
+    """Reconstruct the args list passed to `vllm serve` (without the positional model)."""
+    cmd = build_launch_command(key, model_def, catalog.defaults)
+    # build_launch_command yields:
+    #   ./launch-cluster.sh [--solo] -d exec vllm serve <repo> <args...>
+    # We just want the bits after `vllm serve <repo>`.
+    tokens = shlex.split(cmd)
+    if "serve" not in tokens:
+        return []
+    i = tokens.index("serve")
+    after = tokens[i + 1 :]  # repo, then args
+    if not after:
+        return []
+    args = after[1:]  # drop the repo
+    # EngineArgs expects --model=REPO rather than positional, so prepend it.
+    return [f"--model={after[0]}", *args]
+
+
+async def validate_launch(key: str, catalog: Catalog, settings: Settings) -> dict:
+    if key not in catalog.models:
+        return {"ok": False, "stage": "lookup", "error": f"unknown model: {key}"}
+    if not settings.spark1_host or not settings.spark1_user:
+        return {"ok": False, "stage": "config", "error": "spark1 not configured"}
+
+    model = catalog.models[key]
+    arg_list = _vllm_arg_list(key, model, catalog)
+    if not arg_list:
+        return {"ok": False, "stage": "build", "error": "failed to build args list"}
+
+    payload = json.dumps(arg_list).replace("'", "'\\''")
+    # Pipe the JSON args list to a here-doc Python invocation. The validator
+    # reads from stdin to avoid shell-escaping the args themselves.
+    cmd = (
+        f"echo '{payload}' | docker exec -i vllm_node python3 -c "
+        + shlex.quote(_VALIDATOR_SCRIPT)
+    )
+
+    rc, out, err = await ssh_run(settings.spark1_host, settings.spark1_user, cmd, settings, timeout=20)
+    if rc != 0 and not out.strip():
+        return {
+            "ok": False,
+            "stage": "ssh",
+            "error": err.strip() or f"rc={rc}",
+            "cmd_args": arg_list,
+            "launch_cmd": build_launch_command(key, model, catalog.defaults),
+        }
+    last = out.strip().splitlines()[-1] if out.strip() else ""
+    try:
+        result: dict[str, Any] = json.loads(last)
+    except json.JSONDecodeError:
+        result = {"ok": False, "stage": "decode", "error": "validator did not return JSON", "raw": out[-500:]}
+    result["cmd_args"] = arg_list
+    result["launch_cmd"] = build_launch_command(key, model, catalog.defaults)
+    return result