6434b01a95
validate.py:
- Builds the same args list a real swap would pass to 'vllm serve'
- SSHes into Spark 1 and runs vLLM's own argparse layer inside the running vllm_node container, WITHOUT initializing the engine
- Uses FlexibleArgumentParser (from vllm.utils.argparse_utils, with fallback to engine.arg_utils) + make_arg_parser — the exact same parser the 'vllm serve' CLI uses. Earlier attempt with bare argparse.ArgumentParser was too strict (rejected '--moe_backend' with underscore that the real CLI accepts via FlexibleArgumentParser's normalization)
- Returns structured {ok, stage, error, cmd_args, launch_cmd} so the UI can surface the exact failure cause
Endpoint: POST /api/swap/{key}/validate. Cheap (~5s), no engine init, no disruption to the currently-loaded model.
Frontend: 'Test' button on every model card, inline result below the action row (green check or red detailed error). Result stays visible until the user reloads or clicks Test again.
Catches: typos in flag names, deprecated/removed flags after a vLLM upgrade, type mismatches. Does NOT catch runtime-only failures (Mamba block-size assertion, OOM at load, kernel-compat). Ok=true is necessary-but-not-sufficient; ok=false is definitive 'don't bother running it'.
138 lines
5.1 KiB
Python
138 lines
5.1 KiB
Python
"""Pre-flight validation of a proposed vLLM launch command.
|
|
|
|
Runs vLLM's own argparse layer (EngineArgs) inside the vllm_node container WITHOUT
|
|
starting the engine. Catches:
|
|
|
|
* unknown flag names (typos)
|
|
* bad types / values that argparse rejects
|
|
* deprecated flags removed in the installed vLLM version
|
|
|
|
Does NOT catch (these surface only during real engine init):
|
|
* model-architecture-specific constraints (e.g. Qwen3.6 Mamba block_size)
|
|
* OOM at weight-loading time
|
|
* Triton / CUDA-kernel compatibility errors
|
|
|
|
A pre-flight check that returns "ok" is therefore NOT a guarantee — but a
|
|
"failed" verdict is a definitive 'don't bother with the real swap'.
|
|
"""
|
|
from __future__ import annotations
|
|
import json
|
|
import shlex
|
|
from typing import Any
|
|
|
|
from .config import Settings
|
|
from .models import Catalog, build_launch_command
|
|
from .ssh import ssh_run
|
|
|
|
|
|
# Validates the proposed args against the same combined parser vLLM uses for
|
|
# `vllm serve` (engine args + server args + frontend args). Returns one JSON
|
|
# line on stdout: {"ok": true, ...} or {"ok": false, ...}.
|
|
_VALIDATOR_SCRIPT = r"""
|
|
import argparse, json, sys
|
|
|
|
# Mirror what `vllm serve` does internally: FlexibleArgumentParser (which is
|
|
# more lenient about dashes vs underscores) wrapped with make_arg_parser
|
|
# (which adds engine + server + frontend args).
|
|
parser = None
|
|
try:
|
|
# Newer vLLM path
|
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
|
except Exception:
|
|
try:
|
|
# Older fallback
|
|
from vllm.engine.arg_utils import FlexibleArgumentParser
|
|
except Exception:
|
|
FlexibleArgumentParser = argparse.ArgumentParser # type: ignore
|
|
|
|
try:
|
|
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
|
parser = make_arg_parser(FlexibleArgumentParser(add_help=False))
|
|
except Exception:
|
|
pass
|
|
if parser is None:
|
|
try:
|
|
from vllm.engine.arg_utils import EngineArgs
|
|
parser = FlexibleArgumentParser(add_help=False)
|
|
EngineArgs.add_cli_args(parser)
|
|
except Exception as e:
|
|
print(json.dumps({"ok": False, "stage": "import", "error": f"{type(e).__name__}: {e}"}))
|
|
sys.exit(0)
|
|
|
|
class _ArgError(Exception):
|
|
pass
|
|
|
|
def _err(message):
|
|
raise _ArgError(message)
|
|
|
|
parser.error = _err # capture argparse errors instead of sys.exit(2)
|
|
|
|
try:
|
|
raw = sys.stdin.read()
|
|
arglist = json.loads(raw)
|
|
ns = parser.parse_args(arglist)
|
|
print(json.dumps({"ok": True, "model": getattr(ns, "model", None)}))
|
|
except _ArgError as e:
|
|
print(json.dumps({"ok": False, "stage": "parse", "error": str(e)}))
|
|
except SystemExit as e:
|
|
print(json.dumps({"ok": False, "stage": "parse", "error": f"argparse exit {e.code}"}))
|
|
except Exception as e:
|
|
print(json.dumps({"ok": False, "stage": "parse", "error": f"{type(e).__name__}: {e}"}))
|
|
"""
|
|
|
|
|
|
def _vllm_arg_list(key: str, model_def, catalog: Catalog) -> list[str]:
|
|
"""Reconstruct the args list passed to `vllm serve` (without the positional model)."""
|
|
cmd = build_launch_command(key, model_def, catalog.defaults)
|
|
# build_launch_command yields:
|
|
# ./launch-cluster.sh [--solo] -d exec vllm serve <repo> <args...>
|
|
# We just want the bits after `vllm serve <repo>`.
|
|
tokens = shlex.split(cmd)
|
|
if "serve" not in tokens:
|
|
return []
|
|
i = tokens.index("serve")
|
|
after = tokens[i + 1 :] # repo, then args
|
|
if not after:
|
|
return []
|
|
args = after[1:] # drop the repo
|
|
# EngineArgs expects --model=REPO rather than positional, so prepend it.
|
|
return [f"--model={after[0]}", *args]
|
|
|
|
|
|
async def validate_launch(key: str, catalog: Catalog, settings: Settings) -> dict:
|
|
if key not in catalog.models:
|
|
return {"ok": False, "stage": "lookup", "error": f"unknown model: {key}"}
|
|
if not settings.spark1_host or not settings.spark1_user:
|
|
return {"ok": False, "stage": "config", "error": "spark1 not configured"}
|
|
|
|
model = catalog.models[key]
|
|
arg_list = _vllm_arg_list(key, model, catalog)
|
|
if not arg_list:
|
|
return {"ok": False, "stage": "build", "error": "failed to build args list"}
|
|
|
|
payload = json.dumps(arg_list).replace("'", "'\\''")
|
|
# Pipe the JSON args list to a here-doc Python invocation. The validator
|
|
# reads from stdin to avoid shell-escaping the args themselves.
|
|
cmd = (
|
|
f"echo '{payload}' | docker exec -i vllm_node python3 -c "
|
|
+ shlex.quote(_VALIDATOR_SCRIPT)
|
|
)
|
|
|
|
rc, out, err = await ssh_run(settings.spark1_host, settings.spark1_user, cmd, settings, timeout=20)
|
|
if rc != 0 and not out.strip():
|
|
return {
|
|
"ok": False,
|
|
"stage": "ssh",
|
|
"error": err.strip() or f"rc={rc}",
|
|
"cmd_args": arg_list,
|
|
"launch_cmd": build_launch_command(key, model, catalog.defaults),
|
|
}
|
|
last = out.strip().splitlines()[-1] if out.strip() else ""
|
|
try:
|
|
result: dict[str, Any] = json.loads(last)
|
|
except json.JSONDecodeError:
|
|
result = {"ok": False, "stage": "decode", "error": "validator did not return JSON", "raw": out[-500:]}
|
|
result["cmd_args"] = arg_list
|
|
result["launch_cmd"] = build_launch_command(key, model, catalog.defaults)
|
|
return result
|