diff --git a/image/app/server.py b/image/app/server.py index 5beec4d..9b6707f 100644 --- a/image/app/server.py +++ b/image/app/server.py @@ -22,6 +22,7 @@ from .services import docker_state, run_action, services_from_settings from .ssh import ssh_run from .swap import SwapManager from .updates import UpdateManager, get_update_status +from .validate import validate_launch from .wol import send_local_broadcast, send_via_peer @@ -434,6 +435,15 @@ class SwapRequest(BaseModel): dry_run: bool = False +@app.post("/api/swap/{key}/validate") +async def validate_swap(key: str) -> dict: + """Pre-flight check: run vLLM's argparse layer against the proposed launch + command WITHOUT starting an engine. Cheap (~5 s) and doesn't disturb the + currently-loaded model. + """ + return await validate_launch(key, catalog, settings) + + @app.post("/api/swap") async def post_swap(req: SwapRequest) -> dict: if not settings.configured and not req.dry_run: diff --git a/image/app/static/app.js b/image/app/static/app.js index fee7515..10d31fb 100644 --- a/image/app/static/app.js +++ b/image/app/static/app.js @@ -73,8 +73,10 @@ function renderCards() { + + `; root.appendChild(card); } @@ -84,6 +86,37 @@ function renderCards() { for (const btn of root.querySelectorAll('[data-adv-key]')) { btn.addEventListener('click', () => openAdvanced(btn.dataset.advKey)); } + for (const btn of root.querySelectorAll('[data-test-key]')) { + btn.addEventListener('click', () => testLaunch(btn.dataset.testKey, btn)); + } +} + +async function testLaunch(key, btn) { + const resultEl = document.querySelector(`[data-test-result-for="${key}"]`); + if (!resultEl) return; + const originalText = btn.textContent; + btn.disabled = true; + btn.textContent = 'Testing…'; + resultEl.classList.remove('hidden', 'ok', 'fail'); + resultEl.innerHTML = 'Checking launch args against vLLM\'s parser…'; + try { + const r = await fetchJSON(`/api/swap/${encodeURIComponent(key)}/validate`, { method: 'POST' }); + if (r.ok) { + 
resultEl.classList.add('ok'); + resultEl.innerHTML = ` Launch args parse OK. (Doesn't guarantee runtime success — only catches argparse-level issues.)`; + } else { + resultEl.classList.add('fail'); + const err = escapeHtml(r.error || 'unknown error'); + const stage = r.stage ? ` (${escapeHtml(r.stage)})` : ''; + resultEl.innerHTML = ` Would fail: ${err}${stage}`; + } + } catch (e) { + resultEl.classList.add('fail'); + resultEl.innerHTML = ` Test failed: ${escapeHtml(e.message)}`; + } finally { + btn.disabled = false; + btn.textContent = originalText; + } } function renderCurrent(status) { diff --git a/image/app/static/style.css b/image/app/static/style.css index 302c7b3..b13d37c 100644 --- a/image/app/static/style.css +++ b/image/app/static/style.css @@ -701,9 +701,24 @@ main { .card.active .btn { background: rgba(74, 222, 128, 0.12); color: var(--accent); border-color: rgba(74, 222, 128, 0.4); } .card-actions { display: flex; gap: 6px; } .card-actions .btn.primary { flex: 1; } -.card .adv-btn { padding: 8px 12px; font-size: 12px; } +.card .adv-btn, +.card .test-btn { padding: 8px 12px; font-size: 12px; } .card .custom-pill { color: var(--info); border-color: rgba(96, 165, 250, 0.4); } +.test-result { + font-size: 12px; + line-height: 1.45; + padding: 8px 10px; + border-radius: 5px; + margin-top: 4px; + border: 1px solid var(--border); + background: var(--surface-2); +} +.test-result.ok { border-color: rgba(74, 222, 128, 0.4); background: rgba(74, 222, 128, 0.04); } +.test-result.fail { border-color: rgba(239, 68, 68, 0.45); background: rgba(239, 68, 68, 0.06); word-break: break-word; } +.test-result .ok-mark { color: var(--accent); font-weight: 600; } +.test-result .fail-mark { color: var(--error); font-weight: 600; } + .footer { margin-top: 28px; padding-top: 16px; diff --git a/image/app/validate.py b/image/app/validate.py new file mode 100644 index 0000000..983e267 --- /dev/null +++ b/image/app/validate.py @@ -0,0 +1,137 @@ +"""Pre-flight validation of a 
proposed vLLM launch command. + +Runs vLLM's own argparse layer (the `vllm serve` parser, falling back to +EngineArgs) inside the vllm_node container WITHOUT starting the engine. Catches: + + * unknown flag names (typos) + * bad types / values that argparse rejects + * deprecated flags removed in the installed vLLM version + +Does NOT catch (these surface only during real engine init): + * model-architecture-specific constraints (e.g. Qwen3.6 Mamba block_size) + * OOM at weight-loading time + * Triton / CUDA-kernel compatibility errors + +A pre-flight check that returns "ok" is therefore NOT a guarantee — but a +"failed" verdict at the parse stage is a definitive 'don't bother with the real swap'. +""" +from __future__ import annotations +import json +import shlex +from typing import Any + +from .config import Settings +from .models import Catalog, build_launch_command +from .ssh import ssh_run + + +# Validates the proposed args (a JSON list read from stdin) against the combined +# `vllm serve` parser (engine args + server args + frontend args), or the +# engine-only parser as a fallback. Prints one JSON line: {"ok": true, ...} or {"ok": false, ...}. +_VALIDATOR_SCRIPT = r""" +import argparse, json, sys + +# Mirror what `vllm serve` does internally: FlexibleArgumentParser (which is +# more lenient about dashes vs underscores) wrapped with make_arg_parser +# (which adds engine + server + frontend args). 
+parser = None +try: + # Newer vLLM path + from vllm.utils.argparse_utils import FlexibleArgumentParser +except Exception: + try: + # Older fallback + from vllm.engine.arg_utils import FlexibleArgumentParser + except Exception: + FlexibleArgumentParser = argparse.ArgumentParser # type: ignore + +try: + from vllm.entrypoints.openai.cli_args import make_arg_parser + parser = make_arg_parser(FlexibleArgumentParser(add_help=False)) +except Exception: + pass +if parser is None: + try: + from vllm.engine.arg_utils import EngineArgs + parser = FlexibleArgumentParser(add_help=False) + EngineArgs.add_cli_args(parser) + except Exception as e: + print(json.dumps({"ok": False, "stage": "import", "error": f"{type(e).__name__}: {e}"})) + sys.exit(0) + +class _ArgError(Exception): + pass + +def _err(message): + raise _ArgError(message) + +parser.error = _err # capture argparse errors instead of sys.exit(2) + +try: + raw = sys.stdin.read() + arglist = json.loads(raw) + ns = parser.parse_args(arglist) + print(json.dumps({"ok": True, "model": getattr(ns, "model", None)})) +except _ArgError as e: + print(json.dumps({"ok": False, "stage": "parse", "error": str(e)})) +except SystemExit as e: + print(json.dumps({"ok": False, "stage": "parse", "error": f"argparse exit {e.code}"})) +except Exception as e: + print(json.dumps({"ok": False, "stage": "parse", "error": f"{type(e).__name__}: {e}"})) +""" + + +def _vllm_arg_list(key: str, model_def, catalog: Catalog) -> list[str]: + """Rebuild the `vllm serve` arg list with the positional model rewritten as --model=REPO ([] if it can't be found).""" + cmd = build_launch_command(key, model_def, catalog.defaults) + # build_launch_command yields: + # ./launch-cluster.sh [--solo] -d exec vllm serve REPO [ARGS...] + # We just want the bits after `vllm serve`: REPO, then its args. 
+ tokens = shlex.split(cmd) + if "serve" not in tokens: + return [] + i = tokens.index("serve") + after = tokens[i + 1 :] # repo, then args + if not after: + return [] + args = after[1:] # drop the repo + # EngineArgs expects --model=REPO rather than positional, so prepend it. + return [f"--model={after[0]}", *args] + + +async def validate_launch(key: str, catalog: Catalog, settings: Settings) -> dict: + if key not in catalog.models: + return {"ok": False, "stage": "lookup", "error": f"unknown model: {key}"} + if not settings.spark1_host or not settings.spark1_user: + return {"ok": False, "stage": "config", "error": "spark1 not configured"} + + model = catalog.models[key] + arg_list = _vllm_arg_list(key, model, catalog) + if not arg_list: + return {"ok": False, "stage": "build", "error": "failed to build args list"} + + payload = json.dumps(arg_list).replace("'", "'\\''") + # Pipe the JSON args list into `python3 -c SCRIPT` inside the container. The + # validator reads it from stdin, so only the JSON blob needs shell-quoting. 
+ cmd = ( + f"echo '{payload}' | docker exec -i vllm_node python3 -c " + + shlex.quote(_VALIDATOR_SCRIPT) + ) + + rc, out, err = await ssh_run(settings.spark1_host, settings.spark1_user, cmd, settings, timeout=20) + if rc != 0 and not out.strip(): + return { + "ok": False, + "stage": "ssh", + "error": err.strip() or f"rc={rc}", + "cmd_args": arg_list, + "launch_cmd": build_launch_command(key, model, catalog.defaults), + } + last = out.strip().splitlines()[-1] if out.strip() else "" + try: + result: dict[str, Any] = json.loads(last) + except json.JSONDecodeError: + result = {"ok": False, "stage": "decode", "error": "validator did not return JSON", "raw": out[-500:]} + result["cmd_args"] = arg_list + result["launch_cmd"] = build_launch_command(key, model, catalog.defaults) + return result diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts index 91f8828..fe170d1 100644 --- a/package/startos/versions/v0_1_0.ts +++ b/package/startos/versions/v0_1_0.ts @@ -1,10 +1,10 @@ import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk' export const v0_1_0 = VersionInfo.of({ - version: '0.6.0:1', + version: '0.7.0:2', releaseNotes: { en_US: - 'v0.6: Service-level connectivity tracking and a passive failure-report endpoint. The connectivity log now records up/down transitions for Parakeet, Magpie, and vLLM in addition to the Spark hosts (driven by the existing /api/status and /api/services polling). A new POST /api/health-event endpoint lets external apps (e.g. Open WebUI) record failures they observed even when the failure was brief enough to slip between polls. The Connectivity log dialog shows hosts and services with separate badges, and reports appear inline with their source app + error detail.', + 'v0.7: pre-flight launch validation. New "Test" button on every model card runs vLLM\'s argparse against the proposed launch command inside the running vllm_node container — without starting an engine. 
Catches unknown flags, bad types, and version-removed flags in about 5 seconds, before disrupting the currently-loaded model. (Runtime-only failures like the Qwen3.6 Mamba block-size assertion still only surface during a real swap, but argparse-stage bugs are now caught up front.)', }, migrations: { up: async ({ effects }) => {},