v0.26.0:0 - disk-driven model menu (scan sparks; recipes; needs-setup)

The dashboard menu is now the set of models actually downloaded on the Sparks, not a hard-coded catalog. models.yaml + overrides are reframed as launch recipes matched to an on-disk model by repo; an on-disk model with no recipe is flagged needs_setup and its launch settings are inferred from its config.json for a one-time operator confirmation (discovery.py). - delete now removes weights AND the menu card (delete_from_disk sweeps all hosts; the delete endpoint resolves keys via the live menu) - new GET /api/models/suggest; /api/models returns the menu + a recipes list (download autocomplete); GET /api/models/disk-status removed - dropped the two legacy Qwen recipes (235B FP8, 2.5 72B) - tests: +test_discovery.py (cache parsing, infer_recipe, build_menu merge)
2026-06-18 11:09:56 -05:00
parent c0b35184ba
commit df9f244eae
14 changed files with 795 additions and 238 deletions
@@ -0,0 +1,209 @@
+"""Disk-driven model menu + launch-recipe inference.
+
+The dashboard's model list is whatever is actually downloaded on the Sparks
+(see `disk.list_cached_models`), NOT a hard-coded catalog. The bundled/overridden
+catalog entries are *launch recipes*: matched to an on-disk model by repo, they
+say HOW to launch it. A completed model on disk with no matching recipe shows up
+as `needs_setup` — the first switch reads its `config.json`, proposes a recipe
+(`infer_recipe`) the operator confirms once, and that confirmed recipe is saved
+to /data so it's a normal card from then on.
+
+Why a recipe layer at all, if the menu is the disk? Because a folder on disk
+doesn't say how to launch it: the per-family parsers (`--reasoning-parser`,
+`--tool-call-parser`), the MoE backend (some Gemma MoE checkpoints need
+`marlin` on GB10), and solo-vs-cluster topology can't be read off a directory.
+We infer a best guess from the model's own config + size, but the operator
+confirms it — a wrong guess is cheap, a wrong launch is not.
+"""
+from __future__ import annotations
+import asyncio
+import re
+
+from .config import Settings
+from .disk import list_cached_models, probe_disk
+from .overrides import extract_knobs_from_args
+
+
+# Weights larger than this cannot sit on a single Spark (128 GB) next to a KV
+# cache, so such a model has to be sharded across both Sparks through Ray. The
+# value only prefills the setup form — the operator confirms the mode there —
+# so the precise threshold is not critical.
+SINGLE_SPARK_BYTES = 115 * 10 ** 9
+
+# Baseline knob values copied into every inferred recipe; the operator may
+# adjust them in the setup form. Family-specific flags (parsers, MoE backend)
+# are supplied separately by `_detect_family`.
+_COMMON_KNOBS = dict(
+    max_model_len=32768,
+    gpu_memory_utilization=0.85,
+    fastsafetensors=True,
+    prefix_caching=True,
+    kv_cache_dtype="fp8",
+)
+
+
+def repo_to_key(repo: str) -> str:
+    """Derive the stable, URL-safe menu key for a model that has no recipe key.
+
+    Example: 'RedHatAI/Qwen3.6-35B-A3B-NVFP4' -> 'redhatai-qwen3-6-35b-a3b-nvfp4'.
+
+    The menu, the setup form, and `_identify_current_model` are all meant to use
+    this same slug, so that a model which is loaded but not yet configured can
+    still be highlighted as the active one.
+    """
+    lowered = repo.lower()
+    # Every run of characters outside [a-z0-9_-] collapses into a single dash;
+    # dashes left dangling at either end are then trimmed off.
+    slug = re.sub(r"[^a-z0-9_-]+", "-", lowered)
+    return slug.strip("-")
+
+
+def _detect_family(config: dict) -> tuple[str, list[str], list[str]]:
+    """Guess (family_label, vllm_flags, capabilities) from a model's config.json.
+
+    `vllm_flags` carries only the family-specific, non-knob flags (parsers and
+    the MoE backend); generic knob defaults are the caller's job. The guess is
+    best-effort and is confirmed by the operator, so a miss is cheap.
+    """
+    architectures = config.get("architectures") or []
+    model_type = str(config.get("model_type") or "")
+    # One lowercase haystack holding every architecture name plus model_type.
+    haystack = " ".join([" ".join(architectures), model_type]).lower()
+
+    expert_keys = ("num_experts", "n_routed_experts", "num_local_experts")
+    moe = "moe" in haystack or any(config.get(key) for key in expert_keys)
+
+    vision_words = ("conditionalgeneration", "vision", "vlforcausallm")
+    vision_keys = ("vision_config", "image_token_index")
+    vision = any(word in haystack for word in vision_words) or any(
+        key in config for key in vision_keys
+    )
+
+    if "qwen3" in haystack:
+        family = "Qwen3 (MoE)" if moe else "Qwen3"
+        flags = ["--reasoning-parser=qwen3"]
+        capabilities = ["reasoning"]
+        if moe:
+            flags.append("--moe_backend=flashinfer_cutlass")
+    elif "gemma" in haystack:
+        family = "Gemma (MoE)" if moe else "Gemma"
+        flags = [
+            "--reasoning-parser=gemma4",
+            "--tool-call-parser=gemma4",
+            "--enable-auto-tool-choice",
+        ]
+        capabilities = ["reasoning", "tools"]
+        if moe:
+            # The fast flashinfer/CUTLASS FP4 path errors on GB10 for Gemma MoE;
+            # marlin is the working fallback (see the Gemma 26B trial notes).
+            flags.append("--moe_backend=marlin")
+    else:
+        family = "Generic"
+        flags = []
+        capabilities = []
+
+    # Neither family branch adds "vision", so it can be appended unconditionally.
+    if vision:
+        capabilities.append("vision")
+    return family, flags, capabilities
+
+
+def _infer_mode(total_bytes: int, on_host_count: int) -> str:
+    """Pick "cluster" or "solo" as the launch mode to prefill.
+
+    "cluster" is chosen when the weights were found on two or more hosts, or
+    when `total_bytes` is above SINGLE_SPARK_BYTES; anything else is "solo".
+    """
+    spans_both_sparks = on_host_count >= 2
+    # The size comparison only runs when the host count has not already decided.
+    needs_sharding = spans_both_sparks or total_bytes > SINGLE_SPARK_BYTES
+    return "cluster" if needs_sharding else "solo"
+
+
+def infer_recipe(repo: str, config: dict, total_bytes: int, on_host_count: int) -> dict:
+    """Build the proposed launch recipe that prefills the setup form.
+
+    The family flags and capabilities come from `_detect_family` (a falsy
+    `config` is treated as empty), the mode from `_infer_mode`, and the knobs
+    start as a copy of `_COMMON_KNOBS`.
+    """
+    family, family_flags, capabilities = _detect_family(config or {})
+    mode = _infer_mode(total_bytes, on_host_count)
+
+    vllm_args = [*family_flags, "--max-num-batched-tokens=16384"]
+    knobs = {**_COMMON_KNOBS}
+    if mode == "cluster":
+        # Large models shard across both Sparks via Ray; leave more headroom.
+        vllm_args.extend(("-tp=2", "--distributed-executor-backend=ray"))
+        knobs["gpu_memory_utilization"] = 0.7
+
+    # The display name is whatever follows the last "/" of the repo id.
+    short_name = repo.rsplit("/", 1)[-1]
+    return {
+        "key": repo_to_key(repo),
+        "repo": repo,
+        "display_name": short_name,
+        "mode": mode,
+        "capabilities": capabilities,
+        "vllm_args": vllm_args,
+        "knobs": knobs,
+        "family": family,
+    }
+
+
+def _menu_entry_from_recipe(m, *, on_disk: bool, total_bytes: int, per_host: list[dict]) -> dict:
+    """Turn a catalog recipe into a menu card annotated with its disk status.
+
+    The card is the recipe's `model_dump()` plus `effective_knobs`,
+    `needs_setup` (always False here), `on_disk`, `total_bytes` and `per_host`.
+    """
+    entry = m.model_dump()
+    knobs_from_args = extract_knobs_from_args(m.vllm_args)
+    explicit_knobs = m.knobs or {}
+    entry.update(
+        # Explicit recipe knobs take precedence over values parsed from vllm_args.
+        effective_knobs={**knobs_from_args, **explicit_knobs},
+        needs_setup=False,
+        on_disk=on_disk,
+        total_bytes=total_bytes,
+        per_host=per_host,
+    )
+    return entry
+
+
+async def build_menu(settings: Settings, catalog) -> dict[str, dict]:
+    """The disk-driven model menu: every completed model on the Sparks, annotated
+    with its launch recipe (matched by repo) or flagged `needs_setup` if none.
+
+    Two SSH scans total (one per Spark), run in parallel — much cheaper than the
+    old per-recipe disk probe. A host whose scan fails (or is cancelled) is
+    logged and skipped, not fatal. Recipes that carry a `local_path` are probed
+    afterwards, one at a time, via `probe_disk`.
+
+    Args:
+        settings: supplies `spark1_host`/`spark1_user` and, when `spark2_host`
+            is set, `spark2_host`/`spark2_user`; also passed through to the
+            disk helpers.
+        catalog: object whose `models` mapping (key -> recipe) provides the
+            launch recipes.
+
+    Returns:
+        Mapping of menu key -> card dict. Cards matched to a recipe are keyed by
+        the recipe key; unmatched on-disk models are keyed by `repo_to_key(repo)`
+        and carry `needs_setup=True`.
+    """
+    import logging
+
+    log = logging.getLogger(__name__)
+
+    hosts = [(settings.spark1_host, settings.spark1_user)]
+    if settings.spark2_host:
+        hosts.append((settings.spark2_host, settings.spark2_user))
+    scans = await asyncio.gather(
+        *(list_cached_models(h, u, settings) for h, u in hosts),
+        return_exceptions=True,
+    )
+    by_repo: dict[str, dict] = {}
+    for (h, _u), res in zip(hosts, scans):
+        # BaseException, not Exception: with return_exceptions=True a cancelled
+        # scan comes back as asyncio.CancelledError, which does not derive from
+        # Exception and would otherwise be unpacked as if it were scan rows.
+        if isinstance(res, BaseException):
+            log.warning("model scan failed on %s; skipping host: %r", h, res)
+            continue
+        for repo, size, complete in res:
+            e = by_repo.setdefault(repo, {"total_bytes": 0, "per_host": [], "complete": False})
+            e["total_bytes"] += size
+            e["per_host"].append({"host": h, "size_bytes": size})
+            # Complete on any one host is enough for the model to get a card.
+            e["complete"] = e["complete"] or complete
+
+    # NOTE(review): when two recipes name the same repo, the later catalog entry
+    # wins here and the earlier one gets no card — confirm that is intended.
+    recipe_by_repo = {m.repo: (k, m) for k, m in catalog.models.items() if m.repo}
+
+    menu: dict[str, dict] = {}
+    for repo, info in by_repo.items():
+        # Skip half-fetched / corrupt caches (no finished snapshot) — they'd show
+        # as broken cards. In-flight downloads surface in the download panel.
+        if not info["complete"]:
+            continue
+        if repo in recipe_by_repo:
+            key, m = recipe_by_repo[repo]
+            menu[key] = _menu_entry_from_recipe(
+                m, on_disk=True, total_bytes=info["total_bytes"], per_host=info["per_host"]
+            )
+        else:
+            # No recipe yet: emit a placeholder card flagged for one-time setup.
+            key = repo_to_key(repo)
+            menu[key] = {
+                "display_name": repo.split("/")[-1],
+                "repo": repo,
+                "local_path": None,
+                "size_gb": round(info["total_bytes"] / 1e9, 1),
+                "mode": _infer_mode(info["total_bytes"], len(info["per_host"])),
+                "capabilities": [],
+                "expected_ready_seconds": 300,
+                "vllm_args": [],
+                "description": None,
+                "knobs": None,
+                "custom": False,
+                "needs_setup": True,
+                "effective_knobs": {},
+                "on_disk": True,
+                "total_bytes": info["total_bytes"],
+                "per_host": info["per_host"],
+            }
+
+    # Local/fine-tuned recipes live as a directory, not an HF cache entry — probe
+    # each by path and include it if present. Their keys are unique catalog keys
+    # (and local models carry repo="" per ModelDef), so they never collide with a
+    # discovered repo's slug or an HF recipe key above.
+    for key, m in catalog.models.items():
+        if not m.local_path:
+            continue
+        st = await probe_disk(m.repo, m.mode, settings, local_path=m.local_path)
+        if not st.on_disk:
+            continue
+        menu[key] = _menu_entry_from_recipe(
+            m,
+            on_disk=True,
+            total_bytes=st.total_bytes,
+            per_host=[{"host": r.host, "size_bytes": r.size_bytes} for r in st.per_host if r.on_disk],
+        )
+
+    return menu