v0.26.0:0 - disk-driven model menu (scan sparks; recipes; needs-setup)
The dashboard menu is now the set of models actually downloaded on the Sparks, not a hard-coded catalog. models.yaml + overrides are reframed as launch recipes matched to an on-disk model by repo; an on-disk model with no recipe is flagged needs_setup and its launch settings are inferred from its config.json for a one-time operator confirmation (discovery.py).

- delete now removes weights AND the menu card (delete_from_disk sweeps all hosts; the delete endpoint resolves keys via the live menu)
- new GET /api/models/suggest; /api/models returns the menu + a recipes list (download autocomplete); GET /api/models/disk-status removed
- dropped the two legacy Qwen recipes (235B FP8, 2.5 72B)
- tests: +test_discovery.py (cache parsing, infer_recipe, build_menu merge)
This commit is contained in:
@@ -0,0 +1,209 @@
|
||||
"""Disk-driven model menu + launch-recipe inference.
|
||||
|
||||
The dashboard's model list is whatever is actually downloaded on the Sparks
(see `disk.list_cached_models`), NOT a hard-coded catalog. The bundled/overridden
catalog entries are *launch recipes*: matched to an on-disk model by repo, they
say HOW to launch it. A completed model on disk with no matching recipe shows up
as `needs_setup` — the first switch reads its `config.json`, proposes a recipe
(`infer_recipe`) the operator confirms once, and that confirmed recipe is saved
to /data so it's a normal card from then on.

Why a recipe layer at all, if the menu is the disk? Because a folder on disk
doesn't say how to launch it: the per-family parsers (`--reasoning-parser`,
`--tool-call-parser`), the MoE backend (some Gemma MoE checkpoints need
`marlin` on GB10), and solo-vs-cluster topology can't be read off a directory.
We infer a best guess from the model's own config + size, but the operator
confirms it — a wrong guess is cheap, a wrong launch is not.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import re
|
||||
|
||||
from .config import Settings
|
||||
from .disk import list_cached_models, probe_disk
|
||||
from .overrides import extract_knobs_from_args
|
||||
|
||||
|
||||
# Weight-size cutoff (bytes) for running on a single Spark. Anything larger
# can't sit in one Spark's 128 GB next to a KV cache and therefore has to be
# sharded across both Sparks via Ray. This only prefills the setup form — the
# operator confirms the mode there — so the exact threshold isn't critical.
SINGLE_SPARK_BYTES = 115 * 10 ** 9
|
||||
|
||||
# Baseline knob values that every inferred recipe starts from; the operator can
# adjust them in the setup form. Family-specific flags (parsers, MoE backend)
# are deliberately absent here — `_detect_family` layers those on separately.
_COMMON_KNOBS = dict(
    max_model_len=32768,
    gpu_memory_utilization=0.85,
    fastsafetensors=True,
    prefix_caching=True,
    kv_cache_dtype="fp8",
)
|
||||
|
||||
|
||||
def repo_to_key(repo: str) -> str:
    """Derive the menu key for a discovered model that has no recipe key yet.

    The repo id is lower-cased, each run of characters outside ``a-z``,
    ``0-9``, ``_`` and ``-`` collapses into a single ``-``, and leading or
    trailing dashes are trimmed, giving a stable, URL-safe slug:
    'RedHatAI/Qwen3.6-35B-A3B-NVFP4' -> 'redhatai-qwen3-6-35b-a3b-nvfp4'.

    The menu, the setup form, and `_identify_current_model` all use this same
    slug, so a loaded-but-unconfigured model still highlights as active.
    """
    lowered = repo.lower()
    slug = re.sub(r"[^a-z0-9_-]+", "-", lowered)
    return slug.strip("-")
|
||||
|
||||
|
||||
def _detect_family(config: dict) -> tuple[str, list[str], list[str]]:
    """Guess a model's family from the contents of its config.json.

    Args:
        config: parsed config.json. Only ``architectures``, ``model_type``, the
            expert-count keys and the vision marker keys are consulted.

    Returns:
        ``(family_label, vllm_flags, capabilities)``. ``vllm_flags`` holds only
        family-specific, non-knob flags (parsers, MoE backend); generic knob
        defaults are the caller's job. The guess is best-effort and confirmed
        by the operator, so a wrong guess is cheap.
    """
    architectures = config.get("architectures") or []
    model_type = str(config.get("model_type") or "")
    # One lower-cased haystack so every marker test below is case-insensitive.
    haystack = f"{' '.join(architectures)} {model_type}".lower()

    expert_keys = ("num_experts", "n_routed_experts", "num_local_experts")
    has_experts = "moe" in haystack or any(config.get(k) for k in expert_keys)

    vision_markers = ("conditionalgeneration", "vision", "vlforcausallm")
    vision_keys = ("vision_config", "image_token_index")
    sees_images = any(marker in haystack for marker in vision_markers) or any(
        key in config for key in vision_keys
    )

    if "qwen3" in haystack:
        family = "Qwen3 (MoE)" if has_experts else "Qwen3"
        flags = ["--reasoning-parser=qwen3"]
        capabilities = ["reasoning"]
        if has_experts:
            flags.append("--moe_backend=flashinfer_cutlass")
    elif "gemma" in haystack:
        family = "Gemma (MoE)" if has_experts else "Gemma"
        flags = [
            "--reasoning-parser=gemma4",
            "--tool-call-parser=gemma4",
            "--enable-auto-tool-choice",
        ]
        capabilities = ["reasoning", "tools"]
        if has_experts:
            # The fast flashinfer/CUTLASS FP4 path errors on GB10 for Gemma MoE;
            # marlin is the working fallback (see the Gemma 26B trial notes).
            flags.append("--moe_backend=marlin")
    else:
        family = "Generic"
        flags = []
        capabilities = []

    if sees_images:
        capabilities.append("vision")
    return family, flags, capabilities
|
||||
|
||||
|
||||
def _infer_mode(total_bytes: int, on_host_count: int) -> str:
    """Choose the launch topology for a model.

    Returns "solo" only when the weights sit on fewer than two Sparks and do
    not exceed `SINGLE_SPARK_BYTES`; in every other case returns "cluster".
    """
    if on_host_count < 2 and total_bytes <= SINGLE_SPARK_BYTES:
        return "solo"
    return "cluster"
|
||||
|
||||
|
||||
def infer_recipe(repo: str, config: dict, total_bytes: int, on_host_count: int) -> dict:
    """Build the proposed launch recipe that prefills the setup form.

    Args:
        repo: repo id of the discovered model; also the source of the key
            (via `repo_to_key`) and of the display name (last path segment).
        config: the model's parsed config.json; a falsy value is treated as
            an empty config.
        total_bytes: weight size passed to `_infer_mode`.
        on_host_count: number of hosts holding the model, passed to
            `_infer_mode`.

    Returns:
        A dict with ``key``, ``repo``, ``display_name``, ``mode``,
        ``capabilities``, ``vllm_args``, ``knobs`` and ``family``.
    """
    family, family_flags, capabilities = _detect_family(config or {})
    mode = _infer_mode(total_bytes, on_host_count)
    clustered = mode == "cluster"

    args = [*family_flags, "--max-num-batched-tokens=16384"]
    knobs = {**_COMMON_KNOBS}
    if clustered:
        # Large models shard across both Sparks via Ray; leave more headroom.
        args.extend(["-tp=2", "--distributed-executor-backend=ray"])
        knobs["gpu_memory_utilization"] = 0.7

    short_name = repo.rsplit("/", 1)[-1]
    return {
        "key": repo_to_key(repo),
        "repo": repo,
        "display_name": short_name,
        "mode": mode,
        "capabilities": capabilities,
        "vllm_args": args,
        "knobs": knobs,
        "family": family,
    }
|
||||
|
||||
|
||||
def _menu_entry_from_recipe(m, *, on_disk: bool, total_bytes: int, per_host: list[dict]) -> dict:
    """Turn a catalog recipe into a menu entry dict.

    Args:
        m: recipe object exposing ``model_dump()``, ``vllm_args`` and ``knobs``
            (presumably a pydantic model — confirm against the catalog types).
        on_disk: stored verbatim under ``on_disk``.
        total_bytes: stored verbatim under ``total_bytes``.
        per_host: stored verbatim under ``per_host``.

    Returns:
        The recipe's dumped fields plus ``effective_knobs``, ``needs_setup``
        (always False here), ``on_disk``, ``total_bytes`` and ``per_host``.
    """
    entry = m.model_dump()
    knobs_from_args = extract_knobs_from_args(m.vllm_args)
    explicit_knobs = m.knobs or {}
    entry.update(
        # Explicit recipe knobs win over values parsed out of vllm_args.
        effective_knobs={**knobs_from_args, **explicit_knobs},
        needs_setup=False,
        on_disk=on_disk,
        total_bytes=total_bytes,
        per_host=per_host,
    )
    return entry
|
||||
|
||||
|
||||
async def build_menu(settings: Settings, catalog) -> dict[str, dict]:
    """Build the disk-driven model menu.

    Every completed model found on the Sparks becomes a menu entry, annotated
    with its launch recipe (matched by repo) or flagged `needs_setup` if no
    recipe matches. Recipes that carry a `local_path` are probed individually
    afterwards and added when present on disk.

    One cache scan per Spark, run in parallel — much cheaper than the old
    per-recipe disk probe. A scan that raises (or is cancelled) is logged and
    that host is skipped; it is not fatal.

    Args:
        settings: supplies `spark1_host`/`spark1_user` and the optional
            `spark2_host`/`spark2_user`; also passed through to the disk helpers.
        catalog: object whose `models` mapping holds the launch recipes by key.

    Returns:
        Mapping of menu key -> menu entry dict.
    """
    import logging  # function-scope import keeps this block self-contained

    log = logging.getLogger(__name__)

    hosts = [(settings.spark1_host, settings.spark1_user)]
    if settings.spark2_host:
        hosts.append((settings.spark2_host, settings.spark2_user))

    scans = await asyncio.gather(
        *(list_cached_models(host, user, settings) for host, user in hosts),
        return_exceptions=True,
    )

    by_repo: dict[str, dict] = {}
    for (host, _user), result in zip(hosts, scans):
        # gather(return_exceptions=True) returns whatever was raised, including
        # asyncio.CancelledError, which is a BaseException but NOT an Exception
        # on Python 3.8+. Checking only `Exception` would let it fall through
        # and blow up in the unpacking loop below.
        if isinstance(result, BaseException):
            log.warning("model cache scan failed on %s; skipping host: %r", host, result)
            continue
        for repo, size, complete in result:
            info = by_repo.setdefault(repo, {"total_bytes": 0, "per_host": [], "complete": False})
            info["total_bytes"] += size
            info["per_host"].append({"host": host, "size_bytes": size})
            # Complete on any one host is enough to show the model.
            info["complete"] = info["complete"] or complete

    recipe_by_repo = {m.repo: (k, m) for k, m in catalog.models.items() if m.repo}

    menu: dict[str, dict] = {}
    for repo, info in by_repo.items():
        # Skip half-fetched / corrupt caches (no finished snapshot) — they'd show
        # as broken cards. In-flight downloads surface in the download panel.
        if not info["complete"]:
            continue
        if repo in recipe_by_repo:
            key, m = recipe_by_repo[repo]
            menu[key] = _menu_entry_from_recipe(
                m, on_disk=True, total_bytes=info["total_bytes"], per_host=info["per_host"]
            )
        else:
            key = repo_to_key(repo)
            menu[key] = {
                "display_name": repo.split("/")[-1],
                "repo": repo,
                "local_path": None,
                "size_gb": round(info["total_bytes"] / 1e9, 1),
                "mode": _infer_mode(info["total_bytes"], len(info["per_host"])),
                "capabilities": [],
                "expected_ready_seconds": 300,
                "vllm_args": [],
                "description": None,
                "knobs": None,
                "custom": False,
                "needs_setup": True,
                "effective_knobs": {},
                "on_disk": True,
                "total_bytes": info["total_bytes"],
                "per_host": info["per_host"],
            }

    # Local/fine-tuned recipes live as a directory, not an HF cache entry — probe
    # each by path and include it if present. Their keys are unique catalog keys
    # (and local models carry repo="" per ModelDef), so they never collide with a
    # discovered repo's slug or an HF recipe key above.
    for key, m in catalog.models.items():
        if not m.local_path:
            continue
        st = await probe_disk(m.repo, m.mode, settings, local_path=m.local_path)
        if not st.on_disk:
            continue
        menu[key] = _menu_entry_from_recipe(
            m,
            on_disk=True,
            total_bytes=st.total_bytes,
            per_host=[{"host": r.host, "size_bytes": r.size_bytes} for r in st.per_host if r.on_disk],
        )

    return menu
|
||||
Reference in New Issue
Block a user