df9f244eae
The dashboard menu is now the set of models actually downloaded on the Sparks, not a hard-coded catalog. models.yaml + overrides are reframed as launch recipes matched to an on-disk model by repo; an on-disk model with no recipe is flagged needs_setup and its launch settings are inferred from its config.json for a one-time operator confirmation (discovery.py). - delete now removes weights AND the menu card (delete_from_disk sweeps all hosts; the delete endpoint resolves keys via the live menu) - new GET /api/models/suggest; /api/models returns the menu + a recipes list (download autocomplete); GET /api/models/disk-status removed - dropped the two legacy Qwen recipes (235B FP8, 2.5 72B) - tests: +test_discovery.py (cache parsing, infer_recipe, build_menu merge)
210 lines
8.5 KiB
Python
210 lines
8.5 KiB
Python
"""Disk-driven model menu + launch-recipe inference.
|
|
|
|
The dashboard's model list is whatever is actually downloaded on the Sparks
|
|
(see `disk.list_cached_models`), NOT a hard-coded catalog. The bundled/overridden
|
|
catalog entries are *launch recipes*: matched to an on-disk model by repo, they
|
|
say HOW to launch it. A completed model on disk with no matching recipe shows up
|
|
as `needs_setup` — the first switch reads its `config.json`, proposes a recipe
|
|
(`infer_recipe`) the operator confirms once, and that confirmed recipe is saved
|
|
to /data so it's a normal card from then on.
|
|
|
|
Why a recipe layer at all, if the menu is the disk? Because a folder on disk
|
|
doesn't say how to launch it: the per-family parsers (`--reasoning-parser`,
|
|
`--tool-call-parser`), the MoE backend (some Gemma MoE checkpoints need
|
|
`marlin` on GB10), and solo-vs-cluster topology can't be read off a directory.
|
|
We infer a best guess from the model's own config + size, but the operator
|
|
confirms it — a wrong guess is cheap, a wrong launch is not.
|
|
"""
|
|
from __future__ import annotations

import asyncio
import logging
import re

from .config import Settings
from .disk import list_cached_models, probe_disk
from .overrides import extract_knobs_from_args
|
|
|
|
|
|
# Weight-size cutoff (decimal bytes) for the solo-vs-cluster guess. Weights
# larger than this can't sit on one Spark's 128 GB next to a KV cache, so the
# model has to shard across both Sparks via Ray. It only prefills the setup
# form — the operator confirms the mode — so the exact value isn't critical.
SINGLE_SPARK_BYTES = 115 * 10**9
|
|
|
|
# Generic knob defaults applied to every inferred recipe (the operator can tweak
|
|
# these in the setup form). Family-specific flags (parsers, MoE backend) are
|
|
# layered on separately by `_detect_family`.
|
|
_COMMON_KNOBS = {
|
|
"max_model_len": 32768,
|
|
"gpu_memory_utilization": 0.85,
|
|
"fastsafetensors": True,
|
|
"prefix_caching": True,
|
|
"kv_cache_dtype": "fp8",
|
|
}
|
|
|
|
|
|
def repo_to_key(repo: str) -> str:
    """Derive a stable, URL-safe menu key from a repo id that has no recipe key.

    The repo is lowercased, every run of characters outside ``[a-z0-9_-]`` is
    collapsed to a single ``-``, and leading/trailing dashes are trimmed, e.g.
    'RedHatAI/Qwen3.6-35B-A3B-NVFP4' -> 'redhatai-qwen3-6-35b-a3b-nvfp4'.

    The menu, the setup form, and `_identify_current_model` all use this same
    slug, so a loaded-but-unconfigured model still highlights as active.
    """
    lowered = repo.lower()
    slug = re.sub(r"[^a-z0-9_-]+", "-", lowered)
    return slug.strip("-")
|
|
|
|
|
|
def _detect_family(config: dict) -> tuple[str, list[str], list[str]]:
|
|
"""Return (family_label, vllm_flags, capabilities) inferred from config.json.
|
|
|
|
Only family-specific, non-knob flags (parsers, MoE backend) go in vllm_flags;
|
|
generic knob defaults are handled by the caller. Best-effort and operator-
|
|
confirmed, so a wrong guess is cheap."""
|
|
arch = " ".join(config.get("architectures") or [])
|
|
mtype = str(config.get("model_type") or "")
|
|
s = (arch + " " + mtype).lower()
|
|
is_moe = (
|
|
"moe" in s
|
|
or any(config.get(k) for k in ("num_experts", "n_routed_experts", "num_local_experts"))
|
|
)
|
|
is_vision = (
|
|
"conditionalgeneration" in s
|
|
or "vision" in s
|
|
or "vlforcausallm" in s
|
|
or "vision_config" in config
|
|
or "image_token_index" in config
|
|
)
|
|
flags: list[str] = []
|
|
caps: list[str] = []
|
|
label = "Generic"
|
|
if mtype.startswith("qwen3") or "qwen3" in s:
|
|
label = "Qwen3 (MoE)" if is_moe else "Qwen3"
|
|
flags.append("--reasoning-parser=qwen3")
|
|
caps.append("reasoning")
|
|
if is_moe:
|
|
flags.append("--moe_backend=flashinfer_cutlass")
|
|
elif "gemma" in s:
|
|
label = "Gemma (MoE)" if is_moe else "Gemma"
|
|
flags += ["--reasoning-parser=gemma4", "--tool-call-parser=gemma4", "--enable-auto-tool-choice"]
|
|
caps += ["reasoning", "tools"]
|
|
if is_moe:
|
|
# The fast flashinfer/CUTLASS FP4 path errors on GB10 for Gemma MoE;
|
|
# marlin is the working fallback (see the Gemma 26B trial notes).
|
|
flags.append("--moe_backend=marlin")
|
|
if is_vision and "vision" not in caps:
|
|
caps.append("vision")
|
|
return label, flags, caps
|
|
|
|
|
|
def _infer_mode(total_bytes: int, on_host_count: int) -> str:
|
|
"""Solo unless the weights are present on both Sparks or too big for one."""
|
|
if on_host_count >= 2 or total_bytes > SINGLE_SPARK_BYTES:
|
|
return "cluster"
|
|
return "solo"
|
|
|
|
|
|
def infer_recipe(repo: str, config: dict, total_bytes: int, on_host_count: int) -> dict:
    """Build the proposed launch recipe that prefills the setup form.

    Args:
        repo: Repo id of the discovered model; also the source of the key and
            the display name (its last ``/``-separated segment).
        config: Parsed ``config.json``; a falsy value is treated as ``{}``.
        total_bytes: Weight size passed to the solo/cluster guess.
        on_host_count: Number of hosts holding the weights.

    Returns:
        A dict with ``key``, ``repo``, ``display_name``, ``mode``,
        ``capabilities``, ``vllm_args``, ``knobs`` and ``family``.
    """
    family, family_flags, capabilities = _detect_family(config or {})
    mode = _infer_mode(total_bytes, on_host_count)

    vllm_args = [*family_flags, "--max-num-batched-tokens=16384"]
    knobs = {**_COMMON_KNOBS}
    if mode == "cluster":
        # Large models shard across both Sparks via Ray; leave more headroom.
        vllm_args.extend(["-tp=2", "--distributed-executor-backend=ray"])
        knobs["gpu_memory_utilization"] = 0.7

    recipe = {
        "key": repo_to_key(repo),
        "repo": repo,
        "display_name": repo.split("/")[-1],
        "mode": mode,
        "capabilities": capabilities,
        "vllm_args": vllm_args,
        "knobs": knobs,
        "family": family,
    }
    return recipe
|
|
|
|
|
|
def _menu_entry_from_recipe(m, *, on_disk: bool, total_bytes: int, per_host: list[dict]) -> dict:
    """Turn a catalog recipe into a menu card annotated with disk facts.

    Starts from ``m.model_dump()`` and adds ``effective_knobs`` (knobs parsed
    out of ``m.vllm_args``, overridden by the recipe's explicit ``knobs``),
    ``needs_setup=False``, and the supplied ``on_disk`` / ``total_bytes`` /
    ``per_host`` values.
    """
    entry = m.model_dump()
    parsed_knobs = extract_knobs_from_args(m.vllm_args)
    explicit_knobs = m.knobs or {}
    entry.update(
        # Explicit knobs are merged last, so they win over args-derived ones.
        effective_knobs={**parsed_knobs, **explicit_knobs},
        needs_setup=False,
        on_disk=on_disk,
        total_bytes=total_bytes,
        per_host=per_host,
    )
    return entry
|
|
|
|
|
|
async def build_menu(settings: Settings, catalog) -> dict[str, dict]:
    """Build the disk-driven model menu.

    Every completed model found in the Sparks' caches becomes a menu entry,
    annotated with its launch recipe (matched by repo) or flagged
    ``needs_setup`` when no recipe matches. Catalog recipes that point at a
    ``local_path`` are probed separately and included when present on disk.

    The cache scans (one per configured Spark) run in parallel. A host whose
    scan fails or is cancelled is logged and skipped, not fatal.

    Args:
        settings: Supplies the Spark host/user pairs; ``spark2_host`` is
            optional and only scanned when set.
        catalog: Object whose ``models`` mapping holds the launch recipes.

    Returns:
        Mapping of menu key -> menu entry dict.
    """
    hosts = [(settings.spark1_host, settings.spark1_user)]
    if settings.spark2_host:
        hosts.append((settings.spark2_host, settings.spark2_user))
    scans = await asyncio.gather(
        *(list_cached_models(h, u, settings) for h, u in hosts),
        return_exceptions=True,
    )

    by_repo: dict[str, dict] = {}
    for (h, _u), res in zip(hosts, scans):
        # Check BaseException, not Exception: with return_exceptions=True a
        # cancelled child scan comes back as asyncio.CancelledError, which is
        # not an Exception subclass and would otherwise crash the unpacking
        # loop below with a TypeError.
        if isinstance(res, BaseException):
            logging.getLogger(__name__).warning(
                "model cache scan failed on %s; skipping host: %r", h, res
            )
            continue
        for repo, size, complete in res:
            e = by_repo.setdefault(repo, {"total_bytes": 0, "per_host": [], "complete": False})
            e["total_bytes"] += size
            e["per_host"].append({"host": h, "size_bytes": size})
            # One finished copy on any host is enough to count as complete.
            e["complete"] = e["complete"] or complete

    # If several catalog entries share a repo, the last one iterated wins.
    recipe_by_repo = {m.repo: (k, m) for k, m in catalog.models.items() if m.repo}

    menu: dict[str, dict] = {}
    for repo, info in by_repo.items():
        # Skip half-fetched / corrupt caches (no finished snapshot) — they'd show
        # as broken cards. In-flight downloads surface in the download panel.
        if not info["complete"]:
            continue
        if repo in recipe_by_repo:
            key, m = recipe_by_repo[repo]
            menu[key] = _menu_entry_from_recipe(
                m, on_disk=True, total_bytes=info["total_bytes"], per_host=info["per_host"]
            )
        else:
            # No recipe yet: emit a placeholder card with the same fields as a
            # recipe entry, flagged for one-time operator setup.
            key = repo_to_key(repo)
            menu[key] = {
                "display_name": repo.split("/")[-1],
                "repo": repo,
                "local_path": None,
                "size_gb": round(info["total_bytes"] / 1e9, 1),
                "mode": _infer_mode(info["total_bytes"], len(info["per_host"])),
                "capabilities": [],
                "expected_ready_seconds": 300,
                "vllm_args": [],
                "description": None,
                "knobs": None,
                "custom": False,
                "needs_setup": True,
                "effective_knobs": {},
                "on_disk": True,
                "total_bytes": info["total_bytes"],
                "per_host": info["per_host"],
            }

    # Local/fine-tuned recipes live as a directory, not an HF cache entry — probe
    # each by path and include it if present. Their keys are unique catalog keys
    # (and local models carry repo="" per ModelDef), so they never collide with a
    # discovered repo's slug or an HF recipe key above.
    for key, m in catalog.models.items():
        if not m.local_path:
            continue
        st = await probe_disk(m.repo, m.mode, settings, local_path=m.local_path)
        if not st.on_disk:
            continue
        menu[key] = _menu_entry_from_recipe(
            m,
            on_disk=True,
            total_bytes=st.total_bytes,
            per_host=[{"host": r.host, "size_bytes": r.size_bytes} for r in st.per_host if r.on_disk],
        )

    return menu
|