"""Disk-driven model menu + launch-recipe inference. The dashboard's model list is whatever is actually downloaded on the Sparks (see `disk.list_cached_models`), NOT a hard-coded catalog. The bundled/overridden catalog entries are *launch recipes*: matched to an on-disk model by repo, they say HOW to launch it. A completed model on disk with no matching recipe shows up as `needs_setup` — the first switch reads its `config.json`, proposes a recipe (`infer_recipe`) the operator confirms once, and that confirmed recipe is saved to /data so it's a normal card from then on. Why a recipe layer at all, if the menu is the disk? Because a folder on disk doesn't say how to launch it: the per-family parsers (`--reasoning-parser`, `--tool-call-parser`), the MoE backend (some Gemma MoE checkpoints need `marlin` on GB10), and solo-vs-cluster topology can't be read off a directory. We infer a best guess from the model's own config + size, but the operator confirms it — a wrong guess is cheap, a wrong launch is not. """ from __future__ import annotations import asyncio import re from .config import Settings from .disk import list_cached_models, probe_disk from .overrides import extract_knobs_from_args # A model whose weights exceed this can't fit one Spark's 128 GB beside a KV # cache, so it must shard across both via Ray. A heuristic prefill only — the # operator confirms mode in the setup form, so the exact cutoff isn't critical. SINGLE_SPARK_BYTES = 115 * 1000 ** 3 # Generic knob defaults applied to every inferred recipe (the operator can tweak # these in the setup form). Family-specific flags (parsers, MoE backend) are # layered on separately by `_detect_family`. _COMMON_KNOBS = { "max_model_len": 32768, "gpu_memory_utilization": 0.85, "fastsafetensors": True, "prefix_caching": True, "kv_cache_dtype": "fp8", } def repo_to_key(repo: str) -> str: """Stable, URL-safe menu key for a discovered model with no recipe key yet. 'RedHatAI/Qwen3.6-35B-A3B-NVFP4' -> 'redhatai-qwen3-6-35b-a3b-nvfp4'. The same slug is used by the menu, the setup form, and `_identify_current_model`, so a loaded-but-unconfigured model still highlights as active.""" return re.sub(r"[^a-z0-9_-]+", "-", repo.lower()).strip("-") def _detect_family(config: dict) -> tuple[str, list[str], list[str]]: """Return (family_label, vllm_flags, capabilities) inferred from config.json. Only family-specific, non-knob flags (parsers, MoE backend) go in vllm_flags; generic knob defaults are handled by the caller. Best-effort and operator- confirmed, so a wrong guess is cheap.""" arch = " ".join(config.get("architectures") or []) mtype = str(config.get("model_type") or "") s = (arch + " " + mtype).lower() is_moe = ( "moe" in s or any(config.get(k) for k in ("num_experts", "n_routed_experts", "num_local_experts")) ) is_vision = ( "conditionalgeneration" in s or "vision" in s or "vlforcausallm" in s or "vision_config" in config or "image_token_index" in config ) flags: list[str] = [] caps: list[str] = [] label = "Generic" if mtype.startswith("qwen3") or "qwen3" in s: label = "Qwen3 (MoE)" if is_moe else "Qwen3" flags.append("--reasoning-parser=qwen3") caps.append("reasoning") if is_moe: flags.append("--moe_backend=flashinfer_cutlass") elif "gemma" in s: label = "Gemma (MoE)" if is_moe else "Gemma" flags += ["--reasoning-parser=gemma4", "--tool-call-parser=gemma4", "--enable-auto-tool-choice"] caps += ["reasoning", "tools"] if is_moe: # The fast flashinfer/CUTLASS FP4 path errors on GB10 for Gemma MoE; # marlin is the working fallback (see the Gemma 26B trial notes). flags.append("--moe_backend=marlin") if is_vision and "vision" not in caps: caps.append("vision") return label, flags, caps def _infer_mode(total_bytes: int, on_host_count: int) -> str: """Solo unless the weights are present on both Sparks or too big for one.""" if on_host_count >= 2 or total_bytes > SINGLE_SPARK_BYTES: return "cluster" return "solo" def infer_recipe(repo: str, config: dict, total_bytes: int, on_host_count: int) -> dict: """Propose a launch recipe for a discovered model — prefills the setup form.""" label, flags, caps = _detect_family(config or {}) mode = _infer_mode(total_bytes, on_host_count) vllm_args = list(flags) vllm_args.append("--max-num-batched-tokens=16384") knobs = dict(_COMMON_KNOBS) if mode == "cluster": # Large models shard across both Sparks via Ray; leave more headroom. vllm_args += ["-tp=2", "--distributed-executor-backend=ray"] knobs["gpu_memory_utilization"] = 0.7 return { "key": repo_to_key(repo), "repo": repo, "display_name": repo.split("/")[-1], "mode": mode, "capabilities": caps, "vllm_args": vllm_args, "knobs": knobs, "family": label, } def _menu_entry_from_recipe(m, *, on_disk: bool, total_bytes: int, per_host: list[dict]) -> dict: d = m.model_dump() d["effective_knobs"] = {**extract_knobs_from_args(m.vllm_args), **(m.knobs or {})} d["needs_setup"] = False d["on_disk"] = on_disk d["total_bytes"] = total_bytes d["per_host"] = per_host return d async def build_menu(settings: Settings, catalog) -> dict[str, dict]: """The disk-driven model menu: every completed model on the Sparks, annotated with its launch recipe (matched by repo) or flagged `needs_setup` if none. Two SSH scans total (one per Spark), run in parallel — much cheaper than the old per-recipe disk probe. A host that errors is skipped, not fatal.""" hosts = [(settings.spark1_host, settings.spark1_user)] if settings.spark2_host: hosts.append((settings.spark2_host, settings.spark2_user)) scans = await asyncio.gather( *(list_cached_models(h, u, settings) for h, u in hosts), return_exceptions=True, ) by_repo: dict[str, dict] = {} for (h, _u), res in zip(hosts, scans): if isinstance(res, Exception): continue for repo, size, complete in res: e = by_repo.setdefault(repo, {"total_bytes": 0, "per_host": [], "complete": False}) e["total_bytes"] += size e["per_host"].append({"host": h, "size_bytes": size}) e["complete"] = e["complete"] or complete recipe_by_repo = {m.repo: (k, m) for k, m in catalog.models.items() if m.repo} menu: dict[str, dict] = {} for repo, info in by_repo.items(): # Skip half-fetched / corrupt caches (no finished snapshot) — they'd show # as broken cards. In-flight downloads surface in the download panel. if not info["complete"]: continue if repo in recipe_by_repo: key, m = recipe_by_repo[repo] menu[key] = _menu_entry_from_recipe( m, on_disk=True, total_bytes=info["total_bytes"], per_host=info["per_host"] ) else: key = repo_to_key(repo) menu[key] = { "display_name": repo.split("/")[-1], "repo": repo, "local_path": None, "size_gb": round(info["total_bytes"] / 1e9, 1), "mode": _infer_mode(info["total_bytes"], len(info["per_host"])), "capabilities": [], "expected_ready_seconds": 300, "vllm_args": [], "description": None, "knobs": None, "custom": False, "needs_setup": True, "effective_knobs": {}, "on_disk": True, "total_bytes": info["total_bytes"], "per_host": info["per_host"], } # Local/fine-tuned recipes live as a directory, not an HF cache entry — probe # each by path and include it if present. Their keys are unique catalog keys # (and local models carry repo="" per ModelDef), so they never collide with a # discovered repo's slug or an HF recipe key above. for key, m in catalog.models.items(): if not m.local_path: continue st = await probe_disk(m.repo, m.mode, settings, local_path=m.local_path) if not st.on_disk: continue menu[key] = _menu_entry_from_recipe( m, on_disk=True, total_bytes=st.total_bytes, per_host=[{"host": r.host, "size_bytes": r.size_bytes} for r in st.per_host if r.on_disk], ) return menu