spark-control/image/app/discovery.py

"""Disk-driven model menu + launch-recipe inference.

The dashboard's model list is whatever is actually downloaded on the Sparks
(see `disk.list_cached_models`), NOT a hard-coded catalog. The bundled/overridden
catalog entries are *launch recipes*: matched to an on-disk model by repo, they
say HOW to launch it. A completed model on disk with no matching recipe shows up
as `needs_setup` — the first switch reads its `config.json`, proposes a recipe
(`infer_recipe`) the operator confirms once, and that confirmed recipe is saved
to /data so it's a normal card from then on.

Why a recipe layer at all, if the menu is the disk? Because a folder on disk
doesn't say how to launch it: the per-family parsers (`--reasoning-parser`,
`--tool-call-parser`), the MoE backend (some Gemma MoE checkpoints need
`marlin` on GB10), and solo-vs-cluster topology can't be read off a directory.
We infer a best guess from the model's own config + size, but the operator
confirms it — a wrong guess is cheap, a wrong launch is not.
"""
from __future__ import annotations
import asyncio
import re

from .config import Settings
from .disk import list_cached_models, probe_disk
from .overrides import extract_knobs_from_args


# A model whose weights exceed this can't fit one Spark's 128 GB beside a KV
# cache, so it must shard across both via Ray. A heuristic prefill only — the
# operator confirms mode in the setup form, so the exact cutoff isn't critical.
SINGLE_SPARK_BYTES = 115 * 1000 ** 3

# Generic knob defaults applied to every inferred recipe (the operator can tweak
# these in the setup form). Family-specific flags (parsers, MoE backend) are
# layered on separately by `_detect_family`.
_COMMON_KNOBS = {
    "max_model_len": 32768,
    "gpu_memory_utilization": 0.85,
    "fastsafetensors": True,
    "prefix_caching": True,
    "kv_cache_dtype": "fp8",
}


def repo_to_key(repo: str) -> str:
    """Derive the stable, URL-safe menu key for a model that has no recipe key.

    The repo id is lower-cased, every run of characters outside ``a-z``,
    ``0-9``, ``_`` and ``-`` collapses into a single ``-``, and leading or
    trailing dashes are trimmed:

        'RedHatAI/Qwen3.6-35B-A3B-NVFP4' -> 'redhatai-qwen3-6-35b-a3b-nvfp4'

    The menu, the setup form, and `_identify_current_model` all rely on this
    one slug, which is what lets a loaded-but-unconfigured model still
    highlight as active.
    """
    lowered = repo.lower()
    slug = re.sub(r"[^a-z0-9_-]+", "-", lowered)
    return slug.strip("-")


def _detect_family(config: dict) -> tuple[str, list[str], list[str]]:
    """Return (family_label, vllm_flags, capabilities) inferred from config.json.

    Only family-specific, non-knob flags (parsers, MoE backend) go in vllm_flags;
    generic knob defaults are handled by the caller. Best-effort and operator-
    confirmed, so a wrong guess is cheap."""
    arch = " ".join(config.get("architectures") or [])
    mtype = str(config.get("model_type") or "")
    s = (arch + " " + mtype).lower()
    is_moe = (
        "moe" in s
        or any(config.get(k) for k in ("num_experts", "n_routed_experts", "num_local_experts"))
    )
    is_vision = (
        "conditionalgeneration" in s
        or "vision" in s
        or "vlforcausallm" in s
        or "vision_config" in config
        or "image_token_index" in config
    )
    flags: list[str] = []
    caps: list[str] = []
    label = "Generic"
    if mtype.startswith("qwen3") or "qwen3" in s:
        label = "Qwen3 (MoE)" if is_moe else "Qwen3"
        flags.append("--reasoning-parser=qwen3")
        caps.append("reasoning")
        if is_moe:
            flags.append("--moe_backend=flashinfer_cutlass")
    elif "gemma" in s:
        label = "Gemma (MoE)" if is_moe else "Gemma"
        flags += ["--reasoning-parser=gemma4", "--tool-call-parser=gemma4", "--enable-auto-tool-choice"]
        caps += ["reasoning", "tools"]
        if is_moe:
            # The fast flashinfer/CUTLASS FP4 path errors on GB10 for Gemma MoE;
            # marlin is the working fallback (see the Gemma 26B trial notes).
            flags.append("--moe_backend=marlin")
    if is_vision and "vision" not in caps:
        caps.append("vision")
    return label, flags, caps


def _infer_mode(total_bytes: int, on_host_count: int) -> str:
    """Solo unless the weights are present on both Sparks or too big for one."""
    if on_host_count >= 2 or total_bytes > SINGLE_SPARK_BYTES:
        return "cluster"
    return "solo"


def infer_recipe(repo: str, config: dict, total_bytes: int, on_host_count: int) -> dict:
    """Build the proposed launch recipe that prefills the setup form.

    Args:
        repo: Repo id of the discovered model, e.g. ``org/name``.
        config: Parsed ``config.json``; a falsy value is treated as ``{}``.
        total_bytes: Size of the weights, used for the solo/cluster guess.
        on_host_count: Number of hosts the model was found on.

    Returns:
        A dict with ``key``, ``repo``, ``display_name``, ``mode``,
        ``capabilities``, ``vllm_args``, ``knobs`` and ``family``. ``knobs`` is
        a fresh copy of the common defaults, so callers may mutate it freely.
    """
    family, family_flags, capabilities = _detect_family(config or {})
    mode = _infer_mode(total_bytes, on_host_count)

    vllm_args = [*family_flags, "--max-num-batched-tokens=16384"]
    knobs = {**_COMMON_KNOBS}
    if mode == "cluster":
        # Large models shard across both Sparks via Ray; leave more headroom.
        vllm_args.extend(["-tp=2", "--distributed-executor-backend=ray"])
        knobs["gpu_memory_utilization"] = 0.7

    recipe = {"key": repo_to_key(repo), "repo": repo}
    # Display name is whatever follows the last "/" (the whole id if none).
    recipe["display_name"] = repo.rpartition("/")[2]
    recipe["mode"] = mode
    recipe["capabilities"] = capabilities
    recipe["vllm_args"] = vllm_args
    recipe["knobs"] = knobs
    recipe["family"] = family
    return recipe


def _menu_entry_from_recipe(m, *, on_disk: bool, total_bytes: int, per_host: list[dict]) -> dict:
    """Turn a catalog recipe into a menu entry annotated with disk state.

    Starts from ``m.model_dump()`` and adds:

    * ``effective_knobs`` — knobs parsed out of ``m.vllm_args``, overridden by
      the recipe's explicit ``m.knobs`` where both define a key;
    * ``needs_setup`` — always ``False``, since a recipe already exists;
    * ``on_disk``, ``total_bytes``, ``per_host`` — passed through unchanged.
    """
    entry = m.model_dump()
    knobs_from_args = extract_knobs_from_args(m.vllm_args)
    explicit_knobs = m.knobs or {}
    entry.update(
        # Later unpacking wins, so explicit knobs take precedence.
        effective_knobs={**knobs_from_args, **explicit_knobs},
        needs_setup=False,
        on_disk=on_disk,
        total_bytes=total_bytes,
        per_host=per_host,
    )
    return entry


async def build_menu(settings: Settings, catalog) -> dict[str, dict]:
    """The disk-driven model menu: every completed model on the Sparks, annotated
    with its launch recipe (matched by repo) or flagged `needs_setup` if none.

    The HF-cache scan is one `list_cached_models` call per Spark, run in
    parallel — much cheaper than the old per-recipe disk probe. A host whose
    scan fails (or is cancelled) is logged and skipped, not fatal. Recipes with
    a `local_path` are then probed one at a time with `probe_disk`; an
    exception from that probe is NOT caught here and propagates to the caller.

    Args:
        settings: Provides `spark1_host`/`spark1_user` and the optional
            `spark2_host`/`spark2_user`; also passed through to the disk helpers.
        catalog: Object whose `.models` maps recipe key -> recipe (with `repo`,
            `local_path`, `mode`, `vllm_args`, `knobs`, `model_dump()`).

    Returns:
        Mapping of menu key -> menu entry dict.
    """
    # Function-scope import: this module has no file-level `logging` import.
    import logging

    log = logging.getLogger(__name__)

    hosts = [(settings.spark1_host, settings.spark1_user)]
    if settings.spark2_host:
        hosts.append((settings.spark2_host, settings.spark2_user))
    scans = await asyncio.gather(
        *(list_cached_models(h, u, settings) for h, u in hosts),
        return_exceptions=True,
    )
    by_repo: dict[str, dict] = {}
    for (h, _u), res in zip(hosts, scans):
        # BaseException rather than Exception: with return_exceptions=True a
        # cancelled scan is returned as asyncio.CancelledError, which does not
        # derive from Exception and would otherwise reach the unpacking loop
        # below and blow up with a TypeError.
        if isinstance(res, BaseException):
            log.warning("model scan failed on %s; skipping host: %r", h, res)
            continue
        for repo, size, complete in res:
            e = by_repo.setdefault(repo, {"total_bytes": 0, "per_host": [], "complete": False})
            e["total_bytes"] += size
            e["per_host"].append({"host": h, "size_bytes": size})
            # Complete on any one host is enough for the model to be listed.
            e["complete"] = e["complete"] or complete

    # Recipes without a repo (local-path models) are handled further down.
    recipe_by_repo = {m.repo: (k, m) for k, m in catalog.models.items() if m.repo}

    menu: dict[str, dict] = {}
    for repo, info in by_repo.items():
        # Skip half-fetched / corrupt caches (no finished snapshot) — they'd show
        # as broken cards. In-flight downloads surface in the download panel.
        if not info["complete"]:
            continue
        if repo in recipe_by_repo:
            key, m = recipe_by_repo[repo]
            menu[key] = _menu_entry_from_recipe(
                m, on_disk=True, total_bytes=info["total_bytes"], per_host=info["per_host"]
            )
        else:
            # No recipe yet: emit a placeholder card under the repo slug, with
            # only the mode prefilled, so the operator can run setup once.
            key = repo_to_key(repo)
            menu[key] = {
                "display_name": repo.split("/")[-1],
                "repo": repo,
                "local_path": None,
                "size_gb": round(info["total_bytes"] / 1e9, 1),
                "mode": _infer_mode(info["total_bytes"], len(info["per_host"])),
                "capabilities": [],
                "expected_ready_seconds": 300,
                "vllm_args": [],
                "description": None,
                "knobs": None,
                "custom": False,
                "needs_setup": True,
                "effective_knobs": {},
                "on_disk": True,
                "total_bytes": info["total_bytes"],
                "per_host": info["per_host"],
            }

    # Local/fine-tuned recipes live as a directory, not an HF cache entry — probe
    # each by path and include it if present. Their keys are unique catalog keys
    # (and local models carry repo="" per ModelDef), so they never collide with a
    # discovered repo's slug or an HF recipe key above.
    for key, m in catalog.models.items():
        if not m.local_path:
            continue
        st = await probe_disk(m.repo, m.mode, settings, local_path=m.local_path)
        if not st.on_disk:
            continue
        menu[key] = _menu_entry_from_recipe(
            m,
            on_disk=True,
            total_bytes=st.total_bytes,
            # Only list the hosts that actually hold the directory.
            per_host=[{"host": r.host, "size_bytes": r.size_bytes} for r in st.per_host if r.on_disk],
        )

    return menu