v0.26.0:0 - disk-driven model menu (scan sparks; recipes; needs-setup)

The dashboard menu is now the set of models actually downloaded on the Sparks, not a hard-coded catalog. models.yaml + overrides are reframed as launch recipes matched to an on-disk model by repo; an on-disk model with no recipe is flagged needs_setup and its launch settings are inferred from its config.json for a one-time operator confirmation (discovery.py). - delete now removes weights AND the menu card (delete_from_disk sweeps all hosts; the delete endpoint resolves keys via the live menu) - new GET /api/models/suggest; /api/models returns the menu + a recipes list (download autocomplete); GET /api/models/disk-status removed - dropped the two legacy Qwen recipes (235B FP8, 2.5 72B) - tests: +test_discovery.py (cache parsing, infer_recipe, build_menu merge)
2026-06-18 11:09:56 -05:00
parent c0b35184ba
commit df9f244eae
14 changed files with 795 additions and 238 deletions
@@ -0,0 +1,190 @@
+"""Disk-driven menu helpers: cache-dir parsing + launch-recipe inference.
+
+All offline — pure functions over a fake cache listing and fake config.json
+dicts. The SSH scan, the menu merge, and the suggest endpoint that wire these
+together are exercised by hand against the live cluster (mock-heavy unit tests of
+those would test the mocks).
+"""
+import asyncio
+
+from app import discovery
+from app.config import Settings
+from app.disk import DiskStatus, cache_dirname_to_repo, parse_cache_listing
+from app.discovery import repo_to_key, infer_recipe, _detect_family
+from app.models import load_catalog
+
+
+# ---- cache dirname <-> repo ----
+
+def test_cache_dirname_to_repo_roundtrip():
+    assert cache_dirname_to_repo("models--RedHatAI--Qwen3.6-35B-A3B-NVFP4") == "RedHatAI/Qwen3.6-35B-A3B-NVFP4"
+
+
+def test_cache_dirname_name_with_double_dash():
+    # The org is the first segment; everything after is the name (single '/').
+    assert cache_dirname_to_repo("models--org--weird--name") == "org/weird--name"
+
+
+def test_cache_dirname_rejects_non_model_dirs():
+    assert cache_dirname_to_repo("datasets--foo--bar") is None
+    assert cache_dirname_to_repo("models--onlyorg") is None
+    assert cache_dirname_to_repo("random") is None
+
+
+# ---- parse_cache_listing ----
+
+def test_parse_cache_listing_complete_and_incomplete():
+    out = (
+        "20000000000|1|models--RedHatAI--Qwen3.6-35B-A3B-NVFP4\n"
+        "5000000000|0|models--some--half-downloaded\n"
+        "\n"
+        "garbage line with no pipes\n"
+        "123|1|not-a-model-dir\n"
+    )
+    items = parse_cache_listing(out)
+    assert items == [
+        ("RedHatAI/Qwen3.6-35B-A3B-NVFP4", 20000000000, True),
+        ("some/half-downloaded", 5000000000, False),
+    ]
+
+
+def test_parse_cache_listing_bad_size_defaults_zero():
+    items = parse_cache_listing("notanumber|1|models--a--b")
+    assert items == [("a/b", 0, True)]
+
+
+# ---- repo_to_key ----
+
+def test_repo_to_key_is_url_safe_and_stable():
+    assert repo_to_key("RedHatAI/Qwen3.6-35B-A3B-NVFP4") == "redhatai-qwen3-6-35b-a3b-nvfp4"
+    # Idempotent enough to be a stable id across calls.
+    assert repo_to_key("nvidia/Gemma-4-26B-A4B-NVFP4") == "nvidia-gemma-4-26b-a4b-nvfp4"
+
+
+# ---- family detection ----
+
+def test_detect_qwen3_moe():
+    cfg = {"architectures": ["Qwen3MoeForCausalLM"], "model_type": "qwen3_moe", "num_experts": 128}
+    label, flags, caps = _detect_family(cfg)
+    assert "--reasoning-parser=qwen3" in flags
+    assert "--moe_backend=flashinfer_cutlass" in flags
+    assert "reasoning" in caps
+    assert "MoE" in label
+
+
+def test_detect_gemma_moe_uses_marlin():
+    cfg = {"architectures": ["Gemma4MoeForConditionalGeneration"], "model_type": "gemma4_moe", "num_local_experts": 8}
+    label, flags, caps = _detect_family(cfg)
+    assert "--reasoning-parser=gemma4" in flags
+    assert "--tool-call-parser=gemma4" in flags
+    assert "--moe_backend=marlin" in flags        # NOT flashinfer_cutlass — GB10 footgun
+    assert "vision" in caps                         # ConditionalGeneration => multimodal
+    assert "tools" in caps
+
+
+def test_detect_generic_has_no_family_flags():
+    label, flags, caps = _detect_family({"architectures": ["LlamaForCausalLM"], "model_type": "llama"})
+    assert flags == []
+    assert label == "Generic"
+
+
+def test_detect_vision_from_config_keys():
+    _, _, caps = _detect_family({"model_type": "qwen3", "vision_config": {"x": 1}})
+    assert "vision" in caps
+
+
+# ---- infer_recipe (the prefill the setup form receives) ----
+
+def test_infer_recipe_solo_small_model():
+    cfg = {"architectures": ["Qwen3ForCausalLM"], "model_type": "qwen3"}
+    rec = infer_recipe("RedHatAI/Qwen3.6-35B-A3B-NVFP4", cfg, total_bytes=20_000_000_000, on_host_count=1)
+    assert rec["mode"] == "solo"
+    assert rec["key"] == "redhatai-qwen3-6-35b-a3b-nvfp4"
+    assert rec["repo"] == "RedHatAI/Qwen3.6-35B-A3B-NVFP4"
+    assert "--reasoning-parser=qwen3" in rec["vllm_args"]
+    assert "-tp=2" not in rec["vllm_args"]
+    assert rec["knobs"]["kv_cache_dtype"] == "fp8"
+
+
+def test_infer_recipe_cluster_when_on_both_hosts():
+    rec = infer_recipe("org/big", {}, total_bytes=10_000_000_000, on_host_count=2)
+    assert rec["mode"] == "cluster"
+    assert "-tp=2" in rec["vllm_args"]
+    assert "--distributed-executor-backend=ray" in rec["vllm_args"]
+    assert rec["knobs"]["gpu_memory_utilization"] == 0.7
+
+
+def test_infer_recipe_cluster_when_too_big_for_one_spark():
+    rec = infer_recipe("org/huge", {}, total_bytes=200_000_000_000, on_host_count=1)
+    assert rec["mode"] == "cluster"
+
+
+# ---- build_menu merge (disk scan ∪ recipes) ----
+
+def _both_spark_settings(monkeypatch) -> Settings:
+    for k in ("SPARK1_HOST", "SPARK1_USER", "SPARK2_HOST", "SPARK2_USER"):
+        monkeypatch.delenv(k, raising=False)
+    monkeypatch.setenv("SPARK1_HOST", "1.1.1.1")
+    monkeypatch.setenv("SPARK1_USER", "u")
+    monkeypatch.setenv("SPARK2_HOST", "2.2.2.2")
+    monkeypatch.setenv("SPARK2_USER", "u")
+    return Settings.from_env()
+
+
+def test_build_menu_merges_recipe_discovered_and_hides_incomplete(monkeypatch):
+    cat = load_catalog("models.yaml")  # bundled recipes incl. qwen36 + gemma4
+    settings = _both_spark_settings(monkeypatch)
+
+    async def fake_list(host, user, s):
+        if host == "1.1.1.1":
+            return [
+                ("RedHatAI/Qwen3.6-35B-A3B-NVFP4", 20_000_000_000, True),  # recipe match
+                ("someorg/mystery-7B", 7_000_000_000, True),               # needs setup
+                ("broken/half", 1_000_000_000, False),                     # incomplete -> hidden
+            ]
+        return []  # spark2 empty
+
+    async def fake_probe(repo, mode, s, *, local_path=None):
+        return DiskStatus(repo=local_path or repo, on_disk=False, total_bytes=0, per_host=[])
+
+    monkeypatch.setattr(discovery, "list_cached_models", fake_list)
+    monkeypatch.setattr(discovery, "probe_disk", fake_probe)
+
+    menu = asyncio.run(discovery.build_menu(settings, cat))
+
+    # Recipe-matched: keyed by recipe key, ready (not needs_setup), real size.
+    assert "qwen36" in menu
+    assert menu["qwen36"]["needs_setup"] is False
+    assert menu["qwen36"]["total_bytes"] == 20_000_000_000
+
+    # Discovered-without-recipe: slug key, needs_setup.
+    slug = repo_to_key("someorg/mystery-7B")
+    assert menu[slug]["needs_setup"] is True
+
+    # Incomplete download is filtered out entirely.
+    assert all("half" not in k for k in menu)
+
+    # A recipe with nothing on disk (e.g. gemma4) must NOT appear — the menu is the disk.
+    assert "gemma4" not in menu
+
+
+def test_build_menu_sums_cluster_model_across_both_sparks(monkeypatch):
+    cat = load_catalog("models.yaml")
+    settings = _both_spark_settings(monkeypatch)
+
+    async def fake_list(host, user, s):
+        # Same repo present on BOTH Sparks — one card, sizes summed (not two cards).
+        return [("org/sharded-235B", 70_000_000_000, True)]
+
+    async def fake_probe(repo, mode, s, *, local_path=None):
+        return DiskStatus(repo=repo, on_disk=False, total_bytes=0, per_host=[])
+
+    monkeypatch.setattr(discovery, "list_cached_models", fake_list)
+    monkeypatch.setattr(discovery, "probe_disk", fake_probe)
+
+    menu = asyncio.run(discovery.build_menu(settings, cat))
+    key = repo_to_key("org/sharded-235B")
+    assert list(menu) == [key]                       # exactly one card
+    assert menu[key]["total_bytes"] == 140_000_000_000  # summed across both hosts
+    assert len(menu[key]["per_host"]) == 2
+    assert menu[key]["mode"] == "cluster"            # present on 2 hosts -> cluster