From 75fd0846b4e9251952b8b758e116f93407cfa8e0 Mon Sep 17 00:00:00 2001
From: Grant
Date: Tue, 12 May 2026 11:30:47 -0500
Subject: [PATCH] v0.2.3 - Per-model Advanced settings + catalog-add for downloaded models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Backend:
- overrides.py: read/write /data/models-overrides.yaml (knobs + custom entries)
- apply_knobs_to_args(): strip matching flags from bundled vllm_args and
  append knob values, so knob changes properly override bundled defaults
- extract_knobs_from_args(): seed UI knob values from bundled args so the
  Advanced dialog has correct starting state
- models.py: load_catalog merges overrides on top of bundled yaml
- GET /api/models returns effective_knobs per model
- PUT /api/models/{key}/knobs persists knob changes
- POST /api/models adds a custom catalog entry
- DELETE /api/models/{key} removes a custom entry (bundled models cannot be
  deleted)
- swap_manager.reload_catalog() is called after each mutation so swaps see
  the latest catalog

Frontend:
- New 'Advanced' button on every card opens a modal dialog: max-model-len
  input, gpu-memory-utilization slider, three optimization checkboxes
  (fastsafetensors, prefix caching, FP8 KV cache). Save persists; Cancel
  discards. Custom models also have a Delete button.
- After a successful download, automatically open the 'Add to catalog' dialog
  pre-filled with the repo, with the same knob defaults — user just enters
  key, display name, and clicks Save.
- Custom catalog entries are tagged with a blue 'custom' pill on the card.

Package: bump 0.2.3:0; main.ts sets MODELS_OVERRIDES=/data/models-overrides.yaml
so overrides persist on the StartOS volume.
--- image/app/models.py | 45 ++++++++- image/app/overrides.py | 145 +++++++++++++++++++++++++++++ image/app/server.py | 64 ++++++++++++- image/app/static/app.js | 138 +++++++++++++++++++++++++-- image/app/static/index.html | 48 ++++++++++ image/app/static/style.css | 57 ++++++++++++ package/startos/main.ts | 1 + package/startos/versions/v0_1_0.ts | 4 +- 8 files changed, 490 insertions(+), 12 deletions(-) create mode 100644 image/app/overrides.py diff --git a/image/app/models.py b/image/app/models.py index 40e56ff..061596b 100644 --- a/image/app/models.py +++ b/image/app/models.py @@ -1,8 +1,10 @@ from __future__ import annotations -from typing import Literal +from typing import Literal, Optional import yaml from pydantic import BaseModel, Field +from .overrides import apply_knobs_to_args, load_overrides + class ModelDef(BaseModel): display_name: str @@ -13,6 +15,8 @@ class ModelDef(BaseModel): expected_ready_seconds: int = 300 vllm_args: list[str] = Field(default_factory=list) description: str | None = None + knobs: dict | None = None # user-customized; merged at launch time + custom: bool = False # True if this came from /data overrides class Defaults(BaseModel): @@ -25,17 +29,52 @@ class Catalog(BaseModel): models: dict[str, ModelDef] +def _merge_overrides(catalog: Catalog) -> Catalog: + """Apply user overrides + custom entries from /data/models-overrides.yaml.""" + ov = load_overrides() + knobs_by_key = ov.get("knobs") or {} + custom_entries = ov.get("custom") or [] + + new_models: dict[str, ModelDef] = {} + for key, m in catalog.models.items(): + k = knobs_by_key.get(key) + new_models[key] = m.model_copy(update={"knobs": k}) if k else m + + for entry in custom_entries: + key = entry.get("key") + if not key: + continue + defaults_dump = { + "display_name": entry.get("display_name", key), + "repo": entry["repo"], + "size_gb": float(entry.get("size_gb", 0)), + "mode": entry.get("mode", "solo"), + "capabilities": entry.get("capabilities") or [], + 
"expected_ready_seconds": int(entry.get("expected_ready_seconds", 300)), + "vllm_args": entry.get("vllm_args") or [], + "description": entry.get("description"), + "knobs": entry.get("knobs"), + "custom": True, + } + new_models[key] = ModelDef.model_validate(defaults_dump) + + return Catalog(defaults=catalog.defaults, models=new_models) + + def load_catalog(path: str) -> Catalog: with open(path) as f: data = yaml.safe_load(f) - return Catalog.model_validate(data) + bundled = Catalog.model_validate(data) + return _merge_overrides(bundled) def build_launch_command(key: str, model: ModelDef, defaults: Defaults) -> str: """Return the shell command to launch `model` on Spark 1. + User knobs (if any) override matching flags in the bundled vllm_args. Assumes cwd will be `~/spark-vllm-docker` (we cd in the SSH wrapper). """ solo = "--solo " if model.mode == "solo" else "" - args = [f"--port={defaults.port}", f"--host={defaults.host}", *model.vllm_args] + base_args = apply_knobs_to_args(list(model.vllm_args), model.knobs) + args = [f"--port={defaults.port}", f"--host={defaults.host}", *base_args] return f"./launch-cluster.sh {solo}-d exec vllm serve {model.repo} {' '.join(args)}" diff --git a/image/app/overrides.py b/image/app/overrides.py new file mode 100644 index 0000000..5ef5b2e --- /dev/null +++ b/image/app/overrides.py @@ -0,0 +1,145 @@ +"""User overrides + custom catalog entries, persisted on the package's main volume. + +File: /data/models-overrides.yaml (only when something has been customized). +Survives package updates because /data is the StartOS volume. 
"""User overrides + custom catalog entries, persisted on the package's main volume.

File: /data/models-overrides.yaml (only when something has been customized).
Survives package updates because /data is the StartOS volume.

Shape:
  knobs:
    qwen36:
      max_model_len: 65536
      gpu_memory_utilization: 0.85
      fastsafetensors: true
      prefix_caching: true
      kv_cache_dtype: fp8   # one of: 'fp8' | 'auto' | null
  custom:
    - key: my-new-model
      display_name: My New Model (from download)
      repo: my-org/my-model
      size_gb: 20
      mode: solo
      description: null
      capabilities: []
      knobs:
        max_model_len: 32768
        gpu_memory_utilization: 0.85
        fastsafetensors: true
        prefix_caching: true
        kv_cache_dtype: fp8
"""
from __future__ import annotations

import os
import re
from pathlib import Path
from typing import Any, Optional

# NOTE: PyYAML is imported inside load_overrides()/save_overrides() rather than
# here, so the pure argument helpers at the bottom of this module can be
# imported (and unit-tested) on a machine that only has the standard library.


# Knobs whose value is passed straight through as ``<flag>=<value>``.
KNOWN_FLAG_MAP = {
    "max_model_len": "--max-model-len",
    "gpu_memory_utilization": "--gpu-memory-utilization",
}

# Every knob-controlled flag that is followed by a value. Used to recognise
# the two-token spelling (["--flag", "value"]) in addition to "--flag=value".
_VALUE_FLAGS = frozenset(KNOWN_FLAG_MAP.values()) | {"--load-format", "--kv-cache-dtype"}

# A flag name ends at the first "=" or whitespace character.
_FLAG_SEP_RE = re.compile(r"[=\s]")

# Knob values end up inside a shell command line, so only plain tokens are
# accepted (digits, letters and a few punctuation marks used by numbers/dtypes).
_SAFE_VALUE_RE = re.compile(r"[A-Za-z0-9_.+-]+")


def _path() -> str:
    """Return the overrides file path (``MODELS_OVERRIDES`` env var wins)."""
    # Container: /data/models-overrides.yaml. Local dev: cwd or env override.
    return os.environ.get("MODELS_OVERRIDES", "/data/models-overrides.yaml")


def load_overrides() -> dict:
    """Load the overrides file and return it as ``{"knobs": {...}, "custom": [...]}``.

    A missing or empty file yields empty sections. Sections that are absent,
    ``null`` or of the wrong type are replaced by an empty dict/list so callers
    can always do ``data["knobs"].pop(...)`` and iterate ``data["custom"]``
    (``dict.setdefault`` alone does not repair an explicit ``knobs: null``).
    """
    import yaml

    try:
        with open(_path(), encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        data = None
    if not isinstance(data, dict):
        data = {}
    if not isinstance(data.get("knobs"), dict):
        data["knobs"] = {}
    if not isinstance(data.get("custom"), list):
        data["custom"] = []
    return data


def save_overrides(data: dict) -> None:
    """Write ``data`` to the overrides file, creating parent directories.

    The YAML is written to a sibling ``*.tmp`` file and then renamed over the
    target with ``os.replace`` so a crash mid-write cannot leave a truncated
    overrides file behind.
    """
    import yaml

    target = Path(_path())
    target.parent.mkdir(parents=True, exist_ok=True)
    tmp = target.with_name(target.name + ".tmp")
    with open(tmp, "w", encoding="utf-8") as f:
        yaml.safe_dump(data, f, sort_keys=False)
    os.replace(tmp, target)


def set_knobs(key: str, knobs: dict) -> dict:
    """Persist ``knobs`` for model ``key`` and return the updated overrides.

    An empty/falsy ``knobs`` clears the stored value. For a custom entry the
    knobs are stored on the entry itself (``custom[i]["knobs"]``), because that
    is where the catalog merge reads them from; for any other key they go into
    the top-level ``knobs`` map.
    """
    data = load_overrides()
    for entry in data["custom"]:
        if isinstance(entry, dict) and entry.get("key") == key:
            entry["knobs"] = knobs or None
            # Drop any stale top-level value left by an earlier version.
            data["knobs"].pop(key, None)
            break
    else:
        if knobs:
            data["knobs"][key] = knobs
        else:
            data["knobs"].pop(key, None)
    save_overrides(data)
    return data


def add_custom(entry: dict) -> dict:
    """Add (or replace, when the key already exists) a custom catalog entry."""
    data = load_overrides()
    # Replace if same key already exists
    data["custom"] = [c for c in data["custom"] if c.get("key") != entry["key"]]
    data["custom"].append(entry)
    save_overrides(data)
    return data


def delete_custom(key: str) -> dict:
    """Remove the custom entry ``key`` and any top-level knobs stored for it."""
    data = load_overrides()
    data["custom"] = [c for c in data["custom"] if c.get("key") != key]
    data["knobs"].pop(key, None)
    save_overrides(data)
    return data


def _split_flag(arg: str) -> tuple[str, Optional[str]]:
    """Split ``"--flag=value"`` / ``"--flag value"`` into ``(flag, value)``.

    A bare ``"--flag"`` returns ``(flag, None)``.
    """
    parts = _FLAG_SEP_RE.split(arg, maxsplit=1)
    if len(parts) == 1:
        return arg, None
    return parts[0], parts[1].strip()


def _safe_value(knob: str, value: Any) -> str:
    """Return ``str(value)`` or raise ``ValueError`` if it is not a plain token."""
    text = str(value)
    if not _SAFE_VALUE_RE.fullmatch(text):
        raise ValueError(f"knob {knob!r} has an unsafe value: {value!r}")
    return text


def _parse_number(kind: type, text: Optional[str]) -> Any:
    """Return ``kind(text)``, or ``None`` when ``text`` is missing or malformed."""
    if text is None:
        return None
    try:
        return kind(text)
    except ValueError:
        return None


def apply_knobs_to_args(vllm_args: list[str], knobs: Optional[dict]) -> list[str]:
    """Return a new vllm_args list with knob values overriding matching flags.

    Every flag controlled by a knob that is *present* in ``knobs`` (even with a
    ``None``/false value) is removed from ``vllm_args`` in all of its
    spellings: ``--flag=value``, ``"--flag value"`` as one string, and the
    two-token form ``["--flag", "value"]``. The knob-derived flags are then
    appended. ``vllm_args`` itself is never modified.

    Raises:
        ValueError: if a knob value contains characters outside
            ``[A-Za-z0-9_.+-]``; values are later joined into a shell command.
    """
    if not knobs:
        return list(vllm_args)

    # flag name -> whether the flag is followed by a value
    owned: dict[str, bool] = {
        flag: True for k, flag in KNOWN_FLAG_MAP.items() if k in knobs
    }
    # opt-in flag toggles
    if "fastsafetensors" in knobs:
        owned["--load-format"] = True
    if "prefix_caching" in knobs:
        owned["--enable-prefix-caching"] = False
    if "kv_cache_dtype" in knobs:
        owned["--kv-cache-dtype"] = True

    kept: list[str] = []
    drop_next = False
    for arg in vllm_args:
        if drop_next:
            drop_next = False
            if not arg.startswith("-"):
                # Value belonging to the bare flag removed just before it.
                continue
        name = _split_flag(arg)[0]
        if name in owned:
            # Only a bare value-taking flag has its value in the next token.
            drop_next = owned[name] and arg == name
            continue
        kept.append(arg)

    for k, flag in KNOWN_FLAG_MAP.items():
        value = knobs.get(k)
        if value is not None:
            kept.append(f"{flag}={_safe_value(k, value)}")

    if knobs.get("fastsafetensors"):
        kept.append("--load-format=fastsafetensors")
    if knobs.get("prefix_caching"):
        kept.append("--enable-prefix-caching")
    kvd = knobs.get("kv_cache_dtype")
    if kvd and kvd != "auto":
        kept.append(f"--kv-cache-dtype={_safe_value('kv_cache_dtype', kvd)}")

    return kept


def extract_knobs_from_args(vllm_args: list[str]) -> dict:
    """Reverse: read default knob values from a model's bundled vllm_args, so the UI
    has a starting point even before the user has saved any overrides.

    Understands the same three flag spellings as ``apply_knobs_to_args``.
    Numeric values that do not parse are skipped; a repeated flag's last
    occurrence wins.
    """
    knobs: dict[str, Any] = {}
    args = list(vllm_args)
    i = 0
    while i < len(args):
        name, value = _split_flag(args[i])
        i += 1
        if (
            value is None
            and name in _VALUE_FLAGS
            and i < len(args)
            and not args[i].startswith("-")
        ):
            # Two-token spelling: the value is the following list element.
            value = args[i]
            i += 1

        if name == "--max-model-len":
            parsed = _parse_number(int, value)
            if parsed is not None:
                knobs["max_model_len"] = parsed
        elif name == "--gpu-memory-utilization":
            parsed = _parse_number(float, value)
            if parsed is not None:
                knobs["gpu_memory_utilization"] = parsed
        elif name == "--load-format":
            if value == "fastsafetensors":
                knobs["fastsafetensors"] = True
        elif name == "--enable-prefix-caching":
            knobs["prefix_caching"] = True
        elif name == "--kv-cache-dtype":
            if value is not None:
                knobs["kv_cache_dtype"] = value
    return knobs
a/image/app/server.py b/image/app/server.py index bb496fe..d73b5e5 100644 --- a/image/app/server.py +++ b/image/app/server.py @@ -13,6 +13,7 @@ from .config import Settings from .download import DownloadManager from .health import check_magpie, check_parakeet, check_vllm from .models import load_catalog +from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs from .services import docker_state, run_action, services_from_settings from .ssh import ssh_run from .swap import SwapManager @@ -46,14 +47,75 @@ async def get_config() -> dict: } +def _reload_catalog() -> None: + global catalog + catalog = load_catalog(settings.models_yaml) + swap_manager.reload_catalog(catalog) + + @app.get("/api/models") async def get_models() -> dict: + out_models: dict[str, dict] = {} + for key, m in catalog.models.items(): + d = m.model_dump() + # Always include effective knobs for the UI (defaults from base args + any overrides) + d["effective_knobs"] = {**extract_knobs_from_args(m.vllm_args), **(m.knobs or {})} + out_models[key] = d return { "defaults": catalog.defaults.model_dump(), - "models": {k: v.model_dump() for k, v in catalog.models.items()}, + "models": out_models, } +class KnobsBody(BaseModel): + knobs: dict + + +@app.put("/api/models/{key}/knobs") +async def put_model_knobs(key: str, body: KnobsBody) -> dict: + if key not in catalog.models: + raise HTTPException(404, f"unknown model: {key}") + # Strip empty/None values + clean = {k: v for k, v in body.knobs.items() if v not in (None, "")} + set_knobs(key, clean) + _reload_catalog() + return {"ok": True, "key": key, "knobs": clean} + + +class CustomModelBody(BaseModel): + key: str + display_name: str + repo: str + size_gb: float = 0 + mode: Literal["solo", "cluster"] = "solo" + description: str | None = None + capabilities: list[str] = [] + vllm_args: list[str] = [] + knobs: dict | None = None + + +@app.post("/api/models") +async def post_model(body: CustomModelBody) -> dict: + if 
not body.key or not body.key.replace("-", "").replace("_", "").isalnum(): + raise HTTPException(400, "key must be alphanumeric/-/_ only") + if body.key in catalog.models and not catalog.models[body.key].custom: + raise HTTPException(409, f"'{body.key}' is a bundled model — pick a different key") + add_custom(body.model_dump()) + _reload_catalog() + return {"ok": True, "key": body.key} + + +@app.delete("/api/models/{key}") +async def del_model(key: str) -> dict: + if key not in catalog.models: + raise HTTPException(404, f"unknown model: {key}") + if not catalog.models[key].custom: + raise HTTPException(400, "cannot delete a bundled model; you may override its knobs instead") + delete_custom(key) + _reload_catalog() + return {"ok": True, "key": key} + + @app.get("/api/services") async def get_services() -> dict: """Lifecycle state of always-on support services (Parakeet, Magpie, …). diff --git a/image/app/static/app.js b/image/app/static/app.js index 8b6adba..125a8a1 100644 --- a/image/app/static/app.js +++ b/image/app/static/app.js @@ -53,24 +53,32 @@ function renderCards() { const desc = m.description ? `
${escapeHtml(m.description)}
` : ''; + const customPill = m.custom ? `custom` : ''; card.innerHTML = `
${escapeHtml(m.display_name)}
${m.mode} ${m.size_gb} GB + ${customPill} ${(m.capabilities || []).map(c => `${escapeHtml(c)}`).join('')}
${desc}
${escapeHtml(m.repo)}
- +
+ + +
`; root.appendChild(card); } - for (const btn of $$('.card .btn')) { - btn.addEventListener('click', () => triggerSwap(btn.dataset.key)); + for (const btn of root.querySelectorAll('[data-swap-key]')) { + btn.addEventListener('click', () => triggerSwap(btn.dataset.swapKey)); + } + for (const btn of root.querySelectorAll('[data-adv-key]')) { + btn.addEventListener('click', () => openAdvanced(btn.dataset.advKey)); } } @@ -544,6 +552,8 @@ async function startDownload() { alert('Enter a HuggingFace repo in the form "org/name", e.g. RedHatAI/Qwen3.6-35B-A3B-NVFP4'); return; } + dlState.last_repo = repo; + dlState.last_mode = mode; try { const r = await fetchJSON('/api/download', { method: 'POST', @@ -623,12 +633,126 @@ function handleDownloadDone(d) { el('#dl-phase').textContent = 'Failed'; } else { el('#dl-title').textContent = 'Done'; - el('#dl-phase').textContent = 'Done ✓ — you can now add this model to the catalog and swap to it.'; + el('#dl-phase').textContent = 'Done ✓'; el('#dl-progress-fill').style.width = '100%'; + // Offer to add to catalog + const repo = dlState.last_repo; + const mode = dlState.last_mode; + if (repo) { + setTimeout(() => openCatalogDialog(repo, mode), 600); + } } dlState.job_id = null; } +// ===================== Advanced / Add to catalog ===================== + +function openAdvanced(key) { + const m = state.models[key]; + if (!m) return; + const dlg = el('#advanced-dialog'); + el('#adv-title').textContent = `Advanced — ${m.display_name}`; + const k = m.effective_knobs || {}; + el('#adv-mml').value = k.max_model_len ?? ''; + el('#adv-gmu').value = k.gpu_memory_utilization ?? 
0.85; + el('#adv-gmu-out').value = parseFloat(el('#adv-gmu').value).toFixed(2); + el('#adv-fst').checked = !!k.fastsafetensors; + el('#adv-pcache').checked = !!k.prefix_caching; + el('#adv-fp8').checked = k.kv_cache_dtype === 'fp8'; + const del = el('#adv-delete'); + del.classList.toggle('hidden', !m.custom); + del.onclick = async () => { + if (!confirm(`Delete "${m.display_name}" from the catalog? The model weights on disk are NOT deleted.`)) return; + try { + await fetchJSON(`/api/models/${encodeURIComponent(key)}`, { method: 'DELETE' }); + dlg.close(); + await loadModels(); + pollStatus(); + } catch (e) { alert('Delete failed: ' + e.message); } + }; + const form = el('#advanced-form'); + form.onsubmit = async (e) => { + e.preventDefault(); + const knobs = {}; + const mml = parseInt(el('#adv-mml').value, 10); + if (Number.isFinite(mml) && mml > 0) knobs.max_model_len = mml; + const gmu = parseFloat(el('#adv-gmu').value); + if (Number.isFinite(gmu)) knobs.gpu_memory_utilization = gmu; + if (el('#adv-fst').checked) knobs.fastsafetensors = true; else knobs.fastsafetensors = false; + if (el('#adv-pcache').checked) knobs.prefix_caching = true; else knobs.prefix_caching = false; + knobs.kv_cache_dtype = el('#adv-fp8').checked ? 
'fp8' : 'auto'; + try { + await fetchJSON(`/api/models/${encodeURIComponent(key)}/knobs`, { + method: 'PUT', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ knobs }), + }); + dlg.close(); + await loadModels(); + pollStatus(); + } catch (e) { alert('Save failed: ' + e.message); } + }; + dlg.showModal(); +} + +function openCatalogDialog(repo, mode) { + const dlg = el('#catalog-dialog'); + const key = repo.split('/').pop().toLowerCase().replace(/[^a-z0-9_-]/g, '-'); + el('#cd-key').value = key; + el('#cd-name').value = repo.split('/').pop(); + el('#cd-repo').value = repo; + el('#cd-size').value = ''; + el('#cd-mode').value = mode || 'solo'; + el('#cd-desc').value = ''; + el('#cd-mml').value = 32768; + el('#cd-gmu').value = 0.85; + el('#cd-gmu-out').value = '0.85'; + el('#cd-fst').checked = true; + el('#cd-pcache').checked = true; + el('#cd-fp8').checked = true; + dlg.showModal(); +} + +function setupCatalogDialog() { + el('#cd-cancel').addEventListener('click', () => el('#catalog-dialog').close()); + el('#cd-gmu').addEventListener('input', (e) => { el('#cd-gmu-out').value = parseFloat(e.target.value).toFixed(2); }); + el('#catalog-form').addEventListener('submit', async (e) => { + e.preventDefault(); + const body = { + key: el('#cd-key').value.trim(), + display_name: el('#cd-name').value.trim(), + repo: el('#cd-repo').value.trim(), + size_gb: parseFloat(el('#cd-size').value) || 0, + mode: el('#cd-mode').value, + description: el('#cd-desc').value.trim() || null, + vllm_args: [], + knobs: { + max_model_len: parseInt(el('#cd-mml').value, 10) || 32768, + gpu_memory_utilization: parseFloat(el('#cd-gmu').value), + fastsafetensors: el('#cd-fst').checked, + prefix_caching: el('#cd-pcache').checked, + kv_cache_dtype: el('#cd-fp8').checked ? 
'fp8' : 'auto', + }, + }; + try { + await fetchJSON('/api/models', { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify(body), + }); + el('#catalog-dialog').close(); + closeDownloadPanel(); + await loadModels(); + pollStatus(); + } catch (e) { alert('Add to catalog failed: ' + e.message); } + }); +} + +function setupAdvancedDialog() { + el('#adv-cancel').addEventListener('click', () => el('#advanced-dialog').close()); + el('#adv-gmu').addEventListener('input', (e) => { el('#adv-gmu-out').value = parseFloat(e.target.value).toFixed(2); }); +} + // ===================== updates (spark-vllm-docker) ===================== const updState = { @@ -769,6 +893,8 @@ async function init() { list.open = !list.open; }); el('#ub-apply').addEventListener('click', applyUpdate); + setupCatalogDialog(); + setupAdvancedDialog(); await loadModels(); await pollStatus(); await renderServices(); diff --git a/image/app/static/index.html b/image/app/static/index.html index 33ed6bd..ae07bfd 100644 --- a/image/app/static/index.html +++ b/image/app/static/index.html @@ -74,6 +74,54 @@ + + + + + + + +