diff --git a/image/app/models.py b/image/app/models.py
index 40e56ff..061596b 100644
--- a/image/app/models.py
+++ b/image/app/models.py
@@ -1,8 +1,10 @@
from __future__ import annotations
-from typing import Literal
+from typing import Literal, Optional
import yaml
from pydantic import BaseModel, Field
+from .overrides import apply_knobs_to_args, load_overrides
+
class ModelDef(BaseModel):
display_name: str
@@ -13,6 +15,8 @@ class ModelDef(BaseModel):
expected_ready_seconds: int = 300
vllm_args: list[str] = Field(default_factory=list)
description: str | None = None
+ knobs: dict | None = None # user-customized; merged at launch time
+ custom: bool = False # True if this came from /data overrides
class Defaults(BaseModel):
@@ -25,17 +29,52 @@ class Catalog(BaseModel):
models: dict[str, ModelDef]
+def _merge_overrides(catalog: Catalog) -> Catalog:
+    """Return a copy of ``catalog`` with the user's overrides file applied.
+
+    Bundled models receive the knobs saved for their key in the top-level
+    ``knobs`` map. Every ``custom`` entry that has a key is turned into a
+    ``ModelDef`` flagged ``custom=True`` and stored under that key, replacing
+    a bundled model with the same key.
+
+    A custom model's knobs are the ones stored on its entry, overlaid with any
+    saved for the same key in the top-level ``knobs`` map. ``set_knobs`` writes
+    to that map for every model, so reading only the entry would silently
+    discard later edits made to a custom model.
+
+    Raises:
+        KeyError: a custom entry has a key but no ``repo``.
+    """
+    ov = load_overrides()
+    knobs_by_key = ov.get("knobs") or {}
+    custom_entries = ov.get("custom") or []
+
+    new_models: dict[str, ModelDef] = {}
+    for key, m in catalog.models.items():
+        k = knobs_by_key.get(key)
+        # Models without saved knobs are carried over as the same object.
+        new_models[key] = m.model_copy(update={"knobs": k}) if k else m
+
+    for entry in custom_entries:
+        key = entry.get("key")
+        if not key:
+            continue
+        # Entry knobs are the creation-time values; the top-level map holds
+        # whatever was saved afterwards and therefore wins.
+        merged_knobs = {**(entry.get("knobs") or {}), **(knobs_by_key.get(key) or {})}
+        defaults_dump = {
+            "display_name": entry.get("display_name", key),
+            "repo": entry["repo"],
+            "size_gb": float(entry.get("size_gb", 0)),
+            "mode": entry.get("mode", "solo"),
+            "capabilities": entry.get("capabilities") or [],
+            "expected_ready_seconds": int(entry.get("expected_ready_seconds", 300)),
+            "vllm_args": entry.get("vllm_args") or [],
+            "description": entry.get("description"),
+            "knobs": merged_knobs or None,
+            "custom": True,
+        }
+        new_models[key] = ModelDef.model_validate(defaults_dump)
+
+    return Catalog(defaults=catalog.defaults, models=new_models)
+
+
def load_catalog(path: str) -> Catalog:
with open(path) as f:
data = yaml.safe_load(f)
- return Catalog.model_validate(data)
+ bundled = Catalog.model_validate(data)
+ return _merge_overrides(bundled)
def build_launch_command(key: str, model: ModelDef, defaults: Defaults) -> str:
"""Return the shell command to launch `model` on Spark 1.
+ User knobs (if any) override matching flags in the bundled vllm_args.
Assumes cwd will be `~/spark-vllm-docker` (we cd in the SSH wrapper).
"""
solo = "--solo " if model.mode == "solo" else ""
- args = [f"--port={defaults.port}", f"--host={defaults.host}", *model.vllm_args]
+ base_args = apply_knobs_to_args(list(model.vllm_args), model.knobs)
+ args = [f"--port={defaults.port}", f"--host={defaults.host}", *base_args]
return f"./launch-cluster.sh {solo}-d exec vllm serve {model.repo} {' '.join(args)}"
diff --git a/image/app/overrides.py b/image/app/overrides.py
new file mode 100644
index 0000000..5ef5b2e
--- /dev/null
+++ b/image/app/overrides.py
@@ -0,0 +1,145 @@
+"""User overrides + custom catalog entries, persisted on the package's main volume.
+
+File: /data/models-overrides.yaml (only when something has been customized).
+Survives package updates because /data is the StartOS volume.
+
+Shape:
+ knobs:
+ qwen36:
+ max_model_len: 65536
+ gpu_memory_utilization: 0.85
+ fastsafetensors: true
+ prefix_caching: true
+ kv_cache_dtype: fp8 # one of: 'fp8' | 'auto' | null
+ custom:
+ - key: my-new-model
+ display_name: My New Model (from download)
+ repo: my-org/my-model
+ size_gb: 20
+ mode: solo
+ description: null
+ capabilities: []
+ knobs:
+ max_model_len: 32768
+ gpu_memory_utilization: 0.85
+ fastsafetensors: true
+ prefix_caching: true
+ kv_cache_dtype: fp8
+"""
+from __future__ import annotations
+import os
+from pathlib import Path
+from typing import Any, Optional
+import yaml
+
+
+# Knob name -> vLLM flag, for the knobs that carry a value.
+# `apply_knobs_to_args` renders each one present in the knobs as `<flag>=<value>`.
+KNOWN_FLAG_MAP = {
+ "max_model_len": "--max-model-len",
+ "gpu_memory_utilization": "--gpu-memory-utilization",
+}
+
+
+def _path() -> str:
+    """Return the overrides file location.
+
+    The ``MODELS_OVERRIDES`` environment variable takes precedence (handy for
+    local development); otherwise the in-container default is used.
+    """
+    container_default = "/data/models-overrides.yaml"
+    return os.environ.get("MODELS_OVERRIDES", container_default)
+
+
+def load_overrides() -> dict:
+    """Read the overrides file and return it with both sections present.
+
+    Returns:
+        The parsed mapping, with ``knobs`` always a dict and ``custom`` always
+        a list. A missing or empty file yields empty sections.
+
+    A section written without a value (``knobs:``) parses to ``None``.
+    ``setdefault`` would keep that ``None``, and callers that mutate the
+    sections (for example ``set_knobs``, which calls ``data["knobs"].pop``)
+    would then fail, so null sections are normalised here.
+    """
+    try:
+        with open(_path(), encoding="utf-8") as f:
+            data = yaml.safe_load(f) or {}
+    except FileNotFoundError:
+        return {"knobs": {}, "custom": []}
+    data["knobs"] = data.get("knobs") or {}
+    data["custom"] = data.get("custom") or []
+    return data
+
+
+def save_overrides(data: dict) -> None:
+    """Write ``data`` to the overrides file, creating its directory if needed.
+
+    The YAML is written to a sibling temporary file which is then moved over
+    the target with ``os.replace``. Opening the target itself with ``"w"``
+    truncates it before anything is written, so an interruption or a
+    serialisation error mid-write would leave a partial overrides file behind.
+    """
+    target = Path(_path())
+    target.parent.mkdir(parents=True, exist_ok=True)
+    # Same directory as the target so the final rename stays on one filesystem.
+    tmp = target.with_name(target.name + ".tmp")
+    with open(tmp, "w") as f:
+        yaml.safe_dump(data, f, sort_keys=False)
+    os.replace(tmp, target)
+
+
+def set_knobs(key: str, knobs: dict) -> dict:
+    """Save ``knobs`` for ``key``, or forget them when ``knobs`` is empty, and persist.
+
+    Returns the full overrides mapping after the change.
+    """
+    overrides = load_overrides()
+    saved = overrides["knobs"]
+    if knobs:
+        saved[key] = knobs
+    else:
+        # An empty knob set means "back to defaults": drop the entry entirely.
+        saved.pop(key, None)
+    save_overrides(overrides)
+    return overrides
+
+
+def add_custom(entry: dict) -> dict:
+    """Add ``entry`` to the custom list, replacing any entry with the same key, and persist.
+
+    Knob overrides saved under the same key belonged to the entry being
+    replaced, so they are dropped, mirroring what ``delete_custom`` does when
+    an entry goes away.
+
+    Returns:
+        The full overrides mapping after the change.
+
+    Raises:
+        KeyError: ``entry`` has no ``"key"``. Raised up front so a keyless
+            entry is never appended, regardless of what is already stored.
+    """
+    key = entry["key"]
+    data = load_overrides()
+    data["custom"] = [c for c in data["custom"] if c.get("key") != key]
+    data["custom"].append(entry)
+    # Tolerate a null `knobs` section: nothing to clear in that case.
+    (data.get("knobs") or {}).pop(key, None)
+    save_overrides(data)
+    return data
+
+
+def delete_custom(key: str) -> dict:
+    """Remove the custom entry ``key`` together with any knobs saved for it, then persist.
+
+    Returns the full overrides mapping after the change. The file is rewritten
+    even when no entry matched.
+    """
+    overrides = load_overrides()
+    remaining = []
+    for item in overrides["custom"]:
+        if item.get("key") != key:
+            remaining.append(item)
+    overrides["custom"] = remaining
+    overrides["knobs"].pop(key, None)
+    save_overrides(overrides)
+    return overrides
+
+
+def _strip_value_flag(args: list[str], flag: str, only_value: Optional[str] = None) -> list[str]:
+    """Return ``args`` without the value-taking ``flag``, however it is spelled.
+
+    Recognised spellings are ``--flag=value`` and ``--flag value`` as a single
+    element, and ``--flag`` followed by its value as two elements. When
+    ``only_value`` is given, an occurrence is removed only if its value equals
+    ``only_value``; every other occurrence is kept as written.
+    """
+    kept: list[str] = []
+    i = 0
+    while i < len(args):
+        arg = args[i]
+        i += 1
+        if arg == flag:
+            spelled = [arg]
+            value = None
+            # Two-element spelling: the value travels with the flag, unless
+            # the next element is itself another flag.
+            if i < len(args) and not args[i].startswith("--"):
+                value = args[i]
+                spelled.append(value)
+                i += 1
+        elif arg.startswith((f"{flag}=", f"{flag} ")):
+            spelled = [arg]
+            value = arg[len(flag) + 1:]
+        else:
+            kept.append(arg)
+            continue
+        if only_value is not None and value != only_value:
+            kept.extend(spelled)
+    return kept
+
+
+def apply_knobs_to_args(vllm_args: list[str], knobs: Optional[dict]) -> list[str]:
+    """Return a new vllm_args list with knob values overriding matching flags.
+
+    For every knob present in ``knobs`` the bundled occurrence of its flag is
+    removed and, where the knob is switched on / has a value, re-added at the
+    end of the list:
+
+    * ``max_model_len`` / ``gpu_memory_utilization`` -> ``<flag>=<value>``
+      (omitted when the value is ``None``).
+    * ``fastsafetensors`` -> ``--load-format=fastsafetensors`` when truthy.
+      When falsy only a bundled fastsafetensors load format is removed; any
+      other bundled ``--load-format`` is left alone.
+    * ``prefix_caching`` -> ``--enable-prefix-caching`` when truthy.
+    * ``kv_cache_dtype`` -> ``--kv-cache-dtype=<value>`` unless empty or ``auto``.
+
+    A flag given as two list elements (``--flag``, ``value``) is removed
+    together with its value, so no orphaned value is left in the result.
+
+    Knob values are shell-quoted: the resulting list is joined with spaces
+    into a shell command by ``build_launch_command``, and knobs arrive from
+    user input. Ordinary values such as ``65536``, ``0.85`` or ``fp8`` are
+    unchanged by quoting.
+
+    ``vllm_args`` is never mutated.
+    """
+    if not knobs:
+        return list(vllm_args)
+    import shlex  # function-local: only needed when knob values are rendered
+
+    filtered = list(vllm_args)
+    for k, flag in KNOWN_FLAG_MAP.items():
+        if k in knobs:
+            filtered = _strip_value_flag(filtered, flag)
+    if "fastsafetensors" in knobs:
+        # Switching the knob off must only undo fastsafetensors, not strip an
+        # unrelated load format the bundled args rely on.
+        only = None if knobs["fastsafetensors"] else "fastsafetensors"
+        filtered = _strip_value_flag(filtered, "--load-format", only_value=only)
+    if "prefix_caching" in knobs:
+        filtered = [a for a in filtered if not a.startswith("--enable-prefix-caching")]
+    if "kv_cache_dtype" in knobs:
+        filtered = _strip_value_flag(filtered, "--kv-cache-dtype")
+
+    for k, flag in KNOWN_FLAG_MAP.items():
+        if k in knobs and knobs[k] is not None:
+            filtered.append(f"{flag}={shlex.quote(str(knobs[k]))}")
+
+    if knobs.get("fastsafetensors"):
+        filtered.append("--load-format=fastsafetensors")
+    if knobs.get("prefix_caching"):
+        filtered.append("--enable-prefix-caching")
+    kvd = knobs.get("kv_cache_dtype")
+    if kvd and kvd != "auto":
+        filtered.append(f"--kv-cache-dtype={shlex.quote(str(kvd))}")
+
+    return filtered
+
+
+def extract_knobs_from_args(vllm_args: list[str]) -> dict:
+    """Derive default knob values from a model's bundled vLLM arguments.
+
+    This gives the UI a starting point before the user has saved any
+    overrides. Only the ``--flag=value`` spelling is read. Recognised
+    arguments are ``--max-model-len`` (int), ``--gpu-memory-utilization``
+    (float), ``--load-format=fastsafetensors``, ``--enable-prefix-caching``
+    and ``--kv-cache-dtype``. Numeric values that do not parse are ignored,
+    as is every other argument.
+    """
+    numeric_flags = {
+        "--max-model-len": ("max_model_len", int),
+        "--gpu-memory-utilization": ("gpu_memory_utilization", float),
+    }
+    found: dict[str, Any] = {}
+    for arg in vllm_args:
+        flag, has_value, raw = arg.partition("=")
+        if has_value and flag in numeric_flags:
+            knob, convert = numeric_flags[flag]
+            try:
+                found[knob] = convert(raw)
+            except ValueError:
+                # Best effort: an unparsable bundled value yields no default.
+                continue
+        elif arg == "--load-format=fastsafetensors":
+            found["fastsafetensors"] = True
+        elif arg == "--enable-prefix-caching":
+            found["prefix_caching"] = True
+        elif has_value and flag == "--kv-cache-dtype":
+            found["kv_cache_dtype"] = raw
+    return found
diff --git a/image/app/server.py b/image/app/server.py
index bb496fe..d73b5e5 100644
--- a/image/app/server.py
+++ b/image/app/server.py
@@ -13,6 +13,7 @@ from .config import Settings
from .download import DownloadManager
from .health import check_magpie, check_parakeet, check_vllm
from .models import load_catalog
+from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs
from .services import docker_state, run_action, services_from_settings
from .ssh import ssh_run
from .swap import SwapManager
@@ -46,14 +47,75 @@ async def get_config() -> dict:
}
+def _reload_catalog() -> None:
+    """Re-read the catalog from ``settings.models_yaml`` and hand it to the swap manager.
+
+    Rebinds the module-level ``catalog`` so request handlers see the new data.
+    """
+    global catalog
+    refreshed = load_catalog(settings.models_yaml)
+    catalog = refreshed
+    swap_manager.reload_catalog(refreshed)
+
+
@app.get("/api/models")
async def get_models() -> dict:
+ out_models: dict[str, dict] = {}
+ for key, m in catalog.models.items():
+ d = m.model_dump()
+ # Always include effective knobs for the UI (defaults from base args + any overrides)
+ d["effective_knobs"] = {**extract_knobs_from_args(m.vllm_args), **(m.knobs or {})}
+ out_models[key] = d
return {
"defaults": catalog.defaults.model_dump(),
- "models": {k: v.model_dump() for k, v in catalog.models.items()},
+ "models": out_models,
}
+# Request body for PUT /api/models/{key}/knobs.
+class KnobsBody(BaseModel):
+    knobs: dict
+
+
+def _is_number(value: object) -> bool:
+    # bool is an int subclass; JSON true/false must not pass as a number.
+    return isinstance(value, (int, float)) and not isinstance(value, bool)
+
+
+# Knob names accepted by the endpoint, each with a predicate for acceptable values.
+_KNOB_CHECKS = {
+    "max_model_len": lambda v: isinstance(v, int) and not isinstance(v, bool) and v > 0,
+    "gpu_memory_utilization": _is_number,
+    "fastsafetensors": lambda v: isinstance(v, bool),
+    "prefix_caching": lambda v: isinstance(v, bool),
+    "kv_cache_dtype": lambda v: isinstance(v, str) and v.replace("_", "").isalnum(),
+}
+
+
+@app.put("/api/models/{key}/knobs")
+async def put_model_knobs(key: str, body: KnobsBody) -> dict:
+    """Save the knob overrides for one model and reload the catalog.
+
+    ``None`` and empty-string values are dropped first; saving an empty set
+    removes the model's overrides. The remaining knobs must be known names
+    with values of the expected type, because they are persisted and later
+    rendered into the model's launch arguments.
+
+    Raises:
+        HTTPException: 404 for an unknown model key; 400 for an unknown knob
+            name or a value of the wrong type.
+    """
+    if key not in catalog.models:
+        raise HTTPException(404, f"unknown model: {key}")
+    # Strip empty/None values
+    clean = {k: v for k, v in body.knobs.items() if v not in (None, "")}
+    for name, value in clean.items():
+        check = _KNOB_CHECKS.get(name)
+        if check is None:
+            raise HTTPException(400, f"unknown knob: {name}")
+        if not check(value):
+            raise HTTPException(400, f"invalid value for knob: {name}")
+    set_knobs(key, clean)
+    _reload_catalog()
+    return {"ok": True, "key": key, "knobs": clean}
+
+
+# Request body for POST /api/models: one user-defined catalog entry.
+class CustomModelBody(BaseModel):
+    key: str
+    display_name: str
+    repo: str
+    size_gb: float = 0
+    mode: Literal["solo", "cluster"] = "solo"
+    description: str | None = None
+    capabilities: list[str] = []
+    vllm_args: list[str] = []
+    knobs: dict | None = None
+
+
+@app.post("/api/models")
+async def post_model(body: CustomModelBody) -> dict:
+    """Add (or replace) a custom catalog entry and reload the catalog.
+
+    Raises:
+        HTTPException: 400 when the key contains anything other than
+            alphanumerics, ``-`` and ``_``; 409 when the key belongs to a
+            bundled model.
+    """
+    key = body.key
+    key_without_separators = key.replace("-", "").replace("_", "")
+    if not (key and key_without_separators.isalnum()):
+        raise HTTPException(400, "key must be alphanumeric/-/_ only")
+
+    # Re-posting an existing custom key is allowed; only bundled keys are protected.
+    existing = catalog.models.get(key)
+    if existing is not None and not existing.custom:
+        raise HTTPException(409, f"'{body.key}' is a bundled model — pick a different key")
+
+    add_custom(body.model_dump())
+    _reload_catalog()
+    return {"ok": True, "key": key}
+
+
+@app.delete("/api/models/{key}")
+async def del_model(key: str) -> dict:
+    """Remove a custom model from the catalog and reload it.
+
+    Raises:
+        HTTPException: 404 for an unknown key; 400 when the key belongs to a
+            bundled model.
+    """
+    model = catalog.models.get(key)
+    if model is None:
+        raise HTTPException(404, f"unknown model: {key}")
+    if not model.custom:
+        raise HTTPException(400, "cannot delete a bundled model; you may override its knobs instead")
+    delete_custom(key)
+    _reload_catalog()
+    return {"ok": True, "key": key}
+
+
@app.get("/api/services")
async def get_services() -> dict:
"""Lifecycle state of always-on support services (Parakeet, Magpie, …).
diff --git a/image/app/static/app.js b/image/app/static/app.js
index 8b6adba..125a8a1 100644
--- a/image/app/static/app.js
+++ b/image/app/static/app.js
@@ -53,24 +53,32 @@ function renderCards() {
const desc = m.description
? `
${escapeHtml(m.description)}
`
: '';
+ const customPill = m.custom ? `custom` : '';
card.innerHTML = `
${escapeHtml(m.display_name)}
${m.mode}
${m.size_gb} GB
+ ${customPill}
${(m.capabilities || []).map(c => `${escapeHtml(c)}`).join('')}
${desc}
${escapeHtml(m.repo)}
-
+
+
+
+
`;
root.appendChild(card);
}
- for (const btn of $$('.card .btn')) {
- btn.addEventListener('click', () => triggerSwap(btn.dataset.key));
+ for (const btn of root.querySelectorAll('[data-swap-key]')) {
+ btn.addEventListener('click', () => triggerSwap(btn.dataset.swapKey));
+ }
+ for (const btn of root.querySelectorAll('[data-adv-key]')) {
+ btn.addEventListener('click', () => openAdvanced(btn.dataset.advKey));
}
}
@@ -544,6 +552,8 @@ async function startDownload() {
alert('Enter a HuggingFace repo in the form "org/name", e.g. RedHatAI/Qwen3.6-35B-A3B-NVFP4');
return;
}
+ dlState.last_repo = repo;
+ dlState.last_mode = mode;
try {
const r = await fetchJSON('/api/download', {
method: 'POST',
@@ -623,12 +633,126 @@ function handleDownloadDone(d) {
el('#dl-phase').textContent = 'Failed';
} else {
el('#dl-title').textContent = 'Done';
- el('#dl-phase').textContent = 'Done ✓ — you can now add this model to the catalog and swap to it.';
+ el('#dl-phase').textContent = 'Done ✓';
el('#dl-progress-fill').style.width = '100%';
+ // Offer to add to catalog
+ const repo = dlState.last_repo;
+ const mode = dlState.last_mode;
+ if (repo) {
+ setTimeout(() => openCatalogDialog(repo, mode), 600);
+ }
}
dlState.job_id = null;
}
+// ===================== Advanced / Add to catalog =====================
+
+function openAdvanced(key) {
+  // Open the Advanced dialog for one catalog entry: pre-fill the controls from
+  // its effective knobs and point the Delete and Save actions at that entry.
+  const model = state.models[key];
+  if (!model) return;
+
+  const dialog = el('#advanced-dialog');
+  el('#adv-title').textContent = `Advanced — ${model.display_name}`;
+  const current = model.effective_knobs || {};
+
+  // Shared tail of both actions: close the dialog, then refresh cards and status.
+  const closeAndRefresh = async () => {
+    dialog.close();
+    await loadModels();
+    pollStatus();
+  };
+
+  el('#adv-mml').value = current.max_model_len ?? '';
+  el('#adv-gmu').value = current.gpu_memory_utilization ?? 0.85;
+  // The readout is derived from the value the input reports back.
+  el('#adv-gmu-out').value = parseFloat(el('#adv-gmu').value).toFixed(2);
+  el('#adv-fst').checked = Boolean(current.fastsafetensors);
+  el('#adv-pcache').checked = Boolean(current.prefix_caching);
+  el('#adv-fp8').checked = current.kv_cache_dtype === 'fp8';
+
+  // Delete is shown for custom entries only. The handler is reassigned on every
+  // open so it always targets the entry currently displayed.
+  const deleteButton = el('#adv-delete');
+  deleteButton.classList.toggle('hidden', !model.custom);
+  deleteButton.onclick = async () => {
+    const question = `Delete "${model.display_name}" from the catalog? The model weights on disk are NOT deleted.`;
+    if (!confirm(question)) return;
+    try {
+      await fetchJSON(`/api/models/${encodeURIComponent(key)}`, { method: 'DELETE' });
+      await closeAndRefresh();
+    } catch (err) {
+      alert('Delete failed: ' + err.message);
+    }
+  };
+
+  el('#advanced-form').onsubmit = async (event) => {
+    event.preventDefault();
+    const knobs = {};
+    // Numeric knobs are sent only when the field holds a usable number.
+    const maxLen = parseInt(el('#adv-mml').value, 10);
+    if (Number.isFinite(maxLen) && maxLen > 0) knobs.max_model_len = maxLen;
+    const gpuShare = parseFloat(el('#adv-gmu').value);
+    if (Number.isFinite(gpuShare)) knobs.gpu_memory_utilization = gpuShare;
+    // Toggles are always sent, as explicit true/false.
+    knobs.fastsafetensors = Boolean(el('#adv-fst').checked);
+    knobs.prefix_caching = Boolean(el('#adv-pcache').checked);
+    knobs.kv_cache_dtype = el('#adv-fp8').checked ? 'fp8' : 'auto';
+    try {
+      await fetchJSON(`/api/models/${encodeURIComponent(key)}/knobs`, {
+        method: 'PUT',
+        headers: { 'content-type': 'application/json' },
+        body: JSON.stringify({ knobs }),
+      });
+      await closeAndRefresh();
+    } catch (err) {
+      alert('Save failed: ' + err.message);
+    }
+  };
+
+  dialog.showModal();
+}
+
+function openCatalogDialog(repo, mode) {
+  // Pre-fill the "add to catalog" dialog for a repo and show it.
+  const dialog = el('#catalog-dialog');
+  const repoName = repo.split('/').pop();
+
+  // Field selector -> initial value. The key is the repo name lowercased, with
+  // every character outside [a-z0-9_-] replaced by '-'.
+  const initialValues = {
+    '#cd-key': repoName.toLowerCase().replace(/[^a-z0-9_-]/g, '-'),
+    '#cd-name': repoName,
+    '#cd-repo': repo,
+    '#cd-size': '',
+    '#cd-mode': mode || 'solo',
+    '#cd-desc': '',
+    '#cd-mml': 32768,
+    '#cd-gmu': 0.85,
+    '#cd-gmu-out': '0.85',
+  };
+  for (const [selector, value] of Object.entries(initialValues)) {
+    el(selector).value = value;
+  }
+
+  // All three toggles start switched on.
+  for (const selector of ['#cd-fst', '#cd-pcache', '#cd-fp8']) {
+    el(selector).checked = true;
+  }
+
+  dialog.showModal();
+}
+
+function setupCatalogDialog() {
+  // One-time wiring for the "add to catalog" dialog: Cancel, the
+  // gpu_memory_utilization readout, and the submit that POSTs the new entry.
+  const closeDialog = () => el('#catalog-dialog').close();
+  const trimmed = (selector) => el(selector).value.trim();
+
+  el('#cd-cancel').addEventListener('click', () => {
+    closeDialog();
+  });
+
+  el('#cd-gmu').addEventListener('input', (event) => {
+    el('#cd-gmu-out').value = parseFloat(event.target.value).toFixed(2);
+  });
+
+  el('#catalog-form').addEventListener('submit', async (event) => {
+    event.preventDefault();
+    const payload = {
+      key: trimmed('#cd-key'),
+      display_name: trimmed('#cd-name'),
+      repo: trimmed('#cd-repo'),
+      // An empty or unparsable size is sent as 0.
+      size_gb: parseFloat(el('#cd-size').value) || 0,
+      mode: el('#cd-mode').value,
+      description: trimmed('#cd-desc') || null,
+      vllm_args: [],
+      knobs: {
+        max_model_len: parseInt(el('#cd-mml').value, 10) || 32768,
+        gpu_memory_utilization: parseFloat(el('#cd-gmu').value),
+        fastsafetensors: el('#cd-fst').checked,
+        prefix_caching: el('#cd-pcache').checked,
+        kv_cache_dtype: el('#cd-fp8').checked ? 'fp8' : 'auto',
+      },
+    };
+    try {
+      await fetchJSON('/api/models', {
+        method: 'POST',
+        headers: { 'content-type': 'application/json' },
+        body: JSON.stringify(payload),
+      });
+      closeDialog();
+      closeDownloadPanel();
+      await loadModels();
+      pollStatus();
+    } catch (err) {
+      alert('Add to catalog failed: ' + err.message);
+    }
+  });
+}
+
+function setupAdvancedDialog() {
+  // One-time wiring for the Advanced dialog: Cancel closes it, and the
+  // gpu_memory_utilization input keeps its two-decimal readout in sync.
+  el('#adv-cancel').addEventListener('click', () => {
+    el('#advanced-dialog').close();
+  });
+  el('#adv-gmu').addEventListener('input', (event) => {
+    const shown = parseFloat(event.target.value).toFixed(2);
+    el('#adv-gmu-out').value = shown;
+  });
+}
+
// ===================== updates (spark-vllm-docker) =====================
const updState = {
@@ -769,6 +893,8 @@ async function init() {
list.open = !list.open;
});
el('#ub-apply').addEventListener('click', applyUpdate);
+ setupCatalogDialog();
+ setupAdvancedDialog();
await loadModels();
await pollStatus();
await renderServices();
diff --git a/image/app/static/index.html b/image/app/static/index.html
index 33ed6bd..ae07bfd 100644
--- a/image/app/static/index.html
+++ b/image/app/static/index.html
@@ -74,6 +74,54 @@
+
+
+
+