Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9ff7ee9c1e | |||
| 1602b3b3b4 | |||
| 8ac455f5f5 |
@@ -173,16 +173,38 @@ class DeepHealth:
|
||||
if not s.spark1_host:
|
||||
return ProbeResult(ok=False, at=now_iso, error="not configured")
|
||||
base = f"http://{s.spark1_host}:{s.vllm_port}"
|
||||
# Step 1: is there a model loaded?
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=5.0) as c:
|
||||
r = await c.get(f"{base}/v1/models")
|
||||
r.raise_for_status()
|
||||
if 200 <= r.status_code < 300:
|
||||
models = r.json().get("data") or []
|
||||
if not models:
|
||||
return ProbeResult(ok=False, at=now_iso, error="no model loaded")
|
||||
model_id = models[0]["id"]
|
||||
except Exception as e:
|
||||
return ProbeResult(ok=False, at=now_iso, error=f"list models: {type(e).__name__}: {e}")
|
||||
else:
|
||||
# 5xx on /v1/models suggests something wedged after a model loaded
|
||||
return ProbeResult(
|
||||
ok=False,
|
||||
at=now_iso,
|
||||
error=f"list_models HTTP {r.status_code}: {r.text[:240]}",
|
||||
)
|
||||
except Exception:
|
||||
# Connection refused / timeout: usually means no vLLM process listening
|
||||
# (the vllm_node container is alive but no `vllm serve` is running yet).
|
||||
# That's an idle state, not a wedge — don't trigger auto-restart.
|
||||
return ProbeResult(
|
||||
ok=True,
|
||||
at=now_iso,
|
||||
note="no model currently loaded (idle)",
|
||||
)
|
||||
|
||||
if not models:
|
||||
return ProbeResult(
|
||||
ok=True,
|
||||
at=now_iso,
|
||||
note="no model currently loaded (idle)",
|
||||
)
|
||||
|
||||
model_id = models[0]["id"]
|
||||
# Step 2: model is loaded; verify it can actually complete a 1-token request.
|
||||
t0 = time.monotonic()
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
|
||||
@@ -197,7 +219,7 @@ class DeepHealth:
|
||||
)
|
||||
latency = round((time.monotonic() - t0) * 1000)
|
||||
if 200 <= r.status_code < 300:
|
||||
return ProbeResult(ok=True, at=now_iso, latency_ms=latency)
|
||||
return ProbeResult(ok=True, at=now_iso, latency_ms=latency, note=f"model={model_id}")
|
||||
return ProbeResult(
|
||||
ok=False,
|
||||
at=now_iso,
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
"""On-disk presence + deletion for Hugging Face model caches on the Sparks.
|
||||
|
||||
The HF cache layout for a repo `org/name` is:
|
||||
|
||||
~/.cache/huggingface/hub/models--org--name/
|
||||
|
||||
We use `du -sb` to measure size (bytes) and `rm -rf` to free it. All operations
|
||||
are gated by the server endpoints, which refuse to delete a currently-loaded
|
||||
model or one tied to an in-flight swap/download.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import shlex
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
from .config import Settings
|
||||
from .ssh import ssh_run
|
||||
|
||||
|
||||
def repo_to_cache_dirname(repo: str) -> str:
|
||||
"""Convert 'org/name' to 'models--org--name' (the HF hub cache directory)."""
|
||||
if "/" not in repo:
|
||||
raise ValueError(f"repo must be in 'org/name' form: {repo!r}")
|
||||
return "models--" + repo.replace("/", "--")
|
||||
|
||||
|
||||
def _cache_path(repo: str) -> str:
|
||||
"""Full remote path to the model's cache directory."""
|
||||
# Use $HOME so it resolves correctly regardless of the SSH user's home.
|
||||
return f"$HOME/.cache/huggingface/hub/{repo_to_cache_dirname(repo)}"
|
||||
|
||||
|
||||
@dataclass
|
||||
class HostDiskResult:
|
||||
host: str
|
||||
on_disk: bool
|
||||
size_bytes: int = 0
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class DiskStatus:
|
||||
repo: str
|
||||
on_disk: bool # True if present on AT LEAST one host
|
||||
total_bytes: int # sum across hosts
|
||||
per_host: list[HostDiskResult]
|
||||
|
||||
|
||||
async def probe_host(host: str, user: str, repo: str, settings: Settings) -> HostDiskResult:
|
||||
"""Return whether the model's cache dir exists on this host and its size."""
|
||||
if not host or not user:
|
||||
return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
|
||||
path = _cache_path(repo)
|
||||
# `du -sb` prints bytes; if the dir doesn't exist, `du` returns non-zero.
|
||||
# We test existence explicitly first so we can report on_disk=False cleanly.
|
||||
cmd = (
|
||||
f"if [ -d {shlex.quote(path)} ]; then "
|
||||
f"du -sb {shlex.quote(path)} 2>/dev/null | awk '{{print $1}}'; "
|
||||
f"else echo MISSING; fi"
|
||||
)
|
||||
rc, out, err = await ssh_run(host, user, cmd, settings, timeout=20.0)
|
||||
if rc != 0:
|
||||
return HostDiskResult(host=host, on_disk=False, error=(err or out).strip() or f"rc={rc}")
|
||||
raw = out.strip()
|
||||
if raw == "MISSING" or raw == "":
|
||||
return HostDiskResult(host=host, on_disk=False)
|
||||
try:
|
||||
size = int(raw.splitlines()[-1])
|
||||
except ValueError:
|
||||
return HostDiskResult(host=host, on_disk=False, error=f"unparsable du output: {raw!r}")
|
||||
return HostDiskResult(host=host, on_disk=True, size_bytes=size)
|
||||
|
||||
|
||||
async def probe_disk(repo: str, mode: str, settings: Settings) -> DiskStatus:
|
||||
"""Probe one model across the relevant Sparks based on its mode (solo|cluster)."""
|
||||
hosts: list[tuple[str, str]] = [(settings.spark1_host, settings.spark1_user)]
|
||||
if mode == "cluster" and settings.spark2_host:
|
||||
hosts.append((settings.spark2_host, settings.spark2_user))
|
||||
|
||||
results = await asyncio.gather(*(probe_host(h, u, repo, settings) for h, u in hosts))
|
||||
on_disk = any(r.on_disk for r in results)
|
||||
total = sum(r.size_bytes for r in results)
|
||||
return DiskStatus(repo=repo, on_disk=on_disk, total_bytes=total, per_host=list(results))
|
||||
|
||||
|
||||
async def delete_host(host: str, user: str, repo: str, settings: Settings) -> HostDiskResult:
|
||||
"""Probe + rm -rf on one host. Returns bytes freed (0 if the dir wasn't there)."""
|
||||
if not host or not user:
|
||||
return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")
|
||||
path = _cache_path(repo)
|
||||
# Safety: hard-code the prefix in the command so a bad `repo` can never escape.
|
||||
# Compute size first, then remove. If absent, still return success (idempotent).
|
||||
cmd = (
|
||||
f"set -e; "
|
||||
f"P={shlex.quote(path)}; "
|
||||
f"if [ -d \"$P\" ]; then "
|
||||
f" SIZE=$(du -sb \"$P\" 2>/dev/null | awk '{{print $1}}'); "
|
||||
f" rm -rf -- \"$P\"; "
|
||||
f" echo FREED $SIZE; "
|
||||
f"else "
|
||||
f" echo FREED 0; "
|
||||
f"fi"
|
||||
)
|
||||
rc, out, err = await ssh_run(host, user, cmd, settings, timeout=120.0)
|
||||
if rc != 0:
|
||||
return HostDiskResult(host=host, on_disk=False, error=(err or out).strip() or f"rc={rc}")
|
||||
# Parse the "FREED N" line
|
||||
freed = 0
|
||||
for line in out.splitlines():
|
||||
parts = line.strip().split()
|
||||
if len(parts) == 2 and parts[0] == "FREED":
|
||||
try:
|
||||
freed = int(parts[1])
|
||||
except ValueError:
|
||||
pass
|
||||
break
|
||||
return HostDiskResult(host=host, on_disk=False, size_bytes=freed)
|
||||
|
||||
|
||||
async def delete_from_disk(repo: str, mode: str, settings: Settings) -> DiskStatus:
|
||||
"""rm -rf the model's cache dir on the relevant Sparks. Idempotent."""
|
||||
hosts: list[tuple[str, str]] = [(settings.spark1_host, settings.spark1_user)]
|
||||
if mode == "cluster" and settings.spark2_host:
|
||||
hosts.append((settings.spark2_host, settings.spark2_user))
|
||||
|
||||
results = await asyncio.gather(*(delete_host(h, u, repo, settings) for h, u in hosts))
|
||||
total_freed = sum(r.size_bytes for r in results)
|
||||
# After deletion, on_disk should be False on all hosts.
|
||||
return DiskStatus(repo=repo, on_disk=False, total_bytes=total_freed, per_host=list(results))
|
||||
@@ -13,6 +13,7 @@ from .config import Settings
|
||||
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
||||
from .custom_services import add_custom_service, delete_custom_service
|
||||
from .deep_health import DeepHealth
|
||||
from .disk import delete_from_disk, probe_disk
|
||||
from .download import DownloadManager
|
||||
from .hardware import HardwareProbe
|
||||
from .health import check_magpie, check_parakeet, check_vllm
|
||||
@@ -139,6 +140,89 @@ async def del_model(key: str) -> dict:
|
||||
return {"ok": True, "key": key}
|
||||
|
||||
|
||||
@app.get("/api/models/disk-status")
|
||||
async def get_models_disk_status() -> dict:
|
||||
"""Probe each catalog model's HF cache on the appropriate Spark(s) in parallel.
|
||||
|
||||
Result is keyed by model key: {on_disk, total_bytes, per_host:[{host,on_disk,size_bytes,error?}]}.
|
||||
Designed to be called once on dashboard load; takes ~1–3s depending on Spark count.
|
||||
"""
|
||||
if not settings.configured:
|
||||
return {"configured": False, "models": {}}
|
||||
keys = list(catalog.models.keys())
|
||||
statuses = await asyncio.gather(*(
|
||||
probe_disk(catalog.models[k].repo, catalog.models[k].mode, settings) for k in keys
|
||||
), return_exceptions=True)
|
||||
out: dict[str, dict] = {}
|
||||
for k, s in zip(keys, statuses):
|
||||
if isinstance(s, Exception):
|
||||
out[k] = {"on_disk": False, "total_bytes": 0, "per_host": [], "error": str(s)}
|
||||
continue
|
||||
out[k] = {
|
||||
"on_disk": s.on_disk,
|
||||
"total_bytes": s.total_bytes,
|
||||
"per_host": [
|
||||
{"host": r.host, "on_disk": r.on_disk, "size_bytes": r.size_bytes, **({"error": r.error} if r.error else {})}
|
||||
for r in s.per_host
|
||||
],
|
||||
}
|
||||
return {"configured": True, "models": out}
|
||||
|
||||
|
||||
@app.delete("/api/models/{key}/disk")
|
||||
async def del_model_disk(key: str) -> dict:
|
||||
"""Delete a model's weights from the Spark filesystem(s). The catalog entry stays.
|
||||
|
||||
Safety rails:
|
||||
- Refuses if the model is currently loaded on vLLM.
|
||||
- Refuses if a swap or download is in flight.
|
||||
- Idempotent: if the cache dir is already gone on a host, that host reports 0 bytes freed.
|
||||
"""
|
||||
if key not in catalog.models:
|
||||
raise HTTPException(404, f"unknown model: {key}")
|
||||
m = catalog.models[key]
|
||||
|
||||
# Refuse if currently loaded
|
||||
try:
|
||||
vllm = await check_vllm(settings)
|
||||
except Exception:
|
||||
vllm = {}
|
||||
if vllm.get("ok") and vllm.get("current_model") == m.repo:
|
||||
raise HTTPException(
|
||||
409,
|
||||
f"'{m.display_name}' is the currently loaded model. Switch to a different model first, then try again."
|
||||
)
|
||||
|
||||
# Refuse if a swap is in flight
|
||||
if swap_manager.current_job_id:
|
||||
raise HTTPException(409, "a model swap is in progress; wait for it to finish")
|
||||
|
||||
# Refuse if a download is in flight for this same repo (a different model's download is fine)
|
||||
if download_manager.current_job_id:
|
||||
job = download_manager.get(download_manager.current_job_id)
|
||||
if job and job.repo == m.repo:
|
||||
raise HTTPException(409, "this model is currently downloading; cancel or wait for it to finish")
|
||||
|
||||
status = await delete_from_disk(m.repo, m.mode, settings)
|
||||
# Audit log
|
||||
record_report(
|
||||
f"disk:{key}",
|
||||
ok=True,
|
||||
source="disk-delete",
|
||||
detail=f"freed {status.total_bytes} bytes across {len(status.per_host)} host(s)",
|
||||
)
|
||||
return {
|
||||
"ok": True,
|
||||
"key": key,
|
||||
"repo": m.repo,
|
||||
"bytes_freed": status.total_bytes,
|
||||
"per_host": [
|
||||
{"host": r.host, "size_bytes": r.size_bytes, **({"error": r.error} if r.error else {})}
|
||||
for r in status.per_host
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/hardware")
|
||||
async def get_hardware() -> dict:
|
||||
"""Per-Spark hardware snapshot — RAM, disk, GPU mem + util, CPU load, uptime."""
|
||||
|
||||
@@ -18,6 +18,8 @@ const state = {
|
||||
configured: true,
|
||||
timer_handle: null,
|
||||
deep_health: {},
|
||||
disk_status: {}, // keyed by model key: { on_disk, total_bytes, per_host }
|
||||
disk_status_loaded: false,
|
||||
};
|
||||
|
||||
const el = (sel) => document.querySelector(sel);
|
||||
@@ -57,12 +59,36 @@ function renderCards() {
|
||||
? `<div class="desc">${escapeHtml(m.description)}</div>`
|
||||
: '';
|
||||
const customPill = m.custom ? `<span class="tag custom-pill">custom</span>` : '';
|
||||
// Disk-presence pill + trash button. Until /api/models/disk-status comes back,
|
||||
// we don't know — render a neutral placeholder.
|
||||
const disk = state.disk_status[key];
|
||||
let diskPill = '';
|
||||
if (state.disk_status_loaded) {
|
||||
if (disk && disk.on_disk) {
|
||||
const gb = (disk.total_bytes / 1e9);
|
||||
diskPill = `<span class="tag on-disk" title="Weights present on disk">on disk · ${gb.toFixed(1)} GB</span>`;
|
||||
} else {
|
||||
diskPill = `<span class="tag not-on-disk" title="Weights not downloaded">not downloaded</span>`;
|
||||
}
|
||||
}
|
||||
// Trash button — hidden if not on disk; disabled (with tooltip) if currently loaded.
|
||||
let trashBtn = '';
|
||||
if (state.disk_status_loaded && disk && disk.on_disk) {
|
||||
const disabled = isActive || isSwapping;
|
||||
const tip = isActive
|
||||
? 'Currently loaded — switch to another model first'
|
||||
: isSwapping
|
||||
? 'A swap is in progress'
|
||||
: 'Delete weights from disk';
|
||||
trashBtn = `<button class="icon-btn danger" data-disk-del-key="${key}" title="${escapeHtml(tip)}" aria-label="Delete from disk" ${disabled ? 'disabled' : ''}>${trashIcon}</button>`;
|
||||
}
|
||||
card.innerHTML = `
|
||||
<div class="name">${escapeHtml(m.display_name)}</div>
|
||||
<div class="meta">
|
||||
<span class="tag mode-${m.mode}">${m.mode}</span>
|
||||
<span class="tag">${m.size_gb} GB</span>
|
||||
${customPill}
|
||||
${diskPill}
|
||||
${(m.capabilities || []).map(c => `<span class="tag cap">${escapeHtml(c)}</span>`).join('')}
|
||||
</div>
|
||||
${desc}
|
||||
@@ -76,6 +102,7 @@ function renderCards() {
|
||||
</button>
|
||||
<button class="btn test-btn" data-test-key="${key}" title="Pre-flight check the launch command without starting the engine">Test</button>
|
||||
<button class="btn adv-btn" data-adv-key="${key}" title="Advanced settings">Advanced</button>
|
||||
${trashBtn}
|
||||
</div>
|
||||
<div class="test-result hidden" data-test-result-for="${key}"></div>
|
||||
`;
|
||||
@@ -90,8 +117,13 @@ function renderCards() {
|
||||
for (const btn of root.querySelectorAll('[data-test-key]')) {
|
||||
btn.addEventListener('click', () => testLaunch(btn.dataset.testKey, btn));
|
||||
}
|
||||
for (const btn of root.querySelectorAll('[data-disk-del-key]')) {
|
||||
btn.addEventListener('click', () => openDiskDeleteDialog(btn.dataset.diskDelKey));
|
||||
}
|
||||
}
|
||||
|
||||
const trashIcon = '<svg viewBox="0 0 24 24" width="14" height="14" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"><polyline points="3 6 5 6 21 6"></polyline><path d="M19 6l-1 14a2 2 0 0 1-2 2H8a2 2 0 0 1-2-2L5 6"></path><path d="M10 11v6"></path><path d="M14 11v6"></path><path d="M9 6V4a2 2 0 0 1 2-2h2a2 2 0 0 1 2 2v2"></path></svg>';
|
||||
|
||||
async function testLaunch(key, btn) {
|
||||
const resultEl = document.querySelector(`[data-test-result-for="${key}"]`);
|
||||
if (!resultEl) return;
|
||||
@@ -739,6 +771,78 @@ async function loadModels() {
|
||||
state.models = data.models || {};
|
||||
}
|
||||
|
||||
async function loadDiskStatus() {
|
||||
// Probes each catalog model's HF cache over SSH; takes a beat. Best-effort.
|
||||
try {
|
||||
const r = await fetchJSON('/api/models/disk-status');
|
||||
if (r && r.models) {
|
||||
state.disk_status = r.models;
|
||||
state.disk_status_loaded = true;
|
||||
renderCards();
|
||||
}
|
||||
} catch (e) {
|
||||
// Silent — pills just won't render. Don't block dashboard.
|
||||
console.warn('disk-status probe failed:', e.message);
|
||||
}
|
||||
}
|
||||
|
||||
function fmtBytesShort(n) {
|
||||
if (!Number.isFinite(n) || n <= 0) return '0 B';
|
||||
if (n >= 1e9) return `${(n / 1e9).toFixed(1)} GB`;
|
||||
if (n >= 1e6) return `${(n / 1e6).toFixed(1)} MB`;
|
||||
if (n >= 1e3) return `${(n / 1e3).toFixed(1)} KB`;
|
||||
return `${n} B`;
|
||||
}
|
||||
|
||||
function openDiskDeleteDialog(key) {
|
||||
const m = state.models[key];
|
||||
const disk = state.disk_status[key];
|
||||
if (!m || !disk || !disk.on_disk) return;
|
||||
const dlg = el('#disk-delete-dialog');
|
||||
el('#dd-summary').innerHTML = `Free <strong>${fmtBytesShort(disk.total_bytes)}</strong> by removing <strong>${escapeHtml(m.display_name)}</strong> (<code>${escapeHtml(m.repo)}</code>) from disk.`;
|
||||
const hostsEl = el('#dd-hosts');
|
||||
hostsEl.innerHTML = '';
|
||||
for (const h of (disk.per_host || [])) {
|
||||
if (!h.on_disk) continue;
|
||||
const li = document.createElement('li');
|
||||
li.innerHTML = `<code>${escapeHtml(h.host)}</code> — ${fmtBytesShort(h.size_bytes)}`;
|
||||
hostsEl.appendChild(li);
|
||||
}
|
||||
const errEl = el('#dd-error');
|
||||
errEl.classList.add('hidden');
|
||||
errEl.textContent = '';
|
||||
|
||||
const confirm = el('#dd-confirm');
|
||||
const cancel = el('#dd-cancel');
|
||||
const onCancel = () => dlg.close();
|
||||
const onConfirm = async () => {
|
||||
confirm.disabled = true;
|
||||
cancel.disabled = true;
|
||||
confirm.textContent = 'Deleting…';
|
||||
try {
|
||||
const r = await fetchJSON(`/api/models/${encodeURIComponent(key)}/disk`, { method: 'DELETE' });
|
||||
dlg.close();
|
||||
// Optimistically clear local disk state for this key, then refresh.
|
||||
delete state.disk_status[key];
|
||||
renderCards();
|
||||
// Eagerly re-probe so size is accurate (and shows "not downloaded" pill).
|
||||
loadDiskStatus();
|
||||
const freed = r && typeof r.bytes_freed === 'number' ? fmtBytesShort(r.bytes_freed) : '';
|
||||
console.log(`Deleted ${m.display_name} from disk${freed ? ` — freed ${freed}` : ''}.`);
|
||||
} catch (e) {
|
||||
errEl.textContent = e.message || 'Delete failed';
|
||||
errEl.classList.remove('hidden');
|
||||
} finally {
|
||||
confirm.disabled = false;
|
||||
cancel.disabled = false;
|
||||
confirm.textContent = 'Delete from disk';
|
||||
}
|
||||
};
|
||||
cancel.onclick = onCancel;
|
||||
confirm.onclick = onConfirm;
|
||||
dlg.showModal();
|
||||
}
|
||||
|
||||
async function triggerSwap(modelKey) {
|
||||
if (state.swap_job_id) return;
|
||||
try {
|
||||
@@ -1523,9 +1627,12 @@ async function init() {
|
||||
await renderServices();
|
||||
pollHardware();
|
||||
pollUpdates();
|
||||
// Disk-status probe runs after first paint — slow over SSH and not blocking.
|
||||
loadDiskStatus();
|
||||
setInterval(pollStatus, 5000);
|
||||
setInterval(pollHardware, 8000); // every 8s
|
||||
setInterval(pollUpdates, 300000); // every 5 min
|
||||
setInterval(loadDiskStatus, 60000); // every 60s — disk state changes rarely
|
||||
}
|
||||
|
||||
init();
|
||||
|
||||
@@ -188,6 +188,20 @@
|
||||
</form>
|
||||
</dialog>
|
||||
|
||||
<dialog id="disk-delete-dialog" class="modal">
|
||||
<form method="dialog" class="modal-form">
|
||||
<h3>Delete model weights from disk?</h3>
|
||||
<p id="dd-summary" class="muted small"></p>
|
||||
<ul class="muted small dd-hosts" id="dd-hosts"></ul>
|
||||
<p class="muted small">This is reversible — you can re-download from the catalog at any time. The catalog entry stays intact.</p>
|
||||
<p id="dd-error" class="muted small dd-error hidden"></p>
|
||||
<div class="modal-actions">
|
||||
<button type="button" id="dd-cancel" class="btn">Cancel</button>
|
||||
<button type="button" id="dd-confirm" class="btn danger">Delete from disk</button>
|
||||
</div>
|
||||
</form>
|
||||
</dialog>
|
||||
|
||||
<dialog id="advanced-dialog" class="modal">
|
||||
<form method="dialog" class="modal-form" id="advanced-form">
|
||||
<h3 id="adv-title">Advanced settings</h3>
|
||||
|
||||
@@ -717,6 +717,14 @@ main {
|
||||
.card .adv-btn,
|
||||
.card .test-btn { padding: 8px 12px; font-size: 12px; }
|
||||
.card .custom-pill { color: var(--info); border-color: rgba(96, 165, 250, 0.4); }
|
||||
.tag.on-disk { color: var(--accent); border-color: rgba(74, 222, 128, 0.4); }
|
||||
.tag.not-on-disk { color: var(--muted); border-color: var(--border); opacity: 0.7; }
|
||||
.card-actions .icon-btn.danger { color: var(--error); border-color: rgba(239, 68, 68, 0.3); margin-left: auto; }
|
||||
.card-actions .icon-btn.danger:hover:not(:disabled) { background: rgba(239, 68, 68, 0.08); border-color: var(--error); color: var(--error); }
|
||||
.card-actions .icon-btn.danger:disabled { opacity: 0.35; cursor: not-allowed; }
|
||||
.dd-hosts { padding-left: 18px; margin: 4px 0 8px; }
|
||||
.dd-hosts code { background: var(--surface-2); padding: 1px 5px; border-radius: 4px; }
|
||||
.dd-error { color: var(--error); }
|
||||
|
||||
.test-result {
|
||||
font-size: 12px;
|
||||
|
||||
@@ -30,6 +30,7 @@ models:
|
||||
- -tp=2
|
||||
- --distributed-executor-backend=ray
|
||||
- --max-model-len=32768
|
||||
- --max-num-batched-tokens=16384
|
||||
|
||||
gemma4:
|
||||
display_name: "Gemma 4 31B"
|
||||
@@ -45,6 +46,7 @@ models:
|
||||
vllm_args:
|
||||
- --gpu-memory-utilization=0.8
|
||||
- --max-model-len=32768
|
||||
- --max-num-batched-tokens=16384
|
||||
- --reasoning-parser=gemma4
|
||||
- --tool-call-parser=gemma4
|
||||
- --enable-auto-tool-choice
|
||||
|
||||
@@ -24,6 +24,10 @@ This flag is Blackwell-specific. If vLLM in the container reports `unrecognized
|
||||
|
||||
Qwen3.6 uses a Mamba-attention hybrid that requires `--max-num-batched-tokens >= 2096`. vLLM's default is 2048, which trips `AssertionError: In Mamba cache align mode, block_size (2096) must be <= max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into the bundled qwen36 entry — matches the upstream qwen3.5-35b-a3b-fp8 recipe.
|
||||
|
||||
## Multimodal token budget for vision models (fixed in v0.8.0:1)
|
||||
|
||||
After the eugr/spark-vllm-docker update, vLLM became stricter about multimodal token budgets. Vision-capable models like Gemma 4 31B and Qwen3-VL crash at engine init with `ValueError: Chunked MM input disabled but max_tokens_per_mm_item (2496) is larger than max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into every model that has the `vision` capability. Now applied to qwen3-vl, gemma4, and qwen36 (which was already set for the Mamba issue).
|
||||
|
||||
## Two SSH paths to Spark 1 from the laptop
|
||||
|
||||
`ssh <spark-user>@<spark-1-ip>` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `<spark-1-host>.local`. Always use the `.local` hostname or `<spark-2-ip>`-style entries that ARE matched.
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||
|
||||
export const v0_1_0 = VersionInfo.of({
|
||||
version: '0.8.0:2',
|
||||
version: '0.8.1:0',
|
||||
releaseNotes: {
|
||||
en_US:
|
||||
'v0.8: deep health probes. Every 5 minutes, Spark Control sends a tiny synthetic inference request to each service (1 second of silent audio to Parakeet, short text to Magpie, 1-token completion to vLLM). All payloads are generated in-memory and never written to disk. If a probe returns CUDA-error / 5xx signals while the container is still "up" — i.e. the classic Triton-wedge pattern where /health stays green but real inference fails — Spark Control automatically restarts the affected container. Rate-limited to 3 auto-restarts per service per 30 minutes. Each service card now shows the last deep-check timestamp, latency, and an inline "Run now" button. Failures and recoveries are logged into the connectivity history with source=deep-health.',
|
||||
'v0.8.1: model weights can now be deleted from disk directly from the dashboard. Each model card shows whether the weights are present (with on-disk GB size) or not yet downloaded. When present and the model is NOT currently loaded, a small trash icon appears on the card; clicking it pops a confirmation showing how many GB will be freed and on which Spark(s), then runs `rm -rf` on the Hugging Face cache directory via SSH. Cluster-mode models are deleted from both Sparks; solo-mode from Spark 1 only. Safety rails: refuses to delete the currently-loaded model, refuses during an in-flight swap or download, and the catalog entry stays — you can always re-download. Disk status is probed once on dashboard load and re-checked every 60s.',
|
||||
},
|
||||
migrations: {
|
||||
up: async ({ effects }) => {},
|
||||
|
||||
Reference in New Issue
Block a user