Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9ff7ee9c1e | |||
| 1602b3b3b4 | |||
| 8ac455f5f5 |
@@ -173,16 +173,38 @@ class DeepHealth:
|
|||||||
if not s.spark1_host:
|
if not s.spark1_host:
|
||||||
return ProbeResult(ok=False, at=now_iso, error="not configured")
|
return ProbeResult(ok=False, at=now_iso, error="not configured")
|
||||||
base = f"http://{s.spark1_host}:{s.vllm_port}"
|
base = f"http://{s.spark1_host}:{s.vllm_port}"
|
||||||
|
# Step 1: is there a model loaded?
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=5.0) as c:
|
async with httpx.AsyncClient(timeout=5.0) as c:
|
||||||
r = await c.get(f"{base}/v1/models")
|
r = await c.get(f"{base}/v1/models")
|
||||||
r.raise_for_status()
|
if 200 <= r.status_code < 300:
|
||||||
models = r.json().get("data") or []
|
models = r.json().get("data") or []
|
||||||
|
else:
|
||||||
|
# 5xx on /v1/models suggests something wedged after a model loaded
|
||||||
|
return ProbeResult(
|
||||||
|
ok=False,
|
||||||
|
at=now_iso,
|
||||||
|
error=f"list_models HTTP {r.status_code}: {r.text[:240]}",
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
# Connection refused / timeout: usually means no vLLM process listening
|
||||||
|
# (the vllm_node container is alive but no `vllm serve` is running yet).
|
||||||
|
# That's an idle state, not a wedge — don't trigger auto-restart.
|
||||||
|
return ProbeResult(
|
||||||
|
ok=True,
|
||||||
|
at=now_iso,
|
||||||
|
note="no model currently loaded (idle)",
|
||||||
|
)
|
||||||
|
|
||||||
if not models:
|
if not models:
|
||||||
return ProbeResult(ok=False, at=now_iso, error="no model loaded")
|
return ProbeResult(
|
||||||
|
ok=True,
|
||||||
|
at=now_iso,
|
||||||
|
note="no model currently loaded (idle)",
|
||||||
|
)
|
||||||
|
|
||||||
model_id = models[0]["id"]
|
model_id = models[0]["id"]
|
||||||
except Exception as e:
|
# Step 2: model is loaded; verify it can actually complete a 1-token request.
|
||||||
return ProbeResult(ok=False, at=now_iso, error=f"list models: {type(e).__name__}: {e}")
|
|
||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
|
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
|
||||||
@@ -197,7 +219,7 @@ class DeepHealth:
|
|||||||
)
|
)
|
||||||
latency = round((time.monotonic() - t0) * 1000)
|
latency = round((time.monotonic() - t0) * 1000)
|
||||||
if 200 <= r.status_code < 300:
|
if 200 <= r.status_code < 300:
|
||||||
return ProbeResult(ok=True, at=now_iso, latency_ms=latency)
|
return ProbeResult(ok=True, at=now_iso, latency_ms=latency, note=f"model={model_id}")
|
||||||
return ProbeResult(
|
return ProbeResult(
|
||||||
ok=False,
|
ok=False,
|
||||||
at=now_iso,
|
at=now_iso,
|
||||||
|
|||||||
@@ -0,0 +1,130 @@
|
|||||||
|
"""On-disk presence + deletion for Hugging Face model caches on the Sparks.
|
||||||
|
|
||||||
|
The HF cache layout for a repo `org/name` is:
|
||||||
|
|
||||||
|
~/.cache/huggingface/hub/models--org--name/
|
||||||
|
|
||||||
|
We use `du -sb` to measure size (bytes) and `rm -rf` to free it. All operations
|
||||||
|
are gated by the server endpoints, which refuse to delete a currently-loaded
|
||||||
|
model or one tied to an in-flight swap/download.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
|
import shlex
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from .config import Settings
|
||||||
|
from .ssh import ssh_run
|
||||||
|
|
||||||
|
|
||||||
|
def repo_to_cache_dirname(repo: str) -> str:
    """Map a Hugging Face repo id to its hub-cache directory name.

    Every ``/`` in *repo* becomes ``--`` and the result is prefixed with
    ``models--``; for example ``'org/name'`` maps to ``'models--org--name'``.

    Raises:
        ValueError: if *repo* contains no ``/``.
    """
    if "/" in repo:
        return "--".join(["models", *repo.split("/")])
    raise ValueError(f"repo must be in 'org/name' form: {repo!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def _cache_path(repo: str) -> str:
    """Build the remote path of *repo*'s Hugging Face cache directory.

    The returned string begins with a literal ``$HOME`` (it is not expanded
    here), so it only resolves to the SSH user's home directory if the remote
    shell is allowed to expand that variable.

    Raises:
        ValueError: propagated from ``repo_to_cache_dirname`` when *repo* has
            no ``/``.
    """
    hub_root = "$HOME/.cache/huggingface/hub"
    return "/".join((hub_root, repo_to_cache_dirname(repo)))
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class HostDiskResult:
    """Outcome of one disk probe or delete against a single host."""

    # Host the operation targeted ("?" is used by callers when it was empty).
    host: str
    # Whether the model's cache directory is reported present on that host.
    on_disk: bool
    # Byte count attached to the result: measured size for a probe, bytes
    # freed for a delete. Stays 0 when nothing was measured.
    size_bytes: int = 0
    # Error text when the operation failed; None on success.
    error: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DiskStatus:
    """Aggregated disk result for one model across the hosts that were checked."""

    # Hugging Face repo id the result refers to.
    repo: str
    # True if the cache directory is present on AT LEAST one host.
    on_disk: bool
    # Sum of the per-host byte counts.
    total_bytes: int
    # One entry per host that was contacted.
    per_host: list[HostDiskResult]
|
||||||
|
|
||||||
|
|
||||||
|
async def probe_host(host: str, user: str, repo: str, settings: Settings) -> HostDiskResult:
    """Report whether *repo*'s cache directory exists on *host*, and its size.

    Runs a single shell command through ``ssh_run``: if the directory exists it
    prints the first column of ``du -sb`` (a byte count), otherwise it prints
    ``MISSING``.

    Returns:
        A ``HostDiskResult`` with
        * ``on_disk=True`` and ``size_bytes`` set when a byte count was parsed;
        * ``on_disk=False`` when the directory is missing or nothing was printed;
        * ``on_disk=False`` and ``error`` set when *host*/*user* is empty, the
          command exits non-zero, or the output is not an integer.
    """
    if not host or not user:
        return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")

    # _cache_path() yields a literal "$HOME/..." string. Running that whole
    # string through shlex.quote() wraps it in single quotes (because of the
    # "$"), which stops the remote shell from expanding $HOME, so the directory
    # test could never succeed. Quote only the part after $HOME and leave the
    # variable itself inside double quotes so it is still expanded.
    raw_path = _cache_path(repo)
    home_var = "$HOME"
    if raw_path.startswith(home_var + "/"):
        remote_path = f'"{home_var}"' + shlex.quote(raw_path[len(home_var):])
    else:
        remote_path = shlex.quote(raw_path)

    # `du -sb` prints bytes; existence is tested explicitly first so a missing
    # directory is reported as on_disk=False rather than as a `du` failure.
    cmd = (
        f"if [ -d {remote_path} ]; then "
        f"du -sb {remote_path} 2>/dev/null | awk '{{print $1}}'; "
        "else echo MISSING; fi"
    )
    rc, out, err = await ssh_run(host, user, cmd, settings, timeout=20.0)
    if rc != 0:
        return HostDiskResult(host=host, on_disk=False, error=(err or out).strip() or f"rc={rc}")

    raw = out.strip()
    if raw == "MISSING" or raw == "":
        return HostDiskResult(host=host, on_disk=False)
    try:
        # Only the last line is parsed, so any earlier output lines are ignored.
        size = int(raw.splitlines()[-1])
    except ValueError:
        return HostDiskResult(host=host, on_disk=False, error=f"unparsable du output: {raw!r}")
    return HostDiskResult(host=host, on_disk=True, size_bytes=size)
|
||||||
|
|
||||||
|
|
||||||
|
async def probe_disk(repo: str, mode: str, settings: Settings) -> DiskStatus:
    """Probe *repo* on every Spark that *mode* involves and merge the results.

    Spark 1 is always probed. Spark 2 is added only when *mode* is
    ``"cluster"`` and ``settings.spark2_host`` is set. The probes run
    concurrently.
    """
    targets = [(settings.spark1_host, settings.spark1_user)]
    wants_second_host = mode == "cluster" and settings.spark2_host
    if wants_second_host:
        targets.append((settings.spark2_host, settings.spark2_user))

    probes = [probe_host(target_host, target_user, repo, settings) for target_host, target_user in targets]
    per_host = list(await asyncio.gather(*probes))

    return DiskStatus(
        repo=repo,
        on_disk=any(res.on_disk for res in per_host),
        total_bytes=sum(res.size_bytes for res in per_host),
        per_host=per_host,
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def delete_host(host: str, user: str, repo: str, settings: Settings) -> HostDiskResult:
    """Measure and then ``rm -rf`` *repo*'s cache directory on one host.

    The remote command prints ``FREED <bytes>``; a missing directory prints
    ``FREED 0``, so repeating the call is harmless.

    Returns:
        A ``HostDiskResult`` with ``on_disk=False`` in every case.
        ``size_bytes`` holds the bytes reported freed (0 if the directory was
        absent or no size could be parsed). ``error`` is set when *host*/*user*
        is empty or the remote command exits non-zero.
    """
    if not host or not user:
        return HostDiskResult(host=host or "?", on_disk=False, error="host not configured")

    # _cache_path() yields a literal "$HOME/..." string. Running that whole
    # string through shlex.quote() wraps it in single quotes (because of the
    # "$"), so the remote shell would never expand $HOME: the directory test
    # would always fail and nothing would ever be deleted. Quote only the part
    # after $HOME and keep the variable itself in double quotes.
    raw_path = _cache_path(repo)
    home_var = "$HOME"
    if raw_path.startswith(home_var + "/"):
        remote_path = f'"{home_var}"' + shlex.quote(raw_path[len(home_var):])
    else:
        remote_path = shlex.quote(raw_path)

    # Size is computed first, then the directory is removed. `set -e` makes a
    # failing `rm` surface as a non-zero exit status. `${SIZE:-0}` keeps the
    # FREED line well-formed even if `du` printed nothing.
    cmd = (
        "set -e; "
        f"P={remote_path}; "
        'if [ -d "$P" ]; then '
        "  SIZE=$(du -sb \"$P\" 2>/dev/null | awk '{print $1}'); "
        '  rm -rf -- "$P"; '
        '  echo FREED "${SIZE:-0}"; '
        "else "
        "  echo FREED 0; "
        "fi"
    )
    rc, out, err = await ssh_run(host, user, cmd, settings, timeout=120.0)
    if rc != 0:
        return HostDiskResult(host=host, on_disk=False, error=(err or out).strip() or f"rc={rc}")

    # Parse the first "FREED N" line; an unparsable count is treated as 0.
    freed = 0
    for line in out.splitlines():
        parts = line.strip().split()
        if len(parts) == 2 and parts[0] == "FREED":
            try:
                freed = int(parts[1])
            except ValueError:
                freed = 0
            break
    return HostDiskResult(host=host, on_disk=False, size_bytes=freed)
|
||||||
|
|
||||||
|
|
||||||
|
async def delete_from_disk(repo: str, mode: str, settings: Settings) -> DiskStatus:
    """Delete *repo*'s cache directory on every Spark that *mode* involves.

    Spark 1 is always targeted. Spark 2 is added only when *mode* is
    ``"cluster"`` and ``settings.spark2_host`` is set. The deletions run
    concurrently. The returned status always has ``on_disk=False``, and
    ``total_bytes`` is the sum of bytes the hosts reported freed.
    """
    targets = [(settings.spark1_host, settings.spark1_user)]
    wants_second_host = mode == "cluster" and settings.spark2_host
    if wants_second_host:
        targets.append((settings.spark2_host, settings.spark2_user))

    deletions = [delete_host(target_host, target_user, repo, settings) for target_host, target_user in targets]
    per_host = list(await asyncio.gather(*deletions))

    # on_disk is reported as False unconditionally, regardless of per-host errors.
    return DiskStatus(
        repo=repo,
        on_disk=False,
        total_bytes=sum(res.size_bytes for res in per_host),
        per_host=per_host,
    )
|
||||||
@@ -13,6 +13,7 @@ from .config import Settings
|
|||||||
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
||||||
from .custom_services import add_custom_service, delete_custom_service
|
from .custom_services import add_custom_service, delete_custom_service
|
||||||
from .deep_health import DeepHealth
|
from .deep_health import DeepHealth
|
||||||
|
from .disk import delete_from_disk, probe_disk
|
||||||
from .download import DownloadManager
|
from .download import DownloadManager
|
||||||
from .hardware import HardwareProbe
|
from .hardware import HardwareProbe
|
||||||
from .health import check_magpie, check_parakeet, check_vllm
|
from .health import check_magpie, check_parakeet, check_vllm
|
||||||
@@ -139,6 +140,89 @@ async def del_model(key: str) -> dict:
|
|||||||
return {"ok": True, "key": key}
|
return {"ok": True, "key": key}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/models/disk-status")
async def get_models_disk_status() -> dict:
    """Probe every catalog model's HF cache on its Spark(s) concurrently.

    Returns ``{"configured": False, "models": {}}`` when settings are not
    configured. Otherwise ``models`` maps each catalog key to
    ``{on_disk, total_bytes, per_host: [{host, on_disk, size_bytes, error?}]}``.
    A probe that raised is reported as not on disk with an ``error`` string
    instead of failing the whole request.
    """
    if not settings.configured:
        return {"configured": False, "models": {}}

    def _host_entry(res) -> dict:
        # "error" is included only when the host actually reported one.
        entry = {"host": res.host, "on_disk": res.on_disk, "size_bytes": res.size_bytes}
        if res.error:
            entry["error"] = res.error
        return entry

    model_keys = list(catalog.models.keys())
    probes = [
        probe_disk(catalog.models[model_key].repo, catalog.models[model_key].mode, settings)
        for model_key in model_keys
    ]
    outcomes = await asyncio.gather(*probes, return_exceptions=True)

    report: dict[str, dict] = {}
    for model_key, outcome in zip(model_keys, outcomes):
        if isinstance(outcome, Exception):
            report[model_key] = {"on_disk": False, "total_bytes": 0, "per_host": [], "error": str(outcome)}
        else:
            report[model_key] = {
                "on_disk": outcome.on_disk,
                "total_bytes": outcome.total_bytes,
                "per_host": [_host_entry(res) for res in outcome.per_host],
            }
    return {"configured": True, "models": report}
|
||||||
|
|
||||||
|
|
||||||
|
@app.delete("/api/models/{key}/disk")
async def del_model_disk(key: str) -> dict:
    """Delete a model's weights from the Spark filesystem(s). The catalog entry stays.

    Safety rails:
    - 404 if *key* is not in the catalog.
    - 409 if vLLM reports this model's repo as the currently loaded model.
    - 409 if a swap is in flight.
    - 409 if the in-flight download is for this same repo.
    - Idempotent: a host whose cache dir is already gone reports 0 bytes freed.

    The audit record is marked ``ok`` only when no host reported an error.
    The response body keeps its existing shape; per-host errors appear under
    ``per_host[].error``.
    """
    if key not in catalog.models:
        raise HTTPException(404, f"unknown model: {key}")
    m = catalog.models[key]

    # Refuse if currently loaded. NOTE(review): if the vLLM check itself
    # raises, the loaded-model guard is skipped and deletion proceeds — confirm
    # that failing open is intended for a destructive operation.
    try:
        vllm = await check_vllm(settings)
    except Exception:
        vllm = {}
    if vllm.get("ok") and vllm.get("current_model") == m.repo:
        raise HTTPException(
            409,
            f"'{m.display_name}' is the currently loaded model. Switch to a different model first, then try again."
        )

    # Refuse if a swap is in flight
    if swap_manager.current_job_id:
        raise HTTPException(409, "a model swap is in progress; wait for it to finish")

    # Refuse if a download is in flight for this same repo (a different model's download is fine)
    if download_manager.current_job_id:
        job = download_manager.get(download_manager.current_job_id)
        if job and job.repo == m.repo:
            raise HTTPException(409, "this model is currently downloading; cancel or wait for it to finish")

    status = await delete_from_disk(m.repo, m.mode, settings)

    # Audit log: previously always recorded ok=True, even when a host failed.
    failed = [r for r in status.per_host if r.error]
    detail = f"freed {status.total_bytes} bytes across {len(status.per_host)} host(s)"
    if failed:
        detail += "; errors: " + ", ".join(f"{r.host}: {r.error}" for r in failed)
    record_report(
        f"disk:{key}",
        ok=not failed,
        source="disk-delete",
        detail=detail,
    )
    return {
        "ok": True,
        "key": key,
        "repo": m.repo,
        "bytes_freed": status.total_bytes,
        "per_host": [
            {"host": r.host, "size_bytes": r.size_bytes, **({"error": r.error} if r.error else {})}
            for r in status.per_host
        ],
    }
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/hardware")
|
@app.get("/api/hardware")
|
||||||
async def get_hardware() -> dict:
|
async def get_hardware() -> dict:
|
||||||
"""Per-Spark hardware snapshot — RAM, disk, GPU mem + util, CPU load, uptime."""
|
"""Per-Spark hardware snapshot — RAM, disk, GPU mem + util, CPU load, uptime."""
|
||||||
|
|||||||
@@ -18,6 +18,8 @@ const state = {
|
|||||||
configured: true,
|
configured: true,
|
||||||
timer_handle: null,
|
timer_handle: null,
|
||||||
deep_health: {},
|
deep_health: {},
|
||||||
|
disk_status: {}, // keyed by model key: { on_disk, total_bytes, per_host }
|
||||||
|
disk_status_loaded: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
const el = (sel) => document.querySelector(sel);
|
const el = (sel) => document.querySelector(sel);
|
||||||
@@ -57,12 +59,36 @@ function renderCards() {
|
|||||||
? `<div class="desc">${escapeHtml(m.description)}</div>`
|
? `<div class="desc">${escapeHtml(m.description)}</div>`
|
||||||
: '';
|
: '';
|
||||||
const customPill = m.custom ? `<span class="tag custom-pill">custom</span>` : '';
|
const customPill = m.custom ? `<span class="tag custom-pill">custom</span>` : '';
|
||||||
|
// Disk-presence pill + trash button. Until /api/models/disk-status comes back,
|
||||||
|
// we don't know — render a neutral placeholder.
|
||||||
|
const disk = state.disk_status[key];
|
||||||
|
let diskPill = '';
|
||||||
|
if (state.disk_status_loaded) {
|
||||||
|
if (disk && disk.on_disk) {
|
||||||
|
const gb = (disk.total_bytes / 1e9);
|
||||||
|
diskPill = `<span class="tag on-disk" title="Weights present on disk">on disk · ${gb.toFixed(1)} GB</span>`;
|
||||||
|
} else {
|
||||||
|
diskPill = `<span class="tag not-on-disk" title="Weights not downloaded">not downloaded</span>`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Trash button — hidden if not on disk; disabled (with tooltip) if currently loaded.
|
||||||
|
let trashBtn = '';
|
||||||
|
if (state.disk_status_loaded && disk && disk.on_disk) {
|
||||||
|
const disabled = isActive || isSwapping;
|
||||||
|
const tip = isActive
|
||||||
|
? 'Currently loaded — switch to another model first'
|
||||||
|
: isSwapping
|
||||||
|
? 'A swap is in progress'
|
||||||
|
: 'Delete weights from disk';
|
||||||
|
trashBtn = `<button class="icon-btn danger" data-disk-del-key="${key}" title="${escapeHtml(tip)}" aria-label="Delete from disk" ${disabled ? 'disabled' : ''}>${trashIcon}</button>`;
|
||||||
|
}
|
||||||
card.innerHTML = `
|
card.innerHTML = `
|
||||||
<div class="name">${escapeHtml(m.display_name)}</div>
|
<div class="name">${escapeHtml(m.display_name)}</div>
|
||||||
<div class="meta">
|
<div class="meta">
|
||||||
<span class="tag mode-${m.mode}">${m.mode}</span>
|
<span class="tag mode-${m.mode}">${m.mode}</span>
|
||||||
<span class="tag">${m.size_gb} GB</span>
|
<span class="tag">${m.size_gb} GB</span>
|
||||||
${customPill}
|
${customPill}
|
||||||
|
${diskPill}
|
||||||
${(m.capabilities || []).map(c => `<span class="tag cap">${escapeHtml(c)}</span>`).join('')}
|
${(m.capabilities || []).map(c => `<span class="tag cap">${escapeHtml(c)}</span>`).join('')}
|
||||||
</div>
|
</div>
|
||||||
${desc}
|
${desc}
|
||||||
@@ -76,6 +102,7 @@ function renderCards() {
|
|||||||
</button>
|
</button>
|
||||||
<button class="btn test-btn" data-test-key="${key}" title="Pre-flight check the launch command without starting the engine">Test</button>
|
<button class="btn test-btn" data-test-key="${key}" title="Pre-flight check the launch command without starting the engine">Test</button>
|
||||||
<button class="btn adv-btn" data-adv-key="${key}" title="Advanced settings">Advanced</button>
|
<button class="btn adv-btn" data-adv-key="${key}" title="Advanced settings">Advanced</button>
|
||||||
|
${trashBtn}
|
||||||
</div>
|
</div>
|
||||||
<div class="test-result hidden" data-test-result-for="${key}"></div>
|
<div class="test-result hidden" data-test-result-for="${key}"></div>
|
||||||
`;
|
`;
|
||||||
@@ -90,8 +117,13 @@ function renderCards() {
|
|||||||
for (const btn of root.querySelectorAll('[data-test-key]')) {
|
for (const btn of root.querySelectorAll('[data-test-key]')) {
|
||||||
btn.addEventListener('click', () => testLaunch(btn.dataset.testKey, btn));
|
btn.addEventListener('click', () => testLaunch(btn.dataset.testKey, btn));
|
||||||
}
|
}
|
||||||
|
for (const btn of root.querySelectorAll('[data-disk-del-key]')) {
|
||||||
|
btn.addEventListener('click', () => openDiskDeleteDialog(btn.dataset.diskDelKey));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const trashIcon = '<svg viewBox="0 0 24 24" width="14" height="14" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"><polyline points="3 6 5 6 21 6"></polyline><path d="M19 6l-1 14a2 2 0 0 1-2 2H8a2 2 0 0 1-2-2L5 6"></path><path d="M10 11v6"></path><path d="M14 11v6"></path><path d="M9 6V4a2 2 0 0 1 2-2h2a2 2 0 0 1 2 2v2"></path></svg>';
|
||||||
|
|
||||||
async function testLaunch(key, btn) {
|
async function testLaunch(key, btn) {
|
||||||
const resultEl = document.querySelector(`[data-test-result-for="${key}"]`);
|
const resultEl = document.querySelector(`[data-test-result-for="${key}"]`);
|
||||||
if (!resultEl) return;
|
if (!resultEl) return;
|
||||||
@@ -739,6 +771,78 @@ async function loadModels() {
|
|||||||
state.models = data.models || {};
|
state.models = data.models || {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Fetch per-model disk status and re-render the cards.
 *
 * Best-effort: any failure is logged with console.warn and otherwise ignored,
 * so the disk pills simply do not update and the dashboard is not blocked.
 */
async function loadDiskStatus() {
  try {
    const resp = await fetchJSON('/api/models/disk-status');
    // Ignore responses that carry no models map.
    if (!resp || !resp.models) return;
    state.disk_status = resp.models;
    state.disk_status_loaded = true;
    renderCards();
  } catch (err) {
    console.warn('disk-status probe failed:', err.message);
  }
}
|
||||||
|
|
||||||
|
/**
 * Format a byte count as a short decimal-unit string (GB / MB / KB / B).
 *
 * Non-finite, non-number, zero and negative inputs all yield '0 B'. Scaled
 * values are shown with one decimal place; values below 1000 are printed
 * as-is with a ' B' suffix.
 */
function fmtBytesShort(n) {
  if (!Number.isFinite(n) || n <= 0) return '0 B';
  // Largest unit first so the first match wins.
  const units = [
    [1e9, 'GB'],
    [1e6, 'MB'],
    [1e3, 'KB'],
  ];
  for (const [scale, suffix] of units) {
    if (n >= scale) return `${(n / scale).toFixed(1)} ${suffix}`;
  }
  return `${n} B`;
}
|
||||||
|
|
||||||
|
/**
 * Open the confirmation dialog for deleting a model's weights from disk.
 *
 * Does nothing unless the model is known and its cached disk status says it is
 * on disk. Fills in the summary and the per-host list (hosts that hold the
 * weights only), wires the Cancel / Delete buttons, and shows the modal.
 */
function openDiskDeleteDialog(key) {
  const model = state.models[key];
  const diskInfo = state.disk_status[key];
  if (!model || !diskInfo || !diskInfo.on_disk) return;

  const dialog = el('#disk-delete-dialog');
  el('#dd-summary').innerHTML = `Free <strong>${fmtBytesShort(diskInfo.total_bytes)}</strong> by removing <strong>${escapeHtml(model.display_name)}</strong> (<code>${escapeHtml(model.repo)}</code>) from disk.`;

  // Rebuild the host list from scratch each time the dialog opens.
  const hostList = el('#dd-hosts');
  hostList.innerHTML = '';
  (diskInfo.per_host || [])
    .filter((entry) => entry.on_disk)
    .forEach((entry) => {
      const item = document.createElement('li');
      item.innerHTML = `<code>${escapeHtml(entry.host)}</code> — ${fmtBytesShort(entry.size_bytes)}`;
      hostList.appendChild(item);
    });

  // Clear any error left over from a previous attempt.
  const errorBox = el('#dd-error');
  errorBox.classList.add('hidden');
  errorBox.textContent = '';

  const confirmBtn = el('#dd-confirm');
  const cancelBtn = el('#dd-cancel');

  cancelBtn.onclick = () => dialog.close();

  confirmBtn.onclick = async () => {
    // Lock both buttons while the request is in flight.
    confirmBtn.disabled = true;
    cancelBtn.disabled = true;
    confirmBtn.textContent = 'Deleting…';
    try {
      const resp = await fetchJSON(`/api/models/${encodeURIComponent(key)}/disk`, { method: 'DELETE' });
      dialog.close();
      // Optimistically drop the local entry, then re-probe for accurate state.
      delete state.disk_status[key];
      renderCards();
      loadDiskStatus();
      const freed = resp && typeof resp.bytes_freed === 'number' ? fmtBytesShort(resp.bytes_freed) : '';
      console.log(`Deleted ${model.display_name} from disk${freed ? ` — freed ${freed}` : ''}.`);
    } catch (err) {
      // Keep the dialog open and show the failure inline.
      errorBox.textContent = err.message || 'Delete failed';
      errorBox.classList.remove('hidden');
    } finally {
      confirmBtn.disabled = false;
      cancelBtn.disabled = false;
      confirmBtn.textContent = 'Delete from disk';
    }
  };

  dialog.showModal();
}
|
||||||
|
|
||||||
async function triggerSwap(modelKey) {
|
async function triggerSwap(modelKey) {
|
||||||
if (state.swap_job_id) return;
|
if (state.swap_job_id) return;
|
||||||
try {
|
try {
|
||||||
@@ -1523,9 +1627,12 @@ async function init() {
|
|||||||
await renderServices();
|
await renderServices();
|
||||||
pollHardware();
|
pollHardware();
|
||||||
pollUpdates();
|
pollUpdates();
|
||||||
|
// Disk-status probe runs after first paint — slow over SSH and not blocking.
|
||||||
|
loadDiskStatus();
|
||||||
setInterval(pollStatus, 5000);
|
setInterval(pollStatus, 5000);
|
||||||
setInterval(pollHardware, 8000); // every 8s
|
setInterval(pollHardware, 8000); // every 8s
|
||||||
setInterval(pollUpdates, 300000); // every 5 min
|
setInterval(pollUpdates, 300000); // every 5 min
|
||||||
|
setInterval(loadDiskStatus, 60000); // every 60s — disk state changes rarely
|
||||||
}
|
}
|
||||||
|
|
||||||
init();
|
init();
|
||||||
|
|||||||
@@ -188,6 +188,20 @@
|
|||||||
</form>
|
</form>
|
||||||
</dialog>
|
</dialog>
|
||||||
|
|
||||||
|
<dialog id="disk-delete-dialog" class="modal">
|
||||||
|
<form method="dialog" class="modal-form">
|
||||||
|
<h3>Delete model weights from disk?</h3>
|
||||||
|
<p id="dd-summary" class="muted small"></p>
|
||||||
|
<ul class="muted small dd-hosts" id="dd-hosts"></ul>
|
||||||
|
<p class="muted small">This is reversible — you can re-download from the catalog at any time. The catalog entry stays intact.</p>
|
||||||
|
<p id="dd-error" class="muted small dd-error hidden"></p>
|
||||||
|
<div class="modal-actions">
|
||||||
|
<button type="button" id="dd-cancel" class="btn">Cancel</button>
|
||||||
|
<button type="button" id="dd-confirm" class="btn danger">Delete from disk</button>
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
</dialog>
|
||||||
|
|
||||||
<dialog id="advanced-dialog" class="modal">
|
<dialog id="advanced-dialog" class="modal">
|
||||||
<form method="dialog" class="modal-form" id="advanced-form">
|
<form method="dialog" class="modal-form" id="advanced-form">
|
||||||
<h3 id="adv-title">Advanced settings</h3>
|
<h3 id="adv-title">Advanced settings</h3>
|
||||||
|
|||||||
@@ -717,6 +717,14 @@ main {
|
|||||||
.card .adv-btn,
|
.card .adv-btn,
|
||||||
.card .test-btn { padding: 8px 12px; font-size: 12px; }
|
.card .test-btn { padding: 8px 12px; font-size: 12px; }
|
||||||
.card .custom-pill { color: var(--info); border-color: rgba(96, 165, 250, 0.4); }
|
.card .custom-pill { color: var(--info); border-color: rgba(96, 165, 250, 0.4); }
|
||||||
|
.tag.on-disk { color: var(--accent); border-color: rgba(74, 222, 128, 0.4); }
|
||||||
|
.tag.not-on-disk { color: var(--muted); border-color: var(--border); opacity: 0.7; }
|
||||||
|
.card-actions .icon-btn.danger { color: var(--error); border-color: rgba(239, 68, 68, 0.3); margin-left: auto; }
|
||||||
|
.card-actions .icon-btn.danger:hover:not(:disabled) { background: rgba(239, 68, 68, 0.08); border-color: var(--error); color: var(--error); }
|
||||||
|
.card-actions .icon-btn.danger:disabled { opacity: 0.35; cursor: not-allowed; }
|
||||||
|
.dd-hosts { padding-left: 18px; margin: 4px 0 8px; }
|
||||||
|
.dd-hosts code { background: var(--surface-2); padding: 1px 5px; border-radius: 4px; }
|
||||||
|
.dd-error { color: var(--error); }
|
||||||
|
|
||||||
.test-result {
|
.test-result {
|
||||||
font-size: 12px;
|
font-size: 12px;
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ models:
|
|||||||
- -tp=2
|
- -tp=2
|
||||||
- --distributed-executor-backend=ray
|
- --distributed-executor-backend=ray
|
||||||
- --max-model-len=32768
|
- --max-model-len=32768
|
||||||
|
- --max-num-batched-tokens=16384
|
||||||
|
|
||||||
gemma4:
|
gemma4:
|
||||||
display_name: "Gemma 4 31B"
|
display_name: "Gemma 4 31B"
|
||||||
@@ -45,6 +46,7 @@ models:
|
|||||||
vllm_args:
|
vllm_args:
|
||||||
- --gpu-memory-utilization=0.8
|
- --gpu-memory-utilization=0.8
|
||||||
- --max-model-len=32768
|
- --max-model-len=32768
|
||||||
|
- --max-num-batched-tokens=16384
|
||||||
- --reasoning-parser=gemma4
|
- --reasoning-parser=gemma4
|
||||||
- --tool-call-parser=gemma4
|
- --tool-call-parser=gemma4
|
||||||
- --enable-auto-tool-choice
|
- --enable-auto-tool-choice
|
||||||
|
|||||||
@@ -24,6 +24,10 @@ This flag is Blackwell-specific. If vLLM in the container reports `unrecognized
|
|||||||
|
|
||||||
Qwen3.6 uses a Mamba-attention hybrid that requires `--max-num-batched-tokens >= 2096`. vLLM's default is 2048, which trips `AssertionError: In Mamba cache align mode, block_size (2096) must be <= max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into the bundled qwen36 entry — matches the upstream qwen3.5-35b-a3b-fp8 recipe.
|
Qwen3.6 uses a Mamba-attention hybrid that requires `--max-num-batched-tokens >= 2096`. vLLM's default is 2048, which trips `AssertionError: In Mamba cache align mode, block_size (2096) must be <= max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into the bundled qwen36 entry — matches the upstream qwen3.5-35b-a3b-fp8 recipe.
|
||||||
|
|
||||||
|
## Multimodal token budget for vision models (fixed in v0.8.0:1)
|
||||||
|
|
||||||
|
After the eugr/spark-vllm-docker update, vLLM became stricter about multimodal token budgets. Vision-capable models like Gemma 4 31B and Qwen3-VL crash at engine init with `ValueError: Chunked MM input disabled but max_tokens_per_mm_item (2496) is larger than max_num_batched_tokens (2048)`. Fix: bake `--max-num-batched-tokens=16384` into every model that has the `vision` capability. Now applied to qwen3-vl, gemma4, and qwen36 (which was already set for the Mamba issue).
|
||||||
|
|
||||||
## Two SSH paths to Spark 1 from the laptop
|
## Two SSH paths to Spark 1 from the laptop
|
||||||
|
|
||||||
`ssh <spark-user>@<spark-1-ip>` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `<spark-1-host>.local`. Always use the `.local` hostname or `<spark-2-ip>`-style entries that ARE matched.
|
`ssh <spark-user>@<spark-1-ip>` does NOT work from the laptop because the NVIDIA Sync ssh_config only has a Host entry for `<spark-1-host>.local`. Always use the `.local` hostname or `<spark-2-ip>`-style entries that ARE matched.
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
export const v0_1_0 = VersionInfo.of({
|
export const v0_1_0 = VersionInfo.of({
|
||||||
version: '0.8.0:2',
|
version: '0.8.1:0',
|
||||||
releaseNotes: {
|
releaseNotes: {
|
||||||
en_US:
|
en_US:
|
||||||
'v0.8: deep health probes. Every 5 minutes, Spark Control sends a tiny synthetic inference request to each service (1 second of silent audio to Parakeet, short text to Magpie, 1-token completion to vLLM). All payloads are generated in-memory and never written to disk. If a probe returns CUDA-error / 5xx signals while the container is still "up" — i.e. the classic Triton-wedge pattern where /health stays green but real inference fails — Spark Control automatically restarts the affected container. Rate-limited to 3 auto-restarts per service per 30 minutes. Each service card now shows the last deep-check timestamp, latency, and an inline "Run now" button. Failures and recoveries are logged into the connectivity history with source=deep-health.',
|
'v0.8.1: model weights can now be deleted from disk directly from the dashboard. Each model card shows whether the weights are present (with on-disk GB size) or not yet downloaded. When present and the model is NOT currently loaded, a small trash icon appears on the card; clicking it pops a confirmation showing how many GB will be freed and on which Spark(s), then runs `rm -rf` on the Hugging Face cache directory via SSH. Cluster-mode models are deleted from both Sparks; solo-mode from Spark 1 only. Safety rails: refuses to delete the currently-loaded model, refuses during an in-flight swap or download, and the catalog entry stays — you can always re-download. Disk status is probed once on dashboard load and re-checked every 60s.',
|
||||||
},
|
},
|
||||||
migrations: {
|
migrations: {
|
||||||
up: async ({ effects }) => {},
|
up: async ({ effects }) => {},
|
||||||
|
|||||||
Reference in New Issue
Block a user