v0.8.1:0 - delete model weights from disk via card trash icon

Each model card now shows whether its weights are present on disk
(with GB size) or not yet downloaded. When present and the model
isn't currently loaded, a trash icon appears; clicking it pops a
confirmation showing exactly how many GB will be freed and on
which Spark(s), then runs rm -rf on the HF cache directory via SSH.

Cluster-mode models are removed from both Sparks; solo-mode from
Spark 1 only. Safety rails: refuses to delete the currently-loaded
model, refuses during an in-flight swap or download, and the
catalog entry stays intact so it can be re-downloaded anytime.

Backend:
  - new image/app/disk.py: probe_disk + delete_from_disk over SSH
  - GET  /api/models/disk-status — parallel probe across all catalog models
  - DELETE /api/models/{key}/disk — guarded rm -rf, logs to connectivity events

Frontend:
  - on-disk / not-downloaded pills on every card
  - trash icon-btn in card-actions row (hidden when not on disk)
  - confirmation dialog showing per-host bytes-to-free
  - disk-status re-checked every 60s

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Keysat
2026-05-13 17:07:20 -05:00
parent 1602b3b3b4
commit 9ff7ee9c1e
6 changed files with 345 additions and 2 deletions
+84
View File
@@ -13,6 +13,7 @@ from .config import Settings
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
from .custom_services import add_custom_service, delete_custom_service
from .deep_health import DeepHealth
from .disk import delete_from_disk, probe_disk
from .download import DownloadManager
from .hardware import HardwareProbe
from .health import check_magpie, check_parakeet, check_vllm
@@ -139,6 +140,89 @@ async def del_model(key: str) -> dict:
return {"ok": True, "key": key}
@app.get("/api/models/disk-status")
async def get_models_disk_status() -> dict:
"""Probe each catalog model's HF cache on the appropriate Spark(s) in parallel.
Result is keyed by model key: {on_disk, total_bytes, per_host:[{host,on_disk,size_bytes,error?}]}.
Designed to be called once on dashboard load; takes ~13s depending on Spark count.
"""
if not settings.configured:
return {"configured": False, "models": {}}
keys = list(catalog.models.keys())
statuses = await asyncio.gather(*(
probe_disk(catalog.models[k].repo, catalog.models[k].mode, settings) for k in keys
), return_exceptions=True)
out: dict[str, dict] = {}
for k, s in zip(keys, statuses):
if isinstance(s, Exception):
out[k] = {"on_disk": False, "total_bytes": 0, "per_host": [], "error": str(s)}
continue
out[k] = {
"on_disk": s.on_disk,
"total_bytes": s.total_bytes,
"per_host": [
{"host": r.host, "on_disk": r.on_disk, "size_bytes": r.size_bytes, **({"error": r.error} if r.error else {})}
for r in s.per_host
],
}
return {"configured": True, "models": out}
@app.delete("/api/models/{key}/disk")
async def del_model_disk(key: str) -> dict:
"""Delete a model's weights from the Spark filesystem(s). The catalog entry stays.
Safety rails:
- Refuses if the model is currently loaded on vLLM.
- Refuses if a swap or download is in flight.
- Idempotent: if the cache dir is already gone on a host, that host reports 0 bytes freed.
"""
if key not in catalog.models:
raise HTTPException(404, f"unknown model: {key}")
m = catalog.models[key]
# Refuse if currently loaded
try:
vllm = await check_vllm(settings)
except Exception:
vllm = {}
if vllm.get("ok") and vllm.get("current_model") == m.repo:
raise HTTPException(
409,
f"'{m.display_name}' is the currently loaded model. Switch to a different model first, then try again."
)
# Refuse if a swap is in flight
if swap_manager.current_job_id:
raise HTTPException(409, "a model swap is in progress; wait for it to finish")
# Refuse if a download is in flight for this same repo (a different model's download is fine)
if download_manager.current_job_id:
job = download_manager.get(download_manager.current_job_id)
if job and job.repo == m.repo:
raise HTTPException(409, "this model is currently downloading; cancel or wait for it to finish")
status = await delete_from_disk(m.repo, m.mode, settings)
# Audit log
record_report(
f"disk:{key}",
ok=True,
source="disk-delete",
detail=f"freed {status.total_bytes} bytes across {len(status.per_host)} host(s)",
)
return {
"ok": True,
"key": key,
"repo": m.repo,
"bytes_freed": status.total_bytes,
"per_host": [
{"host": r.host, "size_bytes": r.size_bytes, **({"error": r.error} if r.error else {})}
for r in status.per_host
],
}
@app.get("/api/hardware")
async def get_hardware() -> dict:
"""Per-Spark hardware snapshot — RAM, disk, GPU mem + util, CPU load, uptime."""