v0.25.0:0 - cluster coordination layer (swap lock + webhook + schedule registry)
GPU-arbiter safety layer for when automation, not just the dashboard, swaps models:
- swap reservation lock (POST/GET/DELETE /api/swap/lock); 423-enforced in post_swap via a single-read gate, TTL-bounded, secret-token auth, human force-release override + dashboard banner
- swap webhook (swap_complete/swap_failed) fired outside the swap lock, optional HMAC signature, configurable URL + secret
- read-only schedule registry (GET/POST/DELETE /api/schedule) + dashboard panel

New module image/app/coordination.py; docs/COORDINATION.md for consumers; 22 offline tests in test_coordination.py.
This commit is contained in:
@@ -103,6 +103,8 @@ class Settings:
|
||||
bind_port: int
|
||||
open_webui_url: str
|
||||
ngc_api_key: str
|
||||
swap_webhook_url: str
|
||||
swap_webhook_secret: str
|
||||
|
||||
@classmethod
|
||||
def from_env(cls) -> "Settings":
|
||||
@@ -165,6 +167,11 @@ class Settings:
|
||||
bind_port=_env_int("BIND_PORT", 9999),
|
||||
open_webui_url=_env("OPEN_WEBUI_URL", ""),
|
||||
ngc_api_key=_env("NGC_API_KEY", ""),
|
||||
# Coordination layer: fire a swap-lifecycle webhook to this URL so
|
||||
# downstream consumers re-point their model config on a swap. Blank
|
||||
# ⇒ disabled. The optional secret HMAC-signs the body (X-Spark-Signature).
|
||||
swap_webhook_url=_env("SWAP_WEBHOOK_URL", ""),
|
||||
swap_webhook_secret=_env("SWAP_WEBHOOK_SECRET", ""),
|
||||
)
|
||||
|
||||
@property
|
||||
|
||||
@@ -0,0 +1,342 @@
|
||||
"""Cluster-coordination layer: the GPU swap lock, swap-event webhook, and the
|
||||
read-only schedule registry.
|
||||
|
||||
Spark Control is the **control plane / GPU arbiter, not a job runner.** Recurring
|
||||
business pipelines live in separate services that *call* the swap API. These
|
||||
three primitives add the *safety* layer around that:
|
||||
|
||||
- **Swap lock** — a TTL-bounded reservation of the swap path. An external
|
||||
scheduler acquires it before swapping; while held by someone else the
|
||||
dashboard's manual swap is refused (enforced in the swap endpoint, not
|
||||
advisory). Holder name is descriptive; the returned token is the secret that
|
||||
authorises a swap or a release.
|
||||
- **Webhook** — fires `swap_complete` / `swap_failed` to a configurable URL so
|
||||
downstream consumers re-point their provider config when the running model
|
||||
changes. Optionally HMAC-signed.
|
||||
- **Schedule registry** — a read-only view the dashboard surfaces, *registered
|
||||
by* external schedulers. Spark Control stores what it's told; it does not own
|
||||
or execute any schedule.
|
||||
|
||||
All state is in-memory (mirroring the swap/download/NIM job managers). On a
|
||||
restart the lock resets to *unlocked* — the available-by-default failure mode;
|
||||
the swap manager's own in-progress guard still prevents two swaps at once —
|
||||
and schedulers re-register their schedules.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# A lock reserves the GPU for a window; clamp the TTL so a buggy client can
|
||||
# neither pin the cluster forever nor take a zero-length (useless) lock.
|
||||
LOCK_TTL_MIN = 1
|
||||
LOCK_TTL_MAX = 86_400 # 24h
|
||||
LOCK_TTL_DEFAULT = 900 # 15 min
|
||||
|
||||
# Schedule ids are reflected to the dashboard and used as a URL path segment on
|
||||
# delete, so a caller-supplied id is whitelist-checked. Generated ids are hex.
|
||||
_SCHEDULE_ID_RE = re.compile(r"^[A-Za-z0-9_.-]{1,64}$")


def valid_schedule_id(value: str) -> bool:
    """Whitelist check for a caller-supplied schedule id (register and delete).

    Returns True only when the *entire* string is 1-64 characters drawn from
    ``[A-Za-z0-9_.-]``. An empty or ``None`` value is rejected.
    """
    # fullmatch(), not match(): with match() the pattern's trailing ``$`` also
    # succeeds just before a final newline, so an id such as "abc<newline>"
    # would slip through the whitelist.
    return _SCHEDULE_ID_RE.fullmatch(value or "") is not None
|
||||
|
||||
|
||||
def _now() -> datetime:
    """Return the current wall-clock time as a timezone-aware UTC datetime."""
    utc = timezone.utc
    return datetime.now(tz=utc)
|
||||
|
||||
|
||||
def _iso(dt: datetime) -> str:
    """Render ``dt`` as a string using its own ``isoformat()``."""
    rendered = dt.isoformat()
    return rendered
|
||||
|
||||
|
||||
# ---------------------------------------------------------------- swap lock ----
|
||||
|
||||
class LockHeld(Exception):
    """Raised when the swap lock is already held by a different holder.

    ``state`` is the public lock view supplied by the raiser, kept on the
    exception so the endpoint can return holder + expiry in the 409 body.
    """

    def __init__(self, state: dict) -> None:
        # Fixed message; the variable detail travels in ``state``.
        super().__init__("swap lock is held by another holder")
        self.state = state
|
||||
|
||||
|
||||
@dataclass
class LockState:
    """One reservation of the swap path, including its secret token."""

    holder: str  # descriptive name of the holder
    token: str  # secret; deliberately left out of public()
    acquired_at: datetime
    expires_at: datetime
    note: str = ""

    def public(self, now: datetime) -> dict:
        """Token-free view safe to expose on GET / in error bodies.

        ``seconds_remaining`` is measured against ``now`` and floored at 0.
        """
        remaining = int((self.expires_at - now).total_seconds())
        if remaining < 0:
            remaining = 0
        view = {"held": True, "holder": self.holder}
        view["acquired_at"] = _iso(self.acquired_at)
        view["expires_at"] = _iso(self.expires_at)
        view["seconds_remaining"] = remaining
        view["note"] = self.note
        return view
|
||||
|
||||
|
||||
class SwapLockManager:
    """In-memory, TTL-bounded reservation of the GPU swap path.

    At most one lock is tracked at a time; an expired lock is cleared lazily
    the next time any method looks at it.

    `now` is injectable on every method purely so the expiry logic is testable
    without sleeping; production calls omit it and get wall-clock UTC.
    """

    def __init__(self) -> None:
        self._lock: Optional[LockState] = None

    @staticmethod
    def _token_matches(expected: str, presented: Optional[str]) -> bool:
        """Constant-time token comparison that never raises.

        ``hmac.compare_digest`` raises ``TypeError`` when given a ``str`` with
        non-ASCII characters. The presented token comes from a request header
        or body, so both sides are compared as bytes; a malformed token is then
        simply a non-match instead of an unhandled exception.
        """
        if not presented or not isinstance(presented, str):
            return False
        return hmac.compare_digest(
            expected.encode("utf-8", "surrogatepass"),
            presented.encode("utf-8", "surrogatepass"),
        )

    def _active(self, now: Optional[datetime] = None) -> Optional[LockState]:
        """The current lock if one is held and unexpired; lazily clears an
        expired lock so it never lingers."""
        now = now or _now()
        if self._lock is not None and self._lock.expires_at <= now:
            self._lock = None
        return self._lock

    def status(self, now: Optional[datetime] = None) -> dict:
        """Public (token-free) lock view, or ``{"held": False}`` when free."""
        now = now or _now()
        active = self._active(now)
        return active.public(now) if active else {"held": False}

    def acquire(
        self,
        holder: str,
        ttl_seconds: Optional[int] = None,
        note: str = "",
        token: Optional[str] = None,
        *,
        now: Optional[datetime] = None,
    ) -> LockState:
        """Acquire a free lock (new token), or extend one already held by
        presenting its token. A request without the token is refused even if the
        holder name matches — the name is descriptive, the token is the secret.

        The TTL falls back to ``LOCK_TTL_DEFAULT`` when omitted or not
        convertible to an int, and is clamped to
        ``[LOCK_TTL_MIN, LOCK_TTL_MAX]``.

        Raises:
            ValueError: ``holder`` is empty or whitespace-only.
            LockHeld: the lock is active and ``token`` does not match it.
        """
        now = now or _now()
        holder = (holder or "").strip()
        if not holder:
            raise ValueError("holder is required")
        ttl = LOCK_TTL_DEFAULT if ttl_seconds is None else ttl_seconds
        try:
            ttl = int(ttl)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int(float("inf")) — treat like any other junk TTL.
            ttl = LOCK_TTL_DEFAULT
        ttl = max(LOCK_TTL_MIN, min(LOCK_TTL_MAX, ttl))
        expires_at = now + timedelta(seconds=ttl)

        active = self._active(now)
        if active is not None:
            # Held — only the token-holder may extend/re-acquire.
            if not self._token_matches(active.token, token):
                raise LockHeld(active.public(now))
            # Extension keeps the token and the original acquisition time.
            self._lock = LockState(
                holder=holder,
                token=active.token,
                acquired_at=active.acquired_at,
                expires_at=expires_at,
                note=note or active.note,
            )
            return self._lock

        self._lock = LockState(
            holder=holder,
            token=uuid.uuid4().hex,
            acquired_at=now,
            expires_at=expires_at,
            note=note,
        )
        return self._lock

    def verify(self, token: Optional[str], now: Optional[datetime] = None) -> bool:
        """True iff `token` matches the currently-active lock."""
        active = self._active(now)
        return active is not None and self._token_matches(active.token, token)

    def is_blocked_by(self, token: Optional[str], now: Optional[datetime] = None) -> Optional[dict]:
        """Single-read swap gate. Returns the public lock state if an active
        lock blocks a swap carrying this token, else None. Does exactly one
        `_active()` read so the decision can't straddle a TTL expiry the way a
        separate status()+verify() pair could (which, at the expiry tick, would
        spuriously refuse a swap that should now be allowed)."""
        now = now or _now()
        active = self._active(now)
        if active is None:
            return None
        if self._token_matches(active.token, token):
            return None
        return active.public(now)

    def release(
        self,
        token: Optional[str] = None,
        *,
        force: bool = False,
        now: Optional[datetime] = None,
    ) -> bool:
        """Release the lock. Returns False if nothing was held. Requires the
        matching token unless `force` (the human override from the dashboard).

        Raises:
            PermissionError: a lock is active, ``force`` is false and ``token``
                does not match.
        """
        # Resolve the clock once and check the token against that single read,
        # so the decision cannot straddle a TTL expiry between two reads.
        now = now or _now()
        active = self._active(now)
        if active is None:
            return False
        if not force and not self._token_matches(active.token, token):
            raise PermissionError("token does not hold the lock")
        self._lock = None
        return True
|
||||
|
||||
|
||||
# ----------------------------------------------------------------- webhook ----
|
||||
|
||||
def build_webhook_payload(
    *,
    event: str,
    job_id: str,
    model_key: str,
    state: str,
    returncode: Optional[int],
    started_at: Optional[str],
    finished_at: Optional[str],
    dry_run: bool,
) -> dict:
    """Assemble the webhook body for one swap-lifecycle event.

    Every keyword argument is copied through unchanged under a key of the same
    name; ``event`` is expected to be ``swap_complete`` or ``swap_failed``.
    """
    return dict(
        event=event,
        job_id=job_id,
        model_key=model_key,
        state=state,
        returncode=returncode,
        started_at=started_at,
        finished_at=finished_at,
        dry_run=dry_run,
    )
|
||||
|
||||
|
||||
def sign_payload(secret: str, body: bytes) -> str:
    """Return the `X-Spark-Signature` header value for ``body``.

    The value is ``sha256=`` followed by the hex HMAC-SHA256 of the exact bytes
    sent, keyed with ``secret``, so a consumer can recompute and compare it.
    """
    mac = hmac.new(secret.encode(), msg=body, digestmod=hashlib.sha256)
    return f"sha256={mac.hexdigest()}"
|
||||
|
||||
|
||||
class WebhookNotifier:
    """Fire-and-forget POST of swap-lifecycle events. A webhook failure is
    logged and swallowed — it must never affect the swap outcome."""

    def __init__(self, url: str, secret: str = "", timeout: float = 5.0) -> None:
        # A blank / whitespace-only URL leaves the notifier disabled.
        self.url = (url or "").strip()
        self.secret = secret or ""
        self.timeout = timeout

    @property
    def enabled(self) -> bool:
        """True when a webhook URL is configured."""
        return bool(self.url)

    async def fire(self, event: str, payload: dict) -> None:
        """POST ``payload`` as JSON to the configured URL; no-op when disabled.

        The body is serialised once and those exact bytes are both signed (when
        a secret is set) and sent, so the signature matches what the consumer
        receives. Transport errors and HTTP error responses (4xx/5xx) are
        logged as warnings and never raised.
        """
        if not self.enabled:
            return
        body = json.dumps(payload).encode()
        headers = {
            "content-type": "application/json",
            "user-agent": "spark-control-webhook",
            "x-spark-event": event,
        }
        if self.secret:
            headers["x-spark-signature"] = sign_payload(self.secret, body)
        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.post(self.url, content=body, headers=headers)
            # httpx does not raise on 4xx/5xx by itself; without this check a
            # rejected webhook would look like a delivered one.
            if response.is_error:
                log.warning(
                    "swap webhook to %s returned HTTP %s", self.url, response.status_code
                )
        except Exception as e:  # noqa: BLE001 — best-effort, never propagate
            log.warning("swap webhook to %s failed: %s", self.url, e)
|
||||
|
||||
|
||||
# -------------------------------------------------------- schedule registry ----
|
||||
|
||||
@dataclass
class ScheduleEntry:
    """One schedule record as stored by the registry; all fields are strings."""

    id: str
    name: str
    owner: str = ""
    cron: str = ""
    next_run: str = ""
    description: str = ""
    registered_at: str = ""
    updated_at: str = ""

    def public(self) -> dict:
        """Return every field as a plain dict, in declaration order."""
        keys = (
            "id",
            "name",
            "owner",
            "cron",
            "next_run",
            "description",
            "registered_at",
            "updated_at",
        )
        return {key: getattr(self, key) for key in keys}
|
||||
|
||||
|
||||
class ScheduleRegistry:
    """In-memory store of the schedules external schedulers report to us.

    The dashboard only reads it; nothing here executes a schedule.
    """

    def __init__(self) -> None:
        self._items: dict[str, ScheduleEntry] = {}

    def list(self) -> list[dict]:
        """Public dict of every stored entry, in insertion order."""
        return [entry.public() for entry in self._items.values()]

    def register(
        self,
        *,
        name: str,
        id: Optional[str] = None,
        owner: str = "",
        cron: str = "",
        next_run: str = "",
        description: str = "",
    ) -> ScheduleEntry:
        """Create a schedule entry, or update the one already stored under ``id``.

        A missing or blank ``id`` gets a generated 8-hex-character id. On update
        ``registered_at`` is kept and ``updated_at`` is refreshed.

        Raises:
            ValueError: ``name`` is blank, or ``id`` fails the whitelist check.
        """
        clean_name = (name or "").strip()
        if not clean_name:
            raise ValueError("name is required")
        wanted_id = id.strip() if id is not None else None
        if wanted_id and not valid_schedule_id(wanted_id):
            raise ValueError("id must match [A-Za-z0-9_.-] (max 64 chars)")
        stamp = _iso(_now())

        entry = self._items.get(wanted_id) if wanted_id else None
        if entry is None:
            # New registration: both timestamps start out identical.
            key = wanted_id or uuid.uuid4().hex[:8]
            entry = ScheduleEntry(
                id=key,
                name=clean_name,
                owner=owner.strip(),
                cron=cron,
                next_run=next_run,
                description=description,
                registered_at=stamp,
                updated_at=stamp,
            )
            self._items[key] = entry
            return entry

        # Known id: overwrite the mutable fields in place.
        entry.name = clean_name
        entry.owner = owner.strip()
        entry.cron = cron
        entry.next_run = next_run
        entry.description = description
        entry.updated_at = stamp
        return entry

    def delete(self, schedule_id: str) -> bool:
        """Remove ``schedule_id``; True if it existed, False otherwise."""
        removed = self._items.pop(schedule_id, None)
        return removed is not None
|
||||
+107
-2
@@ -11,6 +11,7 @@ from typing import Literal
|
||||
|
||||
from .config import Settings
|
||||
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
||||
from .coordination import LockHeld, ScheduleRegistry, SwapLockManager, WebhookNotifier, valid_schedule_id
|
||||
from .custom_services import add_custom_service, delete_custom_service
|
||||
from .audio_proxy import build_router as build_audio_router
|
||||
from .deep_health import DeepHealth
|
||||
@@ -37,7 +38,12 @@ from .wol import send_local_broadcast, send_via_peer
|
||||
|
||||
settings = Settings.from_env()
|
||||
catalog = load_catalog(settings.models_yaml)
|
||||
swap_manager = SwapManager(settings, catalog)
|
||||
# Coordination layer (GPU arbiter): swap-lifecycle webhook, the swap reservation
|
||||
# lock, and the read-only schedule registry. See coordination.py.
|
||||
swap_webhook = WebhookNotifier(settings.swap_webhook_url, settings.swap_webhook_secret)
|
||||
swap_lock = SwapLockManager()
|
||||
schedule_registry = ScheduleRegistry()
|
||||
swap_manager = SwapManager(settings, catalog, notifier=swap_webhook)
|
||||
download_manager = DownloadManager(settings)
|
||||
update_manager = UpdateManager(settings)
|
||||
hardware_probe = HardwareProbe(settings)
|
||||
@@ -67,6 +73,10 @@ _CSRF_EXEMPT_PREFIXES = (
|
||||
"/api/audio/", # diarize-chunk / label-merge / transcribe-with-speakers
|
||||
"/api/health-event", # health reports posted by consumer apps
|
||||
)
|
||||
# Note: the coordination endpoints (/api/swap/lock, /api/schedule) are
|
||||
# intentionally NOT exempt. External schedulers are non-browser clients (no
|
||||
# Origin header) so they pass the guard already — same as /api/swap — while a
|
||||
# malicious page can't drive them from the operator's browser. Don't add them.
|
||||
|
||||
|
||||
@app.middleware("http")
|
||||
@@ -892,9 +902,21 @@ async def validate_swap(key: str) -> dict:
|
||||
|
||||
|
||||
@app.post("/api/swap")
|
||||
async def post_swap(req: SwapRequest) -> dict:
|
||||
async def post_swap(req: SwapRequest, request: Request) -> dict:
|
||||
if not settings.configured and not req.dry_run:
|
||||
raise HTTPException(503, "spark1 not configured")
|
||||
# Enforce the swap reservation lock (the GPU arbiter). A held lock blocks any
|
||||
# real swap that doesn't present the holder's token in X-Swap-Lock-Token — so
|
||||
# an external scheduler that holds the lock can swap, but the dashboard (no
|
||||
# token) is refused while someone else holds it. Dry runs don't touch the
|
||||
# cluster, so they're exempt.
|
||||
if not req.dry_run:
|
||||
blocked = swap_lock.is_blocked_by(request.headers.get("x-swap-lock-token"))
|
||||
if blocked is not None:
|
||||
raise HTTPException(status_code=423, detail={
|
||||
"error": "the GPU swap path is reserved by another holder",
|
||||
"lock": blocked,
|
||||
})
|
||||
try:
|
||||
job = await swap_manager.trigger(req.model_key, dry_run=req.dry_run)
|
||||
except KeyError:
|
||||
@@ -949,6 +971,89 @@ async def stream_swap(job_id: str):
|
||||
return StreamingResponse(gen(), media_type="text/event-stream")
|
||||
|
||||
|
||||
# ---- Coordination layer: swap lock + schedule registry ----
|
||||
# Endpoints are control-surface, not browser-exempt: an external scheduler is a
|
||||
# non-browser client (no Origin header) so it passes the CSRF guard already, the
|
||||
# same way it calls /api/swap today; the dashboard is same-origin.
|
||||
|
||||
class LockAcquireRequest(BaseModel):
    # JSON body of POST /api/swap/lock; fields are passed straight to
    # swap_lock.acquire().
    holder: str  # descriptive name of whoever reserves the swap path
    ttl_seconds: int | None = None  # None ⇒ the lock manager picks its default TTL
    note: str = ""  # free-text note shown with the reservation
    token: str | None = None  # present only to extend an existing hold
|
||||
|
||||
|
||||
@app.post("/api/swap/lock")
async def acquire_swap_lock(req: LockAcquireRequest) -> dict:
    """Reserve the GPU swap path (or extend a hold when ``token`` is given).

    Responds with the public lock state plus the secret ``token`` used to swap
    (header X-Swap-Lock-Token) and to release. 422 on an invalid request, 409
    if the lock is held by another holder.
    """
    try:
        held = swap_lock.acquire(req.holder, req.ttl_seconds, req.note, token=req.token)
    except LockHeld as conflict:
        detail = {
            "error": "swap lock is held by another holder",
            "lock": conflict.state,
        }
        raise HTTPException(status_code=409, detail=detail)
    except ValueError as invalid:
        raise HTTPException(422, str(invalid))
    response = dict(swap_lock.status())
    response["token"] = held.token
    return response
|
||||
|
||||
|
||||
@app.get("/api/swap/lock")
async def get_swap_lock() -> dict:
    """Token-free snapshot of the reservation: whether it is held, by whom,
    and until when."""
    snapshot = swap_lock.status()
    return snapshot
|
||||
|
||||
|
||||
@app.delete("/api/swap/lock")
async def release_swap_lock(request: Request, force: bool = Query(False)) -> dict:
    """Release the reservation.

    The token is taken from the X-Swap-Lock-Token header, falling back to the
    ``token`` query parameter; ?force=true (the human override from the
    dashboard) skips the token check. 403 when the token does not match.
    """
    header_token = request.headers.get("x-swap-lock-token")
    token = header_token or request.query_params.get("token")
    try:
        released = swap_lock.release(token, force=force)
    except PermissionError as denied:
        raise HTTPException(403, str(denied))
    body = {"released": released}
    body.update(swap_lock.status())
    return body
|
||||
|
||||
|
||||
class ScheduleRequest(BaseModel):
    # JSON body of POST /api/schedule; every field is forwarded unchanged to
    # schedule_registry.register().
    name: str  # required; the registry rejects a blank name
    id: str | None = None  # omit to create; pass an existing id to update it
    owner: str = ""
    cron: str = ""
    next_run: str = ""
    description: str = ""
|
||||
|
||||
|
||||
@app.get("/api/schedule")
async def list_schedules() -> dict:
    """Return every registered schedule under the ``schedules`` key."""
    entries = schedule_registry.list()
    return {"schedules": entries}
|
||||
|
||||
|
||||
@app.post("/api/schedule")
async def register_schedule(req: ScheduleRequest) -> dict:
    """Register (or update, by id) a schedule owned by an external scheduler.

    Spark Control only stores it for the dashboard — it never executes it.
    A ValueError from the registry becomes a 422.
    """
    fields = {
        "name": req.name,
        "id": req.id,
        "owner": req.owner,
        "cron": req.cron,
        "next_run": req.next_run,
        "description": req.description,
    }
    try:
        entry = schedule_registry.register(**fields)
    except ValueError as invalid:
        raise HTTPException(422, str(invalid))
    return entry.public()
|
||||
|
||||
|
||||
@app.delete("/api/schedule/{schedule_id}")
async def delete_schedule(schedule_id: str) -> dict:
    """Delete one schedule; ``deleted`` reports whether it existed."""
    # Whitelist the path segment at the boundary (repo convention), even though
    # it's only ever a dict key — keeps it from being reflected or logged raw.
    if valid_schedule_id(schedule_id):
        removed = schedule_registry.delete(schedule_id)
        return {"deleted": removed}
    raise HTTPException(422, "invalid schedule id")
|
||||
|
||||
|
||||
class DownloadRequest(BaseModel):
|
||||
repo: str
|
||||
mode: Literal["spark1", "spark2", "cluster"] = "spark1"
|
||||
|
||||
+100
-2
@@ -21,11 +21,19 @@ const state = {
|
||||
deep_health: {},
|
||||
disk_status: {}, // keyed by model key: { on_disk, total_bytes, per_host }
|
||||
disk_status_loaded: false,
|
||||
lock: { held: false }, // GPU swap reservation (coordination layer)
|
||||
schedules: [], // schedules external automation has registered
|
||||
};
|
||||
|
||||
const el = (sel) => document.querySelector(sel);
|
||||
const $$ = (sel) => document.querySelectorAll(sel);
|
||||
|
||||
// ISO timestamp -> local clock string (e.g. "2:45:10 PM"); '' if unparseable.
|
||||
function fmtClock(iso) {
  // Date.parse yields NaN for anything it cannot interpret.
  const millis = Date.parse(iso);
  if (Number.isNaN(millis)) return '';
  return new Date(millis).toLocaleTimeString();
}
|
||||
|
||||
function escapeHtml(s) {
|
||||
if (s == null) return '';
|
||||
return String(s)
|
||||
@@ -51,6 +59,12 @@ function renderCards() {
|
||||
const root = el('#cards');
|
||||
root.innerHTML = '';
|
||||
const isSwapping = !!state.swap_job_id;
|
||||
// GPU reserved by external automation — manual swaps are refused server-side
|
||||
// (423); reflect that in the buttons so the click never bounces.
|
||||
const locked = !!(state.lock && state.lock.held);
|
||||
const lockTip = locked
|
||||
? `Reserved by ${state.lock.holder || 'automation'}${state.lock.expires_at ? ' until ' + fmtClock(state.lock.expires_at) : ''}`
|
||||
: '';
|
||||
for (const key of Object.keys(state.models)) {
|
||||
const m = state.models[key];
|
||||
const isActive = key === state.current_model_key;
|
||||
@@ -94,7 +108,9 @@ function renderCards() {
|
||||
if (isActive) {
|
||||
primaryBtn = `<button class="btn" disabled>Current</button>`;
|
||||
} else if (isOnDisk) {
|
||||
primaryBtn = `<button class="btn primary" data-swap-key="${key}" ${isSwapping ? 'disabled' : ''}>Switch to this</button>`;
|
||||
const swapBlocked = isSwapping || locked;
|
||||
const tip = locked ? ` title="${escapeHtml(lockTip)}"` : '';
|
||||
primaryBtn = `<button class="btn primary" data-swap-key="${key}"${tip} ${swapBlocked ? 'disabled' : ''}>Switch to this</button>`;
|
||||
} else if (m.local_path) {
|
||||
// A local model can't be "downloaded" — its directory has to exist on the Spark.
|
||||
primaryBtn = `<button class="btn" disabled title="Directory not found on the Spark — create it there, then refresh">Not found on Spark</button>`;
|
||||
@@ -1234,6 +1250,11 @@ function openDiskDeleteDialog(key) {
|
||||
|
||||
async function triggerSwap(modelKey) {
|
||||
if (state.swap_job_id) return;
|
||||
if (state.lock && state.lock.held) {
|
||||
const until = state.lock.expires_at ? ' until ' + fmtClock(state.lock.expires_at) : '';
|
||||
alert(`The GPU swap path is reserved by ${state.lock.holder || 'automation'}${until}. Use "Release" on the reservation banner to override.`);
|
||||
return;
|
||||
}
|
||||
try {
|
||||
const r = await fetchJSON('/api/swap', {
|
||||
method: 'POST',
|
||||
@@ -1242,10 +1263,84 @@ async function triggerSwap(modelKey) {
|
||||
});
|
||||
attachToSwap(r.job_id, /*needsBackfill=*/false);
|
||||
} catch (e) {
|
||||
alert('Failed to start swap: ' + e.message);
|
||||
// 423 Locked: a reservation was acquired between our last poll and this click.
|
||||
if (e.message && e.message.startsWith('423')) {
|
||||
alert('The GPU swap path was just reserved by automation. Refreshing…');
|
||||
pollCoordination();
|
||||
} else {
|
||||
alert('Failed to start swap: ' + e.message);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---- coordination layer: swap lock + schedule registry ----
|
||||
|
||||
async function pollCoordination() {
  // The two fetches fail independently: a failed lock read falls back to
  // "not held", a failed schedule read to an empty list.
  let lock;
  try {
    lock = await fetchJSON('/api/swap/lock');
  } catch {
    lock = { held: false };
  }
  state.lock = lock;

  let schedules;
  try {
    const resp = await fetchJSON('/api/schedule');
    schedules = resp.schedules || [];
  } catch {
    schedules = [];
  }
  state.schedules = schedules;

  renderLockBanner();
  renderSchedules();
  renderCards(); // reflect lock state on the swap buttons
}
|
||||
|
||||
function renderLockBanner() {
  const banner = el('#lock-banner');
  if (!banner) return;

  const lock = state.lock;
  if (!(lock && lock.held)) {
    // Nothing reserved: keep the banner out of sight.
    banner.classList.add('hidden');
    return;
  }

  const until = lock.expires_at ? ` until ${fmtClock(lock.expires_at)}` : '';
  const note = lock.note ? ` — ${escapeHtml(lock.note)}` : '';
  const text = el('#lock-text');
  // Holder and note are caller-supplied, so both are escaped before use.
  const holder = escapeHtml(lock.holder || 'automation');
  text.innerHTML =
    `GPU swap path reserved by <strong>${holder}</strong>${until}${note}. Manual swaps are paused.`;
  banner.classList.remove('hidden');
}
|
||||
|
||||
function renderSchedules() {
  const panel = el('#schedule-panel');
  const list = el('#schedule-list');
  if (!panel || !list) return;

  const items = state.schedules || [];
  if (!items.length) {
    // No registered schedules: hide the whole panel.
    panel.classList.add('hidden');
    list.innerHTML = '';
    return;
  }

  // Build one card; every caller-supplied field is escaped before insertion.
  const toCard = (s) => {
    const parts = [];
    if (s.cron) parts.push(`<code>${escapeHtml(s.cron)}</code>`);
    if (s.next_run) parts.push(`next: ${escapeHtml(s.next_run)}`);
    if (s.owner) parts.push(`by ${escapeHtml(s.owner)}`);
    const meta = parts.join(' · ');
    const desc = s.description ? `<div class="desc">${escapeHtml(s.description)}</div>` : '';
    return `<div class="schedule-item">
<div class="name">${escapeHtml(s.name)}</div>
<div class="muted small">${meta}</div>
${desc}
</div>`;
  };

  list.innerHTML = items.map(toCard).join('');
  panel.classList.remove('hidden');
}
|
||||
|
||||
async function releaseLock() {
  const who = (state.lock || {}).holder || 'automation';
  const question = `Force-release the GPU reservation held by ${who}? Any job relying on it may then collide with a manual swap.`;
  if (confirm(question)) {
    try {
      await fetchJSON('/api/swap/lock?force=true', { method: 'DELETE' });
    } catch (err) {
      alert('Failed to release: ' + err.message);
    }
    // Refresh whether or not the release succeeded.
    pollCoordination();
  }
}
|
||||
|
||||
async function triggerDownloadForKey(modelKey) {
|
||||
const m = state.models[modelKey];
|
||||
if (!m) return;
|
||||
@@ -2102,6 +2197,7 @@ async function init() {
|
||||
});
|
||||
el('#sshkey-close').addEventListener('click', () => el('#sshkey-dialog').close());
|
||||
el('#open-local').addEventListener('click', openLocalModelDialog);
|
||||
el('#lock-release').addEventListener('click', releaseLock);
|
||||
setupCatalogDialog();
|
||||
setupAdvancedDialog();
|
||||
setupLocalModelDialog();
|
||||
@@ -2119,6 +2215,7 @@ async function init() {
|
||||
await loadModels();
|
||||
await pollStatus();
|
||||
await renderServices();
|
||||
pollCoordination();
|
||||
pollHardware();
|
||||
pollUpdates();
|
||||
// Disk-status probe runs after first paint — slow over SSH and not blocking.
|
||||
@@ -2126,6 +2223,7 @@ async function init() {
|
||||
// Speech-model patches panel — slow over SSH, runs after first paint.
|
||||
renderSpeechModels();
|
||||
setInterval(pollStatus, 5000);
|
||||
setInterval(pollCoordination, 5000); // swap lock + schedule registry
|
||||
setInterval(pollHardware, 8000); // every 8s
|
||||
setInterval(pollUpdates, 300000); // every 5 min
|
||||
setInterval(loadDiskStatus, 60000); // every 60s — disk state changes rarely
|
||||
|
||||
@@ -96,6 +96,13 @@
|
||||
</details>
|
||||
</section>
|
||||
|
||||
<section id="lock-banner" class="banner lock-banner hidden">
|
||||
<span class="lock-icon" aria-hidden="true">🔒</span>
|
||||
<span id="lock-text">GPU swap path reserved</span>
|
||||
<span class="spacer"></span>
|
||||
<button id="lock-release" class="btn small-btn">Release</button>
|
||||
</section>
|
||||
|
||||
<nav id="dashboard-tabs" class="dashboard-tabs hidden" role="tablist">
|
||||
<button type="button" class="dashboard-tab" data-tab="llm" role="tab" aria-selected="true">LLM</button>
|
||||
<button type="button" class="dashboard-tab" data-tab="audio" role="tab" aria-selected="false">Audio / Speech</button>
|
||||
@@ -394,6 +401,14 @@
|
||||
<section id="cards" class="cards"></section>
|
||||
</section>
|
||||
|
||||
<section id="schedule-panel" class="schedule-panel hidden">
|
||||
<div class="section-header">
|
||||
<h2 class="section-title">Scheduled jobs</h2>
|
||||
</div>
|
||||
<p class="muted small">Registered by your own automation. Spark Control only displays these — it doesn't run them.</p>
|
||||
<div id="schedule-list" class="schedule-list"></div>
|
||||
</section>
|
||||
|
||||
<section id="update-banner" class="update-banner hidden">
|
||||
<div class="ub-context muted small">
|
||||
Updates to <strong><a href="https://github.com/eugr/spark-vllm-docker" target="_blank" rel="noopener">eugr/spark-vllm-docker</a></strong>
|
||||
|
||||
@@ -74,6 +74,42 @@ main {
|
||||
}
|
||||
.banner em { font-style: normal; background: rgba(245, 158, 11, 0.15); padding: 2px 6px; border-radius: 4px; }
|
||||
|
||||
/* GPU swap reservation (coordination layer) — informational, not a warning. */
|
||||
.lock-banner {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
border-color: var(--info);
|
||||
color: var(--info);
|
||||
}
|
||||
.lock-banner .lock-icon { font-size: 16px; }
|
||||
.lock-banner strong { color: var(--text); }
|
||||
.lock-banner .spacer { flex: 1; }
|
||||
|
||||
/* Scheduled-jobs panel — read-only view of what external automation registered. */
|
||||
.schedule-panel { margin-top: 8px; }
|
||||
.schedule-list {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(240px, 1fr));
|
||||
gap: 12px;
|
||||
margin-top: 8px;
|
||||
}
|
||||
.schedule-item {
|
||||
background: var(--surface);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: var(--radius);
|
||||
padding: 12px 14px;
|
||||
}
|
||||
.schedule-item .name { font-weight: 600; margin-bottom: 4px; }
|
||||
.schedule-item code {
|
||||
background: var(--surface-2);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 4px;
|
||||
padding: 1px 5px;
|
||||
font-size: 12px;
|
||||
}
|
||||
.schedule-item .desc { margin-top: 6px; color: var(--muted); font-size: 13px; }
|
||||
|
||||
/* ===== Endpoint panel ===== */
|
||||
|
||||
.endpoint-panel {
|
||||
|
||||
+23
-1
@@ -6,6 +6,7 @@ from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
from .config import Settings
|
||||
from .coordination import WebhookNotifier, build_webhook_payload
|
||||
from .models import Catalog, build_launch_command
|
||||
from .shellsafe import quote_arg
|
||||
from .ssh import ssh_run, ssh_stream, StreamHandle
|
||||
@@ -33,9 +34,15 @@ class SwapJob:
|
||||
|
||||
|
||||
class SwapManager:
|
||||
def __init__(self, settings: Settings, catalog: Catalog) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
settings: Settings,
|
||||
catalog: Catalog,
|
||||
notifier: Optional[WebhookNotifier] = None,
|
||||
) -> None:
|
||||
self.settings = settings
|
||||
self.catalog = catalog
|
||||
self.notifier = notifier
|
||||
self.lock = asyncio.Lock()
|
||||
self.jobs: dict[str, SwapJob] = {}
|
||||
self.current_job_id: Optional[str] = None
|
||||
@@ -78,6 +85,21 @@ class SwapManager:
|
||||
job.finished_at = datetime.now(timezone.utc).isoformat()
|
||||
if self.current_job_id == job.id:
|
||||
self.current_job_id = None
|
||||
# Outside the swap lock (so a webhook POST can't stall a queued swap) and
|
||||
# only for real swaps — a dry run never changes the running model. A
|
||||
# webhook failure is logged inside fire(), never raised.
|
||||
if self.notifier is not None and self.notifier.enabled and not job.dry_run:
|
||||
event = "swap_complete" if job.state == "ready" else "swap_failed"
|
||||
await self.notifier.fire(event, build_webhook_payload(
|
||||
event=event,
|
||||
job_id=job.id,
|
||||
model_key=job.model_key,
|
||||
state=job.state,
|
||||
returncode=job.returncode,
|
||||
started_at=job.started_at,
|
||||
finished_at=job.finished_at,
|
||||
dry_run=job.dry_run,
|
||||
))
|
||||
|
||||
async def _do(self, job: SwapJob) -> None:
|
||||
model = self.catalog.models[job.model_key]
|
||||
|
||||
Reference in New Issue
Block a user