v0.25.0:0 - cluster coordination layer (swap lock + webhook + schedule registry)

GPU-arbiter safety layer for when automation, not just the dashboard, swaps
models:
- swap reservation lock (POST/GET/DELETE /api/swap/lock); 423-enforced in
  post_swap via a single-read gate, TTL-bounded, secret-token auth, human
  force-release override + dashboard banner
- swap webhook (swap_complete/swap_failed) fired outside the swap lock, optional
  HMAC signature, configurable URL+secret
- read-only schedule registry (GET/POST/DELETE /api/schedule) + dashboard panel

New module image/app/coordination.py; docs/COORDINATION.md for consumers; 22
offline tests in test_coordination.py.
This commit is contained in:
Keysat
2026-06-18 07:07:08 -05:00
parent dd3d1412d4
commit 7ae6ab3ba8
15 changed files with 1026 additions and 15 deletions
+107 -2
View File
@@ -11,6 +11,7 @@ from typing import Literal
from .config import Settings
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
from .coordination import LockHeld, ScheduleRegistry, SwapLockManager, WebhookNotifier, valid_schedule_id
from .custom_services import add_custom_service, delete_custom_service
from .audio_proxy import build_router as build_audio_router
from .deep_health import DeepHealth
@@ -37,7 +38,12 @@ from .wol import send_local_broadcast, send_via_peer
settings = Settings.from_env()
catalog = load_catalog(settings.models_yaml)
swap_manager = SwapManager(settings, catalog)
# Coordination layer (GPU arbiter): swap-lifecycle webhook, the swap reservation
# lock, and the read-only schedule registry. See coordination.py.
swap_webhook = WebhookNotifier(settings.swap_webhook_url, settings.swap_webhook_secret)
swap_lock = SwapLockManager()
schedule_registry = ScheduleRegistry()
swap_manager = SwapManager(settings, catalog, notifier=swap_webhook)
download_manager = DownloadManager(settings)
update_manager = UpdateManager(settings)
hardware_probe = HardwareProbe(settings)
@@ -67,6 +73,10 @@ _CSRF_EXEMPT_PREFIXES = (
"/api/audio/", # diarize-chunk / label-merge / transcribe-with-speakers
"/api/health-event", # health reports posted by consumer apps
)
# Note: the coordination endpoints (/api/swap/lock, /api/schedule) are
# intentionally NOT exempt. External schedulers are non-browser clients (no
# Origin header) so they pass the guard already — same as /api/swap — while a
# malicious page can't drive them from the operator's browser. Don't add them.
@app.middleware("http")
@@ -892,9 +902,21 @@ async def validate_swap(key: str) -> dict:
@app.post("/api/swap")
async def post_swap(req: SwapRequest) -> dict:
async def post_swap(req: SwapRequest, request: Request) -> dict:
if not settings.configured and not req.dry_run:
raise HTTPException(503, "spark1 not configured")
# Enforce the swap reservation lock (the GPU arbiter). A held lock blocks any
# real swap that doesn't present the holder's token in X-Swap-Lock-Token — so
# an external scheduler that holds the lock can swap, but the dashboard (no
# token) is refused while someone else holds it. Dry runs don't touch the
# cluster, so they're exempt.
if not req.dry_run:
blocked = swap_lock.is_blocked_by(request.headers.get("x-swap-lock-token"))
if blocked is not None:
raise HTTPException(status_code=423, detail={
"error": "the GPU swap path is reserved by another holder",
"lock": blocked,
})
try:
job = await swap_manager.trigger(req.model_key, dry_run=req.dry_run)
except KeyError:
@@ -949,6 +971,89 @@ async def stream_swap(job_id: str):
return StreamingResponse(gen(), media_type="text/event-stream")
# ---- Coordination layer: swap lock + schedule registry ----
# Endpoints are control-surface, not browser-exempt: an external scheduler is a
# non-browser client (no Origin header) so it passes the CSRF guard already, the
# same way it calls /api/swap today; the dashboard is same-origin.
class LockAcquireRequest(BaseModel):
    """Request body for POST /api/swap/lock (acquire or extend the swap lock)."""

    # Name of the party asking for the reservation.
    holder: str
    # Requested lifetime in seconds; None leaves the choice to the lock
    # manager (NOTE(review): default TTL lives in coordination.py — confirm).
    ttl_seconds: int | None = None
    # Free-form annotation stored alongside the reservation.
    note: str = ""
    # Supplied only when extending a hold that the caller already owns.
    token: str | None = None
@app.post("/api/swap/lock")
async def acquire_swap_lock(req: LockAcquireRequest) -> dict:
    """Reserve the GPU swap path on behalf of ``req.holder``.

    On success the response is the public lock status plus the secret
    ``token``, which the holder presents in the X-Swap-Lock-Token header to
    swap and to release.

    Raises:
        HTTPException(422): the lock manager rejected the request values.
        HTTPException(409): the lock is currently held by another holder; the
            detail carries the state reported by the lock manager.
    """
    try:
        granted = swap_lock.acquire(
            req.holder,
            req.ttl_seconds,
            req.note,
            token=req.token,
        )
    except ValueError as exc:
        raise HTTPException(422, str(exc))
    except LockHeld as exc:
        conflict = {
            "error": "swap lock is held by another holder",
            "lock": exc.state,
        }
        raise HTTPException(status_code=409, detail=conflict)

    # Public status first, then attach the secret token for the caller only.
    response = dict(swap_lock.status())
    response["token"] = granted.token
    return response
@app.get("/api/swap/lock")
async def get_swap_lock() -> dict:
    """Return the public, token-free view of the swap reservation.

    The payload is whatever ``swap_lock.status()`` reports (whether the lock
    is held, by whom, and until when).
    """
    snapshot = swap_lock.status()
    return snapshot
@app.delete("/api/swap/lock")
async def release_swap_lock(request: Request, force: bool = Query(False)) -> dict:
    """Release the swap reservation.

    The caller's token is read from the X-Swap-Lock-Token header, falling
    back to the ``token`` query parameter. ``?force=true`` is the human
    override used by the dashboard.

    Raises:
        HTTPException(403): the lock manager refused the release.
    """
    header_token = request.headers.get("x-swap-lock-token")
    # An absent or empty header falls through to the query string.
    token = header_token if header_token else request.query_params.get("token")

    try:
        released = swap_lock.release(token, force=force)
    except PermissionError as exc:
        raise HTTPException(403, str(exc))

    # Report the outcome together with the lock's state after the release.
    result = {"released": released}
    result.update(swap_lock.status())
    return result
class ScheduleRequest(BaseModel):
    """Request body for POST /api/schedule (register or update a schedule)."""

    # Name of the schedule; the only required field.
    name: str
    # Identifier of an existing entry to update; None when not supplied
    # (NOTE(review): presumably the registry assigns one — confirm).
    id: str | None = None
    # Who owns the schedule.
    owner: str = ""
    # Cron expression, passed through to the registry as given.
    cron: str = ""
    # Next planned run, passed through to the registry as a string.
    next_run: str = ""
    # Free-form description.
    description: str = ""
@app.get("/api/schedule")
async def list_schedules() -> dict:
    """Return every entry in the schedule registry under the ``schedules`` key."""
    entries = schedule_registry.list()
    return {"schedules": entries}
@app.post("/api/schedule")
async def register_schedule(req: ScheduleRequest) -> dict:
    """Register (or update, by id) a schedule owned by an external scheduler.

    Spark Control only stores the entry for the dashboard; it never executes
    it. Returns the public view of the stored entry.

    Raises:
        HTTPException(422): the registry rejected the supplied values.
    """
    # Collect the request fields up front so the try body is just the call.
    fields = {
        "name": req.name,
        "id": req.id,
        "owner": req.owner,
        "cron": req.cron,
        "next_run": req.next_run,
        "description": req.description,
    }
    try:
        entry = schedule_registry.register(**fields)
    except ValueError as exc:
        raise HTTPException(422, str(exc))
    return entry.public()
@app.delete("/api/schedule/{schedule_id}")
async def delete_schedule(schedule_id: str) -> dict:
    """Delete a schedule entry by id and report the registry's result.

    Raises:
        HTTPException(422): ``schedule_id`` fails the id whitelist check.
    """
    # Repo convention: whitelist the path segment at the boundary, even though
    # it is only ever a dict key, so it is never reflected or logged raw.
    if valid_schedule_id(schedule_id):
        removed = schedule_registry.delete(schedule_id)
        return {"deleted": removed}
    raise HTTPException(422, "invalid schedule id")
class DownloadRequest(BaseModel):
repo: str
mode: Literal["spark1", "spark2", "cluster"] = "spark1"