v0.25.0:0 - cluster coordination layer (swap lock + webhook + schedule registry)
GPU-arbiter safety layer for when automation, not just the dashboard, swaps models:
- swap reservation lock (POST/GET/DELETE /api/swap/lock): enforced with HTTP 423 in post_swap via a single-read gate, TTL-bounded, secret-token auth, human force-release override, plus a dashboard banner
- swap webhook (swap_complete/swap_failed): fired outside the swap lock, optional HMAC signature, configurable URL and secret
- schedule registry (GET/POST/DELETE /api/schedule) plus a dashboard panel: read-only in the sense that registered schedules are only stored for display and never executed
New module image/app/coordination.py; docs/COORDINATION.md for consumers; 22 offline tests in test_coordination.py.
This commit is contained in:
+107
-2
@@ -11,6 +11,7 @@ from typing import Literal
|
||||
|
||||
from .config import Settings
|
||||
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
||||
from .coordination import LockHeld, ScheduleRegistry, SwapLockManager, WebhookNotifier, valid_schedule_id
|
||||
from .custom_services import add_custom_service, delete_custom_service
|
||||
from .audio_proxy import build_router as build_audio_router
|
||||
from .deep_health import DeepHealth
|
||||
@@ -37,7 +38,12 @@ from .wol import send_local_broadcast, send_via_peer
|
||||
|
||||
settings = Settings.from_env()
|
||||
catalog = load_catalog(settings.models_yaml)
|
||||
swap_manager = SwapManager(settings, catalog)
|
||||
# Coordination layer (GPU arbiter): swap-lifecycle webhook, the swap reservation
|
||||
# lock, and the read-only schedule registry. See coordination.py.
|
||||
swap_webhook = WebhookNotifier(settings.swap_webhook_url, settings.swap_webhook_secret)
|
||||
swap_lock = SwapLockManager()
|
||||
schedule_registry = ScheduleRegistry()
|
||||
swap_manager = SwapManager(settings, catalog, notifier=swap_webhook)
|
||||
download_manager = DownloadManager(settings)
|
||||
update_manager = UpdateManager(settings)
|
||||
hardware_probe = HardwareProbe(settings)
|
||||
@@ -67,6 +73,10 @@ _CSRF_EXEMPT_PREFIXES = (
|
||||
"/api/audio/", # diarize-chunk / label-merge / transcribe-with-speakers
|
||||
"/api/health-event", # health reports posted by consumer apps
|
||||
)
|
||||
# Note: the coordination endpoints (/api/swap/lock, /api/schedule) are
|
||||
# intentionally NOT exempt. External schedulers are non-browser clients (no
|
||||
# Origin header) so they pass the guard already — same as /api/swap — while a
|
||||
# malicious page can't drive them from the operator's browser. Don't add them.
|
||||
|
||||
|
||||
@app.middleware("http")
|
||||
@@ -892,9 +902,21 @@ async def validate_swap(key: str) -> dict:
|
||||
|
||||
|
||||
@app.post("/api/swap")
|
||||
async def post_swap(req: SwapRequest) -> dict:
|
||||
async def post_swap(req: SwapRequest, request: Request) -> dict:
|
||||
if not settings.configured and not req.dry_run:
|
||||
raise HTTPException(503, "spark1 not configured")
|
||||
# Enforce the swap reservation lock (the GPU arbiter). A held lock blocks any
|
||||
# real swap that doesn't present the holder's token in X-Swap-Lock-Token — so
|
||||
# an external scheduler that holds the lock can swap, but the dashboard (no
|
||||
# token) is refused while someone else holds it. Dry runs don't touch the
|
||||
# cluster, so they're exempt.
|
||||
if not req.dry_run:
|
||||
blocked = swap_lock.is_blocked_by(request.headers.get("x-swap-lock-token"))
|
||||
if blocked is not None:
|
||||
raise HTTPException(status_code=423, detail={
|
||||
"error": "the GPU swap path is reserved by another holder",
|
||||
"lock": blocked,
|
||||
})
|
||||
try:
|
||||
job = await swap_manager.trigger(req.model_key, dry_run=req.dry_run)
|
||||
except KeyError:
|
||||
@@ -949,6 +971,89 @@ async def stream_swap(job_id: str):
|
||||
return StreamingResponse(gen(), media_type="text/event-stream")
|
||||
|
||||
|
||||
# ---- Coordination layer: swap lock + schedule registry ----
|
||||
# Endpoints are control-surface, not browser-exempt: an external scheduler is a
|
||||
# non-browser client (no Origin header) so it passes the CSRF guard already, the
|
||||
# same way it calls /api/swap today; the dashboard is same-origin.
|
||||
|
||||
class LockAcquireRequest(BaseModel):
    """JSON body accepted by POST /api/swap/lock.

    The fields are forwarded as-is to ``swap_lock.acquire`` by the endpoint;
    any validation beyond the declared types happens there.
    """

    # Name of the party requesting the reservation.
    holder: str
    # Requested lifetime of the hold in seconds; None is passed through to the
    # lock manager unchanged (presumably it applies a default - confirm in
    # coordination.py).
    ttl_seconds: int | None = None
    # Free-form note attached to the reservation.
    note: str = ""
    # Present only to extend an existing hold.
    token: str | None = None
|
||||
|
||||
|
||||
@app.post("/api/swap/lock")
async def acquire_swap_lock(req: LockAcquireRequest) -> dict:
    """Reserve the GPU swap path on behalf of ``req.holder``.

    On success the response is the current lock status plus the secret
    ``token``; callers present that token in the X-Swap-Lock-Token header to
    swap and to release.

    Raises:
        HTTPException: 422 when the lock manager rejects the request with a
            ValueError; 409 (with the current lock state in the detail) when
            the lock is held by another holder.
    """
    try:
        granted = swap_lock.acquire(
            req.holder,
            req.ttl_seconds,
            req.note,
            token=req.token,
        )
    except ValueError as exc:
        raise HTTPException(422, str(exc))
    except LockHeld as exc:
        conflict = {
            "error": "swap lock is held by another holder",
            "lock": exc.state,
        }
        raise HTTPException(status_code=409, detail=conflict)

    # Start from the status view and attach the secret token; the token is
    # only ever added here, in the acquire response.
    response = dict(swap_lock.status())
    response["token"] = granted.token
    return response
|
||||
|
||||
|
||||
@app.get("/api/swap/lock")
async def get_swap_lock() -> dict:
    """Report the reservation state: whether it is held, by whom, until when.

    Returns exactly what ``swap_lock.status()`` reports; this is meant as the
    public, token-free view of the lock.
    """
    snapshot = swap_lock.status()
    return snapshot
|
||||
|
||||
|
||||
@app.delete("/api/swap/lock")
async def release_swap_lock(request: Request, force: bool = Query(False)) -> dict:
    """Release the swap reservation.

    The caller must present the matching token, either in the
    X-Swap-Lock-Token header or as a ``token`` query parameter, unless
    ``?force=true`` is given (the human override from the dashboard).

    Returns the lock manager's release result under ``"released"`` merged
    with the current lock status.

    Raises:
        HTTPException: 403 when the lock manager refuses the release with a
            PermissionError.
    """
    # The header wins; the query parameter is only consulted when the header
    # is missing or empty.
    header_token = request.headers.get("x-swap-lock-token")
    presented = header_token if header_token else request.query_params.get("token")

    try:
        outcome = swap_lock.release(presented, force=force)
    except PermissionError as exc:
        raise HTTPException(403, str(exc))

    payload = {"released": outcome}
    payload.update(swap_lock.status())
    return payload
|
||||
|
||||
|
||||
class ScheduleRequest(BaseModel):
    """JSON body accepted by POST /api/schedule.

    Every field is handed unchanged to ``schedule_registry.register``; only
    ``name`` is required.
    """

    # Name of the schedule (required).
    name: str
    # Optional identifier; the register endpoint describes itself as
    # "register (or update, by id)", so supplying one presumably targets an
    # existing entry - confirm in coordination.py.
    id: str | None = None
    # Remaining fields are free-form strings that default to empty.
    owner: str = ""
    cron: str = ""
    next_run: str = ""
    description: str = ""
|
||||
|
||||
|
||||
@app.get("/api/schedule")
async def list_schedules() -> dict:
    """Return the registry's current entries under the ``"schedules"`` key."""
    entries = schedule_registry.list()
    return {"schedules": entries}
|
||||
|
||||
|
||||
@app.post("/api/schedule")
async def register_schedule(req: ScheduleRequest) -> dict:
    """Register (or update, by id) a schedule owned by an external scheduler.

    Spark Control only stores the entry for the dashboard - it never executes
    it. The response is the stored entry's public representation.

    Raises:
        HTTPException: 422 when the registry rejects the entry with a
            ValueError.
    """
    try:
        fields = {
            "name": req.name,
            "id": req.id,
            "owner": req.owner,
            "cron": req.cron,
            "next_run": req.next_run,
            "description": req.description,
        }
        stored = schedule_registry.register(**fields)
    except ValueError as exc:
        raise HTTPException(422, str(exc))
    return stored.public()
|
||||
|
||||
|
||||
@app.delete("/api/schedule/{schedule_id}")
async def delete_schedule(schedule_id: str) -> dict:
    """Remove a schedule from the registry.

    Returns the registry's delete result under the ``"deleted"`` key.

    Raises:
        HTTPException: 422 when ``schedule_id`` fails ``valid_schedule_id``.
    """
    # Whitelist the path segment at the boundary (repo convention), even though
    # it's only ever a dict key - keeps it from being reflected or logged raw.
    if valid_schedule_id(schedule_id):
        removed = schedule_registry.delete(schedule_id)
        return {"deleted": removed}
    raise HTTPException(422, "invalid schedule id")
|
||||
|
||||
|
||||
class DownloadRequest(BaseModel):
|
||||
repo: str
|
||||
mode: Literal["spark1", "spark2", "cluster"] = "spark1"
|
||||
|
||||
Reference in New Issue
Block a user