v0.25.0:0 - cluster coordination layer (swap lock + webhook + schedule registry)
GPU-arbiter safety layer for when automation, not just the dashboard, swaps models:
- swap reservation lock (POST/GET/DELETE /api/swap/lock); 423-enforced in post_swap via a single-read gate, TTL-bounded, secret-token auth, human force-release override + dashboard banner
- swap webhook (swap_complete/swap_failed) fired outside the swap lock, optional HMAC signature, configurable URL+secret
- read-only schedule registry (GET/POST/DELETE /api/schedule) + dashboard panel

New module image/app/coordination.py; docs/COORDINATION.md for consumers; 22 offline tests in test_coordination.py.
This commit is contained in:
+23
-1
@@ -6,6 +6,7 @@ from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
from .config import Settings
|
||||
from .coordination import WebhookNotifier, build_webhook_payload
|
||||
from .models import Catalog, build_launch_command
|
||||
from .shellsafe import quote_arg
|
||||
from .ssh import ssh_run, ssh_stream, StreamHandle
|
||||
@@ -33,9 +34,15 @@ class SwapJob:
|
||||
|
||||
|
||||
class SwapManager:
|
||||
def __init__(self, settings: Settings, catalog: Catalog) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
settings: Settings,
|
||||
catalog: Catalog,
|
||||
notifier: Optional[WebhookNotifier] = None,
|
||||
) -> None:
|
||||
self.settings = settings
|
||||
self.catalog = catalog
|
||||
self.notifier = notifier
|
||||
self.lock = asyncio.Lock()
|
||||
self.jobs: dict[str, SwapJob] = {}
|
||||
self.current_job_id: Optional[str] = None
|
||||
@@ -78,6 +85,21 @@ class SwapManager:
|
||||
job.finished_at = datetime.now(timezone.utc).isoformat()
|
||||
if self.current_job_id == job.id:
|
||||
self.current_job_id = None
|
||||
# Outside the swap lock (so a webhook POST can't stall a queued swap) and
|
||||
# only for real swaps — a dry run never changes the running model. A
|
||||
# webhook failure is logged inside fire(), never raised.
|
||||
if self.notifier is not None and self.notifier.enabled and not job.dry_run:
|
||||
event = "swap_complete" if job.state == "ready" else "swap_failed"
|
||||
await self.notifier.fire(event, build_webhook_payload(
|
||||
event=event,
|
||||
job_id=job.id,
|
||||
model_key=job.model_key,
|
||||
state=job.state,
|
||||
returncode=job.returncode,
|
||||
started_at=job.started_at,
|
||||
finished_at=job.finished_at,
|
||||
dry_run=job.dry_run,
|
||||
))
|
||||
|
||||
async def _do(self, job: SwapJob) -> None:
|
||||
model = self.catalog.models[job.model_key]
|
||||
|
||||
Reference in New Issue
Block a user