v0.25.0:0 - cluster coordination layer (swap lock + webhook + schedule registry)

GPU-arbiter safety layer for when automation, not just the dashboard, swaps
models:
- swap reservation lock (POST/GET/DELETE /api/swap/lock); 423-enforced in
  post_swap via a single-read gate, TTL-bounded, secret-token auth, human
  force-release override + dashboard banner
- swap webhook (swap_complete/swap_failed) fired outside the swap lock, optional
  HMAC signature, configurable URL+secret
- read-only schedule registry (GET/POST/DELETE /api/schedule) + dashboard panel

New module image/app/coordination.py; docs/COORDINATION.md for consumers; 22
offline tests in test_coordination.py.
This commit is contained in:
Keysat
2026-06-18 07:07:08 -05:00
parent dd3d1412d4
commit 7ae6ab3ba8
15 changed files with 1026 additions and 15 deletions
+100 -2
View File
@@ -21,11 +21,19 @@ const state = {
deep_health: {},
disk_status: {}, // keyed by model key: { on_disk, total_bytes, per_host }
disk_status_loaded: false,
lock: { held: false }, // GPU swap reservation (coordination layer)
schedules: [], // schedules external automation has registered
};
const el = (sel) => document.querySelector(sel);
const $$ = (sel) => document.querySelectorAll(sel);
// Format an ISO timestamp as a locale time-of-day string (e.g. "2:45:10 PM").
// Anything Date.parse cannot understand yields the empty string.
function fmtClock(iso) {
  const millis = Date.parse(iso);
  if (Number.isNaN(millis)) {
    return '';
  }
  return new Date(millis).toLocaleTimeString();
}
function escapeHtml(s) {
if (s == null) return '';
return String(s)
@@ -51,6 +59,12 @@ function renderCards() {
const root = el('#cards');
root.innerHTML = '';
const isSwapping = !!state.swap_job_id;
// GPU reserved by external automation — manual swaps are refused server-side
// (423); reflect that in the buttons so the click never bounces.
const locked = !!(state.lock && state.lock.held);
const lockTip = locked
? `Reserved by ${state.lock.holder || 'automation'}${state.lock.expires_at ? ' until ' + fmtClock(state.lock.expires_at) : ''}`
: '';
for (const key of Object.keys(state.models)) {
const m = state.models[key];
const isActive = key === state.current_model_key;
@@ -94,7 +108,9 @@ function renderCards() {
if (isActive) {
primaryBtn = `<button class="btn" disabled>Current</button>`;
} else if (isOnDisk) {
primaryBtn = `<button class="btn primary" data-swap-key="${key}" ${isSwapping ? 'disabled' : ''}>Switch to this</button>`;
const swapBlocked = isSwapping || locked;
const tip = locked ? ` title="${escapeHtml(lockTip)}"` : '';
primaryBtn = `<button class="btn primary" data-swap-key="${key}"${tip} ${swapBlocked ? 'disabled' : ''}>Switch to this</button>`;
} else if (m.local_path) {
// A local model can't be "downloaded" — its directory has to exist on the Spark.
primaryBtn = `<button class="btn" disabled title="Directory not found on the Spark — create it there, then refresh">Not found on Spark</button>`;
@@ -1234,6 +1250,11 @@ function openDiskDeleteDialog(key) {
async function triggerSwap(modelKey) {
if (state.swap_job_id) return;
if (state.lock && state.lock.held) {
const until = state.lock.expires_at ? ' until ' + fmtClock(state.lock.expires_at) : '';
alert(`The GPU swap path is reserved by ${state.lock.holder || 'automation'}${until}. Use "Release" on the reservation banner to override.`);
return;
}
try {
const r = await fetchJSON('/api/swap', {
method: 'POST',
@@ -1242,10 +1263,84 @@ async function triggerSwap(modelKey) {
});
attachToSwap(r.job_id, /*needsBackfill=*/false);
} catch (e) {
alert('Failed to start swap: ' + e.message);
// 423 Locked: a reservation was acquired between our last poll and this click.
if (e.message && e.message.startsWith('423')) {
alert('The GPU swap path was just reserved by automation. Refreshing…');
pollCoordination();
} else {
alert('Failed to start swap: ' + e.message);
}
}
}
// ---- coordination layer: swap lock + schedule registry ----
/**
 * Refresh the coordination-layer state (swap lock + schedule registry) and
 * re-render the views that depend on it.
 *
 * The two endpoints are independent, so they are requested concurrently
 * rather than one after the other. Each request is best-effort: a failed
 * lock fetch falls back to `{ held: false }` and a failed schedule fetch to
 * `[]`, so one failing endpoint does not abort the poll.
 *
 * @returns {Promise<void>} Resolves once `state.lock` / `state.schedules`
 *   are updated and the banner, schedule panel and cards are re-rendered.
 */
async function pollCoordination() {
  // Current reservation, or "not held" when the request fails.
  const loadLock = async () => {
    try {
      return await fetchJSON('/api/swap/lock');
    } catch {
      return { held: false };
    }
  };
  // Registered schedules, or an empty list when the request fails or the
  // response carries no `schedules` field.
  const loadSchedules = async () => {
    try {
      const r = await fetchJSON('/api/schedule');
      return r.schedules || [];
    } catch {
      return [];
    }
  };

  const [lock, schedules] = await Promise.all([loadLock(), loadSchedules()]);
  state.lock = lock;
  state.schedules = schedules;

  renderLockBanner();
  renderSchedules();
  renderCards(); // reflect lock state on the swap buttons
}
/**
 * Show or hide the reservation banner according to `state.lock`.
 *
 * When the lock is held, the banner text names the holder (defaulting to
 * "automation"), and optionally the expiry time and the holder's note. When
 * it is not held, the banner is hidden. Does nothing if `#lock-banner` is
 * absent from the page.
 */
function renderLockBanner() {
  const banner = el('#lock-banner');
  if (!banner) return;

  const lock = state.lock;
  if (!lock || !lock.held) {
    banner.classList.add('hidden');
    return;
  }

  const text = el('#lock-text');
  if (text) {
    // fmtClock yields '' for an unparseable timestamp; drop the whole
    // " until …" clause then instead of leaving a dangling " until ".
    const clock = lock.expires_at ? fmtClock(lock.expires_at) : '';
    const until = clock ? ` until ${clock}` : '';
    // Separate the free-form note from the preceding text so the two do not
    // run together.
    const note = lock.note ? ` — ${escapeHtml(lock.note)}` : '';
    text.innerHTML =
      `GPU swap path reserved by <strong>${escapeHtml(lock.holder || 'automation')}</strong>${until}${note}. Manual swaps are paused.`;
  }
  banner.classList.remove('hidden');
}
// Render the schedule-registry panel from state.schedules.
// With no schedules the panel is hidden and its list emptied; otherwise each
// schedule becomes one ".schedule-item" block and the panel is shown.
// Does nothing when either the panel or the list element is missing.
function renderSchedules() {
  const panel = el('#schedule-panel');
  const list = el('#schedule-list');
  if (!(panel && list)) return;

  const schedules = state.schedules || [];
  if (!schedules.length) {
    panel.classList.add('hidden');
    list.innerHTML = '';
    return;
  }

  // Build the markup for a single schedule entry; every field is escaped.
  const toHtml = (entry) => {
    const cronPart = entry.cron ? `<code>${escapeHtml(entry.cron)}</code>` : '';
    const nextPart = entry.next_run ? `next: ${escapeHtml(entry.next_run)}` : '';
    const ownerPart = entry.owner ? `by ${escapeHtml(entry.owner)}` : '';
    // Only the fields that are present appear in the meta line.
    const meta = [cronPart, nextPart, ownerPart].filter(Boolean).join(' · ');
    let desc = '';
    if (entry.description) {
      desc = `<div class="desc">${escapeHtml(entry.description)}</div>`;
    }
    return `<div class="schedule-item">
<div class="name">${escapeHtml(entry.name)}</div>
<div class="muted small">${meta}</div>
${desc}
</div>`;
  };

  list.innerHTML = schedules.map((entry) => toHtml(entry)).join('');
  panel.classList.remove('hidden');
}
// Human override for the GPU reservation: after an explicit confirmation,
// ask the server to force-release the lock, report any failure via alert,
// then kick off a coordination poll so the UI picks up the new lock state.
async function releaseLock() {
  const { holder } = state.lock || {};
  const who = holder || 'automation';
  const question = `Force-release the GPU reservation held by ${who}? Any job relying on it may then collide with a manual swap.`;

  const confirmed = confirm(question);
  if (!confirmed) {
    return;
  }

  try {
    await fetchJSON('/api/swap/lock?force=true', { method: 'DELETE' });
  } catch (err) {
    alert('Failed to release: ' + err.message);
  }
  // Refresh whether or not the release succeeded.
  pollCoordination();
}
async function triggerDownloadForKey(modelKey) {
const m = state.models[modelKey];
if (!m) return;
@@ -2102,6 +2197,7 @@ async function init() {
});
el('#sshkey-close').addEventListener('click', () => el('#sshkey-dialog').close());
el('#open-local').addEventListener('click', openLocalModelDialog);
el('#lock-release').addEventListener('click', releaseLock);
setupCatalogDialog();
setupAdvancedDialog();
setupLocalModelDialog();
@@ -2119,6 +2215,7 @@ async function init() {
await loadModels();
await pollStatus();
await renderServices();
pollCoordination();
pollHardware();
pollUpdates();
// Disk-status probe runs after first paint — slow over SSH and not blocking.
@@ -2126,6 +2223,7 @@ async function init() {
// Speech-model patches panel — slow over SSH, runs after first paint.
renderSpeechModels();
setInterval(pollStatus, 5000);
setInterval(pollCoordination, 5000); // swap lock + schedule registry
setInterval(pollHardware, 8000); // every 8s
setInterval(pollUpdates, 300000); // every 5 min
setInterval(loadDiskStatus, 60000); // every 60s — disk state changes rarely