v0.25.0:0 - cluster coordination layer (swap lock + webhook + schedule registry)
GPU-arbiter safety layer for when automation, not just the dashboard, swaps models:
- Swap reservation lock (POST/GET/DELETE /api/swap/lock): enforced with HTTP 423 in post_swap via a single-read gate; TTL-bounded; secret-token auth; human force-release override plus a dashboard banner.
- Swap webhook (swap_complete/swap_failed): fired outside the swap lock, with an optional HMAC signature and a configurable URL + secret.
- Read-only schedule registry (GET/POST/DELETE /api/schedule) plus a dashboard panel.

New module image/app/coordination.py; docs/COORDINATION.md for consumers; 22 offline tests in test_coordination.py.
This commit is contained in:
+100
-2
@@ -21,11 +21,19 @@ const state = {
|
||||
deep_health: {},
|
||||
disk_status: {}, // keyed by model key: { on_disk, total_bytes, per_host }
|
||||
disk_status_loaded: false,
|
||||
lock: { held: false }, // GPU swap reservation (coordination layer)
|
||||
schedules: [], // schedules external automation has registered
|
||||
};
|
||||
|
||||
// Shorthand for document.querySelector(sel): the first element matching the selector.
const el = (sel) => document.querySelector(sel);
|
||||
// Shorthand for document.querySelectorAll(sel): every element matching the selector.
const $$ = (sel) => document.querySelectorAll(sel);
|
||||
|
||||
// ISO timestamp -> local clock string (e.g. "2:45:10 PM").
//   iso: timestamp string handed to Date.parse.
//   returns: the locale time string, or '' when the input cannot be parsed,
//            so callers can treat "no usable time" as a falsy value.
function fmtClock(iso) {
  const parsedMs = Date.parse(iso);
  // Date.parse always yields a number (NaN on failure), so the non-coercing
  // Number.isNaN is the precise check here.
  if (Number.isNaN(parsedMs)) return '';
  return new Date(parsedMs).toLocaleTimeString();
}
|
||||
|
||||
function escapeHtml(s) {
|
||||
if (s == null) return '';
|
||||
return String(s)
|
||||
@@ -51,6 +59,12 @@ function renderCards() {
|
||||
const root = el('#cards');
|
||||
root.innerHTML = '';
|
||||
const isSwapping = !!state.swap_job_id;
|
||||
// GPU reserved by external automation — manual swaps are refused server-side
|
||||
// (423); reflect that in the buttons so the click never bounces.
|
||||
const locked = !!(state.lock && state.lock.held);
|
||||
const lockTip = locked
|
||||
? `Reserved by ${state.lock.holder || 'automation'}${state.lock.expires_at ? ' until ' + fmtClock(state.lock.expires_at) : ''}`
|
||||
: '';
|
||||
for (const key of Object.keys(state.models)) {
|
||||
const m = state.models[key];
|
||||
const isActive = key === state.current_model_key;
|
||||
@@ -94,7 +108,9 @@ function renderCards() {
|
||||
if (isActive) {
|
||||
primaryBtn = `<button class="btn" disabled>Current</button>`;
|
||||
} else if (isOnDisk) {
|
||||
primaryBtn = `<button class="btn primary" data-swap-key="${key}" ${isSwapping ? 'disabled' : ''}>Switch to this</button>`;
|
||||
const swapBlocked = isSwapping || locked;
|
||||
const tip = locked ? ` title="${escapeHtml(lockTip)}"` : '';
|
||||
primaryBtn = `<button class="btn primary" data-swap-key="${key}"${tip} ${swapBlocked ? 'disabled' : ''}>Switch to this</button>`;
|
||||
} else if (m.local_path) {
|
||||
// A local model can't be "downloaded" — its directory has to exist on the Spark.
|
||||
primaryBtn = `<button class="btn" disabled title="Directory not found on the Spark — create it there, then refresh">Not found on Spark</button>`;
|
||||
@@ -1234,6 +1250,11 @@ function openDiskDeleteDialog(key) {
|
||||
|
||||
async function triggerSwap(modelKey) {
|
||||
if (state.swap_job_id) return;
|
||||
if (state.lock && state.lock.held) {
|
||||
const until = state.lock.expires_at ? ' until ' + fmtClock(state.lock.expires_at) : '';
|
||||
alert(`The GPU swap path is reserved by ${state.lock.holder || 'automation'}${until}. Use "Release" on the reservation banner to override.`);
|
||||
return;
|
||||
}
|
||||
try {
|
||||
const r = await fetchJSON('/api/swap', {
|
||||
method: 'POST',
|
||||
@@ -1242,10 +1263,84 @@ async function triggerSwap(modelKey) {
|
||||
});
|
||||
attachToSwap(r.job_id, /*needsBackfill=*/false);
|
||||
} catch (e) {
|
||||
alert('Failed to start swap: ' + e.message);
|
||||
// 423 Locked: a reservation was acquired between our last poll and this click.
|
||||
if (e.message && e.message.startsWith('423')) {
|
||||
alert('The GPU swap path was just reserved by automation. Refreshing…');
|
||||
pollCoordination();
|
||||
} else {
|
||||
alert('Failed to start swap: ' + e.message);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---- coordination layer: swap lock + schedule registry ----
|
||||
|
||||
// Refresh the coordination state (swap lock + schedule registry) from the
// server, then re-render everything that depends on it.
//
// The two endpoints are independent, so they are fetched in parallel. Each
// fetch is best-effort: a failed lock fetch falls back to "not held" and a
// failed or malformed schedule response falls back to an empty list, so one
// failing endpoint never blocks the other or the re-render.
// NOTE(review): falling back to { held: false } re-enables the swap buttons
// while the lock endpoint is unreachable; the comment in renderCards says the
// server still refuses locked swaps with 423 — confirm that is the intended
// safety net.
async function pollCoordination() {
  const loadLock = async () => {
    try {
      const lock = await fetchJSON('/api/swap/lock');
      return lock || { held: false };
    } catch {
      return { held: false };
    }
  };

  const loadSchedules = async () => {
    try {
      const r = await fetchJSON('/api/schedule');
      // Only accept a real array: renderSchedules calls .map on this value.
      return r && Array.isArray(r.schedules) ? r.schedules : [];
    } catch {
      return [];
    }
  };

  const [lock, schedules] = await Promise.all([loadLock(), loadSchedules()]);
  state.lock = lock;
  state.schedules = schedules;

  renderLockBanner();
  renderSchedules();
  renderCards(); // reflect lock state on the swap buttons
}
|
||||
|
||||
// Show or hide the reservation banner (#lock-banner) based on state.lock.
//
// When the lock is held, #lock-text is filled with the holder, the optional
// expiry time and the optional note; otherwise the banner gets the 'hidden'
// class. Does nothing if the banner element is not in the DOM.
function renderLockBanner() {
  const banner = el('#lock-banner');
  if (!banner) return;

  const lock = state.lock;
  if (!lock || !lock.held) {
    banner.classList.add('hidden');
    return;
  }

  // The text node is looked up separately; if it is missing we still reveal
  // the banner instead of throwing on a null element.
  const text = el('#lock-text');
  if (text) {
    // fmtClock returns '' for an unparseable timestamp — skip the clause
    // entirely rather than rendering a dangling " until ".
    const clock = lock.expires_at ? fmtClock(lock.expires_at) : '';
    const until = clock ? ` until ${escapeHtml(clock)}` : '';
    const note = lock.note ? ` — ${escapeHtml(lock.note)}` : '';
    const holder = escapeHtml(lock.holder || 'automation');
    // holder/note come from the lock response, hence the escaping before innerHTML.
    text.innerHTML =
      `GPU swap path reserved by <strong>${holder}</strong>${until}${note}. Manual swaps are paused.`;
  }
  banner.classList.remove('hidden');
}
|
||||
|
||||
// Render the schedule registry into #schedule-list and toggle #schedule-panel.
//
// Reads state.schedules; each entry may carry name, cron, next_run, owner and
// description, all of which are HTML-escaped before insertion. The panel is
// hidden (and the list emptied) when there is nothing to show. Does nothing if
// either element is missing from the DOM.
function renderSchedules() {
  const panel = el('#schedule-panel');
  const list = el('#schedule-list');
  if (!panel || !list) return;

  // Be tolerant of a malformed registry: a non-array value or null entries
  // would otherwise throw inside .map / on property access.
  const raw = Array.isArray(state.schedules) ? state.schedules : [];
  const items = raw.filter((s) => s !== null && typeof s === 'object');

  if (!items.length) {
    panel.classList.add('hidden');
    list.innerHTML = '';
    return;
  }

  list.innerHTML = items.map((s) => {
    // Only the fields that are present contribute to the meta line.
    const meta = [
      s.cron ? `<code>${escapeHtml(s.cron)}</code>` : '',
      s.next_run ? `next: ${escapeHtml(s.next_run)}` : '',
      s.owner ? `by ${escapeHtml(s.owner)}` : '',
    ].filter(Boolean).join(' · ');
    const desc = s.description ? `<div class="desc">${escapeHtml(s.description)}</div>` : '';
    return `<div class="schedule-item">
      <div class="name">${escapeHtml(s.name)}</div>
      <div class="muted small">${meta}</div>
      ${desc}
    </div>`;
  }).join('');
  panel.classList.remove('hidden');
}
|
||||
|
||||
// Force-release the GPU swap reservation after the user confirms.
//
// Sends DELETE /api/swap/lock?force=true, reports a failure via alert(), and
// then refreshes the coordination state in either case so the banner and
// buttons reflect what the server now says.
async function releaseLock() {
  const lock = state.lock || {};
  const who = lock.holder || 'automation';
  const confirmed = confirm(`Force-release the GPU reservation held by ${who}? Any job relying on it may then collide with a manual swap.`);
  if (!confirmed) return;

  try {
    await fetchJSON('/api/swap/lock?force=true', { method: 'DELETE' });
  } catch (e) {
    alert(`Failed to release: ${e.message}`);
  }
  // Await the refresh rather than leaving the promise floating: the returned
  // promise then settles only once the UI is up to date, and a refresh error
  // propagates to the caller instead of becoming a detached rejection.
  await pollCoordination();
}
|
||||
|
||||
async function triggerDownloadForKey(modelKey) {
|
||||
const m = state.models[modelKey];
|
||||
if (!m) return;
|
||||
@@ -2102,6 +2197,7 @@ async function init() {
|
||||
});
|
||||
el('#sshkey-close').addEventListener('click', () => el('#sshkey-dialog').close());
|
||||
el('#open-local').addEventListener('click', openLocalModelDialog);
|
||||
el('#lock-release').addEventListener('click', releaseLock);
|
||||
setupCatalogDialog();
|
||||
setupAdvancedDialog();
|
||||
setupLocalModelDialog();
|
||||
@@ -2119,6 +2215,7 @@ async function init() {
|
||||
await loadModels();
|
||||
await pollStatus();
|
||||
await renderServices();
|
||||
pollCoordination();
|
||||
pollHardware();
|
||||
pollUpdates();
|
||||
// Disk-status probe runs after first paint — slow over SSH and not blocking.
|
||||
@@ -2126,6 +2223,7 @@ async function init() {
|
||||
// Speech-model patches panel — slow over SSH, runs after first paint.
|
||||
renderSpeechModels();
|
||||
setInterval(pollStatus, 5000);
|
||||
setInterval(pollCoordination, 5000); // swap lock + schedule registry
|
||||
setInterval(pollHardware, 8000); // every 8s
|
||||
setInterval(pollUpdates, 300000); // every 5 min
|
||||
setInterval(loadDiskStatus, 60000); // every 60s — disk state changes rarely
|
||||
|
||||
Reference in New Issue
Block a user