v0.8.0 - Deep health probes + auto-restart on CUDA wedge

deep_health.py:
- Synthetic probes per service, all payloads generated in-memory (BytesIO), never written to disk:
  - Parakeet: 1s of digital silence via in-memory WAV → POST /v1/audio/transcriptions
  - Magpie:   short 'hi' text → POST /v1/audio/synthesize (multipart form-data, real TTS API endpoint discovered via openapi.json)
  - vLLM:     1-token completion against currently-loaded model
- Background loop runs every 5 minutes (configurable). Best-effort: exceptions in the loop never kill it.
- Auto-restart on wedge-pattern errors (cudaErrorUnknown / CUFFT_INTERNAL_ERROR / 500 / Engine core init failed): docker restart of the affected container.
  - Rate-limited: max 3 restarts per service per 30 min.
  - Cooldown: 120 s between consecutive restarts on the same service.
  - 60 s startup grace before any auto-restart can fire after the app boots.
- Probe failures + recoveries logged via record_report(source='deep-health') into the connectivity history alongside the polling-based transitions.

API:
- GET /api/deep-health: per-service last result + auto-restart counters
- POST /api/deep-health/{service}/run: manual trigger now

UI:
- Service cards show 'Deep check ok/FAILED <time> <latency>' inline, plus a ↻ button to run-now
- Auto-restart count in 30-min window surfaced on the card when > 0
- Inline error excerpt shown for failed probes

Bug fix: server.py app startup hook was placed before the FastAPI app object was constructed (would crash on import). Moved after.
This commit is contained in:
Grant
2026-05-12 14:41:01 -05:00
parent 6434b01a95
commit 000c55febe
5 changed files with 442 additions and 2 deletions
+51
View File
@@ -17,6 +17,7 @@ const state = {
config: {},
configured: true,
timer_handle: null,
deep_health: {},
};
const el = (sel) => document.querySelector(sel);
@@ -413,6 +414,35 @@ async function renderServices() {
const restartsRow = s.restart_count != null && s.restart_count > 1
? `<div class="row"><span class="k">Restarts</span><span class="v">${s.restart_count}</span></div>`
: '';
const dh = state.deep_health?.[name];
let deepRow = '';
if (dh && dh.last) {
const last = dh.last;
const when = (last.at || '').slice(11, 19); // HH:MM:SS
const verdict = last.ok
? `<span class="dh-ok">deep check ok</span>`
: `<span class="dh-fail">deep check FAILED</span>`;
const lat = last.latency_ms != null ? ` <span class="muted">${last.latency_ms} ms</span>` : '';
const restarts = dh.auto_restarts_window > 0
? ` <span class="muted">· ${dh.auto_restarts_window} auto-restart${dh.auto_restarts_window === 1 ? '' : 's'} in 30 min</span>`
: '';
deepRow = `
<div class="row deep-row">
<span class="k">Deep</span>
<span class="v deep-v">${verdict} <span class="muted small">${escapeHtml(when)}</span>${lat}${restarts}</span>
<button class="icon-btn dh-run-btn" data-dh-run="${escapeHtml(name)}" title="Run deep check now">↻</button>
</div>
${last.ok ? '' : `<div class="deep-error muted small">${escapeHtml((last.error || last.note || '').slice(0, 200))}</div>`}
`;
} else if (dh) {
deepRow = `
<div class="row deep-row">
<span class="k">Deep</span>
<span class="v muted-v">no probe yet</span>
<button class="icon-btn dh-run-btn" data-dh-run="${escapeHtml(name)}" title="Run deep check now">↻</button>
</div>
`;
}
card.innerHTML = `
<div class="head">
<span class="name">${escapeHtml(name)}</span>
@@ -423,6 +453,7 @@ async function renderServices() {
${urlRow}
${modelRow}
${restartsRow}
${deepRow}
<div class="service-actions">
<button class="btn" data-svc-action="${name}:start" ${disable('start') ? 'disabled' : ''}>Start</button>
<button class="btn" data-svc-action="${name}:restart" ${disable('restart') ? 'disabled' : ''}>Restart</button>
@@ -434,6 +465,25 @@ async function renderServices() {
for (const btn of grid.querySelectorAll('.btn[data-svc-action]')) {
btn.addEventListener('click', () => onServiceAction(btn.dataset.svcAction));
}
for (const btn of grid.querySelectorAll('[data-dh-run]')) {
btn.addEventListener('click', () => onDeepHealthRun(btn.dataset.dhRun, btn));
}
}
async function onDeepHealthRun(name, btn) {
btn.disabled = true;
const orig = btn.textContent;
btn.textContent = '…';
try {
await fetchJSON(`/api/deep-health/${encodeURIComponent(name)}/run`, { method: 'POST' });
} catch (e) {
console.warn('deep-health run failed', e);
} finally {
try { state.deep_health = await fetchJSON('/api/deep-health'); } catch {}
btn.textContent = orig;
btn.disabled = false;
renderServices();
}
}
async function onServiceAction(key) {
@@ -668,6 +718,7 @@ async function pollStatus() {
// Refresh services state lazily — every 5s poll triggers this too.
try {
state.services = await fetchJSON('/api/services');
try { state.deep_health = await fetchJSON('/api/deep-health'); } catch {}
renderServices();
} catch {}
if (status.current_swap_job && status.current_swap_job !== state.swap_job_id) {