v0.8.0 - Deep health probes + auto-restart on CUDA wedge
deep_health.py:
- Synthetic probes per service, all payloads generated in-memory (BytesIO), never written to disk:
- Parakeet: 1s of digital silence via in-memory WAV → POST /v1/audio/transcriptions
- Magpie: short 'hi' text → POST /v1/audio/synthesize (multipart form-data, real TTS API endpoint discovered via openapi.json)
- vLLM: 1-token completion against currently-loaded model
- Background loop runs every 5 minutes (configurable). Best-effort: exceptions in the loop never kill it.
- Auto-restart on wedge-pattern errors (cudaErrorUnknown / CUFFT_INTERNAL_ERROR / 500 / Engine core init failed): docker restart of the affected container.
- Rate-limited: max 3 restarts per service per 30 min.
- Cooldown: 120 s between consecutive restarts on the same service.
- 60 s startup grace before any auto-restart can fire after the app boots.
- Probe failures + recoveries logged via record_report(source='deep-health') into the connectivity history alongside the polling-based transitions.
API:
- GET /api/deep-health: per-service last result + auto-restart counters
- POST /api/deep-health/{service}/run: manually trigger a deep check for that service right now
UI:
- Service cards show 'Deep check ok/FAILED <time> <latency>' inline, plus a ↻ button to run-now
- Auto-restart count in 30-min window surfaced on the card when > 0
- Inline error excerpt shown for failed probes
Bug fix: the server.py app startup hook was placed before the FastAPI app object was constructed, which would crash on import. It has been moved to after the app is constructed.
This commit is contained in:
@@ -17,6 +17,7 @@ const state = {
|
||||
config: {},
|
||||
configured: true,
|
||||
timer_handle: null,
|
||||
deep_health: {},
|
||||
};
|
||||
|
||||
// Shorthand for document.querySelector: returns the first element matching the CSS selector `sel`.
const el = (sel) => document.querySelector(sel);
|
||||
@@ -413,6 +414,35 @@ async function renderServices() {
|
||||
const restartsRow = s.restart_count != null && s.restart_count > 1
|
||||
? `<div class="row"><span class="k">Restarts</span><span class="v">${s.restart_count}</span></div>`
|
||||
: '';
|
||||
const dh = state.deep_health?.[name];
|
||||
let deepRow = '';
|
||||
if (dh && dh.last) {
|
||||
const last = dh.last;
|
||||
const when = (last.at || '').slice(11, 19); // HH:MM:SS
|
||||
const verdict = last.ok
|
||||
? `<span class="dh-ok">deep check ok</span>`
|
||||
: `<span class="dh-fail">deep check FAILED</span>`;
|
||||
const lat = last.latency_ms != null ? ` <span class="muted">${last.latency_ms} ms</span>` : '';
|
||||
const restarts = dh.auto_restarts_window > 0
|
||||
? ` <span class="muted">· ${dh.auto_restarts_window} auto-restart${dh.auto_restarts_window === 1 ? '' : 's'} in 30 min</span>`
|
||||
: '';
|
||||
deepRow = `
|
||||
<div class="row deep-row">
|
||||
<span class="k">Deep</span>
|
||||
<span class="v deep-v">${verdict} <span class="muted small">${escapeHtml(when)}</span>${lat}${restarts}</span>
|
||||
<button class="icon-btn dh-run-btn" data-dh-run="${escapeHtml(name)}" title="Run deep check now">↻</button>
|
||||
</div>
|
||||
${last.ok ? '' : `<div class="deep-error muted small">${escapeHtml((last.error || last.note || '').slice(0, 200))}</div>`}
|
||||
`;
|
||||
} else if (dh) {
|
||||
deepRow = `
|
||||
<div class="row deep-row">
|
||||
<span class="k">Deep</span>
|
||||
<span class="v muted-v">no probe yet</span>
|
||||
<button class="icon-btn dh-run-btn" data-dh-run="${escapeHtml(name)}" title="Run deep check now">↻</button>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
card.innerHTML = `
|
||||
<div class="head">
|
||||
<span class="name">${escapeHtml(name)}</span>
|
||||
@@ -423,6 +453,7 @@ async function renderServices() {
|
||||
${urlRow}
|
||||
${modelRow}
|
||||
${restartsRow}
|
||||
${deepRow}
|
||||
<div class="service-actions">
|
||||
<button class="btn" data-svc-action="${name}:start" ${disable('start') ? 'disabled' : ''}>Start</button>
|
||||
<button class="btn" data-svc-action="${name}:restart" ${disable('restart') ? 'disabled' : ''}>Restart</button>
|
||||
@@ -434,6 +465,25 @@ async function renderServices() {
|
||||
for (const btn of grid.querySelectorAll('.btn[data-svc-action]')) {
|
||||
btn.addEventListener('click', () => onServiceAction(btn.dataset.svcAction));
|
||||
}
|
||||
for (const btn of grid.querySelectorAll('[data-dh-run]')) {
|
||||
btn.addEventListener('click', () => onDeepHealthRun(btn.dataset.dhRun, btn));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Click handler for a service card's "run deep check now" button.
 *
 * Disables the button and shows a placeholder label, asks the backend to run
 * the deep-health probe for `name`, then reloads the deep-health snapshot into
 * `state.deep_health`, restores the button and re-renders the service cards.
 *
 * Every step is best-effort: failures are logged with console.warn and never
 * propagate, so the promise returned to the (non-awaiting) click listener
 * always resolves instead of surfacing as an unhandled rejection.
 *
 * @param {string} name - Service name; URL-encoded before being placed in the path.
 * @param {{textContent: string, disabled: boolean}} btn - The button that was clicked.
 * @returns {Promise<void>}
 */
async function onDeepHealthRun(name, btn) {
  const originalLabel = btn.textContent;
  btn.disabled = true;
  btn.textContent = '…';
  try {
    await fetchJSON(`/api/deep-health/${encodeURIComponent(name)}/run`, { method: 'POST' });
  } catch (e) {
    console.warn('deep-health run failed', e);
  } finally {
    // Refresh even when the run request failed, so the card shows whatever
    // the backend currently reports. On failure the previous snapshot is kept.
    try {
      state.deep_health = await fetchJSON('/api/deep-health');
    } catch (e) {
      console.warn('deep-health refresh failed', e);
    }
    btn.textContent = originalLabel;
    btn.disabled = false;
    // renderServices is async: await it inside a try so a rendering error is
    // logged here rather than becoming an unhandled promise rejection.
    try {
      await renderServices();
    } catch (e) {
      console.warn('deep-health re-render failed', e);
    }
  }
}
|
||||
|
||||
async function onServiceAction(key) {
|
||||
@@ -668,6 +718,7 @@ async function pollStatus() {
|
||||
// Refresh services state lazily — every 5s poll triggers this too.
|
||||
try {
|
||||
state.services = await fetchJSON('/api/services');
|
||||
try { state.deep_health = await fetchJSON('/api/deep-health'); } catch {}
|
||||
renderServices();
|
||||
} catch {}
|
||||
if (status.current_swap_job && status.current_swap_job !== state.swap_job_id) {
|
||||
|
||||
Reference in New Issue
Block a user