v0.6.0 - Service-level connectivity tracking + passive failure-report endpoint
connectivity.py:
- Generalized 'spark' subject to any string; renamed 'spark' field to 'subject'
- Legacy v0.5 events with the old 'spark' field are migrated transparently on read (kind defaults to 'transition')
- New record_report(subject, ok, source, detail, latency_ms): always appends an event with kind='report'; does NOT mutate the current state (only active polling is authoritative)
- summary() returns events normalized to the new schema; the returned window grows from the last 50 to the last 80 events
Wiring:
- /api/status now calls record_state for vllm/parakeet/magpie (deduplicated: an event is logged only when the state changes)
- /api/services calls record_state for each service after its http check
- Result: dashboard observes service-level transitions automatically with no extra polling
Passive endpoint:
- POST /api/health-event with {service, ok, source?, error?, ms?}; these are stored on the event as subject, ok, source, detail, and latency_ms respectively
- Useful for external apps (e.g. Open WebUI) to surface sub-poll-interval failures the dashboard would otherwise miss
UI:
- Connectivity dialog groups events by subject (spark1/spark2 hosts first, then services in alphabetical order)
- Per-subject summary shows transition count, down count, report count, failed-report count
- Transitions and reports render inline with distinct styling; reports show source app + error + latency
- Legacy v0.5 events render unchanged
Docs:
- README documents /api/health-event with a curl example
Package: bump to 0.6.0:0
This commit is contained in:
+87
-23
@@ -1,17 +1,28 @@
|
||||
"""Track Spark up/down transitions and cache discovered MAC addresses.
|
||||
"""Track up/down transitions for any subject (Sparks AND services) and cache MACs.
|
||||
|
||||
Persisted to /data/connectivity.json so history survives package restarts:
|
||||
Persisted to /data/connectivity.json. Schema:
|
||||
|
||||
{
|
||||
"macs": { "spark1": "aa:bb:..", "spark2": "11:22:.." },
|
||||
"current": { "spark1": "up", "spark2": "down" },
|
||||
"last_change": { "spark1": "2026-05-12T15:00:00Z", ... },
|
||||
"current": { "spark1": "up", "parakeet": "up", "magpie": "down", ... },
|
||||
"last_change": { ... },
|
||||
"events": [
|
||||
{ "spark": "spark2", "at": "2026-05-12T17:30:00Z", "transition": "down" },
|
||||
{ "spark": "spark2", "at": "2026-05-12T18:45:00Z", "transition": "up", "down_seconds": 4500 },
|
||||
...
|
||||
# Active-probe transition (logged when state flips during polling)
|
||||
{ "subject": "spark2", "at": "...", "kind": "transition",
|
||||
"transition": "down" },
|
||||
{ "subject": "spark2", "at": "...", "kind": "transition",
|
||||
"transition": "up", "down_seconds": 4500 },
|
||||
|
||||
# Passive report (logged whenever an external app POSTs to
|
||||
# /api/health-event regardless of state change)
|
||||
{ "subject": "parakeet", "at": "...", "kind": "report",
|
||||
"ok": false, "source": "open-webui",
|
||||
"detail": "Connection refused", "latency_ms": 320 },
|
||||
]
|
||||
}
|
||||
|
||||
Legacy events from v0.5 with `spark` instead of `subject` and no `kind` field
|
||||
are read transparently as kind="transition".
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
@@ -59,21 +70,24 @@ def load() -> dict:
|
||||
return d
|
||||
|
||||
|
||||
def record_mac(spark: str, mac: Optional[str]) -> None:
|
||||
def record_mac(subject: str, mac: Optional[str]) -> None:
|
||||
if not mac:
|
||||
return
|
||||
with _lock:
|
||||
d = _read()
|
||||
d.setdefault("macs", {})
|
||||
if d["macs"].get(spark) != mac:
|
||||
d["macs"][spark] = mac
|
||||
if d["macs"].get(subject) != mac:
|
||||
d["macs"][subject] = mac
|
||||
_write(d)
|
||||
|
||||
|
||||
def record_state(spark: str, reachable: bool) -> Optional[dict]:
|
||||
"""Update current state. If it differs from the last seen state, append an event.
|
||||
def record_state(subject: str, reachable: bool) -> Optional[dict]:
|
||||
"""Update current state for `subject`. If it differs from the last seen
|
||||
state, append a transition event. Returns the event dict if a transition
|
||||
was recorded, else None.
|
||||
|
||||
Returns the event dict if a transition was recorded, else None.
|
||||
`subject` can be a Spark host key (spark1/spark2) or a service name
|
||||
(parakeet/magpie/vllm).
|
||||
"""
|
||||
new_state = "up" if reachable else "down"
|
||||
now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||
@@ -83,12 +97,17 @@ def record_state(spark: str, reachable: bool) -> Optional[dict]:
|
||||
d.setdefault("current", {})
|
||||
d.setdefault("last_change", {})
|
||||
d.setdefault("events", [])
|
||||
prev = d["current"].get(spark)
|
||||
prev = d["current"].get(subject)
|
||||
if prev == new_state:
|
||||
return None
|
||||
event: dict = {"spark": spark, "at": now, "transition": new_state}
|
||||
event: dict = {
|
||||
"subject": subject,
|
||||
"at": now,
|
||||
"kind": "transition",
|
||||
"transition": new_state,
|
||||
}
|
||||
# When we have a previous state and timestamp, compute duration
|
||||
last_change = d["last_change"].get(spark)
|
||||
last_change = d["last_change"].get(subject)
|
||||
if prev and last_change:
|
||||
try:
|
||||
prev_dt = datetime.fromisoformat(last_change.replace("Z", "+00:00"))
|
||||
@@ -99,28 +118,73 @@ def record_state(spark: str, reachable: bool) -> Optional[dict]:
|
||||
event["up_seconds"] = round(duration)
|
||||
except ValueError:
|
||||
pass
|
||||
d["current"][spark] = new_state
|
||||
d["last_change"][spark] = now
|
||||
d["current"][subject] = new_state
|
||||
d["last_change"][subject] = now
|
||||
d["events"].append(event)
|
||||
# Keep rolling window
|
||||
if len(d["events"]) > MAX_EVENTS:
|
||||
d["events"] = d["events"][-MAX_EVENTS:]
|
||||
_write(d)
|
||||
return event
|
||||
|
||||
|
||||
def get_mac(spark: str) -> Optional[str]:
|
||||
def record_report(
|
||||
subject: str,
|
||||
*,
|
||||
ok: bool,
|
||||
source: str = "external",
|
||||
detail: str = "",
|
||||
latency_ms: Optional[int] = None,
|
||||
) -> dict:
|
||||
"""Record a passive report from an external caller (e.g. Open WebUI got a
|
||||
503 calling Parakeet). Always appended to the events list; does NOT change
|
||||
the active-probe state (which only the polling probe is authoritative on).
|
||||
"""
|
||||
now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||
with _lock:
|
||||
d = _read()
|
||||
d.setdefault("events", [])
|
||||
event: dict = {
|
||||
"subject": subject,
|
||||
"at": now,
|
||||
"kind": "report",
|
||||
"ok": bool(ok),
|
||||
"source": source or "external",
|
||||
}
|
||||
if detail:
|
||||
event["detail"] = detail
|
||||
if latency_ms is not None:
|
||||
event["latency_ms"] = int(latency_ms)
|
||||
d["events"].append(event)
|
||||
if len(d["events"]) > MAX_EVENTS:
|
||||
d["events"] = d["events"][-MAX_EVENTS:]
|
||||
_write(d)
|
||||
return event
|
||||
|
||||
|
||||
def get_mac(subject: str) -> Optional[str]:
|
||||
d = load()
|
||||
return d.get("macs", {}).get(spark)
|
||||
return d.get("macs", {}).get(subject)
|
||||
|
||||
|
||||
def _normalize_event(e: dict) -> dict:
|
||||
"""Promote legacy v0.5 events to the v0.6 shape so the UI sees one schema."""
|
||||
if "subject" in e:
|
||||
e.setdefault("kind", "transition")
|
||||
return e
|
||||
# Legacy: had "spark" + "transition" only
|
||||
if "spark" in e:
|
||||
e["subject"] = e.pop("spark")
|
||||
e.setdefault("kind", "transition")
|
||||
return e
|
||||
|
||||
|
||||
def summary() -> dict:
|
||||
"""Compact summary for the UI: known MACs, current state, recent events."""
|
||||
d = load()
|
||||
events = d.get("events", [])
|
||||
events = [_normalize_event(dict(e)) for e in d.get("events", [])]
|
||||
return {
|
||||
"macs": d.get("macs", {}),
|
||||
"current": d.get("current", {}),
|
||||
"last_change": d.get("last_change", {}),
|
||||
"events": events[-50:],
|
||||
"events": events[-80:],
|
||||
}
|
||||
|
||||
+38
-1
@@ -10,7 +10,7 @@ from pydantic import BaseModel
|
||||
from typing import Literal
|
||||
|
||||
from .config import Settings
|
||||
from .connectivity import get_mac, summary as connectivity_summary
|
||||
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
||||
from .custom_services import add_custom_service, delete_custom_service
|
||||
from .download import DownloadManager
|
||||
from .hardware import HardwareProbe
|
||||
@@ -136,6 +136,37 @@ async def get_connectivity() -> dict:
|
||||
return connectivity_summary()
|
||||
|
||||
|
||||
class HealthEventBody(BaseModel):
|
||||
service: str # e.g. "parakeet", "magpie", "vllm"
|
||||
ok: bool # true on success, false on failure
|
||||
source: str | None = None # what app reported (e.g. "open-webui")
|
||||
error: str | None = None # optional detail
|
||||
ms: int | None = None # optional latency
|
||||
|
||||
|
||||
@app.post("/api/health-event")
|
||||
async def post_health_event(body: HealthEventBody) -> dict:
|
||||
"""Passive endpoint: any LAN app can POST here when its call to one of our
|
||||
services succeeds or (more usefully) fails. We log the report into the
|
||||
connectivity history so a brief blip that polling misses still surfaces.
|
||||
|
||||
Example:
|
||||
curl -X POST http://<dashboard>/api/health-event \\
|
||||
-H 'content-type: application/json' \\
|
||||
-d '{"service":"parakeet","ok":false,"error":"503","source":"open-webui","ms":420}'
|
||||
"""
|
||||
if not body.service.strip():
|
||||
raise HTTPException(400, "service is required")
|
||||
event = record_report(
|
||||
body.service.strip(),
|
||||
ok=body.ok,
|
||||
source=(body.source or "external").strip(),
|
||||
detail=(body.error or "").strip(),
|
||||
latency_ms=body.ms,
|
||||
)
|
||||
return {"ok": True, "recorded": event}
|
||||
|
||||
|
||||
@app.post("/api/spark/{name}/wake")
|
||||
async def wake_spark(name: str) -> dict:
|
||||
"""Send a Wake-on-LAN magic packet for the named Spark.
|
||||
@@ -216,6 +247,8 @@ async def get_services() -> dict:
|
||||
results = await asyncio.gather(*[one(n) for n in services.keys()])
|
||||
for name, info in results:
|
||||
out[name] = info
|
||||
# Feed http reachability into the connectivity log (transition-only)
|
||||
record_state(name, bool(info.get("http_ready")))
|
||||
return out
|
||||
|
||||
|
||||
@@ -372,6 +405,10 @@ async def get_status() -> dict:
|
||||
check_parakeet(settings),
|
||||
check_magpie(settings),
|
||||
)
|
||||
# Feed health into the connectivity log (deduped — only logs on transition)
|
||||
record_state("vllm", bool(vllm.get("ok")))
|
||||
record_state("parakeet", bool(parakeet.get("ok")))
|
||||
record_state("magpie", bool(magpie.get("ok")))
|
||||
current_key = _identify_current_model(vllm.get("current_model"))
|
||||
return {
|
||||
"configured": settings.configured,
|
||||
|
||||
+56
-15
@@ -146,28 +146,42 @@ function openConnectivityDialog() {
|
||||
const c = state.connectivity || {};
|
||||
const events = c.events || [];
|
||||
if (events.length === 0) {
|
||||
content.innerHTML = '<div class="muted small">No transitions recorded yet. Once a Spark goes down and comes back, you\'ll see entries here.</div>';
|
||||
content.innerHTML = '<div class="muted small">No events recorded yet. Once a Spark or service goes down and back up (or an external app reports a failure), entries appear here.</div>';
|
||||
dlg.showModal();
|
||||
return;
|
||||
}
|
||||
const bySpark = {};
|
||||
const bySubject = {};
|
||||
for (const e of events) {
|
||||
(bySpark[e.spark] = bySpark[e.spark] || []).push(e);
|
||||
const subj = e.subject || e.spark || 'unknown'; // legacy fallback
|
||||
(bySubject[subj] = bySubject[subj] || []).push(e);
|
||||
}
|
||||
const html = Object.entries(bySpark).map(([spark, evs]) => {
|
||||
const downs = evs.filter(e => e.transition === 'down').length;
|
||||
const mac = c.macs?.[spark];
|
||||
// Sort subjects: hosts first, then services, alphabetical
|
||||
const hostOrder = ['spark1', 'spark2'];
|
||||
const subjects = Object.keys(bySubject).sort((a, b) => {
|
||||
const ia = hostOrder.indexOf(a);
|
||||
const ib = hostOrder.indexOf(b);
|
||||
if (ia >= 0 && ib >= 0) return ia - ib;
|
||||
if (ia >= 0) return -1;
|
||||
if (ib >= 0) return 1;
|
||||
return a.localeCompare(b);
|
||||
});
|
||||
|
||||
const html = subjects.map((subj) => {
|
||||
const evs = bySubject[subj];
|
||||
const transitions = evs.filter(e => (e.kind || 'transition') === 'transition');
|
||||
const reports = evs.filter(e => e.kind === 'report');
|
||||
const downs = transitions.filter(e => e.transition === 'down').length;
|
||||
const failedReports = reports.filter(e => !e.ok).length;
|
||||
const mac = c.macs?.[subj];
|
||||
const summaryParts = [];
|
||||
if (transitions.length) summaryParts.push(`${transitions.length} probe transition${transitions.length===1?'':'s'} (${downs} down)`);
|
||||
if (reports.length) summaryParts.push(`${reports.length} app report${reports.length===1?'':'s'} (${failedReports} failed)`);
|
||||
const isHost = hostOrder.includes(subj);
|
||||
return `
|
||||
<div class="conn-spark">
|
||||
<h4>${escapeHtml(spark)}${mac ? ` <span class="muted small">${escapeHtml(mac)}</span>` : ''}</h4>
|
||||
<div class="conn-summary">${evs.length} transition${evs.length===1?'':'s'} · ${downs} down event${downs===1?'':'s'} in window</div>
|
||||
${evs.slice(-25).reverse().map(e => `
|
||||
<div class="conn-event ${e.transition}">
|
||||
<span class="when">${escapeHtml(e.at.replace('T', ' ').replace('Z', ''))}</span>
|
||||
<span class="what">${e.transition === 'up' ? '↑ came back online' : '↓ dropped offline'}</span>
|
||||
<span class="dur">${e.down_seconds != null ? `was down ${fmtDuration(e.down_seconds)}` : ''}${e.up_seconds != null ? `was up ${fmtDuration(e.up_seconds)}` : ''}</span>
|
||||
</div>
|
||||
`).join('')}
|
||||
<h4>${escapeHtml(subj)}${isHost ? ' <span class="muted small">[host]</span>' : ' <span class="muted small">[service]</span>'}${mac ? ` <span class="muted small">${escapeHtml(mac)}</span>` : ''}</h4>
|
||||
<div class="conn-summary">${summaryParts.join(' · ') || 'no events'}</div>
|
||||
${evs.slice(-30).reverse().map(e => renderConnEvent(e)).join('')}
|
||||
</div>
|
||||
`;
|
||||
}).join('');
|
||||
@@ -175,6 +189,33 @@ function openConnectivityDialog() {
|
||||
dlg.showModal();
|
||||
}
|
||||
|
||||
function renderConnEvent(e) {
|
||||
const when = escapeHtml((e.at || '').replace('T', ' ').replace('Z', ''));
|
||||
const kind = e.kind || 'transition';
|
||||
if (kind === 'report') {
|
||||
const ok = !!e.ok;
|
||||
const source = escapeHtml(e.source || 'external');
|
||||
const detail = e.detail ? ` — ${escapeHtml(e.detail)}` : '';
|
||||
const latency = e.latency_ms != null ? ` (${e.latency_ms} ms)` : '';
|
||||
return `
|
||||
<div class="conn-event ${ok ? 'up' : 'down'} report">
|
||||
<span class="when">${when}</span>
|
||||
<span class="what">${ok ? '◷ report: ok' : '◷ report: failed'} <span class="muted">from</span> ${source}${detail}</span>
|
||||
<span class="dur">${latency}</span>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
const down = e.down_seconds != null ? `was down ${fmtDuration(e.down_seconds)}` : '';
|
||||
const up = e.up_seconds != null ? `was up ${fmtDuration(e.up_seconds)}` : '';
|
||||
return `
|
||||
<div class="conn-event ${e.transition}">
|
||||
<span class="when">${when}</span>
|
||||
<span class="what">${e.transition === 'up' ? '↑ came back online' : '↓ dropped offline'}</span>
|
||||
<span class="dur">${down}${up}</span>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
async function wakeSpark(name) {
|
||||
try {
|
||||
const r = await fetchJSON(`/api/spark/${name}/wake`, { method: 'POST' });
|
||||
|
||||
@@ -411,6 +411,8 @@ main {
|
||||
.conn-event .what { flex: 1; }
|
||||
.conn-event.up .what { color: var(--accent); }
|
||||
.conn-event.down .what { color: var(--error); }
|
||||
.conn-event.report .what { font-style: italic; }
|
||||
.conn-event .muted { color: var(--muted); font-style: normal; }
|
||||
.conn-event .dur { color: var(--muted); }
|
||||
.conn-summary { color: var(--muted); font-size: 11px; padding: 4px 0 10px; }
|
||||
.hw-metric { display: flex; align-items: center; gap: 10px; font-size: 12px; }
|
||||
|
||||
Reference in New Issue
Block a user