diff --git a/README.md b/README.md index 37b5493..1ec0bf7 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,24 @@ Other services on your LAN can hit `GET /api/endpoints` to learn where the curre `base_url` is filled in whenever Configure Sparks has been completed (even if the underlying service isn't currently up). Pair the URL with `ready: true` to safely route traffic. +## Reporting failures from external apps + +Spark Control polls every 5 s, so a brief blip in Parakeet/Magpie/vLLM availability can slip between polls and never make it into the connectivity log. To capture short failures, an external app (e.g. Open WebUI) can POST whenever a call fails (or succeeds), replacing `SPARK_CONTROL_HOST` with the address of your Spark Control instance: + +```bash +curl -X POST http://SPARK_CONTROL_HOST/api/health-event \ + -H 'content-type: application/json' \ + -d '{ + "service": "parakeet", + "ok": false, + "source": "open-webui", + "error": "HTTP 503", + "ms": 420 + }' +``` + +Fields: `service` (required), `ok` (required), `source` (optional, free-form; defaults to `external`), `error` (optional; stored as `detail`), `ms` (optional latency in milliseconds; stored as `latency_ms`). Each POST appends a `report` event to the connectivity log alongside the polling-based transition events. + ## Status **v0.2.3** — installed and verified on a Start9 server. Five bundled LLMs in the catalog (qwen3-vl, gemma4, qwen36, qwen3-235b-fp8, qwen2.5-72b), plus any custom models added through the UI. diff --git a/image/app/connectivity.py b/image/app/connectivity.py index 89e9be0..5d785d5 100644 --- a/image/app/connectivity.py +++ b/image/app/connectivity.py @@ -1,17 +1,28 @@ -"""Track Spark up/down transitions and cache discovered MAC addresses. +"""Track up/down transitions for any subject (Sparks AND services) and cache MACs. -Persisted to /data/connectivity.json so history survives package restarts: +Persisted to /data/connectivity.json. Schema: { "macs": { "spark1": "aa:bb:..", "spark2": "11:22:.." }, - "current": { "spark1": "up", "spark2": "down" }, - "last_change": { "spark1": "2026-05-12T15:00:00Z", ... 
}, + "current": { "spark1": "up", "parakeet": "up", "magpie": "down", ... }, + "last_change": { ... }, "events": [ - { "spark": "spark2", "at": "2026-05-12T17:30:00Z", "transition": "down" }, - { "spark": "spark2", "at": "2026-05-12T18:45:00Z", "transition": "up", "down_seconds": 4500 }, - ... + # Active-probe transition (logged when state flips during polling) + { "subject": "spark2", "at": "...", "kind": "transition", + "transition": "down" }, + { "subject": "spark2", "at": "...", "kind": "transition", + "transition": "up", "down_seconds": 4500 }, + + # Passive report (logged whenever an external app POSTs to + # /api/health-event regardless of state change) + { "subject": "parakeet", "at": "...", "kind": "report", + "ok": false, "source": "open-webui", + "detail": "Connection refused", "latency_ms": 320 }, ] } + +Legacy events from v0.5 with `spark` instead of `subject` and no `kind` field +are read transparently as kind="transition". """ from __future__ import annotations import json @@ -59,21 +70,24 @@ def load() -> dict: return d -def record_mac(spark: str, mac: Optional[str]) -> None: +def record_mac(subject: str, mac: Optional[str]) -> None: if not mac: return with _lock: d = _read() d.setdefault("macs", {}) - if d["macs"].get(spark) != mac: - d["macs"][spark] = mac + if d["macs"].get(subject) != mac: + d["macs"][subject] = mac _write(d) -def record_state(spark: str, reachable: bool) -> Optional[dict]: - """Update current state. If it differs from the last seen state, append an event. +def record_state(subject: str, reachable: bool) -> Optional[dict]: + """Update current state for `subject`. If it differs from the last seen + state, append a transition event. Returns the event dict if a transition + was recorded, else None. - Returns the event dict if a transition was recorded, else None. + `subject` can be a Spark host key (spark1/spark2) or a service name + (parakeet/magpie/vllm). 
""" new_state = "up" if reachable else "down" now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") @@ -83,12 +97,17 @@ def record_state(spark: str, reachable: bool) -> Optional[dict]: d.setdefault("current", {}) d.setdefault("last_change", {}) d.setdefault("events", []) - prev = d["current"].get(spark) + prev = d["current"].get(subject) if prev == new_state: return None - event: dict = {"spark": spark, "at": now, "transition": new_state} + event: dict = { + "subject": subject, + "at": now, + "kind": "transition", + "transition": new_state, + } # When we have a previous state and timestamp, compute duration - last_change = d["last_change"].get(spark) + last_change = d["last_change"].get(subject) if prev and last_change: try: prev_dt = datetime.fromisoformat(last_change.replace("Z", "+00:00")) @@ -99,28 +118,73 @@ def record_state(spark: str, reachable: bool) -> Optional[dict]: event["up_seconds"] = round(duration) except ValueError: pass - d["current"][spark] = new_state - d["last_change"][spark] = now + d["current"][subject] = new_state + d["last_change"][subject] = now d["events"].append(event) - # Keep rolling window if len(d["events"]) > MAX_EVENTS: d["events"] = d["events"][-MAX_EVENTS:] _write(d) return event -def get_mac(spark: str) -> Optional[str]: +def record_report( + subject: str, + *, + ok: bool, + source: str = "external", + detail: str = "", + latency_ms: Optional[int] = None, +) -> dict: + """Record a passive report from an external caller (e.g. Open WebUI got a + 503 calling Parakeet). Always appended to the events list; does NOT change + the active-probe state (which only the polling probe is authoritative on). 
+ """ + now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") + with _lock: + d = _read() + d.setdefault("events", []) + event: dict = { + "subject": subject, + "at": now, + "kind": "report", + "ok": bool(ok), + "source": source or "external", + } + if detail: + event["detail"] = detail + if latency_ms is not None: + event["latency_ms"] = int(latency_ms) + d["events"].append(event) + if len(d["events"]) > MAX_EVENTS: + d["events"] = d["events"][-MAX_EVENTS:] + _write(d) + return event + + +def get_mac(subject: str) -> Optional[str]: d = load() - return d.get("macs", {}).get(spark) + return d.get("macs", {}).get(subject) + + +def _normalize_event(e: dict) -> dict: + """Promote legacy v0.5 events to the v0.6 shape so the UI sees one schema.""" + if "subject" in e: + e.setdefault("kind", "transition") + return e + # Legacy: had "spark" + "transition" only + if "spark" in e: + e["subject"] = e.pop("spark") + e.setdefault("kind", "transition") + return e def summary() -> dict: """Compact summary for the UI: known MACs, current state, recent events.""" d = load() - events = d.get("events", []) + events = [_normalize_event(dict(e)) for e in d.get("events", [])] return { "macs": d.get("macs", {}), "current": d.get("current", {}), "last_change": d.get("last_change", {}), - "events": events[-50:], + "events": events[-80:], } diff --git a/image/app/server.py b/image/app/server.py index 55cc15e..5beec4d 100644 --- a/image/app/server.py +++ b/image/app/server.py @@ -10,7 +10,7 @@ from pydantic import BaseModel from typing import Literal from .config import Settings -from .connectivity import get_mac, summary as connectivity_summary +from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary from .custom_services import add_custom_service, delete_custom_service from .download import DownloadManager from .hardware import HardwareProbe @@ -136,6 +136,37 @@ async def get_connectivity() -> dict: return connectivity_summary() +class 
HealthEventBody(BaseModel): + service: str # e.g. "parakeet", "magpie", "vllm" + ok: bool # true on success, false on failure + source: str | None = None # what app reported (e.g. "open-webui") + error: str | None = None # optional detail + ms: int | None = None # optional latency + + +@app.post("/api/health-event") +async def post_health_event(body: HealthEventBody) -> dict: + """Passive endpoint: any LAN app can POST here when its call to one of our + services succeeds or (more usefully) fails. We log the report into the + connectivity history so a brief blip that polling misses still surfaces. + + Example (replace SPARK_CONTROL_HOST with this server's address): + curl -X POST http://SPARK_CONTROL_HOST/api/health-event \\ + -H 'content-type: application/json' \\ + -d '{"service":"parakeet","ok":false,"error":"503","source":"open-webui","ms":420}' + """ + if not body.service.strip(): + raise HTTPException(400, "service is required") + event = record_report( + body.service.strip(), + ok=body.ok, + source=(body.source or "external").strip(), + detail=(body.error or "").strip(), + latency_ms=body.ms, + ) + return {"ok": True, "recorded": event} + + @app.post("/api/spark/{name}/wake") async def wake_spark(name: str) -> dict: """Send a Wake-on-LAN magic packet for the named Spark. 
@@ -216,6 +247,8 @@ async def get_services() -> dict: results = await asyncio.gather(*[one(n) for n in services.keys()]) for name, info in results: out[name] = info + # Feed http reachability into the connectivity log (transition-only) + record_state(name, bool(info.get("http_ready"))) return out @@ -372,6 +405,10 @@ async def get_status() -> dict: check_parakeet(settings), check_magpie(settings), ) + # Feed health into the connectivity log (deduped — only logs on transition) + record_state("vllm", bool(vllm.get("ok"))) + record_state("parakeet", bool(parakeet.get("ok"))) + record_state("magpie", bool(magpie.get("ok"))) current_key = _identify_current_model(vllm.get("current_model")) return { "configured": settings.configured, diff --git a/image/app/static/app.js b/image/app/static/app.js index 7f05150..fee7515 100644 --- a/image/app/static/app.js +++ b/image/app/static/app.js @@ -146,28 +146,42 @@ function openConnectivityDialog() { const c = state.connectivity || {}; const events = c.events || []; if (events.length === 0) { - content.innerHTML = '
No transitions recorded yet. Once a Spark goes down and comes back, you\'ll see entries here.
'; + content.innerHTML = '
No events recorded yet. Once a Spark or service goes down and back up (or an external app reports a failure), entries appear here.
'; dlg.showModal(); return; } - const bySpark = {}; + const bySubject = {}; for (const e of events) { - (bySpark[e.spark] = bySpark[e.spark] || []).push(e); + const subj = e.subject || e.spark || 'unknown'; // legacy fallback + (bySubject[subj] = bySubject[subj] || []).push(e); } - const html = Object.entries(bySpark).map(([spark, evs]) => { - const downs = evs.filter(e => e.transition === 'down').length; - const mac = c.macs?.[spark]; + // Sort subjects: hosts first, then services, alphabetical + const hostOrder = ['spark1', 'spark2']; + const subjects = Object.keys(bySubject).sort((a, b) => { + const ia = hostOrder.indexOf(a); + const ib = hostOrder.indexOf(b); + if (ia >= 0 && ib >= 0) return ia - ib; + if (ia >= 0) return -1; + if (ib >= 0) return 1; + return a.localeCompare(b); + }); + + const html = subjects.map((subj) => { + const evs = bySubject[subj]; + const transitions = evs.filter(e => (e.kind || 'transition') === 'transition'); + const reports = evs.filter(e => e.kind === 'report'); + const downs = transitions.filter(e => e.transition === 'down').length; + const failedReports = reports.filter(e => !e.ok).length; + const mac = c.macs?.[subj]; + const summaryParts = []; + if (transitions.length) summaryParts.push(`${transitions.length} probe transition${transitions.length===1?'':'s'} (${downs} down)`); + if (reports.length) summaryParts.push(`${reports.length} app report${reports.length===1?'':'s'} (${failedReports} failed)`); + const isHost = hostOrder.includes(subj); return `
-

${escapeHtml(spark)}${mac ? ` ${escapeHtml(mac)}` : ''}

-
${evs.length} transition${evs.length===1?'':'s'} · ${downs} down event${downs===1?'':'s'} in window
- ${evs.slice(-25).reverse().map(e => ` -
- ${escapeHtml(e.at.replace('T', ' ').replace('Z', ''))} - ${e.transition === 'up' ? '↑ came back online' : '↓ dropped offline'} - ${e.down_seconds != null ? `was down ${fmtDuration(e.down_seconds)}` : ''}${e.up_seconds != null ? `was up ${fmtDuration(e.up_seconds)}` : ''} -
- `).join('')} +

${escapeHtml(subj)}${isHost ? ' [host]' : ' [service]'}${mac ? ` ${escapeHtml(mac)}` : ''}

+
${summaryParts.join(' · ') || 'no events'}
+ ${evs.slice(-30).reverse().map(e => renderConnEvent(e)).join('')}
`; }).join(''); @@ -175,6 +189,33 @@ function openConnectivityDialog() { dlg.showModal(); } +function renderConnEvent(e) { + const when = escapeHtml((e.at || '').replace('T', ' ').replace('Z', '')); + const kind = e.kind || 'transition'; + if (kind === 'report') { + const ok = !!e.ok; + const source = escapeHtml(e.source || 'external'); + const detail = e.detail ? ` — ${escapeHtml(e.detail)}` : ''; + const latency = e.latency_ms != null ? ` (${e.latency_ms} ms)` : ''; + return ` +
+ ${when} + ${ok ? '◷ report: ok' : '◷ report: failed'} from ${source}${detail} + ${latency} +
+ `; + } + const down = e.down_seconds != null ? `was down ${fmtDuration(e.down_seconds)}` : ''; + const up = e.up_seconds != null ? `was up ${fmtDuration(e.up_seconds)}` : ''; + return ` +
+ ${when} + ${e.transition === 'up' ? '↑ came back online' : '↓ dropped offline'} + ${down}${up} +
+ `; +} + async function wakeSpark(name) { try { const r = await fetchJSON(`/api/spark/${name}/wake`, { method: 'POST' }); diff --git a/image/app/static/style.css b/image/app/static/style.css index bbdaadd..302c7b3 100644 --- a/image/app/static/style.css +++ b/image/app/static/style.css @@ -411,6 +411,8 @@ main { .conn-event .what { flex: 1; } .conn-event.up .what { color: var(--accent); } .conn-event.down .what { color: var(--error); } +.conn-event.report .what { font-style: italic; } +.conn-event .muted { color: var(--muted); font-style: normal; } .conn-event .dur { color: var(--muted); } .conn-summary { color: var(--muted); font-size: 11px; padding: 4px 0 10px; } .hw-metric { display: flex; align-items: center; gap: 10px; font-size: 12px; } diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts index f91643d..bbf8673 100644 --- a/package/startos/versions/v0_1_0.ts +++ b/package/startos/versions/v0_1_0.ts @@ -1,10 +1,10 @@ import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk' export const v0_1_0 = VersionInfo.of({ - version: '0.5.0:0', + version: '0.6.0:0', releaseNotes: { en_US: - 'v0.5: Wake-on-LAN + connectivity history. Each Spark\'s MAC is now auto-discovered during the normal hardware sweep and cached in /data/connectivity.json. Up/down transitions are logged with duration. Unreachable hardware cards get a "Wake (WoL)" button that sends a magic packet (preferring the other Spark as the sender so it originates on the right LAN segment). New "Connectivity log" button in the hardware section shows the recent transitions for each Spark — useful for spotting patterns (e.g. always-at-noon dropouts).', + 'v0.6: Service-level connectivity tracking and a passive failure-report endpoint. The connectivity log now records up/down transitions for Parakeet, Magpie, and vLLM in addition to the Spark hosts (driven by the existing /api/status and /api/services polling). A new POST /api/health-event endpoint lets external apps (e.g. 
Open WebUI) record failures they observed even when the failure was brief enough to slip between polls. The Connectivity log dialog shows hosts and services with separate badges, and reports appear inline with their source app + error detail.', }, migrations: { up: async ({ effects }) => {},