Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ee8c2406b8 | |||
| a02f4db850 |
@@ -84,6 +84,24 @@ Other services on your LAN can hit `GET /api/endpoints` to learn where the curre
|
|||||||
|
|
||||||
`base_url` is filled in whenever Configure Sparks has been completed (even if the underlying service isn't currently up). Pair the URL with `ready: true` to safely route traffic.
|
`base_url` is filled in whenever Configure Sparks has been completed (even if the underlying service isn't currently up). Pair the URL with `ready: true` to safely route traffic.
|
||||||
|
|
||||||
|
## Reporting failures from external apps
|
||||||
|
|
||||||
|
Spark Control polls every 5 s, so a brief blip in Parakeet/Magpie/vLLM availability can slip between polls and never make it into the connectivity log. To capture short failures, an external app (e.g. Open WebUI) can POST whenever a call fails (or succeeds):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://<dashboard-url>/api/health-event \
|
||||||
|
-H 'content-type: application/json' \
|
||||||
|
-d '{
|
||||||
|
"service": "parakeet",
|
||||||
|
"ok": false,
|
||||||
|
"source": "open-webui",
|
||||||
|
"error": "HTTP 503",
|
||||||
|
"ms": 420
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Fields: `service` (required), `ok` (required), `source` (optional, free-form; defaults to `external`), `error` (optional detail, stored in the log as `detail`), `ms` (optional latency in milliseconds, stored as `latency_ms`). Each POST appends a `report` event to the connectivity log alongside the polling-based transition events.
|
||||||
|
|
||||||
## Status
|
## Status
|
||||||
|
|
||||||
**v0.2.3** — installed and verified on a Start9 server. Five bundled LLMs in the catalog (qwen3-vl, gemma4, qwen36, qwen3-235b-fp8, qwen2.5-72b), plus any custom models added through the UI.
|
**v0.2.3** — installed and verified on a Start9 server. Five bundled LLMs in the catalog (qwen3-vl, gemma4, qwen36, qwen3-235b-fp8, qwen2.5-72b), plus any custom models added through the UI.
|
||||||
|
|||||||
@@ -0,0 +1,190 @@
|
|||||||
|
"""Track up/down transitions for any subject (Sparks AND services) and cache MACs.
|
||||||
|
|
||||||
|
Persisted to /data/connectivity.json. Schema:
|
||||||
|
|
||||||
|
{
|
||||||
|
"macs": { "spark1": "aa:bb:..", "spark2": "11:22:.." },
|
||||||
|
"current": { "spark1": "up", "parakeet": "up", "magpie": "down", ... },
|
||||||
|
"last_change": { ... },
|
||||||
|
"events": [
|
||||||
|
# Active-probe transition (logged when state flips during polling)
|
||||||
|
{ "subject": "spark2", "at": "...", "kind": "transition",
|
||||||
|
"transition": "down" },
|
||||||
|
{ "subject": "spark2", "at": "...", "kind": "transition",
|
||||||
|
"transition": "up", "down_seconds": 4500 },
|
||||||
|
|
||||||
|
# Passive report (logged whenever an external app POSTs to
|
||||||
|
# /api/health-event regardless of state change)
|
||||||
|
{ "subject": "parakeet", "at": "...", "kind": "report",
|
||||||
|
"ok": false, "source": "open-webui",
|
||||||
|
"detail": "Connection refused", "latency_ms": 320 },
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
Legacy events from v0.5 with `spark` instead of `subject` and no `kind` field
|
||||||
|
are read transparently as kind="transition".
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import threading
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
MAX_EVENTS = 200 # rolling window — plenty for showing recent history
|
||||||
|
|
||||||
|
|
||||||
|
def _path() -> str:
|
||||||
|
return os.environ.get("CONNECTIVITY_LOG", "/data/connectivity.json")
|
||||||
|
|
||||||
|
|
||||||
|
_lock = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def _read() -> dict:
    """Load the persisted connectivity document, or ``{}`` if it is unusable.

    A missing file, malformed JSON, a JSON ``null``, or a top-level JSON value
    that is not an object all yield an empty dict, so callers can always treat
    the result as a mapping.
    """
    try:
        with open(_path()) as f:
            data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}
    # Callers immediately use dict methods (setdefault/get) on the result; a
    # hand-edited or corrupted file holding a list/string/number would crash
    # them, so treat it the same as an unreadable file.
    return data if isinstance(data, dict) else {}
|
||||||
|
|
||||||
|
|
||||||
|
def _write(data: dict) -> None:
    """Persist `data` as indented JSON using write-then-rename.

    The document is written to a sibling ``.tmp`` file first and then moved
    over the real path with ``os.replace``. The parent directory is created
    if it does not exist yet.
    """
    target = _path()
    Path(target).parent.mkdir(parents=True, exist_ok=True)
    scratch = f"{target}.tmp"
    with open(scratch, "w") as out:
        json.dump(data, out, indent=2, sort_keys=False)
    os.replace(scratch, target)
|
||||||
|
|
||||||
|
|
||||||
|
def load() -> dict:
    """Return the persisted document with every top-level section present.

    Missing ``macs``, ``current`` and ``last_change`` sections default to an
    empty dict; a missing ``events`` section defaults to an empty list.
    """
    with _lock:
        doc = _read()
    sections = (
        ("macs", dict),
        ("current", dict),
        ("last_change", dict),
        ("events", list),
    )
    for key, factory in sections:
        doc.setdefault(key, factory())
    return doc
|
||||||
|
|
||||||
|
|
||||||
|
def record_mac(subject: str, mac: Optional[str]) -> None:
    """Cache `mac` for `subject`, rewriting the file only when it changed.

    A falsy `mac` (``None`` or empty string) is ignored.
    """
    if not mac:
        return
    with _lock:
        doc = _read()
        known = doc.setdefault("macs", {})
        if known.get(subject) == mac:
            # Already stored — skip the disk write.
            return
        known[subject] = mac
        _write(doc)
|
||||||
|
|
||||||
|
|
||||||
|
def record_state(subject: str, reachable: bool) -> Optional[dict]:
    """Update the current state for `subject` and log a transition if it flipped.

    `subject` can be a Spark host key (spark1/spark2) or a service name
    (parakeet/magpie/vllm).

    Returns the appended event dict when the state differs from the last
    recorded one (including the first time a subject is seen), else ``None``.
    When the previous state and its timestamp are known, the event carries
    ``down_seconds`` (for a down -> up flip) or ``up_seconds`` (for an
    up -> down flip).
    """
    new_state = "up" if reachable else "down"
    with _lock:
        # Take a single timestamp under the lock so the event's "at", the
        # stored last_change and the computed duration all agree.
        now_dt = datetime.now(timezone.utc)
        now = now_dt.isoformat().replace("+00:00", "Z")
        d = _read()
        d.setdefault("macs", {})
        d.setdefault("current", {})
        d.setdefault("last_change", {})
        d.setdefault("events", [])
        prev = d["current"].get(subject)
        if prev == new_state:
            return None
        event: dict = {
            "subject": subject,
            "at": now,
            "kind": "transition",
            "transition": new_state,
        }
        # When we have a previous state and timestamp, compute duration.
        last_change = d["last_change"].get(subject)
        duration = None
        if prev and last_change:
            try:
                prev_dt = datetime.fromisoformat(last_change.replace("Z", "+00:00"))
                if prev_dt.tzinfo is None:
                    # A naive timestamp cannot be subtracted from an aware
                    # one; the file only ever stores UTC, so assume UTC.
                    prev_dt = prev_dt.replace(tzinfo=timezone.utc)
                duration = (now_dt - prev_dt).total_seconds()
            except (AttributeError, TypeError, ValueError):
                # Unparseable or non-string timestamp in the persisted file:
                # still log the transition, just without a duration.
                duration = None
        if duration is not None:
            if prev == "down" and new_state == "up":
                event["down_seconds"] = round(duration)
            if prev == "up" and new_state == "down":
                event["up_seconds"] = round(duration)
        d["current"][subject] = new_state
        d["last_change"][subject] = now
        d["events"].append(event)
        if len(d["events"]) > MAX_EVENTS:
            # Keep only the newest MAX_EVENTS entries (rolling window).
            d["events"] = d["events"][-MAX_EVENTS:]
        _write(d)
        return event
|
||||||
|
|
||||||
|
|
||||||
|
def record_report(
    subject: str,
    *,
    ok: bool,
    source: str = "external",
    detail: str = "",
    latency_ms: Optional[int] = None,
) -> dict:
    """Append a passive ``report`` event submitted by an external caller.

    The event is always added to the events list (trimmed to the newest
    ``MAX_EVENTS``). The ``current`` / ``last_change`` sections are left
    untouched: only the polling probe updates those.

    ``detail`` is stored only when non-empty and ``latency_ms`` only when it
    is not ``None``; an empty ``source`` falls back to ``"external"``.
    Returns the event dict that was stored.
    """
    stamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    entry: dict = {
        "subject": subject,
        "at": stamp,
        "kind": "report",
        "ok": bool(ok),
        "source": source or "external",
    }
    if detail:
        entry["detail"] = detail
    if latency_ms is not None:
        entry["latency_ms"] = int(latency_ms)

    with _lock:
        doc = _read()
        log = doc.setdefault("events", [])
        log.append(entry)
        if len(log) > MAX_EVENTS:
            doc["events"] = log[-MAX_EVENTS:]
        _write(doc)
    return entry
|
||||||
|
|
||||||
|
|
||||||
|
def get_mac(subject: str) -> Optional[str]:
    """Return the cached MAC address for `subject`, or ``None`` if unknown."""
    known_macs = load().get("macs", {})
    return known_macs.get(subject)
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_event(e: dict) -> dict:
|
||||||
|
"""Promote legacy v0.5 events to the v0.6 shape so the UI sees one schema."""
|
||||||
|
if "subject" in e:
|
||||||
|
e.setdefault("kind", "transition")
|
||||||
|
return e
|
||||||
|
# Legacy: had "spark" + "transition" only
|
||||||
|
if "spark" in e:
|
||||||
|
e["subject"] = e.pop("spark")
|
||||||
|
e.setdefault("kind", "transition")
|
||||||
|
return e
|
||||||
|
|
||||||
|
|
||||||
|
def summary() -> dict:
    """Build the compact payload for the UI.

    Contains the known MACs, the current state per subject, the last-change
    timestamps, and the 80 most recent events (legacy events are normalized
    on copies, so the loaded document is not modified).
    """
    doc = load()
    normalized = []
    for raw in doc.get("events", []):
        normalized.append(_normalize_event(dict(raw)))
    return {
        "macs": doc.get("macs", {}),
        "current": doc.get("current", {}),
        "last_change": doc.get("last_change", {}),
        "events": normalized[-80:],
    }
|
||||||
+12
-4
@@ -10,6 +10,7 @@ import time
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from .config import Settings
|
from .config import Settings
|
||||||
|
from .connectivity import record_mac, record_state
|
||||||
from .ssh import ssh_run
|
from .ssh import ssh_run
|
||||||
|
|
||||||
|
|
||||||
@@ -23,6 +24,8 @@ echo MEMORY=$(free -b 2>/dev/null | awk '/^Mem:/ {print $2, $3}')
|
|||||||
echo DISK=$(df -B1 / 2>/dev/null | awk 'NR==2 {print $2, $3}')
|
echo DISK=$(df -B1 / 2>/dev/null | awk 'NR==2 {print $2, $3}')
|
||||||
echo GPU=$(nvidia-smi --query-gpu=name,utilization.gpu,temperature.gpu,power.draw,memory.total --format=csv,noheader,nounits 2>/dev/null | head -1)
|
echo GPU=$(nvidia-smi --query-gpu=name,utilization.gpu,temperature.gpu,power.draw,memory.total --format=csv,noheader,nounits 2>/dev/null | head -1)
|
||||||
echo GPU_MEM_USED_MIB=$(nvidia-smi --query-compute-apps=used_gpu_memory --format=csv,noheader,nounits 2>/dev/null | awk '{s+=$1} END {print s+0}')
|
echo GPU_MEM_USED_MIB=$(nvidia-smi --query-compute-apps=used_gpu_memory --format=csv,noheader,nounits 2>/dev/null | awk '{s+=$1} END {print s+0}')
|
||||||
|
DEFIF=$(ip route show default 2>/dev/null | awk '{print $5; exit}')
|
||||||
|
echo MAC=$(cat /sys/class/net/$DEFIF/address 2>/dev/null)
|
||||||
""".strip()
|
""".strip()
|
||||||
|
|
||||||
|
|
||||||
@@ -78,6 +81,9 @@ def _parse(out: str) -> dict:
|
|||||||
# Sum per-process compute memory (works even on unified-memory systems)
|
# Sum per-process compute memory (works even on unified-memory systems)
|
||||||
if info.get("gpu_mem_used_mib"):
|
if info.get("gpu_mem_used_mib"):
|
||||||
parsed["gpu_mem_used_mib"] = _parse_int(info["gpu_mem_used_mib"])
|
parsed["gpu_mem_used_mib"] = _parse_int(info["gpu_mem_used_mib"])
|
||||||
|
# MAC address on the default-route interface (for Wake-on-LAN)
|
||||||
|
if info.get("mac"):
|
||||||
|
parsed["mac"] = info["mac"].lower()
|
||||||
return parsed
|
return parsed
|
||||||
|
|
||||||
|
|
||||||
@@ -118,12 +124,14 @@ class HardwareProbe:
|
|||||||
# marked this host unreachable, return the cached failure immediately.
|
# marked this host unreachable, return the cached failure immediately.
|
||||||
rc, out, err = await ssh_run(host, user, _PROBE, self.settings, timeout=6)
|
rc, out, err = await ssh_run(host, user, _PROBE, self.settings, timeout=6)
|
||||||
if rc != 0:
|
if rc != 0:
|
||||||
# Cache failures for a slightly longer TTL so the dashboard isn't
|
|
||||||
# blocked behind 6 s of SSH timeout on every poll.
|
|
||||||
result = {"reachable": False, "configured": True, "host": host, "error": err.strip() or out.strip() or f"rc={rc}"}
|
result = {"reachable": False, "configured": True, "host": host, "error": err.strip() or out.strip() or f"rc={rc}"}
|
||||||
self._cache[key] = (now, result)
|
self._cache[key] = (now, result)
|
||||||
# Override the TTL effectively by inserting a sentinel into the cache age
|
record_state(key, False)
|
||||||
return result
|
return result
|
||||||
result = {"reachable": True, "configured": True, "host": host, **_parse(out)}
|
parsed = _parse(out)
|
||||||
|
result = {"reachable": True, "configured": True, "host": host, **parsed}
|
||||||
self._cache[key] = (now, result)
|
self._cache[key] = (now, result)
|
||||||
|
record_state(key, True)
|
||||||
|
if parsed.get("mac"):
|
||||||
|
record_mac(key, parsed["mac"])
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from pydantic import BaseModel
|
|||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
||||||
from .config import Settings
|
from .config import Settings
|
||||||
|
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
||||||
from .custom_services import add_custom_service, delete_custom_service
|
from .custom_services import add_custom_service, delete_custom_service
|
||||||
from .download import DownloadManager
|
from .download import DownloadManager
|
||||||
from .hardware import HardwareProbe
|
from .hardware import HardwareProbe
|
||||||
@@ -21,6 +22,7 @@ from .services import docker_state, run_action, services_from_settings
|
|||||||
from .ssh import ssh_run
|
from .ssh import ssh_run
|
||||||
from .swap import SwapManager
|
from .swap import SwapManager
|
||||||
from .updates import UpdateManager, get_update_status
|
from .updates import UpdateManager, get_update_status
|
||||||
|
from .wol import send_local_broadcast, send_via_peer
|
||||||
|
|
||||||
|
|
||||||
settings = Settings.from_env()
|
settings = Settings.from_env()
|
||||||
@@ -128,6 +130,81 @@ async def get_hardware() -> dict:
|
|||||||
return await hardware_probe.fetch()
|
return await hardware_probe.fetch()
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/connectivity")
async def get_connectivity() -> dict:
    """Return the connectivity summary: cached MACs, current state, recent events."""
    snapshot = connectivity_summary()
    return snapshot
|
||||||
|
|
||||||
|
|
||||||
|
class HealthEventBody(BaseModel):
    # Request payload accepted by POST /api/health-event. Only `service` and
    # `ok` are required; the optional fields default to None.
    service: str  # e.g. "parakeet", "magpie", "vllm"
    ok: bool  # true on success, false on failure
    source: str | None = None  # what app reported (e.g. "open-webui")
    error: str | None = None  # optional detail
    ms: int | None = None  # optional latency
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/health-event")
async def post_health_event(body: HealthEventBody) -> dict:
    """Record a health report pushed by an external app.

    Any LAN app can POST here when its call to one of our services succeeds
    or (more usefully) fails. The report is appended to the connectivity
    history, so a brief blip that polling misses still surfaces.

    Responds with HTTP 400 when `service` is blank after trimming; otherwise
    returns ``{"ok": True, "recorded": <event>}``.

    Example:
        curl -X POST http://<dashboard>/api/health-event \\
            -H 'content-type: application/json' \\
            -d '{"service":"parakeet","ok":false,"error":"503","source":"open-webui","ms":420}'
    """
    service = body.service.strip()
    if not service:
        raise HTTPException(400, "service is required")
    reporter = (body.source or "external").strip()
    message = (body.error or "").strip()
    recorded = record_report(
        service,
        ok=body.ok,
        source=reporter,
        detail=message,
        latency_ms=body.ms,
    )
    return {"ok": True, "recorded": recorded}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/spark/{name}/wake")
async def wake_spark(name: str) -> dict:
    """Send a Wake-on-LAN magic packet for the named Spark.

    Tries the OTHER Spark first (when its host and user are configured)
    because the packet has to originate on the target's LAN segment to be
    reliable. If that path reports failure or raises, falls back to a direct
    UDP broadcast from this container.

    Raises:
        HTTPException(404): `name` is not ``spark1`` or ``spark2``.
        HTTPException(400): no MAC address has been cached for `name` yet.
        HTTPException(500): the container broadcast failed after the peer
            path did not deliver; the detail carries both errors.
    """
    if name not in ("spark1", "spark2"):
        raise HTTPException(404, f"unknown spark: {name}")
    mac = get_mac(name)
    if not mac:
        raise HTTPException(400, f"MAC for {name} not yet known; bring it up once so we can probe it, then this will work next time it sleeps")

    # Pick the peer Spark and its SSH coordinates.
    other = "spark2" if name == "spark1" else "spark1"
    other_host = settings.spark1_host if other == "spark1" else settings.spark2_host
    other_user = settings.spark1_user if other == "spark1" else settings.spark2_user

    delivered_via = None
    via_peer_ok = False
    via_peer_err = ""
    if other_host and other_user:
        try:
            via_peer_ok, via_peer_err = await send_via_peer(other_host, other_user, mac, settings)
        except Exception as e:
            # The peer path is best-effort: an unexpected error there must
            # not skip the container fallback below.
            via_peer_ok, via_peer_err = False, repr(e)
        if via_peer_ok:
            delivered_via = other

    if not via_peer_ok:
        # Fall back to direct from this container
        try:
            send_local_broadcast(mac)
            delivered_via = "container"
        except Exception as e:
            raise HTTPException(500, f"WoL failed: peer={via_peer_err!r} container={e!r}") from e

    return {"ok": True, "spark": name, "mac": mac, "delivered_via": delivered_via}
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/services")
|
@app.get("/api/services")
|
||||||
async def get_services() -> dict:
|
async def get_services() -> dict:
|
||||||
"""Lifecycle state of always-on support services (Parakeet, Magpie, …).
|
"""Lifecycle state of always-on support services (Parakeet, Magpie, …).
|
||||||
@@ -170,6 +247,8 @@ async def get_services() -> dict:
|
|||||||
results = await asyncio.gather(*[one(n) for n in services.keys()])
|
results = await asyncio.gather(*[one(n) for n in services.keys()])
|
||||||
for name, info in results:
|
for name, info in results:
|
||||||
out[name] = info
|
out[name] = info
|
||||||
|
# Feed http reachability into the connectivity log (transition-only)
|
||||||
|
record_state(name, bool(info.get("http_ready")))
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
@@ -326,6 +405,10 @@ async def get_status() -> dict:
|
|||||||
check_parakeet(settings),
|
check_parakeet(settings),
|
||||||
check_magpie(settings),
|
check_magpie(settings),
|
||||||
)
|
)
|
||||||
|
# Feed health into the connectivity log (deduped — only logs on transition)
|
||||||
|
record_state("vllm", bool(vllm.get("ok")))
|
||||||
|
record_state("parakeet", bool(parakeet.get("ok")))
|
||||||
|
record_state("magpie", bool(magpie.get("ok")))
|
||||||
current_key = _identify_current_model(vllm.get("current_model"))
|
current_key = _identify_current_model(vllm.get("current_model"))
|
||||||
return {
|
return {
|
||||||
"configured": settings.configured,
|
"configured": settings.configured,
|
||||||
|
|||||||
+117
-1
@@ -121,10 +121,110 @@ function bar(usedPct, warn) {
|
|||||||
// Refresh state.hardware (and, best-effort, state.connectivity) from the API,
// then re-render the hardware panel.
async function pollHardware() {
  try {
    state.hardware = await fetchJSON('/api/hardware');
    // A failed connectivity fetch is ignored so the hardware cards still
    // render; the previous state.connectivity value is kept.
    try { state.connectivity = await fetchJSON('/api/connectivity'); } catch {}
    renderHardware();
  } catch (e) { console.warn('hardware poll failed', e); }
}
|
||||||
|
|
||||||
|
/**
 * Format a duration in seconds as a short human-readable string
 * ("42s", "5m", "1h 30m", "2d 3h").
 *
 * Rounds to the displayed unit first and then carries into the next larger
 * unit, so values just under a boundary never render as "60s", "60m",
 * "1h 60m" or "1d 24h".
 *
 * @param {number|null|undefined} sec duration in seconds
 * @returns {string} formatted duration, or '' when sec is null/undefined
 */
function fmtDuration(sec) {
  if (sec == null) return '';
  const s = Math.round(sec);
  if (s < 60) return `${s}s`;
  const totalMin = Math.round(s / 60);
  if (totalMin < 60) return `${totalMin}m`;
  if (totalMin < 24 * 60) {
    const h = Math.floor(totalMin / 60);
    const m = totalMin % 60;
    return m ? `${h}h ${m}m` : `${h}h`;
  }
  const totalHours = Math.round(s / 3600);
  const d = Math.floor(totalHours / 24);
  const h = totalHours % 24;
  return h ? `${d}d ${h}h` : `${d}d`;
}
|
||||||
|
|
||||||
|
// Render the connectivity log (grouped by subject, hosts first) into the
// dialog and open it. Shows a placeholder when no events exist yet.
function openConnectivityDialog() {
  const dialog = el('#connectivity-dialog');
  const body = el('#connectivity-content');
  const conn = state.connectivity || {};
  const allEvents = conn.events || [];

  if (allEvents.length === 0) {
    body.innerHTML = '<div class="muted small">No events recorded yet. Once a Spark or service goes down and back up (or an external app reports a failure), entries appear here.</div>';
    dialog.showModal();
    return;
  }

  // Bucket events by subject; `spark` is the legacy field name.
  const grouped = {};
  for (const ev of allEvents) {
    const key = ev.subject || ev.spark || 'unknown';
    if (!grouped[key]) grouped[key] = [];
    grouped[key].push(ev);
  }

  // Hosts come first in their fixed order, then services alphabetically.
  const hostOrder = ['spark1', 'spark2'];
  const subjects = Object.keys(grouped).sort((a, b) => {
    const ra = hostOrder.indexOf(a);
    const rb = hostOrder.indexOf(b);
    if (ra < 0 && rb < 0) return a.localeCompare(b);
    if (ra < 0) return 1;
    if (rb < 0) return -1;
    return ra - rb;
  });

  const plural = (n) => (n === 1 ? '' : 's');
  const sections = [];
  for (const subj of subjects) {
    const evs = grouped[subj];
    const transitions = evs.filter((ev) => (ev.kind || 'transition') === 'transition');
    const reports = evs.filter((ev) => ev.kind === 'report');
    const downs = transitions.filter((ev) => ev.transition === 'down').length;
    const failedReports = reports.filter((ev) => !ev.ok).length;
    const mac = conn.macs?.[subj];

    const summaryParts = [];
    if (transitions.length) {
      summaryParts.push(`${transitions.length} probe transition${plural(transitions.length)} (${downs} down)`);
    }
    if (reports.length) {
      summaryParts.push(`${reports.length} app report${plural(reports.length)} (${failedReports} failed)`);
    }

    const tag = hostOrder.includes(subj) ? '[host]' : '[service]';
    const macHtml = mac ? ` <span class="muted small">${escapeHtml(mac)}</span>` : '';
    // Newest first, capped at the 30 most recent events per subject.
    const rows = evs.slice(-30).reverse().map((ev) => renderConnEvent(ev)).join('');

    sections.push(`
      <div class="conn-spark">
        <h4>${escapeHtml(subj)} <span class="muted small">${tag}</span>${macHtml}</h4>
        <div class="conn-summary">${summaryParts.join(' · ') || 'no events'}</div>
        ${rows}
      </div>
    `);
  }

  body.innerHTML = sections.join('');
  dialog.showModal();
}
|
||||||
|
|
||||||
|
// Build the HTML row for one connectivity event. Events without a `kind`
// are treated as probe transitions; `report` events show source, detail and
// latency instead of an up/down duration.
function renderConnEvent(e) {
  const when = escapeHtml((e.at || '').replace('T', ' ').replace('Z', ''));

  if ((e.kind || 'transition') !== 'report') {
    const wasDown = e.down_seconds != null ? `was down ${fmtDuration(e.down_seconds)}` : '';
    const wasUp = e.up_seconds != null ? `was up ${fmtDuration(e.up_seconds)}` : '';
    const label = e.transition === 'up' ? '↑ came back online' : '↓ dropped offline';
    return `
      <div class="conn-event ${e.transition}">
        <span class="when">${when}</span>
        <span class="what">${label}</span>
        <span class="dur">${wasDown}${wasUp}</span>
      </div>
    `;
  }

  const succeeded = !!e.ok;
  const reporter = escapeHtml(e.source || 'external');
  const detailText = e.detail ? ` — ${escapeHtml(e.detail)}` : '';
  const latencyText = e.latency_ms != null ? ` (${e.latency_ms} ms)` : '';
  const verdict = succeeded ? '◷ report: ok' : '◷ report: failed';
  return `
    <div class="conn-event ${succeeded ? 'up' : 'down'} report">
      <span class="when">${when}</span>
      <span class="what">${verdict} <span class="muted">from</span> ${reporter}${detailText}</span>
      <span class="dur">${latencyText}</span>
    </div>
  `;
}
|
||||||
|
|
||||||
|
// POST a Wake-on-LAN request for the named Spark and report the outcome to
// the user via alert().
async function wakeSpark(name) {
  try {
    const reply = await fetchJSON(`/api/spark/${name}/wake`, { method: 'POST' });
    const notice = `Wake-on-LAN sent to ${name} (MAC ${reply.mac}, via ${reply.delivered_via}). Give it ~30 seconds to wake; the card will go green when it comes back.`;
    alert(notice);
  } catch (err) {
    alert(`Wake failed: ${err.message}`);
  }
}
|
||||||
|
|
||||||
function renderHardware() {
|
function renderHardware() {
|
||||||
const panel = el('#hardware-panel');
|
const panel = el('#hardware-panel');
|
||||||
const grid = el('#hardware-grid');
|
const grid = el('#hardware-grid');
|
||||||
@@ -138,14 +238,23 @@ function renderHardware() {
|
|||||||
const card = document.createElement('div');
|
const card = document.createElement('div');
|
||||||
if (!s.reachable) {
|
if (!s.reachable) {
|
||||||
card.className = 'hw-card unreachable';
|
card.className = 'hw-card unreachable';
|
||||||
|
const mac = state.connectivity?.macs?.[key];
|
||||||
|
const wolRow = mac
|
||||||
|
? `<div class="wol-row">
|
||||||
|
<span class="mac-display">${escapeHtml(mac)}</span>
|
||||||
|
<span class="spacer"></span>
|
||||||
|
<button class="btn" data-wake="${escapeHtml(key)}">Wake (WoL)</button>
|
||||||
|
</div>`
|
||||||
|
: `<div class="muted small">MAC not yet known — once it's been up once with this dashboard installed, "Wake" will appear here.</div>`;
|
||||||
card.innerHTML = `
|
card.innerHTML = `
|
||||||
<div class="head">
|
<div class="head">
|
||||||
<span class="name">${escapeHtml(key)}</span>
|
<span class="name">${escapeHtml(key)}</span>
|
||||||
<span class="meta">unreachable</span>
|
<span class="meta">unreachable</span>
|
||||||
</div>
|
</div>
|
||||||
<div class="muted small">${escapeHtml(s.host || '')} — ${escapeHtml(s.error || 'no response')}</div>
|
<div class="muted small">${escapeHtml(s.host || '')} — ${escapeHtml(s.error || 'no response')}</div>
|
||||||
|
${wolRow}
|
||||||
<div class="muted small" style="line-height:1.5">
|
<div class="muted small" style="line-height:1.5">
|
||||||
Spark Control can't restart a Spark that won't answer SSH. Steps to try:
|
If Wake-on-LAN doesn't bring it back, manual steps:
|
||||||
<ol style="margin: 6px 0 0 18px; padding: 0;">
|
<ol style="margin: 6px 0 0 18px; padding: 0;">
|
||||||
<li>Verify it's powered on (check the front LED).</li>
|
<li>Verify it's powered on (check the front LED).</li>
|
||||||
<li>Ping it from another LAN device.</li>
|
<li>Ping it from another LAN device.</li>
|
||||||
@@ -1307,6 +1416,13 @@ async function init() {
|
|||||||
el('#nim-cancel').addEventListener('click', () => el('#nim-dialog').close());
|
el('#nim-cancel').addEventListener('click', () => el('#nim-dialog').close());
|
||||||
el('#nim-form').addEventListener('submit', submitNim);
|
el('#nim-form').addEventListener('submit', submitNim);
|
||||||
el('#nim-prog-close').addEventListener('click', () => el('#nim-progress-dialog').close());
|
el('#nim-prog-close').addEventListener('click', () => el('#nim-progress-dialog').close());
|
||||||
|
el('#open-connectivity').addEventListener('click', openConnectivityDialog);
|
||||||
|
el('#connectivity-close').addEventListener('click', () => el('#connectivity-dialog').close());
|
||||||
|
// Wake-on-LAN buttons live on unreachable hardware cards; delegate.
|
||||||
|
el('#hardware-grid').addEventListener('click', (e) => {
|
||||||
|
const btn = e.target.closest('[data-wake]');
|
||||||
|
if (btn) wakeSpark(btn.dataset.wake);
|
||||||
|
});
|
||||||
setupCatalogDialog();
|
setupCatalogDialog();
|
||||||
setupAdvancedDialog();
|
setupAdvancedDialog();
|
||||||
// Open WebUI link from /api/config
|
// Open WebUI link from /api/config
|
||||||
|
|||||||
@@ -26,8 +26,22 @@
|
|||||||
</section>
|
</section>
|
||||||
|
|
||||||
<section id="hardware-panel" class="hardware-panel hidden">
|
<section id="hardware-panel" class="hardware-panel hidden">
|
||||||
|
<div class="section-header">
|
||||||
<h2 class="section-title">Spark hardware</h2>
|
<h2 class="section-title">Spark hardware</h2>
|
||||||
|
<button id="open-connectivity" class="btn small-btn">Connectivity log</button>
|
||||||
|
</div>
|
||||||
<div id="hardware-grid" class="hardware-grid"></div>
|
<div id="hardware-grid" class="hardware-grid"></div>
|
||||||
|
|
||||||
|
<dialog id="connectivity-dialog" class="modal">
|
||||||
|
<form method="dialog" class="modal-form">
|
||||||
|
<h3>Spark connectivity history</h3>
|
||||||
|
<p class="muted small">Most recent up/down transitions and app-reported health events per Spark and service. Tracked since this dashboard was installed.</p>
|
||||||
|
<div id="connectivity-content" class="connectivity-content"></div>
|
||||||
|
<div class="modal-actions">
|
||||||
|
<button type="button" id="connectivity-close" class="btn">Close</button>
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
</dialog>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
<section id="endpoint-panel" class="endpoint-panel hidden">
|
<section id="endpoint-panel" class="endpoint-panel hidden">
|
||||||
|
|||||||
@@ -377,6 +377,44 @@ main {
|
|||||||
.hw-card.unreachable { border-color: rgba(239, 68, 68, 0.4); }
|
.hw-card.unreachable { border-color: rgba(239, 68, 68, 0.4); }
|
||||||
.hw-card.unreachable .name { color: var(--error); }
|
.hw-card.unreachable .name { color: var(--error); }
|
||||||
.hw-card.unreachable ol { color: var(--muted); }
|
.hw-card.unreachable ol { color: var(--muted); }
|
||||||
|
.hw-card .wol-row {
|
||||||
|
margin-top: 8px;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 8px;
|
||||||
|
font-size: 12px;
|
||||||
|
color: var(--muted);
|
||||||
|
}
|
||||||
|
.hw-card .wol-row .btn { padding: 5px 10px; font-size: 12px; }
|
||||||
|
.hw-card .mac-display { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; }
|
||||||
|
|
||||||
|
.connectivity-content {
|
||||||
|
max-height: 360px;
|
||||||
|
overflow-y: auto;
|
||||||
|
border: 1px solid var(--border);
|
||||||
|
border-radius: 6px;
|
||||||
|
padding: 10px;
|
||||||
|
background: var(--surface-2);
|
||||||
|
}
|
||||||
|
.conn-spark { margin-bottom: 16px; }
|
||||||
|
.conn-spark h4 { font-size: 13px; margin: 0 0 8px; color: var(--text); }
|
||||||
|
.conn-event {
|
||||||
|
font-size: 12px;
|
||||||
|
display: flex;
|
||||||
|
gap: 10px;
|
||||||
|
padding: 4px 0;
|
||||||
|
border-bottom: 1px solid rgba(255,255,255,0.04);
|
||||||
|
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
|
||||||
|
}
|
||||||
|
.conn-event:last-child { border-bottom: 0; }
|
||||||
|
.conn-event .when { color: var(--muted); flex-shrink: 0; }
|
||||||
|
.conn-event .what { flex: 1; }
|
||||||
|
.conn-event.up .what { color: var(--accent); }
|
||||||
|
.conn-event.down .what { color: var(--error); }
|
||||||
|
.conn-event.report .what { font-style: italic; }
|
||||||
|
.conn-event .muted { color: var(--muted); font-style: normal; }
|
||||||
|
.conn-event .dur { color: var(--muted); }
|
||||||
|
.conn-summary { color: var(--muted); font-size: 11px; padding: 4px 0 10px; }
|
||||||
.hw-metric { display: flex; align-items: center; gap: 10px; font-size: 12px; }
|
.hw-metric { display: flex; align-items: center; gap: 10px; font-size: 12px; }
|
||||||
.hw-metric .label { color: var(--muted); width: 56px; flex-shrink: 0; text-transform: uppercase; letter-spacing: 0.05em; font-size: 11px; }
|
.hw-metric .label { color: var(--muted); width: 56px; flex-shrink: 0; text-transform: uppercase; letter-spacing: 0.05em; font-size: 11px; }
|
||||||
.hw-metric .bar { flex: 1; height: 8px; background: var(--surface-2); border-radius: 4px; overflow: hidden; position: relative; }
|
.hw-metric .bar { flex: 1; height: 8px; background: var(--surface-2); border-radius: 4px; overflow: hidden; position: relative; }
|
||||||
|
|||||||
@@ -0,0 +1,69 @@
|
|||||||
|
"""Wake-on-LAN.
|
||||||
|
|
||||||
|
Two delivery paths, tried in order:
|
||||||
|
|
||||||
|
1. SSH into the other Spark and have IT broadcast — most reliable because the
|
||||||
|
packet originates from the same LAN subnet as the sleeping Spark.
|
||||||
|
2. Direct UDP broadcast from this container. May or may not work depending
|
||||||
|
on the StartOS container's network namespace.
|
||||||
|
|
||||||
|
The DGX Spark's NIC must have WoL enabled in firmware/OS for either path to
|
||||||
|
actually wake the box; this module just delivers the magic packet correctly.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
|
import re
|
||||||
|
import socket
|
||||||
|
|
||||||
|
from .config import Settings
|
||||||
|
from .ssh import ssh_run
|
||||||
|
|
||||||
|
|
||||||
|
# Six hex octets; each octet after the first may be preceded by ":" or "-",
# so bare "aabbccddeeff" is accepted as well as the delimited spellings.
_MAC_RE = re.compile(r"^[0-9a-fA-F]{2}([:-]?[0-9a-fA-F]{2}){5}$")


def normalize_mac(mac: str) -> str:
    """Return *mac* in canonical form: lowercase hex octets joined by ":".

    Surrounding whitespace is ignored. Colon-separated, dash-separated and
    undelimited spellings are all accepted and all yield the same
    ``aa:bb:cc:dd:ee:ff`` result.

    Raises:
        ValueError: if *mac* is not six hex octets.
    """
    mac = mac.strip().lower()
    if not _MAC_RE.match(mac):
        raise ValueError(f"invalid MAC address: {mac!r}")
    # Drop whatever separators were supplied and rebuild the address, so that
    # undelimited or mixed-separator input also comes back colon-separated
    # (a plain "-" -> ":" replacement would leave "aabbccddeeff" untouched).
    digits = mac.replace(":", "").replace("-", "")
    return ":".join(digits[i:i + 2] for i in range(0, len(digits), 2))
|
||||||
|
|
||||||
|
|
||||||
|
def build_magic_packet(mac: str) -> bytes:
    """Build the Wake-on-LAN magic packet payload for *mac*.

    The payload is six 0xFF bytes followed by the six-byte hardware address
    repeated sixteen times. *mac* is validated through ``normalize_mac``,
    which raises ``ValueError`` for malformed input.
    """
    hex_digits = normalize_mac(mac).replace(":", "")
    target = bytes.fromhex(hex_digits)
    sync_stream = b"\xff" * 6
    return sync_stream + target * 16
|
||||||
|
|
||||||
|
|
||||||
|
def send_local_broadcast(mac: str, broadcast: str = "255.255.255.255", port: int = 9) -> None:
    """Broadcast the magic packet for *mac* from THIS container.

    The packet goes to ``(broadcast, port)`` and then to port 7 on the same
    address (the alternate WoL convention). It may not reach the LAN in some
    network topologies.
    """
    payload = build_magic_packet(mac)
    # The context manager closes the socket even if a send raises.
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
        for dest_port in (port, 7):
            sock.sendto(payload, (broadcast, dest_port))
|
||||||
|
|
||||||
|
|
||||||
|
async def send_via_peer(host: str, user: str, mac: str, settings: Settings) -> tuple[bool, str]:
    """Ask a reachable Spark to broadcast the WoL packet for its sleeping peer.

    Runs a ``python3 -c`` one-liner on *host* over SSH, so the peer does not
    need wakeonlan / etherwake installed. The one-liner broadcasts the magic
    packet to ports 9 and 7 and prints ``sent``.

    Returns:
        ``(ok, detail)``. ``ok`` is true when the remote command exited 0 and
        printed ``sent``. ``detail`` is the stripped stderr, else the stripped
        stdout, else ``rc=<code>``.

    Raises:
        ValueError: if *mac* is rejected by ``normalize_mac``.
    """
    mac_hex = normalize_mac(mac).replace(":", "")
    # mac_hex has passed MAC validation, so interpolating it into the remote
    # command cannot introduce quotes or other shell metacharacters.
    statements = [
        "import socket",
        f"m=bytes.fromhex('{mac_hex}')",
        "s=socket.socket(socket.AF_INET, socket.SOCK_DGRAM)",
        "s.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)",
        "s.sendto(b'\\xff'*6 + m*16, ('255.255.255.255', 9))",
        "s.sendto(b'\\xff'*6 + m*16, ('255.255.255.255', 7))",
        "print('sent')",
    ]
    command = 'python3 -c "' + "; ".join(statements) + '"'
    rc, out, err = await ssh_run(host, user, command, settings, timeout=8)
    succeeded = rc == 0 and "sent" in out
    detail = err.strip() or out.strip() or f"rc={rc}"
    return succeeded, detail
|
||||||
@@ -50,6 +50,7 @@ export const main = sdk.setupMain(async ({ effects }) => {
|
|||||||
MAGPIE_CONTAINER: cfg.magpie_container,
|
MAGPIE_CONTAINER: cfg.magpie_container,
|
||||||
MODELS_OVERRIDES: '/data/models-overrides.yaml',
|
MODELS_OVERRIDES: '/data/models-overrides.yaml',
|
||||||
SERVICES_OVERRIDES: '/data/services-overrides.yaml',
|
SERVICES_OVERRIDES: '/data/services-overrides.yaml',
|
||||||
|
CONNECTIVITY_LOG: '/data/connectivity.json',
|
||||||
OPEN_WEBUI_URL: cfg.open_webui_url,
|
OPEN_WEBUI_URL: cfg.open_webui_url,
|
||||||
NGC_API_KEY: cfg.ngc_api_key,
|
NGC_API_KEY: cfg.ngc_api_key,
|
||||||
BIND_PORT: String(uiPort),
|
BIND_PORT: String(uiPort),
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
export const v0_1_0 = VersionInfo.of({
|
export const v0_1_0 = VersionInfo.of({
|
||||||
version: '0.4.0:0',
|
version: '0.6.0:0',
|
||||||
releaseNotes: {
|
releaseNotes: {
|
||||||
en_US:
|
en_US:
|
||||||
'v0.4: install NIM containers from the dashboard. New "+ Install NIM" button next to the services panel shows a curated catalog (Parakeet, Magpie, Riva...) plus a free-form image field. Streams docker pull + docker run output with phase + elapsed timer; persists installed services to /data/services-overrides.yaml so they show up in the services panel after install. Configure Sparks now has an NGC API key field (masked) needed for nvcr.io. v0.3.1 hotfix bundled in: hardware/services SSH timeouts shortened (6 s) and failures cached for 25 s so an unreachable Spark doesn\'t hang the whole dashboard. Hardware card for an unreachable Spark now includes troubleshooting steps.',
|
'v0.6: Service-level connectivity tracking and a passive failure-report endpoint. The connectivity log now records up/down transitions for Parakeet, Magpie, and vLLM in addition to the Spark hosts (driven by the existing /api/status and /api/services polling). A new POST /api/health-event endpoint lets external apps (e.g. Open WebUI) record failures they observed even when the failure was brief enough to slip between polls. The Connectivity log dialog shows hosts and services with separate badges, and reports appear inline with their source app + error detail.',
|
||||||
},
|
},
|
||||||
migrations: {
|
migrations: {
|
||||||
up: async ({ effects }) => {},
|
up: async ({ effects }) => {},
|
||||||
|
|||||||
Reference in New Issue
Block a user