diff --git a/image/app/connectivity.py b/image/app/connectivity.py
new file mode 100644
index 0000000..89e9be0
--- /dev/null
+++ b/image/app/connectivity.py
@@ -0,0 +1,126 @@
+"""Track Spark up/down transitions and cache discovered MAC addresses.
+
+Persisted to /data/connectivity.json so history survives package restarts:
+
+ {
+ "macs": { "spark1": "aa:bb:..", "spark2": "11:22:.." },
+ "current": { "spark1": "up", "spark2": "down" },
+ "last_change": { "spark1": "2026-05-12T15:00:00Z", ... },
+ "events": [
+ { "spark": "spark2", "at": "2026-05-12T17:30:00Z", "transition": "down" },
+ { "spark": "spark2", "at": "2026-05-12T18:45:00Z", "transition": "up", "down_seconds": 4500 },
+ ...
+ ]
+ }
+"""
+from __future__ import annotations
+import json
+import os
+import threading
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+
+# Upper bound on the persisted "events" list; record_state() drops the oldest
+# entries once the list grows past this many.
+MAX_EVENTS = 200  # rolling window — plenty for showing recent history
+
+
+def _path() -> str:
+ return os.environ.get("CONNECTIVITY_LOG", "/data/connectivity.json")
+
+
+# Serializes the read-modify-write cycles in load(), record_mac() and
+# record_state() across threads of this process. It is a threading.Lock, so it
+# does not coordinate with other processes that touch the same file.
+_lock = threading.Lock()
+
+
+def _read() -> dict:
+ try:
+ with open(_path()) as f:
+ return json.load(f) or {}
+ except (FileNotFoundError, json.JSONDecodeError):
+ return {}
+
+
+def _write(data: dict) -> None:
+ p = _path()
+ Path(p).parent.mkdir(parents=True, exist_ok=True)
+ tmp = p + ".tmp"
+ with open(tmp, "w") as f:
+ json.dump(data, f, indent=2, sort_keys=False)
+ os.replace(tmp, p)
+
+
+def load() -> dict:
+ with _lock:
+ d = _read()
+ d.setdefault("macs", {})
+ d.setdefault("current", {})
+ d.setdefault("last_change", {})
+ d.setdefault("events", [])
+ return d
+
+
+def record_mac(spark: str, mac: Optional[str]) -> None:
+ if not mac:
+ return
+ with _lock:
+ d = _read()
+ d.setdefault("macs", {})
+ if d["macs"].get(spark) != mac:
+ d["macs"][spark] = mac
+ _write(d)
+
+
+def record_state(spark: str, reachable: bool) -> Optional[dict]:
+ """Update current state. If it differs from the last seen state, append an event.
+
+ Returns the event dict if a transition was recorded, else None.
+ """
+ new_state = "up" if reachable else "down"
+ now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
+ with _lock:
+ d = _read()
+ d.setdefault("macs", {})
+ d.setdefault("current", {})
+ d.setdefault("last_change", {})
+ d.setdefault("events", [])
+ prev = d["current"].get(spark)
+ if prev == new_state:
+ return None
+ event: dict = {"spark": spark, "at": now, "transition": new_state}
+ # When we have a previous state and timestamp, compute duration
+ last_change = d["last_change"].get(spark)
+ if prev and last_change:
+ try:
+ prev_dt = datetime.fromisoformat(last_change.replace("Z", "+00:00"))
+ duration = (datetime.now(timezone.utc) - prev_dt).total_seconds()
+ if prev == "down" and new_state == "up":
+ event["down_seconds"] = round(duration)
+ if prev == "up" and new_state == "down":
+ event["up_seconds"] = round(duration)
+ except ValueError:
+ pass
+ d["current"][spark] = new_state
+ d["last_change"][spark] = now
+ d["events"].append(event)
+ # Keep rolling window
+ if len(d["events"]) > MAX_EVENTS:
+ d["events"] = d["events"][-MAX_EVENTS:]
+ _write(d)
+ return event
+
+
+def get_mac(spark: str) -> Optional[str]:
+ d = load()
+ return d.get("macs", {}).get(spark)
+
+
+def summary() -> dict:
+ """Compact summary for the UI: known MACs, current state, recent events."""
+ d = load()
+ events = d.get("events", [])
+ return {
+ "macs": d.get("macs", {}),
+ "current": d.get("current", {}),
+ "last_change": d.get("last_change", {}),
+ "events": events[-50:],
+ }
diff --git a/image/app/hardware.py b/image/app/hardware.py
index 4527026..5561c38 100644
--- a/image/app/hardware.py
+++ b/image/app/hardware.py
@@ -10,6 +10,7 @@ import time
from typing import Any
from .config import Settings
+from .connectivity import record_mac, record_state
from .ssh import ssh_run
@@ -23,6 +24,8 @@ echo MEMORY=$(free -b 2>/dev/null | awk '/^Mem:/ {print $2, $3}')
echo DISK=$(df -B1 / 2>/dev/null | awk 'NR==2 {print $2, $3}')
echo GPU=$(nvidia-smi --query-gpu=name,utilization.gpu,temperature.gpu,power.draw,memory.total --format=csv,noheader,nounits 2>/dev/null | head -1)
echo GPU_MEM_USED_MIB=$(nvidia-smi --query-compute-apps=used_gpu_memory --format=csv,noheader,nounits 2>/dev/null | awk '{s+=$1} END {print s+0}')
+DEFIF=$(ip route show default 2>/dev/null | awk '{print $5; exit}')
+echo MAC=$(cat /sys/class/net/$DEFIF/address 2>/dev/null)
""".strip()
@@ -78,6 +81,9 @@ def _parse(out: str) -> dict:
# Sum per-process compute memory (works even on unified-memory systems)
if info.get("gpu_mem_used_mib"):
parsed["gpu_mem_used_mib"] = _parse_int(info["gpu_mem_used_mib"])
+ # MAC address on the default-route interface (for Wake-on-LAN)
+ if info.get("mac"):
+ parsed["mac"] = info["mac"].lower()
return parsed
@@ -118,12 +124,14 @@ class HardwareProbe:
# marked this host unreachable, return the cached failure immediately.
rc, out, err = await ssh_run(host, user, _PROBE, self.settings, timeout=6)
if rc != 0:
- # Cache failures for a slightly longer TTL so the dashboard isn't
- # blocked behind 6 s of SSH timeout on every poll.
result = {"reachable": False, "configured": True, "host": host, "error": err.strip() or out.strip() or f"rc={rc}"}
self._cache[key] = (now, result)
- # Override the TTL effectively by inserting a sentinel into the cache age
+ record_state(key, False)
return result
- result = {"reachable": True, "configured": True, "host": host, **_parse(out)}
+ parsed = _parse(out)
+ result = {"reachable": True, "configured": True, "host": host, **parsed}
self._cache[key] = (now, result)
+ record_state(key, True)
+ if parsed.get("mac"):
+ record_mac(key, parsed["mac"])
return result
diff --git a/image/app/server.py b/image/app/server.py
index 3417260..55cc15e 100644
--- a/image/app/server.py
+++ b/image/app/server.py
@@ -10,6 +10,7 @@ from pydantic import BaseModel
from typing import Literal
from .config import Settings
+from .connectivity import get_mac, summary as connectivity_summary
from .custom_services import add_custom_service, delete_custom_service
from .download import DownloadManager
from .hardware import HardwareProbe
@@ -21,6 +22,7 @@ from .services import docker_state, run_action, services_from_settings
from .ssh import ssh_run
from .swap import SwapManager
from .updates import UpdateManager, get_update_status
+from .wol import send_local_broadcast, send_via_peer
settings = Settings.from_env()
@@ -128,6 +130,50 @@ async def get_hardware() -> dict:
return await hardware_probe.fetch()
+@app.get("/api/connectivity")
+async def get_connectivity() -> dict:
+ """Up/down transition log per Spark + cached MACs."""
+ return connectivity_summary()
+
+
+@app.post("/api/spark/{name}/wake")
+async def wake_spark(name: str) -> dict:
+ """Send a Wake-on-LAN magic packet for the named Spark.
+
+ Tries the OTHER Spark (if reachable) first because the packet has to
+ originate on the target's LAN segment to be reliable. Falls back to a
+ direct UDP broadcast from this container.
+ """
+ if name not in ("spark1", "spark2"):
+ raise HTTPException(404, f"unknown spark: {name}")
+ mac = get_mac(name)
+ if not mac:
+ raise HTTPException(400, f"MAC for {name} not yet known; bring it up once so we can probe it, then this will work next time it sleeps")
+
+ # Find the peer's connectivity to decide the path.
+ other = "spark2" if name == "spark1" else "spark1"
+ other_host = settings.spark1_host if other == "spark1" else settings.spark2_host
+ other_user = settings.spark1_user if other == "spark1" else settings.spark2_user
+
+ delivered_via = None
+ via_peer_ok = False
+ via_peer_err = ""
+ if other_host and other_user:
+ via_peer_ok, via_peer_err = await send_via_peer(other_host, other_user, mac, settings)
+ if via_peer_ok:
+ delivered_via = other
+
+ if not via_peer_ok:
+ # Fall back to direct from this container
+ try:
+ send_local_broadcast(mac)
+ delivered_via = "container"
+ except Exception as e:
+ raise HTTPException(500, f"WoL failed: peer={via_peer_err!r} container={e!r}")
+
+ return {"ok": True, "spark": name, "mac": mac, "delivered_via": delivered_via}
+
+
@app.get("/api/services")
async def get_services() -> dict:
"""Lifecycle state of always-on support services (Parakeet, Magpie, …).
diff --git a/image/app/static/app.js b/image/app/static/app.js
index 54fb1df..7f05150 100644
--- a/image/app/static/app.js
+++ b/image/app/static/app.js
@@ -121,10 +121,69 @@ function bar(usedPct, warn) {
async function pollHardware() {
try {
state.hardware = await fetchJSON('/api/hardware');
+ // Connectivity history is optional: a failed fetch is swallowed so the
+ // hardware panel still renders with whatever state.connectivity held before.
+ try { state.connectivity = await fetchJSON('/api/connectivity'); } catch {}
renderHardware();
} catch (e) { console.warn('hardware poll failed', e); }
}
+function fmtDuration(sec) {
+ if (sec == null) return '';
+ if (sec < 60) return `${Math.round(sec)}s`;
+ if (sec < 3600) return `${Math.round(sec / 60)}m`;
+ if (sec < 86400) {
+ const h = Math.floor(sec / 3600);
+ const m = Math.round((sec % 3600) / 60);
+ return m ? `${h}h ${m}m` : `${h}h`;
+ }
+ const d = Math.floor(sec / 86400);
+ const h = Math.round((sec % 86400) / 3600);
+ return h ? `${d}d ${h}h` : `${d}d`;
+}
+
+function openConnectivityDialog() {
+ const dlg = el('#connectivity-dialog');
+ const content = el('#connectivity-content');
+ const c = state.connectivity || {};
+ const events = c.events || [];
+ if (events.length === 0) {
+ content.innerHTML = '
No transitions recorded yet. Once a Spark goes down and comes back, you\'ll see entries here.
';
+ dlg.showModal();
+ return;
+ }
+ const bySpark = {};
+ for (const e of events) {
+ (bySpark[e.spark] = bySpark[e.spark] || []).push(e);
+ }
+ const html = Object.entries(bySpark).map(([spark, evs]) => {
+ const downs = evs.filter(e => e.transition === 'down').length;
+ const mac = c.macs?.[spark];
+ return `
+
+
${escapeHtml(spark)}${mac ? ` ${escapeHtml(mac)}` : ''}
+
${evs.length} transition${evs.length===1?'':'s'} · ${downs} down event${downs===1?'':'s'} in window
+ ${evs.slice(-25).reverse().map(e => `
+
+ ${escapeHtml(e.at.replace('T', ' ').replace('Z', ''))}
+ ${e.transition === 'up' ? '↑ came back online' : '↓ dropped offline'}
+ ${e.down_seconds != null ? `was down ${fmtDuration(e.down_seconds)}` : ''}${e.up_seconds != null ? `was up ${fmtDuration(e.up_seconds)}` : ''}
+
+ `).join('')}
+
+ `;
+ }).join('');
+ content.innerHTML = html;
+ dlg.showModal();
+}
+
+async function wakeSpark(name) {
+ try {
+ const r = await fetchJSON(`/api/spark/${name}/wake`, { method: 'POST' });
+ alert(`Wake-on-LAN sent to ${name} (MAC ${r.mac}, via ${r.delivered_via}). Give it ~30 seconds to wake; the card will go green when it comes back.`);
+ } catch (e) {
+ alert(`Wake failed: ${e.message}`);
+ }
+}
+
function renderHardware() {
const panel = el('#hardware-panel');
const grid = el('#hardware-grid');
@@ -138,14 +197,23 @@ function renderHardware() {
const card = document.createElement('div');
if (!s.reachable) {
card.className = 'hw-card unreachable';
+ const mac = state.connectivity?.macs?.[key];
+ const wolRow = mac
+ ? `
+ ${escapeHtml(mac)}
+
+
+
`
+ : `MAC not yet known — once it's been up once with this dashboard installed, "Wake" will appear here.
`;
card.innerHTML = `
${escapeHtml(key)}
unreachable
${escapeHtml(s.host || '')} — ${escapeHtml(s.error || 'no response')}
+ ${wolRow}
- Spark Control can't restart a Spark that won't answer SSH. Steps to try:
+ If Wake-on-LAN doesn't bring it back, manual steps:
- Verify it's powered on (check the front LED).
- Ping it from another LAN device.
@@ -1307,6 +1375,13 @@ async function init() {
el('#nim-cancel').addEventListener('click', () => el('#nim-dialog').close());
el('#nim-form').addEventListener('submit', submitNim);
el('#nim-prog-close').addEventListener('click', () => el('#nim-progress-dialog').close());
+ el('#open-connectivity').addEventListener('click', openConnectivityDialog);
+ el('#connectivity-close').addEventListener('click', () => el('#connectivity-dialog').close());
+ // Wake-on-LAN buttons live on unreachable hardware cards; delegate.
+ el('#hardware-grid').addEventListener('click', (e) => {
+ const btn = e.target.closest('[data-wake]');
+ if (btn) wakeSpark(btn.dataset.wake);
+ });
setupCatalogDialog();
setupAdvancedDialog();
// Open WebUI link from /api/config
diff --git a/image/app/static/index.html b/image/app/static/index.html
index 05a67c2..f9cf853 100644
--- a/image/app/static/index.html
+++ b/image/app/static/index.html
@@ -26,8 +26,22 @@
- Spark hardware
+
+
+
diff --git a/image/app/static/style.css b/image/app/static/style.css
index a83919e..bbdaadd 100644
--- a/image/app/static/style.css
+++ b/image/app/static/style.css
@@ -377,6 +377,42 @@ main {
.hw-card.unreachable { border-color: rgba(239, 68, 68, 0.4); }
.hw-card.unreachable .name { color: var(--error); }
.hw-card.unreachable ol { color: var(--muted); }
+.hw-card .wol-row {
+ margin-top: 8px;
+ display: flex;
+ align-items: center;
+ gap: 8px;
+ font-size: 12px;
+ color: var(--muted);
+}
+.hw-card .wol-row .btn { padding: 5px 10px; font-size: 12px; }
+.hw-card .mac-display { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; }
+
+.connectivity-content {
+ max-height: 360px;
+ overflow-y: auto;
+ border: 1px solid var(--border);
+ border-radius: 6px;
+ padding: 10px;
+ background: var(--surface-2);
+}
+.conn-spark { margin-bottom: 16px; }
+.conn-spark h4 { font-size: 13px; margin: 0 0 8px; color: var(--text); }
+.conn-event {
+ font-size: 12px;
+ display: flex;
+ gap: 10px;
+ padding: 4px 0;
+ border-bottom: 1px solid rgba(255,255,255,0.04);
+ font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
+}
+.conn-event:last-child { border-bottom: 0; }
+.conn-event .when { color: var(--muted); flex-shrink: 0; }
+.conn-event .what { flex: 1; }
+.conn-event.up .what { color: var(--accent); }
+.conn-event.down .what { color: var(--error); }
+.conn-event .dur { color: var(--muted); }
+.conn-summary { color: var(--muted); font-size: 11px; padding: 4px 0 10px; }
.hw-metric { display: flex; align-items: center; gap: 10px; font-size: 12px; }
.hw-metric .label { color: var(--muted); width: 56px; flex-shrink: 0; text-transform: uppercase; letter-spacing: 0.05em; font-size: 11px; }
.hw-metric .bar { flex: 1; height: 8px; background: var(--surface-2); border-radius: 4px; overflow: hidden; position: relative; }
diff --git a/image/app/wol.py b/image/app/wol.py
new file mode 100644
index 0000000..0f08986
--- /dev/null
+++ b/image/app/wol.py
@@ -0,0 +1,69 @@
+"""Wake-on-LAN.
+
+Two delivery paths, tried in order:
+
+ 1. SSH into the other Spark and have IT broadcast — most reliable because the
+ packet originates from the same LAN subnet as the sleeping Spark.
+ 2. Direct UDP broadcast from this container. May or may not work depending
+ on the StartOS container's network namespace.
+
+The DGX Spark's NIC must have WoL enabled in firmware/OS for either path to
+actually wake the box; this module just delivers the magic packet correctly.
+"""
+from __future__ import annotations
+import asyncio
+import re
+import socket
+
+from .config import Settings
+from .ssh import ssh_run
+
+
+# Six hex octets. The ":" / "-" separator between octets is optional and is
+# matched independently at each position, so "aabbccddeeff" and mixed forms
+# such as "aa:bb-ccddee:ff" are accepted too.
+_MAC_RE = re.compile(r"^[0-9a-fA-F]{2}([:-]?[0-9a-fA-F]{2}){5}$")
+
+
+def normalize_mac(mac: str) -> str:
+ mac = mac.strip().lower()
+ if not _MAC_RE.match(mac):
+ raise ValueError(f"invalid MAC address: {mac!r}")
+ return mac.replace("-", ":")
+
+
+def build_magic_packet(mac: str) -> bytes:
+ mac_bytes = bytes.fromhex(normalize_mac(mac).replace(":", ""))
+ return b"\xff" * 6 + mac_bytes * 16
+
+
+def send_local_broadcast(mac: str, broadcast: str = "255.255.255.255", port: int = 9) -> None:
+ """Send from THIS container. May not reach the LAN in some topologies."""
+ pkt = build_magic_packet(mac)
+ s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+ try:
+ s.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
+ s.sendto(pkt, (broadcast, port))
+ # Also send to port 7 (alternate WoL convention) for safety
+ s.sendto(pkt, (broadcast, 7))
+ finally:
+ s.close()
+
+
+async def send_via_peer(host: str, user: str, mac: str, settings: Settings) -> tuple[bool, str]:
+    """Use a different (reachable) Spark to send the WoL packet to its peer.
+
+    Uses Python 3 (always present on the Sparks for vLLM) to avoid depending on
+    wakeonlan / etherwake being installed.
+
+    Args:
+        host: Host of the Spark that should send the packet.
+        user: SSH user on that host.
+        mac: MAC address of the Spark to wake.
+        settings: Passed through to ``ssh_run``.
+
+    Returns:
+        ``(ok, detail)``. ``ok`` is True only when the remote command exits 0
+        and its output contains "sent". ``detail`` is stderr if non-empty,
+        else stdout, else ``rc=<code>`` — so on success it is not necessarily
+        empty.
+
+    Raises:
+        ValueError: If ``normalize_mac`` rejects *mac* (raised before any SSH
+            call is made).
+    """
+    normalized = normalize_mac(mac)
+    mac_hex = normalized.replace(":", "")
+    # Only validated hex digits are interpolated into the remote command.
+    # The one-liner mirrors send_local_broadcast: broadcast to UDP 9, then 7.
+    py = (
+        "python3 -c \""
+        "import socket; "
+        f"m=bytes.fromhex('{mac_hex}'); "
+        "s=socket.socket(socket.AF_INET, socket.SOCK_DGRAM); "
+        "s.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1); "
+        "s.sendto(b'\\xff'*6 + m*16, ('255.255.255.255', 9)); "
+        "s.sendto(b'\\xff'*6 + m*16, ('255.255.255.255', 7)); "
+        "print('sent')\""
+    )
+    rc, out, err = await ssh_run(host, user, py, settings, timeout=8)
+    return rc == 0 and "sent" in out, (err.strip() or out.strip() or f"rc={rc}")
diff --git a/package/startos/main.ts b/package/startos/main.ts
index 3c1e914..979470e 100644
--- a/package/startos/main.ts
+++ b/package/startos/main.ts
@@ -50,6 +50,7 @@ export const main = sdk.setupMain(async ({ effects }) => {
MAGPIE_CONTAINER: cfg.magpie_container,
MODELS_OVERRIDES: '/data/models-overrides.yaml',
SERVICES_OVERRIDES: '/data/services-overrides.yaml',
+ CONNECTIVITY_LOG: '/data/connectivity.json',
OPEN_WEBUI_URL: cfg.open_webui_url,
NGC_API_KEY: cfg.ngc_api_key,
BIND_PORT: String(uiPort),
diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts
index 0ddbd98..f91643d 100644
--- a/package/startos/versions/v0_1_0.ts
+++ b/package/startos/versions/v0_1_0.ts
@@ -1,10 +1,10 @@
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
export const v0_1_0 = VersionInfo.of({
- version: '0.4.0:0',
+ version: '0.5.0:0',
releaseNotes: {
en_US:
- 'v0.4: install NIM containers from the dashboard. New "+ Install NIM" button next to the services panel shows a curated catalog (Parakeet, Magpie, Riva...) plus a free-form image field. Streams docker pull + docker run output with phase + elapsed timer; persists installed services to /data/services-overrides.yaml so they show up in the services panel after install. Configure Sparks now has an NGC API key field (masked) needed for nvcr.io. v0.3.1 hotfix bundled in: hardware/services SSH timeouts shortened (6 s) and failures cached for 25 s so an unreachable Spark doesn\'t hang the whole dashboard. Hardware card for an unreachable Spark now includes troubleshooting steps.',
+ 'v0.5: Wake-on-LAN + connectivity history. Each Spark\'s MAC is now auto-discovered during the normal hardware sweep and cached in /data/connectivity.json. Up/down transitions are logged with duration. Unreachable hardware cards get a "Wake (WoL)" button that sends a magic packet (preferring the other Spark as the sender so it originates on the right LAN segment). New "Connectivity log" button in the hardware section shows the recent transitions for each Spark — useful for spotting patterns (e.g. always-at-noon dropouts).',
},
migrations: {
up: async ({ effects }) => {},