From a02f4db850cf66254c6321fa164e4205e29ade40 Mon Sep 17 00:00:00 2001 From: Grant Date: Tue, 12 May 2026 12:51:49 -0500 Subject: [PATCH] v0.5.0 - Wake-on-LAN + connectivity history wol.py: - build_magic_packet(): standard 6x0xFF + 16x MAC layout - send_local_broadcast(): direct from container (ports 9 + 7 for safety) - send_via_peer(): preferred path; SSHes to the OTHER Spark and runs a Python one-liner there so the packet originates on the target's LAN segment (most reliable) - MAC validation + normalization connectivity.py: - /data/connectivity.json persistence (thread-safe, atomic rename) - Stores per-Spark current state + last_change timestamp + rolling 200-event log - Records up/down transitions; computes down_seconds / up_seconds durations - MAC cache populated lazily during hardware probes hardware.py: - Probe now reads MAC via /sys/class/net/$DEFIF/address (DEFIF = default-route interface) - After each probe, record_state() emits a transition event if state changed - record_mac() caches the address so WoL works when the Spark next goes down Endpoints: - GET /api/connectivity: macs, current state, last_change, events[] - POST /api/spark/{name}/wake: tries via-peer first, falls back to direct broadcast UI: - Unreachable hardware card shows the cached MAC + 'Wake (WoL)' button (only if MAC known) - New 'Connectivity log' button opens a modal with per-Spark transition history (last 25 each), including duration of each prior up/down period - pollHardware also pulls /api/connectivity so WoL buttons appear without an extra fetch Package: bump 0.5.0:0; main.ts sets CONNECTIVITY_LOG=/data/connectivity.json --- image/app/connectivity.py | 126 +++++++++++++++++++++++++++++ image/app/hardware.py | 16 +++- image/app/server.py | 46 +++++++++++ image/app/static/app.js | 77 +++++++++++++++++- image/app/static/index.html | 16 +++- image/app/static/style.css | 36 +++++++++ image/app/wol.py | 69 ++++++++++++++++ package/startos/main.ts | 1 + package/startos/versions/v0_1_0.ts | 4 +- 9 files changed, 383 
insertions(+), 8 deletions(-) create mode 100644 image/app/connectivity.py create mode 100644 image/app/wol.py diff --git a/image/app/connectivity.py b/image/app/connectivity.py new file mode 100644 index 0000000..89e9be0 --- /dev/null +++ b/image/app/connectivity.py @@ -0,0 +1,126 @@ +"""Track Spark up/down transitions and cache discovered MAC addresses. + +Persisted to /data/connectivity.json so history survives package restarts: + + { + "macs": { "spark1": "aa:bb:..", "spark2": "11:22:.." }, + "current": { "spark1": "up", "spark2": "down" }, + "last_change": { "spark1": "2026-05-12T15:00:00Z", ... }, + "events": [ + { "spark": "spark2", "at": "2026-05-12T17:30:00Z", "transition": "down" }, + { "spark": "spark2", "at": "2026-05-12T18:45:00Z", "transition": "up", "down_seconds": 4500 }, + ... + ] + } +""" +from __future__ import annotations +import json +import os +import threading +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional + + +MAX_EVENTS = 200 # rolling window — plenty for showing recent history + + +def _path() -> str: + return os.environ.get("CONNECTIVITY_LOG", "/data/connectivity.json") + + +_lock = threading.Lock() + + +def _read() -> dict: + try: + with open(_path()) as f: + return json.load(f) or {} + except (FileNotFoundError, json.JSONDecodeError): + return {} + + +def _write(data: dict) -> None: + p = _path() + Path(p).parent.mkdir(parents=True, exist_ok=True) + tmp = p + ".tmp" + with open(tmp, "w") as f: + json.dump(data, f, indent=2, sort_keys=False) + os.replace(tmp, p) + + +def load() -> dict: + with _lock: + d = _read() + d.setdefault("macs", {}) + d.setdefault("current", {}) + d.setdefault("last_change", {}) + d.setdefault("events", []) + return d + + +def record_mac(spark: str, mac: Optional[str]) -> None: + if not mac: + return + with _lock: + d = _read() + d.setdefault("macs", {}) + if d["macs"].get(spark) != mac: + d["macs"][spark] = mac + _write(d) + + +def record_state(spark: str, 
reachable: bool) -> Optional[dict]: + """Update current state. If it differs from the last seen state, append an event. + + Returns the event dict if a transition was recorded, else None. + """ + new_state = "up" if reachable else "down" + now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") + with _lock: + d = _read() + d.setdefault("macs", {}) + d.setdefault("current", {}) + d.setdefault("last_change", {}) + d.setdefault("events", []) + prev = d["current"].get(spark) + if prev == new_state: + return None + event: dict = {"spark": spark, "at": now, "transition": new_state} + # When we have a previous state and timestamp, compute duration + last_change = d["last_change"].get(spark) + if prev and last_change: + try: + prev_dt = datetime.fromisoformat(last_change.replace("Z", "+00:00")) + duration = (datetime.now(timezone.utc) - prev_dt).total_seconds() + if prev == "down" and new_state == "up": + event["down_seconds"] = round(duration) + if prev == "up" and new_state == "down": + event["up_seconds"] = round(duration) + except ValueError: + pass + d["current"][spark] = new_state + d["last_change"][spark] = now + d["events"].append(event) + # Keep rolling window + if len(d["events"]) > MAX_EVENTS: + d["events"] = d["events"][-MAX_EVENTS:] + _write(d) + return event + + +def get_mac(spark: str) -> Optional[str]: + d = load() + return d.get("macs", {}).get(spark) + + +def summary() -> dict: + """Compact summary for the UI: known MACs, current state, recent events.""" + d = load() + events = d.get("events", []) + return { + "macs": d.get("macs", {}), + "current": d.get("current", {}), + "last_change": d.get("last_change", {}), + "events": events[-50:], + } diff --git a/image/app/hardware.py b/image/app/hardware.py index 4527026..5561c38 100644 --- a/image/app/hardware.py +++ b/image/app/hardware.py @@ -10,6 +10,7 @@ import time from typing import Any from .config import Settings +from .connectivity import record_mac, record_state from .ssh import ssh_run 
@@ -23,6 +24,8 @@ echo MEMORY=$(free -b 2>/dev/null | awk '/^Mem:/ {print $2, $3}') echo DISK=$(df -B1 / 2>/dev/null | awk 'NR==2 {print $2, $3}') echo GPU=$(nvidia-smi --query-gpu=name,utilization.gpu,temperature.gpu,power.draw,memory.total --format=csv,noheader,nounits 2>/dev/null | head -1) echo GPU_MEM_USED_MIB=$(nvidia-smi --query-compute-apps=used_gpu_memory --format=csv,noheader,nounits 2>/dev/null | awk '{s+=$1} END {print s+0}') +DEFIF=$(ip route show default 2>/dev/null | awk '{print $5; exit}') +echo MAC=$(cat /sys/class/net/$DEFIF/address 2>/dev/null) """.strip() @@ -78,6 +81,9 @@ def _parse(out: str) -> dict: # Sum per-process compute memory (works even on unified-memory systems) if info.get("gpu_mem_used_mib"): parsed["gpu_mem_used_mib"] = _parse_int(info["gpu_mem_used_mib"]) + # MAC address on the default-route interface (for Wake-on-LAN) + if info.get("mac"): + parsed["mac"] = info["mac"].lower() return parsed @@ -118,12 +124,14 @@ class HardwareProbe: # marked this host unreachable, return the cached failure immediately. rc, out, err = await ssh_run(host, user, _PROBE, self.settings, timeout=6) if rc != 0: - # Cache failures for a slightly longer TTL so the dashboard isn't - # blocked behind 6 s of SSH timeout on every poll. 
result = {"reachable": False, "configured": True, "host": host, "error": err.strip() or out.strip() or f"rc={rc}"} self._cache[key] = (now, result) - # Override the TTL effectively by inserting a sentinel into the cache age + record_state(key, False) return result - result = {"reachable": True, "configured": True, "host": host, **_parse(out)} + parsed = _parse(out) + result = {"reachable": True, "configured": True, "host": host, **parsed} self._cache[key] = (now, result) + record_state(key, True) + if parsed.get("mac"): + record_mac(key, parsed["mac"]) return result diff --git a/image/app/server.py b/image/app/server.py index 3417260..55cc15e 100644 --- a/image/app/server.py +++ b/image/app/server.py @@ -10,6 +10,7 @@ from pydantic import BaseModel from typing import Literal from .config import Settings +from .connectivity import get_mac, summary as connectivity_summary from .custom_services import add_custom_service, delete_custom_service from .download import DownloadManager from .hardware import HardwareProbe @@ -21,6 +22,7 @@ from .services import docker_state, run_action, services_from_settings from .ssh import ssh_run from .swap import SwapManager from .updates import UpdateManager, get_update_status +from .wol import send_local_broadcast, send_via_peer settings = Settings.from_env() @@ -128,6 +130,50 @@ async def get_hardware() -> dict: return await hardware_probe.fetch() +@app.get("/api/connectivity") +async def get_connectivity() -> dict: + """Up/down transition log per Spark + cached MACs.""" + return connectivity_summary() + + +@app.post("/api/spark/{name}/wake") +async def wake_spark(name: str) -> dict: + """Send a Wake-on-LAN magic packet for the named Spark. + + Tries the OTHER Spark (if reachable) first because the packet has to + originate on the target's LAN segment to be reliable. Falls back to a + direct UDP broadcast from this container. 
+ """ + if name not in ("spark1", "spark2"): + raise HTTPException(404, f"unknown spark: {name}") + mac = get_mac(name) + if not mac: + raise HTTPException(400, f"MAC for {name} not yet known; bring it up once so we can probe it, then this will work next time it sleeps") + + # Find the peer's connectivity to decide the path. + other = "spark2" if name == "spark1" else "spark1" + other_host = settings.spark1_host if other == "spark1" else settings.spark2_host + other_user = settings.spark1_user if other == "spark1" else settings.spark2_user + + delivered_via = None + via_peer_ok = False + via_peer_err = "" + if other_host and other_user: + via_peer_ok, via_peer_err = await send_via_peer(other_host, other_user, mac, settings) + if via_peer_ok: + delivered_via = other + + if not via_peer_ok: + # Fall back to direct from this container + try: + send_local_broadcast(mac) + delivered_via = "container" + except Exception as e: + raise HTTPException(500, f"WoL failed: peer={via_peer_err!r} container={e!r}") + + return {"ok": True, "spark": name, "mac": mac, "delivered_via": delivered_via} + + @app.get("/api/services") async def get_services() -> dict: """Lifecycle state of always-on support services (Parakeet, Magpie, …). diff --git a/image/app/static/app.js b/image/app/static/app.js index 54fb1df..7f05150 100644 --- a/image/app/static/app.js +++ b/image/app/static/app.js @@ -121,10 +121,69 @@ function bar(usedPct, warn) { async function pollHardware() { try { state.hardware = await fetchJSON('/api/hardware'); + try { state.connectivity = await fetchJSON('/api/connectivity'); } catch {} renderHardware(); } catch (e) { console.warn('hardware poll failed', e); } } +function fmtDuration(sec) { + if (sec == null) return ''; + if (sec < 60) return `${Math.round(sec)}s`; + if (sec < 3600) return `${Math.round(sec / 60)}m`; + if (sec < 86400) { + const h = Math.floor(sec / 3600); + const m = Math.round((sec % 3600) / 60); + return m ? 
`${h}h ${m}m` : `${h}h`; + } + const d = Math.floor(sec / 86400); + const h = Math.round((sec % 86400) / 3600); + return h ? `${d}d ${h}h` : `${d}d`; +} + +function openConnectivityDialog() { + const dlg = el('#connectivity-dialog'); + const content = el('#connectivity-content'); + const c = state.connectivity || {}; + const events = c.events || []; + if (events.length === 0) { + content.innerHTML = '
No transitions recorded yet. Once a Spark goes down and comes back, you\'ll see entries here.
'; + dlg.showModal(); + return; + } + const bySpark = {}; + for (const e of events) { + (bySpark[e.spark] = bySpark[e.spark] || []).push(e); + } + const html = Object.entries(bySpark).map(([spark, evs]) => { + const downs = evs.filter(e => e.transition === 'down').length; + const mac = c.macs?.[spark]; + return ` +
+

${escapeHtml(spark)}${mac ? ` ${escapeHtml(mac)}` : ''}

+
${evs.length} transition${evs.length===1?'':'s'} · ${downs} down event${downs===1?'':'s'} in window
+ ${evs.slice(-25).reverse().map(e => ` +
+ ${escapeHtml(e.at.replace('T', ' ').replace('Z', ''))} + ${e.transition === 'up' ? '↑ came back online' : '↓ dropped offline'} + ${e.down_seconds != null ? `was down ${fmtDuration(e.down_seconds)}` : ''}${e.up_seconds != null ? `was up ${fmtDuration(e.up_seconds)}` : ''} +
+ `).join('')} +
+ `; + }).join(''); + content.innerHTML = html; + dlg.showModal(); +} + +async function wakeSpark(name) { + try { + const r = await fetchJSON(`/api/spark/${name}/wake`, { method: 'POST' }); + alert(`Wake-on-LAN sent to ${name} (MAC ${r.mac}, via ${r.delivered_via}). Give it ~30 seconds to wake; the card will go green when it comes back.`); + } catch (e) { + alert(`Wake failed: ${e.message}`); + } +} + function renderHardware() { const panel = el('#hardware-panel'); const grid = el('#hardware-grid'); @@ -138,14 +197,23 @@ function renderHardware() { const card = document.createElement('div'); if (!s.reachable) { card.className = 'hw-card unreachable'; + const mac = state.connectivity?.macs?.[key]; + const wolRow = mac + ? `
+ ${escapeHtml(mac)} + + +
` + : `
MAC not yet known — once it's been up once with this dashboard installed, "Wake" will appear here.
`; card.innerHTML = `
${escapeHtml(key)} unreachable
${escapeHtml(s.host || '')} — ${escapeHtml(s.error || 'no response')}
+ ${wolRow}
- Spark Control can't restart a Spark that won't answer SSH. Steps to try: + If Wake-on-LAN doesn't bring it back, manual steps:
  1. Verify it's powered on (check the front LED).
  2. Ping it from another LAN device.
  3. @@ -1307,6 +1375,13 @@ async function init() { el('#nim-cancel').addEventListener('click', () => el('#nim-dialog').close()); el('#nim-form').addEventListener('submit', submitNim); el('#nim-prog-close').addEventListener('click', () => el('#nim-progress-dialog').close()); + el('#open-connectivity').addEventListener('click', openConnectivityDialog); + el('#connectivity-close').addEventListener('click', () => el('#connectivity-dialog').close()); + // Wake-on-LAN buttons live on unreachable hardware cards; delegate. + el('#hardware-grid').addEventListener('click', (e) => { + const btn = e.target.closest('[data-wake]'); + if (btn) wakeSpark(btn.dataset.wake); + }); setupCatalogDialog(); setupAdvancedDialog(); // Open WebUI link from /api/config diff --git a/image/app/static/index.html b/image/app/static/index.html index 05a67c2..f9cf853 100644 --- a/image/app/static/index.html +++ b/image/app/static/index.html @@ -26,8 +26,22 @@