From 39f8410623160f9c9a55438670c846546dfc0a1d Mon Sep 17 00:00:00 2001 From: Keysat Date: Mon, 15 Jun 2026 22:57:40 -0500 Subject: [PATCH] v0.21.0:0 - matrix-bridge bot tile (status, update, restart, logs) --- AGENTS.md | 5 +- image/app/config.py | 18 ++ image/app/matrix_bridge.py | 186 ++++++++++++++++++ image/app/server.py | 97 ++++++++- image/app/services.py | 11 ++ image/app/static/app.js | 144 +++++++++++++- image/app/static/index.html | 31 +++ image/app/static/style.css | 6 +- image/tests/test_matrix_bridge.py | 47 +++++ package/startos/actions/configureSparks.ts | 9 + .../startos/fileModels/sparkConfig.yaml.ts | 2 + package/startos/main.ts | 2 + package/startos/versions/v0_1_0.ts | 4 +- 13 files changed, 549 insertions(+), 13 deletions(-) create mode 100644 image/app/matrix_bridge.py create mode 100644 image/tests/test_matrix_bridge.py diff --git a/AGENTS.md b/AGENTS.md index 0d37483..1325a20 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -56,10 +56,11 @@ Subsystem guidance lives in `docs/guides/` and loads when matching files are tou ## Current state - **Working (v0.20.0:0, installed and serving):** swap dashboard; chat / transcribe / diarize(+chunk) / TTS proxies; embeddings + rerank + hybrid search (Qdrant); `/scrub` + `/rehydrate`; label-merge incl. dual-channel; per-Spark SSH-key copy + WireGuard `VPN ` hardware-card badge. Spark 2 audio stack healthy. Security hardening (v0.19.0:0 — shellsafe SSH-injection guard, Qdrant path-injection, same-origin CSRF guard) shipped and stable; evidence in `EVALUATION.md`. -- **Tests:** offline pytest harness in `image/tests/` — `cd image && .venv/bin/python -m pytest` (65 passing). Covers `build_launch_command` (incl. the shell-injection round-trip), the transcript↔diarizer label-merge, and the `shellsafe` validators. Mock-heavy swap/proxy tests deliberately skipped (low ROI). Redaction + live-audio suites remain standalone scripts. 
+- **matrix-bridge tile (v0.21.0:0 — code complete, NOT yet built/installed):** new "matrix-bridge" service tile (kind `bot`) on the Always-on services panel. Status badge (docker-state only — no HTTP health port), Restart/Stop/Start (generic `/api/services` path), **Update** (streamed `git fetch && git reset --hard origin/{branch} && docker compose up -d --build` via `app/matrix_bridge.py`, 25-min cap, fail-loud), **View logs** (`docker logs --tail 100`). Driven as a dedicated SSH user **directly** (no `sudo -iu` — spark2 has no passwordless sudo). The user is a **blank-default "Configure Sparks" field** (`matrix_bridge_user`); blank → service unconfigured → tile hidden (keeps the shared package portable, no hardcoded username). Host reuses `spark2_host`; container/dir/branch are env-overridable defaults (`matrix-bridge` / `~/matrix-bridge` / `master`). Tile also auto-hides when the container is absent. New endpoints: `POST /api/matrix-bridge/update` (+`/{id}`, `/{id}/stream`), `GET /api/matrix-bridge/logs`. **Owner prereq before Update works:** convert `~/matrix-bridge` to a Gitea clone, and authorize the package key for `modelo` unless `spark2_user == modelo`. Then bump-build-install (`cd package && make x86 && make install` — restarts live service, get go/no-go). +- **Tests:** offline pytest harness in `image/tests/` — `cd image && .venv/bin/python -m pytest` (70 passing). Covers `build_launch_command` (incl. the shell-injection round-trip), the transcript↔diarizer label-merge, the `shellsafe` validators, and `matrix_bridge.build_update_command` (+ phase detection). Mock-heavy swap/proxy tests deliberately skipped (low ROI). Redaction + live-audio suites remain standalone scripts. - **Signal Engine "flakiness":** diagnosed as *not* a server bug — transient 1–4s unresponsiveness while the single GPU is busy. Client-side remedy (in-flight cap 2 / ceiling 3 / retry-on-timeout+503) drafted and **forwarded to that dev (owner confirmed 2026-06-15)**. 
Awaiting whether they want the measured concurrency knee. - **Stance (decided, not built):** no public interface / no API-token auth — LAN + WireGuard/Tailscale split-tunnel only; the CSRF guard covers the browser-driven vector. - **Known limits:** `/health` blips while the GPU is busy (mitigated client-side); dual-channel can miss a quiet local word under loud remote bleed; connectivity log misses sub-5s outages between 5s polls; diarizer caps at 4 speakers. - **Infra gotcha (safety):** passwordless sudo is NOT configured on spark2 — design unprivileged probes for any Spark feature (the badge uses `ip`, not `sudo wg show`). spark2 sits on the `starttunnel` WireGuard subnet (`10.59.211.6/24`, survives reboot). Owner declined SSH-key rotation after the 2026-06-12 history scrub (only the key *name* leaked) — don't re-flag. - **Hosting:** self-hosted Gitea — remote `gitea`, branch `master`, over SSH; push after committing. (Wart: commit `8d839e3` is mislabeled `v0.13.0:4` but contains through v0.18.0:0.) -- **Next:** (1) audio concurrency sweep — only if the Signal Engine dev wants the measured knee; needs owner OK in a quiet window. (2) Otherwise pull from `ROADMAP.md`: local-path/fine-tuned model support (new) or P2 tech-debt. Parakeet long-audio guard is deferred (rationale in ROADMAP). +- **Next:** (1) matrix-bridge Phase 3 — owner does the one-time prereqs (Gitea clone of `~/matrix-bridge` + authorize the package key for `modelo`), then bump-build-install v0.21.0:0 and verify the tile flips across a manual `docker stop`/`start` and that Update/Restart/logs work. (2) audio concurrency sweep — only if the Signal Engine dev wants the measured knee; needs owner OK in a quiet window. (3) Otherwise pull from `ROADMAP.md`: local-path/fine-tuned model support (new) or P2 tech-debt. Parakeet long-audio guard is deferred (rationale in ROADMAP). 
diff --git a/image/app/config.py b/image/app/config.py index 45cf425..5aa0830 100644 --- a/image/app/config.py +++ b/image/app/config.py @@ -42,6 +42,11 @@ class Settings: qdrant_user: str qdrant_container: str qdrant_collection: str + matrix_bridge_host: str + matrix_bridge_user: str + matrix_bridge_container: str + matrix_bridge_dir: str + matrix_bridge_branch: str redaction_map_db: str redaction_map_ttl: int ssh_key_path: str @@ -81,6 +86,19 @@ class Settings: qdrant_user=_env("QDRANT_USER") or spark2_user, qdrant_container=_env("QDRANT_CONTAINER") or "qdrant", qdrant_collection=_env("QDRANT_COLLECTION", ""), + # matrix-bridge bot container, driven as its own SSH user (the owner + # of the ~/matrix-bridge git clone) so git/docker run unprivileged. + # The user is BLANK by default and set via the "Configure Sparks" + # action; leaving it blank reports the service as unconfigured, which + # hides the tile. That keeps the shared package portable — a + # deployment without the bot never shows a stray tile or a hardcoded + # username. Host defaults to Spark 2 (same box); container/dir/branch + # are sensible defaults. All are env-overridable. + matrix_bridge_host=_env("MATRIX_BRIDGE_HOST") or spark2_host, + matrix_bridge_user=_env("MATRIX_BRIDGE_USER"), + matrix_bridge_container=_env("MATRIX_BRIDGE_CONTAINER") or "matrix-bridge", + matrix_bridge_dir=_env("MATRIX_BRIDGE_DIR") or "~/matrix-bridge", + matrix_bridge_branch=_env("MATRIX_BRIDGE_BRANCH") or "master", # Redaction gateway pseudonym-map store (server-held de-anon key). redaction_map_db=_env("REDACTION_MAP_DB", "/data/redaction_maps.db"), redaction_map_ttl=int(_env("REDACTION_MAP_TTL", "7200")), diff --git a/image/app/matrix_bridge.py b/image/app/matrix_bridge.py new file mode 100644 index 0000000..7ceb031 --- /dev/null +++ b/image/app/matrix_bridge.py @@ -0,0 +1,186 @@ +"""Update + logs for the matrix-bridge bot container on the Spark. 
# Hard ceiling on a single update. A first build after a base-image bump is
# slow (minutes); the cache makes later ones quick. 25 min is generous headroom
# without letting a genuinely wedged build spin forever.
_UPDATE_TIMEOUT_S = 1500

# Most recent output lines kept in memory per job; older lines are evicted.
_MAX_JOB_LINES = 1000

# Finished jobs kept around for late pollers before the oldest are forgotten.
_MAX_RETAINED_JOBS = 20


def _quote_remote_dir(directory: str) -> str:
    """Shell-quote `directory` while keeping a leading `~`/`~user` expandable.

    Tilde expansion only happens when the tilde prefix is unquoted, so a path
    like `~/my bot` becomes `~/'my bot'`: the prefix stays bare and only the
    remainder is quoted. A prefix containing anything other than a plain user
    name is not treated as a tilde prefix and the whole string is quoted.
    An empty string is returned unchanged (the original behaviour).
    """
    import shlex  # local import: the module's import block is left untouched

    if not directory:
        return directory
    if directory.startswith("~"):
        head, sep, rest = directory.partition("/")
        user = head[1:]
        if all(ch.isalnum() or ch in "._-" for ch in user):
            return head + sep + (shlex.quote(rest) if rest else "")
    return shlex.quote(directory)


def build_update_command(directory: str, branch: str) -> str:
    """Return the update one-liner, run from the bot's git clone as its owner.

    The command is `cd DIR && git fetch origin && git reset --hard
    origin/BRANCH && docker compose up -d --build`.

    Args:
        directory: Path of the git clone on the remote host. May start with
            `~` or `~user`; that prefix is left unquoted so the remote shell
            expands it.
        branch: Branch name to reset to (as `origin/BRANCH`).

    Returns:
        A single shell command string.

    Both values come from operator config rather than request input, but they
    are still quoted at the sink so a stray space or metacharacter in config
    cannot change the shape of the command. Values made only of shell-safe
    characters (the defaults included) produce exactly the same string as
    plain interpolation.
    """
    import shlex  # local import: the module's import block is left untouched

    ref = shlex.quote(f"origin/{branch}")
    return (
        f"cd {_quote_remote_dir(directory)} && "
        f"git fetch origin && "
        f"git reset --hard {ref} && "
        f"docker compose up -d --build"
    )


def _phase_for(line: str, container: str = "matrix-bridge") -> Optional[str]:
    """Map a streamed output line to a human-readable phase.

    Args:
        line: One line of output from the update command.
        container: Container name to look for in the output. Defaults to the
            stock name so existing single-argument callers are unaffected.

    Returns:
        The phase label to display, or None to keep the current phase.

    Matching is deliberately loose: the checks are substring tests evaluated
    in order, and the first group that matches wins.
    """
    low = line.lower()
    if "git reset" in low or "head is now at" in low:
        return "Resetting to the latest release…"
    if (
        "docker compose" in low
        or "buildkit" in low
        or low.startswith("step ")
        or "=> " in line
        or "building " in low
    ):
        return "Building the bot image…"
    if (
        "recreate" in low
        or "starting" in low
        or "started" in low
        or f"container {container.lower()}" in low
    ):
        return "Recreating the container…"
    if "already up to date" in low:
        return "No new code; rebuilding…"
    return None


@dataclass
class UpdateJob:
    """In-memory record of one update run: state, phase and captured output."""

    id: str
    started_at: str
    state: str = "starting"
    lines: list[str] = field(default_factory=list)
    returncode: Optional[int] = None
    finished_at: Optional[str] = None
    phase: str = "Starting…"
    # Number of lines evicted from the front of `lines`. `dropped + len(lines)`
    # is the total ever appended, so a consumer that remembers an absolute
    # position can keep its place after the buffer has been trimmed.
    dropped: int = 0

    def append(self, line: str) -> None:
        """Add one output line, evicting the oldest beyond the in-memory cap."""
        self.lines.append(line)
        overflow = len(self.lines) - _MAX_JOB_LINES
        if overflow > 0:
            del self.lines[:overflow]
            self.dropped += overflow


class MatrixBridgeManager:
    """Runs the update job and fetches logs for the matrix-bridge container."""

    def __init__(self, settings: "Settings") -> None:
        self.settings = settings
        self.lock = asyncio.Lock()
        self.jobs: dict[str, UpdateJob] = {}
        self.current_job_id: Optional[str] = None
        # Strong references to in-flight update tasks. The event loop only
        # keeps weak references, so a fire-and-forget task needs an owner.
        self._tasks: set = set()

    def _configured(self) -> bool:
        """True when both the SSH host and the SSH user are set."""
        s = self.settings
        return bool(s.matrix_bridge_host and s.matrix_bridge_user)

    def get(self, job_id: str) -> Optional[UpdateJob]:
        """Return the job with this id, or None if unknown (or pruned)."""
        return self.jobs.get(job_id)

    def _prune_jobs(self) -> None:
        """Forget the oldest finished jobs beyond the retention cap."""
        excess = len(self.jobs) - _MAX_RETAINED_JOBS
        if excess <= 0:
            return
        finished = [jid for jid, j in self.jobs.items() if j.finished_at is not None]
        for jid in finished[:excess]:
            del self.jobs[jid]

    async def fetch_logs(self, tail: int = 100) -> dict:
        """Run a one-shot `docker logs --tail N CONTAINER` (stderr merged in).

        Args:
            tail: Number of trailing lines requested; clamped to 1..1000.

        Returns:
            `{"ok": False, "error": ...}` when unconfigured, otherwise a dict
            with `ok`, `rc`, `container` and `output`.
        """
        s = self.settings
        if not self._configured():
            return {"ok": False, "error": "matrix-bridge host not configured"}
        tail = max(1, min(int(tail), 1000))
        # tail is already int-clamped, but quote at the sink anyway so the
        # shellsafe convention (no raw interpolation into an SSH command)
        # holds regardless of caller.
        cmd = (
            f"docker logs --tail {quote_arg(str(tail))} "
            f"{quote_arg(s.matrix_bridge_container)} 2>&1"
        )
        rc, out, err = await ssh_run(
            s.matrix_bridge_host, s.matrix_bridge_user, cmd, s, timeout=20
        )
        return {
            "ok": rc == 0,
            "rc": rc,
            "container": s.matrix_bridge_container,
            "output": (out or err).strip(),
        }

    async def trigger_update(self) -> UpdateJob:
        """Start an update in the background and return its job record.

        Raises:
            RuntimeError: if the service is unconfigured, or an update is
                already running (message contains "in progress").
        """
        if not self._configured():
            raise RuntimeError("matrix-bridge host not configured")
        # `current_job_id` is set here, synchronously, whereas the lock is only
        # taken once the background task first runs. Checking both closes the
        # window in which a second trigger could slip in between the two.
        if self.lock.locked() or self.current_job_id is not None:
            raise RuntimeError("An update is already in progress")
        self._prune_jobs()
        job = UpdateJob(
            id=uuid.uuid4().hex[:8],
            started_at=datetime.now(timezone.utc).isoformat(),
        )
        self.jobs[job.id] = job
        self.current_job_id = job.id
        task = asyncio.create_task(self._run(job))
        self._tasks.add(task)
        task.add_done_callback(self._tasks.discard)
        return job

    async def _run(self, job: UpdateJob) -> None:
        """Execute the job under the lock and record its terminal state."""
        async with self.lock:
            try:
                await self._do(job)
                if job.state != "failed":
                    job.state = "done"
                    job.returncode = 0
                    job.phase = "Done"
            except asyncio.TimeoutError:
                job.append(f"[error] update timed out after {_UPDATE_TIMEOUT_S}s")
                job.state = "failed"
                job.returncode = 124
                job.phase = "Timed out"
            except asyncio.CancelledError:
                # Give the job a terminal state so pollers see it finish
                # instead of waiting on a returncode that never arrives, then
                # let the cancellation propagate.
                job.append("[error] update cancelled")
                job.state = "failed"
                job.returncode = 130
                job.phase = "Cancelled"
                raise
            except Exception as e:
                job.append(f"[error] {type(e).__name__}: {e}")
                job.state = "failed"
                job.phase = "Failed"
                if job.returncode is None:
                    job.returncode = 1
            finally:
                job.finished_at = datetime.now(timezone.utc).isoformat()
                if self.current_job_id == job.id:
                    self.current_job_id = None

    async def _do(self, job: UpdateJob) -> None:
        """Stream the update command's output into `job` until it exits.

        Raises:
            asyncio.TimeoutError: when the overall deadline is exceeded.
        """
        s = self.settings
        cmd = build_update_command(s.matrix_bridge_dir, s.matrix_bridge_branch)
        job.append(f"$ {cmd}")
        job.state = "running"
        job.phase = "Fetching latest code…"

        handle = StreamHandle()
        gen = ssh_stream(s.matrix_bridge_host, s.matrix_bridge_user, cmd, s, handle=handle)
        deadline = time.monotonic() + _UPDATE_TIMEOUT_S
        try:
            while True:
                # One deadline for the whole run, not a per-line timeout.
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    raise asyncio.TimeoutError
                try:
                    line = await asyncio.wait_for(gen.__anext__(), timeout=remaining)
                except StopAsyncIteration:
                    break
                job.append(line)
                phase = _phase_for(line, s.matrix_bridge_container)
                if phase:
                    job.phase = phase
        finally:
            # Closing the generator terminates the underlying ssh process and
            # populates handle.returncode via ssh_stream's finally block.
            await gen.aclose()

        rc = handle.returncode or 0
        if rc != 0:
            job.state = "failed"
            job.phase = "Failed"
            job.returncode = rc
http_ready stays None so the badge isn't pinned + # to a "Starting…" verdict that can never clear. + http = {"ok": None, "base_url": None} else: # Custom services expose a /health endpoint by convention. http = await check_kokoro(settings) if svc.kind == "tts" else {"ok": None, "base_url": svc.host and f"http://{svc.host}:{svc.port}"} @@ -484,7 +491,9 @@ async def get_services() -> dict: "container": svc.container, "kind": svc.kind, "base_url": http.get("base_url"), - "http_ready": bool(http.get("ok")), + # None (not False) for services with no HTTP surface (the bot), so + # the UI judges them by docker state alone instead of "Starting…". + "http_ready": None if svc.kind == "bot" else bool(http.get("ok")), # Prefer the check fn's own top-level model key (embeddings reports # it there); fall back to a model field inside detail for services # whose /health embeds it (parakeet). @@ -500,8 +509,11 @@ async def get_services() -> dict: results = await asyncio.gather(*[one(n) for n in services.keys()]) for name, info in results: out[name] = info - # Feed http reachability into the connectivity log (transition-only) - record_state(name, bool(info.get("http_ready"))) + # Feed http reachability into the connectivity log (transition-only). + # Skip services with no HTTP surface (http_ready is None) — they'd + # otherwise register as perpetually "down". 
# ---- matrix-bridge bot: update (git pull + rebuild) + logs ----
# Status badge + start/stop/restart ride the generic /api/services machinery
# (the bot is a registered ServiceDef). Only the long-running Update and the
# logs view need bespoke endpoints.

def _serialize_mb_update(job) -> dict:
    """Flatten an update job into the JSON shape the dashboard consumes."""
    return {
        "id": job.id,
        "state": job.state,
        "phase": job.phase,
        "started_at": job.started_at,
        "finished_at": job.finished_at,
        "returncode": job.returncode,
        "lines": job.lines,
    }


@app.post("/api/matrix-bridge/update")
async def post_matrix_bridge_update() -> dict:
    """Pull latest code, rebuild, and recreate the bot container.

    Long-running (docker build), so this only starts the job and returns its
    id; progress is read from the job/stream endpoints.

    Raises:
        HTTPException: 409 when an update is already running, 503 for any
            other RuntimeError from the manager (e.g. not configured).
    """
    try:
        job = await matrix_bridge.trigger_update()
    except RuntimeError as e:
        # The manager signals both conditions with RuntimeError; the message
        # text is the only discriminator available here.
        raise HTTPException(409 if "in progress" in str(e) else 503, str(e))
    return {"job_id": job.id, "state": job.state}


@app.get("/api/matrix-bridge/update/{job_id}")
async def get_matrix_bridge_update(job_id: str) -> dict:
    """Return a point-in-time snapshot of an update job (404 if unknown)."""
    job = matrix_bridge.get(job_id)
    if job is None:
        raise HTTPException(404, "no such job")
    return _serialize_mb_update(job)


@app.get("/api/matrix-bridge/update/{job_id}/stream")
async def stream_matrix_bridge_update(job_id: str, request: Request):
    """Stream an update job as Server-Sent Events.

    Emits unnamed `data:` events carrying `{"line": ...}`, `phase` events when
    the phase label changes, and one final `done` event with the state and
    return code. Each new connection replays the job's buffered output from
    the start.
    """
    job = matrix_bridge.get(job_id)
    if job is None:
        raise HTTPException(404, "no such job")

    async def gen():
        # `sent` is an ABSOLUTE count of lines emitted, not an index into
        # job.lines. The job caps its buffer by deleting from the front, so a
        # plain index stops advancing once the cap is reached and the stream
        # would go silent for the rest of a long build. If the job reports how
        # many lines it has evicted (`dropped`), translate through that;
        # without it this degrades to the plain index.
        sent = 0
        last_phase = None
        while True:
            # An update can run for minutes; bail promptly if the client is
            # gone rather than spinning the poll loop until the job's ceiling.
            if await request.is_disconnected():
                return
            # Snapshot without awaiting in between so the two values agree.
            dropped = getattr(job, "dropped", 0)
            total = dropped + len(job.lines)
            if total > sent:
                # Lines evicted before we read them are gone; resume at the
                # oldest line still buffered.
                batch = job.lines[max(sent - dropped, 0):]
                sent = total
                for line in batch:
                    yield f"data: {json.dumps({'line': line})}\n\n"
            if job.phase != last_phase:
                yield f"event: phase\ndata: {json.dumps({'state': job.state, 'phase': job.phase})}\n\n"
                last_phase = job.phase
            if (
                job.returncode is not None
                and sent >= getattr(job, "dropped", 0) + len(job.lines)
            ):
                yield f"event: done\ndata: {json.dumps({'state': job.state, 'returncode': job.returncode})}\n\n"
                return
            await asyncio.sleep(0.5)

    return StreamingResponse(gen(), media_type="text/event-stream")


@app.get("/api/matrix-bridge/logs")
async def get_matrix_bridge_logs(tail: int = Query(100, ge=1, le=1000)) -> dict:
    """Return the last N lines of `docker logs` for the bot container.

    Raises:
        HTTPException: 503 when the manager reports an `error` (the service is
            not configured), matching the update endpoint; 502 when the remote
            command itself failed.
    """
    result = await matrix_bridge.fetch_logs(tail=tail)
    if not result.get("ok"):
        status = 503 if result.get("error") else 502
        raise HTTPException(
            status,
            result.get("output") or result.get("error") or "could not read logs",
        )
    return result


a/image/app/services.py b/image/app/services.py index b44308d..2c9b71b 100644 --- a/image/app/services.py +++ b/image/app/services.py @@ -89,6 +89,17 @@ def services_from_settings(s: Settings) -> dict[str, ServiceDef]: container=s.qdrant_container, port=s.qdrant_port, ), + # matrix-bridge Matrix bot. No HTTP port to probe (host networking, no + # health endpoint) — judged purely by docker state. Driven as its own + # SSH user (modelo, the repo owner) so git/docker run unprivileged. + "matrix-bridge": ServiceDef( + name="matrix-bridge", + kind="bot", + host=s.matrix_bridge_host, + user=s.matrix_bridge_user, + container=s.matrix_bridge_container, + port=0, + ), } for entry in load_custom_services(): key = entry.get("key") diff --git a/image/app/static/app.js b/image/app/static/app.js index 7ac4939..074d4b4 100644 --- a/image/app/static/app.js +++ b/image/app/static/app.js @@ -13,6 +13,7 @@ const state = { swap_progress: 0, // 0–1 services: {}, service_action_in_flight: null, // e.g. "parakeet:restart" + mb_update_in_flight: false, // matrix-bridge update job running hardware: {}, config: {}, configured: true, @@ -438,8 +439,13 @@ function classifyService(s) { if (s.docker_state === 'missing') return 'missing'; if (s.docker_state === 'restarting') return 'unhealthy'; if (s.docker_state === 'exited') return 'unhealthy'; - if (s.docker_state === 'running' && !s.http_ready) return 'starting'; - if (s.docker_state === 'running' && s.http_ready) return 'running'; + if (s.docker_state === 'running') { + // http_ready === false means an HTTP probe is expected but failing → still + // warming up. null means the service has no HTTP surface (e.g. the bot), so + // a running container is simply healthy. 
+ if (s.http_ready === false) return 'starting'; + return 'running'; + } return s.docker_state || 'unknown'; } @@ -471,6 +477,11 @@ async function renderServices() { grid.innerHTML = ''; for (const [name, s] of entries) { const cls = classifyService(s); + const isBot = s.kind === 'bot'; + // The bot tile is opt-in: it only belongs to deployments that actually run + // matrix-bridge. When the container is absent (missing) or the host isn't + // configured, hide the tile entirely rather than show a stray red card. + if (isBot && (cls === 'missing' || cls === 'unconfigured')) continue; const card = document.createElement('div'); card.className = `service-card ${cls}`; const inFlight = state.service_action_in_flight && state.service_action_in_flight.startsWith(name + ':'); @@ -537,9 +548,11 @@ async function renderServices() { ${restartsRow} ${deepRow}
+ ${isBot ? `` : ''} + ${isBot ? `` : ''}
// ===================== matrix-bridge bot (update + logs) =====================

// Client-side bookkeeping for the update dialog: the job being watched, its
// open EventSource (if any), and the elapsed-time ticker.
const mbState = { job_id: null, eventsource: null, timer: null, started_at: null };

// Start (or restart) the elapsed-time ticker from the epoch-ms timestamp `at`.
// Falls back to "now" when `at` is not a finite number — e.g. Date.parse() of
// a missing or malformed timestamp — so the display never shows NaN:NaN.
function mbTimerStart(at) {
  mbState.started_at = Number.isFinite(at) ? at : Date.now();
  if (mbState.timer) clearInterval(mbState.timer);
  const tick = () => {
    if (!mbState.started_at) return;
    const sec = Math.max(0, Math.floor((Date.now() - mbState.started_at) / 1000));
    el('#mb-update-elapsed').textContent = `${Math.floor(sec / 60)}:${(sec % 60).toString().padStart(2, '0')}`;
  };
  tick();
  mbState.timer = setInterval(tick, 500);
}

// Click handler for the tile's Update button: confirm, start the job, then
// hand over to the progress dialog.
async function onMatrixBridgeUpdate() {
  if (state.mb_update_in_flight) return;
  if (!confirm('Update the matrix-bridge bot?\n\nThis pulls the latest code, rebuilds the container image, and recreates the container. The first build after a base-image change can take several minutes. The bot is briefly offline while it restarts.')) return;
  state.mb_update_in_flight = true;
  renderServices();
  try {
    const r = await fetchJSON('/api/matrix-bridge/update', { method: 'POST' });
    attachMbUpdateProgress(r.job_id);
  } catch (e) {
    state.mb_update_in_flight = false;
    renderServices();
    alert('Update failed to start: ' + e.message);
  }
}

// Open the progress dialog for `jobId` and follow the job to completion.
async function attachMbUpdateProgress(jobId) {
  // Never leave an earlier stream running behind a new one.
  if (mbState.eventsource) { mbState.eventsource.close(); mbState.eventsource = null; }
  mbState.job_id = jobId;
  const log = el('#mb-update-log');
  log.textContent = '';
  el('#mb-update-title').textContent = 'Updating matrix-bridge…';
  el('#mb-update-phase').textContent = 'Starting…';
  el('#mb-update-dialog').showModal();
  try {
    const snap = await fetchJSON(`/api/matrix-bridge/update/${jobId}`);
    mbTimerStart(Date.parse(snap.started_at));
    el('#mb-update-phase').textContent = snap.phase || 'Working…';
    if (snap.returncode != null) {
      // Already finished: the snapshot is the whole story, no stream needed.
      log.textContent = (snap.lines || []).map(l => l + '\n').join('');
      onMbUpdateDone(snap);
      return;
    }
    // Still running: deliberately do NOT seed the log from the snapshot. The
    // stream replays the job's output from its first line, so seeding here
    // would show every line twice.
  } catch { mbTimerStart(Date.now()); }
  const es = new EventSource(`/api/matrix-bridge/update/${jobId}/stream`);
  mbState.eventsource = es;
  es.onmessage = ev => {
    try {
      const d = JSON.parse(ev.data);
      if (d.line !== undefined) {
        log.textContent += d.line + '\n';
        log.scrollTop = log.scrollHeight;
      }
    } catch {}
  };
  es.addEventListener('phase', ev => {
    try { el('#mb-update-phase').textContent = JSON.parse(ev.data).phase; } catch {}
  });
  es.addEventListener('done', ev => {
    let d = {}; try { d = JSON.parse(ev.data); } catch {}
    onMbUpdateDone(d);
  });
  es.onerror = () => {
    // Don't leave the Update button wedged-disabled on a dropped stream. The
    // job keeps running server-side; re-clicking Update returns a clean 409.
    es.close();
    mbState.eventsource = null;
    // The elapsed counter is no longer tracking anything we can observe.
    if (mbState.timer) { clearInterval(mbState.timer); mbState.timer = null; }
    state.mb_update_in_flight = false;
    el('#mb-update-phase').textContent = 'Lost connection to the update stream — reopen or check logs.';
    renderServices();
  };
}

// Tear down streaming and the timer, then show the job's final verdict.
// `d` is a job snapshot or the payload of the stream's `done` event.
function onMbUpdateDone(d) {
  if (mbState.eventsource) { mbState.eventsource.close(); mbState.eventsource = null; }
  if (mbState.timer) { clearInterval(mbState.timer); mbState.timer = null; }
  state.mb_update_in_flight = false;
  const failed = d.state === 'failed' || (d.returncode != null && d.returncode !== 0);
  if (failed) {
    el('#mb-update-title').textContent = `Update failed (rc=${d.returncode})`;
    el('#mb-update-phase').textContent = 'Failed — see the log above.';
  } else if (d.state === undefined) {
    // The payload could not be read, so don't claim a success we can't see.
    el('#mb-update-title').textContent = 'Update finished';
    el('#mb-update-phase').textContent = 'Finished — result unknown; check the log above.';
  } else {
    el('#mb-update-title').textContent = 'Update complete';
    el('#mb-update-phase').textContent = 'Done ✓';
  }
  // Refresh the tile's badge.
  (async () => { try { state.services = await fetchJSON('/api/services'); } catch {} renderServices(); })();
}

// Open the logs dialog and load the most recent container output into it.
async function openMatrixBridgeLogs() {
  const pre = el('#mb-logs-pre');
  el('#mb-logs-title').textContent = 'matrix-bridge logs';
  pre.textContent = 'Loading…';
  el('#mb-logs-dialog').showModal();
  await loadMatrixBridgeLogs();
}

// Fetch the last 100 log lines; the Refresh button is disabled while the
// request is in flight.
async function loadMatrixBridgeLogs() {
  const pre = el('#mb-logs-pre');
  const btn = el('#mb-logs-refresh');
  if (btn) btn.disabled = true;
  try {
    const r = await fetchJSON('/api/matrix-bridge/logs?tail=100');
    pre.textContent = r.output || '(no output)';
    pre.scrollTop = pre.scrollHeight;
  } catch (e) {
    pre.textContent = 'Could not read logs: ' + e.message;
  } finally {
    if (btn) btn.disabled = false;
  }
}

el('#mb-update-dialog').close()); + // Dismissing the modal (Close or Esc) stops streaming; the job runs on + // server-side and re-clicking Update returns a 409 if still in progress. + el('#mb-update-dialog').addEventListener('close', () => { + if (mbState.eventsource) { mbState.eventsource.close(); mbState.eventsource = null; } + if (mbState.timer) { clearInterval(mbState.timer); mbState.timer = null; } + state.mb_update_in_flight = false; + renderServices(); + }); + el('#mb-logs-close').addEventListener('click', () => el('#mb-logs-dialog').close()); + el('#mb-logs-refresh').addEventListener('click', loadMatrixBridgeLogs); el('#open-connectivity').addEventListener('click', openConnectivityDialog); el('#connectivity-close').addEventListener('click', () => el('#connectivity-dialog').close()); // Hardware-card buttons (Wake-on-LAN on unreachable cards; SSH-key copy on diff --git a/image/app/static/index.html b/image/app/static/index.html index 2057858..18626e3 100644 --- a/image/app/static/index.html +++ b/image/app/static/index.html @@ -164,6 +164,37 @@ + + + + + + + +