From 7ae6ab3ba83fb14a7e569fe6430cc17266b1aeb9 Mon Sep 17 00:00:00 2001 From: Keysat Date: Thu, 18 Jun 2026 07:07:08 -0500 Subject: [PATCH] v0.25.0:0 - cluster coordination layer (swap lock + webhook + schedule registry) GPU-arbiter safety layer for when automation, not just the dashboard, swaps models: - swap reservation lock (POST/GET/DELETE /api/swap/lock); 423-enforced in post_swap via a single-read gate, TTL-bounded, secret-token auth, human force-release override + dashboard banner - swap webhook (swap_complete/swap_failed) fired outside the swap lock, optional HMAC signature, configurable URL+secret - read-only schedule registry (GET/POST/DELETE /api/schedule) + dashboard panel New module image/app/coordination.py; docs/COORDINATION.md for consumers; 22 offline tests in test_coordination.py. --- AGENTS.md | 8 +- ROADMAP.md | 9 +- docs/COORDINATION.md | 157 ++++++++ image/app/config.py | 7 + image/app/coordination.py | 342 ++++++++++++++++++ image/app/server.py | 109 +++++- image/app/static/app.js | 102 +++++- image/app/static/index.html | 15 + image/app/static/style.css | 36 ++ image/app/swap.py | 24 +- image/tests/test_coordination.py | 201 ++++++++++ package/startos/actions/configureSparks.ts | 18 + .../startos/fileModels/sparkConfig.yaml.ts | 5 + package/startos/main.ts | 4 + package/startos/versions/v0_1_0.ts | 4 +- 15 files changed, 1026 insertions(+), 15 deletions(-) create mode 100644 docs/COORDINATION.md create mode 100644 image/app/coordination.py create mode 100644 image/tests/test_coordination.py diff --git a/AGENTS.md b/AGENTS.md index d647a4c..2f41c01 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,7 +33,7 @@ Subsystem guidance lives in `docs/guides/` and loads when matching files are tou - `image/app/` — FastAPI app (`server.py` entry, routers in sibling modules, `static/` dashboard UI). - `package/startos/` — StartOS manifest, interfaces, actions, version + release notes. -- `docs/` — `AUDIO_API.md`, `EMBEDDINGS.md`, `REDACTION_GATEWAY.md` (consumer-facing API refs; update with API changes). +- `docs/` — `AUDIO_API.md`, `EMBEDDINGS.md`, `REDACTION_GATEWAY.md`, `COORDINATION.md` (consumer-facing API refs; update with API changes). - `README.md` (overview), `HANDOFF.md` (fresh-user install guide), `runbook.md` (ops notes), `known-issues.md`, `ROADMAP.md` (longer-term backlog — items move into "Current state" below when picked up). ## Conventions @@ -55,12 +55,12 @@ Subsystem guidance lives in `docs/guides/` and loads when matching files are tou ## Current state -- **Live service runs v0.22.0:0** (installed and serving). **v0.24.0:0 is the latest in tree — built clean, committed (`26070eb`), tagged `v0.24.0`, and pushed to `gitea/master`. Two close-out steps remain: (a) `make release` to publish the s9pk asset to Gitea Releases (NOT run this session — needs `GITEA_URL` + write `GITEA_TOKEN`, neither in env; the s9pk is built and waiting in `package/`), and (b) the live install. v0.23.0:0 is in the same boat — also committed/tagged/Gitea-published but install PENDING.** Both installs blocked on the same mDNS issue (P3 line below). Working features: swap dashboard; chat / transcribe / diarize(+chunk) / TTS proxies; embeddings + rerank + hybrid search (Qdrant); `/scrub` + `/rehydrate`; label-merge incl. dual-channel; per-Spark SSH-key copy + WireGuard `VPN ` hardware-card badge; configurable vLLM port (blank ⇒ 8888) + **configurable topology** (vLLM container name, hide-services list, second-Spark vLLM monitor — v0.24.0:0). Local/fine-tuned models (v0.23.0:0) + the topology knobs land live once these installs go through. Spark 2 audio stack healthy. Security hardening (v0.19.0:0 — shellsafe SSH-injection guard, Qdrant path-injection, same-origin CSRF guard) shipped and stable; evidence in `EVALUATION.md`. +- **Live service runs v0.22.0:0** (installed and serving). **v0.25.0:0 is the latest in tree — coordination layer (swap lock + webhook + schedule registry); built/typechecked clean, NOT yet committed/tagged/installed (this session's work).** It stacks on three releases also staged-but-not-live: v0.24.0:0 (configurable topology — committed `26070eb`, tagged, pushed to `gitea/master`), v0.23.0:0 (local/fine-tuned models — committed/tagged/Gitea-published). **Close-out backlog for all of these: (a) commit/tag/push v0.25.0:0; (b) `make release` to publish s9pk assets to Gitea Releases (needs `GITEA_URL` + write `GITEA_TOKEN`, neither in env); (c) the live install.** Installs blocked on the same mDNS issue (P3 line below). Working features: swap dashboard; chat / transcribe / diarize(+chunk) / TTS proxies; embeddings + rerank + hybrid search (Qdrant); `/scrub` + `/rehydrate`; label-merge incl. dual-channel; per-Spark SSH-key copy + WireGuard `VPN ` hardware-card badge; configurable vLLM port (blank ⇒ 8888); **configurable topology** (vLLM container name, hide-services list, second-Spark vLLM monitor — v0.24.0:0); local/fine-tuned models (v0.23.0:0); **coordination layer** (v0.25.0:0 — GPU swap reservation lock with `423`-enforced manual-swap pause + human Release override, swap_complete/swap_failed webhook, read-only schedule registry; API in `docs/COORDINATION.md`). Everything from v0.23 onward lands live once the installs go through. Spark 2 audio stack healthy. Security hardening (v0.19.0:0 — shellsafe SSH-injection guard, Qdrant path-injection, same-origin CSRF guard) shipped and stable; evidence in `EVALUATION.md`. - **matrix-bridge bot tile (done, v0.21.0:1, verified live):** `bot`-kind service tile — status badge from docker-state only (no HTTP port), plus **Update** / Restart / Stop/Start / **View logs**. Code: `app/matrix_bridge.py` + `/api/matrix-bridge/{update,logs}` (update streams; 25-min cap; fail-loud). Driven directly as `modelo` on Spark 2 (**no `sudo -iu`** — spark2 has no passwordless sudo). User is a blank-default Configure-Sparks field (`matrix_bridge_user`); blank → tile hidden (portable). Host reuses `spark2_host` (`192.168.1.87` = the bot's box `spark-32d0`); container/dir/branch are env-overridable defaults. **Load-bearing ops dep:** Update's `git fetch` runs as `modelo`, which needs `modelo`'s `~/.ssh/config` pinning the Gitea deploy key with `IdentitiesOnly yes` — else the wrong key is offered and Gitea denies (publickey). Optional next, only if the bot dev asks: Docker `HEALTHCHECK` for running-but-disconnected detection (spec §Note). -- **Tests:** offline pytest harness in `image/tests/` — `cd image && .venv/bin/python -m pytest` (102 passing). Covers `build_launch_command` (incl. the shell-injection round-trip + local-model bind-mount), the transcript↔diarizer label-merge, the `shellsafe` validators, `matrix_bridge.build_update_command` (+ phase detection), and the configurable-topology layer (`test_topology.py`: `DISABLED_SERVICES` parsing, `vllm_container` override, disabled-service skip in `services_from_settings` + `check_*`, `probe_vllm_endpoint`). Mock-heavy swap/proxy tests deliberately skipped (low ROI). Redaction + live-audio suites remain standalone scripts. +- **Tests:** offline pytest harness in `image/tests/` — `cd image && .venv/bin/python -m pytest` (124 passing). Covers `build_launch_command` (incl. the shell-injection round-trip + local-model bind-mount), the transcript↔diarizer label-merge, the `shellsafe` validators, `matrix_bridge.build_update_command` (+ phase detection), the configurable-topology layer (`test_topology.py`), and the coordination layer (`test_coordination.py`: swap-lock lifecycle/expiry/token-auth, schedule-registry CRUD, webhook payload + HMAC signature — `now` is injected into the lock so expiry is tested without sleeping). Mock-heavy swap/proxy/endpoint tests deliberately skipped (low ROI). Redaction + live-audio suites remain standalone scripts. - **Signal Engine "flakiness":** diagnosed as *not* a server bug — transient 1–4s unresponsiveness while the single GPU is busy. Client-side remedy (in-flight cap 2 / ceiling 3 / retry-on-timeout+503) drafted and **forwarded to that dev (owner confirmed 2026-06-15)**. Awaiting whether they want the measured concurrency knee. - **Stance (decided, not built):** no public interface / no API-token auth — LAN + WireGuard/Tailscale split-tunnel only; the CSRF guard covers the browser-driven vector. - **Known limits:** `/health` blips while the GPU is busy (mitigated client-side); dual-channel can miss a quiet local word under loud remote bleed; connectivity log misses sub-5s outages between 5s polls; diarizer caps at 4 speakers; matrix-bridge badge won't visibly flip on a fast `docker restart` (status re-checked only after the command returns). - **Infra gotcha (safety):** passwordless sudo is NOT configured on spark2 — design unprivileged probes for any Spark feature (the badge uses `ip`, not `sudo wg show`). spark2 sits on the `starttunnel` WireGuard subnet (`10.59.211.6/24`, survives reboot). Owner declined SSH-key rotation after the 2026-06-12 history scrub (only the key *name* leaked) — don't re-flag. - **Hosting:** self-hosted Gitea — remote `gitea`, branch `master`, over SSH; push after committing. (Wart: commit `8d839e3` is mislabeled `v0.13.0:4` but contains through v0.18.0:0.) -- **Next — committed 2026-06-17: OpenClaw/Johnny-5 coexistence epic (full plan + design stance in `ROADMAP.md` → "Cluster coordination").** Stance: Spark Control = control plane / GPU arbiter, **not** a job runner; business cron jobs live in separate services that *call* its swap API (swaps are already API-driven via `POST /api/swap`). Sequence: (1) **configurable `VLLM_PORT`** — SHIPPED **v0.22.0:0** (Configure-Sparks field, blank ⇒ 8888; + `_env_int` hardening in `config.py` so a blank/bad port no longer crashes startup, killing a P3 tech-debt item). Committed `136a471`, pushed, tagged `v0.22.0`, rebuilt clean, installed, and **published to the self-hosted Gitea Releases** 2026-06-17 (`make release` → `scripts/gitea-release.sh`, takes `GITEA_URL` + a write token). **Distribution model (decided 2026-06-17):** Gitea Releases + a read-only token the adopter's agent uses to pull the latest s9pk (`GET /api/v1/repos/grant/spark-control/releases/latest` → download the `.s9pk` asset → sideload). Note: Gitea returns `browser_download_url` on its `.local` ROOT_URL, which won't resolve off-LAN — a remote adopter pulls via whatever address reaches the Gitea (the WireGuard IP). (2) **local-path/fine-tuned models** — DONE in tree, staged as **v0.23.0:0** (`ModelDef.local_path` + exactly-one-source validator; swap bind-mounts the dir at the same container path via the launch script's `VLLM_SPARK_EXTRA_DOCKER_ARGS` hook, **no `launch-cluster.sh` change**; "+ Add local model" UI form + `local` badge; `validate_local_path`; disk-delete refused for local; 94 tests pass. Reviewer-agent pass done, findings addressed (path validation + chat-template-location guard folded into the `ModelDef` validator so YAML/override entries are checked too; `_merge_overrides` skips a bad entry instead of failing the whole catalog; `VLLM_SPARK_EXTRA_DOCKER_ARGS` contract documented in `runbook.md`). **Committed `e783653`, tagged `v0.23.0`, built clean, published to Gitea Releases — but `make install` to the live Start9 FAILED: `immense-voyage.local` wasn't resolving via mDNS from the Mac (server up at `192.168.1.72`; `start-cli -H ` reaches it but returns UNAUTHORIZED, auth bound to the registered `.local` host). FINISH-HERE: flush mDNS (`sudo dscacheutil -flushcache && sudo killall -HUP mDNSResponder`) or add a hosts entry, then re-run `cd package && make install`** (details in runbook → "Sideload can't reach the server"). (3) **configurable topology** — DONE in tree, staged as **v0.24.0:0** (built clean, not yet committed/installed). Three optional Configure-Sparks knobs: vLLM container name (`VLLM_CONTAINER`, blank ⇒ `vllm_node`, threaded into the swap log-tail + validator exec via `quote_arg`); "services to hide" (`DISABLED_SERVICES` comma list → `Settings.disabled_services` frozenset, skipped by `services_from_settings`, the `check_*` probes, deep-health `run_all`, and connectivity logging — kills the Parakeet-on-8000 collision); second-Spark vLLM monitor via a `kind: vllm` custom service in `services-overrides.yaml` (`probe_vllm_endpoint` shared with `check_vllm`). `/api/endpoints` gained a `disabled` flag; the health-dot hides when disabled. 102 tests pass (+8 in `test_topology.py`). Swap mechanism deliberately NOT generalized to raw `docker run` (that's coordination, item 4). Install pending — same mDNS situation as v0.23.0. Next: (4) coordination layer (swap lock + swap webhook + schedule visibility) — only when our own automation lands. Still-open older threads: audio concurrency sweep (only if the Signal Engine dev wants the knee; needs a quiet window); optional matrix-bridge Docker `HEALTHCHECK` if the bot dev asks; Parakeet long-audio guard deferred (rationale in ROADMAP). +- **Next — committed 2026-06-17: OpenClaw/Johnny-5 coexistence epic (full plan + design stance in `ROADMAP.md` → "Cluster coordination").** Stance: Spark Control = control plane / GPU arbiter, **not** a job runner; business cron jobs live in separate services that *call* its swap API (swaps are already API-driven via `POST /api/swap`). Sequence: (1) **configurable `VLLM_PORT`** — SHIPPED **v0.22.0:0** (Configure-Sparks field, blank ⇒ 8888; + `_env_int` hardening in `config.py` so a blank/bad port no longer crashes startup, killing a P3 tech-debt item). Committed `136a471`, pushed, tagged `v0.22.0`, rebuilt clean, installed, and **published to the self-hosted Gitea Releases** 2026-06-17 (`make release` → `scripts/gitea-release.sh`, takes `GITEA_URL` + a write token). **Distribution model (decided 2026-06-17):** Gitea Releases + a read-only token the adopter's agent uses to pull the latest s9pk (`GET /api/v1/repos/grant/spark-control/releases/latest` → download the `.s9pk` asset → sideload). Note: Gitea returns `browser_download_url` on its `.local` ROOT_URL, which won't resolve off-LAN — a remote adopter pulls via whatever address reaches the Gitea (the WireGuard IP). (2) **local-path/fine-tuned models** — DONE in tree, staged as **v0.23.0:0** (`ModelDef.local_path` + exactly-one-source validator; swap bind-mounts the dir at the same container path via the launch script's `VLLM_SPARK_EXTRA_DOCKER_ARGS` hook, **no `launch-cluster.sh` change**; "+ Add local model" UI form + `local` badge; `validate_local_path`; disk-delete refused for local; 94 tests pass. Reviewer-agent pass done, findings addressed (path validation + chat-template-location guard folded into the `ModelDef` validator so YAML/override entries are checked too; `_merge_overrides` skips a bad entry instead of failing the whole catalog; `VLLM_SPARK_EXTRA_DOCKER_ARGS` contract documented in `runbook.md`). **Committed `e783653`, tagged `v0.23.0`, built clean, published to Gitea Releases — but `make install` to the live Start9 FAILED: `immense-voyage.local` wasn't resolving via mDNS from the Mac (server up at `192.168.1.72`; `start-cli -H ` reaches it but returns UNAUTHORIZED, auth bound to the registered `.local` host). FINISH-HERE: flush mDNS (`sudo dscacheutil -flushcache && sudo killall -HUP mDNSResponder`) or add a hosts entry, then re-run `cd package && make install`** (details in runbook → "Sideload can't reach the server"). (3) **configurable topology** — DONE in tree, staged as **v0.24.0:0** (built clean, not yet committed/installed). Three optional Configure-Sparks knobs: vLLM container name (`VLLM_CONTAINER`, blank ⇒ `vllm_node`, threaded into the swap log-tail + validator exec via `quote_arg`); "services to hide" (`DISABLED_SERVICES` comma list → `Settings.disabled_services` frozenset, skipped by `services_from_settings`, the `check_*` probes, deep-health `run_all`, and connectivity logging — kills the Parakeet-on-8000 collision); second-Spark vLLM monitor via a `kind: vllm` custom service in `services-overrides.yaml` (`probe_vllm_endpoint` shared with `check_vllm`). `/api/endpoints` gained a `disabled` flag; the health-dot hides when disabled. 102 tests pass (+8 in `test_topology.py`). Swap mechanism deliberately NOT generalized to raw `docker run` (that's coordination, item 4). Install pending — same mDNS situation as v0.23.0. (4) **coordination layer** — DONE in tree, staged as **v0.25.0:0** (brought forward 2026-06-17 on request rather than waiting for our own automation). `image/app/coordination.py` + `docs/COORDINATION.md`: swap reservation lock (`GET/POST/DELETE /api/swap/lock`, secret token, `423`-enforced in `post_swap`, TTL-bounded in-memory, `?force=true` human override, dashboard banner + swap-button pause), swap webhook (`swap_complete`/`swap_failed` fired outside the swap lock from `SwapManager._run`, optional HMAC `X-Spark-Signature`, Configure-Sparks URL+secret), schedule registry (`GET/POST/DELETE /api/schedule`, read-only "Scheduled jobs" panel). +20 tests (`test_coordination.py`). Built/typechecked clean; commit + install pending. Still-open older threads: audio concurrency sweep (only if the Signal Engine dev wants the knee; needs a quiet window); optional matrix-bridge Docker `HEALTHCHECK` if the bot dev asks; Parakeet long-audio guard deferred (rationale in ROADMAP). diff --git a/ROADMAP.md b/ROADMAP.md index 6a234f3..e138644 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -12,10 +12,11 @@ Sequenced: 1. **Configurable `VLLM_PORT`** — DONE, v0.22.0:0. Field in Configure Sparks (blank ⇒ 8888); numeric-setting parsing hardened so a blank/bad value falls back instead of crashing startup. Was the immediate "vLLM unreachable" bug for an adopter on port 8000. 2. **Local-path / fine-tuned model support** — DONE, v0.23.0:0. Catalog/`ModelDef` gained `local_path` (exactly one of `repo`/`local_path`); swap bind-mounts the dir into the vLLM container at the same path via the launch script's `VLLM_SPARK_EXTRA_DOCKER_ARGS` hook (no `launch-cluster.sh` change); "+ Add local model" form + `local` badge; disk-delete refused for local models; `validate_local_path` boundary check. His merged `ten31-v2` was the motivating case. 3. **Configurable topology** — DONE, v0.24.0:0. Three optional Configure-Sparks knobs: vLLM container name (`VLLM_CONTAINER`, blank ⇒ `vllm_node`; threaded through the swap log-tail + pre-flight validator via `quote_arg`); "services to hide" (`DISABLED_SERVICES`, comma list — hidden services show no tile and are skipped by status/deep-health/connectivity probes, killing the Parakeet-on-8000 collision); and a second-Spark vLLM monitor via a `kind: vllm` custom service in `services-overrides.yaml` (read-only tile probed through the shared `probe_vllm_endpoint`). `/api/endpoints` gained a `disabled` flag. Covers report P4/P5/#6. (Generalizing the *swap* mechanism to the adopter's raw `docker run` was deliberately left out — that's coordination, item 4; he swaps via his own crons and uses Spark Control to monitor.) -4. **Coordination layer** — build when our own automation actually lands (zero value until something other than the dashboard swaps models): - - **Swap lock** with holder + TTL (`POST` / `GET` / `DELETE /api/swap/lock`). An external scheduler acquires it before swapping; the dashboard then refuses manual swaps and shows who holds the GPU and until when. Enforced by the swap path, not advisory. - - **Swap-event webhook** (`swap_complete` / `swap_failed`) to a configurable URL, so downstream consumers update their provider config when the running model changes. - - **Schedule visibility** — read-only view the dashboard surfaces, *registered by* external schedulers (Spark Control does not own the schedule). +4. **Coordination layer** — DONE in tree, staged as **v0.25.0:0** (built/typechecked clean; install pending). All three primitives shipped; `image/app/coordination.py` + `docs/COORDINATION.md`. Brought forward 2026-06-17 on request rather than waiting for our own automation. + - **Swap lock** with holder + TTL (`POST` / `GET` / `DELETE /api/swap/lock`). Acquire returns a secret token; the swap endpoint refuses any real swap (`423`) that doesn't present it in `X-Swap-Lock-Token`, so the dashboard's manual swap is paused while a scheduler holds it (with a `?force=true` human override). In-memory + TTL-bounded → resets to unlocked on restart; re-acquire with the token extends. Enforced in `post_swap`, not advisory. + - **Swap-event webhook** (`swap_complete` / `swap_failed`) to a configurable URL (Configure-Sparks field), fired from `SwapManager._run` *outside* the swap lock; optional shared secret ⇒ `X-Spark-Signature` HMAC. Fire-and-forget (5 s, no retries); dry runs don't fire. + - **Schedule visibility** — `GET/POST/DELETE /api/schedule`; read-only "Scheduled jobs" dashboard panel, registered by external schedulers. Spark Control stores and displays, never executes. + - Still NOT generalized: the swap *mechanism* to raw `docker run` (that's the adopter's own crons' job). Tests: `image/tests/test_coordination.py` (22 cases — lock lifecycle/expiry/token, the single-read swap gate, schedule CRUD + id validation, webhook payload+signature). Known limit: lock + schedules are in-memory (a restart frees the lock and empties the registry until schedulers re-register) — persist to `/data` only if that bites. ## Near term - parakeet-asr long-audio memory guard — **deferred 2026-06-15, low priority.** A duration cap on `/v1/audio/diarize`: Sortformer runs the whole file in one pass (`diarizer.py:128-135`) over Spark 2's *shared* 128 GB unified memory (also feeding Kokoro/embeddings/Qdrant), so one giant single file can thrash into swap. **Precautionary — no observed incident**, and the production consumer (Recap Relay) already chunks via `/diarize-chunk` (~5-min, already bounded), so the only exposed path is a consumer POSTing one huge file to the full `/diarize`. When picked up: add a configurable `MAX_DIARIZE_SECONDS` guard in `diarizer.py` right after `duration` is computed (~line 130) → raise → HTTP 413 in `main.py` (mirrors the existing `MAX_UPLOAD_MB` 413); ship via the Reapply-patches action (restarts the live parakeet-asr container → needs go/no-go). Leave transcription out of v1 (upstream/un-patched file; parakeet-TDT handles long audio better). Revisit only if a consumer starts sending long single files. diff --git a/docs/COORDINATION.md b/docs/COORDINATION.md new file mode 100644 index 0000000..709d046 --- /dev/null +++ b/docs/COORDINATION.md @@ -0,0 +1,157 @@ +# Cluster coordination through Spark Control (v0.25.0) + +Spark Control is the **GPU arbiter, not a job runner.** Your recurring pipelines +(model-warming crons, "daily X" generators, batch jobs) live in your own +services and *drive Spark Control's swap API*. This page documents the safety +layer around that: a **swap reservation lock**, a **swap-event webhook**, and a +**read-only schedule registry**. + +If only the dashboard ever swaps models, you don't need any of this — it's for +when something automated also swaps. + +All endpoints are on the Spark Control host (same LAN/VPN URL as the LLM, audio, +and embeddings proxies). There is no API-token auth by design (LAN + split-tunnel +VPN only); a non-browser client passes the same-origin guard automatically. + +--- + +## 1. Swap reservation lock + +A short, TTL-bounded reservation of the swap path. While a lock is held, **any +real swap that doesn't present the holder's token is refused with `423 Locked`** +— including the dashboard's manual swap. The holder *name* is descriptive; the +returned **token** is the secret that authorises swaps and the release. + +The lock is in-memory: it resets to *unlocked* if Spark Control restarts (the +safe-for-availability default), and the swap engine's own in-progress guard +still prevents two swaps running at once. + +### `POST /api/swap/lock` — acquire (or extend) + +```json +// request +{ "holder": "openclaw-daily-vol", "ttl_seconds": 900, "note": "daily vol run" } + +// 200 response +{ + "held": true, + "holder": "openclaw-daily-vol", + "acquired_at": "2026-06-17T12:00:00+00:00", + "expires_at": "2026-06-17T12:15:00+00:00", + "seconds_remaining": 900, + "note": "daily vol run", + "token": "a1b2c3…" // SECRET — store it; needed to swap and to release +} +``` + +- `ttl_seconds` is optional (default 900) and clamped to `[1, 86400]`. +- **`409`** if a *different* holder already holds it (body includes the current + `lock` state). To **extend** your own lock, POST again with the same `holder` + **and** your `token` — the token is preserved and the window slides forward. + +### `GET /api/swap/lock` — status (no token) + +```json +{ "held": true, "holder": "openclaw-daily-vol", "expires_at": "…", "seconds_remaining": 612, "note": "…" } +// or +{ "held": false } +``` + +### `DELETE /api/swap/lock` — release + +Send your token in the `X-Swap-Lock-Token` header (or `?token=`): + +``` +DELETE /api/swap/lock +X-Swap-Lock-Token: a1b2c3… +``` + +- **`403`** if the token doesn't match. The dashboard's human override is + `DELETE /api/swap/lock?force=true` (no token). + +### Swapping while you hold the lock + +Pass the token on the swap call; the dashboard (no token) is then blocked: + +``` +POST /api/swap +X-Swap-Lock-Token: a1b2c3… +{ "model_key": "gemma-3-27b" } +``` + +Recommended scheduler flow: **acquire → swap (with token) → poll `/api/swap/{id}` +→ release**. Always release in a `finally`; if you crash, the TTL frees it. + +> `POST /api/swap/{key}/validate` (pre-flight) and dry-run swaps are **not** +> blocked by the lock — they don't touch the cluster. + +--- + +## 2. Swap-event webhook + +Configure a URL in **Configure Sparks → "Swap webhook URL"**. After every real +swap, Spark Control POSTs: + +```json +{ + "event": "swap_complete", // or "swap_failed" + "job_id": "1a2b3c4d", + "model_key": "gemma-3-27b", + "state": "ready", // or "failed" + "returncode": 0, + "started_at": "2026-06-17T12:00:00+00:00", + "finished_at": "2026-06-17T12:03:11+00:00", + "dry_run": false +} +``` + +Headers: `X-Spark-Event: swap_complete`. If you set a **webhook secret**, the +body is signed: `X-Spark-Signature: sha256=` (HMAC-SHA256 of the raw body +with the shared secret). Verify it like: + +```python +import hmac, hashlib +expected = "sha256=" + hmac.new(secret.encode(), raw_body, hashlib.sha256).hexdigest() +assert hmac.compare_digest(expected, request.headers["X-Spark-Signature"]) +``` + +Delivery is best-effort and fire-and-forget (5 s timeout, no retries) — a +webhook failure never affects the swap itself. Dry runs don't fire. + +--- + +## 3. Schedule registry (read-only display) + +So the dashboard can show *what's scheduled to touch the GPU and when*, your +schedulers register their jobs here. **Spark Control only displays these — it +never executes them.** + +### `POST /api/schedule` — register / update + +```json +// request (pass a stable `id` to update in place on re-register) +{ "id": "daily-vol", "name": "Daily Vol", "owner": "openclaw", + "cron": "0 6 * * *", "next_run": "2026-06-18T06:00:00Z", + "description": "Swaps to the big model, generates the vol report" } + +// response: the stored entry (generates an id if you omit one) +``` + +`name` is required; `id` (if given) must match `[A-Za-z0-9_.-]` (≤64 chars). + +### `GET /api/schedule` — list + +```json +{ "schedules": [ { "id": "daily-vol", "name": "Daily Vol", "owner": "openclaw", + "cron": "0 6 * * *", "next_run": "…", "description": "…", + "registered_at": "…", "updated_at": "…" } ] } +``` + +### `DELETE /api/schedule/{id}` — deregister + +```json +{ "deleted": true } +``` + +The registry is in-memory — re-register your schedules on your own startup so +they survive a Spark Control restart. diff --git a/image/app/config.py b/image/app/config.py index 75107c4..a13891f 100644 --- a/image/app/config.py +++ b/image/app/config.py @@ -103,6 +103,8 @@ class Settings: bind_port: int open_webui_url: str ngc_api_key: str + swap_webhook_url: str + swap_webhook_secret: str @classmethod def from_env(cls) -> "Settings": @@ -165,6 +167,11 @@ class Settings: bind_port=_env_int("BIND_PORT", 9999), open_webui_url=_env("OPEN_WEBUI_URL", ""), ngc_api_key=_env("NGC_API_KEY", ""), + # Coordination layer: fire a swap-lifecycle webhook to this URL so + # downstream consumers re-point their model config on a swap. Blank + # ⇒ disabled. The optional secret HMAC-signs the body (X-Spark-Signature). + swap_webhook_url=_env("SWAP_WEBHOOK_URL", ""), + swap_webhook_secret=_env("SWAP_WEBHOOK_SECRET", ""), ) @property diff --git a/image/app/coordination.py b/image/app/coordination.py new file mode 100644 index 0000000..1d88ad6 --- /dev/null +++ b/image/app/coordination.py @@ -0,0 +1,342 @@ +"""Cluster-coordination layer: the GPU swap lock, swap-event webhook, and the +read-only schedule registry. + +Spark Control is the **control plane / GPU arbiter, not a job runner.** Recurring +business pipelines live in separate services that *call* the swap API. These +three primitives add the *safety* layer around that: + +- **Swap lock** — a TTL-bounded reservation of the swap path. An external + scheduler acquires it before swapping; while held by someone else the + dashboard's manual swap is refused (enforced in the swap endpoint, not + advisory). Holder name is descriptive; the returned token is the secret that + authorises a swap or a release. +- **Webhook** — fires `swap_complete` / `swap_failed` to a configurable URL so + downstream consumers re-point their provider config when the running model + changes. Optionally HMAC-signed. +- **Schedule registry** — a read-only view the dashboard surfaces, *registered + by* external schedulers. Spark Control stores what it's told; it does not own + or execute any schedule. + +All state is in-memory (mirroring the swap/download/NIM job managers). On a +restart the lock resets to *unlocked* — the available-by-default failure mode; +the swap manager's own in-progress guard still prevents two swaps at once — +and schedulers re-register their schedules. +""" +from __future__ import annotations +import hashlib +import hmac +import json +import logging +import re +import uuid +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from typing import Optional + +import httpx + +log = logging.getLogger(__name__) + +# A lock reserves the GPU for a window; clamp the TTL so a buggy client can +# neither pin the cluster forever nor take a zero-length (useless) lock. +LOCK_TTL_MIN = 1 +LOCK_TTL_MAX = 86_400 # 24h +LOCK_TTL_DEFAULT = 900 # 15 min + +# Schedule ids are reflected to the dashboard and used as a URL path segment on +# delete, so a caller-supplied id is whitelist-checked. Generated ids are hex. +_SCHEDULE_ID_RE = re.compile(r"^[A-Za-z0-9_.-]{1,64}$") + + +def valid_schedule_id(value: str) -> bool: + """Whitelist check for a caller-supplied schedule id (register and delete).""" + return bool(_SCHEDULE_ID_RE.match(value or "")) + + +def _now() -> datetime: + return datetime.now(timezone.utc) + + +def _iso(dt: datetime) -> str: + return dt.isoformat() + + +# ---------------------------------------------------------------- swap lock ---- + +class LockHeld(Exception): + """The lock is held by a different holder. Carries the public lock state so + the endpoint can return holder + expiry in the 409 body.""" + + def __init__(self, state: dict) -> None: + self.state = state + super().__init__("swap lock is held by another holder") + + +@dataclass +class LockState: + holder: str + token: str + acquired_at: datetime + expires_at: datetime + note: str = "" + + def public(self, now: datetime) -> dict: + """Token-free view safe to expose on GET / in error bodies.""" + return { + "held": True, + "holder": self.holder, + "acquired_at": _iso(self.acquired_at), + "expires_at": _iso(self.expires_at), + "seconds_remaining": max(0, int((self.expires_at - now).total_seconds())), + "note": self.note, + } + + +class SwapLockManager: + """In-memory, TTL-bounded reservation of the GPU swap path. + + `now` is injectable on every method purely so the expiry logic is testable + without sleeping; production calls omit it and get wall-clock UTC. + """ + + def __init__(self) -> None: + self._lock: Optional[LockState] = None + + def _active(self, now: Optional[datetime] = None) -> Optional[LockState]: + """The current lock if one is held and unexpired; lazily clears an + expired lock so it never lingers.""" + now = now or _now() + if self._lock is not None and self._lock.expires_at <= now: + self._lock = None + return self._lock + + def status(self, now: Optional[datetime] = None) -> dict: + now = now or _now() + active = self._active(now) + return active.public(now) if active else {"held": False} + + def acquire( + self, + holder: str, + ttl_seconds: Optional[int] = None, + note: str = "", + token: Optional[str] = None, + *, + now: Optional[datetime] = None, + ) -> LockState: + """Acquire a free lock (new token), or extend one already held by + presenting its token. A request without the token is refused even if the + holder name matches — the name is descriptive, the token is the secret. + """ + now = now or _now() + holder = (holder or "").strip() + if not holder: + raise ValueError("holder is required") + ttl = ttl_seconds if ttl_seconds is not None else LOCK_TTL_DEFAULT + try: + ttl = int(ttl) + except (TypeError, ValueError): + ttl = LOCK_TTL_DEFAULT + ttl = max(LOCK_TTL_MIN, min(LOCK_TTL_MAX, ttl)) + + active = self._active(now) + if active is not None: + # Held — only the token-holder may extend/re-acquire. + if not (token and hmac.compare_digest(active.token, token)): + raise LockHeld(active.public(now)) + self._lock = LockState( + holder=holder or active.holder, + token=active.token, + acquired_at=active.acquired_at, + expires_at=now + timedelta(seconds=ttl), + note=note or active.note, + ) + return self._lock + + self._lock = LockState( + holder=holder, + token=uuid.uuid4().hex, + acquired_at=now, + expires_at=now + timedelta(seconds=ttl), + note=note, + ) + return self._lock + + def verify(self, token: Optional[str], now: Optional[datetime] = None) -> bool: + """True iff `token` matches the currently-active lock.""" + active = self._active(now) + return bool(active and token and hmac.compare_digest(active.token, token)) + + def is_blocked_by(self, token: Optional[str], now: Optional[datetime] = None) -> Optional[dict]: + """Single-read swap gate. Returns the public lock state if an active + lock blocks a swap carrying this token, else None. Does exactly one + `_active()` read so the decision can't straddle a TTL expiry the way a + separate status()+verify() pair could (which, at the expiry tick, would + spuriously refuse a swap that should now be allowed).""" + now = now or _now() + active = self._active(now) + if active is None: + return None + if token and hmac.compare_digest(active.token, token): + return None + return active.public(now) + + def release( + self, + token: Optional[str] = None, + *, + force: bool = False, + now: Optional[datetime] = None, + ) -> bool: + """Release the lock. Returns False if nothing was held. Requires the + matching token unless `force` (the human override from the dashboard).""" + active = self._active(now) + if active is None: + return False + if not force and not self.verify(token, now): + raise PermissionError("token does not hold the lock") + self._lock = None + return True + + +# ----------------------------------------------------------------- webhook ---- + +def build_webhook_payload( + *, + event: str, + job_id: str, + model_key: str, + state: str, + returncode: Optional[int], + started_at: Optional[str], + finished_at: Optional[str], + dry_run: bool, +) -> dict: + return { + "event": event, # swap_complete | swap_failed + "job_id": job_id, + "model_key": model_key, + "state": state, + "returncode": returncode, + "started_at": started_at, + "finished_at": finished_at, + "dry_run": dry_run, + } + + +def sign_payload(secret: str, body: bytes) -> str: + """`X-Spark-Signature` value: sha256 HMAC of the exact JSON body the + consumer receives, so they can recompute and trust it.""" + return "sha256=" + hmac.new(secret.encode(), body, hashlib.sha256).hexdigest() + + +class WebhookNotifier: + """Fire-and-forget POST of swap-lifecycle events. A webhook failure is + logged and swallowed — it must never affect the swap outcome.""" + + def __init__(self, url: str, secret: str = "", timeout: float = 5.0) -> None: + self.url = (url or "").strip() + self.secret = secret or "" + self.timeout = timeout + + @property + def enabled(self) -> bool: + return bool(self.url) + + async def fire(self, event: str, payload: dict) -> None: + if not self.enabled: + return + body = json.dumps(payload).encode() + headers = { + "content-type": "application/json", + "user-agent": "spark-control-webhook", + "x-spark-event": event, + } + if self.secret: + headers["x-spark-signature"] = sign_payload(self.secret, body) + try: + async with httpx.AsyncClient(timeout=self.timeout) as client: + await client.post(self.url, content=body, headers=headers) + except Exception as e: # noqa: BLE001 — best-effort, never propagate + log.warning("swap webhook to %s failed: %s", self.url, e) + + +# -------------------------------------------------------- schedule registry ---- + +@dataclass +class ScheduleEntry: + id: str + name: str + owner: str = "" + cron: str = "" + next_run: str = "" + description: str = "" + registered_at: str = "" + updated_at: str = "" + + def public(self) -> dict: + return { + "id": self.id, + "name": self.name, + "owner": self.owner, + "cron": self.cron, + "next_run": self.next_run, + "description": self.description, + "registered_at": self.registered_at, + "updated_at": self.updated_at, + } + + +class ScheduleRegistry: + """What external schedulers tell us about their cron jobs. Read-only from the + dashboard's side; Spark Control never executes any of it.""" + + def __init__(self) -> None: + self._items: dict[str, ScheduleEntry] = {} + + def list(self) -> list[dict]: + return [e.public() for e in self._items.values()] + + def register( + self, + *, + name: str, + id: Optional[str] = None, + owner: str = "", + cron: str = "", + next_run: str = "", + description: str = "", + ) -> ScheduleEntry: + name = (name or "").strip() + if not name: + raise ValueError("name is required") + if id is not None: + id = id.strip() + if id and not valid_schedule_id(id): + raise ValueError("id must match [A-Za-z0-9_.-] (max 64 chars)") + ts = _iso(_now()) + existing = self._items.get(id) if id else None + if existing is not None: + existing.name = name + existing.owner = owner.strip() + existing.cron = cron + existing.next_run = next_run + existing.description = description + existing.updated_at = ts + return existing + sid = id or uuid.uuid4().hex[:8] + entry = ScheduleEntry( + id=sid, + name=name, + owner=owner.strip(), + cron=cron, + next_run=next_run, + description=description, + registered_at=ts, + updated_at=ts, + ) + self._items[sid] = entry + return entry + + def delete(self, schedule_id: str) -> bool: + return self._items.pop(schedule_id, None) is not None diff --git a/image/app/server.py b/image/app/server.py index e8249ea..2824d66 100644 --- a/image/app/server.py +++ b/image/app/server.py @@ -11,6 +11,7 @@ from typing import Literal from .config import Settings from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary +from .coordination import LockHeld, ScheduleRegistry, SwapLockManager, WebhookNotifier, valid_schedule_id from .custom_services import add_custom_service, delete_custom_service from .audio_proxy import build_router as build_audio_router from .deep_health import DeepHealth @@ -37,7 +38,12 @@ from .wol import send_local_broadcast, send_via_peer settings = Settings.from_env() catalog = load_catalog(settings.models_yaml) -swap_manager = SwapManager(settings, catalog) +# Coordination layer (GPU arbiter): swap-lifecycle webhook, the swap reservation +# lock, and the read-only schedule registry. See coordination.py. +swap_webhook = WebhookNotifier(settings.swap_webhook_url, settings.swap_webhook_secret) +swap_lock = SwapLockManager() +schedule_registry = ScheduleRegistry() +swap_manager = SwapManager(settings, catalog, notifier=swap_webhook) download_manager = DownloadManager(settings) update_manager = UpdateManager(settings) hardware_probe = HardwareProbe(settings) @@ -67,6 +73,10 @@ _CSRF_EXEMPT_PREFIXES = ( "/api/audio/", # diarize-chunk / label-merge / transcribe-with-speakers "/api/health-event", # health reports posted by consumer apps ) +# Note: the coordination endpoints (/api/swap/lock, /api/schedule) are +# intentionally NOT exempt. External schedulers are non-browser clients (no +# Origin header) so they pass the guard already — same as /api/swap — while a +# malicious page can't drive them from the operator's browser. Don't add them. @app.middleware("http") @@ -892,9 +902,21 @@ async def validate_swap(key: str) -> dict: @app.post("/api/swap") -async def post_swap(req: SwapRequest) -> dict: +async def post_swap(req: SwapRequest, request: Request) -> dict: if not settings.configured and not req.dry_run: raise HTTPException(503, "spark1 not configured") + # Enforce the swap reservation lock (the GPU arbiter). A held lock blocks any + # real swap that doesn't present the holder's token in X-Swap-Lock-Token — so + # an external scheduler that holds the lock can swap, but the dashboard (no + # token) is refused while someone else holds it. Dry runs don't touch the + # cluster, so they're exempt. + if not req.dry_run: + blocked = swap_lock.is_blocked_by(request.headers.get("x-swap-lock-token")) + if blocked is not None: + raise HTTPException(status_code=423, detail={ + "error": "the GPU swap path is reserved by another holder", + "lock": blocked, + }) try: job = await swap_manager.trigger(req.model_key, dry_run=req.dry_run) except KeyError: @@ -949,6 +971,89 @@ async def stream_swap(job_id: str): return StreamingResponse(gen(), media_type="text/event-stream") +# ---- Coordination layer: swap lock + schedule registry ---- +# Endpoints are control-surface, not browser-exempt: an external scheduler is a +# non-browser client (no Origin header) so it passes the CSRF guard already, the +# same way it calls /api/swap today; the dashboard is same-origin. + +class LockAcquireRequest(BaseModel): + holder: str + ttl_seconds: int | None = None + note: str = "" + token: str | None = None # present only to extend an existing hold + + +@app.post("/api/swap/lock") +async def acquire_swap_lock(req: LockAcquireRequest) -> dict: + """Reserve the GPU swap path. Returns a secret token used to swap (header + X-Swap-Lock-Token) and to release. 409 if held by another holder.""" + try: + lock = swap_lock.acquire(req.holder, req.ttl_seconds, req.note, token=req.token) + except ValueError as e: + raise HTTPException(422, str(e)) + except LockHeld as e: + raise HTTPException(status_code=409, detail={ + "error": "swap lock is held by another holder", + "lock": e.state, + }) + return {**swap_lock.status(), "token": lock.token} + + +@app.get("/api/swap/lock") +async def get_swap_lock() -> dict: + """Public, token-free view of the reservation: held? who? until when?""" + return swap_lock.status() + + +@app.delete("/api/swap/lock") +async def release_swap_lock(request: Request, force: bool = Query(False)) -> dict: + """Release the reservation. Needs the matching X-Swap-Lock-Token unless + ?force=true (the human override from the dashboard).""" + token = request.headers.get("x-swap-lock-token") or request.query_params.get("token") + try: + released = swap_lock.release(token, force=force) + except PermissionError as e: + raise HTTPException(403, str(e)) + return {"released": released, **swap_lock.status()} + + +class ScheduleRequest(BaseModel): + name: str + id: str | None = None + owner: str = "" + cron: str = "" + next_run: str = "" + description: str = "" + + +@app.get("/api/schedule") +async def list_schedules() -> dict: + return {"schedules": schedule_registry.list()} + + +@app.post("/api/schedule") +async def register_schedule(req: ScheduleRequest) -> dict: + """Register (or update, by id) a schedule an external scheduler owns. Spark + Control only stores it for the dashboard — it never executes it.""" + try: + entry = schedule_registry.register( + name=req.name, id=req.id, owner=req.owner, + cron=req.cron, next_run=req.next_run, description=req.description, + ) + except ValueError as e: + raise HTTPException(422, str(e)) + return entry.public() + + +@app.delete("/api/schedule/{schedule_id}") +async def delete_schedule(schedule_id: str) -> dict: + # Whitelist the path segment at the boundary (repo convention), even though + # it's only ever a dict key — keeps it from being reflected or logged raw. + if not valid_schedule_id(schedule_id): + raise HTTPException(422, "invalid schedule id") + return {"deleted": schedule_registry.delete(schedule_id)} + + class DownloadRequest(BaseModel): repo: str mode: Literal["spark1", "spark2", "cluster"] = "spark1" diff --git a/image/app/static/app.js b/image/app/static/app.js index 7ea1778..8830e1b 100644 --- a/image/app/static/app.js +++ b/image/app/static/app.js @@ -21,11 +21,19 @@ const state = { deep_health: {}, disk_status: {}, // keyed by model key: { on_disk, total_bytes, per_host } disk_status_loaded: false, + lock: { held: false }, // GPU swap reservation (coordination layer) + schedules: [], // schedules external automation has registered }; const el = (sel) => document.querySelector(sel); const $$ = (sel) => document.querySelectorAll(sel); +// ISO timestamp -> local clock string (e.g. "2:45:10 PM"); '' if unparseable. +function fmtClock(iso) { + const t = Date.parse(iso); + return isNaN(t) ? '' : new Date(t).toLocaleTimeString(); +} + function escapeHtml(s) { if (s == null) return ''; return String(s) @@ -51,6 +59,12 @@ function renderCards() { const root = el('#cards'); root.innerHTML = ''; const isSwapping = !!state.swap_job_id; + // GPU reserved by external automation — manual swaps are refused server-side + // (423); reflect that in the buttons so the click never bounces. + const locked = !!(state.lock && state.lock.held); + const lockTip = locked + ? `Reserved by ${state.lock.holder || 'automation'}${state.lock.expires_at ? ' until ' + fmtClock(state.lock.expires_at) : ''}` + : ''; for (const key of Object.keys(state.models)) { const m = state.models[key]; const isActive = key === state.current_model_key; @@ -94,7 +108,9 @@ function renderCards() { if (isActive) { primaryBtn = ``; } else if (isOnDisk) { - primaryBtn = ``; + const swapBlocked = isSwapping || locked; + const tip = locked ? ` title="${escapeHtml(lockTip)}"` : ''; + primaryBtn = ``; } else if (m.local_path) { // A local model can't be "downloaded" — its directory has to exist on the Spark. primaryBtn = ``; @@ -1234,6 +1250,11 @@ function openDiskDeleteDialog(key) { async function triggerSwap(modelKey) { if (state.swap_job_id) return; + if (state.lock && state.lock.held) { + const until = state.lock.expires_at ? ' until ' + fmtClock(state.lock.expires_at) : ''; + alert(`The GPU swap path is reserved by ${state.lock.holder || 'automation'}${until}. Use "Release" on the reservation banner to override.`); + return; + } try { const r = await fetchJSON('/api/swap', { method: 'POST', @@ -1242,10 +1263,84 @@ async function triggerSwap(modelKey) { }); attachToSwap(r.job_id, /*needsBackfill=*/false); } catch (e) { - alert('Failed to start swap: ' + e.message); + // 423 Locked: a reservation was acquired between our last poll and this click. + if (e.message && e.message.startsWith('423')) { + alert('The GPU swap path was just reserved by automation. Refreshing…'); + pollCoordination(); + } else { + alert('Failed to start swap: ' + e.message); + } } } +// ---- coordination layer: swap lock + schedule registry ---- + +async function pollCoordination() { + try { + state.lock = await fetchJSON('/api/swap/lock'); + } catch { state.lock = { held: false }; } + try { + const r = await fetchJSON('/api/schedule'); + state.schedules = r.schedules || []; + } catch { state.schedules = []; } + renderLockBanner(); + renderSchedules(); + renderCards(); // reflect lock state on the swap buttons +} + +function renderLockBanner() { + const banner = el('#lock-banner'); + if (!banner) return; + const lock = state.lock; + if (lock && lock.held) { + const until = lock.expires_at ? ` until ${fmtClock(lock.expires_at)}` : ''; + const note = lock.note ? ` — ${escapeHtml(lock.note)}` : ''; + el('#lock-text').innerHTML = + `GPU swap path reserved by ${escapeHtml(lock.holder || 'automation')}${until}${note}. Manual swaps are paused.`; + banner.classList.remove('hidden'); + } else { + banner.classList.add('hidden'); + } +} + +function renderSchedules() { + const panel = el('#schedule-panel'); + const list = el('#schedule-list'); + if (!panel || !list) return; + const items = state.schedules || []; + if (!items.length) { + panel.classList.add('hidden'); + list.innerHTML = ''; + return; + } + list.innerHTML = items.map((s) => { + const meta = [ + s.cron ? `${escapeHtml(s.cron)}` : '', + s.next_run ? `next: ${escapeHtml(s.next_run)}` : '', + s.owner ? `by ${escapeHtml(s.owner)}` : '', + ].filter(Boolean).join(' · '); + const desc = s.description ? `
${escapeHtml(s.description)}
` : ''; + return `
+
${escapeHtml(s.name)}
+
${meta}
+ ${desc} +
`; + }).join(''); + panel.classList.remove('hidden'); +} + +async function releaseLock() { + const lock = state.lock || {}; + const who = lock.holder || 'automation'; + if (!confirm(`Force-release the GPU reservation held by ${who}? Any job relying on it may then collide with a manual swap.`)) return; + try { + await fetchJSON('/api/swap/lock?force=true', { method: 'DELETE' }); + } catch (e) { + alert('Failed to release: ' + e.message); + } + pollCoordination(); +} + async function triggerDownloadForKey(modelKey) { const m = state.models[modelKey]; if (!m) return; @@ -2102,6 +2197,7 @@ async function init() { }); el('#sshkey-close').addEventListener('click', () => el('#sshkey-dialog').close()); el('#open-local').addEventListener('click', openLocalModelDialog); + el('#lock-release').addEventListener('click', releaseLock); setupCatalogDialog(); setupAdvancedDialog(); setupLocalModelDialog(); @@ -2119,6 +2215,7 @@ async function init() { await loadModels(); await pollStatus(); await renderServices(); + pollCoordination(); pollHardware(); pollUpdates(); // Disk-status probe runs after first paint — slow over SSH and not blocking. @@ -2126,6 +2223,7 @@ async function init() { // Speech-model patches panel — slow over SSH, runs after first paint. renderSpeechModels(); setInterval(pollStatus, 5000); + setInterval(pollCoordination, 5000); // swap lock + schedule registry setInterval(pollHardware, 8000); // every 8s setInterval(pollUpdates, 300000); // every 5 min setInterval(loadDiskStatus, 60000); // every 60s — disk state changes rarely diff --git a/image/app/static/index.html b/image/app/static/index.html index 4942f78..f838ba0 100644 --- a/image/app/static/index.html +++ b/image/app/static/index.html @@ -96,6 +96,13 @@ + +