v0.27.0:0 - in-app settings gear + swap-lock route fix

Move the ~20 optional cluster knobs out of the StartOS "Configure Sparks" action (now just the 4 required fields) and into a dashboard ⚙ Settings gear, backed by a /data/app_settings.json overlay keyed by env-var names. A single shared, mutable Settings instance plus Settings.reload() applies edits live, without a restart; values from existing installs migrate automatically on first boot. Also: the support-service ports (parakeet/kokoro/embed/qdrant + vllm) are now configurable, and GET /api/swap/lock no longer 404s (it was shadowed by the /api/swap/{job_id} catch-all). WebhookNotifier is re-pointed on save so that its url/secret also reload live.
2026-06-18 13:41:28 -05:00
parent b67e001642
commit 7e0759846f
15 changed files with 797 additions and 268 deletions
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import asyncio
 import json
+import os
 from pathlib import Path

 from fastapi import FastAPI, HTTPException, Query, Request
@@ -9,6 +10,7 @@ from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel, ValidationError
 from typing import Literal

+from . import app_settings
 from .config import Settings
 from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
 from .coordination import LockHeld, ScheduleRegistry, SwapLockManager, WebhookNotifier, valid_schedule_id
@@ -37,6 +39,10 @@ from .validate import validate_launch
 from .wol import send_local_broadcast, send_via_peer


+# One-time migration: seed the in-app settings overlay from env (values set via
+# the StartOS action on a pre-gear install) before building Settings, so nothing
+# is lost on upgrade. No-op once the overlay exists. See app_settings.
+app_settings.seed_from_env(os.environ)
 settings = Settings.from_env()
 catalog = load_catalog(settings.models_yaml)
 # Coordination layer (GPU arbiter): swap-lifecycle webhook, the swap reservation
@@ -156,6 +162,35 @@ async def get_config() -> dict:
    }


+# ---- In-app settings ('gear') ----
+# The optional cluster knobs (ports, container names, support-service hosts,
+# integrations) live in an app-owned overlay on /data, edited here instead of in
+# the StartOS action — which keeps to just the four required setup fields. See
+# app_settings. Writes apply live: we rewrite the overlay then reload the shared
+# Settings instance in place, so every router/manager holding the reference picks
+# up the change with no container restart.
+@app.get("/api/settings")
+async def get_settings() -> dict:
+    """Return the current in-app settings as given by app_settings.public_view()."""
+    return app_settings.public_view()
+
+
+class SettingsUpdate(BaseModel):
+    """Request body for POST /api/settings."""
+    # Setting name -> new value, both strings; handed unchanged to
+    # app_settings.apply(). NOTE(review): presumably the keys are the env-var
+    # names the overlay is keyed by — confirm against app_settings.
+    values: dict[str, str]
+
+
+@app.post("/api/settings")
+async def post_settings(req: SettingsUpdate) -> dict:
+    """Apply the submitted setting values and refresh the live configuration.
+
+    Returns app_settings.public_view() as it stands after the update.
+    Raises HTTPException 422, carrying the error text, when
+    app_settings.apply() raises SettingsError.
+    """
+    try:
+        app_settings.apply(req.values)
+    except app_settings.SettingsError as e:
+        raise HTTPException(422, str(e))
+    # Only reached when apply() succeeded: refresh the shared module-level
+    # Settings instance.
+    settings.reload()
+    # WebhookNotifier snapshots url/secret (not the Settings object), so reload()
+    # can't reach it — re-point it explicitly so a webhook edit applies live too.
+    swap_webhook.update(settings.swap_webhook_url, settings.swap_webhook_secret)
+    return app_settings.public_view()
+
+
 def _reload_catalog() -> None:
    global catalog
    catalog = load_catalog(settings.models_yaml)
@@ -947,6 +982,56 @@ async def post_swap(req: SwapRequest, request: Request) -> dict:
    return {"job_id": job.id, "model_key": job.model_key, "state": job.state}


+# ---- Swap reservation lock (the GPU arbiter) ----
+# ROUTE ORDER IS LOAD-BEARING: these static `/api/swap/lock` routes MUST be
+# registered before the parametric `/api/swap/{job_id}` below. FastAPI matches in
+# registration order, so if `{job_id}` came first, GET /api/swap/lock would bind
+# job_id="lock", look up a (non-existent) swap job, and 404 — which is exactly
+# the bug this ordering fixes. Keep these above the {job_id} routes.
+# CSRF: these are control-surface, not browser-exempt — an external scheduler is
+# a non-browser client (no Origin header) so it passes the guard already, the
+# same way it calls /api/swap; the dashboard is same-origin.
+class LockAcquireRequest(BaseModel):
+    """Request body for POST /api/swap/lock; fields are forwarded to swap_lock.acquire()."""
+    holder: str
+    ttl_seconds: int | None = None   # optional; None is forwarded as-is to acquire()
+    note: str = ""
+    token: str | None = None   # present only to extend an existing hold
+
+
+@app.post("/api/swap/lock")
+async def acquire_swap_lock(req: LockAcquireRequest) -> dict:
+    """Reserve the GPU swap path. Returns a secret token used to swap (header
+    X-Swap-Lock-Token) and to release. 409 if held by another holder; 422 if
+    swap_lock.acquire() raises ValueError."""
+    try:
+        lock = swap_lock.acquire(req.holder, req.ttl_seconds, req.note, token=req.token)
+    except ValueError as e:
+        raise HTTPException(422, str(e))
+    except LockHeld as e:
+        # The 409 body carries the state attached to the LockHeld exception.
+        raise HTTPException(status_code=409, detail={
+            "error": "swap lock is held by another holder",
+            "lock": e.state,
+        })
+    # Response is the status view plus the token of the lock just acquired.
+    return {**swap_lock.status(), "token": lock.token}
+
+
+@app.get("/api/swap/lock")
+async def get_swap_lock() -> dict:
+    """Public, token-free view of the reservation: held? who? until when?
+
+    Returns swap_lock.status() unchanged.
+    """
+    return swap_lock.status()
+
+
+@app.delete("/api/swap/lock")
+async def release_swap_lock(request: Request, force: bool = Query(False)) -> dict:
+    """Release the reservation. Needs the matching X-Swap-Lock-Token unless
+    ?force=true (the human override from the dashboard).
+
+    Returns the status view plus a "released" entry holding the result of
+    swap_lock.release(). Raises HTTPException 403 when release() raises
+    PermissionError.
+    """
+    # The header wins; the ?token= query parameter is only consulted when the
+    # header is missing or empty.
+    token = request.headers.get("x-swap-lock-token") or request.query_params.get("token")
+    try:
+        released = swap_lock.release(token, force=force)
+    except PermissionError as e:
+        raise HTTPException(403, str(e))
+    return {"released": released, **swap_lock.status()}
+
+
@app.get("/api/swap/{job_id}")
 async def get_swap(job_id: str) -> dict:
    job = swap_manager.get(job_id)
@@ -992,52 +1077,10 @@ async def stream_swap(job_id: str):
    return StreamingResponse(gen(), media_type="text/event-stream")


-# ---- Coordination layer: swap lock + schedule registry ----
-# Endpoints are control-surface, not browser-exempt: an external scheduler is a
-# non-browser client (no Origin header) so it passes the CSRF guard already, the
-# same way it calls /api/swap today; the dashboard is same-origin.
-
-class LockAcquireRequest(BaseModel):
-    holder: str
-    ttl_seconds: int | None = None
-    note: str = ""
-    token: str | None = None   # present only to extend an existing hold
-
-
-@app.post("/api/swap/lock")
-async def acquire_swap_lock(req: LockAcquireRequest) -> dict:
-    """Reserve the GPU swap path. Returns a secret token used to swap (header
-    X-Swap-Lock-Token) and to release. 409 if held by another holder."""
-    try:
-        lock = swap_lock.acquire(req.holder, req.ttl_seconds, req.note, token=req.token)
-    except ValueError as e:
-        raise HTTPException(422, str(e))
-    except LockHeld as e:
-        raise HTTPException(status_code=409, detail={
-            "error": "swap lock is held by another holder",
-            "lock": e.state,
-        })
-    return {**swap_lock.status(), "token": lock.token}
-
-
-@app.get("/api/swap/lock")
-async def get_swap_lock() -> dict:
-    """Public, token-free view of the reservation: held? who? until when?"""
-    return swap_lock.status()
-
-
-@app.delete("/api/swap/lock")
-async def release_swap_lock(request: Request, force: bool = Query(False)) -> dict:
-    """Release the reservation. Needs the matching X-Swap-Lock-Token unless
-    ?force=true (the human override from the dashboard)."""
-    token = request.headers.get("x-swap-lock-token") or request.query_params.get("token")
-    try:
-        released = swap_lock.release(token, force=force)
-    except PermissionError as e:
-        raise HTTPException(403, str(e))
-    return {"released": released, **swap_lock.status()}
-
-
+# ---- Coordination layer: read-only schedule registry ----
+# (The swap reservation lock lives above, next to the swap routes.) Same CSRF
+# posture: control-surface, not browser-exempt — external schedulers send no
+# Origin header so they pass the guard; the dashboard is same-origin.
 class ScheduleRequest(BaseModel):
    name: str
    id: str | None = None