v0.27.0:0 - in-app settings gear + swap-lock route fix
Move the ~20 optional cluster knobs out of the StartOS "Configure Sparks"
action (now just the 4 required fields) and into a dashboard ⚙ Settings gear,
backed by a /data/app_settings.json overlay keyed by env-var names. One shared
mutable Settings instance + Settings.reload() applies edits live without a
restart; existing installs' values migrate automatically on first boot.
Also: support-service ports (parakeet/kokoro/embed/qdrant + vllm) are now
configurable, and GET /api/swap/lock no longer 404s (it was shadowed by the
/api/swap/{job_id} catch-all). WebhookNotifier is re-pointed on save so its
url/secret reload live too.
This commit is contained in:
+89
-46
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI, HTTPException, Query, Request
|
||||
@@ -9,6 +10,7 @@ from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel, ValidationError
|
||||
from typing import Literal
|
||||
|
||||
from . import app_settings
|
||||
from .config import Settings
|
||||
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
||||
from .coordination import LockHeld, ScheduleRegistry, SwapLockManager, WebhookNotifier, valid_schedule_id
|
||||
@@ -37,6 +39,10 @@ from .validate import validate_launch
|
||||
from .wol import send_local_broadcast, send_via_peer
|
||||
|
||||
|
||||
# One-time migration: seed the in-app settings overlay from env (values set via
|
||||
# the StartOS action on a pre-gear install) before building Settings, so nothing
|
||||
# is lost on upgrade. No-op once the overlay exists. See app_settings.
|
||||
app_settings.seed_from_env(os.environ)
|
||||
settings = Settings.from_env()
|
||||
catalog = load_catalog(settings.models_yaml)
|
||||
# Coordination layer (GPU arbiter): swap-lifecycle webhook, the swap reservation
|
||||
@@ -156,6 +162,35 @@ async def get_config() -> dict:
|
||||
}
|
||||
|
||||
|
||||
# ---- In-app settings ('gear') ----
|
||||
# The optional cluster knobs (ports, container names, support-service hosts,
|
||||
# integrations) live in an app-owned overlay on /data, edited here instead of in
|
||||
# the StartOS action — which keeps to just the four required setup fields. See
|
||||
# app_settings. Writes apply live: we rewrite the overlay then reload the shared
|
||||
# Settings instance in place, so every router/manager holding the reference picks
|
||||
# up the change with no container restart.
|
||||
@app.get("/api/settings")
async def get_settings() -> dict:
    """Return the in-app settings view produced by ``app_settings.public_view()``."""
    view = app_settings.public_view()
    return view
|
||||
|
||||
|
||||
class SettingsUpdate(BaseModel):
    """Request body for ``POST /api/settings``."""

    # String-to-string map of submitted setting values; handed unchanged to
    # app_settings.apply(). Presumably keyed by env-var name, matching the
    # overlay file — confirm against app_settings.
    values: dict[str, str]
|
||||
|
||||
|
||||
@app.post("/api/settings")
async def post_settings(req: SettingsUpdate) -> dict:
    """Apply a settings edit and make it take effect without a restart.

    Passes the submitted values to ``app_settings.apply``, reloads the shared
    ``settings`` instance in place, then re-points the swap webhook notifier
    at the reloaded url/secret.

    Returns:
        The refreshed ``app_settings.public_view()``.

    Raises:
        HTTPException: 422 when ``app_settings.apply`` raises
            ``app_settings.SettingsError``; the error text becomes the detail.
    """
    try:
        app_settings.apply(req.values)
    except app_settings.SettingsError as e:
        # Chain the cause so the original SettingsError stays visible in
        # tracebacks and logs instead of being reported as an implicit context.
        raise HTTPException(422, str(e)) from e
    settings.reload()
    # WebhookNotifier snapshots url/secret (not the Settings object), so reload()
    # can't reach it — re-point it explicitly so a webhook edit applies live too.
    swap_webhook.update(settings.swap_webhook_url, settings.swap_webhook_secret)
    return app_settings.public_view()
|
||||
|
||||
|
||||
def _reload_catalog() -> None:
|
||||
global catalog
|
||||
catalog = load_catalog(settings.models_yaml)
|
||||
@@ -947,6 +982,56 @@ async def post_swap(req: SwapRequest, request: Request) -> dict:
|
||||
return {"job_id": job.id, "model_key": job.model_key, "state": job.state}
|
||||
|
||||
|
||||
# ---- Swap reservation lock (the GPU arbiter) ----
|
||||
# ROUTE ORDER IS LOAD-BEARING: these static `/api/swap/lock` routes MUST be
|
||||
# registered before the parametric `/api/swap/{job_id}` below. FastAPI matches in
|
||||
# registration order, so if `{job_id}` came first, GET /api/swap/lock would bind
|
||||
# job_id="lock", look up a (non-existent) swap job, and 404 — which is exactly
|
||||
# the bug this ordering fixes. Keep these above the {job_id} routes.
|
||||
# CSRF: these are control-surface, not browser-exempt — an external scheduler is
|
||||
# a non-browser client (no Origin header) so it passes the guard already, the
|
||||
# same way it calls /api/swap; the dashboard is same-origin.
|
||||
class LockAcquireRequest(BaseModel):
    """Request body for ``POST /api/swap/lock``.

    The fields are forwarded to ``swap_lock.acquire`` by the route handler.
    """

    # Name of the party requesting the reservation.
    holder: str
    # Requested hold duration in seconds; None is passed through as-is
    # (presumably the lock manager then applies its own default — confirm).
    ttl_seconds: int | None = None
    # Free-form note stored with the reservation; empty by default.
    note: str = ""
    # Present only to extend an existing hold.
    token: str | None = None
|
||||
|
||||
|
||||
@app.post("/api/swap/lock")
async def acquire_swap_lock(req: LockAcquireRequest) -> dict:
    """Reserve the GPU swap path.

    Returns the public lock status plus a secret ``token`` used to swap
    (header X-Swap-Lock-Token) and to release.

    Raises:
        HTTPException: 422 when ``swap_lock.acquire`` raises ``ValueError``;
            409 when it raises ``LockHeld`` (held by another holder), with the
            current lock state included in the detail.
    """
    try:
        lock = swap_lock.acquire(req.holder, req.ttl_seconds, req.note, token=req.token)
    except ValueError as e:
        # Chain the cause so the underlying error stays visible in tracebacks.
        raise HTTPException(422, str(e)) from e
    except LockHeld as e:
        raise HTTPException(status_code=409, detail={
            "error": "swap lock is held by another holder",
            "lock": e.state,
        }) from e
    return {**swap_lock.status(), "token": lock.token}
|
||||
|
||||
|
||||
@app.get("/api/swap/lock")
async def get_swap_lock() -> dict:
    """Report the reservation without its token: held? by whom? until when?"""
    state = swap_lock.status()
    return state
|
||||
|
||||
|
||||
@app.delete("/api/swap/lock")
async def release_swap_lock(request: Request, force: bool = Query(False)) -> dict:
    """Release the reservation.

    Needs the matching X-Swap-Lock-Token unless ?force=true (the human
    override from the dashboard). The token is read from the
    ``x-swap-lock-token`` header first, falling back to the ``token`` query
    parameter.

    Returns:
        ``{"released": <result of swap_lock.release>}`` merged with the
        current ``swap_lock.status()``.

    Raises:
        HTTPException: 403 when ``swap_lock.release`` raises
            ``PermissionError``.
    """
    token = request.headers.get("x-swap-lock-token") or request.query_params.get("token")
    try:
        released = swap_lock.release(token, force=force)
    except PermissionError as e:
        # Chain the cause so the underlying error stays visible in tracebacks.
        raise HTTPException(403, str(e)) from e
    return {"released": released, **swap_lock.status()}
|
||||
|
||||
|
||||
@app.get("/api/swap/{job_id}")
|
||||
async def get_swap(job_id: str) -> dict:
|
||||
job = swap_manager.get(job_id)
|
||||
@@ -992,52 +1077,10 @@ async def stream_swap(job_id: str):
|
||||
return StreamingResponse(gen(), media_type="text/event-stream")
|
||||
|
||||
|
||||
# ---- Coordination layer: swap lock + schedule registry ----
|
||||
# Endpoints are control-surface, not browser-exempt: an external scheduler is a
|
||||
# non-browser client (no Origin header) so it passes the CSRF guard already, the
|
||||
# same way it calls /api/swap today; the dashboard is same-origin.
|
||||
|
||||
class LockAcquireRequest(BaseModel):
    """Request body for ``POST /api/swap/lock``.

    The fields are forwarded to ``swap_lock.acquire`` by the route handler.
    """

    # Name of the party requesting the reservation.
    holder: str
    # Requested hold duration in seconds; None is passed through as-is
    # (presumably the lock manager then applies its own default — confirm).
    ttl_seconds: int | None = None
    # Free-form note stored with the reservation; empty by default.
    note: str = ""
    # Present only to extend an existing hold.
    token: str | None = None
|
||||
|
||||
|
||||
@app.post("/api/swap/lock")
async def acquire_swap_lock(req: LockAcquireRequest) -> dict:
    """Reserve the GPU swap path.

    Returns the public lock status plus a secret ``token`` used to swap
    (header X-Swap-Lock-Token) and to release.

    Raises:
        HTTPException: 422 when ``swap_lock.acquire`` raises ``ValueError``;
            409 when it raises ``LockHeld`` (held by another holder), with the
            current lock state included in the detail.
    """
    try:
        lock = swap_lock.acquire(req.holder, req.ttl_seconds, req.note, token=req.token)
    except ValueError as e:
        # Chain the cause so the underlying error stays visible in tracebacks.
        raise HTTPException(422, str(e)) from e
    except LockHeld as e:
        raise HTTPException(status_code=409, detail={
            "error": "swap lock is held by another holder",
            "lock": e.state,
        }) from e
    return {**swap_lock.status(), "token": lock.token}
|
||||
|
||||
|
||||
@app.get("/api/swap/lock")
async def get_swap_lock() -> dict:
    """Report the reservation without its token: held? by whom? until when?"""
    state = swap_lock.status()
    return state
|
||||
|
||||
|
||||
@app.delete("/api/swap/lock")
async def release_swap_lock(request: Request, force: bool = Query(False)) -> dict:
    """Release the reservation.

    Needs the matching X-Swap-Lock-Token unless ?force=true (the human
    override from the dashboard). The token is read from the
    ``x-swap-lock-token`` header first, falling back to the ``token`` query
    parameter.

    Returns:
        ``{"released": <result of swap_lock.release>}`` merged with the
        current ``swap_lock.status()``.

    Raises:
        HTTPException: 403 when ``swap_lock.release`` raises
            ``PermissionError``.
    """
    token = request.headers.get("x-swap-lock-token") or request.query_params.get("token")
    try:
        released = swap_lock.release(token, force=force)
    except PermissionError as e:
        # Chain the cause so the underlying error stays visible in tracebacks.
        raise HTTPException(403, str(e)) from e
    return {"released": released, **swap_lock.status()}
|
||||
|
||||
|
||||
# ---- Coordination layer: read-only schedule registry ----
|
||||
# (The swap reservation lock lives above, next to the swap routes.) Same CSRF
|
||||
# posture: control-surface, not browser-exempt — external schedulers send no
|
||||
# Origin header so they pass the guard; the dashboard is same-origin.
|
||||
class ScheduleRequest(BaseModel):
|
||||
name: str
|
||||
id: str | None = None
|
||||
|
||||
Reference in New Issue
Block a user