v0.19.0:0 - harden cluster-control surface: ssh injection, qdrant path, csrf

Triaged from a full independent evaluation (EVALUATION.md). Addresses the three P0/P1 code findings; the proxy/data APIs that downstream apps consume are deliberately untouched.

- ssh command injection (P0): new shellsafe.py validates + shlex.quotes every user-supplied value crossing into an SSH command on the Sparks (model repo, vllm args/knobs, NIM image/container/volume/port/env, service names). Boundary validation on POST /api/models and POST /api/nim/install; quoting at every sink in models/download/nim/services. NGC key now quoted too.
- qdrant path injection (P1): /api/search validates the collection name against a metacharacter-free whitelist and URL-encodes the path segment.
- csrf (P1): csrf_guard middleware enforces same-origin on state-changing control endpoints; /v1/*, /scrub, /rehydrate, /api/search, /api/audio/* and /api/health-event are exempt so external consumers are unaffected.

Verified: injection survives only as a single quoted token, vLLM preflight shlex.split round-trip intact, CSRF behaviors covered via TestClient, both offline redaction suites still pass, tsc clean, s9pk rebuilt.
2026-06-12 16:36:33 -05:00
parent 98988057a2
commit 1c4e861783
10 changed files with 260 additions and 24 deletions
@@ -25,6 +25,7 @@ from .models import load_catalog
 from .nim import SUGGESTED_NIMS, CATALOG_URL, NimManager
 from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs
 from .services import docker_state, run_action, services_from_settings
+from .shellsafe import validate_container, validate_image, validate_repo
 from .speech_models import SpeechModelsManager
 from .ssh import ssh_run
 from .swap import SwapManager
@@ -46,6 +47,44 @@ speech_models = SpeechModelsManager(settings)
 app = FastAPI(title="spark-control", version="0.1.0")


+# ---- Same-origin (CSRF) guard on state-mutating control endpoints ----
+# The app ships no API auth by design (LAN/VPN-only, no public interface). That
+# makes the realistic remote threat a *browser-driven CSRF*: a malicious page open
+# in the operator's browser silently POSTing to the control endpoints (swap, NIM
+# install, service stop, disk delete, …) while they're on the trusted network.
+# Browsers attach an Origin (and Referer) header to every cross-site state-changing
+# request, so we reject mutating requests whose Origin/Referer hostname doesn't
+# match the host the dashboard was served from. Programmatic consumers (Recap Relay,
+# CRM, Open WebUI, …) hit the proxy/data surface below and send no browser Origin,
+# so they're unaffected; the exempt prefixes are the cross-origin-by-design API.
+# HTTP methods that csrf_guard lets through without any origin check.
+_CSRF_SAFE_METHODS = {"GET", "HEAD", "OPTIONS", "TRACE"}
+# Path prefixes excluded from the same-origin check. This must stay a tuple:
+# csrf_guard passes it directly to str.startswith(). Matching is a plain string
+# prefix test, so "/scrub" also exempts any other path that merely begins with
+# "/scrub". NOTE(review): confirm no control route shares one of these prefixes.
+_CSRF_EXEMPT_PREFIXES = (
+    "/v1/",               # OpenAI-compatible chat/audio/embeddings/rerank proxies
+    "/scrub", "/rehydrate",  # redaction gateway (used by downstream apps)
+    "/api/search",        # retrieval proxy
+    "/api/audio/",        # diarize-chunk / label-merge / transcribe-with-speakers
+    "/api/health-event",  # health reports posted by consumer apps
+)
+
+
+@app.middleware("http")
+async def csrf_guard(request, call_next):
+    """Reject cross-origin browser requests to state-mutating control endpoints.
+
+    Requests using a safe method, or whose path starts with one of the exempt
+    prefixes, are forwarded untouched. For everything else the hostname of the
+    ``Origin`` header (falling back to ``Referer``) is compared with the
+    hostname of the ``Host`` header. Only hostnames are compared; scheme and
+    port are ignored.
+
+    Args:
+        request: the incoming HTTP request.
+        call_next: awaitable callable that forwards the request downstream.
+
+    Returns:
+        A 403 ``JSONResponse`` when the request is identified as cross-origin,
+        otherwise the response produced by ``call_next``.
+    """
+    if request.method in _CSRF_SAFE_METHODS or request.url.path.startswith(_CSRF_EXEMPT_PREFIXES):
+        return await call_next(request)
+
+    origin = request.headers.get("origin") or request.headers.get("referer")
+    if not origin:
+        # No Origin/Referer at all (non-browser client): nothing to compare.
+        return await call_next(request)
+
+    from urllib.parse import urlparse
+
+    def _hostname(value):
+        # urlparse raises ValueError on malformed bracketed hosts; treat such a
+        # value as "no hostname" instead of letting it surface as a 500.
+        try:
+            return urlparse(value).hostname
+        except ValueError:
+            return None
+
+    # Browsers send the literal "null" for opaque origins (e.g. a sandboxed
+    # iframe). That is never the dashboard itself, so it counts as cross-origin
+    # rather than as an unparseable header that falls through.
+    cross_origin = origin.strip().lower() == "null"
+    if not cross_origin:
+        origin_host = _hostname(origin)
+        # Parse Host as a netloc so that ports, IPv6 brackets ("[::1]:8000") and
+        # letter case are normalised exactly like the Origin hostname.
+        req_host = _hostname("//" + (request.headers.get("host") or ""))
+        # Only block when a mismatch is positively identified; an unparseable
+        # Origin/Referer or Host falls through.
+        cross_origin = bool(origin_host and req_host and origin_host != req_host)
+
+    if cross_origin:
+        return JSONResponse(
+            status_code=403,
+            content={"detail": "cross-origin request to a control endpoint was blocked"},
+        )
+    return await call_next(request)
+
+
@app.on_event("startup")
 async def _start_deep_health() -> None:
    # Fire-and-forget; the loop catches its own exceptions.
@@ -155,6 +194,10 @@ class CustomModelBody(BaseModel):
 async def post_model(body: CustomModelBody) -> dict:
    if not body.key or not body.key.replace("-", "").replace("_", "").isalnum():
        raise HTTPException(400, "key must be alphanumeric/-/_ only")
+    try:
+        validate_repo(body.repo)
+    except ValueError as e:
+        raise HTTPException(400, str(e))
    if body.key in catalog.models and not catalog.models[body.key].custom:
        raise HTTPException(409, f"'{body.key}' is a bundled model — pick a different key")
    add_custom(body.model_dump())
@@ -435,6 +478,11 @@ class NimInstallBody(BaseModel):

@app.post("/api/nim/install")
 async def post_nim_install(body: NimInstallBody) -> dict:
+    try:
+        validate_image(body.image)
+        validate_container(body.container)
+    except ValueError as e:
+        raise HTTPException(400, str(e))
    target_host = settings.spark1_host if body.host == "spark1" else settings.spark2_host
    target_user = settings.spark1_user if body.host == "spark1" else settings.spark2_user
    try: