v0.19.0:0 - harden cluster-control surface: ssh injection, qdrant path, csrf
Triaged from a full independent evaluation (EVALUATION.md). Addresses the three P0/P1 code findings; the proxy/data APIs that downstream apps consume are deliberately untouched.

- ssh command injection (P0): new shellsafe.py validates and shlex.quote()s every user-supplied value crossing into an SSH command on the Sparks (model repo, vllm args/knobs, NIM image/container/volume/port/env, service names). Boundary validation on POST /api/models and POST /api/nim/install; quoting at every sink in models/download/nim/services. The NGC key is now quoted too.
- qdrant path injection (P1): /api/search validates the collection name against a metacharacter-free whitelist and URL-encodes the path segment.
- csrf (P1): csrf_guard middleware enforces same-origin on state-changing control endpoints; /v1/*, /scrub, /rehydrate, /api/search, /api/audio/* and /api/health-event are exempt so external consumers are unaffected.

Verified: injection survives only as a single quoted token, vLLM preflight shlex.split round-trip intact, CSRF behaviors covered via TestClient, both offline redaction suites still pass, tsc clean, s9pk rebuilt.
This commit is contained in:
@@ -25,6 +25,7 @@ from .models import load_catalog
|
||||
from .nim import SUGGESTED_NIMS, CATALOG_URL, NimManager
|
||||
from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs
|
||||
from .services import docker_state, run_action, services_from_settings
|
||||
from .shellsafe import validate_container, validate_image, validate_repo
|
||||
from .speech_models import SpeechModelsManager
|
||||
from .ssh import ssh_run
|
||||
from .swap import SwapManager
|
||||
@@ -46,6 +47,44 @@ speech_models = SpeechModelsManager(settings)
|
||||
app = FastAPI(title="spark-control", version="0.1.0")
|
||||
|
||||
|
||||
# ---- Same-origin (CSRF) guard on state-mutating control endpoints ----
|
||||
# The app ships no API auth by design (LAN/VPN-only, no public interface). That
|
||||
# makes the realistic remote threat a *browser-driven CSRF*: a malicious page open
|
||||
# in the operator's browser silently POSTing to the control endpoints (swap, NIM
|
||||
# install, service stop, disk delete, …) while they're on the trusted network.
|
||||
# Browsers attach an Origin (and Referer) header to every cross-site state-changing
|
||||
# request, so we reject mutating requests whose Origin/Referer hostname doesn't
|
||||
# match the host the dashboard was served from. Programmatic consumers (Recap Relay,
|
||||
# CRM, Open WebUI, …) hit the proxy/data surface below and send no browser Origin,
|
||||
# so they're unaffected; the exempt prefixes are the cross-origin-by-design API.
|
||||
# Methods the same-origin guard lets through without inspecting Origin/Referer.
_CSRF_SAFE_METHODS = {
    "GET",
    "HEAD",
    "OPTIONS",
    "TRACE",
}

# Path prefixes the same-origin guard never checks. The value must stay a tuple:
# it is handed directly to ``str.startswith``.
_CSRF_EXEMPT_PREFIXES = (
    # OpenAI-compatible chat/audio/embeddings/rerank proxies
    "/v1/",
    # redaction gateway (used by downstream apps)
    "/scrub",
    "/rehydrate",
    # retrieval proxy
    "/api/search",
    # diarize-chunk / label-merge / transcribe-with-speakers
    "/api/audio/",
    # health reports posted by consumer apps
    "/api/health-event",
)
|
||||
|
||||
|
||||
def _csrf_hostname(url):
    """Return the lower-cased hostname of *url*, or None when it has none.

    Malformed input (e.g. an unbalanced IPv6 bracket) makes ``urlparse`` raise
    ``ValueError``; that is reported as None instead of propagating.
    """
    from urllib.parse import urlparse

    try:
        return urlparse(url).hostname
    except ValueError:
        return None


@app.middleware("http")
async def csrf_guard(request, call_next):
    """Reject cross-origin, state-changing requests to the control endpoints.

    A request is passed straight to ``call_next`` when any of these hold:

    * its method is in ``_CSRF_SAFE_METHODS``;
    * its path starts with one of ``_CSRF_EXEMPT_PREFIXES``;
    * it carries neither an ``Origin`` nor a ``Referer`` header (non-browser
      client).

    Otherwise the hostname of ``Origin`` (falling back to ``Referer``) is
    compared with the hostname of the ``Host`` header, and a 403 JSON response
    is returned on mismatch.

    NOTE(review): the exempt check is a plain prefix match, so ``/scrub`` also
    exempts any sibling path that merely begins with those characters.
    """
    if request.method in _CSRF_SAFE_METHODS or request.url.path.startswith(_CSRF_EXEMPT_PREFIXES):
        return await call_next(request)

    origin = request.headers.get("origin") or request.headers.get("referer")
    if not origin:
        # No browser-supplied provenance header: treat as a programmatic client.
        return await call_next(request)

    origin_host = _csrf_hostname(origin)

    # Parse Host the same way as Origin (as a scheme-less netloc) so that the
    # port, IPv6 brackets and letter case are normalised identically on both
    # sides. A missing or unparseable Host yields None and falls through.
    host_header = request.headers.get("host") or ""
    req_host = _csrf_hostname("//" + host_header) if host_header else None

    # A header that is present but carries no hostname (the opaque ``null``
    # origin sent by sandboxed frames, or a malformed value) cannot be shown to
    # be same-origin, so it is rejected rather than waved through.
    if not origin_host or (req_host and origin_host != req_host):
        return JSONResponse(
            status_code=403,
            content={"detail": "cross-origin request to a control endpoint was blocked"},
        )

    return await call_next(request)
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def _start_deep_health() -> None:
|
||||
# Fire-and-forget; the loop catches its own exceptions.
|
||||
@@ -155,6 +194,10 @@ class CustomModelBody(BaseModel):
|
||||
async def post_model(body: CustomModelBody) -> dict:
|
||||
if not body.key or not body.key.replace("-", "").replace("_", "").isalnum():
|
||||
raise HTTPException(400, "key must be alphanumeric/-/_ only")
|
||||
try:
|
||||
validate_repo(body.repo)
|
||||
except ValueError as e:
|
||||
raise HTTPException(400, str(e))
|
||||
if body.key in catalog.models and not catalog.models[body.key].custom:
|
||||
raise HTTPException(409, f"'{body.key}' is a bundled model — pick a different key")
|
||||
add_custom(body.model_dump())
|
||||
@@ -435,6 +478,11 @@ class NimInstallBody(BaseModel):
|
||||
|
||||
@app.post("/api/nim/install")
|
||||
async def post_nim_install(body: NimInstallBody) -> dict:
|
||||
try:
|
||||
validate_image(body.image)
|
||||
validate_container(body.container)
|
||||
except ValueError as e:
|
||||
raise HTTPException(400, str(e))
|
||||
target_host = settings.spark1_host if body.host == "spark1" else settings.spark2_host
|
||||
target_user = settings.spark1_user if body.host == "spark1" else settings.spark2_user
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user