Add regression tests for v74 fixes; close soft-delete leak in list-view aggregates

Lock in the three v0.1.0:74 security/privacy fixes with regression tests, and fix a same-class soft-delete leak surfaced while writing them.

- backend/test_assets_traversal.py: boots the real server, proves /assets/ path-traversal vectors (incl. a real decoy file and the live crm.db, plain and URL-encoded) 404 and leak nothing, while a legit asset still serves 200.
- backend/test_soft_delete_reads.py: get-by-id 404s soft-deleted rows and nested + list-view aggregates exclude soft-deleted children.
- backend/mcp/test_outreach_redaction.py: an unknown free-prose name is tokenized away from the Claude payload but re-hydrated locally, and the path fails closed (no Claude call) when the local NER model is down.
- backend/run_tests.py: aggregate runner (each backend/**/test_*.py in its own subprocess); replaces the manual for-loop.

16/16 green.

A reviewer pass on the tests confirmed the soft-delete filter was missing from list-view aggregate sub-selects: org contact_count/total_funded and contacts comm_count/last_contact_date counted soft-deleted rows. Add `deleted_at IS NULL` to those four (server.py) and regression-cover them. The reports subsystem (dashboard/pipeline/LP-breakdown, ~16 aggregate queries) has the same leak and is logged as P2 for a dedicated pass.

Not yet built or deployed — bump the package version before the next s9pk build.
2026-06-13 00:26:22 -05:00
parent a74a540295
commit 7285bb0e52
6 changed files with 488 additions and 11 deletions
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""Regression test for the /assets/ path-traversal containment fix (v0.1.0:74).
+
+Before the fix, get_path()/urlparse did NOT normalize '..', so an unauthenticated
+GET /assets/../../data/crm.db (raw client, no client-side normalization) escaped the
+frontend root and read any file the process could — the LP DB, the JWT secret, the
+Gmail key. The fix resolves the target with os.path.realpath and 404s anything that
+does not stay under FRONTEND_ROOT (server.py, the `/assets/` branch of do_GET).
+
+This boots the REAL server in-process against a throwaway frontend root, plants a
+decoy "secret" OUTSIDE that root, and proves: (1) traversal vectors that resolve to a
+real readable file outside the root still 404 and leak no bytes; (2) the live crm.db
+path is 404'd; (3) URL-encoded separators don't help; (4) a legit in-bounds asset
+still serves 200 (the fix isn't over-broad). Synthetic only (guardrail #9).
+
+Run: cd backend && python3 test_assets_traversal.py
+"""
+import atexit
+import http.client
+import os
+import shutil
+import sys
+import tempfile
+import threading
+from http.server import ThreadingHTTPServer
+
+# Lay out a throwaway tree BEFORE importing server (FRONTEND_DIR/ROOT resolve at import):
+#   base/frontend/{index.html,assets/app.css}   <- the served root
+#   base/secret.txt                             <- a real file a traversal would target
+#   base/data/crm.db                            <- the live DB, created by init_db()
+_BASE = tempfile.mkdtemp()
+# mkdtemp() never cleans up after itself: remove the whole tree (decoy secret and
+# throwaway DB included) at interpreter exit so repeated runs do not pile up temp
+# directories. ignore_errors keeps the cleanup best-effort.
+atexit.register(shutil.rmtree, _BASE, ignore_errors=True)
+_FRONTEND = os.path.join(_BASE, "frontend")
+os.makedirs(os.path.join(_FRONTEND, "assets"))
+_DATA = os.path.join(_BASE, "data")
+os.makedirs(_DATA)
+# Explicit encoding so the fixture bytes do not depend on the platform locale.
+with open(os.path.join(_FRONTEND, "index.html"), "w", encoding="utf-8") as f:
+    f.write("<!doctype html><title>crm</title>")
+# Unique marker strings let the checks tell "served the right file" apart from
+# "leaked the wrong one" by a plain substring test on the response body.
+_CSS_MARKER = "/* legit-asset-marker-7f3a */"
+with open(os.path.join(_FRONTEND, "assets", "app.css"), "w", encoding="utf-8") as f:
+    f.write(_CSS_MARKER)
+_SECRET_MARKER = "TOPSECRET-JWT-zq19"
+with open(os.path.join(_BASE, "secret.txt"), "w", encoding="utf-8") as f:
+    f.write(_SECRET_MARKER)
+
+# Point the server at the throwaway tree. These must be set before the import
+# below (presumably server reads them at import time — confirm in server.py).
+os.environ["CRM_FRONTEND_DIR"] = _FRONTEND
+os.environ["CRM_DATA_DIR"] = _DATA
+os.environ["CRM_DB_PATH"] = os.path.join(_DATA, "crm.db")
+
+# Make the sibling server module importable regardless of the working directory.
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import server  # noqa: E402
+
+# Messages of every failed check: appended by check(), reported by main().
+FAILS = []
+
+
+def check(cond, msg):
+    """Print a PASS/FAIL line for *msg* and record the message when *cond* is falsy.
+
+    Failed messages are appended to the module-level FAILS list; nothing is
+    raised, so the remaining checks still run.
+    """
+    if cond:
+        print("  PASS " + msg)
+        return
+    print("  FAIL " + msg)
+    FAILS.append(msg)
+
+
+class _Quiet(server.CRMHandler):
+    """server.CRMHandler variant whose log_message() is a no-op, keeping test output clean."""
+
+    def log_message(self, *_args):
+        """Accept and discard whatever log_message() is called with."""
+        return None
+
+
+def _get(port, path):
+    """Send a raw GET for *path* to 127.0.0.1:*port* and return (status, body text).
+
+    The path is sent verbatim — http.client does NOT normalize '..', which is
+    exactly the unauthenticated raw-client threat the fix defends against.
+    The body is decoded as UTF-8 with undecodable bytes replaced, so a binary
+    response can still be searched for marker strings.
+    """
+    conn = http.client.HTTPConnection("127.0.0.1", port, timeout=10)
+    try:
+        conn.request("GET", path)
+        resp = conn.getresponse()
+        return resp.status, resp.read().decode("utf-8", "replace")
+    finally:
+        # Close even when the request, response or read raises (e.g. a timeout),
+        # so one failing vector cannot leak a socket for the rest of the run.
+        conn.close()
+
+
+def main():
+    """Boot the real server on an ephemeral port and run every containment check.
+
+    Calls server.init_db(), serves _Quiet from a daemon thread bound to
+    127.0.0.1, then checks that the in-bounds asset returns 200 and that every
+    traversal vector returns 404 without its target's bytes in the body.
+    Each check prints a PASS/FAIL line; if any failed, the failures are listed
+    and the process exits with status 1.
+    """
+    server.init_db()  # creates base/data/crm.db and the full schema
+    check(os.path.exists(os.environ["CRM_DB_PATH"]), "init_db created the live crm.db (a real traversal target)")
+
+    # Port 0 asks the OS for any free port; read the real one back from the socket.
+    httpd = ThreadingHTTPServer(("127.0.0.1", 0), _Quiet)
+    port = httpd.server_address[1]
+    threading.Thread(target=httpd.serve_forever, daemon=True).start()
+    try:
+        # ── legit in-bounds asset still serves (containment is not over-broad) ──
+        print("\n[legit asset]")
+        st, body = _get(port, "/assets/app.css")
+        check(st == 200, f"in-bounds /assets/app.css serves 200 (got {st})")
+        check(_CSS_MARKER in body, "in-bounds asset body is served intact")
+
+        # ── traversal to a REAL file outside the root: 404, zero bytes leaked ──
+        print("\n[traversal -> decoy secret outside the root]")
+        for vec in ["/assets/../../secret.txt",
+                    "/assets/../../../secret.txt",
+                    "/assets/..%2f..%2fsecret.txt",         # urlparse won't decode %2f
+                    "/assets/..%2F..%2Fsecret.txt"]:        # …nor uppercase %2F (some clients send it)
+            st, body = _get(port, vec)
+            check(st == 404, f"{vec} -> 404 (got {st})")
+            check(_SECRET_MARKER not in body, f"{vec} leaks no secret bytes")
+
+        # ── traversal to the live crm.db (the headline vector from the eval) ──
+        print("\n[traversal -> live crm.db]")
+        for vec in ["/assets/../../data/crm.db",
+                    "/assets/../data/crm.db",
+                    "/assets/..%2f..%2fdata%2fcrm.db"]:
+            st, body = _get(port, vec)
+            check(st == 404, f"{vec} -> 404 (got {st})")
+            check("SQLite format 3" not in body, f"{vec} leaks no DB header")
+
+        # ── deep absolute-style escape ──
+        print("\n[deep escape]")
+        vec = "/assets/../../../../../../../../etc/passwd"
+        st, body = _get(port, vec)
+        # Report the vector that was actually sent, not an abbreviated one.
+        check(st == 404, f"{vec} -> 404 (got {st})")
+        check("root:" not in body, "/etc/passwd not leaked")
+    finally:
+        httpd.shutdown()
+        # shutdown() only stops the serve_forever() loop; server_close() is what
+        # releases the listening socket.
+        httpd.server_close()
+
+    print()
+    if FAILS:
+        print(f"FAILED ({len(FAILS)}):")
+        for failure in FAILS:
+            print(f"  - {failure}")
+        sys.exit(1)
+    print("ALL PASS (assets path-traversal containment)")
+
+# Run the checks only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()