v0.13.0:4 - redaction gateway, embeddings proxy, expanded audio API

- Add redaction gateway (redaction_gateway.py, redaction/ scrub + tests)
- Add embeddings proxy and spark_embed service (Dockerfile + main.py)
- Expand audio_proxy with speaker-aware handling; deep_health/health/server updates
- Package: configureSparks action + sparkConfig model updates, manifest/main wiring
- Docs: AUDIO_API, EMBEDDINGS, REDACTION_GATEWAY; HANDOFF and runbook/known-issues refresh
2026-06-11 17:45:21 -05:00
parent 4a75274db3
commit 8d839e3714
37 changed files with 3763 additions and 197 deletions
@@ -17,8 +17,10 @@ from .deep_health import DeepHealth
 from .disk import delete_from_disk, probe_disk
 from .download import DownloadManager
 from .llm_proxy import build_router as build_llm_router
+from .embeddings_proxy import build_router as build_embeddings_router
+from .redaction_gateway import build_router as build_redaction_router, MapStore
 from .hardware import HardwareProbe
-from .health import check_magpie, check_parakeet, check_vllm
+from .health import check_kokoro, check_parakeet, check_vllm, check_embeddings, check_qdrant
 from .models import load_catalog
 from .nim import SUGGESTED_NIMS, CATALOG_URL, NimManager
 from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs
@@ -60,7 +62,7 @@ app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")

 # OpenAI-compatible audio proxy: /v1/audio/speech, /v1/audio/transcriptions, /v1/models.
 # Lets Open WebUI, Home Assistant, and any other OpenAI-shaped client talk to
-# Parakeet (STT) and Magpie (TTS) through a single spark-control URL.
+# Parakeet (STT) and Kokoro (TTS) through a single spark-control URL.
 # Passing deep_health lets the proxy fire an immediate wedge-detect + auto-restart
 # when Parakeet returns 500, instead of waiting up to 5 min for the periodic probe.
 app.include_router(build_audio_router(settings, deep_health=deep_health))
@@ -71,6 +73,20 @@ app.include_router(build_audio_router(settings, deep_health=deep_health))
 # as the audio proxy — clients only need one URL for everything.
 app.include_router(build_llm_router(settings))

+# OpenAI-compatible embeddings + rerank + hybrid search proxy:
+# /v1/embeddings -> spark-embed (bge-m3 dense), /v1/rerank -> spark-embed
+# (bge-reranker-v2-m3), /api/search -> orchestrated dense(+sparse) retrieval
+# from Qdrant with optional cross-encoder rerank. Same single-trusted-host
+# model as the LLM and audio proxies.
+app.include_router(build_embeddings_router(settings))
+
+# Redaction gateway: /scrub + /rehydrate. The privacy boundary between sovereign
+# LP data and the Claude API — de-identify context before it leaves the box,
+# re-identify Claude's response locally. The pseudonym map (the de-anon key) is
+# held server-side in a TTL-swept store on /data and never leaves this host.
+redaction_map_store = MapStore(settings.redaction_map_db, settings.redaction_map_ttl)
+app.include_router(build_redaction_router(settings, redaction_map_store))
+

@app.get("/", include_in_schema=False)
 async def index() -> FileResponse:
@@ -274,7 +290,7 @@ async def run_deep_health(service: str) -> dict:


 class HealthEventBody(BaseModel):
-    service: str                 # e.g. "parakeet", "magpie", "vllm"
+    service: str                 # e.g. "parakeet", "kokoro", "vllm"
    ok: bool                     # true on success, false on failure
    source: str | None = None    # what app reported (e.g. "open-webui")
    error: str | None = None     # optional detail
@@ -344,7 +360,7 @@ async def wake_spark(name: str) -> dict:

@app.get("/api/services")
 async def get_services() -> dict:
-    """Lifecycle state of always-on support services (Parakeet, Magpie, …).
+    """Lifecycle state of always-on support services (Parakeet, Kokoro, …).

    Each entry includes:
      - host/port/container/user (configured)
@@ -362,8 +378,15 @@ async def get_services() -> dict:
        docker = await docker_state(settings, svc)
        if name == "parakeet":
            http = await check_parakeet(settings)
+        elif name == "kokoro":
+            http = await check_kokoro(settings)
+        elif name == "embeddings":
+            http = await check_embeddings(settings)
+        elif name == "qdrant":
+            http = await check_qdrant(settings)
        else:
-            http = await check_magpie(settings)
+            # Custom services: TTS-kind ones reuse check_kokoro(settings); all others are not probed (ok=None) and only report a base_url derived from host/port.
+            http = await check_kokoro(settings) if svc.kind == "tts" else {"ok": None, "base_url": svc.host and f"http://{svc.host}:{svc.port}"}
        return name, {
            "host": svc.host,
            "user": svc.user,
@@ -372,7 +395,10 @@ async def get_services() -> dict:
            "kind": svc.kind,
            "base_url": http.get("base_url"),
            "http_ready": bool(http.get("ok")),
-            "model": (http.get("detail") or {}).get("model") if isinstance(http.get("detail"), dict) else None,
+            # Prefer the check fn's own top-level model key (embeddings reports
+            # it there); fall back to a model field inside detail for services
+            # whose /health embeds it (parakeet).
+            "model": http.get("model") or ((http.get("detail") or {}).get("model") if isinstance(http.get("detail"), dict) else None),
            "docker_state": docker.get("state"),
            "restart_count": docker.get("restart_count"),
            "started_at": docker.get("started_at"),
@@ -484,8 +510,8 @@ async def stream_nim_install(job_id: str):

@app.delete("/api/services/{name}")
 async def del_service(name: str) -> dict:
-    # Only allow deleting custom services (not the bundled parakeet/magpie keys)
-    if name in ("parakeet", "magpie"):
+    # Only allow deleting custom services (not the bundled built-in keys)
+    if name in ("parakeet", "kokoro", "embeddings", "qdrant"):
        raise HTTPException(400, "built-in service; cannot delete (use Configure Sparks to point at a different host)")
    delete_custom_service(name)
    return {"ok": True, "name": name}
@@ -551,12 +577,15 @@ async def post_speech_models_restart() -> dict:
@app.get("/api/endpoints")
 async def get_endpoints() -> dict:
    """Service-discovery summary. Stable shape; other apps on the LAN can poll this
-    to learn the OpenAI-compatible vLLM endpoint, the Parakeet STT endpoint, and the
-    Magpie TTS endpoint without needing to know the individual Spark IPs."""
-    vllm, parakeet, magpie = await asyncio.gather(
+    to learn the OpenAI-compatible vLLM endpoint, the Parakeet STT endpoint, the
+    Kokoro TTS endpoint, and the embeddings + Qdrant retrieval endpoints without
+    needing to know the individual Spark IPs."""
+    vllm, parakeet, kokoro, embeddings, qdrant = await asyncio.gather(
        check_vllm(settings),
        check_parakeet(settings),
-        check_magpie(settings),
+        check_kokoro(settings),
+        check_embeddings(settings),
+        check_qdrant(settings),
    )
    return {
        "vllm": {
@@ -571,31 +600,51 @@ async def get_endpoints() -> dict:
            "kind": "stt",
            "model": (parakeet.get("detail") or {}).get("model") if isinstance(parakeet.get("detail"), dict) else None,
        },
-        "magpie": {
-            "ready": bool(magpie.get("ok")),
-            "base_url": magpie.get("base_url"),
+        "kokoro": {
+            "ready": bool(kokoro.get("ok")),
+            "base_url": kokoro.get("base_url"),
            "kind": "tts",
        },
+        "embeddings": {
+            "ready": bool(embeddings.get("ok")),
+            "base_url": embeddings.get("base_url"),
+            "kind": "embedding",
+            "model": embeddings.get("model"),
+            # The proxied OpenAI-compatible endpoints live on Spark Control itself.
+            "openai_endpoints": ["/v1/embeddings", "/v1/rerank", "/api/search"],
+        },
+        "qdrant": {
+            "ready": bool(qdrant.get("ok")),
+            "base_url": qdrant.get("base_url"),
+            "kind": "vectordb",
+            "collection": settings.qdrant_collection or None,
+        },
    }


@app.get("/api/status")
 async def get_status() -> dict:
-    vllm, parakeet, magpie = await asyncio.gather(
+    vllm, parakeet, kokoro, embeddings, qdrant = await asyncio.gather(
        check_vllm(settings),
        check_parakeet(settings),
-        check_magpie(settings),
+        check_kokoro(settings),
+        check_embeddings(settings),
+        check_qdrant(settings),
    )
    # Feed health into the connectivity log (deduped — only logs on transition)
    record_state("vllm", bool(vllm.get("ok")))
    record_state("parakeet", bool(parakeet.get("ok")))
-    record_state("magpie", bool(magpie.get("ok")))
+    record_state("kokoro", bool(kokoro.get("ok")))
+    record_state("embeddings", bool(embeddings.get("ok")))
+    record_state("qdrant", bool(qdrant.get("ok")))
    current_key = _identify_current_model(vllm.get("current_model"))
    return {
        "configured": settings.configured,
        "vllm": vllm,
        "parakeet": parakeet,
-        "magpie": magpie,
+        "kokoro": kokoro,
+        "embeddings": embeddings,
+        "qdrant": qdrant,
        "current_model_key": current_key,
        "current_swap_job": swap_manager.current_job_id,
    }