v0.13.0:4 - redaction gateway, embeddings proxy, expanded audio API

- Add redaction gateway (redaction_gateway.py, redaction/ scrub + tests)
- Add embeddings proxy and spark_embed service (Dockerfile + main.py)
- Expand audio_proxy with speaker-aware handling; deep_health/health/server updates
- Package: configureSparks action + sparkConfig model updates, manifest/main wiring
- Docs: AUDIO_API, EMBEDDINGS, REDACTION_GATEWAY; HANDOFF and runbook/known-issues refresh
This commit is contained in:
Keysat
2026-06-11 17:45:21 -05:00
parent 4a75274db3
commit 8d839e3714
37 changed files with 3763 additions and 197 deletions
+68 -19
View File
@@ -17,8 +17,10 @@ from .deep_health import DeepHealth
from .disk import delete_from_disk, probe_disk
from .download import DownloadManager
from .llm_proxy import build_router as build_llm_router
from .embeddings_proxy import build_router as build_embeddings_router
from .redaction_gateway import build_router as build_redaction_router, MapStore
from .hardware import HardwareProbe
from .health import check_magpie, check_parakeet, check_vllm
from .health import check_kokoro, check_parakeet, check_vllm, check_embeddings, check_qdrant
from .models import load_catalog
from .nim import SUGGESTED_NIMS, CATALOG_URL, NimManager
from .overrides import add_custom, delete_custom, extract_knobs_from_args, load_overrides, set_knobs
@@ -60,7 +62,7 @@ app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")
# OpenAI-compatible audio proxy: /v1/audio/speech, /v1/audio/transcriptions, /v1/models.
# Lets Open WebUI, Home Assistant, and any other OpenAI-shaped client talk to
# Parakeet (STT) and Magpie (TTS) through a single spark-control URL.
# Parakeet (STT) and Kokoro (TTS) through a single spark-control URL.
# Passing deep_health lets the proxy fire an immediate wedge-detect + auto-restart
# when Parakeet returns 500, instead of waiting up to 5 min for the periodic probe.
app.include_router(build_audio_router(settings, deep_health=deep_health))
@@ -71,6 +73,20 @@ app.include_router(build_audio_router(settings, deep_health=deep_health))
# as the audio proxy — clients only need one URL for everything.
app.include_router(build_llm_router(settings))
# OpenAI-compatible embeddings + rerank + hybrid search proxy:
# /v1/embeddings -> spark-embed (bge-m3 dense), /v1/rerank -> spark-embed
# (bge-reranker-v2-m3), /api/search -> orchestrated dense(+sparse) retrieval
# from Qdrant with optional cross-encoder rerank. Same single-trusted-host
# model as the LLM and audio proxies.
app.include_router(build_embeddings_router(settings))
# Redaction gateway: /scrub + /rehydrate. The privacy boundary between sovereign
# LP data and the Claude API — de-identify context before it leaves the box,
# re-identify Claude's response locally. The pseudonym map (the de-anon key) is
# held server-side in a TTL-swept store on /data and never leaves this host.
redaction_map_store = MapStore(settings.redaction_map_db, settings.redaction_map_ttl)
app.include_router(build_redaction_router(settings, redaction_map_store))
@app.get("/", include_in_schema=False)
async def index() -> FileResponse:
@@ -274,7 +290,7 @@ async def run_deep_health(service: str) -> dict:
class HealthEventBody(BaseModel):
service: str # e.g. "parakeet", "magpie", "vllm"
service: str # e.g. "parakeet", "kokoro", "vllm"
ok: bool # true on success, false on failure
source: str | None = None # what app reported (e.g. "open-webui")
error: str | None = None # optional detail
@@ -344,7 +360,7 @@ async def wake_spark(name: str) -> dict:
@app.get("/api/services")
async def get_services() -> dict:
"""Lifecycle state of always-on support services (Parakeet, Magpie, …).
"""Lifecycle state of always-on support services (Parakeet, Kokoro, …).
Each entry includes:
- host/port/container/user (configured)
@@ -362,8 +378,15 @@ async def get_services() -> dict:
docker = await docker_state(settings, svc)
if name == "parakeet":
http = await check_parakeet(settings)
elif name == "kokoro":
http = await check_kokoro(settings)
elif name == "embeddings":
http = await check_embeddings(settings)
elif name == "qdrant":
http = await check_qdrant(settings)
else:
http = await check_magpie(settings)
# Custom services expose a /health endpoint by convention.
http = await check_kokoro(settings) if svc.kind == "tts" else {"ok": None, "base_url": svc.host and f"http://{svc.host}:{svc.port}"}
return name, {
"host": svc.host,
"user": svc.user,
@@ -372,7 +395,10 @@ async def get_services() -> dict:
"kind": svc.kind,
"base_url": http.get("base_url"),
"http_ready": bool(http.get("ok")),
"model": (http.get("detail") or {}).get("model") if isinstance(http.get("detail"), dict) else None,
# Prefer the check fn's own top-level model key (embeddings reports
# it there); fall back to a model field inside detail for services
# whose /health embeds it (parakeet).
"model": http.get("model") or ((http.get("detail") or {}).get("model") if isinstance(http.get("detail"), dict) else None),
"docker_state": docker.get("state"),
"restart_count": docker.get("restart_count"),
"started_at": docker.get("started_at"),
@@ -484,8 +510,8 @@ async def stream_nim_install(job_id: str):
@app.delete("/api/services/{name}")
async def del_service(name: str) -> dict:
# Only allow deleting custom services (not the bundled parakeet/magpie keys)
if name in ("parakeet", "magpie"):
# Only allow deleting custom services (not the bundled built-in keys)
if name in ("parakeet", "kokoro", "embeddings", "qdrant"):
raise HTTPException(400, "built-in service; cannot delete (use Configure Sparks to point at a different host)")
delete_custom_service(name)
return {"ok": True, "name": name}
@@ -551,12 +577,15 @@ async def post_speech_models_restart() -> dict:
@app.get("/api/endpoints")
async def get_endpoints() -> dict:
"""Service-discovery summary. Stable shape; other apps on the LAN can poll this
to learn the OpenAI-compatible vLLM endpoint, the Parakeet STT endpoint, and the
Magpie TTS endpoint without needing to know the individual Spark IPs."""
vllm, parakeet, magpie = await asyncio.gather(
to learn the OpenAI-compatible vLLM endpoint, the Parakeet STT endpoint, the
Kokoro TTS endpoint, and the embeddings + Qdrant retrieval endpoints without
needing to know the individual Spark IPs."""
vllm, parakeet, kokoro, embeddings, qdrant = await asyncio.gather(
check_vllm(settings),
check_parakeet(settings),
check_magpie(settings),
check_kokoro(settings),
check_embeddings(settings),
check_qdrant(settings),
)
return {
"vllm": {
@@ -571,31 +600,51 @@ async def get_endpoints() -> dict:
"kind": "stt",
"model": (parakeet.get("detail") or {}).get("model") if isinstance(parakeet.get("detail"), dict) else None,
},
"magpie": {
"ready": bool(magpie.get("ok")),
"base_url": magpie.get("base_url"),
"kokoro": {
"ready": bool(kokoro.get("ok")),
"base_url": kokoro.get("base_url"),
"kind": "tts",
},
"embeddings": {
"ready": bool(embeddings.get("ok")),
"base_url": embeddings.get("base_url"),
"kind": "embedding",
"model": embeddings.get("model"),
# The proxied OpenAI-compatible endpoints live on Spark Control itself.
"openai_endpoints": ["/v1/embeddings", "/v1/rerank", "/api/search"],
},
"qdrant": {
"ready": bool(qdrant.get("ok")),
"base_url": qdrant.get("base_url"),
"kind": "vectordb",
"collection": settings.qdrant_collection or None,
},
}
@app.get("/api/status")
async def get_status() -> dict:
vllm, parakeet, magpie = await asyncio.gather(
vllm, parakeet, kokoro, embeddings, qdrant = await asyncio.gather(
check_vllm(settings),
check_parakeet(settings),
check_magpie(settings),
check_kokoro(settings),
check_embeddings(settings),
check_qdrant(settings),
)
# Feed health into the connectivity log (deduped — only logs on transition)
record_state("vllm", bool(vllm.get("ok")))
record_state("parakeet", bool(parakeet.get("ok")))
record_state("magpie", bool(magpie.get("ok")))
record_state("kokoro", bool(kokoro.get("ok")))
record_state("embeddings", bool(embeddings.get("ok")))
record_state("qdrant", bool(qdrant.get("ok")))
current_key = _identify_current_model(vllm.get("current_model"))
return {
"configured": settings.configured,
"vllm": vllm,
"parakeet": parakeet,
"magpie": magpie,
"kokoro": kokoro,
"embeddings": embeddings,
"qdrant": qdrant,
"current_model_key": current_key,
"current_swap_job": swap_manager.current_job_id,
}