v0.13.0:4 - redaction gateway, embeddings proxy, expanded audio API
- Add redaction gateway (redaction_gateway.py, redaction/ scrub + tests)
- Add embeddings proxy and spark_embed service (Dockerfile + main.py)
- Expand audio_proxy with speaker-aware handling; deep_health/health/server updates
- Package: configureSparks action + sparkConfig model updates, manifest/main wiring
- Docs: AUDIO_API, EMBEDDINGS, REDACTION_GATEWAY; HANDOFF and runbook/known-issues refresh
This commit is contained in:
+75
-13
@@ -8,7 +8,7 @@ real transcription returns 500 cudaErrorUnknown.
|
||||
|
||||
So this module sends *real* but tiny synthetic inference requests:
|
||||
- Parakeet: 1 second of digital silence (16 kHz mono PCM, in-memory WAV)
|
||||
- Magpie: short text-to-speech, response audio discarded
|
||||
- Kokoro: short text-to-speech, response audio discarded
|
||||
- vLLM: 1-token chat completion against whatever model is loaded
|
||||
|
||||
All synthetic payloads are generated on demand into BytesIO, sent over HTTP,
|
||||
@@ -98,7 +98,9 @@ class DeepHealth:
|
||||
self.interval_sec = interval_sec
|
||||
self.state: dict[str, ServiceState] = {
|
||||
"parakeet": ServiceState(),
|
||||
"magpie": ServiceState(),
|
||||
"kokoro": ServiceState(),
|
||||
"embeddings": ServiceState(),
|
||||
"qdrant": ServiceState(),
|
||||
"vllm": ServiceState(),
|
||||
}
|
||||
self._stop = asyncio.Event()
|
||||
@@ -133,30 +135,30 @@ class DeepHealth:
|
||||
except Exception as e:
|
||||
return ProbeResult(ok=False, at=now_iso, error=f"{type(e).__name__}: {e}")
|
||||
|
||||
async def probe_magpie(self) -> ProbeResult:
|
||||
async def probe_kokoro(self) -> ProbeResult:
|
||||
s = self.settings
|
||||
now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||
if not s.magpie_host:
|
||||
if not s.kokoro_host:
|
||||
return ProbeResult(ok=False, at=now_iso, error="not configured")
|
||||
# Magpie /v1/audio/synthesize expects multipart form-data, not JSON.
|
||||
# The (None, value) tuple in httpx's `files=` produces a non-file form field.
|
||||
url = f"http://{s.magpie_host}:{s.magpie_port}/v1/audio/synthesize"
|
||||
form: dict = {"text": (None, "hi"), "language": (None, "en-US")}
|
||||
# Kokoro is OpenAI-shape: POST /v1/audio/speech with JSON body. We don't
|
||||
# care about the audio body; just confirm the model produces a 200.
|
||||
url = f"http://{s.kokoro_host}:{s.kokoro_port}/v1/audio/speech"
|
||||
body = {"model": "kokoro", "input": "hi", "voice": "bm_george",
|
||||
"response_format": "wav"}
|
||||
t0 = time.monotonic()
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
|
||||
r = await c.post(url, files=form)
|
||||
r = await c.post(url, json=body)
|
||||
latency = round((time.monotonic() - t0) * 1000)
|
||||
if 200 <= r.status_code < 300:
|
||||
return ProbeResult(ok=True, at=now_iso, latency_ms=latency)
|
||||
# 4xx that aren't 5xx mean server is alive but our payload is off —
|
||||
# don't classify as wedge.
|
||||
# 4xx (bad voice, bad params) means server is alive — don't wedge-classify.
|
||||
if 400 <= r.status_code < 500:
|
||||
return ProbeResult(
|
||||
ok=True,
|
||||
at=now_iso,
|
||||
latency_ms=latency,
|
||||
note=f"{r.status_code} — server alive (probe payload may need a voice name)",
|
||||
note=f"{r.status_code} — server alive (probe payload may need adjustment)",
|
||||
)
|
||||
return ProbeResult(
|
||||
ok=False,
|
||||
@@ -167,6 +169,52 @@ class DeepHealth:
|
||||
except Exception as e:
|
||||
return ProbeResult(ok=False, at=now_iso, error=f"{type(e).__name__}: {e}")
|
||||
|
||||
async def probe_embeddings(self) -> ProbeResult:
    """Probe the embeddings service with a readiness check and a tiny embed call.

    Steps:
      1. GET ``/health``. A 200 response whose JSON object reports a
         ``status`` other than ``"ready"`` is treated as warming (ok=True).
      2. POST ``/embed`` with a short fixed input. 2xx is healthy, 503 is
         treated as model loading (ok=True); any other status is a failure.

    Returns:
        ProbeResult stamped with the UTC time at which the probe started.
        ``latency_ms`` covers the health check plus the embed request and is
        only set once the embed request has completed.
    """
    s = self.settings
    now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    if not s.embed_host:
        return ProbeResult(ok=False, at=now_iso, error="not configured")
    base = f"http://{s.embed_host}:{s.embed_port}"
    t0 = time.monotonic()
    try:
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as c:
            # First check readiness; the model takes a while to load on boot.
            h = await c.get(f"{base}/health")
            if h.status_code == 200:
                # Parse the body once. A non-JSON 200 body must not be
                # reported as "warming": fall through to the real embed
                # probe below, which is the authoritative signal.
                try:
                    health = h.json()
                except ValueError:
                    health = None
                if isinstance(health, dict) and health.get("status") != "ready":
                    # Still loading models — not a wedge, just warming.
                    return ProbeResult(ok=True, at=now_iso, note="loading models (warming)")
            r = await c.post(f"{base}/embed", json={"input": "health probe"})
            latency = round((time.monotonic() - t0) * 1000)
        if 200 <= r.status_code < 300:
            return ProbeResult(ok=True, at=now_iso, latency_ms=latency)
        if r.status_code == 503:
            # spark-embed says model loading — warming, not wedged.
            return ProbeResult(ok=True, at=now_iso, latency_ms=latency, note="model loading (503)")
        return ProbeResult(ok=False, at=now_iso, latency_ms=latency,
                           error=f"HTTP {r.status_code}: {r.text[:240]}")
    except Exception as e:
        # Connection refused during boot is warming, not a wedge — same
        # philosophy as the vllm idle case; don't trigger auto-restart.
        # NOTE(review): this also reports timeouts and every other failure
        # as ok=True, so this probe cannot flag a hung server — confirm that
        # is intended before relying on it for wedge detection.
        return ProbeResult(ok=True, at=now_iso, note=f"unreachable/warming: {type(e).__name__}")
|
||||
|
||||
async def probe_qdrant(self) -> ProbeResult:
    """Check Qdrant readiness by issuing a GET against its ``/readyz`` endpoint.

    Returns:
        ProbeResult with ok=True and the round-trip latency on any 2xx
        response; ok=False with an error string when the host is not
        configured, the response is non-2xx, or the request raises.
    """
    cfg = self.settings
    stamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # Guard: nothing to probe without a configured host.
    if not cfg.qdrant_host:
        return ProbeResult(ok=False, at=stamp, error="not configured")

    base = f"http://{cfg.qdrant_host}:{cfg.qdrant_port}"
    started = time.monotonic()
    try:
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT_SEC) as client:
            resp = await client.get(f"{base}/readyz")
            elapsed_ms = round((time.monotonic() - started) * 1000)
    except Exception as exc:
        return ProbeResult(ok=False, at=stamp, error=f"{type(exc).__name__}: {exc}")

    # Anything outside 2xx is reported as a failure with a truncated body.
    if not (200 <= resp.status_code < 300):
        return ProbeResult(
            ok=False,
            at=stamp,
            latency_ms=elapsed_ms,
            error=f"HTTP {resp.status_code}: {resp.text[:240]}",
        )
    return ProbeResult(ok=True, at=stamp, latency_ms=elapsed_ms)
|
||||
|
||||
async def probe_vllm(self) -> ProbeResult:
|
||||
s = self.settings
|
||||
now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||
@@ -233,7 +281,9 @@ class DeepHealth:
|
||||
|
||||
PROBES = {
|
||||
"parakeet": "probe_parakeet",
|
||||
"magpie": "probe_magpie",
|
||||
"kokoro": "probe_kokoro",
|
||||
"embeddings": "probe_embeddings",
|
||||
"qdrant": "probe_qdrant",
|
||||
"vllm": "probe_vllm",
|
||||
}
|
||||
|
||||
@@ -302,6 +352,18 @@ class DeepHealth:
|
||||
svc = services[service]
|
||||
if not svc.host or not svc.user:
|
||||
return
|
||||
# Only auto-restart GPU model servers (stt/tts/embedding). A vector DB
|
||||
# (qdrant, kind=vectordb) holds the only copy of the index — a restart
|
||||
# on a benign/transient probe error (e.g. a 404 on a not-yet-created
|
||||
# collection, or a 5xx during HNSW build) could corrupt or interrupt a
|
||||
# write. Never auto-restart it; surface the failure instead.
|
||||
from .services import RESTARTABLE_KINDS
|
||||
if svc.kind not in RESTARTABLE_KINDS:
|
||||
record_report(
|
||||
service, ok=False, source="deep-health",
|
||||
detail=f"probe failed but kind='{svc.kind}' is not auto-restartable; manual check needed",
|
||||
)
|
||||
return
|
||||
result = await run_action(self.settings, svc, "restart")
|
||||
st.restarts.append(now)
|
||||
ok = result.get("ok", False)
|
||||
|
||||
Reference in New Issue
Block a user