v0.9.0:2 - audio proxy: turn Parakeet wedge 500 into clean 503 + immediate auto-restart
Parakeet's recurring CUDA wedge (CUBLAS_STATUS_*_ERROR mid-attention)
fires reliably on Open WebUI's WebM/Opus->MP3 audio. Previously the
proxy relayed the upstream 500 verbatim, Open WebUI showed "Server
connection error" with no signal to retry, and recovery took up to
5 minutes (waiting for the next periodic deep-health probe).
Now the proxy:
1. Detects 500 from /v1/audio/transcriptions
2. Fires deep_health.run_one("parakeet") as a background asyncio task
(which contains the same wedge-detect + rate-limited auto-restart
logic, but runs immediately instead of waiting for the next tick)
3. Returns 503 with a clear detail message and Retry-After: 60
The client (Open WebUI, Home Assistant, etc.) gets a proper retry
signal; the auto-restart triggers within seconds; the next attempt
~60s later should succeed. Rate-limiting (3 restarts per 30 min) is
inherited from the deep-health module, so this can't cause restart
storms.
server.py: pass deep_health into build_audio_router().
audio_proxy.py: new 503-with-restart branch; signature now accepts
deep_health as an optional dependency.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -8,10 +8,16 @@ Endpoints exposed on spark-control's port (same as the dashboard):
|
||||
|
||||
Both downstream services already speak HTTP on the LAN; this module just adapts
|
||||
request/response shapes so OpenAI clients don't need a custom integration.
|
||||
|
||||
When Parakeet returns a 500 (commonly the recurring CUDA wedge), the proxy
|
||||
returns a clearer 503 with Retry-After: 60, and fires the deep-health probe in
|
||||
the background — which detects the wedge and triggers a rate-limited container
|
||||
restart within seconds. The client's next attempt ~60s later should then succeed.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter, Form, HTTPException, Request, UploadFile, File
|
||||
@@ -56,7 +62,13 @@ class SpeechRequest(BaseModel):
|
||||
encoding: Optional[str] = "LINEAR_PCM"
|
||||
|
||||
|
||||
def build_router(settings: Settings) -> APIRouter:
|
||||
def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
|
||||
"""Build the audio proxy router.
|
||||
|
||||
If `deep_health` is provided, 500s from Parakeet trigger an immediate
|
||||
background probe (which contains the same wedge-detect → auto-restart
|
||||
logic as the 5-minute periodic loop, but fires now instead of waiting).
|
||||
"""
|
||||
router = APIRouter()
|
||||
|
||||
def _parakeet_base() -> str:
|
||||
@@ -176,6 +188,23 @@ def build_router(settings: Settings) -> APIRouter:
|
||||
except httpx.HTTPError as e:
|
||||
raise HTTPException(502, f"parakeet unreachable: {e}")
|
||||
|
||||
if r.status_code == 500:
|
||||
# Parakeet 500s are almost always the CUDA wedge (CUBLAS_*_ERROR
|
||||
# mid-attention). Kick deep-health to detect+restart in the
|
||||
# background, and return a clean retry signal to the client.
|
||||
err_snippet = r.text[:400]
|
||||
logger.warning("parakeet 500 — firing deep-health probe in background. detail=%s", err_snippet)
|
||||
if deep_health is not None:
|
||||
try:
|
||||
asyncio.create_task(deep_health.run_one("parakeet"))
|
||||
except Exception as e:
|
||||
logger.error("failed to schedule deep-health probe: %s", e)
|
||||
raise HTTPException(
|
||||
status_code=503,
|
||||
detail="Parakeet returned a transient error (likely CUDA wedge). Auto-restart triggered; retry in ~60s.",
|
||||
headers={"Retry-After": "60"},
|
||||
)
|
||||
|
||||
if r.status_code != 200:
|
||||
raise HTTPException(r.status_code, r.text[:500])
|
||||
return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))
|
||||
|
||||
+3
-1
@@ -58,7 +58,9 @@ app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")
|
||||
# OpenAI-compatible audio proxy: /v1/audio/speech, /v1/audio/transcriptions, /v1/models.
|
||||
# Lets Open WebUI, Home Assistant, and any other OpenAI-shaped client talk to
|
||||
# Parakeet (STT) and Magpie (TTS) through a single spark-control URL.
|
||||
app.include_router(build_audio_router(settings))
|
||||
# Passing deep_health lets the proxy fire an immediate wedge-detect + auto-restart
|
||||
# when Parakeet returns 500, instead of waiting up to 5 min for the periodic probe.
|
||||
app.include_router(build_audio_router(settings, deep_health=deep_health))
|
||||
|
||||
|
||||
@app.get("/", include_in_schema=False)
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||
|
||||
export const v0_1_0 = VersionInfo.of({
|
||||
version: '0.9.0:1',
|
||||
version: '0.9.0:2',
|
||||
releaseNotes: {
|
||||
en_US:
|
||||
'v0.9.0:1 — fix: 0.9.0:0 added the OpenAI-compatible audio proxy (TTS to Magpie, STT to Parakeet) but the new /v1/audio/transcriptions endpoint uses FastAPI Form/File parameters, which need python-multipart at runtime. That package wasn\'t in the Docker image\'s dependency list (the dashboard never needed multipart before), so FastAPI crashed on import — taking down the whole dashboard. This patch adds python-multipart>=0.0.9 to image/pyproject.toml. After installing this version, the dashboard URL, the StartTunnel clearnet domain, and the new /v1/* audio endpoints all come back up. No code changes to the proxy or anything else.',
|
||||
'v0.9.0:2 — Open WebUI voice mode UX fix. Parakeet has a recurring CUDA wedge (CUBLAS_STATUS_*_ERROR mid-attention) that fires reliably on Open WebUI\'s WebM/Opus→MP3 audio. Previously the proxy just relayed the upstream 500, Open WebUI showed "Server connection error", and you had to wait up to 5 min for the periodic deep-health probe to detect+restart Parakeet. Now: when Parakeet returns 500, the proxy fires deep-health\'s probe immediately in the background (which contains the same wedge-detect + rate-limited auto-restart logic) and returns 503 with Retry-After: 60 instead. The client gets a clear retry signal and the auto-restart kicks in within seconds. Retrying ~60s later should succeed reliably.',
|
||||
},
|
||||
migrations: {
|
||||
up: async ({ effects }) => {},
|
||||
|
||||
Reference in New Issue
Block a user