v0.9.0:2 - audio proxy: turn Parakeet wedge 500 into clean 503 + immediate auto-restart
Parakeet's recurring CUDA wedge (CUBLAS_STATUS_*_ERROR mid-attention)
fires reliably on Open WebUI's WebM/Opus->MP3 audio. Previously the
proxy relayed the upstream 500 verbatim, Open WebUI showed "Server
connection error" with no signal to retry, and recovery took up to
5 minutes (waiting for the next periodic deep-health probe).
Now the proxy:
1. Detects 500 from /v1/audio/transcriptions
2. Fires deep_health.run_one("parakeet") as a background asyncio task
(which contains the same wedge-detect + rate-limited auto-restart
logic, but runs immediately instead of waiting for the next tick)
3. Returns 503 with a clear detail message and Retry-After: 60
The client (Open WebUI, Home Assistant, etc.) gets a proper retry
signal; the auto-restart triggers within seconds; the next attempt
~60s later succeeds. Rate-limiting (3 restarts per 30 min) is
inherited from the deep-health module so this can't cause restart
storms.
server.py: pass deep_health into build_audio_router().
audio_proxy.py: new 503-with-restart branch; signature now accepts
deep_health as an optional dependency.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -8,10 +8,16 @@ Endpoints exposed on spark-control's port (same as the dashboard):
|
|||||||
|
|
||||||
Both downstream services already speak HTTP on the LAN; this module just adapts
|
Both downstream services already speak HTTP on the LAN; this module just adapts
|
||||||
request/response shapes so OpenAI clients don't need a custom integration.
|
request/response shapes so OpenAI clients don't need a custom integration.
|
||||||
|
|
||||||
|
When Parakeet returns a 500 (commonly the recurring CUDA wedge), the proxy
|
||||||
|
returns a clearer 503 with Retry-After: 60, and fires the deep-health probe in
|
||||||
|
the background — which detects the wedge and triggers a rate-limited container
|
||||||
|
restart within seconds. The client's next attempt ~60s later then succeeds.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
from typing import Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from fastapi import APIRouter, Form, HTTPException, Request, UploadFile, File
|
from fastapi import APIRouter, Form, HTTPException, Request, UploadFile, File
|
||||||
@@ -56,7 +62,13 @@ class SpeechRequest(BaseModel):
|
|||||||
encoding: Optional[str] = "LINEAR_PCM"
|
encoding: Optional[str] = "LINEAR_PCM"
|
||||||
|
|
||||||
|
|
||||||
def build_router(settings: Settings) -> APIRouter:
|
def build_router(settings: Settings, deep_health: Any = None) -> APIRouter:
|
||||||
|
"""Build the audio proxy router.
|
||||||
|
|
||||||
|
If `deep_health` is provided, 500s from Parakeet trigger an immediate
|
||||||
|
background probe (which contains the same wedge-detect → auto-restart
|
||||||
|
logic as the 5-minute periodic loop, but fires now instead of waiting).
|
||||||
|
"""
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
def _parakeet_base() -> str:
|
def _parakeet_base() -> str:
|
||||||
@@ -176,6 +188,23 @@ def build_router(settings: Settings) -> APIRouter:
|
|||||||
except httpx.HTTPError as e:
|
except httpx.HTTPError as e:
|
||||||
raise HTTPException(502, f"parakeet unreachable: {e}")
|
raise HTTPException(502, f"parakeet unreachable: {e}")
|
||||||
|
|
||||||
|
if r.status_code == 500:
|
||||||
|
# Parakeet 500s are almost always the CUDA wedge (CUBLAS_*_ERROR
|
||||||
|
# mid-attention). Kick deep-health to detect+restart in the
|
||||||
|
# background, and return a clean retry signal to the client.
|
||||||
|
err_snippet = r.text[:400]
|
||||||
|
logger.warning("parakeet 500 — firing deep-health probe in background. detail=%s", err_snippet)
|
||||||
|
if deep_health is not None:
|
||||||
|
try:
|
||||||
|
asyncio.create_task(deep_health.run_one("parakeet"))
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("failed to schedule deep-health probe: %s", e)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=503,
|
||||||
|
detail="Parakeet returned a transient error (likely CUDA wedge). Auto-restart triggered; retry in ~60s.",
|
||||||
|
headers={"Retry-After": "60"},
|
||||||
|
)
|
||||||
|
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
raise HTTPException(r.status_code, r.text[:500])
|
raise HTTPException(r.status_code, r.text[:500])
|
||||||
return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))
|
return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))
|
||||||
|
|||||||
+3
-1
@@ -58,7 +58,9 @@ app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")
|
|||||||
# OpenAI-compatible audio proxy: /v1/audio/speech, /v1/audio/transcriptions, /v1/models.
|
# OpenAI-compatible audio proxy: /v1/audio/speech, /v1/audio/transcriptions, /v1/models.
|
||||||
# Lets Open WebUI, Home Assistant, and any other OpenAI-shaped client talk to
|
# Lets Open WebUI, Home Assistant, and any other OpenAI-shaped client talk to
|
||||||
# Parakeet (STT) and Magpie (TTS) through a single spark-control URL.
|
# Parakeet (STT) and Magpie (TTS) through a single spark-control URL.
|
||||||
app.include_router(build_audio_router(settings))
|
# Passing deep_health lets the proxy fire an immediate wedge-detect + auto-restart
|
||||||
|
# when Parakeet returns 500, instead of waiting up to 5 min for the periodic probe.
|
||||||
|
app.include_router(build_audio_router(settings, deep_health=deep_health))
|
||||||
|
|
||||||
|
|
||||||
@app.get("/", include_in_schema=False)
|
@app.get("/", include_in_schema=False)
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
export const v0_1_0 = VersionInfo.of({
|
export const v0_1_0 = VersionInfo.of({
|
||||||
version: '0.9.0:1',
|
version: '0.9.0:2',
|
||||||
releaseNotes: {
|
releaseNotes: {
|
||||||
en_US:
|
en_US:
|
||||||
'v0.9.0:1 — fix: 0.9.0:0 added the OpenAI-compatible audio proxy (TTS to Magpie, STT to Parakeet) but the new /v1/audio/transcriptions endpoint uses FastAPI Form/File parameters, which need python-multipart at runtime. That package wasn\'t in the Docker image\'s dependency list (the dashboard never needed multipart before), so FastAPI crashed on import — taking down the whole dashboard. This patch adds python-multipart>=0.0.9 to image/pyproject.toml. After installing this version, the dashboard URL, the StartTunnel clearnet domain, and the new /v1/* audio endpoints all come back up. No code changes to the proxy or anything else.',
|
'v0.9.0:2 — Open WebUI voice mode UX fix. Parakeet has a recurring CUDA wedge (CUBLAS_STATUS_*_ERROR mid-attention) that fires reliably on Open WebUI\'s WebM/Opus→MP3 audio. Previously the proxy just relayed the upstream 500, Open WebUI showed "Server connection error", and you had to wait up to 5 min for the periodic deep-health probe to detect+restart Parakeet. Now: when Parakeet returns 500, the proxy fires deep-health\'s probe immediately in the background (which contains the same wedge-detect + rate-limited auto-restart logic) and returns 503 with Retry-After: 60 instead. The client gets a clear retry signal and the auto-restart kicks in within seconds. Retrying ~60s later should succeed reliably.',
|
||||||
},
|
},
|
||||||
migrations: {
|
migrations: {
|
||||||
up: async ({ effects }) => {},
|
up: async ({ effects }) => {},
|
||||||
|
|||||||
Reference in New Issue
Block a user