From 197655a62b5c7d170ef3d05a97a751aafce9233b Mon Sep 17 00:00:00 2001 From: Keysat Date: Sun, 17 May 2026 18:07:35 -0500 Subject: [PATCH] v0.9.0:2 - audio proxy: turn Parakeet wedge 500 into clean 503 + immediate auto-restart Parakeet's recurring CUDA wedge (CUBLAS_STATUS_*_ERROR mid-attention) fires reliably on Open WebUI's WebM/Opus->MP3 audio. Previously the proxy relayed the upstream 500 verbatim, Open WebUI showed "Server connection error" with no signal to retry, and recovery took up to 5 minutes (waiting for the next periodic deep-health probe). Now the proxy: 1. Detects 500 from /v1/audio/transcriptions 2. Fires deep_health.run_one("parakeet") as a background asyncio task (which contains the same wedge-detect + rate-limited auto-restart logic, but runs immediately instead of waiting for the next tick) 3. Returns 503 with a clear detail message and Retry-After: 60 The client (Open WebUI, Home Assistant, etc.) gets a proper retry signal; the auto-restart triggers within seconds; the next attempt ~60s later succeeds. Rate-limiting (3 restarts per 30 min) is inherited from the deep-health module so this can't cause restart storms. server.py: pass deep_health into build_audio_router(). audio_proxy.py: new 503-with-restart branch; signature now accepts deep_health as an optional dependency. Co-Authored-By: Claude Opus 4.7 (1M context) --- image/app/audio_proxy.py | 33 ++++++++++++++++++++++++++++-- image/app/server.py | 4 +++- package/startos/versions/v0_1_0.ts | 4 ++-- 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/image/app/audio_proxy.py b/image/app/audio_proxy.py index c98244b..1ce0eba 100644 --- a/image/app/audio_proxy.py +++ b/image/app/audio_proxy.py @@ -8,10 +8,16 @@ Endpoints exposed on spark-control's port (same as the dashboard): Both downstream services already speak HTTP on the LAN; this module just adapts request/response shapes so OpenAI clients don't need a custom integration. 
+ +When Parakeet returns a 500 (commonly the recurring CUDA wedge), the proxy +returns a clearer 503 with Retry-After=60, and fires the deep-health probe in +the background — which detects the wedge and triggers a rate-limited container +restart inside seconds. The client's next attempt ~60s later then succeeds. """ from __future__ import annotations +import asyncio import logging -from typing import Optional +from typing import Any, Optional import httpx from fastapi import APIRouter, Form, HTTPException, Request, UploadFile, File @@ -56,7 +62,13 @@ class SpeechRequest(BaseModel): encoding: Optional[str] = "LINEAR_PCM" -def build_router(settings: Settings) -> APIRouter: +def build_router(settings: Settings, deep_health: Any = None) -> APIRouter: + """Build the audio proxy router. + + If `deep_health` is provided, 500s from Parakeet trigger an immediate + background probe (which contains the same wedge-detect → auto-restart + logic as the 5-minute periodic loop, but fires now instead of waiting). + """ router = APIRouter() def _parakeet_base() -> str: @@ -176,6 +188,23 @@ def build_router(settings: Settings) -> APIRouter: except httpx.HTTPError as e: raise HTTPException(502, f"parakeet unreachable: {e}") + if r.status_code == 500: + # Parakeet 500s are almost always the CUDA wedge (CUBLAS_*_ERROR + # mid-attention). Kick deep-health to detect+restart in the + # background, and return a clean retry signal to the client. + err_snippet = r.text[:400] + logger.warning("parakeet 500 — firing deep-health probe in background. detail=%s", err_snippet) + if deep_health is not None: + try: + asyncio.create_task(deep_health.run_one("parakeet")) + except Exception as e: + logger.error("failed to schedule deep-health probe: %s", e) + raise HTTPException( + status_code=503, + detail="Parakeet returned a transient error (likely CUDA wedge). 
Auto-restart triggered; retry in ~60s.", + headers={"Retry-After": "60"}, + ) + if r.status_code != 200: raise HTTPException(r.status_code, r.text[:500]) return Response(content=r.content, media_type=r.headers.get("content-type", "application/json")) diff --git a/image/app/server.py b/image/app/server.py index 652a9f7..d032915 100644 --- a/image/app/server.py +++ b/image/app/server.py @@ -58,7 +58,9 @@ app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static") # OpenAI-compatible audio proxy: /v1/audio/speech, /v1/audio/transcriptions, /v1/models. # Lets Open WebUI, Home Assistant, and any other OpenAI-shaped client talk to # Parakeet (STT) and Magpie (TTS) through a single spark-control URL. -app.include_router(build_audio_router(settings)) +# Passing deep_health lets the proxy fire an immediate wedge-detect + auto-restart +# when Parakeet returns 500, instead of waiting up to 5 min for the periodic probe. +app.include_router(build_audio_router(settings, deep_health=deep_health)) @app.get("/", include_in_schema=False) diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts index c6adb9b..122e86a 100644 --- a/package/startos/versions/v0_1_0.ts +++ b/package/startos/versions/v0_1_0.ts @@ -1,10 +1,10 @@ import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk' export const v0_1_0 = VersionInfo.of({ - version: '0.9.0:1', + version: '0.9.0:2', releaseNotes: { en_US: - 'v0.9.0:1 — fix: 0.9.0:0 added the OpenAI-compatible audio proxy (TTS to Magpie, STT to Parakeet) but the new /v1/audio/transcriptions endpoint uses FastAPI Form/File parameters, which need python-multipart at runtime. That package wasn\'t in the Docker image\'s dependency list (the dashboard never needed multipart before), so FastAPI crashed on import — taking down the whole dashboard. This patch adds python-multipart>=0.0.9 to image/pyproject.toml. 
After installing this version, the dashboard URL, the StartTunnel clearnet domain, and the new /v1/* audio endpoints all come back up. No code changes to the proxy or anything else.', + 'v0.9.0:2 — Open WebUI voice mode UX fix. Parakeet has a recurring CUDA wedge (CUBLAS_STATUS_*_ERROR mid-attention) that fires reliably on Open WebUI\'s WebM/Opus→MP3 audio. Previously the proxy just relayed the upstream 500, Open WebUI showed "Server connection error", and you had to wait up to 5 min for the periodic deep-health probe to detect+restart Parakeet. Now: when Parakeet returns 500, the proxy fires deep-health\'s probe immediately in the background (which contains the same wedge-detect + rate-limited auto-restart logic) and returns 503 with Retry-After: 60 instead. The client gets a clear retry signal and the auto-restart kicks in within seconds. Retrying ~60s later should succeed reliably.', }, migrations: { up: async ({ effects }) => {},