"""OpenAI-compatible audio proxy: lets any OpenAI-shaped client (Open WebUI, Home Assistant, etc.) talk to Parakeet (STT) and Magpie (TTS) through one URL. Endpoints exposed on spark-control's port (same as the dashboard): GET /v1/models — lists STT model + Magpie voices in OpenAI shape POST /v1/audio/speech — OpenAI TTS → Magpie /v1/audio/synthesize POST /v1/audio/transcriptions — forward to Parakeet (already OpenAI-compatible) Both downstream services already speak HTTP on the LAN; this module just adapts request/response shapes so OpenAI clients don't need a custom integration. When Parakeet returns a 500 (commonly the recurring CUDA wedge), the proxy returns a clearer 503 with Retry-After=60, and fires the deep-health probe in the background — which detects the wedge and triggers a rate-limited container restart inside seconds. The client's next attempt ~60s later then succeeds. """ from __future__ import annotations import asyncio import logging from typing import Any, Optional import httpx from fastapi import APIRouter, Form, HTTPException, Request, UploadFile, File from fastapi.responses import Response, StreamingResponse from pydantic import BaseModel from .config import Settings logger = logging.getLogger("spark-control.audio") # Magpie voice name encodes its language. Example: # Magpie-Multilingual.EN-US.Mia -> en-US # Magpie-Multilingual.ES-US.Diego -> es-US # Magpie-Multilingual.FR-FR.Pascal -> fr-FR def _lang_from_voice(voice: str) -> str: try: parts = voice.split(".") # parts = ["Magpie-Multilingual", "EN-US", "Mia"] (or with emotion suffix) if len(parts) >= 2 and "-" in parts[1]: lang_part = parts[1] # "EN-US" primary, region = lang_part.split("-", 1) return f"{primary.lower()}-{region.upper()}" except Exception: pass return "en-US" # Default voice: configurable, falls back to a sensible English voice if unset. DEFAULT_VOICE = "Magpie-Multilingual.EN-US.Mia" class SpeechRequest(BaseModel): """OpenAI /v1/audio/speech request body.""" model: Optional[str] = None # ignored — Magpie has one model input: str # the text to speak voice: Optional[str] = None # e.g. "Magpie-Multilingual.EN-US.Mia" response_format: Optional[str] = "wav" # only "wav" supported today speed: Optional[float] = 1.0 # ignored by Magpie # Magpie-specific extensions (clients may pass these through) language: Optional[str] = None sample_rate_hz: Optional[int] = 22050 encoding: Optional[str] = "LINEAR_PCM" def build_router(settings: Settings, deep_health: Any = None) -> APIRouter: """Build the audio proxy router. If `deep_health` is provided, 500s from Parakeet trigger an immediate background probe (which contains the same wedge-detect → auto-restart logic as the 5-minute periodic loop, but fires now instead of waiting). """ router = APIRouter() def _parakeet_base() -> str: return f"http://{settings.parakeet_host}:{settings.parakeet_port}" def _magpie_base() -> str: return f"http://{settings.magpie_host}:{settings.magpie_port}" # ---- /v1/models ---- @router.get("/v1/models") async def list_models() -> dict: """Advertise the STT model + a small voice menu so clients can populate their voice-picker UIs. Falls back gracefully if Magpie is offline (returns just the STT entry).""" data: list[dict] = [ { "id": "parakeet-tdt-0.6b-v3", "object": "model", "owned_by": "nvidia", "kind": "stt", }, ] # Try to enumerate voices from Magpie; if unreachable, just skip. try: async with httpx.AsyncClient(timeout=5.0) as client: r = await client.get(f"{_magpie_base()}/v1/audio/list_voices") if r.status_code == 200: voices_by_locales = r.json() seen = set() for _locales, payload in voices_by_locales.items(): for v in payload.get("voices", []): # Collapse emotion variants — expose only the base voice name. # "Magpie-Multilingual.EN-US.Mia.Angry" -> "Magpie-Multilingual.EN-US.Mia" parts = v.split(".") base = ".".join(parts[:3]) if len(parts) >= 3 else v if base not in seen: seen.add(base) data.append({ "id": base, "object": "model", "owned_by": "nvidia", "kind": "tts", }) except Exception as e: logger.warning("magpie voice list unavailable: %s", e) return {"object": "list", "data": data} # ---- /v1/audio/speech (TTS) ---- @router.post("/v1/audio/speech") async def speech(body: SpeechRequest) -> Response: """OpenAI-style TTS. Translates to Magpie's multipart synth call. Returns raw WAV bytes (Content-Type: audio/wav) — browsers and most clients play these directly. """ text = (body.input or "").strip() if not text: raise HTTPException(400, "input text is required") voice = body.voice or DEFAULT_VOICE language = body.language or _lang_from_voice(voice) sample_rate = int(body.sample_rate_hz or 22050) encoding = body.encoding or "LINEAR_PCM" form = { "text": text, "language": language, "voice": voice, "sample_rate_hz": str(sample_rate), "encoding": encoding, } try: async with httpx.AsyncClient(timeout=120.0) as client: r = await client.post(f"{_magpie_base()}/v1/audio/synthesize", data=form) except httpx.HTTPError as e: raise HTTPException(502, f"magpie unreachable: {e}") if r.status_code != 200: # Surface Magpie's error message verbatim so clients can debug voice/lang typos. raise HTTPException(r.status_code, r.text[:500]) # Magpie returns WAV bytes already (Content-Type: audio/wav). Pass through. media_type = r.headers.get("content-type", "audio/wav") return Response(content=r.content, media_type=media_type) # ---- /v1/audio/transcriptions (STT) ---- @router.post("/v1/audio/transcriptions") async def transcriptions( file: UploadFile = File(...), model: Optional[str] = Form(default=None), language: Optional[str] = Form(default=None), prompt: Optional[str] = Form(default=None), response_format: Optional[str] = Form(default="json"), temperature: Optional[float] = Form(default=None), ) -> Response: """Forward to Parakeet's already-OpenAI-compatible endpoint. We relay rather than redirect so clients only need to know one URL (spark-control's) — and so any future client-side rewrites of the request shape (e.g. translating Whisper-format params) happen here. """ body = await file.read() files = {"file": (file.filename or "audio.wav", body, file.content_type or "application/octet-stream")} data: dict[str, str] = {} if model: data["model"] = model if language: data["language"] = language if prompt: data["prompt"] = prompt if response_format: data["response_format"] = response_format if temperature is not None: data["temperature"] = str(temperature) try: async with httpx.AsyncClient(timeout=300.0) as client: r = await client.post( f"{_parakeet_base()}/v1/audio/transcriptions", files=files, data=data, ) except httpx.HTTPError as e: raise HTTPException(502, f"parakeet unreachable: {e}") if r.status_code == 500: # Parakeet 500s are almost always the CUDA wedge (CUBLAS_*_ERROR # mid-attention). Kick deep-health to detect+restart in the # background, and return a clean retry signal to the client. err_snippet = r.text[:400] logger.warning("parakeet 500 — firing deep-health probe in background. detail=%s", err_snippet) if deep_health is not None: try: asyncio.create_task(deep_health.run_one("parakeet")) except Exception as e: logger.error("failed to schedule deep-health probe: %s", e) raise HTTPException( status_code=503, detail="Parakeet returned a transient error (likely CUDA wedge). Auto-restart triggered; retry in ~60s.", headers={"Retry-After": "60"}, ) if r.status_code != 200: raise HTTPException(r.status_code, r.text[:500]) return Response(content=r.content, media_type=r.headers.get("content-type", "application/json")) return router