v0.9.0:0 - OpenAI-compatible audio proxy for Open WebUI / Home Assistant

Adds three new endpoints to spark-control that translate OpenAI's audio API
shapes to the Parakeet (STT) and Magpie (TTS, NVIDIA Riva) services on the
Sparks:

  GET  /v1/models               — STT model + Magpie's 60+ voices
  POST /v1/audio/speech         — OpenAI body -> Magpie multipart synthesize
                                  (returns audio/wav passthrough)
  POST /v1/audio/transcriptions — relay to Parakeet (already compatible)

Verified shapes against the live services:
  - Parakeet returns OpenAI-style {"text": "..."} or verbose_json with
    segments+words. Already a perfect drop-in for OpenAI clients.
  - Magpie returns raw WAV bytes with Content-Type: audio/wav. NOT
    base64-wrapped JSON as one might assume. The proxy is literally a
    body-translation on the request side; response is passthrough.

Voice language is auto-derived from the voice name (e.g.
Magpie-Multilingual.EN-US.Mia -> language=en-US) so clients don't need to
set it explicitly.

Open WebUI / Home Assistant / Recap Relay can now all point at one URL —
https://<spark-control>.local/v1 — and get LLM, STT, TTS behind a single
identity. No shim service to deploy.

Pure addition: no existing routes touched; the dashboard, /api/*, download
flow, deep-health, hardware probes are all unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 16:41:48 -05:00
parent befedf0852
commit f44e7f8b03
3 changed files with 191 additions and 2 deletions
@@ -0,0 +1,183 @@
+"""OpenAI-compatible audio proxy: lets any OpenAI-shaped client (Open WebUI,
+Home Assistant, etc.) talk to Parakeet (STT) and Magpie (TTS) through one URL.
+
+Endpoints exposed on spark-control's port (same as the dashboard):
+  GET  /v1/models                 — lists STT model + Magpie voices in OpenAI shape
+  POST /v1/audio/speech           — OpenAI TTS → Magpie /v1/audio/synthesize
+  POST /v1/audio/transcriptions   — forward to Parakeet (already OpenAI-compatible)
+
+Both downstream services already speak HTTP on the LAN; this module just adapts
+request/response shapes so OpenAI clients don't need a custom integration.
+"""
+from __future__ import annotations
+import logging
+from typing import Optional
+
+import httpx
+from fastapi import APIRouter, Form, HTTPException, Request, UploadFile, File
+from fastapi.responses import Response, StreamingResponse
+from pydantic import BaseModel
+
+from .config import Settings
+
+logger = logging.getLogger("spark-control.audio")
+
+# Magpie voice name encodes its language. Example:
+#   Magpie-Multilingual.EN-US.Mia        -> en-US
+#   Magpie-Multilingual.ES-US.Diego      -> es-US
+#   Magpie-Multilingual.FR-FR.Pascal     -> fr-FR
+def _lang_from_voice(voice: str) -> str:
+    """Derive the ``ll-RR`` language tag encoded in a Magpie voice name.
+
+    The locale is taken from the second dot-separated field of the name,
+    e.g. ``Magpie-Multilingual.EN-US.Mia`` -> ``en-US``. The part before the
+    first hyphen is lower-cased and the remainder is upper-cased.
+
+    Args:
+        voice: Voice name as sent by the client.
+
+    Returns:
+        The derived tag, or ``"en-US"`` when *voice* is not a string, has no
+        second field, or that field lacks a non-empty part on either side of
+        the hyphen.
+    """
+    fallback = "en-US"
+    if not isinstance(voice, str):
+        return fallback
+
+    fields = voice.split(".")
+    # fields = ["Magpie-Multilingual", "EN-US", "Mia"] (or with emotion suffix)
+    if len(fields) < 2:
+        return fallback
+
+    primary, hyphen, region = fields[1].partition("-")
+    # Reject fragments such as "-US" or "EN-" instead of emitting a malformed
+    # tag that the TTS backend would have to reject.
+    if not (hyphen and primary and region):
+        return fallback
+    return f"{primary.lower()}-{region.upper()}"
+
+
+# Default voice used when a request does not name one. This is a hard-coded
+# module constant; nothing in this module reads it from Settings.
+DEFAULT_VOICE = "Magpie-Multilingual.EN-US.Mia"
+
+
+class SpeechRequest(BaseModel):
+    """OpenAI /v1/audio/speech request body.
+
+    Only ``input`` is required. ``model``, ``response_format`` and ``speed``
+    are accepted so OpenAI-shaped requests validate, but the ``speech``
+    handler in this module does not read them. The remaining optional fields
+    are forwarded to Magpie, with defaults filled in by the handler when
+    they are missing or empty.
+    """
+    model: Optional[str] = None              # accepted but not read by the handler
+    input: str                                # the text to speak (required)
+    voice: Optional[str] = None              # e.g. "Magpie-Multilingual.EN-US.Mia"; handler falls back to DEFAULT_VOICE
+    response_format: Optional[str] = "wav"   # accepted but not read; the upstream body is returned as-is
+    speed: Optional[float] = 1.0             # accepted but not forwarded to Magpie
+    # Magpie-specific extensions (clients may pass these through)
+    language: Optional[str] = None           # when unset, the handler derives it from the voice name
+    sample_rate_hz: Optional[int] = 22050    # forwarded as a string form field
+    encoding: Optional[str] = "LINEAR_PCM"   # forwarded unchanged
+
+
+def build_router(settings: Settings) -> APIRouter:
+    """Build the router that exposes the OpenAI-shaped audio endpoints.
+
+    Args:
+        settings: Supplies ``parakeet_host``/``parakeet_port`` and
+            ``magpie_host``/``magpie_port``, from which the upstream base
+            URLs are built on every request.
+
+    Returns:
+        An ``APIRouter`` with ``GET /v1/models``, ``POST /v1/audio/speech``
+        and ``POST /v1/audio/transcriptions`` registered.
+    """
+    router = APIRouter()
+
+    def _parakeet_base() -> str:
+        """Return the base URL of the Parakeet (STT) service."""
+        return f"http://{settings.parakeet_host}:{settings.parakeet_port}"
+
+    def _magpie_base() -> str:
+        """Return the base URL of the Magpie (TTS) service."""
+        return f"http://{settings.magpie_host}:{settings.magpie_port}"
+
+    def _base_voice_ids(voices_by_locale: object) -> list[str]:
+        """Collapse Magpie's per-locale voice listing into unique base names.
+
+        Emotion variants are folded into their base voice, e.g.
+        "Magpie-Multilingual.EN-US.Mia.Angry" -> "Magpie-Multilingual.EN-US.Mia".
+        Entries that do not have the expected shape are skipped individually,
+        so one malformed locale cannot hide the voices that follow it.
+        First-seen order is preserved.
+        """
+        if not isinstance(voices_by_locale, dict):
+            logger.warning(
+                "magpie voice list has unexpected shape: %s",
+                type(voices_by_locale).__name__,
+            )
+            return []
+
+        ordered: dict[str, None] = {}  # dict dedupes while keeping insertion order
+        for payload in voices_by_locale.values():
+            if not isinstance(payload, dict):
+                continue
+            voices = payload.get("voices", [])
+            if not isinstance(voices, list):
+                continue
+            for name in voices:
+                if not isinstance(name, str):
+                    continue
+                parts = name.split(".")
+                base = ".".join(parts[:3]) if len(parts) >= 3 else name
+                ordered.setdefault(base, None)
+        return list(ordered)
+
+    # ---- /v1/models ----
+    @router.get("/v1/models")
+    async def list_models() -> dict:
+        """List the STT model plus Magpie's base voices in OpenAI list shape.
+
+        The voice entries let clients populate a voice picker. Enumeration is
+        best-effort: if Magpie cannot be reached, answers with a non-200
+        status, or returns something unparseable, a warning is logged and
+        only the STT entry is returned.
+        """
+        data: list[dict] = [
+            {
+                "id": "parakeet-tdt-0.6b-v3",
+                "object": "model",
+                "owned_by": "nvidia",
+                "kind": "stt",
+            },
+        ]
+        try:
+            async with httpx.AsyncClient(timeout=5.0) as client:
+                r = await client.get(f"{_magpie_base()}/v1/audio/list_voices")
+            if r.status_code == 200:
+                data.extend(
+                    {
+                        "id": voice_id,
+                        "object": "model",
+                        "owned_by": "nvidia",
+                        "kind": "tts",
+                    }
+                    for voice_id in _base_voice_ids(r.json())
+                )
+            else:
+                logger.warning("magpie voice list returned HTTP %s", r.status_code)
+        except Exception as e:
+            # Deliberately broad: the model list must still be served when
+            # the TTS backend is down or misbehaving.
+            logger.warning("magpie voice list unavailable: %s", e)
+        return {"object": "list", "data": data}
+
+    # ---- /v1/audio/speech (TTS) ----
+    @router.post("/v1/audio/speech")
+    async def speech(body: SpeechRequest) -> Response:
+        """OpenAI-style TTS: translate the JSON body into Magpie's synth form.
+
+        The fields are posted with httpx's ``data=`` argument, which encodes
+        them as a URL-encoded form (not multipart/form-data).
+        NOTE(review): confirm Magpie accepts URL-encoded forms; if it only
+        takes multipart, the fields need to go through ``files=`` instead.
+
+        Returns:
+            The upstream body unchanged, with the upstream Content-Type
+            (``audio/wav`` when the header is absent).
+
+        Raises:
+            HTTPException: 400 when ``input`` is empty after stripping, 502
+                when the request to Magpie fails, or Magpie's own status
+                code (with the first 500 characters of its body) when it
+                answers with anything other than 200.
+        """
+        text = (body.input or "").strip()
+        if not text:
+            raise HTTPException(400, "input text is required")
+
+        voice = body.voice or DEFAULT_VOICE
+        form = {
+            "text": text,
+            "language": body.language or _lang_from_voice(voice),
+            "voice": voice,
+            "sample_rate_hz": str(int(body.sample_rate_hz or 22050)),
+            "encoding": body.encoding or "LINEAR_PCM",
+        }
+        try:
+            async with httpx.AsyncClient(timeout=120.0) as client:
+                r = await client.post(f"{_magpie_base()}/v1/audio/synthesize", data=form)
+        except httpx.HTTPError as e:
+            raise HTTPException(502, f"magpie unreachable: {e}") from e
+
+        if r.status_code != 200:
+            # Surface Magpie's error message verbatim so clients can debug voice/lang typos.
+            raise HTTPException(r.status_code, r.text[:500])
+
+        # Pass the audio bytes through untouched.
+        media_type = r.headers.get("content-type", "audio/wav")
+        return Response(content=r.content, media_type=media_type)
+
+    # ---- /v1/audio/transcriptions (STT) ----
+    @router.post("/v1/audio/transcriptions")
+    async def transcriptions(
+        file: UploadFile = File(...),
+        model: Optional[str] = Form(default=None),
+        language: Optional[str] = Form(default=None),
+        prompt: Optional[str] = Form(default=None),
+        response_format: Optional[str] = Form(default="json"),
+        temperature: Optional[float] = Form(default=None),
+    ) -> Response:
+        """Relay a transcription request to Parakeet's endpoint of the same path.
+
+        We relay rather than redirect so clients only need to know one URL
+        (spark-control's), and so any future rewriting of the request shape
+        can happen here. Empty or missing optional fields are not forwarded.
+
+        Returns:
+            The upstream body unchanged, with the upstream Content-Type
+            (``application/json`` when the header is absent).
+
+        Raises:
+            HTTPException: 502 when the request to Parakeet fails, or
+                Parakeet's own status code (with the first 500 characters of
+                its body) when it answers with anything other than 200.
+        """
+        audio = await file.read()
+        files = {
+            "file": (
+                file.filename or "audio.wav",
+                audio,
+                file.content_type or "application/octet-stream",
+            ),
+        }
+        optional_fields = {
+            "model": model,
+            "language": language,
+            "prompt": prompt,
+            "response_format": response_format,
+        }
+        data: dict[str, str] = {k: v for k, v in optional_fields.items() if v}
+        # 0.0 is a meaningful temperature, so test against None, not truthiness.
+        if temperature is not None:
+            data["temperature"] = str(temperature)
+
+        try:
+            async with httpx.AsyncClient(timeout=300.0) as client:
+                r = await client.post(
+                    f"{_parakeet_base()}/v1/audio/transcriptions",
+                    files=files, data=data,
+                )
+        except httpx.HTTPError as e:
+            raise HTTPException(502, f"parakeet unreachable: {e}") from e
+
+        if r.status_code != 200:
+            raise HTTPException(r.status_code, r.text[:500])
+        return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))
+
+    return router