v0.9.0:0 - OpenAI-compatible audio proxy for Open WebUI / Home Assistant

Adds three new endpoints to spark-control that translate OpenAI's audio API shapes to the Parakeet (STT) and Magpie (TTS, NVIDIA Riva) services on the Sparks:

- GET /v1/models — STT model + Magpie's 60+ voices
- POST /v1/audio/speech — OpenAI body -> Magpie multipart synthesize (returns audio/wav passthrough)
- POST /v1/audio/transcriptions — relay to Parakeet (already compatible)

Verified shapes against the live services:

- Parakeet returns OpenAI-style {"text": "..."} or verbose_json with segments+words. It is already a drop-in for OpenAI clients.
- Magpie returns raw WAV bytes with Content-Type: audio/wav, NOT base64-wrapped JSON as one might assume. The proxy only translates the request body; the response is passed through unchanged.

Voice language is auto-derived from the voice name (e.g. Magpie-Multilingual.EN-US.Mia -> language=en-US) so clients don't need to set it explicitly.

Open WebUI / Home Assistant / Recap Relay can now all point at one URL — https://<spark-control>.local/v1 — and get LLM, STT, TTS behind a single identity. No shim service to deploy.

Pure addition: no existing routes touched; the dashboard, /api/*, download flow, deep-health, and hardware probes are all unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 16:41:48 -05:00
parent befedf0852
commit f44e7f8b03
3 changed files with 191 additions and 2 deletions
@@ -0,0 +1,183 @@
 """OpenAI-compatible audio proxy: lets any OpenAI-shaped client (Open WebUI,
 Home Assistant, etc.) talk to Parakeet (STT) and Magpie (TTS) through one URL.
 Endpoints exposed on spark-control's port (same as the dashboard):
  GET  /v1/models                 — lists STT model + Magpie voices in OpenAI shape
  POST /v1/audio/speech           — OpenAI TTS → Magpie /v1/audio/synthesize
  POST /v1/audio/transcriptions   — forward to Parakeet (already OpenAI-compatible)
 Both downstream services already speak HTTP on the LAN; this module just adapts
 request/response shapes so OpenAI clients don't need a custom integration.
 """
 from __future__ import annotations
 import logging
 from typing import Optional
 import httpx
 from fastapi import APIRouter, Form, HTTPException, Request, UploadFile, File
 from fastapi.responses import Response, StreamingResponse
 from pydantic import BaseModel
 from .config import Settings
 logger = logging.getLogger("spark-control.audio")
 # Magpie voice name encodes its language. Example:
 #   Magpie-Multilingual.EN-US.Mia        -> en-US
 #   Magpie-Multilingual.ES-US.Diego      -> es-US
 #   Magpie-Multilingual.FR-FR.Pascal     -> fr-FR
 def _lang_from_voice(voice: str) -> str:
    try:
        parts = voice.split(".")
        # parts = ["Magpie-Multilingual", "EN-US", "Mia"] (or with emotion suffix)
        if len(parts) >= 2 and "-" in parts[1]:
            lang_part = parts[1]  # "EN-US"
            primary, region = lang_part.split("-", 1)
            return f"{primary.lower()}-{region.upper()}"
    except Exception:
        pass
    return "en-US"
 # Voice used when a speech request does not name one. This is a hard-coded
 # literal; it is not read from Settings.
 DEFAULT_VOICE = "Magpie-Multilingual.EN-US.Mia"
 class SpeechRequest(BaseModel):
    """Request body for POST /v1/audio/speech, in OpenAI's TTS shape.

    Only ``input`` is required. ``model``, ``response_format`` and ``speed``
    are accepted so OpenAI-shaped bodies validate, but the speech handler in
    this module never reads them. The last three fields are Magpie-specific
    extensions that the handler forwards to the synthesize call.
    """
    model: Optional[str] = None              # accepted for compatibility; not read by the handler
    input: str                                # the text to speak (handler rejects blank text with 400)
    voice: Optional[str] = None              # e.g. "Magpie-Multilingual.EN-US.Mia"; DEFAULT_VOICE when unset
    response_format: Optional[str] = "wav"   # not read by the handler; Magpie's response is passed through as-is
    speed: Optional[float] = 1.0             # not read by the handler and not forwarded to Magpie
    # Magpie-specific extensions (clients may pass these through)
    language: Optional[str] = None           # when unset, derived from the voice name
    sample_rate_hz: Optional[int] = 22050    # forwarded as a string; handler falls back to 22050 when falsy
    encoding: Optional[str] = "LINEAR_PCM"   # forwarded verbatim; handler falls back to "LINEAR_PCM" when falsy
 def build_router(settings: Settings) -> APIRouter:
    """Build the router carrying the OpenAI-shaped audio endpoints.

    Registers GET /v1/models, POST /v1/audio/speech and
    POST /v1/audio/transcriptions. Every handler opens its own
    httpx.AsyncClient for the duration of one downstream call.

    Args:
        settings: provides magpie_host/magpie_port and
            parakeet_host/parakeet_port, from which the downstream base URLs
            are built on each request.

    Returns:
        An APIRouter with the three routes attached.
    """
    router = APIRouter()

    def _parakeet_base() -> str:
        """Return the base URL of the Parakeet (STT) service."""
        return f"http://{settings.parakeet_host}:{settings.parakeet_port}"

    def _magpie_base() -> str:
        """Return the base URL of the Magpie (TTS) service."""
        return f"http://{settings.magpie_host}:{settings.magpie_port}"

    # ---- /v1/models ----
    @router.get("/v1/models")
    async def list_models() -> dict:
        """List the STT model and Magpie's base voices in OpenAI list shape.

        Voice enumeration is best-effort: if Magpie is unreachable, answers
        with a non-200 status, or returns a payload that cannot be walked,
        the problem is logged and whatever was collected so far is returned
        (at minimum the STT entry).
        """
        data: list[dict] = [
            {
                "id": "parakeet-tdt-0.6b-v3",
                "object": "model",
                "owned_by": "nvidia",
                "kind": "stt",
            },
        ]
        # Deliberately broad except below: this endpoint must never fail just
        # because the voice list could not be fetched or parsed.
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                r = await client.get(f"{_magpie_base()}/v1/audio/list_voices")
            if r.status_code != 200:
                # Log it so an empty voice picker can be traced to Magpie.
                logger.warning("magpie voice list returned HTTP %s", r.status_code)
            else:
                seen: set[str] = set()
                # The response is walked as {<locales>: {"voices": [name, ...]}}.
                for payload in r.json().values():
                    for v in payload.get("voices", []):
                        # Collapse emotion variants — expose only the base voice name.
                        # "Magpie-Multilingual.EN-US.Mia.Angry" -> "Magpie-Multilingual.EN-US.Mia"
                        # Names with fewer than three segments come through unchanged.
                        base = ".".join(v.split(".")[:3])
                        if base in seen:
                            continue
                        seen.add(base)
                        data.append({
                            "id": base,
                            "object": "model",
                            "owned_by": "nvidia",
                            "kind": "tts",
                        })
        except Exception as e:
            logger.warning("magpie voice list unavailable: %s", e)
        return {"object": "list", "data": data}

    # ---- /v1/audio/speech (TTS) ----
    @router.post("/v1/audio/speech")
    async def speech(body: SpeechRequest) -> Response:
        """OpenAI-style TTS: translate the body into Magpie's synthesize form fields.

        The voice defaults to DEFAULT_VOICE and the language is derived from
        the voice name unless the request sets it. Magpie's response body and
        Content-Type are returned unchanged (audio/wav if Magpie sends no
        Content-Type).

        Raises:
            HTTPException: 400 when the input text is blank, 502 when the
                request to Magpie fails at the HTTP layer, or Magpie's own
                status code (with up to 500 characters of its body) when it
                answers with anything other than 200.
        """
        text = (body.input or "").strip()
        if not text:
            raise HTTPException(400, "input text is required")
        voice = body.voice or DEFAULT_VOICE
        language = body.language or _lang_from_voice(voice)
        sample_rate = int(body.sample_rate_hz or 22050)
        encoding = body.encoding or "LINEAR_PCM"
        form = {
            "text": text,
            "language": language,
            "voice": voice,
            "sample_rate_hz": str(sample_rate),
            "encoding": encoding,
        }
        try:
            async with httpx.AsyncClient(timeout=120.0) as client:
                # NOTE(review): with `data=` and no `files=`, httpx encodes the
                # body as application/x-www-form-urlencoded, not multipart.
                # Confirm Magpie accepts that encoding.
                r = await client.post(f"{_magpie_base()}/v1/audio/synthesize", data=form)
        except httpx.HTTPError as e:
            raise HTTPException(502, f"magpie unreachable: {e}") from e
        if r.status_code != 200:
            # Surface Magpie's error message verbatim so clients can debug voice/lang typos.
            raise HTTPException(r.status_code, r.text[:500])
        # Pass Magpie's bytes and Content-Type through untouched.
        media_type = r.headers.get("content-type", "audio/wav")
        return Response(content=r.content, media_type=media_type)

    # ---- /v1/audio/transcriptions (STT) ----
    @router.post("/v1/audio/transcriptions")
    async def transcriptions(
        file: UploadFile = File(...),
        model: Optional[str] = Form(default=None),
        language: Optional[str] = Form(default=None),
        prompt: Optional[str] = Form(default=None),
        response_format: Optional[str] = Form(default="json"),
        temperature: Optional[float] = Form(default=None),
    ) -> Response:
        """Relay an OpenAI-style transcription request to Parakeet.

        We relay rather than redirect so clients only need to know one URL
        (spark-control's), and so any future rewrites of the request shape
        can happen here. The upload is read fully into memory, re-sent as a
        multipart file, and Parakeet's response body and Content-Type are
        returned unchanged.

        Raises:
            HTTPException: 502 when the request to Parakeet fails at the HTTP
                layer, or Parakeet's own status code (with up to 500
                characters of its body) when it answers with anything other
                than 200.
        """
        body = await file.read()
        files = {
            "file": (
                file.filename or "audio.wav",
                body,
                file.content_type or "application/octet-stream",
            ),
        }
        # Forward only the optional fields the client actually supplied.
        data: dict[str, str] = {}
        if model:
            data["model"] = model
        if language:
            data["language"] = language
        if prompt:
            data["prompt"] = prompt
        if response_format:
            data["response_format"] = response_format
        if temperature is not None:
            # Compared against None so that an explicit 0.0 is still forwarded.
            data["temperature"] = str(temperature)
        try:
            async with httpx.AsyncClient(timeout=300.0) as client:
                r = await client.post(
                    f"{_parakeet_base()}/v1/audio/transcriptions",
                    files=files, data=data,
                )
        except httpx.HTTPError as e:
            raise HTTPException(502, f"parakeet unreachable: {e}") from e
        if r.status_code != 200:
            raise HTTPException(r.status_code, r.text[:500])
        return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))

    return router
@@ -12,6 +12,7 @@ from typing import Literal
 from .config import Settings
 from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
 from .custom_services import add_custom_service, delete_custom_service
 from .audio_proxy import build_router as build_audio_router
 from .deep_health import DeepHealth
 from .disk import delete_from_disk, probe_disk
 from .download import DownloadManager
@@ -54,6 +55,11 @@ async def _stop_deep_health() -> None:
 # Static assets are served from the "static" directory next to this module.
 _STATIC_DIR = Path(__file__).resolve().parent / "static"
 app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")
 # OpenAI-compatible audio proxy: /v1/audio/speech, /v1/audio/transcriptions, /v1/models.
 # Lets Open WebUI, Home Assistant, and any other OpenAI-shaped client talk to
 # Parakeet (STT) and Magpie (TTS) through a single spark-control URL.
 app.include_router(build_audio_router(settings))
@app.get("/", include_in_schema=False)
 async def index() -> FileResponse:
@@ -1,10 +1,10 @@
 import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
 export const v0_1_0 = VersionInfo.of({
-  version: '0.8.1:2',
+  version: '0.9.0:0',
  releaseNotes: {
    en_US:
-      'v0.8.1:2 — the primary card button now adapts to whether the model is on disk. If weights are present: green "Switch to this" (unchanged). If weights are NOT on disk: blue "Download" instead, which calls /api/download directly with the model\'s repo and the right mode (solo→Spark 1, cluster→both Sparks) — no more pasting the repo into the manual download form to re-fetch a deleted model. Re-installing a previously-deleted model is now one click + a confirmation. Builds on the disk-status pills + trash icons from 0.8.1.',
+      'v0.9.0 — OpenAI-compatible audio proxy. Spark Control now exposes /v1/audio/speech (TTS), /v1/audio/transcriptions (STT), and /v1/models on its own URL, translating OpenAI-shaped requests to Magpie (NVIDIA Riva multipart) and forwarding to Parakeet (already OpenAI-compatible). Open WebUI, Home Assistant, and any other OpenAI-compatible client can now point at https://<your-spark-control>.local/v1 and get TTS + STT + LLM all behind one identity — no shim service to deploy, no separate URLs to remember. /v1/models lists Magpie\'s 60+ voices across en-US, es-US, fr-FR, zh-CN, it-IT, hi-IN, vi-VN, ja-JP, de-DE so client UIs auto-populate their voice pickers. Falls back gracefully if Magpie is offline (still serves STT). Pure addition — no existing routes or endpoints changed.',
  },
  migrations: {
    up: async ({ effects }) => {},