v0.9.0:0 - OpenAI-compatible audio proxy for Open WebUI / Home Assistant
Adds three new endpoints to spark-control that translate OpenAI's
audio API shapes to the Parakeet (STT) and Magpie (TTS, NVIDIA Riva)
services on the Sparks:
GET /v1/models — STT model + Magpie's 60+ voices
POST /v1/audio/speech — OpenAI body -> Magpie multipart synthesize
(returns audio/wav passthrough)
POST /v1/audio/transcriptions — relay to Parakeet (already compatible)
Verified shapes against the live services:
- Parakeet returns OpenAI-style {"text": "..."} or verbose_json with
segments+words. Already a perfect drop-in for OpenAI clients.
- Magpie returns raw WAV bytes with Content-Type: audio/wav. NOT
base64-wrapped JSON as one might assume. The proxy is literally a
body-translation on the request side; response is passthrough.
Voice language is auto-derived from the voice name (e.g.
Magpie-Multilingual.EN-US.Mia -> language=en-US) so clients don't
need to set it explicitly.
Open WebUI / Home Assistant / Recap Relay can now all point at one
URL — https://<spark-control>.local/v1 — and get LLM, STT, TTS
behind a single identity. No shim service to deploy.
Pure addition: no existing routes touched; the dashboard, /api/*,
download flow, deep-health, hardware probes are all unchanged.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,183 @@
|
||||
"""OpenAI-compatible audio proxy: lets any OpenAI-shaped client (Open WebUI,
|
||||
Home Assistant, etc.) talk to Parakeet (STT) and Magpie (TTS) through one URL.
|
||||
|
||||
Endpoints exposed on spark-control's port (same as the dashboard):
|
||||
GET /v1/models — lists STT model + Magpie voices in OpenAI shape
|
||||
POST /v1/audio/speech — OpenAI TTS → Magpie /v1/audio/synthesize
|
||||
POST /v1/audio/transcriptions — forward to Parakeet (already OpenAI-compatible)
|
||||
|
||||
Both downstream services already speak HTTP on the LAN; this module just adapts
|
||||
request/response shapes so OpenAI clients don't need a custom integration.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter, Form, HTTPException, Request, UploadFile, File
|
||||
from fastapi.responses import Response, StreamingResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .config import Settings
|
||||
|
||||
logger = logging.getLogger("spark-control.audio")
|
||||
|
||||
# Magpie voice name encodes its language. Example:
|
||||
# Magpie-Multilingual.EN-US.Mia -> en-US
|
||||
# Magpie-Multilingual.ES-US.Diego -> es-US
|
||||
# Magpie-Multilingual.FR-FR.Pascal -> fr-FR
|
||||
def _lang_from_voice(voice: str) -> str:
|
||||
try:
|
||||
parts = voice.split(".")
|
||||
# parts = ["Magpie-Multilingual", "EN-US", "Mia"] (or with emotion suffix)
|
||||
if len(parts) >= 2 and "-" in parts[1]:
|
||||
lang_part = parts[1] # "EN-US"
|
||||
primary, region = lang_part.split("-", 1)
|
||||
return f"{primary.lower()}-{region.upper()}"
|
||||
except Exception:
|
||||
pass
|
||||
return "en-US"
|
||||
|
||||
|
||||
# Default voice: used when a request does not specify one (hard-coded here, not read from settings).
|
||||
DEFAULT_VOICE = "Magpie-Multilingual.EN-US.Mia"
|
||||
|
||||
|
||||
class SpeechRequest(BaseModel):
    """Request body for POST /v1/audio/speech, in OpenAI's TTS shape.

    Only ``input`` is required. The first five fields mirror OpenAI's
    request body; the remaining ones are Magpie-specific extensions that
    clients may pass through.
    """

    # --- OpenAI fields ---

    # Ignored — Magpie has one model.
    model: Optional[str] = None

    # The text to speak.
    input: str

    # Voice name, e.g. "Magpie-Multilingual.EN-US.Mia".
    voice: Optional[str] = None

    # Only "wav" is supported today.
    response_format: Optional[str] = "wav"

    # Ignored by Magpie.
    speed: Optional[float] = 1.0

    # --- Magpie-specific extensions ---

    language: Optional[str] = None
    sample_rate_hz: Optional[int] = 22050
    encoding: Optional[str] = "LINEAR_PCM"
|
||||
|
||||
|
||||
def build_router(settings: Settings) -> APIRouter:
    """Build the router exposing the OpenAI-compatible audio endpoints.

    Args:
        settings: Supplies ``parakeet_host``/``parakeet_port`` (STT) and
            ``magpie_host``/``magpie_port`` (TTS) for the downstream services.

    Returns:
        An ``APIRouter`` with ``GET /v1/models``, ``POST /v1/audio/speech``
        and ``POST /v1/audio/transcriptions`` registered.
    """
    router = APIRouter()

    def _parakeet_base() -> str:
        """Return the base URL of the Parakeet (STT) service."""
        return f"http://{settings.parakeet_host}:{settings.parakeet_port}"

    def _magpie_base() -> str:
        """Return the base URL of the Magpie (TTS) service."""
        return f"http://{settings.magpie_host}:{settings.magpie_port}"

    async def _relay(
        service: str,
        url: str,
        *,
        timeout: float,
        default_media_type: str,
        **post_kwargs: object,
    ) -> Response:
        """POST to a downstream service and pass its body straight through.

        Args:
            service: Short service name used in the 502 error message.
            url: Full downstream URL.
            timeout: httpx client timeout in seconds.
            default_media_type: Used when the downstream response carries no
                Content-Type header.
            **post_kwargs: Forwarded to ``httpx.AsyncClient.post``
                (``data=``, ``files=``).

        Raises:
            HTTPException: 502 when the request fails at the transport level
                or the downstream answers with a non-200 status below 400;
                otherwise the downstream's own error status, with the first
                500 characters of its body as detail.
        """
        try:
            async with httpx.AsyncClient(timeout=timeout) as client:
                upstream = await client.post(url, **post_kwargs)
        except httpx.HTTPError as e:
            # Chain the cause so the original transport error stays in the traceback.
            raise HTTPException(502, f"{service} unreachable: {e}") from e

        if upstream.status_code != 200:
            # Surface the downstream error message verbatim so clients can
            # debug voice/lang typos. An unexpected 2xx/3xx is not something
            # we can relay as an error status, so report it as a bad gateway.
            status = upstream.status_code if upstream.status_code >= 400 else 502
            raise HTTPException(status, upstream.text[:500])

        media_type = upstream.headers.get("content-type", default_media_type)
        return Response(content=upstream.content, media_type=media_type)

    # ---- /v1/models ----
    @router.get("/v1/models")
    async def list_models() -> dict:
        """List the STT model plus Magpie's base voices in OpenAI list shape.

        Lets clients populate their voice-picker UIs. Voice enumeration is
        best-effort: if Magpie is offline or returns something unexpected,
        the failure is logged and only the entries gathered so far (at least
        the STT model) are returned.
        """
        data: list[dict] = [
            {
                "id": "parakeet-tdt-0.6b-v3",
                "object": "model",
                "owned_by": "nvidia",
                "kind": "stt",
            },
        ]
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                r = await client.get(f"{_magpie_base()}/v1/audio/list_voices")
            if r.status_code == 200:
                seen: set[str] = set()
                # The response maps a locale key to a payload with a "voices" list.
                for payload in r.json().values():
                    for v in payload.get("voices", []):
                        # Collapse emotion variants — expose only the base voice name.
                        # "Magpie-Multilingual.EN-US.Mia.Angry" -> "Magpie-Multilingual.EN-US.Mia"
                        parts = v.split(".")
                        base = ".".join(parts[:3]) if len(parts) >= 3 else v
                        if base in seen:
                            continue
                        seen.add(base)
                        data.append({
                            "id": base,
                            "object": "model",
                            "owned_by": "nvidia",
                            "kind": "tts",
                        })
        except Exception as e:
            # Deliberately broad: this endpoint must still answer when Magpie
            # is unreachable or its payload has an unexpected shape.
            logger.warning("magpie voice list unavailable: %s", e)
        return {"object": "list", "data": data}

    # ---- /v1/audio/speech (TTS) ----
    @router.post("/v1/audio/speech")
    async def speech(body: SpeechRequest) -> Response:
        """OpenAI-style TTS, translated to Magpie's synthesize call.

        The request is sent to Magpie as form fields. The language is taken
        from the body when given, otherwise derived from the voice name.
        Magpie's response body and Content-Type are passed through
        (``audio/wav`` when no Content-Type is present).

        Raises:
            HTTPException: 400 when ``input`` is empty or whitespace; 502 or
                the downstream error status when Magpie fails.
        """
        text = (body.input or "").strip()
        if not text:
            raise HTTPException(400, "input text is required")

        voice = body.voice or DEFAULT_VOICE
        language = body.language or _lang_from_voice(voice)
        sample_rate = int(body.sample_rate_hz or 22050)
        encoding = body.encoding or "LINEAR_PCM"

        form = {
            "text": text,
            "language": language,
            "voice": voice,
            "sample_rate_hz": str(sample_rate),
            "encoding": encoding,
        }
        return await _relay(
            "magpie",
            f"{_magpie_base()}/v1/audio/synthesize",
            timeout=120.0,
            default_media_type="audio/wav",
            data=form,
        )

    # ---- /v1/audio/transcriptions (STT) ----
    @router.post("/v1/audio/transcriptions")
    async def transcriptions(
        file: UploadFile = File(...),
        model: Optional[str] = Form(default=None),
        language: Optional[str] = Form(default=None),
        prompt: Optional[str] = Form(default=None),
        response_format: Optional[str] = Form(default="json"),
        temperature: Optional[float] = Form(default=None),
    ) -> Response:
        """Forward to Parakeet's already-OpenAI-compatible endpoint.

        We relay rather than redirect so clients only need to know one URL
        (spark-control's) — and so any future rewrites of the request shape
        (e.g. translating Whisper-format params) happen here.

        The uploaded file is read fully into memory and re-sent as a
        multipart upload; empty or unset form fields are not forwarded.

        Raises:
            HTTPException: 502 or the downstream error status when Parakeet
                fails.
        """
        body = await file.read()
        files = {
            "file": (
                file.filename or "audio.wav",
                body,
                file.content_type or "application/octet-stream",
            ),
        }

        optional_fields = {
            "model": model,
            "language": language,
            "prompt": prompt,
            "response_format": response_format,
        }
        data: dict[str, str] = {k: v for k, v in optional_fields.items() if v}
        # 0.0 is a meaningful temperature, so test against None, not truthiness.
        if temperature is not None:
            data["temperature"] = str(temperature)

        return await _relay(
            "parakeet",
            f"{_parakeet_base()}/v1/audio/transcriptions",
            timeout=300.0,
            default_media_type="application/json",
            files=files,
            data=data,
        )

    return router
|
||||
Reference in New Issue
Block a user