v0.9.0:0 - OpenAI-compatible audio proxy for Open WebUI / Home Assistant

Adds three new endpoints to spark-control that translate OpenAI's
audio API shapes to the Parakeet (STT) and Magpie (TTS, NVIDIA Riva)
services on the Sparks:

  GET  /v1/models                — STT model + Magpie's 60+ voices
  POST /v1/audio/speech          — OpenAI body -> Magpie multipart synthesize
                                    (returns audio/wav passthrough)
  POST /v1/audio/transcriptions  — relay to Parakeet (already compatible)

Verified shapes against the live services:
  - Parakeet returns OpenAI-style {"text": "..."} or verbose_json with
    segments+words. Already a perfect drop-in for OpenAI clients.
  - Magpie returns raw WAV bytes with Content-Type: audio/wav. NOT
    base64-wrapped JSON as one might assume. The proxy is literally a
    body-translation on the request side; response is passthrough.

Voice language is auto-derived from the voice name (e.g.
Magpie-Multilingual.EN-US.Mia -> language=en-US) so clients don't
need to set it explicitly.

Open WebUI / Home Assistant / Recap Relay can now all point at one
URL — https://<spark-control>.local/v1 — and get LLM, STT, TTS
behind a single identity. No shim service to deploy.

Pure addition: no existing routes touched; the dashboard, /api/*,
download flow, deep-health, hardware probes are all unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Keysat
2026-05-17 16:41:48 -05:00
parent befedf0852
commit f44e7f8b03
3 changed files with 191 additions and 2 deletions
+183
View File
@@ -0,0 +1,183 @@
"""OpenAI-compatible audio proxy: lets any OpenAI-shaped client (Open WebUI,
Home Assistant, etc.) talk to Parakeet (STT) and Magpie (TTS) through one URL.
Endpoints exposed on spark-control's port (same as the dashboard):
GET /v1/models — lists STT model + Magpie voices in OpenAI shape
POST /v1/audio/speech — OpenAI TTS → Magpie /v1/audio/synthesize
POST /v1/audio/transcriptions — forward to Parakeet (already OpenAI-compatible)
Both downstream services already speak HTTP on the LAN; this module just adapts
request/response shapes so OpenAI clients don't need a custom integration.
"""
from __future__ import annotations
import logging
from typing import Optional
import httpx
from fastapi import APIRouter, Form, HTTPException, Request, UploadFile, File
from fastapi.responses import Response, StreamingResponse
from pydantic import BaseModel
from .config import Settings
logger = logging.getLogger("spark-control.audio")
# Magpie voice name encodes its language. Example:
# Magpie-Multilingual.EN-US.Mia -> en-US
# Magpie-Multilingual.ES-US.Diego -> es-US
# Magpie-Multilingual.FR-FR.Pascal -> fr-FR
def _lang_from_voice(voice: str) -> str:
try:
parts = voice.split(".")
# parts = ["Magpie-Multilingual", "EN-US", "Mia"] (or with emotion suffix)
if len(parts) >= 2 and "-" in parts[1]:
lang_part = parts[1] # "EN-US"
primary, region = lang_part.split("-", 1)
return f"{primary.lower()}-{region.upper()}"
except Exception:
pass
return "en-US"
# Default voice: configurable, falls back to a sensible English voice if unset.
DEFAULT_VOICE = "Magpie-Multilingual.EN-US.Mia"
class SpeechRequest(BaseModel):
"""OpenAI /v1/audio/speech request body."""
model: Optional[str] = None # ignored — Magpie has one model
input: str # the text to speak
voice: Optional[str] = None # e.g. "Magpie-Multilingual.EN-US.Mia"
response_format: Optional[str] = "wav" # only "wav" supported today
speed: Optional[float] = 1.0 # ignored by Magpie
# Magpie-specific extensions (clients may pass these through)
language: Optional[str] = None
sample_rate_hz: Optional[int] = 22050
encoding: Optional[str] = "LINEAR_PCM"
def build_router(settings: Settings) -> APIRouter:
router = APIRouter()
def _parakeet_base() -> str:
return f"http://{settings.parakeet_host}:{settings.parakeet_port}"
def _magpie_base() -> str:
return f"http://{settings.magpie_host}:{settings.magpie_port}"
# ---- /v1/models ----
@router.get("/v1/models")
async def list_models() -> dict:
"""Advertise the STT model + a small voice menu so clients can
populate their voice-picker UIs. Falls back gracefully if Magpie
is offline (returns just the STT entry)."""
data: list[dict] = [
{
"id": "parakeet-tdt-0.6b-v3",
"object": "model",
"owned_by": "nvidia",
"kind": "stt",
},
]
# Try to enumerate voices from Magpie; if unreachable, just skip.
try:
async with httpx.AsyncClient(timeout=5.0) as client:
r = await client.get(f"{_magpie_base()}/v1/audio/list_voices")
if r.status_code == 200:
voices_by_locales = r.json()
seen = set()
for _locales, payload in voices_by_locales.items():
for v in payload.get("voices", []):
# Collapse emotion variants — expose only the base voice name.
# "Magpie-Multilingual.EN-US.Mia.Angry" -> "Magpie-Multilingual.EN-US.Mia"
parts = v.split(".")
base = ".".join(parts[:3]) if len(parts) >= 3 else v
if base not in seen:
seen.add(base)
data.append({
"id": base,
"object": "model",
"owned_by": "nvidia",
"kind": "tts",
})
except Exception as e:
logger.warning("magpie voice list unavailable: %s", e)
return {"object": "list", "data": data}
# ---- /v1/audio/speech (TTS) ----
@router.post("/v1/audio/speech")
async def speech(body: SpeechRequest) -> Response:
"""OpenAI-style TTS. Translates to Magpie's multipart synth call.
Returns raw WAV bytes (Content-Type: audio/wav) — browsers and most
clients play these directly.
"""
text = (body.input or "").strip()
if not text:
raise HTTPException(400, "input text is required")
voice = body.voice or DEFAULT_VOICE
language = body.language or _lang_from_voice(voice)
sample_rate = int(body.sample_rate_hz or 22050)
encoding = body.encoding or "LINEAR_PCM"
form = {
"text": text,
"language": language,
"voice": voice,
"sample_rate_hz": str(sample_rate),
"encoding": encoding,
}
try:
async with httpx.AsyncClient(timeout=120.0) as client:
r = await client.post(f"{_magpie_base()}/v1/audio/synthesize", data=form)
except httpx.HTTPError as e:
raise HTTPException(502, f"magpie unreachable: {e}")
if r.status_code != 200:
# Surface Magpie's error message verbatim so clients can debug voice/lang typos.
raise HTTPException(r.status_code, r.text[:500])
# Magpie returns WAV bytes already (Content-Type: audio/wav). Pass through.
media_type = r.headers.get("content-type", "audio/wav")
return Response(content=r.content, media_type=media_type)
# ---- /v1/audio/transcriptions (STT) ----
@router.post("/v1/audio/transcriptions")
async def transcriptions(
file: UploadFile = File(...),
model: Optional[str] = Form(default=None),
language: Optional[str] = Form(default=None),
prompt: Optional[str] = Form(default=None),
response_format: Optional[str] = Form(default="json"),
temperature: Optional[float] = Form(default=None),
) -> Response:
"""Forward to Parakeet's already-OpenAI-compatible endpoint.
We relay rather than redirect so clients only need to know one URL
(spark-control's) — and so any future client-side rewrites of the
request shape (e.g. translating Whisper-format params) happen here.
"""
body = await file.read()
files = {"file": (file.filename or "audio.wav", body, file.content_type or "application/octet-stream")}
data: dict[str, str] = {}
if model: data["model"] = model
if language: data["language"] = language
if prompt: data["prompt"] = prompt
if response_format: data["response_format"] = response_format
if temperature is not None: data["temperature"] = str(temperature)
try:
async with httpx.AsyncClient(timeout=300.0) as client:
r = await client.post(
f"{_parakeet_base()}/v1/audio/transcriptions",
files=files, data=data,
)
except httpx.HTTPError as e:
raise HTTPException(502, f"parakeet unreachable: {e}")
if r.status_code != 200:
raise HTTPException(r.status_code, r.text[:500])
return Response(content=r.content, media_type=r.headers.get("content-type", "application/json"))
return router
+6
View File
@@ -12,6 +12,7 @@ from typing import Literal
from .config import Settings from .config import Settings
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
from .custom_services import add_custom_service, delete_custom_service from .custom_services import add_custom_service, delete_custom_service
from .audio_proxy import build_router as build_audio_router
from .deep_health import DeepHealth from .deep_health import DeepHealth
from .disk import delete_from_disk, probe_disk from .disk import delete_from_disk, probe_disk
from .download import DownloadManager from .download import DownloadManager
@@ -54,6 +55,11 @@ async def _stop_deep_health() -> None:
_STATIC_DIR = Path(__file__).resolve().parent / "static" _STATIC_DIR = Path(__file__).resolve().parent / "static"
app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static") app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")
# OpenAI-compatible audio proxy: /v1/audio/speech, /v1/audio/transcriptions, /v1/models.
# Lets Open WebUI, Home Assistant, and any other OpenAI-shaped client talk to
# Parakeet (STT) and Magpie (TTS) through a single spark-control URL.
app.include_router(build_audio_router(settings))
@app.get("/", include_in_schema=False) @app.get("/", include_in_schema=False)
async def index() -> FileResponse: async def index() -> FileResponse:
+2 -2
View File
@@ -1,10 +1,10 @@
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk' import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
export const v0_1_0 = VersionInfo.of({ export const v0_1_0 = VersionInfo.of({
version: '0.8.1:2', version: '0.9.0:0',
releaseNotes: { releaseNotes: {
en_US: en_US:
'v0.8.1:2 — the primary card button now adapts to whether the model is on disk. If weights are present: green "Switch to this" (unchanged). If weights are NOT on disk: blue "Download" instead, which calls /api/download directly with the model\'s repo and the right mode (solo→Spark 1, cluster→both Sparks) — no more pasting the repo into the manual download form to re-fetch a deleted model. Re-installing a previously-deleted model is now one click + a confirmation. Builds on the disk-status pills + trash icons from 0.8.1.', 'v0.9.0 — OpenAI-compatible audio proxy. Spark Control now exposes /v1/audio/speech (TTS), /v1/audio/transcriptions (STT), and /v1/models on its own URL, translating OpenAI-shaped requests to Magpie (NVIDIA Riva multipart) and forwarding to Parakeet (already OpenAI-compatible). Open WebUI, Home Assistant, and any other OpenAI-compatible client can now point at https://<your-spark-control>.local/v1 and get TTS + STT + LLM all behind one identity — no shim service to deploy, no separate URLs to remember. /v1/models lists Magpie\'s 60+ voices across en-US, es-US, fr-FR, zh-CN, it-IT, hi-IN, vi-VN, ja-JP, de-DE so client UIs auto-populate their voice pickers. Falls back gracefully if Magpie is offline (still serves STT). Pure addition — no existing routes or endpoints changed.',
}, },
migrations: { migrations: {
up: async ({ effects }) => {}, up: async ({ effects }) => {},