v0.9.0:0 - OpenAI-compatible audio proxy for Open WebUI / Home Assistant
Adds three new endpoints to spark-control that translate OpenAI's
audio API shapes to the Parakeet (STT) and Magpie (TTS, NVIDIA Riva)
services on the Sparks:
GET /v1/models — STT model + Magpie's 60+ voices
POST /v1/audio/speech — OpenAI body -> Magpie multipart synthesize
(returns audio/wav passthrough)
POST /v1/audio/transcriptions — relay to Parakeet (already compatible)
Verified shapes against the live services:
- Parakeet returns OpenAI-style {"text": "..."} or verbose_json with
segments+words. Already a perfect drop-in for OpenAI clients.
- Magpie returns raw WAV bytes with Content-Type: audio/wav. NOT
base64-wrapped JSON as one might assume. The proxy is literally a
body-translation on the request side; response is passthrough.
Voice language is auto-derived from the voice name (e.g.
Magpie-Multilingual.EN-US.Mia -> language=en-US) so clients don't
need to set it explicitly.
Open WebUI / Home Assistant / Recap Relay can now all point at one
URL — https://<spark-control>.local/v1 — and get LLM, STT, TTS
behind a single identity. No shim service to deploy.
Pure addition: no existing routes touched; the dashboard, /api/*,
download flow, deep-health, hardware probes are all unchanged.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,183 @@
|
|||||||
|
"""OpenAI-compatible audio proxy: lets any OpenAI-shaped client (Open WebUI,
|
||||||
|
Home Assistant, etc.) talk to Parakeet (STT) and Magpie (TTS) through one URL.
|
||||||
|
|
||||||
|
Endpoints exposed on spark-control's port (same as the dashboard):
|
||||||
|
GET /v1/models — lists STT model + Magpie voices in OpenAI shape
|
||||||
|
POST /v1/audio/speech — OpenAI TTS → Magpie /v1/audio/synthesize
|
||||||
|
POST /v1/audio/transcriptions — forward to Parakeet (already OpenAI-compatible)
|
||||||
|
|
||||||
|
Both downstream services already speak HTTP on the LAN; this module just adapts
|
||||||
|
request/response shapes so OpenAI clients don't need a custom integration.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from fastapi import APIRouter, Form, HTTPException, Request, UploadFile, File
|
||||||
|
from fastapi.responses import Response, StreamingResponse
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from .config import Settings
|
||||||
|
|
||||||
|
logger = logging.getLogger("spark-control.audio")
|
||||||
|
|
||||||
|
# Magpie voice name encodes its language. Example:
|
||||||
|
# Magpie-Multilingual.EN-US.Mia -> en-US
|
||||||
|
# Magpie-Multilingual.ES-US.Diego -> es-US
|
||||||
|
# Magpie-Multilingual.FR-FR.Pascal -> fr-FR
|
||||||
|
def _lang_from_voice(voice: str) -> str:
|
||||||
|
try:
|
||||||
|
parts = voice.split(".")
|
||||||
|
# parts = ["Magpie-Multilingual", "EN-US", "Mia"] (or with emotion suffix)
|
||||||
|
if len(parts) >= 2 and "-" in parts[1]:
|
||||||
|
lang_part = parts[1] # "EN-US"
|
||||||
|
primary, region = lang_part.split("-", 1)
|
||||||
|
return f"{primary.lower()}-{region.upper()}"
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return "en-US"
|
||||||
|
|
||||||
|
|
||||||
|
# Default voice used when a request omits `voice`. This is a hard-coded constant (not read from Settings).
|
||||||
|
DEFAULT_VOICE = "Magpie-Multilingual.EN-US.Mia"
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechRequest(BaseModel):
    """OpenAI /v1/audio/speech request body.

    Only ``input`` is required. The ``speech`` handler in this module reads
    ``input``, ``voice``, ``language``, ``sample_rate_hz`` and ``encoding``;
    ``model``, ``response_format`` and ``speed`` are accepted so that
    OpenAI-shaped clients validate, but the handler does not consult them.
    """
    model: Optional[str] = None  # ignored — Magpie has one model
    input: str  # the text to speak
    voice: Optional[str] = None  # e.g. "Magpie-Multilingual.EN-US.Mia"
    response_format: Optional[str] = "wav"  # only "wav" supported today
    speed: Optional[float] = 1.0  # ignored by Magpie
    # Magpie-specific extensions (clients may pass these through)
    # When `language` is omitted, the handler derives it from the voice name.
    language: Optional[str] = None
    # Forwarded to Magpie as the `sample_rate_hz` form field.
    sample_rate_hz: Optional[int] = 22050
    # Forwarded to Magpie as the `encoding` form field.
    encoding: Optional[str] = "LINEAR_PCM"
|
||||||
|
|
||||||
|
|
||||||
|
def build_router(settings: Settings) -> APIRouter:
    """Build the router exposing the OpenAI-compatible audio endpoints.

    Args:
        settings: Supplies ``parakeet_host``/``parakeet_port`` (STT) and
            ``magpie_host``/``magpie_port`` (TTS). The values are read each
            time a request is handled, not once at router construction.

    Returns:
        An ``APIRouter`` with ``GET /v1/models``, ``POST /v1/audio/speech``
        and ``POST /v1/audio/transcriptions`` registered.
    """
    router = APIRouter()

    def _parakeet_base() -> str:
        """Return the base URL of the Parakeet (STT) service."""
        return f"http://{settings.parakeet_host}:{settings.parakeet_port}"

    def _magpie_base() -> str:
        """Return the base URL of the Magpie (TTS) service."""
        return f"http://{settings.magpie_host}:{settings.magpie_port}"

    # ---- /v1/models ----
    @router.get("/v1/models")
    async def list_models() -> dict:
        """Advertise the STT model plus Magpie's base voices in OpenAI shape.

        Emotion variants of a voice are collapsed into one entry so clients
        can populate their voice-picker UIs. If Magpie is unreachable or its
        response cannot be processed, the failure is logged and the entries
        collected so far (at minimum the STT model) are returned.
        """
        data: list[dict] = [
            {
                "id": "parakeet-tdt-0.6b-v3",
                "object": "model",
                "owned_by": "nvidia",
                "kind": "stt",
            },
        ]
        # Best-effort enrichment: any failure talking to Magpie must not take
        # the whole listing down, hence the deliberately broad except below.
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                r = await client.get(f"{_magpie_base()}/v1/audio/list_voices")
            if r.status_code == 200:
                voices_by_locales = r.json()
                seen = set()
                for _locales, payload in voices_by_locales.items():
                    for v in payload.get("voices", []):
                        # Collapse emotion variants — expose only the base voice name.
                        # "Magpie-Multilingual.EN-US.Mia.Angry" -> "Magpie-Multilingual.EN-US.Mia"
                        parts = v.split(".")
                        base = ".".join(parts[:3]) if len(parts) >= 3 else v
                        if base in seen:
                            continue
                        seen.add(base)
                        data.append({
                            "id": base,
                            "object": "model",
                            "owned_by": "nvidia",
                            "kind": "tts",
                        })
        except Exception as e:
            logger.warning("magpie voice list unavailable: %s", e)
        return {"object": "list", "data": data}

    # ---- /v1/audio/speech (TTS) ----
    @router.post("/v1/audio/speech")
    async def speech(body: SpeechRequest) -> Response:
        """OpenAI-style TTS. Translates to Magpie's form-encoded synth call.

        Returns Magpie's response body unchanged, with its Content-Type
        (``audio/wav`` if Magpie sends none).

        Raises:
            HTTPException: 400 if the input text is empty after stripping;
                502 if the request to Magpie fails at the HTTP layer; or
                Magpie's own status code (with up to 500 characters of its
                body) when it answers with anything other than 200.
        """
        text = (body.input or "").strip()
        if not text:
            raise HTTPException(400, "input text is required")

        voice = body.voice or DEFAULT_VOICE
        # Clients rarely send a language; the voice name already encodes it.
        language = body.language or _lang_from_voice(voice)
        sample_rate = int(body.sample_rate_hz or 22050)
        encoding = body.encoding or "LINEAR_PCM"

        form = {
            "text": text,
            "language": language,
            "voice": voice,
            "sample_rate_hz": str(sample_rate),
            "encoding": encoding,
        }
        try:
            async with httpx.AsyncClient(timeout=120.0) as client:
                r = await client.post(f"{_magpie_base()}/v1/audio/synthesize", data=form)
        except httpx.HTTPError as e:
            # Chain the cause so server-side tracebacks show the real failure.
            raise HTTPException(502, f"magpie unreachable: {e}") from e

        if r.status_code != 200:
            # Surface Magpie's error message verbatim so clients can debug voice/lang typos.
            raise HTTPException(r.status_code, r.text[:500])

        # Magpie returns WAV bytes already (Content-Type: audio/wav). Pass through.
        media_type = r.headers.get("content-type", "audio/wav")
        return Response(content=r.content, media_type=media_type)

    # ---- /v1/audio/transcriptions (STT) ----
    @router.post("/v1/audio/transcriptions")
    async def transcriptions(
        file: UploadFile = File(...),
        model: Optional[str] = Form(default=None),
        language: Optional[str] = Form(default=None),
        prompt: Optional[str] = Form(default=None),
        response_format: Optional[str] = Form(default="json"),
        temperature: Optional[float] = Form(default=None),
    ) -> Response:
        """Forward to Parakeet's already-OpenAI-compatible endpoint.

        We relay rather than redirect so clients only need to know one URL
        (spark-control's) — and so any future client-side rewrites of the
        request shape (e.g. translating Whisper-format params) happen here.

        Raises:
            HTTPException: 502 if the request to Parakeet fails at the HTTP
                layer, or Parakeet's own status code (with up to 500
                characters of its body) when it answers with anything other
                than 200.
        """
        body = await file.read()
        files = {
            "file": (
                file.filename or "audio.wav",
                body,
                file.content_type or "application/octet-stream",
            ),
        }

        # Forward only the optional fields the client actually supplied.
        optional_fields = {
            "model": model,
            "language": language,
            "prompt": prompt,
            "response_format": response_format,
        }
        data: dict[str, str] = {
            key: value for key, value in optional_fields.items() if value
        }
        # 0.0 is a meaningful temperature, so test against None, not truthiness.
        if temperature is not None:
            data["temperature"] = str(temperature)

        try:
            async with httpx.AsyncClient(timeout=300.0) as client:
                r = await client.post(
                    f"{_parakeet_base()}/v1/audio/transcriptions",
                    files=files,
                    data=data,
                )
        except httpx.HTTPError as e:
            # Chain the cause so server-side tracebacks show the real failure.
            raise HTTPException(502, f"parakeet unreachable: {e}") from e

        if r.status_code != 200:
            raise HTTPException(r.status_code, r.text[:500])
        return Response(
            content=r.content,
            media_type=r.headers.get("content-type", "application/json"),
        )

    return router
|
||||||
@@ -12,6 +12,7 @@ from typing import Literal
|
|||||||
from .config import Settings
|
from .config import Settings
|
||||||
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
from .connectivity import get_mac, record_report, record_state, summary as connectivity_summary
|
||||||
from .custom_services import add_custom_service, delete_custom_service
|
from .custom_services import add_custom_service, delete_custom_service
|
||||||
|
from .audio_proxy import build_router as build_audio_router
|
||||||
from .deep_health import DeepHealth
|
from .deep_health import DeepHealth
|
||||||
from .disk import delete_from_disk, probe_disk
|
from .disk import delete_from_disk, probe_disk
|
||||||
from .download import DownloadManager
|
from .download import DownloadManager
|
||||||
@@ -54,6 +55,11 @@ async def _stop_deep_health() -> None:
|
|||||||
_STATIC_DIR = Path(__file__).resolve().parent / "static"
|
_STATIC_DIR = Path(__file__).resolve().parent / "static"
|
||||||
app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")
|
app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")
|
||||||
|
|
||||||
|
# OpenAI-compatible audio proxy: /v1/audio/speech, /v1/audio/transcriptions, /v1/models.
|
||||||
|
# Lets Open WebUI, Home Assistant, and any other OpenAI-shaped client talk to
|
||||||
|
# Parakeet (STT) and Magpie (TTS) through a single spark-control URL.
|
||||||
|
app.include_router(build_audio_router(settings))
|
||||||
|
|
||||||
|
|
||||||
@app.get("/", include_in_schema=False)
|
@app.get("/", include_in_schema=False)
|
||||||
async def index() -> FileResponse:
|
async def index() -> FileResponse:
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
export const v0_1_0 = VersionInfo.of({
|
export const v0_1_0 = VersionInfo.of({
|
||||||
version: '0.8.1:2',
|
version: '0.9.0:0',
|
||||||
releaseNotes: {
|
releaseNotes: {
|
||||||
en_US:
|
en_US:
|
||||||
'v0.8.1:2 — the primary card button now adapts to whether the model is on disk. If weights are present: green "Switch to this" (unchanged). If weights are NOT on disk: blue "Download" instead, which calls /api/download directly with the model\'s repo and the right mode (solo→Spark 1, cluster→both Sparks) — no more pasting the repo into the manual download form to re-fetch a deleted model. Re-installing a previously-deleted model is now one click + a confirmation. Builds on the disk-status pills + trash icons from 0.8.1.',
|
'v0.9.0 — OpenAI-compatible audio proxy. Spark Control now exposes /v1/audio/speech (TTS), /v1/audio/transcriptions (STT), and /v1/models on its own URL, translating OpenAI-shaped requests to Magpie (NVIDIA Riva multipart) and forwarding to Parakeet (already OpenAI-compatible). Open WebUI, Home Assistant, and any other OpenAI-compatible client can now point at https://<your-spark-control>.local/v1 and get TTS + STT + LLM all behind one identity — no shim service to deploy, no separate URLs to remember. /v1/models lists Magpie\'s 60+ voices across en-US, es-US, fr-FR, zh-CN, it-IT, hi-IN, vi-VN, ja-JP, de-DE so client UIs auto-populate their voice pickers. Falls back gracefully if Magpie is offline (still serves STT). Pure addition — no existing routes or endpoints changed.',
|
||||||
},
|
},
|
||||||
migrations: {
|
migrations: {
|
||||||
up: async ({ effects }) => {},
|
up: async ({ effects }) => {},
|
||||||
|
|||||||
Reference in New Issue
Block a user