f44e7f8b03
Adds three new endpoints to spark-control that translate OpenAI's
audio API shapes to the Parakeet (STT) and Magpie (TTS, NVIDIA Riva)
services on the Sparks:
GET /v1/models — STT model + Magpie's 60+ voices
POST /v1/audio/speech — OpenAI body -> Magpie multipart synthesize
(returns audio/wav passthrough)
POST /v1/audio/transcriptions — relay to Parakeet (already compatible)
Verified shapes against the live services:
- Parakeet returns OpenAI-style {"text": "..."} or verbose_json with
segments+words. Already a perfect drop-in for OpenAI clients.
- Magpie returns raw WAV bytes with Content-Type: audio/wav. NOT
base64-wrapped JSON as one might assume. The proxy is literally a
body-translation on the request side; response is passthrough.
Voice language is auto-derived from the voice name (e.g.
Magpie-Multilingual.EN-US.Mia -> language=en-US) so clients don't
need to set it explicitly.
Open WebUI / Home Assistant / Recap Relay can now all point at one
URL — https://<spark-control>.local/v1 — and get LLM, STT, TTS
behind a single identity. No shim service to deploy.
Pure addition: no existing routes touched; the dashboard, /api/*,
download flow, deep-health, hardware probes are all unchanged.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
184 lines
7.6 KiB
Python
184 lines
7.6 KiB
Python
"""OpenAI-compatible audio proxy: lets any OpenAI-shaped client (Open WebUI,
|
|
Home Assistant, etc.) talk to Parakeet (STT) and Magpie (TTS) through one URL.
|
|
|
|
Endpoints exposed on spark-control's port (same as the dashboard):
|
|
GET /v1/models — lists STT model + Magpie voices in OpenAI shape
|
|
POST /v1/audio/speech — OpenAI TTS → Magpie /v1/audio/synthesize
|
|
POST /v1/audio/transcriptions — forward to Parakeet (already OpenAI-compatible)
|
|
|
|
Both downstream services already speak HTTP on the LAN; this module just adapts
|
|
request/response shapes so OpenAI clients don't need a custom integration.
|
|
"""
|
|
from __future__ import annotations
|
|
import logging
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
from fastapi import APIRouter, Form, HTTPException, Request, UploadFile, File
|
|
from fastapi.responses import Response, StreamingResponse
|
|
from pydantic import BaseModel
|
|
|
|
from .config import Settings
|
|
|
|
logger = logging.getLogger("spark-control.audio")
|
|
|
|
# Magpie voice name encodes its language. Example:
|
|
# Magpie-Multilingual.EN-US.Mia -> en-US
|
|
# Magpie-Multilingual.ES-US.Diego -> es-US
|
|
# Magpie-Multilingual.FR-FR.Pascal -> fr-FR
|
|
def _lang_from_voice(voice: str) -> str:
    """Derive a language tag from a Magpie voice name.

    Magpie voice names carry the locale as their second dot-separated
    component, e.g. ``Magpie-Multilingual.EN-US.Mia`` -> ``en-US``. Any
    further components (such as an emotion suffix) are ignored.

    Args:
        voice: Voice name as supplied by the client.

    Returns:
        ``"<primary>-<REGION>"`` with the primary subtag lower-cased and the
        region upper-cased. Falls back to ``"en-US"`` when ``voice`` is not a
        string or its second component is not a well-formed
        ``<primary>-<region>`` pair (missing, no hyphen, or an empty half).
    """
    fallback = "en-US"
    # Explicit type guard instead of a blanket try/except: anything that is
    # not a string simply gets the fallback language.
    if not isinstance(voice, str):
        return fallback

    components = voice.split(".")
    if len(components) < 2:
        return fallback

    # Split on the first hyphen only, so "EN-US-X" keeps "US-X" as region.
    primary, hyphen, region = components[1].partition("-")
    primary, region = primary.strip(), region.strip()
    # Reject half-empty locales such as "EN-" or "-US"; they would otherwise
    # be forwarded downstream as malformed tags ("en-" / "-US").
    if not (hyphen and primary and region):
        return fallback
    return f"{primary.lower()}-{region.upper()}"
|
|
|
|
|
|
# Default voice: used whenever a request does not name one (hard-coded here; a sensible English voice).
|
|
DEFAULT_VOICE = "Magpie-Multilingual.EN-US.Mia"  # <family>.<LOCALE>.<speaker>; the locale part maps to en-US
|
|
|
|
|
|
class SpeechRequest(BaseModel):
    """Request body for POST /v1/audio/speech, in OpenAI's TTS shape.

    Only ``input`` is required. ``model``, ``response_format`` and ``speed``
    are accepted so OpenAI clients validate, but the speech handler does not
    forward them to Magpie.
    """

    # Ignored — Magpie has one model.
    model: Optional[str] = None
    # The text to speak.
    input: str
    # Magpie voice name, e.g. "Magpie-Multilingual.EN-US.Mia".
    voice: Optional[str] = None
    # Only "wav" is supported today.
    response_format: Optional[str] = "wav"
    # Ignored by Magpie.
    speed: Optional[float] = 1.0

    # --- Magpie-specific extensions (clients may pass these through) ---
    language: Optional[str] = None
    sample_rate_hz: Optional[int] = 22050
    encoding: Optional[str] = "LINEAR_PCM"
|
|
|
|
|
|
def build_router(settings: Settings) -> APIRouter:
    """Build the router exposing the OpenAI-compatible audio endpoints.

    Routes registered:
        GET  /v1/models               - STT model plus Magpie voices
        POST /v1/audio/speech         - TTS via Magpie /v1/audio/synthesize
        POST /v1/audio/transcriptions - STT relayed to Parakeet

    Args:
        settings: Supplies ``parakeet_host``/``parakeet_port`` and
            ``magpie_host``/``magpie_port`` used to reach the two services.

    Returns:
        An ``APIRouter`` with the three routes attached.
    """
    router = APIRouter()

    def _parakeet_base() -> str:
        """Return the base URL of the Parakeet (STT) service."""
        return f"http://{settings.parakeet_host}:{settings.parakeet_port}"

    def _magpie_base() -> str:
        """Return the base URL of the Magpie (TTS) service."""
        return f"http://{settings.magpie_host}:{settings.magpie_port}"

    # ---- /v1/models ----
    @router.get("/v1/models")
    async def list_models() -> dict:
        """List the STT model plus Magpie's base voices in OpenAI list shape.

        Clients use this to populate model/voice pickers. The endpoint is
        best-effort with respect to Magpie: if the voice list cannot be
        fetched or parsed, the failure is logged and only the entries
        collected so far (at minimum the STT model) are returned.
        """
        data: list[dict] = [
            {
                "id": "parakeet-tdt-0.6b-v3",
                "object": "model",
                "owned_by": "nvidia",
                "kind": "stt",
            },
        ]
        # Deliberately broad catch: a missing voice menu must never fail the
        # whole /v1/models call.
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                r = await client.get(f"{_magpie_base()}/v1/audio/list_voices")
            if r.status_code == 200:
                seen: set[str] = set()
                for payload in r.json().values():
                    for v in payload.get("voices", []):
                        # Collapse emotion variants — expose only the base voice name.
                        # "Magpie-Multilingual.EN-US.Mia.Angry" -> "Magpie-Multilingual.EN-US.Mia"
                        base = ".".join(v.split(".")[:3])
                        if base in seen:
                            continue
                        seen.add(base)
                        data.append({
                            "id": base,
                            "object": "model",
                            "owned_by": "nvidia",
                            "kind": "tts",
                        })
            else:
                # Previously dropped silently; log it so a misbehaving Magpie
                # is distinguishable from one that simply has no voices.
                logger.warning("magpie voice list returned HTTP %s", r.status_code)
        except Exception as e:
            logger.warning("magpie voice list unavailable: %s", e)
        return {"object": "list", "data": data}

    # ---- /v1/audio/speech (TTS) ----
    @router.post("/v1/audio/speech")
    async def speech(body: SpeechRequest) -> Response:
        """OpenAI-style TTS: translate the JSON body into Magpie's synth form.

        The voice defaults to ``DEFAULT_VOICE`` and the language is derived
        from the voice name unless the request sets it explicitly.

        Returns:
            Magpie's response body unchanged, with its Content-Type
            (``audio/wav`` if Magpie sends none).

        Raises:
            HTTPException: 400 when ``input`` is empty or whitespace; 502
                when Magpie cannot be reached; Magpie's own status code
                (with up to 500 characters of its body) on a non-200 reply.
        """
        text = (body.input or "").strip()
        if not text:
            raise HTTPException(400, "input text is required")

        voice = body.voice or DEFAULT_VOICE
        form = {
            "text": text,
            "language": body.language or _lang_from_voice(voice),
            "voice": voice,
            "sample_rate_hz": str(int(body.sample_rate_hz or 22050)),
            "encoding": body.encoding or "LINEAR_PCM",
        }
        # NOTE(review): httpx encodes `data=` without `files=` as
        # application/x-www-form-urlencoded rather than multipart/form-data —
        # confirm Magpie is meant to receive this encoding.
        try:
            async with httpx.AsyncClient(timeout=120.0) as client:
                r = await client.post(f"{_magpie_base()}/v1/audio/synthesize", data=form)
        except httpx.HTTPError as e:
            # Chain the cause so the transport error stays in the traceback.
            raise HTTPException(502, f"magpie unreachable: {e}") from e

        if r.status_code != 200:
            # Surface Magpie's error message verbatim so clients can debug voice/lang typos.
            raise HTTPException(r.status_code, r.text[:500])

        # Magpie returns WAV bytes already (Content-Type: audio/wav). Pass through.
        return Response(
            content=r.content,
            media_type=r.headers.get("content-type", "audio/wav"),
        )

    # ---- /v1/audio/transcriptions (STT) ----
    @router.post("/v1/audio/transcriptions")
    async def transcriptions(
        file: UploadFile = File(...),
        model: Optional[str] = Form(default=None),
        language: Optional[str] = Form(default=None),
        prompt: Optional[str] = Form(default=None),
        response_format: Optional[str] = Form(default="json"),
        temperature: Optional[float] = Form(default=None),
    ) -> Response:
        """Relay a transcription request to Parakeet's OpenAI-compatible endpoint.

        We relay rather than redirect so clients only need to know one URL
        (spark-control's), and so any future rewrites of the request shape
        can happen here.

        Returns:
            Parakeet's response body unchanged, with its Content-Type
            (``application/json`` if Parakeet sends none).

        Raises:
            HTTPException: 502 when Parakeet cannot be reached; Parakeet's
                own status code (with up to 500 characters of its body) on a
                non-200 reply.
        """
        audio = await file.read()
        files = {
            "file": (
                file.filename or "audio.wav",
                audio,
                file.content_type or "application/octet-stream",
            ),
        }
        # Forward only the optional text fields the client actually set.
        optional_fields = {
            "model": model,
            "language": language,
            "prompt": prompt,
            "response_format": response_format,
        }
        data: dict[str, str] = {k: v for k, v in optional_fields.items() if v}
        # 0.0 is a meaningful temperature, so test against None, not truthiness.
        if temperature is not None:
            data["temperature"] = str(temperature)

        try:
            async with httpx.AsyncClient(timeout=300.0) as client:
                r = await client.post(
                    f"{_parakeet_base()}/v1/audio/transcriptions",
                    files=files,
                    data=data,
                )
        except httpx.HTTPError as e:
            raise HTTPException(502, f"parakeet unreachable: {e}") from e

        if r.status_code != 200:
            raise HTTPException(r.status_code, r.text[:500])
        return Response(
            content=r.content,
            media_type=r.headers.get("content-type", "application/json"),
        )

    return router
|