diff --git a/image/app/audio_proxy.py b/image/app/audio_proxy.py new file mode 100644 index 0000000..c98244b --- /dev/null +++ b/image/app/audio_proxy.py @@ -0,0 +1,183 @@ +"""OpenAI-compatible audio proxy: lets any OpenAI-shaped client (Open WebUI, +Home Assistant, etc.) talk to Parakeet (STT) and Magpie (TTS) through one URL. + +Endpoints exposed on spark-control's port (same as the dashboard): + GET /v1/models — lists STT model + Magpie voices in OpenAI shape + POST /v1/audio/speech — OpenAI TTS → Magpie /v1/audio/synthesize + POST /v1/audio/transcriptions — forward to Parakeet (already OpenAI-compatible) + +Both downstream services already speak HTTP on the LAN; this module just adapts +request/response shapes so OpenAI clients don't need a custom integration. +""" +from __future__ import annotations +import logging +from typing import Optional + +import httpx +from fastapi import APIRouter, Form, HTTPException, Request, UploadFile, File +from fastapi.responses import Response, StreamingResponse +from pydantic import BaseModel + +from .config import Settings + +logger = logging.getLogger("spark-control.audio") + +# Magpie voice name encodes its language. Example: +# Magpie-Multilingual.EN-US.Mia -> en-US +# Magpie-Multilingual.ES-US.Diego -> es-US +# Magpie-Multilingual.FR-FR.Pascal -> fr-FR +def _lang_from_voice(voice: str) -> str: + try: + parts = voice.split(".") + # parts = ["Magpie-Multilingual", "EN-US", "Mia"] (or with emotion suffix) + if len(parts) >= 2 and "-" in parts[1]: + lang_part = parts[1] # "EN-US" + primary, region = lang_part.split("-", 1) + return f"{primary.lower()}-{region.upper()}" + except Exception: + pass + return "en-US" + + +# Default voice: configurable, falls back to a sensible English voice if unset. 
# Voice used when a request does not name one. This is a hard-coded constant;
# it is not read from Settings.
DEFAULT_VOICE = "Magpie-Multilingual.EN-US.Mia"


class SpeechRequest(BaseModel):
    """OpenAI /v1/audio/speech request body, plus Magpie pass-through fields."""
    model: Optional[str] = None              # accepted for compatibility; not forwarded
    input: str                               # the text to speak
    voice: Optional[str] = None              # e.g. "Magpie-Multilingual.EN-US.Mia"
    response_format: Optional[str] = "wav"   # accepted but not forwarded; upstream audio is returned as-is
    speed: Optional[float] = 1.0             # accepted for compatibility; not forwarded
    # Magpie-specific extensions (clients may pass these through)
    language: Optional[str] = None           # when unset, derived from the voice name
    sample_rate_hz: Optional[int] = 22050
    encoding: Optional[str] = "LINEAR_PCM"


def build_router(settings: Settings) -> APIRouter:
    """Build the OpenAI-compatible audio router.

    Registers three routes on a fresh ``APIRouter``:

    * ``GET  /v1/models`` — STT model entry plus voices reported by Magpie.
    * ``POST /v1/audio/speech`` — text-to-speech, forwarded to Magpie.
    * ``POST /v1/audio/transcriptions`` — speech-to-text, forwarded to Parakeet.

    Args:
        settings: Supplies ``parakeet_host``/``parakeet_port`` and
            ``magpie_host``/``magpie_port``; they are read on every request.

    Returns:
        The configured router, ready for ``app.include_router``.
    """
    router = APIRouter()

    def _parakeet_base() -> str:
        """Base URL of the Parakeet (STT) service."""
        return f"http://{settings.parakeet_host}:{settings.parakeet_port}"

    def _magpie_base() -> str:
        """Base URL of the Magpie (TTS) service."""
        return f"http://{settings.magpie_host}:{settings.magpie_port}"

    # ---- /v1/models ----
    @router.get("/v1/models")
    async def list_models() -> dict:
        """List the STT model and the Magpie base voices in OpenAI list shape.

        Voice discovery is best-effort: if Magpie cannot be reached, answers
        with a non-200 status, or returns an unexpected payload, a warning is
        logged and whatever entries were collected so far (always at least
        the STT entry) are returned.
        """
        data: list[dict] = [
            {
                "id": "parakeet-tdt-0.6b-v3",
                "object": "model",
                "owned_by": "nvidia",
                "kind": "stt",
            },
        ]
        # Deliberately broad: this endpoint must keep answering even when the
        # TTS backend is down or misbehaving.
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                r = await client.get(f"{_magpie_base()}/v1/audio/list_voices")
            if r.status_code != 200:
                logger.warning("magpie voice list returned HTTP %s", r.status_code)
            else:
                seen: set[str] = set()
                for payload in r.json().values():
                    if not isinstance(payload, dict):
                        continue  # skip a malformed locale entry, keep the rest
                    for v in payload.get("voices", []):
                        if not isinstance(v, str):
                            continue
                        # Collapse emotion variants — expose only the base voice name.
                        # "Magpie-Multilingual.EN-US.Mia.Angry" -> "Magpie-Multilingual.EN-US.Mia"
                        # (names with fewer than three segments are kept unchanged)
                        base = ".".join(v.split(".")[:3])
                        if base in seen:
                            continue
                        seen.add(base)
                        data.append({
                            "id": base,
                            "object": "model",
                            "owned_by": "nvidia",
                            "kind": "tts",
                        })
        except Exception as e:
            logger.warning("magpie voice list unavailable: %s", e)
        return {"object": "list", "data": data}

    # ---- /v1/audio/speech (TTS) ----
    @router.post("/v1/audio/speech")
    async def speech(body: SpeechRequest) -> Response:
        """OpenAI-style TTS, translated to Magpie's synthesize call.

        The request is mapped to the form fields ``text``, ``language``,
        ``voice``, ``sample_rate_hz`` and ``encoding`` and posted to
        ``/v1/audio/synthesize``. The upstream body is returned unchanged,
        with the upstream Content-Type (``audio/wav`` if none was sent).

        Raises:
            HTTPException: 400 when ``input`` is empty or whitespace-only;
                502 when the request to Magpie fails at the transport level;
                otherwise Magpie's own status code with the first 500
                characters of its response text when it does not answer 200.
        """
        text = (body.input or "").strip()
        if not text:
            raise HTTPException(400, "input text is required")

        voice = body.voice or DEFAULT_VOICE
        language = body.language or _lang_from_voice(voice)
        sample_rate = int(body.sample_rate_hz or 22050)
        encoding = body.encoding or "LINEAR_PCM"

        form = {
            "text": text,
            "language": language,
            "voice": voice,
            "sample_rate_hz": str(sample_rate),
            "encoding": encoding,
        }
        try:
            async with httpx.AsyncClient(timeout=120.0) as client:
                r = await client.post(f"{_magpie_base()}/v1/audio/synthesize", data=form)
        except httpx.HTTPError as e:
            # Chain the cause so the original transport error stays in tracebacks.
            raise HTTPException(502, f"magpie unreachable: {e}") from e

        if r.status_code != 200:
            # Surface Magpie's error message verbatim so clients can debug voice/lang typos.
            raise HTTPException(r.status_code, r.text[:500])

        media_type = r.headers.get("content-type", "audio/wav")
        return Response(content=r.content, media_type=media_type)

    # ---- /v1/audio/transcriptions (STT) ----
    @router.post("/v1/audio/transcriptions")
    async def transcriptions(
        file: UploadFile = File(...),
        model: Optional[str] = Form(default=None),
        language: Optional[str] = Form(default=None),
        prompt: Optional[str] = Form(default=None),
        response_format: Optional[str] = Form(default="json"),
        temperature: Optional[float] = Form(default=None),
    ) -> Response:
        """Relay an OpenAI-style transcription request to Parakeet.

        The uploaded file is read fully into memory and re-posted as a
        multipart upload together with whichever optional form fields were
        supplied. We relay rather than redirect so clients only need to know
        one URL, and so request-shape rewrites can live here later.

        Raises:
            HTTPException: 502 when the request to Parakeet fails at the
                transport level; otherwise Parakeet's own status code with
                the first 500 characters of its response text when it does
                not answer 200.
        """
        audio = await file.read()
        files = {
            "file": (
                file.filename or "audio.wav",
                audio,
                file.content_type or "application/octet-stream",
            ),
        }
        # Forward only the text fields that carry a value.
        optional_fields = (
            ("model", model),
            ("language", language),
            ("prompt", prompt),
            ("response_format", response_format),
        )
        data: dict[str, str] = {key: value for key, value in optional_fields if value}
        # 0.0 is a meaningful temperature, so test against None, not truthiness.
        if temperature is not None:
            data["temperature"] = str(temperature)

        try:
            async with httpx.AsyncClient(timeout=300.0) as client:
                r = await client.post(
                    f"{_parakeet_base()}/v1/audio/transcriptions",
                    files=files, data=data,
                )
        except httpx.HTTPError as e:
            raise HTTPException(502, f"parakeet unreachable: {e}") from e

        if r.status_code != 200:
            raise HTTPException(r.status_code, r.text[:500])
        return Response(
            content=r.content,
            media_type=r.headers.get("content-type", "application/json"),
        )

    return router
.custom_services import add_custom_service, delete_custom_service +from .audio_proxy import build_router as build_audio_router from .deep_health import DeepHealth from .disk import delete_from_disk, probe_disk from .download import DownloadManager @@ -54,6 +55,11 @@ async def _stop_deep_health() -> None: _STATIC_DIR = Path(__file__).resolve().parent / "static" app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static") +# OpenAI-compatible audio proxy: /v1/audio/speech, /v1/audio/transcriptions, /v1/models. +# Lets Open WebUI, Home Assistant, and any other OpenAI-shaped client talk to +# Parakeet (STT) and Magpie (TTS) through a single spark-control URL. +app.include_router(build_audio_router(settings)) + @app.get("/", include_in_schema=False) async def index() -> FileResponse: diff --git a/package/startos/versions/v0_1_0.ts b/package/startos/versions/v0_1_0.ts index 707b872..be2c958 100644 --- a/package/startos/versions/v0_1_0.ts +++ b/package/startos/versions/v0_1_0.ts @@ -1,10 +1,10 @@ import { VersionInfo, IMPOSSIBLE } from '@start9labs/start-sdk' export const v0_1_0 = VersionInfo.of({ - version: '0.8.1:2', + version: '0.9.0:0', releaseNotes: { en_US: - 'v0.8.1:2 — the primary card button now adapts to whether the model is on disk. If weights are present: green "Switch to this" (unchanged). If weights are NOT on disk: blue "Download" instead, which calls /api/download directly with the model\'s repo and the right mode (solo→Spark 1, cluster→both Sparks) — no more pasting the repo into the manual download form to re-fetch a deleted model. Re-installing a previously-deleted model is now one click + a confirmation. Builds on the disk-status pills + trash icons from 0.8.1.', + 'v0.9.0 — OpenAI-compatible audio proxy. Spark Control now exposes /v1/audio/speech (TTS), /v1/audio/transcriptions (STT), and /v1/models on its own URL, translating OpenAI-shaped requests to Magpie (NVIDIA Riva multipart) and forwarding to Parakeet (already OpenAI-compatible). 
Open WebUI, Home Assistant, and any other OpenAI-compatible client can now point at https://.local/v1 and get TTS + STT + LLM all behind one identity — no shim service to deploy, no separate URLs to remember. /v1/models lists Magpie\'s 60+ voices across en-US, es-US, fr-FR, zh-CN, it-IT, hi-IN, vi-VN, ja-JP, de-DE so client UIs auto-populate their voice pickers. Falls back gracefully if Magpie is offline (still serves STT). Pure addition — no existing routes or endpoints changed.', }, migrations: { up: async ({ effects }) => {},