v0.13.0:3 - proxy /v1/chat/completions through Spark Control to vLLM

Recap Relay dev caught that all audio endpoints route through Spark Control but chat-completions didn't — clients had to know about both SC AND the direct vLLM URL on Spark 1. Closes that last gap.

New endpoints:
  POST /v1/chat/completions — OpenAI-shape, forwards to vLLM on Spark 1
  POST /v1/completions — legacy OpenAI completions, same path

Implementation (image/app/llm_proxy.py):
- Dumb forwarder: request body passed through verbatim, response body streamed back chunk-by-chunk. No transformation. vLLM already speaks the same shape; adding any logic here would just create skew.
- Streaming: parses body for `stream: true` and uses httpx.AsyncClient.stream() + FastAPI StreamingResponse if so. Non-streaming path is a simple post-and-return.
- 30-minute timeout to accommodate large-context completions (default httpx 5s would kill anything substantial).
- On upstream non-200 in streaming mode: emits one SSE `error` event so the client's parser doesn't hang on an empty stream forever.
- On upstream connection error (non-streaming mode): HTTP 502 with "vllm unreachable" detail.

Now clients can use ONE host for everything:
  POST https://spark-control/api/audio/diarize-chunk
  POST https://spark-control/v1/audio/transcriptions
  POST https://spark-control/v1/chat/completions
  GET https://spark-control/api/endpoints (still works for clients that prefer the direct URLs)

No parakeet container changes. No Reapply patches needed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 19:58:19 -05:00
parent c7f94381e7
commit 4a75274db3
3 changed files with 139 additions and 2 deletions
@@ -0,0 +1,130 @@
+"""OpenAI-compatible chat-completions proxy that forwards to the vLLM
+process currently running on Spark 1.
+
+Lets clients (recap-relay, Open WebUI, etc.) use a single Spark Control
+host for everything — same TLS cert, same allowlist, same place to add
+rate limiting/observability later — instead of having to also reach
+into <spark-1-ip>:8888 directly.
+
+Endpoints:
+  POST /v1/chat/completions   — OpenAI chat completions (streams when stream=true)
+  POST /v1/completions        — OpenAI legacy completions (also stream-capable)
+
+The proxy is intentionally dumb: forward the request body, stream the
+response back. The body is parsed only to read its `stream` flag and is
+never transformed — vLLM already speaks the same shape, and rewriting the
+payload here would create skew with the official OpenAI clients.
+"""
+from __future__ import annotations
+import json
+import logging
+from typing import AsyncIterator
+
+import httpx
+from fastapi import APIRouter, HTTPException, Request
+from fastapi.responses import Response, StreamingResponse
+
+from .config import Settings
+
+logger = logging.getLogger("spark-control.llm")
+
+
+# vLLM can take a long time on big-context completions; cap at 30 min to be safe.
+DEFAULT_TIMEOUT = 1800.0
+
+
+def build_router(settings: Settings) -> APIRouter:
+    """Build the router exposing the OpenAI-compatible vLLM proxy endpoints.
+
+    Reads `settings.spark1_host` and `settings.vllm_port` on each request to
+    form the upstream URL; returns an APIRouter with both POST routes
+    (/v1/chat/completions and /v1/completions) registered.
+    """
+    router = APIRouter()
+    # Keep the long budget for reads (generation is slow) but fail fast when
+    # Spark 1 is down: a bare float would also allow a 30-minute connect hang.
+    timeout = httpx.Timeout(DEFAULT_TIMEOUT, connect=10.0)
+
+    def _vllm_url(suffix: str) -> str:
+        """Return the upstream vLLM URL for the path `suffix`."""
+        return f"http://{settings.spark1_host}:{settings.vllm_port}{suffix}"
+
+    def _wants_stream(body: bytes) -> bool:
+        """Return True when `body` is a JSON object with a truthy `stream`."""
+        try:
+            parsed = json.loads(body) if body else {}
+        except (ValueError, RecursionError):
+            # Not JSON, not decodable, or absurdly nested: forward the body
+            # untouched on the non-streaming path and let vLLM reject it.
+            return False
+        # Valid JSON that is not an object (list, string, ...) has no flag.
+        return isinstance(parsed, dict) and bool(parsed.get("stream"))
+
+    async def _proxy(request: Request, upstream_suffix: str) -> Response:
+        """Forward `request` verbatim to vLLM at `upstream_suffix`.
+
+        Streams the upstream bytes back as `text/event-stream` when the body
+        sets `stream`; otherwise relays the upstream status code, body and
+        content type in a single response.
+
+        Raises:
+            HTTPException: 503 if no Spark 1 host is configured; 502 if the
+                non-streaming upstream call fails with an httpx error.
+        """
+        if not settings.spark1_host:
+            raise HTTPException(503, "Spark 1 host not configured")
+        body = await request.body()
+
+        # Only content-type and accept are forwarded; every other client
+        # header (hop-by-hop ones included) is dropped.
+        fwd_headers = {
+            "Content-Type": request.headers.get("content-type", "application/json"),
+        }
+        if (accept := request.headers.get("accept")):
+            fwd_headers["Accept"] = accept
+
+        url = _vllm_url(upstream_suffix)
+
+        # We must stream when the client asked for it: a plain Response would
+        # buffer the whole completion and block until vLLM finishes
+        # generating (defeats the point of streaming).
+        if _wants_stream(body):
+            async def passthrough() -> AsyncIterator[bytes]:
+                """Yield upstream chunks, or a single SSE `error` event."""
+                # The httpx connection stays open for the stream's lifetime.
+                async with httpx.AsyncClient(timeout=timeout) as client:
+                    try:
+                        async with client.stream(
+                            "POST", url, content=body, headers=fwd_headers
+                        ) as r:
+                            if r.status_code != 200:
+                                err_body = await r.aread()
+                                logger.warning(
+                                    "vllm %s returned %s: %s",
+                                    upstream_suffix, r.status_code, err_body[:300]
+                                )
+                                # Emit a single SSE error event so the client's
+                                # parser doesn't just hang on an empty stream.
+                                payload = json.dumps({
+                                    "status": r.status_code,
+                                    "detail": err_body[:500].decode(errors="replace"),
+                                })
+                                yield f"event: error\ndata: {payload}\n\n".encode()
+                                return
+                            # aiter_bytes, not aiter_raw: httpx advertises
+                            # compression by default and Content-Encoding is
+                            # not relayed, so chunks must be decoded here.
+                            async for chunk in r.aiter_bytes():
+                                yield chunk
+                    except httpx.HTTPError as e:
+                        logger.exception("vllm stream failed: %s", e)
+                        payload = json.dumps({"detail": f"vllm unreachable: {e}"})
+                        yield f"event: error\ndata: {payload}\n\n".encode()
+
+            return StreamingResponse(passthrough(), media_type="text/event-stream")
+
+        # Non-streaming: one POST, return the body verbatim.
+        try:
+            async with httpx.AsyncClient(timeout=timeout) as client:
+                r = await client.post(url, content=body, headers=fwd_headers)
+        except httpx.HTTPError as e:
+            # Chain the cause so the original httpx error shows in tracebacks.
+            raise HTTPException(502, f"vllm unreachable: {e}") from e
+        return Response(
+            content=r.content,
+            status_code=r.status_code,
+            media_type=r.headers.get("content-type", "application/json"),
+        )
+
+    @router.post("/v1/chat/completions")
+    async def chat_completions(request: Request) -> Response:
+        """OpenAI chat-completions, forwarded to the vLLM on Spark 1.
+
+        Request body is passed through unchanged — anything vLLM understands
+        works here (model, messages, max_tokens, temperature, response_format,
+        chat_template_kwargs, tools, tool_choice, ...).
+
+        Streaming: set `stream: true` in the request body and we'll stream the
+        SSE response from vLLM back through this proxy. Default 30-min timeout
+        per request (10 s to connect) to accommodate large-context completions.
+        """
+        return await _proxy(request, "/v1/chat/completions")
+
+    @router.post("/v1/completions")
+    async def completions(request: Request) -> Response:
+        """OpenAI legacy completions, forwarded to the vLLM on Spark 1."""
+        return await _proxy(request, "/v1/completions")
+
+    return router
@@ -16,6 +16,7 @@ from .audio_proxy import build_router as build_audio_router
 from .deep_health import DeepHealth
 from .disk import delete_from_disk, probe_disk
 from .download import DownloadManager
+from .llm_proxy import build_router as build_llm_router
 from .hardware import HardwareProbe
 from .health import check_magpie, check_parakeet, check_vllm
 from .models import load_catalog
@@ -64,6 +65,12 @@ app.mount("/static", StaticFiles(directory=_STATIC_DIR), name="static")
 # when Parakeet returns 500, instead of waiting up to 5 min for the periodic probe.
 app.include_router(build_audio_router(settings, deep_health=deep_health))

+# OpenAI-compatible LLM proxy: /v1/chat/completions, /v1/completions.
+# Forwards to whatever vLLM is currently running on Spark 1 (per the LLM swap
+# state). Supports SSE streaming when stream=true. Same trusted-host model
+# as the audio proxy — clients only need one URL for everything.
+app.include_router(build_llm_router(settings))
+

@app.get("/", include_in_schema=False)
 async def index() -> FileResponse: