spark-control/image/app/llm_proxy.py

"""OpenAI-compatible chat-completions proxy that forwards to the vLLM
process currently running on Spark 1.

Lets clients (Open WebUI, custom apps, etc.) use a single Spark Control
host for everything — same TLS cert, same allowlist, same place to add
rate limiting/observability later — instead of having to also reach
into <spark1-host>:8888 directly.

Endpoints:
  POST /v1/chat/completions   — OpenAI chat completions (streams when stream=true)
  POST /v1/completions        — OpenAI legacy completions (also stream-capable)

The proxy is intentionally dumb: forward the request body, stream the
response back. We don't parse or transform the OpenAI payload — vLLM
already speaks the same shape, and adding any transformation here would
create skew with the official OpenAI clients.
"""
from __future__ import annotations
import json
import logging
from typing import AsyncIterator

import httpx
from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import Response, StreamingResponse

from .config import Settings

logger = logging.getLogger("spark-control.llm")


# vLLM gets long for big-context completions; cap at 30 min to be safe.
DEFAULT_TIMEOUT = 1800.0


def build_router(settings: Settings) -> APIRouter:
    router = APIRouter()

    def _vllm_url(suffix: str) -> str:
        return f"http://{settings.spark1_host}:{settings.vllm_port}{suffix}"

    async def _proxy(request: Request, upstream_suffix: str) -> Response:
        if not settings.spark1_host:
            raise HTTPException(503, "Spark 1 host not configured")
        body = await request.body()
        # Determine whether the client requested streaming. vLLM returns SSE if
        # stream=true; otherwise a single JSON object. We must stream when the
        # client asked, otherwise FastAPI would buffer the entire response and
        # block until vLLM finishes generating (defeats the point of streaming).
        is_stream = False
        try:
            parsed = json.loads(body) if body else {}
            is_stream = bool(parsed.get("stream"))
        except Exception:
            pass

        # Forward content-type + accept headers; strip hop-by-hop headers.
        fwd_headers = {
            "Content-Type": request.headers.get("content-type", "application/json"),
        }
        if (accept := request.headers.get("accept")):
            fwd_headers["Accept"] = accept

        url = _vllm_url(upstream_suffix)

        if is_stream:
            # Stream the upstream response back chunk-by-chunk. We hold the
            # httpx connection open for the lifetime of the stream.
            async def passthrough() -> AsyncIterator[bytes]:
                async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
                    try:
                        async with client.stream(
                            "POST", url, content=body, headers=fwd_headers
                        ) as r:
                            if r.status_code != 200:
                                err_body = await r.aread()
                                logger.warning(
                                    "vllm %s returned %s: %s",
                                    upstream_suffix, r.status_code, err_body[:300]
                                )
                                # Emit a single SSE error event so the client's
                                # parser doesn't just hang on an empty stream.
                                yield (
                                    f"event: error\ndata: "
                                    f"{json.dumps({'status': r.status_code, 'detail': err_body[:500].decode(errors='replace')})}\n\n"
                                ).encode()
                                return
                            async for chunk in r.aiter_raw():
                                yield chunk
                    except httpx.HTTPError as e:
                        logger.exception("vllm stream failed: %s", e)
                        yield (
                            f"event: error\ndata: "
                            f"{json.dumps({'detail': f'vllm unreachable: {e}'})}\n\n"
                        ).encode()

            return StreamingResponse(passthrough(), media_type="text/event-stream")

        # Non-streaming: one POST, return the body verbatim.
        try:
            async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
                r = await client.post(url, content=body, headers=fwd_headers)
        except httpx.HTTPError as e:
            raise HTTPException(502, f"vllm unreachable: {e}")
        return Response(
            content=r.content,
            status_code=r.status_code,
            media_type=r.headers.get("content-type", "application/json"),
        )

    @router.post("/v1/chat/completions")
    async def chat_completions(request: Request) -> Response:
        """OpenAI chat-completions, forwarded to the vLLM on Spark 1.

        Request body is passed through unchanged — anything vLLM understands
        works here (model, messages, max_tokens, temperature, response_format,
        chat_template_kwargs, tools, tool_choice, ...).

        Streaming: set `stream: true` in the request body and we'll stream the
        SSE response from vLLM back through this proxy. Default 30-min timeout
        per request to accommodate large-context completions.
        """
        return await _proxy(request, "/v1/chat/completions")

    @router.post("/v1/completions")
    async def completions(request: Request) -> Response:
        """OpenAI legacy completions, forwarded to the vLLM on Spark 1."""
        return await _proxy(request, "/v1/completions")

    return router