ten31-database/backend/ingest/llm.py

"""Local Qwen chat client via Spark Control /v1/chat/completions.

Used for the privacy-sensitive, high-volume reasoning that must stay on Ten31
infra (entity-resolution adjudication, triage). Frontier reasoning still goes to
Claude; this is the local leg. Thinking is disabled for fast structured output.
"""
import json
import re

import config
import http_util


def chat(prompt, system=None, max_tokens=200, temperature=0.0):
    """Send a single text prompt to the local chat model and return its reply.

    The request is POSTed to ``{config.SPARK_CONTROL_URL}/v1/chat/completions``
    using ``config.CHAT_MODEL``, with thinking switched off through
    ``chat_template_kwargs``.

    Args:
        prompt: Text of the user message.
        system: Optional system message; omitted from the request when falsy.
        max_tokens: Value sent as the request's ``max_tokens``.
        temperature: Value sent as the request's ``temperature``.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or ``""`` when the content is missing or empty.

    Raises:
        RuntimeError: If the endpoint answers with a status other than 200.
    """
    user_turn = {"role": "user", "content": prompt}
    # The system turn, when present, must precede the user turn.
    leading_turns = [{"role": "system", "content": system}] if system else []
    payload = {
        "model": config.CHAT_MODEL,
        "messages": [*leading_turns, user_turn],
        "temperature": temperature,
        "max_tokens": max_tokens,
        "chat_template_kwargs": {"enable_thinking": False},
    }
    endpoint = f"{config.SPARK_CONTROL_URL}/v1/chat/completions"
    status, data = http_util.request("POST", endpoint, payload,
                                     verify=config.SPARK_VERIFY_TLS)
    if status != 200:
        raise RuntimeError(f"/v1/chat/completions -> {status}: {data}")
    reply = data["choices"][0]["message"].get("content")
    # A null or empty content field is normalised to an empty string.
    return reply.strip() if reply else ""


def chat_vision(prompt, image_b64, mime="image/jpeg", system=None, max_tokens=600, temperature=0.0):
    """Ask the local vision-capable model about one image and return its reply.

    Uses the same endpoint and model as chat(); the only difference is that the
    user message's `content` is the OpenAI multimodal array: a text part plus
    an image_url part carrying the image as a base64 data-URI. Spark Control
    passes that through to vLLM untouched, and the server downscales to its
    max_pixels cap, so a full-resolution phone photo is acceptable. Thinking
    stays disabled for fast, literal output.

    Args:
        prompt: Text part of the user message.
        image_b64: Base64-encoded image data, embedded in the data-URI as-is.
        mime: MIME type written into the data-URI.
        system: Optional system message; omitted from the request when falsy.
        max_tokens: Value sent as the request's ``max_tokens``.
        temperature: Value sent as the request's ``temperature``.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or ``""`` when the content is missing or empty.

    Raises:
        RuntimeError: If the endpoint answers with a status other than 200.
    """
    data_uri = f"data:{mime};base64,{image_b64}"
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": data_uri}},
        ],
    }
    # The system turn, when present, must precede the user turn.
    leading_turns = [{"role": "system", "content": system}] if system else []
    payload = {
        "model": config.CHAT_MODEL,
        "messages": [*leading_turns, user_turn],
        "temperature": temperature,
        "max_tokens": max_tokens,
        "chat_template_kwargs": {"enable_thinking": False},
    }
    endpoint = f"{config.SPARK_CONTROL_URL}/v1/chat/completions"
    status, data = http_util.request("POST", endpoint, payload,
                                     verify=config.SPARK_VERIFY_TLS)
    if status != 200:
        raise RuntimeError(f"/v1/chat/completions (vision) -> {status}: {data}")
    reply = data["choices"][0]["message"].get("content")
    # A null or empty content field is normalised to an empty string.
    return reply.strip() if reply else ""


def chat_json(prompt, system=None, max_tokens=200):
    """Chat and parse the first JSON object from the reply (tolerant of fences).

    The reply is obtained from chat() with the given arguments. Markdown code
    fences are stripped, then decoding starts at the first ``{`` and stops at
    the end of that object, so prose or further braces *after* the object do
    not spoil the parse.

    Only the first ``{`` is tried: if the text starting there is not a
    complete JSON object (for example, a reply cut off by ``max_tokens``), the
    result is ``None`` rather than some nested fragment found further along.

    Args:
        prompt: Text of the user message, forwarded to chat().
        system: Optional system message, forwarded to chat().
        max_tokens: Forwarded to chat().

    Returns:
        The decoded JSON object as a dict, or ``None`` when the reply contains
        no ``{`` or the text from the first ``{`` is not valid JSON.
    """
    raw = chat(prompt, system=system, max_tokens=max_tokens)
    raw = re.sub(r"^```(json)?|```$", "", raw.strip(), flags=re.MULTILINE).strip()
    start = raw.find("{")
    if start < 0:
        return None
    try:
        # raw_decode parses one JSON document from the head of the string and
        # ignores whatever follows, unlike json.loads which rejects extra data.
        obj, _consumed = json.JSONDecoder().raw_decode(raw[start:])
    except json.JSONDecodeError:
        return None
    return obj