# ten31-database/backend/ingest/llm.py

"""Local Qwen chat client via Spark Control /v1/chat/completions.

Used for the privacy-sensitive, high-volume reasoning that must stay on Ten31
infra (entity-resolution adjudication, triage). Frontier reasoning still goes to
Claude; this is the local leg. Thinking is disabled for fast structured output.
"""
import json
import re

import config
import http_util


def chat(prompt, system=None, max_tokens=200, temperature=0.0):
    """Send a single-turn prompt to the chat-completions endpoint and return the reply text.

    Args:
        prompt: Text sent as the user message.
        system: Optional system message; omitted from the request when falsy.
        max_tokens: Value forwarded as the request's ``max_tokens``.
        temperature: Value forwarded as the request's ``temperature``.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or ``""`` when that content is missing, ``None`` or empty.

    Raises:
        RuntimeError: If the endpoint answers with a status other than 200.
    """
    conversation = [{"role": "user", "content": prompt}]
    if system:
        # The system message, when given, goes ahead of the user turn.
        conversation.insert(0, {"role": "system", "content": system})

    payload = {
        "model": config.CHAT_MODEL,
        "messages": conversation,
        "temperature": temperature,
        "max_tokens": max_tokens,
        # Ask the chat template not to emit a thinking section.
        "chat_template_kwargs": {"enable_thinking": False},
    }
    endpoint = f"{config.SPARK_CONTROL_URL}/v1/chat/completions"
    status, data = http_util.request(
        "POST", endpoint, payload, verify=config.SPARK_VERIFY_TLS
    )
    if status != 200:
        raise RuntimeError(f"/v1/chat/completions -> {status}: {data}")

    reply = data["choices"][0]["message"].get("content")
    return reply.strip() if reply else ""


def chat_json(prompt, system=None, max_tokens=200):
    """Chat and parse the first JSON object found in the reply.

    The reply is scanned for ``{`` characters and the first position from
    which a complete JSON object decodes is used. Anything before it (prose,
    a Markdown code fence) and anything after it (closing fence, trailing
    commentary, further objects) is ignored.

    Args:
        prompt: Text sent as the user message.
        system: Optional system message passed through to ``chat``.
        max_tokens: Generation limit passed through to ``chat``.

    Returns:
        The decoded object as a ``dict``, or ``None`` when the reply contains
        no complete, valid JSON object (e.g. empty or truncated output).
    """
    raw = chat(prompt, system=system, max_tokens=max_tokens)
    decoder = json.JSONDecoder()
    start = raw.find("{")
    while start != -1:
        try:
            # raw_decode parses exactly one JSON value beginning at `start`
            # and tolerates trailing text, unlike a greedy "{...}" match that
            # would run on to the last "}" anywhere in the reply.
            parsed, _end = decoder.raw_decode(raw, start)
        except json.JSONDecodeError:
            # This brace did not open a valid object; try the next one.
            start = raw.find("{", start + 1)
        else:
            return parsed
    return None