"""Local Qwen chat client via Spark Control /v1/chat/completions. Used for the privacy-sensitive, high-volume reasoning that must stay on Ten31 infra (entity-resolution adjudication, triage). Frontier reasoning still goes to Claude; this is the local leg. Thinking is disabled for fast structured output. """ import json import re import config import http_util def chat(prompt, system=None, max_tokens=200, temperature=0.0): messages = [] if system: messages.append({"role": "system", "content": system}) messages.append({"role": "user", "content": prompt}) body = {"model": config.CHAT_MODEL, "messages": messages, "temperature": temperature, "max_tokens": max_tokens, "chat_template_kwargs": {"enable_thinking": False}} status, data = http_util.request("POST", f"{config.SPARK_CONTROL_URL}/v1/chat/completions", body, verify=config.SPARK_VERIFY_TLS) if status != 200: raise RuntimeError(f"/v1/chat/completions -> {status}: {data}") return (data["choices"][0]["message"].get("content") or "").strip() def chat_json(prompt, system=None, max_tokens=200): """Chat and parse the first JSON object from the reply (tolerant of fences).""" raw = chat(prompt, system=system, max_tokens=max_tokens) raw = re.sub(r"^```(json)?|```$", "", raw.strip(), flags=re.MULTILINE).strip() m = re.search(r"\{.*\}", raw, re.DOTALL) if not m: return None try: return json.loads(m.group(0)) except json.JSONDecodeError: return None