ten31-signal-engine/signal_engine/extract/backends.py

"""Pluggable extraction backends (§scaling).

The §4.2 extractor calls a backend that turns chat messages into a JSON string. Default is the
LOCAL Qwen via Spark Control (the ~95%-local design). The Gemini backend is the documented
overflow/fallback for bulk back-cataloging at scale, or if the Sparks are unavailable — used for
the PUBLIC corpus only, never conviction/exposure data (sovereignty boundary, §4.6).

A backend exposes: complete_json(messages, max_tokens) -> str  (a JSON object string).
"""
from __future__ import annotations

import logging

log = logging.getLogger(__name__)


class LocalQwenBackend:
    """Default extraction backend: forwards the chat to the injected `sc` client.

    `sc` is presumably the Spark Control client fronting the local Qwen model (per the module
    docstring); this class only requires that it expose a `chat(...)` method.
    """

    name = "local"

    def __init__(self, sc) -> None:
        """Keep a reference to the client used for every completion."""
        self.sc = sc

    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
        """Return the content of the first choice for `messages`.

        The request asks for a JSON object at temperature 0 with thinking disabled.

        Args:
            messages: Chat messages, passed to `sc.chat` unchanged.
            max_tokens: Output budget forwarded to `sc.chat`.

        Returns:
            `choices[0].message.content` from the client's response.
        """
        reply = self.sc.chat(
            messages,
            json_object=True,
            temperature=0,
            enable_thinking=False,
            max_tokens=max_tokens,
        )
        first_choice = reply["choices"][0]
        return first_choice["message"]["content"]


class GeminiBackend:
    """Gemini fallback/overflow backend, implemented against the `google-genai` SDK.

    NOTE: untested until a key is provided — validate end-to-end before relying on it for a real
    backfill. The async BATCH API is the eventual scale path; this synchronous form is the drop-in
    fallback.
    """

    name = "gemini"

    def __init__(self, api_key: str, model: str = "gemini-2.5-flash") -> None:
        """Build the SDK client.

        Args:
            api_key: Gemini API key handed to `genai.Client`.
            model: Model name used for every `generate_content` call.

        Raises:
            ImportError: If `google-genai` is not installed (the import is deferred to here so
                the module itself loads without the SDK).
        """
        from google import genai  # guarded import; pip install google-genai

        self._genai = genai
        self.client = genai.Client(api_key=api_key)
        self.model = model

    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
        """Run one synchronous generation and return the response text.

        System messages are joined into the system instruction; every other message, whatever
        its role, is joined into a single prompt (roles and turn boundaries are not preserved).

        Args:
            messages: Chat messages as dicts with "role" and "content" keys.
            max_tokens: Passed to the SDK as `max_output_tokens`.

        Returns:
            The response text, or "{}" when the response carries no text. The empty case is
            logged so a blocked or truncated generation is visible rather than silently
            producing an empty object.
        """
        from google.genai import types

        system = "\n\n".join(m["content"] for m in messages if m["role"] == "system")
        user = "\n\n".join(m["content"] for m in messages if m["role"] != "system")
        resp = self.client.models.generate_content(
            model=self.model,
            contents=user,
            config=types.GenerateContentConfig(
                system_instruction=system or None,
                temperature=0,
                max_output_tokens=max_tokens,
                response_mime_type="application/json",
                # Gemini 2.5 thinks by default and spends the output budget on reasoning tokens —
                # it hit MAX_TOKENS with ~3.8k thoughts and a truncated JSON body (0 claims parsed).
                # Extraction is deterministic, no-CoT (mirrors the local path's enable_thinking=False).
                # NOTE(review): thinking_budget=0 is presumably only accepted by models that allow
                # thinking to be switched off — confirm before pointing `model` elsewhere.
                thinking_config=types.ThinkingConfig(thinking_budget=0),
            ),
        )
        text = resp.text
        if text:
            return text

        # Keep the best-effort "{}" fallback, but surface why there was nothing to return.
        candidates = getattr(resp, "candidates", None) or []
        finish_reason = getattr(candidates[0], "finish_reason", None) if candidates else None
        log.warning(
            "Gemini returned no text (model=%s, finish_reason=%s) — returning empty JSON object",
            self.model,
            finish_reason,
        )
        return "{}"


def from_config(cfg, sc) -> "LocalQwenBackend | GeminiBackend":
    """Pick the extraction backend described by `cfg`.

    Returns a `GeminiBackend` only when `cfg.extraction_backend` is "gemini" and
    `cfg.gemini_api_key` is set. If Gemini is requested without a key, a warning is logged and
    the local backend is used. Any other backend value yields `LocalQwenBackend(sc)`.
    """
    wants_gemini = cfg.extraction_backend == "gemini"
    if wants_gemini and cfg.gemini_api_key:
        return GeminiBackend(cfg.gemini_api_key, cfg.gemini_model)
    if wants_gemini:
        log.warning("EXTRACTION_BACKEND=gemini but GEMINI_API_KEY missing — falling back to local")
    return LocalQwenBackend(sc)