# ten31-signal-engine/signal_engine/extract/claims.py

"""Claim extraction: text → 0..N claim units → SQLite (§4.2)."""
from __future__ import annotations

import json
import logging
import sqlite3
from typing import Any

from .prompt import SEED_TOPICS, build_messages

log = logging.getLogger(__name__)

# Closed vocabularies for the enumerated claim fields, keyed by field name.
# `_enum` looks a claim's value up in the matching set and substitutes a
# caller-supplied default when the value is not a member.
_ENUMS = {
    "claim_type": {"interpretive", "predictive", "descriptive", "reactive"},
    "time_horizon": {"near", "medium", "long", "unspecified"},
    "confidence": {"low", "med", "high"},
    "thesis_seam": {"energy_compute", "debasement_bitcoin", "ai_data_ownership", "none"},
    "salience": {"central", "secondary", "aside"},
}


def register_seed_topics(conn: sqlite3.Connection) -> None:
    """Upsert every seed topic as 'controlled' (the fixed half of the hybrid vocabulary, §4.2).

    A topic that already exists in `topics` has its status set to 'controlled';
    a missing one is inserted. The work is committed before returning.
    """
    upsert_sql = (
        "INSERT INTO topics (topic_canonical, status) VALUES (?, 'controlled') "
        "ON CONFLICT(topic_canonical) DO UPDATE SET status='controlled'"
    )
    # One single-column parameter tuple per seed topic.
    conn.executemany(upsert_sql, ((topic,) for topic in SEED_TOPICS))
    conn.commit()


def _split_oversized(para: str, max_chars: int) -> list[str]:
    """Cut one paragraph into pieces of at most `max_chars` characters.

    A paragraph that already fits is returned unchanged as a single piece, and so
    is any paragraph when `max_chars` is not positive (such a budget cannot be
    honoured, so nothing is cut). Otherwise each break is placed at the last
    space/tab/newline inside the window, falling back to a hard cut when a single
    token is longer than the budget.
    """
    if max_chars < 1 or len(para) <= max_chars:
        return [para]
    pieces: list[str] = []
    rest = para.strip()
    while len(rest) > max_chars:
        # Look one character past the budget so a break exactly at the limit is found.
        window = rest[: max_chars + 1]
        cut = max(window.rfind(ws) for ws in " \t\n")
        if cut <= 0:
            cut = max_chars  # no whitespace in the window: split mid-token
        pieces.append(rest[:cut].rstrip())
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def chunk_text(text: str, max_chars: int) -> list[str]:
    """Split `text` into windows of at most `max_chars` characters.

    Paragraphs (separated by a blank line) are packed greedily into each window
    and re-joined with a blank line. A paragraph that is longer than `max_chars`
    on its own is first cut into smaller pieces, so no returned chunk exceeds the
    budget when `max_chars` is positive.

    Args:
        text: Raw document text; surrounding whitespace is stripped first.
        max_chars: Character budget per chunk.

    Returns:
        The chunks in document order; an empty list for empty/whitespace-only text.
    """
    text = text.strip()
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]
    chunks: list[str] = []
    cur: list[str] = []
    # `size` is the joined length of `cur` plus one trailing separator, so
    # `size + len(piece)` is exactly the joined length after adding `piece`.
    size = 0
    for para in text.split("\n\n"):
        for piece in _split_oversized(para, max_chars):
            if cur and size + len(piece) > max_chars:
                chunks.append("\n\n".join(cur))
                cur, size = [], 0
            cur.append(piece)
            size += len(piece) + 2
    if cur:
        chunks.append("\n\n".join(cur))
    return chunks


def _parse_claims(content: str) -> list[dict]:
    try:
        obj = json.loads(content)
    except Exception:
        i, j = content.find("{"), content.rfind("}")
        if i < 0 or j < 0:
            return []
        try:
            obj = json.loads(content[i:j + 1])
        except Exception:
            return []
    claims = obj.get("claims", []) if isinstance(obj, dict) else []
    return [c for c in claims if isinstance(c, dict) and c.get("proposition")]


def extract_claims_from_text(backend, text: str, *, source_name: str, source_cluster: str | None,
                             date: str | None, kind: str) -> list[dict]:
    """Run one extraction call over `text` and return the parsed claim dicts.

    `backend` is any object exposing .complete_json(messages, max_tokens) -> str
    (see extract.backends: LocalQwenBackend | GeminiBackend). The source metadata
    is forwarded to `build_messages`; the backend's reply is handed to
    `_parse_claims`.
    """
    prompt_messages = build_messages(
        text,
        source_name=source_name,
        source_cluster=source_cluster,
        date=date,
        kind=kind,
    )
    raw_reply = backend.complete_json(prompt_messages, max_tokens=4000)
    return _parse_claims(raw_reply)


def _enum(c: dict, field: str, default: str) -> str:
    """Return `c[field]` when it is an allowed value for `field`, else `default`.

    Args:
        c: One claim dict as produced by the model.
        field: Name of an enumerated field; must be a key of `_ENUMS`.
        default: Value to use when the field is missing or not an allowed value.

    Raises:
        KeyError: If `field` has no entry in `_ENUMS`.
    """
    allowed = _ENUMS[field]
    v = c.get(field)
    # Model output is untrusted JSON: a list/dict here is unhashable and would make
    # the set-membership test raise TypeError, so only strings are looked up.
    if isinstance(v, str) and v in allowed:
        return v
    return default


def _bindable(value: Any) -> Any:
    """Return `value` in a form sqlite3 can bind as a statement parameter.

    None, str, int, float and bytes pass through untouched. Anything else (for
    example a list or dict from model-produced JSON) is serialised to JSON text
    so that one malformed field cannot make sqlite3 reject the statement.
    """
    if value is None or isinstance(value, (str, int, float, bytes)):
        return value
    return json.dumps(value, ensure_ascii=False, default=str)


def persist_claims(conn: sqlite3.Connection, *, doc: sqlite3.Row, source: sqlite3.Row | None,
                   claims: list[dict], chunk_idx: int) -> int:
    """Write one chunk's claims, and any topics they introduce, in a single transaction.

    Each claim gets the id "<doc_id>:<chunk_idx>:<position in claims>". Enumerated
    fields are validated through `_enum`; the proposition is truncated to 1000
    characters; `rel_polarity` is always stored as "none".

    Args:
        conn: Open SQLite connection.
        doc: Row providing "doc_id", "source_id" and "date".
        source: Row providing "source_cluster" and "name", or None if unknown.
        claims: Claim dicts; each must have a "proposition" key.
        chunk_idx: Index of the chunk these claims were extracted from.

    Returns:
        The number of claims submitted for insert. Rows skipped by
        INSERT OR IGNORE (claim_id already present) are still counted.
    """
    n = 0
    cluster = source["source_cluster"] if source else None
    # `with conn` commits on success and rolls back the open transaction if any
    # statement raises, so a failure part-way through a chunk never leaves a
    # half-written batch pending on the connection.
    with conn:
        for i, c in enumerate(claims):
            seam = _enum(c, "thesis_seam", "none")
            topic = _bindable(c.get("topic_canonical") or None)
            if topic:
                # register emergent topics BEFORE the claim (claims.topic_canonical is a FK → topics)
                conn.execute(
                    "INSERT OR IGNORE INTO topics (topic_canonical, status, seam) VALUES (?, 'emergent', ?)",
                    (topic, seam),
                )
            claim_id = f"{doc['doc_id']}:{chunk_idx}:{i}"
            # Fall back to the source's name only when the claim names no claimant.
            claimant = c.get("claimant") or (source["name"] if source else None)
            conn.execute(
                """INSERT OR IGNORE INTO claims
                 (claim_id, doc_id, source_id, proposition, topic_canonical, topic_raw, claimant,
                  source_cluster, date, claim_type, time_horizon, confidence, rel_polarity,
                  engages_consensus, counters_position, thesis_seam, salience)
               VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
                (
                    claim_id, doc["doc_id"], doc["source_id"], str(c["proposition"])[:1000],
                    topic, _bindable(c.get("topic_raw")),
                    _bindable(claimant),
                    cluster, doc["date"],
                    _enum(c, "claim_type", "descriptive"), _enum(c, "time_horizon", "unspecified"),
                    _enum(c, "confidence", "med"), "none",
                    1 if c.get("engages_consensus") else 0, _bindable(c.get("counters_position")),
                    seam, _enum(c, "salience", "secondary"),
                ),
            )
            n += 1
    return n