"""Claim extraction: text → 0..N claim units → SQLite (§4.2)."""
from __future__ import annotations

import json
import logging
import sqlite3
from typing import Any

from .prompt import SEED_TOPICS, build_messages

log = logging.getLogger(__name__)

# Closed vocabularies for the enumerated claim fields. A value outside its set is replaced by a
# per-field default when the claim is persisted (see _enum).
_ENUMS = {
    "claim_type": {"interpretive", "predictive", "descriptive", "reactive"},
    "time_horizon": {"near", "medium", "long", "unspecified"},
    "confidence": {"low", "med", "high"},
    "thesis_seam": {"energy_compute", "debasement_bitcoin", "ai_data_ownership", "none"},
    "salience": {"central", "secondary", "aside"},
}


def register_seed_topics(conn: sqlite3.Connection) -> None:
    """Pre-load the controlled half of the hybrid topic vocabulary (§4.2).

    Every entry of SEED_TOPICS is upserted into `topics` with status 'controlled'; a topic that
    already exists (whatever its current status) is switched to 'controlled'. Commits on `conn`.
    """
    conn.executemany(
        "INSERT INTO topics (topic_canonical, status) VALUES (?, 'controlled') "
        "ON CONFLICT(topic_canonical) DO UPDATE SET status='controlled'",
        ((topic,) for topic in SEED_TOPICS),
    )
    conn.commit()


# Coarse→fine split boundaries. Transcripts arrive as `Speaker: turn` lines joined by a SINGLE
# newline (ASR output has no blank-line paragraphs), filings as paragraph text — so splitting on
# "\n\n" alone never fires on a transcript and the whole episode would go in one call. "" is the
# per-character hard cap that guarantees termination regardless of punctuation.
_SEPARATORS = ["\n\n", "\n", ". ", " ", ""]


def chunk_text(text: str, max_chars: int) -> list[str]:
    """Pack text into windows that each fit the model context alongside the prompt.

    Falls through paragraph → line → sentence → word → hard char-slice, so NO chunk ever exceeds
    max_chars however the source is punctuated, while keeping speaker turns intact when they fit.

    Args:
        text: Source text; leading/trailing whitespace is stripped before packing.
        max_chars: Upper bound on the length of every returned chunk (must be >= 1).

    Returns:
        The chunks in source order; an empty list when `text` is empty or whitespace-only.

    Raises:
        ValueError: If `max_chars` is smaller than 1.
    """
    if max_chars < 1:  # else _pack recurses past the last separator → IndexError
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    text = text.strip()
    if not text:
        return []
    return _pack(text, max_chars, _SEPARATORS)


def _pack(text: str, max_chars: int, seps: list[str]) -> list[str]:
    """Recursively pack `text` on the coarsest separator in `seps` that keeps chunks within
    max_chars, descending to a finer one only for a part that is itself still too big.

    Parts are greedily re-joined with the separator while the running chunk still fits. The
    separator that falls exactly on a chunk boundary is not carried into either chunk.
    """
    if len(text) <= max_chars:
        return [text]
    sep, rest = seps[0], seps[1:]
    # "" cannot be passed to str.split, so the final level slices per character instead.
    parts = list(text) if sep == "" else text.split(sep)
    out: list[str] = []
    cur = ""
    for p in parts:
        candidate = p if not cur else cur + sep + p
        if len(candidate) <= max_chars:
            cur = candidate
            continue
        # `p` does not fit onto the running chunk: flush it and start over with `p`.
        if cur:
            out.append(cur)
        if len(p) <= max_chars:
            cur = p
        else:
            # a single part still too big → split it on the next-finer boundary
            out.extend(_pack(p, max_chars, rest))
            cur = ""
    if cur:
        out.append(cur)
    return out


def _load_embedded_object(content: Any) -> Any:
    """Best-effort recovery for a reply that is not pure JSON.

    Parses the span from the first "{" to the last "}" of `content`. Returns the decoded value,
    or None (after logging a warning) when `content` is not a string, has no such span, or the
    span is not valid JSON.
    """
    if isinstance(content, str):
        start, end = content.find("{"), content.rfind("}")
        if start >= 0 and end >= 0:
            try:
                return json.loads(content[start:end + 1])
            except (ValueError, RecursionError):
                pass  # fall through to the shared warning below
    log.warning("discarding unparseable model output: %.200r", content)
    return None


def _parse_claims(content: str) -> list[dict]:
    """Decode the model reply into a list of claim dicts.

    The reply is expected to be a JSON object with a "claims" array. If the reply as a whole is
    not valid JSON, the outermost {...} span is tried instead. Anything that cannot be decoded —
    including a non-string reply or a "claims" value that is not a list (e.g. null) — yields an
    empty list rather than an exception.

    Returns:
        Only the entries of "claims" that are dicts with a truthy "proposition".
    """
    try:
        obj = json.loads(content)
    except (TypeError, ValueError, RecursionError):
        # json.loads raises ValueError (JSONDecodeError) on malformed text, TypeError on a
        # non-str/bytes reply, and RecursionError on pathologically nested input.
        obj = _load_embedded_object(content)
    claims = obj.get("claims") if isinstance(obj, dict) else None
    if not isinstance(claims, list):
        # {"claims": null} or a scalar would otherwise blow up in the comprehension below.
        return []
    return [c for c in claims if isinstance(c, dict) and c.get("proposition")]


def extract_claims_from_text(backend, text: str, *, source_name: str,
                             source_cluster: str | None, date: str | None,
                             kind: str) -> list[dict]:
    """Run one extraction call over `text` and return the parsed claim dicts.

    `backend` is any object with .complete_json(messages, max_tokens) -> str
    (see extract.backends: LocalQwenBackend | GeminiBackend).

    The remaining keyword arguments are forwarded unchanged to build_messages. The reply is
    decoded by _parse_claims, so an unusable reply produces an empty list.
    """
    messages = build_messages(text, source_name=source_name, source_cluster=source_cluster,
                              date=date, kind=kind)
    content = backend.complete_json(messages, max_tokens=4000)
    return _parse_claims(content)


def _enum(c: dict, field: str, default: str) -> str:
    """Return c[field] when it is a member of _ENUMS[field], otherwise `default`.

    Non-string values (None, numbers, lists, dicts) always fall back to `default`; the
    isinstance guard also keeps an unhashable value from raising TypeError in the set lookup.
    """
    v = c.get(field)
    return v if isinstance(v, str) and v in _ENUMS[field] else default


def persist_claims(conn: sqlite3.Connection, *, doc: sqlite3.Row, source: sqlite3.Row | None,
                   claims: list[dict], chunk_idx: int) -> int:
    """Write the claims extracted from one chunk of `doc` into the `claims` table.

    Args:
        conn: Open SQLite connection; committed once after all claims are written.
        doc: Row providing "doc_id", "source_id" and "date".
        source: Row providing "source_cluster" and "name", or None when unknown.
        claims: Claim dicts as returned by _parse_claims (each has a "proposition").
        chunk_idx: Index of the chunk within the document; part of the claim id.

    Returns:
        The number of claims processed. Rows are written with INSERT OR IGNORE, so a claim
        whose id already exists is skipped by SQLite but still counted here.
    """
    n = 0
    cluster = source["source_cluster"] if source else None
    for i, c in enumerate(claims):
        seam = _enum(c, "thesis_seam", "none")
        topic = c.get("topic_canonical") or None
        if topic:
            # register emergent topics BEFORE the claim (claims.topic_canonical is a FK → topics)
            conn.execute(
                "INSERT OR IGNORE INTO topics (topic_canonical, status, seam) "
                "VALUES (?, 'emergent', ?)",
                (topic, seam),
            )
        # Deterministic id: the same document/chunk/position always maps to the same row.
        claim_id = f"{doc['doc_id']}:{chunk_idx}:{i}"
        conn.execute(
            """INSERT OR IGNORE INTO claims
               (claim_id, doc_id, source_id, proposition, topic_canonical, topic_raw,
                claimant, source_cluster, date, claim_type, time_horizon, confidence,
                rel_polarity, engages_consensus, counters_position, thesis_seam, salience)
               VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
            (
                claim_id,
                doc["doc_id"],
                doc["source_id"],
                str(c["proposition"])[:1000],  # propositions are capped at 1000 characters
                topic,
                c.get("topic_raw"),
                # fall back to the source's own name when the model names no claimant
                c.get("claimant") or (source["name"] if source else None),
                cluster,
                doc["date"],
                _enum(c, "claim_type", "descriptive"),
                _enum(c, "time_horizon", "unspecified"),
                _enum(c, "confidence", "med"),
                "none",  # rel_polarity is always written as "none" at extraction time
                1 if c.get("engages_consensus") else 0,
                c.get("counters_position"),
                seam,
                _enum(c, "salience", "secondary"),
            ),
        )
        n += 1
    conn.commit()
    return n