Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
@@ -0,0 +1,6 @@
+"""Extraction (§4.2) — local LLM → structured claim units. The cost & quality center.
+
+Emits at the level of the PROPOSITION: a passage may yield 0..N claims, and MOST passages yield
+zero. An extractor that dutifully emits a claim per chunk reintroduces exactly the noise the rest
+of the system is designed to remove.
+"""
@@ -0,0 +1,64 @@
+"""Pluggable extraction backends (§scaling).
+
+The §4.2 extractor calls a backend that turns chat messages into a JSON string. Default is the
+LOCAL Qwen via Spark Control (the ~95%-local design). The Gemini backend is the documented
+overflow/fallback for bulk back-cataloging at scale, or if the Sparks are unavailable — used for
+the PUBLIC corpus only, never conviction/exposure data (sovereignty boundary, §4.6).
+
+A backend exposes: complete_json(messages, max_tokens) -> str  (a JSON object string).
+"""
+from __future__ import annotations
+
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class LocalQwenBackend:
+    """Default backend: the local Qwen model reached through a Spark Control client.
+
+    `sc` is any object exposing ``chat(messages, **kwargs)`` whose return value can be indexed
+    as ``resp["choices"][0]["message"]["content"]``.
+    """
+    name = "local"
+
+    def __init__(self, sc) -> None:
+        self.sc = sc
+
+    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
+        """Run one temperature-0, JSON-constrained chat completion and return its content.
+
+        Returns "{}" when the model produced no content (None or an empty string), the same
+        fallback GeminiBackend uses, so callers always receive a string they can hand to a
+        JSON parser.
+        """
+        resp = self.sc.chat(messages, json_object=True, temperature=0,
+                            enable_thinking=False, max_tokens=max_tokens)
+        content = resp["choices"][0]["message"]["content"]
+        return content or "{}"
+
+
+class GeminiBackend:
+    """Overflow/fallback backend built on the `google-genai` SDK.
+
+    NOTE: untested until a key is provided — validate end-to-end before relying on it for a real
+    backfill. The async BATCH API is the eventual scale path; this synchronous form is the
+    drop-in fallback.
+    """
+    name = "gemini"
+
+    def __init__(self, api_key: str, model: str = "gemini-2.5-flash") -> None:
+        # Imported here so the SDK is only required when this backend is actually constructed.
+        from google import genai  # guarded import; pip install google-genai
+        self._genai = genai
+        self.client = genai.Client(api_key=api_key)
+        self.model = model
+
+    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
+        """Collapse `messages` into one system instruction plus one prompt and request JSON.
+
+        Contents of system-role messages are joined into the system instruction; contents of
+        every other role are joined into the request contents. Returns "{}" when the response
+        carries no text.
+        """
+        from google.genai import types
+
+        system_parts: list[str] = []
+        other_parts: list[str] = []
+        for msg in messages:
+            bucket = system_parts if msg["role"] == "system" else other_parts
+            bucket.append(msg["content"])
+        system_text = "\n\n".join(system_parts)
+        prompt_text = "\n\n".join(other_parts)
+
+        generation_config = types.GenerateContentConfig(
+            system_instruction=system_text if system_text else None,
+            temperature=0,
+            max_output_tokens=max_tokens,
+            response_mime_type="application/json",
+        )
+        response = self.client.models.generate_content(
+            model=self.model,
+            contents=prompt_text,
+            config=generation_config,
+        )
+        reply = response.text
+        return reply if reply else "{}"
+
+
+def from_config(cfg, sc) -> "LocalQwenBackend | GeminiBackend":
+    """Pick the extraction backend named by `cfg.extraction_backend`.
+
+    Gemini is returned only when it is requested AND `cfg.gemini_api_key` is set; a Gemini
+    request without a key logs a warning and falls through to the local backend. Any other
+    setting yields the local backend wrapping `sc`.
+    """
+    wants_gemini = cfg.extraction_backend == "gemini"
+    if wants_gemini and cfg.gemini_api_key:
+        return GeminiBackend(cfg.gemini_api_key, cfg.gemini_model)
+    if wants_gemini:
+        log.warning("EXTRACTION_BACKEND=gemini but GEMINI_API_KEY missing — falling back to local")
+    return LocalQwenBackend(sc)
@@ -0,0 +1,117 @@
+"""Claim extraction: text → 0..N claim units → SQLite (§4.2)."""
+from __future__ import annotations
+
+import json
+import logging
+import sqlite3
+from typing import Any
+
+from .prompt import SEED_TOPICS, build_messages
+
+log = logging.getLogger(__name__)
+
+# Closed vocabularies for the enumerated claim fields, keyed by field name. `_enum` checks a
+# model-supplied value against the matching set and substitutes a caller-chosen default for
+# anything outside it.
+_ENUMS = {
+    "claim_type": {"interpretive", "predictive", "descriptive", "reactive"},
+    "time_horizon": {"near", "medium", "long", "unspecified"},
+    "confidence": {"low", "med", "high"},
+    "thesis_seam": {"energy_compute", "debasement_bitcoin", "ai_data_ownership", "none"},
+    "salience": {"central", "secondary", "aside"},
+}
+
+
+def register_seed_topics(conn: sqlite3.Connection) -> None:
+    """Upsert every seed topic as 'controlled' — the controlled half of the hybrid vocabulary (§4.2).
+
+    A topic that already exists (for example one first seen as emergent) has its status set
+    to 'controlled'. Commits before returning.
+    """
+    upsert_sql = (
+        "INSERT INTO topics (topic_canonical, status) VALUES (?, 'controlled') "
+        "ON CONFLICT(topic_canonical) DO UPDATE SET status='controlled'"
+    )
+    conn.executemany(upsert_sql, [(topic,) for topic in SEED_TOPICS])
+    conn.commit()
+
+
+def chunk_text(text: str, max_chars: int) -> list[str]:
+    """Split `text` into windows of at most `max_chars` characters, preferring paragraph breaks.
+
+    Paragraphs (separated by a blank line) are packed greedily and re-joined with a blank
+    line. A single paragraph longer than `max_chars` is hard-split into `max_chars`-sized
+    pieces, so every returned chunk respects the limit and fits the model context alongside
+    the prompt.
+
+    Returns [] for empty or whitespace-only text.
+    Raises ValueError when `max_chars` is smaller than 1 and there is text to split.
+    """
+    text = text.strip()
+    if not text:
+        return []
+    if max_chars < 1:
+        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
+    if len(text) <= max_chars:
+        return [text]
+    chunks: list[str] = []
+    cur: list[str] = []
+    size = 0  # length of "\n\n".join(cur) plus one trailing separator
+    for para in text.split("\n\n"):
+        # An oversize paragraph becomes several max_chars-sized pieces; an ordinary (or
+        # empty) paragraph is passed through as its own single piece.
+        pieces = [para[k:k + max_chars] for k in range(0, len(para), max_chars)] or [para]
+        for piece in pieces:
+            if cur and size + len(piece) > max_chars:
+                chunks.append("\n\n".join(cur))
+                cur, size = [], 0
+            cur.append(piece)
+            size += len(piece) + 2  # +2 accounts for the "\n\n" separator
+    if cur:
+        chunks.append("\n\n".join(cur))
+    return chunks
+
+
+def _parse_claims(content: str) -> list[dict]:
+    """Parse a backend reply into a list of claim dicts; never raises on malformed output.
+
+    The reply is parsed as JSON. If that fails, the span from the first "{" to the last "}"
+    is tried instead, which salvages replies wrapped in prose or markdown fences. Only dict
+    entries of a top-level "claims" LIST that carry a truthy "proposition" are returned.
+    Anything else (non-text content, a non-object reply, "claims" missing/null/not a list)
+    yields [].
+    """
+    if isinstance(content, (bytes, bytearray)):
+        content = content.decode("utf-8", errors="replace")
+    if not isinstance(content, str):
+        return []
+    try:
+        obj = json.loads(content)
+    except (ValueError, RecursionError):
+        start, end = content.find("{"), content.rfind("}")
+        if start < 0 or end < start:
+            return []
+        try:
+            obj = json.loads(content[start:end + 1])
+        except (ValueError, RecursionError):
+            return []
+    if not isinstance(obj, dict):
+        return []
+    claims = obj.get("claims")
+    if not isinstance(claims, list):
+        # e.g. {"claims": null} or {"claims": 0}; iterating these would raise TypeError
+        return []
+    return [c for c in claims if isinstance(c, dict) and c.get("proposition")]
+
+
+def extract_claims_from_text(backend, text: str, *, source_name: str, source_cluster: str | None,
+                             date: str | None, kind: str) -> list[dict]:
+    """Run the extractor over one passage and return the parsed claim dicts (possibly empty).
+
+    `backend` is any object with .complete_json(messages, max_tokens) -> str
+    (see extract.backends: LocalQwenBackend | GeminiBackend). The source metadata is only
+    used to build the prompt.
+    """
+    prompt = build_messages(
+        text,
+        source_name=source_name,
+        source_cluster=source_cluster,
+        date=date,
+        kind=kind,
+    )
+    return _parse_claims(backend.complete_json(prompt, max_tokens=4000))
+
+
+def _enum(c: dict, field: str, default: str) -> str:
+    """Return claim `c`'s value for `field` if it is in that field's vocabulary, else `default`.
+
+    Model output is untrusted: a non-string value (None, a number, a list, a dict) falls back
+    to `default` rather than reaching the set-membership test, where an unhashable value
+    would raise TypeError. Strings are compared after trimming whitespace and lower-casing,
+    so "High " is accepted as "high".
+    """
+    v = c.get(field)
+    if not isinstance(v, str):
+        return default
+    v = v.strip().lower()
+    return v if v in _ENUMS[field] else default
+
+
+def _bindable(value: Any) -> Any:
+    """Return `value` in a form sqlite3 can bind: list/dict become JSON text, the rest pass through."""
+    if isinstance(value, (list, dict)):
+        return json.dumps(value, ensure_ascii=False)
+    return value
+
+
+def persist_claims(conn: sqlite3.Connection, *, doc: sqlite3.Row, source: sqlite3.Row | None,
+                   claims: list[dict], chunk_idx: int) -> int:
+    """Insert the claims extracted from one chunk of `doc` and commit.
+
+    Claim ids are deterministic ("<doc_id>:<chunk_idx>:<position>") and rows are written with
+    INSERT OR IGNORE, so re-running a chunk does not duplicate claims. Enumerated fields are
+    validated through `_enum`; the proposition is truncated to 1000 characters; the claimant
+    falls back to the source name when the model gave none.
+
+    Each claim must carry a "proposition" key (as `_parse_claims` guarantees).
+
+    Returns the number of rows actually inserted. Claims skipped by OR IGNORE are not
+    counted, so a retried chunk does not inflate the total.
+    """
+    n = 0
+    cluster = source["source_cluster"] if source else None
+    for i, c in enumerate(claims):
+        seam = _enum(c, "thesis_seam", "none")
+        raw_topic = c.get("topic_canonical")
+        # A topic is a key into `topics`; anything that is not a non-empty string is "no topic".
+        topic = raw_topic if isinstance(raw_topic, str) and raw_topic else None
+        if topic:
+            # register emergent topics BEFORE the claim (claims.topic_canonical is a FK → topics)
+            conn.execute(
+                "INSERT OR IGNORE INTO topics (topic_canonical, status, seam) VALUES (?, 'emergent', ?)",
+                (topic, seam),
+            )
+        claim_id = f"{doc['doc_id']}:{chunk_idx}:{i}"
+        cur = conn.execute(
+            """INSERT OR IGNORE INTO claims
+                 (claim_id, doc_id, source_id, proposition, topic_canonical, topic_raw, claimant,
+                  source_cluster, date, claim_type, time_horizon, confidence, rel_polarity,
+                  engages_consensus, counters_position, thesis_seam, salience)
+               VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
+            (
+                claim_id, doc["doc_id"], doc["source_id"], str(c["proposition"])[:1000],
+                topic, _bindable(c.get("topic_raw")),
+                _bindable(c.get("claimant")) or (source["name"] if source else None),
+                cluster, doc["date"],
+                _enum(c, "claim_type", "descriptive"), _enum(c, "time_horizon", "unspecified"),
+                _enum(c, "confidence", "med"), "none",
+                1 if c.get("engages_consensus") else 0, _bindable(c.get("counters_position")),
+                seam, _enum(c, "salience", "secondary"),
+            ),
+        )
+        # rowcount is 0 when OR IGNORE skipped an already-present claim_id
+        if cur.rowcount > 0:
+            n += 1
+    conn.commit()
+    return n
@@ -0,0 +1,47 @@
+"""SEC filing HTML → plain text. Stdlib only (boring, inspectable).
+
+Drops script/style/head and inline-XBRL hidden blocks (10-Ks embed a huge <ix:hidden> section of
+numeric facts that would otherwise swamp the extractor), and collapses whitespace.
+"""
+from __future__ import annotations
+
+import re
+from html.parser import HTMLParser
+
+# Elements whose entire content is discarded.
+_SKIP_TAGS = {"script", "style", "head"}
+_SKIP_PREFIXES = ("ix:hidden",)          # inline-XBRL hidden fact dump
+# Elements that force a line break before and after their content.
+_BLOCK_TAGS = {"p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6", "table"}
+
+
+class _Stripper(HTMLParser):
+    """Collect visible text fragments in `_parts`, skipping script/style/head/ix:hidden content."""
+
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+        self._skip_depth = 0              # > 0 while inside at least one skipped element
+        self._parts: list[str] = []
+
+    def handle_starttag(self, tag: str, attrs) -> None:
+        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
+            self._skip_depth += 1
+        elif tag in _BLOCK_TAGS and self._skip_depth == 0:
+            self._parts.append("\n")
+
+    def handle_endtag(self, tag: str) -> None:
+        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
+            # clamp at 0 so a stray closing tag cannot push the depth negative
+            self._skip_depth = max(0, self._skip_depth - 1)
+        elif tag in _BLOCK_TAGS and self._skip_depth == 0:
+            self._parts.append("\n")
+
+    def handle_data(self, data: str) -> None:
+        if self._skip_depth:
+            return
+        if data.strip():
+            self._parts.append(data)
+        elif data:
+            # Whitespace between inline elements ("<b>a</b> <i>b</i>") separates words; keep
+            # it as one space (runs are collapsed by html_to_text) instead of gluing "ab".
+            self._parts.append(" ")
+
+
+def html_to_text(html: str, *, max_chars: int = 300_000) -> str:
+    """Convert filing HTML to plain text, truncated to `max_chars` characters.
+
+    Content of script/style/head and ix:hidden elements is dropped, block-level tags become
+    line breaks, runs of spaces/tabs/non-breaking spaces collapse to one space, three or more
+    consecutive line breaks collapse to one blank line, and every line is stripped.
+    """
+    p = _Stripper()
+    p.feed(html)
+    p.close()  # flush text the parser is still buffering (e.g. a trailing "AT&T")
+    text = "".join(p._parts)
+    text = re.sub(r"[ \t\xa0]+", " ", text)
+    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
+    text = "\n".join(line.strip() for line in text.splitlines())
+    text = text.strip()
+    return text[:max_chars]
@@ -0,0 +1,72 @@
+"""The §4.2 claim-extraction prompt. Prompt engineering is ours (§13.3); the schema is finalized.
+
+Discipline encoded here (the whole point of the system, §2/§4.2):
+  - Extract at the level of the PROPOSITION; emit ZERO when there is no substantive claim.
+  - Separate topic from stance: capture stance-vs-consensus explicitly, never as a bull/bear label.
+  - thesis_seam is a TAG, not a filter — off-thesis and anti-thesis claims are still extracted.
+"""
+from __future__ import annotations
+
+# Hybrid topic vocabulary (§4.2): a small SEEDED controlled list. The model reuses one when it
+# fits and proposes a concise snake_case topic otherwise; emergent topics are merged on a schedule.
+# build_messages() lists these, comma-separated, in every user prompt. The inline group
+# comments below are for human readers only; the list itself is flat.
+SEED_TOPICS = [
+    # energy <-> compute
+    "ai_compute_demand", "ai_power_constraint", "datacenter_buildout", "grid_interconnect",
+    "transformers_equipment", "nuclear_power", "natural_gas_power", "uranium_supply",
+    "cooling_infrastructure", "miner_flexible_load", "mining_ai_pivot",
+    # debasement <-> bitcoin
+    "bitcoin_reserve_asset", "bitcoin_collateral_credit", "bitcoin_treasury_strategy",
+    "btc_custody_regulation", "sovereign_bitcoin_adoption",
+    # ai <-> data ownership
+    "ai_data_ownership", "confidential_inference", "ai_commoditization",
+    # macro
+    "fed_policy", "fiscal_debasement", "stablecoins_cbdc",
+]
+
+# System prompt for the extractor, sent verbatim as the "system" message by build_messages().
+# It defines a claim unit, tells the model to return {"claims": []} when a passage has no
+# substantive proposition, and lists the per-claim output fields. The enumerated values named
+# below mirror the `_ENUMS` table in the claims module — keep the two in sync when editing.
+_SYSTEM = """You are the claim-extraction component of an investment signal engine. You read a passage \
+(an SEC filing excerpt or a podcast/earnings-call transcript) and extract structured CLAIM UNITS.
+
+A CLAIM UNIT is a single normalized proposition that someone asserts — a forward-looking prediction, \
+an interpretive or causal judgment, or a stance taken against a prevailing view. It must be specific \
+enough to later be checked against the world.
+
+CRITICAL DISCIPLINE — be willing to extract NOTHING:
+- Most passages contain ZERO claim units. Boilerplate, legal disclaimers, ad reads, pleasantries, \
+generic descriptions, routine financial line-items, and recitations of well-known news are NOT claims.
+- Do NOT invent claims. Do NOT emit one claim per paragraph to seem thorough. If the passage has no \
+substantive proposition, return {"claims": []}. A precise empty answer is the correct, valued output.
+- Extract at the level of the PROPOSITION: one normalized subject-assertion-object sentence each. A \
+single rich passage may yield several; a long dull one yields none.
+
+For EACH claim unit, output these fields:
+- "proposition": one normalized sentence (subject-assertion-object), self-contained.
+- "topic_canonical": a concise snake_case topic for clustering. REUSE one of the provided seed topics \
+when it fits; otherwise propose a new concise snake_case label. Normalize synonyms (Fed/FOMC/rates → fed_policy).
+- "topic_raw": the topic as actually phrased in the passage.
+- "claimant": who asserts it (speaker name or the filing company). Use "unknown" if unclear.
+- "claim_type": one of interpretive | predictive | descriptive | reactive. (interpretive/predictive = \
+insight; descriptive/reactive = news echo — extract those only if clearly salient.)
+- "time_horizon": one of near | medium | long | unspecified (for predictive claims especially).
+- "confidence": the claimant's apparent conviction — one of low | med | high.
+- "engages_consensus": true ONLY if the claim explicitly argues against a stated mainstream view.
+- "counters_position": the mainstream position it argues against, or null.
+- "thesis_seam": one of energy_compute | debasement_bitcoin | ai_data_ownership | none. This is a TAG \
+for relevance only — tag off-thesis claims "none" and STILL extract them.
+- "salience": central | secondary | aside (how central the claim is to the passage).
+
+Return ONLY a JSON object: {"claims": [ {...}, ... ]}. No prose, no markdown."""
+
+
+def build_messages(text: str, *, source_name: str, source_cluster: str | None,
+                   date: str | None, kind: str) -> list[dict[str, str]]:
+    """Build the two-message chat prompt (system, then user) for one passage.
+
+    The user message carries a one-line source header, the seed-topic list, and the passage.
+    A falsy `source_name` is rendered as 'unknown'; a falsy `source_cluster` or `date` is
+    rendered as 'n/a'.
+    """
+    header = (
+        f"Source: {source_name or 'unknown'} (cluster: {source_cluster or 'n/a'}, type: {kind}, "
+        f"date: {date or 'n/a'}).\n"
+    )
+    vocabulary = f"Seed topics to reuse when they fit: {', '.join(SEED_TOPICS)}.\n\n"
+    passage = f"PASSAGE:\n{text}"
+
+    system_message = {"role": "system", "content": _SYSTEM}
+    user_message = {"role": "user", "content": header + vocabulary + passage}
+    return [system_message, user_message]
@@ -0,0 +1,69 @@
+"""Extraction worker — drains 'extract' jobs from the backfill queue (§4.2, §13.4).
+
+Single sequential worker by design: extraction is the heavier serial load on the one LLM GPU.
+For each job: load the document, get its text (fetch+strip filing HTML, or read a stored transcript),
+chunk it, run the §4.2 extractor per chunk, persist 0..N claims, complete the job.
+"""
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+import requests
+
+from ..backfill import queue
+from . import claims as claims_mod
+from .html_text import html_to_text
+
+log = logging.getLogger(__name__)
+
+
+def _document_text(doc, *, user_agent: str) -> str:
+    """Return the plain text for `doc`.
+
+    A stored transcript (`doc["transcript_path"]`) wins when present. Otherwise a filing with
+    a URL is fetched with the given User-Agent and stripped with `html_to_text`.
+
+    Raises ValueError when the document has neither source; HTTP and file errors propagate.
+    """
+    if doc["transcript_path"]:
+        # Decode explicitly: the platform default encoding is not UTF-8 everywhere, and one
+        # undecodable byte should not fail the whole document.
+        return Path(doc["transcript_path"]).read_text(encoding="utf-8", errors="replace")
+    if doc["kind"] == "filing" and doc["url"]:
+        r = requests.get(doc["url"], headers={"User-Agent": user_agent}, timeout=90)
+        r.raise_for_status()
+        return html_to_text(r.text)
+    raise ValueError(f"no text source for {doc['doc_id']} (kind={doc['kind']}, url={doc['url']})")
+
+
+def run_extract(conn, sc, cfg, *, limit: int = 10, max_chunks_per_doc: int = 4,
+                chunk_chars: int = 18_000, lease_seconds: int = 900,
+                worker_id: str = "extract-1") -> dict:
+    """Lease and process up to `limit` 'extract' jobs, one at a time.
+
+    For each job: load the document row, obtain its text, chunk it, run the extractor on at
+    most `max_chunks_per_doc` chunks, persist the claims, stamp `documents.processed_at` and
+    complete the job. A job whose document row is missing is skipped. Any exception while
+    processing a document marks that job failed through `queue.fail` and the loop moves on.
+    Skipped and failed jobs count toward `limit`.
+
+    Returns {"jobs_processed": <jobs leased>, "claims_written": <sum of persist_claims results>}.
+    """
+    from .backends import from_config as backend_from_config
+    backend = backend_from_config(cfg, sc)
+    log.info("extraction backend: %s", backend.name)
+    claims_mod.register_seed_topics(conn)
+    processed = total_claims = 0
+    while processed < limit:
+        job = queue.lease_next(conn, worker_id=worker_id, job_types=["extract"], lease_seconds=lease_seconds)
+        if job is None:
+            break
+        processed += 1
+        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
+        if doc is None:
+            queue.skip(conn, job["job_id"], "document missing")
+            continue
+        src = conn.execute("SELECT * FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
+        try:
+            text = _document_text(doc, user_agent=cfg.edgar_user_agent)
+            all_chunks = claims_mod.chunk_text(text, chunk_chars)
+            chunks = all_chunks[:max_chunks_per_doc]
+            if len(all_chunks) > len(chunks):
+                # The tail of a long document is not extracted; say so rather than drop it silently.
+                log.warning("%s: extracting only the first %d of %d chunks (max_chunks_per_doc)",
+                            doc["doc_id"], len(chunks), len(all_chunks))
+            doc_claims = 0
+            for idx, chunk in enumerate(chunks):
+                cl = claims_mod.extract_claims_from_text(
+                    backend, chunk,
+                    source_name=src["name"] if src else "",
+                    source_cluster=src["source_cluster"] if src else None,
+                    date=doc["date"], kind=doc["kind"],
+                )
+                doc_claims += claims_mod.persist_claims(conn, doc=doc, source=src, claims=cl, chunk_idx=idx)
+            conn.execute("UPDATE documents SET processed_at=datetime('now') WHERE doc_id=?", (doc["doc_id"],))
+            conn.commit()
+            queue.complete(conn, job["job_id"], output_ref=f"{doc_claims} claims / {len(chunks)} chunks")
+            total_claims += doc_claims
+            log.info("extracted %d claims from %s (%d chunks)", doc_claims, doc["doc_id"], len(chunks))
+        except Exception as e:  # noqa: BLE001
+            # Job boundary: one bad document must not stop the worker; the queue records the failure.
+            state = queue.fail(conn, job["job_id"], e)
+            log.warning("extract failed for %s: %s (→ %s)", job["target_id"], e, state)
+    return {"jobs_processed": processed, "claims_written": total_claims}