Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

This commit is contained in:
Keysat
2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
+6
View File
@@ -0,0 +1,6 @@
"""Extraction (§4.2) — local LLM → structured claim units. The cost & quality center.
Emits at the level of the PROPOSITION: a passage may yield 0..N claims, and MOST passages yield
zero. An extractor that dutifully emits a claim per chunk reintroduces exactly the noise the rest
of the system is designed to remove.
"""
+64
View File
@@ -0,0 +1,64 @@
"""Pluggable extraction backends (§scaling).
The §4.2 extractor calls a backend that turns chat messages into a JSON string. Default is the
LOCAL Qwen via Spark Control (the ~95%-local design). The Gemini backend is the documented
overflow/fallback for bulk back-cataloging at scale, or if the Sparks are unavailable — used for
the PUBLIC corpus only, never conviction/exposure data (sovereignty boundary, §4.6).
A backend exposes: complete_json(messages, max_tokens) -> str (a JSON object string).
"""
from __future__ import annotations
import logging
log = logging.getLogger(__name__)
class LocalQwenBackend:
    """Default extraction backend: delegates to the client object passed in as ``sc``.

    ``sc`` only needs a ``chat(messages, **options)`` method returning a chat-completion
    style mapping (``choices[0].message.content``). Presumably this is the Spark Control
    client described in the module docstring — confirm against the caller.
    """

    name = "local"

    def __init__(self, sc) -> None:
        self.sc = sc

    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
        """Send ``messages`` to the local model and return its reply as a JSON string.

        The request asks for a JSON object at temperature 0 with thinking disabled.
        A null/empty reply is returned as ``"{}"`` so the declared ``str`` return type
        holds, matching what ``GeminiBackend.complete_json`` does.
        """
        resp = self.sc.chat(messages, json_object=True, temperature=0,
                            enable_thinking=False, max_tokens=max_tokens)
        # `content` can be null in chat-completion payloads; never hand None downstream.
        return resp["choices"][0]["message"]["content"] or "{}"
class GeminiBackend:
    """Gemini overflow/fallback backend, written against the `google-genai` SDK.

    NOTE: untested until a key is provided — validate end-to-end before relying on it for a
    real backfill. The async BATCH API is the eventual scale path; this synchronous form is
    the drop-in fallback.
    """

    name = "gemini"

    def __init__(self, api_key: str, model: str = "gemini-2.5-flash") -> None:
        # Deferred import: the SDK is only needed once this backend is actually constructed
        # (pip install google-genai).
        from google import genai

        self.model = model
        self._genai = genai
        self.client = genai.Client(api_key=api_key)

    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
        """Flatten chat ``messages`` into one Gemini request and return the reply text.

        System messages are joined into the system instruction; every other message is
        joined into the single ``contents`` string. Returns ``"{}"`` when the response
        carries no text.
        """
        from google.genai import types

        system_parts = [msg["content"] for msg in messages if msg["role"] == "system"]
        other_parts = [msg["content"] for msg in messages if msg["role"] != "system"]
        system_text = "\n\n".join(system_parts)
        user_text = "\n\n".join(other_parts)

        generation_config = types.GenerateContentConfig(
            # An empty system prompt is sent as "no system instruction".
            system_instruction=system_text if system_text else None,
            temperature=0,
            max_output_tokens=max_tokens,
            response_mime_type="application/json",
        )
        response = self.client.models.generate_content(
            model=self.model,
            contents=user_text,
            config=generation_config,
        )
        reply = response.text
        return reply if reply else "{}"
def from_config(cfg, sc) -> "LocalQwenBackend | GeminiBackend":
    """Choose the extraction backend named by ``cfg.extraction_backend``.

    Gemini is returned only when it is requested AND ``cfg.gemini_api_key`` is set; a
    Gemini request without a key logs a warning and uses the local backend. Any other
    setting yields the local backend wrapping ``sc``.
    """
    wants_gemini = cfg.extraction_backend == "gemini"
    if wants_gemini and cfg.gemini_api_key:
        return GeminiBackend(cfg.gemini_api_key, cfg.gemini_model)
    if wants_gemini:
        log.warning("EXTRACTION_BACKEND=gemini but GEMINI_API_KEY missing — falling back to local")
    return LocalQwenBackend(sc)
+117
View File
@@ -0,0 +1,117 @@
"""Claim extraction: text → 0..N claim units → SQLite (§4.2)."""
from __future__ import annotations
import json
import logging
import sqlite3
from typing import Any
from .prompt import SEED_TOPICS, build_messages
log = logging.getLogger(__name__)
# Closed vocabularies for the enum-valued claim fields, keyed by field name. `_enum` looks
# a claim's value up in the matching set and falls back to a caller-supplied default when
# the value is not listed.
_ENUMS = {
    "claim_type": {"interpretive", "predictive", "descriptive", "reactive"},
    "time_horizon": {"near", "medium", "long", "unspecified"},
    "confidence": {"low", "med", "high"},
    "thesis_seam": {"energy_compute", "debasement_bitcoin", "ai_data_ownership", "none"},
    "salience": {"central", "secondary", "aside"},
}
def register_seed_topics(conn: sqlite3.Connection) -> None:
    """Upsert every seed topic into `topics` with status 'controlled', then commit (§4.2).

    A topic that already exists (for example one first registered as emergent) has its
    status set to 'controlled'.
    """
    upsert_sql = (
        "INSERT INTO topics (topic_canonical, status) VALUES (?, 'controlled') "
        "ON CONFLICT(topic_canonical) DO UPDATE SET status='controlled'"
    )
    conn.executemany(upsert_sql, [(topic,) for topic in SEED_TOPICS])
    conn.commit()
def _split_oversized(para: str, max_chars: int) -> list[str]:
    """Break one paragraph longer than ``max_chars`` into pieces of at most ``max_chars``.

    Cuts at the last newline inside the window when there is one, else at the last space,
    else mid-word as a last resort.
    """
    pieces: list[str] = []
    rest = para.strip()
    while len(rest) > max_chars:
        cut = rest.rfind("\n", 0, max_chars + 1)
        if cut <= 0:
            cut = rest.rfind(" ", 0, max_chars + 1)
        if cut <= 0:
            cut = max_chars  # no whitespace in the window: hard cut
        piece = rest[:cut].rstrip()
        if piece:
            pieces.append(piece)
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def chunk_text(text: str, max_chars: int) -> list[str]:
    """Split ``text`` into chunks of at most ``max_chars`` characters.

    Paragraphs (separated by a blank line) are packed greedily into each chunk and
    re-joined with a blank line. A single paragraph longer than ``max_chars`` is itself
    split (see ``_split_oversized``) instead of being emitted as one oversized chunk, so
    every returned chunk respects the limit.

    Returns ``[]`` for empty/whitespace-only text and ``[text]`` when it already fits.

    Raises:
        ValueError: if ``max_chars`` is less than 1 and the text is non-empty.
    """
    text = text.strip()
    if not text:
        return []
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    if len(text) <= max_chars:
        return [text]

    chunks: list[str] = []
    cur: list[str] = []
    size = 0  # length of "\n\n".join(cur) plus one trailing separator
    for para in text.split("\n\n"):
        units = [para] if len(para) <= max_chars else _split_oversized(para, max_chars)
        for unit in units:
            if cur and size + len(unit) > max_chars:
                chunks.append("\n\n".join(cur))
                cur, size = [], 0
            cur.append(unit)
            size += len(unit) + 2
    if cur:
        chunks.append("\n\n".join(cur))
    return chunks
def _parse_claims(content: str) -> list[dict]:
    """Parse a backend reply into a list of claim dicts; never raises on bad model output.

    The reply is expected to be a JSON object ``{"claims": [...]}``. If it does not parse
    as-is, the span from the first ``{`` to the last ``}`` is tried (tolerates prose or
    fences around the object). Anything that is not a dict holding a list under
    ``"claims"`` yields ``[]``. Only dict entries with a truthy ``"proposition"`` are kept.
    """
    if not isinstance(content, str):
        # e.g. a backend returned None; there is nothing to parse.
        return []
    try:
        obj = json.loads(content)
    except (ValueError, RecursionError):
        start, end = content.find("{"), content.rfind("}")
        if start < 0 or end < start:
            return []
        try:
            obj = json.loads(content[start:end + 1])
        except (ValueError, RecursionError):
            return []
    claims = obj.get("claims") if isinstance(obj, dict) else None
    if not isinstance(claims, list):
        # Covers a missing key as well as `"claims": null` or a scalar.
        return []
    return [c for c in claims if isinstance(c, dict) and c.get("proposition")]
def extract_claims_from_text(backend, text: str, *, source_name: str, source_cluster: str | None,
                             date: str | None, kind: str) -> list[dict]:
    """Run one passage through the extractor and return the parsed claim dicts.

    `backend` is any object with .complete_json(messages, max_tokens) -> str
    (see extract.backends: LocalQwenBackend | GeminiBackend). The prompt is assembled by
    `build_messages`; the reply is decoded by `_parse_claims`.
    """
    prompt = build_messages(
        text,
        source_name=source_name,
        source_cluster=source_cluster,
        date=date,
        kind=kind,
    )
    reply = backend.complete_json(prompt, max_tokens=4000)
    return _parse_claims(reply)
def _enum(c: dict, field: str, default: str) -> str:
    """Return claim ``c``'s value for ``field`` if it is an allowed enum member, else ``default``.

    The value comes from model output, so it is validated defensively: non-string values
    (including unhashable lists/dicts, which would make a set lookup raise) fall back to
    ``default``, and surrounding whitespace / letter case are normalized before the lookup
    so that e.g. ``"High"`` is accepted as ``"high"``.
    """
    v = c.get(field)
    if not isinstance(v, str):
        return default
    v = v.strip().lower()
    return v if v in _ENUMS[field] else default
def _bindable(value: Any) -> Any:
    """Return ``value`` in a form sqlite3 can bind as a query parameter.

    None, str, int, float and bytes pass through untouched. Lists/dicts (which the model
    occasionally emits where a string was asked for) are stored as JSON text; anything
    else is stringified.
    """
    if value is None or isinstance(value, (str, int, float, bytes)):
        return value
    if isinstance(value, (dict, list)):
        return json.dumps(value, ensure_ascii=False)
    return str(value)


def persist_claims(conn: sqlite3.Connection, *, doc: sqlite3.Row, source: sqlite3.Row | None,
                   claims: list[dict], chunk_idx: int) -> int:
    """Insert the claims extracted from one chunk of ``doc`` and commit.

    Each claim gets the id ``<doc_id>:<chunk_idx>:<position>``. A claim's topic is
    registered as 'emergent' in `topics` (if not already present) before the claim row is
    written. Enum fields are validated through `_enum`; free-text fields coming from the
    model are coerced with `_bindable` so an unexpected list/dict cannot abort the whole
    document with a binding error. `rel_polarity` is always written as "none".

    Returns the number of claims processed. Both inserts use INSERT OR IGNORE, so a claim
    whose id already exists is counted even though no new row is written.
    """
    n = 0
    cluster = source["source_cluster"] if source else None
    for i, c in enumerate(claims):
        seam = _enum(c, "thesis_seam", "none")
        raw_topic = c.get("topic_canonical")
        topic = _bindable(raw_topic) if raw_topic else None
        if topic:
            # register emergent topics BEFORE the claim (claims.topic_canonical is a FK → topics)
            conn.execute(
                "INSERT OR IGNORE INTO topics (topic_canonical, status, seam) VALUES (?, 'emergent', ?)",
                (topic, seam),
            )
        claim_id = f"{doc['doc_id']}:{chunk_idx}:{i}"
        # Fall back to the source's name when the model gave no claimant.
        claimant = c.get("claimant") or (source["name"] if source else None)
        conn.execute(
            """INSERT OR IGNORE INTO claims
            (claim_id, doc_id, source_id, proposition, topic_canonical, topic_raw, claimant,
            source_cluster, date, claim_type, time_horizon, confidence, rel_polarity,
            engages_consensus, counters_position, thesis_seam, salience)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
            (
                claim_id, doc["doc_id"], doc["source_id"], str(c["proposition"])[:1000],
                topic, _bindable(c.get("topic_raw")),
                _bindable(claimant),
                cluster, doc["date"],
                _enum(c, "claim_type", "descriptive"), _enum(c, "time_horizon", "unspecified"),
                _enum(c, "confidence", "med"), "none",
                1 if c.get("engages_consensus") else 0, _bindable(c.get("counters_position")),
                seam, _enum(c, "salience", "secondary"),
            ),
        )
        n += 1
    conn.commit()
    return n
+47
View File
@@ -0,0 +1,47 @@
"""SEC filing HTML → plain text. Stdlib only (boring, inspectable).
Drops script/style/head and inline-XBRL hidden blocks (10-Ks embed a huge <ix:hidden> section of
numeric facts that would otherwise swamp the extractor), and collapses whitespace.
"""
from __future__ import annotations
import re
from html.parser import HTMLParser
# Elements whose entire content is dropped from the output.
_SKIP_TAGS = {"script", "style", "head"}
_SKIP_PREFIXES = ("ix:hidden",)  # inline-XBRL hidden fact dump
# Elements that force a line break before and after their content.
_BLOCK_TAGS = {"p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6", "table"}


class _Stripper(HTMLParser):
    """Collect the visible text of an HTML document into ``self._parts``.

    Text inside skipped elements is discarded (tracked with a nesting counter so nested
    skipped elements are handled); block-level tags contribute a newline.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._skip_depth = 0  # > 0 while inside a skipped element
        self._parts: list[str] = []

    def handle_starttag(self, tag: str, attrs) -> None:
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            self._skip_depth += 1
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            # Clamp at zero so a stray closing tag cannot push the counter negative.
            self._skip_depth = max(0, self._skip_depth - 1)
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        if self._skip_depth:
            return
        if data.strip():
            self._parts.append(data)
        elif data:
            # Whitespace-only text between inline elements (`<b>Net</b> <i>income</i>`)
            # still separates words; keep one space so they are not glued together.
            # Extra spaces are collapsed/stripped by html_to_text.
            self._parts.append(" ")


def html_to_text(html: str, *, max_chars: int = 300_000) -> str:
    """Convert filing HTML to plain text, truncated to ``max_chars`` characters.

    Drops script/style/head and ``ix:hidden`` content, turns block-level tags into line
    breaks, collapses runs of spaces/tabs/no-break spaces to a single space, squeezes
    three or more line breaks down to one blank line, and strips every line.
    """
    p = _Stripper()
    p.feed(html)
    # close() flushes text the parser is still buffering (e.g. trailing text that
    # contains an unterminated "&", which feed() holds back waiting for more input).
    p.close()
    text = "".join(p._parts)
    text = re.sub(r"[ \t\xa0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines())
    text = text.strip()
    return text[:max_chars]
+72
View File
@@ -0,0 +1,72 @@
"""The §4.2 claim-extraction prompt. Prompt engineering is ours (§13.3); the schema is finalized.
Discipline encoded here (the whole point of the system, §2/§4.2):
- Extract at the level of the PROPOSITION; emit ZERO when there is no substantive claim.
- Separate topic from stance: capture stance-vs-consensus explicitly, never as a bull/bear label.
- thesis_seam is a TAG, not a filter — off-thesis and anti-thesis claims are still extracted.
"""
from __future__ import annotations
# Hybrid topic vocabulary (§4.2): a small SEEDED controlled list. The model reuses one when it
# fits and proposes a concise snake_case topic otherwise; emergent topics are merged on a schedule.
# `build_messages` lists these names in the user message so the model can see them; the inline
# comments below group the topics by thesis area.
SEED_TOPICS = [
    # energy <-> compute
    "ai_compute_demand", "ai_power_constraint", "datacenter_buildout", "grid_interconnect",
    "transformers_equipment", "nuclear_power", "natural_gas_power", "uranium_supply",
    "cooling_infrastructure", "miner_flexible_load", "mining_ai_pivot",
    # debasement <-> bitcoin
    "bitcoin_reserve_asset", "bitcoin_collateral_credit", "bitcoin_treasury_strategy",
    "btc_custody_regulation", "sovereign_bitcoin_adoption",
    # ai <-> data ownership
    "ai_data_ownership", "confidential_inference", "ai_commoditization",
    # macro
    "fed_policy", "fiscal_debasement", "stablecoins_cbdc",
]
# System prompt sent as the first message by `build_messages`. It defines what counts as a
# claim unit, insists that an empty result is a valid answer, and lists the output fields.
# This text is runtime behavior: edit it deliberately, not cosmetically.
_SYSTEM = """You are the claim-extraction component of an investment signal engine. You read a passage \
(an SEC filing excerpt or a podcast/earnings-call transcript) and extract structured CLAIM UNITS.
A CLAIM UNIT is a single normalized proposition that someone asserts — a forward-looking prediction, \
an interpretive or causal judgment, or a stance taken against a prevailing view. It must be specific \
enough to later be checked against the world.
CRITICAL DISCIPLINE — be willing to extract NOTHING:
- Most passages contain ZERO claim units. Boilerplate, legal disclaimers, ad reads, pleasantries, \
generic descriptions, routine financial line-items, and recitations of well-known news are NOT claims.
- Do NOT invent claims. Do NOT emit one claim per paragraph to seem thorough. If the passage has no \
substantive proposition, return {"claims": []}. A precise empty answer is the correct, valued output.
- Extract at the level of the PROPOSITION: one normalized subject-assertion-object sentence each. A \
single rich passage may yield several; a long dull one yields none.
For EACH claim unit, output these fields:
- "proposition": one normalized sentence (subject-assertion-object), self-contained.
- "topic_canonical": a concise snake_case topic for clustering. REUSE one of the provided seed topics \
when it fits; otherwise propose a new concise snake_case label. Normalize synonyms (Fed/FOMC/rates → fed_policy).
- "topic_raw": the topic as actually phrased in the passage.
- "claimant": who asserts it (speaker name or the filing company). Use "unknown" if unclear.
- "claim_type": one of interpretive | predictive | descriptive | reactive. (interpretive/predictive = \
insight; descriptive/reactive = news echo — extract those only if clearly salient.)
- "time_horizon": one of near | medium | long | unspecified (for predictive claims especially).
- "confidence": the claimant's apparent conviction — one of low | med | high.
- "engages_consensus": true ONLY if the claim explicitly argues against a stated mainstream view.
- "counters_position": the mainstream position it argues against, or null.
- "thesis_seam": one of energy_compute | debasement_bitcoin | ai_data_ownership | none. This is a TAG \
for relevance only — tag off-thesis claims "none" and STILL extract them.
- "salience": central | secondary | aside (how central the claim is to the passage).
Return ONLY a JSON object: {"claims": [ {...}, ... ]}. No prose, no markdown."""
def build_messages(text: str, *, source_name: str, source_cluster: str | None,
                   date: str | None, kind: str) -> list[dict[str, str]]:
    """Assemble the two-message chat prompt (system + user) for one passage.

    The user message carries a provenance line, the seed-topic list, and the passage.
    Missing provenance is rendered as 'unknown' (source name) or 'n/a' (cluster, date).
    """
    seed = ", ".join(SEED_TOPICS)
    provenance = (
        f"Source: {source_name or 'unknown'} (cluster: {source_cluster or 'n/a'}, type: {kind}, "
        f"date: {date or 'n/a'}).\n"
    )
    vocabulary = f"Seed topics to reuse when they fit: {seed}.\n\n"
    passage = f"PASSAGE:\n{text}"

    system_message = {"role": "system", "content": _SYSTEM}
    user_message = {"role": "user", "content": provenance + vocabulary + passage}
    return [system_message, user_message]
+69
View File
@@ -0,0 +1,69 @@
"""Extraction worker — drains 'extract' jobs from the backfill queue (§4.2, §13.4).
Single sequential worker by design: extraction is the heavier serial load on the one LLM GPU.
For each job: load the document, get its text (fetch+strip filing HTML, or read a stored transcript),
chunk it, run the §4.2 extractor per chunk, persist 0..N claims, complete the job.
"""
from __future__ import annotations
import logging
from pathlib import Path
import requests
from ..backfill import queue
from . import claims as claims_mod
from .html_text import html_to_text
log = logging.getLogger(__name__)
def _document_text(doc, *, user_agent: str) -> str:
    """Return the plain text to extract from for one `documents` row.

    A stored transcript (``doc["transcript_path"]``) takes precedence and is read as
    UTF-8 — explicitly, so the result does not depend on the host's locale encoding.
    Otherwise a filing with a URL is downloaded (sending ``user_agent``) and stripped to
    text with `html_to_text`.

    Raises:
        ValueError: if the document has neither a transcript path nor a filing URL.
        requests.HTTPError: if the filing download returns an error status.
    """
    if doc["transcript_path"]:
        return Path(doc["transcript_path"]).read_text(encoding="utf-8")
    if doc["kind"] == "filing" and doc["url"]:
        r = requests.get(doc["url"], headers={"User-Agent": user_agent}, timeout=90)
        r.raise_for_status()
        return html_to_text(r.text)
    raise ValueError(f"no text source for {doc['doc_id']} (kind={doc['kind']}, url={doc['url']})")
def run_extract(conn, sc, cfg, *, limit: int = 10, max_chunks_per_doc: int = 4,
                chunk_chars: int = 18_000, lease_seconds: int = 900,
                worker_id: str = "extract-1") -> dict:
    """Lease and process up to ``limit`` 'extract' jobs, one at a time.

    For each leased job the target document is loaded, turned into text, split into at
    most ``max_chunks_per_doc`` chunks of ``chunk_chars`` characters, and each chunk is
    run through the extractor and persisted. A job whose document row is gone is skipped;
    any exception while processing a document marks the job failed and the loop moves on.

    Returns ``{"jobs_processed": ..., "claims_written": ...}``; skipped and failed jobs
    count as processed.
    """
    from .backends import from_config as backend_from_config

    backend = backend_from_config(cfg, sc)
    log.info("extraction backend: %s", backend.name)
    claims_mod.register_seed_topics(conn)

    processed = 0
    total_claims = 0
    while processed < limit:
        job = queue.lease_next(
            conn,
            worker_id=worker_id,
            job_types=["extract"],
            lease_seconds=lease_seconds,
        )
        if job is None:
            # Queue drained.
            break
        processed += 1

        doc = conn.execute(
            "SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)
        ).fetchone()
        if doc is None:
            queue.skip(conn, job["job_id"], "document missing")
            continue
        src = conn.execute(
            "SELECT * FROM sources WHERE source_id=?", (doc["source_id"],)
        ).fetchone()

        try:
            raw_text = _document_text(doc, user_agent=cfg.edgar_user_agent)
            chunks = claims_mod.chunk_text(raw_text, chunk_chars)[:max_chunks_per_doc]
            doc_claims = 0
            for idx, chunk in enumerate(chunks):
                extracted = claims_mod.extract_claims_from_text(
                    backend,
                    chunk,
                    source_name=src["name"] if src else "",
                    source_cluster=src["source_cluster"] if src else None,
                    date=doc["date"],
                    kind=doc["kind"],
                )
                doc_claims += claims_mod.persist_claims(
                    conn, doc=doc, source=src, claims=extracted, chunk_idx=idx
                )
            conn.execute(
                "UPDATE documents SET processed_at=datetime('now') WHERE doc_id=?",
                (doc["doc_id"],),
            )
            conn.commit()
            queue.complete(
                conn, job["job_id"], output_ref=f"{doc_claims} claims / {len(chunks)} chunks"
            )
            total_claims += doc_claims
            log.info("extracted %d claims from %s (%d chunks)", doc_claims, doc["doc_id"], len(chunks))
        except Exception as e:  # noqa: BLE001
            # Worker boundary: one bad document must not stop the drain loop.
            state = queue.fail(conn, job["job_id"], e)
            log.warning("extract failed for %s: %s (→ %s)", job["target_id"], e, state)

    return {"jobs_processed": processed, "claims_written": total_claims}