Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,6 @@
|
||||
"""Extraction (§4.2) — local LLM → structured claim units. The cost & quality center.
|
||||
|
||||
Emits at the level of the PROPOSITION: a passage may yield 0..N claims, and MOST passages yield
|
||||
zero. An extractor that dutifully emits a claim per chunk reintroduces exactly the noise the rest
|
||||
of the system is designed to remove.
|
||||
"""
|
||||
@@ -0,0 +1,64 @@
|
||||
"""Pluggable extraction backends (§scaling).
|
||||
|
||||
The §4.2 extractor calls a backend that turns chat messages into a JSON string. Default is the
|
||||
LOCAL Qwen via Spark Control (the ~95%-local design). The Gemini backend is the documented
|
||||
overflow/fallback for bulk back-cataloging at scale, or if the Sparks are unavailable — used for
|
||||
the PUBLIC corpus only, never conviction/exposure data (sovereignty boundary, §4.6).
|
||||
|
||||
A backend exposes: complete_json(messages, max_tokens) -> str (a JSON object string).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LocalQwenBackend:
    """Default extraction backend: the local Qwen model reached through Spark Control.

    ``sc`` is the Spark Control client object; only its ``chat(...)`` method is used.
    """

    name = "local"

    def __init__(self, sc) -> None:
        self.sc = sc

    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
        """Run one deterministic, JSON-mode chat completion and return the reply text.

        Args:
            messages: chat messages passed through to ``sc.chat`` unchanged.
            max_tokens: completion budget forwarded to ``sc.chat``.

        Returns:
            The content of the first choice, or ``"{}"`` when that content is empty/None.
        """
        resp = self.sc.chat(messages, json_object=True, temperature=0,
                            enable_thinking=False, max_tokens=max_tokens)
        content = resp["choices"][0]["message"]["content"]
        # Keep the `-> str` contract even when the reply carries no content; this is the
        # same empty-object fallback GeminiBackend.complete_json uses.
        return content or "{}"
|
||||
|
||||
|
||||
class GeminiBackend:
    """Overflow/fallback backend built on the `google-genai` SDK.

    NOTE: untested until an API key is provided — validate end-to-end before trusting it for
    a real backfill. This is the synchronous drop-in form; the async BATCH API is the
    eventual scale path.
    """

    name = "gemini"

    def __init__(self, api_key: str, model: str = "gemini-2.5-flash") -> None:
        from google import genai  # guarded import; pip install google-genai

        self.model = model
        self._genai = genai
        self.client = genai.Client(api_key=api_key)

    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
        """Send the chat as one Gemini request and return its JSON text (``"{}"`` if empty).

        System messages are joined into the system instruction; every other message is
        joined (blank-line separated) into a single content string.
        """
        from google.genai import types

        system_parts: list[str] = []
        other_parts: list[str] = []
        for message in messages:
            bucket = system_parts if message["role"] == "system" else other_parts
            bucket.append(message["content"])

        system_text = "\n\n".join(system_parts)
        request_config = types.GenerateContentConfig(
            system_instruction=system_text if system_text else None,
            temperature=0,
            max_output_tokens=max_tokens,
            response_mime_type="application/json",
        )
        response = self.client.models.generate_content(
            model=self.model,
            contents="\n\n".join(other_parts),
            config=request_config,
        )
        reply = response.text
        return reply if reply else "{}"
|
||||
|
||||
|
||||
def from_config(cfg, sc) -> "LocalQwenBackend | GeminiBackend":
    """Pick the extraction backend named by ``cfg``.

    Gemini is returned only when it is requested AND an API key is configured; a Gemini
    request without a key is logged and degrades to the local backend. Every other
    setting yields the local backend wrapping ``sc``.
    """
    gemini_requested = cfg.extraction_backend == "gemini"
    if gemini_requested and cfg.gemini_api_key:
        return GeminiBackend(cfg.gemini_api_key, cfg.gemini_model)
    if gemini_requested:
        log.warning("EXTRACTION_BACKEND=gemini but GEMINI_API_KEY missing — falling back to local")
    return LocalQwenBackend(sc)
|
||||
@@ -0,0 +1,117 @@
|
||||
"""Claim extraction: text → 0..N claim units → SQLite (§4.2)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
from typing import Any
|
||||
|
||||
from .prompt import SEED_TOPICS, build_messages
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Closed vocabularies for the enum-valued claim fields, keyed by field name. `_enum` checks
# an extracted value against the matching set and substitutes a caller-supplied default
# when the value is not a member.
_ENUMS = {
    "claim_type": {"interpretive", "predictive", "descriptive", "reactive"},
    "time_horizon": {"near", "medium", "long", "unspecified"},
    "confidence": {"low", "med", "high"},
    "thesis_seam": {"energy_compute", "debasement_bitcoin", "ai_data_ownership", "none"},
    "salience": {"central", "secondary", "aside"},
}
|
||||
|
||||
|
||||
def register_seed_topics(conn: sqlite3.Connection) -> None:
    """Upsert every seed topic as 'controlled' — the curated half of the topic vocabulary (§4.2).

    A topic that already exists has its status set to 'controlled'. Commits once at the end.
    """
    upsert_sql = (
        "INSERT INTO topics (topic_canonical, status) VALUES (?, 'controlled') "
        "ON CONFLICT(topic_canonical) DO UPDATE SET status='controlled'"
    )
    conn.executemany(upsert_sql, [(topic,) for topic in SEED_TOPICS])
    conn.commit()
|
||||
|
||||
|
||||
def chunk_text(text: str, max_chars: int) -> list[str]:
    """Split text into windows of at most ``max_chars`` characters, preferring paragraph breaks.

    Paragraphs (separated by a blank line) are packed greedily into each window and re-joined
    with a blank line. A single paragraph longer than ``max_chars`` is hard-split into
    ``max_chars``-sized slices so that no window exceeds the budget — text without paragraph
    breaks would otherwise be emitted as one oversized chunk.

    Args:
        text: the document text; surrounding whitespace is stripped first.
        max_chars: character budget per chunk. Hard-splitting only applies when this is
            positive.

    Returns:
        ``[]`` for empty/whitespace-only text, ``[text]`` when it already fits, otherwise
        the list of chunks in document order.
    """
    text = text.strip()
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    chunks: list[str] = []
    cur: list[str] = []
    size = 0  # length of "\n\n".join(cur) plus one trailing separator
    for para in text.split("\n\n"):
        if max_chars > 0 and len(para) > max_chars:
            # Oversized paragraph: slice it so every piece fits the window on its own.
            pieces = [para[k:k + max_chars] for k in range(0, len(para), max_chars)]
        else:
            pieces = [para]
        for piece in pieces:
            if size + len(piece) > max_chars and cur:
                chunks.append("\n\n".join(cur))
                cur, size = [], 0
            cur.append(piece)
            size += len(piece) + 2
    if cur:
        chunks.append("\n\n".join(cur))
    return chunks
|
||||
|
||||
|
||||
def _parse_claims(content: str) -> list[dict]:
    """Parse a backend reply into a list of claim dicts, tolerating malformed output.

    The reply is expected to be a JSON object ``{"claims": [...]}``. If it does not parse
    as-is, the span from the first ``{`` to the last ``}`` is tried once (covers replies
    wrapped in stray prose). Anything that still is not an object holding a list of claims
    yields ``[]`` rather than an exception.

    Returns:
        The dict entries of ``claims`` that carry a non-empty ``proposition``; a
        whitespace-only proposition counts as empty.
    """
    if not isinstance(content, str):
        # e.g. a backend that handed back None instead of text
        return []
    try:
        obj = json.loads(content)
    except (ValueError, RecursionError):
        start, end = content.find("{"), content.rfind("}")
        if start < 0 or end < start:
            return []
        try:
            obj = json.loads(content[start:end + 1])
        except (ValueError, RecursionError):
            return []
    if not isinstance(obj, dict):
        return []
    claims = obj.get("claims")
    if not isinstance(claims, list):
        # {"claims": null} or a scalar would otherwise blow up when iterated
        return []

    kept: list[dict] = []
    for claim in claims:
        if not isinstance(claim, dict):
            continue
        proposition = claim.get("proposition")
        if isinstance(proposition, str):
            proposition = proposition.strip()
        if proposition:
            kept.append(claim)
    return kept
|
||||
|
||||
|
||||
def extract_claims_from_text(backend, text: str, *, source_name: str, source_cluster: str | None,
                             date: str | None, kind: str) -> list[dict]:
    """Run one extraction pass over ``text`` and return the parsed claim dicts.

    `backend` is any object with .complete_json(messages, max_tokens) -> str
    (see extract.backends: LocalQwenBackend | GeminiBackend). The source metadata is only
    used to build the prompt.
    """
    prompt_context = {
        "source_name": source_name,
        "source_cluster": source_cluster,
        "date": date,
        "kind": kind,
    }
    prompt = build_messages(text, **prompt_context)
    raw_reply = backend.complete_json(prompt, max_tokens=4000)
    return _parse_claims(raw_reply)
|
||||
|
||||
|
||||
def _enum(c: dict, field: str, default: str) -> str:
    """Return ``c[field]`` if it is a member of that field's vocabulary, else ``default``.

    The value comes from model output, so it may be missing, the wrong type, or differ
    only in case/whitespace. Non-string values (including unhashable lists/dicts, which
    would raise on a set-membership test) fall back to ``default``; strings are compared
    after stripping and lower-casing.

    Raises:
        KeyError: if ``field`` has no entry in ``_ENUMS``.
    """
    allowed = _ENUMS[field]
    value = c.get(field)
    if not isinstance(value, str):
        return default
    value = value.strip().lower()
    return value if value in allowed else default
|
||||
|
||||
|
||||
def _bindable(value: Any) -> Any:
    """Return ``value`` in a form sqlite3 can bind; lists/dicts become a JSON string."""
    if value is None or isinstance(value, (str, int, float)):
        return value
    return json.dumps(value, ensure_ascii=False, default=str)


def persist_claims(conn: sqlite3.Connection, *, doc: sqlite3.Row, source: sqlite3.Row | None,
                   claims: list[dict], chunk_idx: int) -> int:
    """Insert the claims extracted from one chunk of ``doc`` and commit.

    Claim ids are ``"<doc_id>:<chunk_idx>:<position>"`` and rows are written with
    ``INSERT OR IGNORE``, so re-running a chunk does not duplicate claims. Enum fields are
    validated through ``_enum``; ``rel_polarity`` is always stored as ``"none"`` here.

    Args:
        conn: open SQLite connection.
        doc: the document row (reads ``doc_id``, ``source_id``, ``date``).
        source: the source row, or None; supplies ``source_cluster`` and the fallback
            claimant ``name``.
        claims: claim dicts as produced by the extractor; each must carry ``proposition``.
        chunk_idx: index of the chunk within the document, used in the claim id.

    Returns:
        The number of claim rows actually inserted (rows skipped as duplicates are not
        counted).
    """
    written = 0
    cluster = source["source_cluster"] if source else None
    for i, c in enumerate(claims):
        seam = _enum(c, "thesis_seam", "none")
        raw_topic = c.get("topic_canonical")
        # Only a real string is usable as a topic key; anything else is stored as NULL.
        topic = (raw_topic.strip() if isinstance(raw_topic, str) else None) or None
        if topic:
            # register emergent topics BEFORE the claim (claims.topic_canonical is a FK → topics)
            conn.execute(
                "INSERT OR IGNORE INTO topics (topic_canonical, status, seam) VALUES (?, 'emergent', ?)",
                (topic, seam),
            )
        claim_id = f"{doc['doc_id']}:{chunk_idx}:{i}"
        # Free-text fields come straight from model output: coerce list/dict values so one
        # odd claim cannot raise a bind error and fail the whole document.
        claimant = _bindable(c.get("claimant")) or (source["name"] if source else None)
        cur = conn.execute(
            """INSERT OR IGNORE INTO claims
               (claim_id, doc_id, source_id, proposition, topic_canonical, topic_raw, claimant,
                source_cluster, date, claim_type, time_horizon, confidence, rel_polarity,
                engages_consensus, counters_position, thesis_seam, salience)
               VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
            (
                claim_id, doc["doc_id"], doc["source_id"], str(c["proposition"])[:1000],
                topic, _bindable(c.get("topic_raw")),
                claimant,
                cluster, doc["date"],
                _enum(c, "claim_type", "descriptive"), _enum(c, "time_horizon", "unspecified"),
                _enum(c, "confidence", "med"), "none",
                1 if c.get("engages_consensus") else 0, _bindable(c.get("counters_position")),
                seam, _enum(c, "salience", "secondary"),
            ),
        )
        # rowcount is 0 when OR IGNORE skipped an already-present claim_id
        if cur.rowcount > 0:
            written += 1
    conn.commit()
    return written
|
||||
@@ -0,0 +1,47 @@
|
||||
"""SEC filing HTML → plain text. Stdlib only (boring, inspectable).
|
||||
|
||||
Drops script/style/head and inline-XBRL hidden blocks (10-Ks embed a huge <ix:hidden> section of
|
||||
numeric facts that would otherwise swamp the extractor), and collapses whitespace.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
|
||||
# Elements whose entire content is discarded.
_SKIP_TAGS = {"script", "style", "head"}
_SKIP_PREFIXES = ("ix:hidden",)  # inline-XBRL hidden fact dump
# Elements that force a line break on both their start and end tags.
_BLOCK_TAGS = {"p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6", "table"}


class _Stripper(HTMLParser):
    """Collects the visible text of an HTML document into ``_parts``.

    Text inside skipped elements is dropped (tracked with a nesting counter), block-level
    tags contribute newlines, and character references are decoded by the parser.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._skip_depth = 0  # > 0 while inside a skipped element
        self._parts: list[str] = []

    def handle_starttag(self, tag: str, attrs) -> None:
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            self._skip_depth += 1
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            # clamp so a stray closing tag cannot push the counter negative
            self._skip_depth = max(0, self._skip_depth - 1)
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        if self._skip_depth:
            return
        if data.strip():
            self._parts.append(data)
        elif data:
            # Whitespace-only node: keep ONE space so neighbouring inline elements
            # ("<b>Risk</b> <i>Factors</i>") are not fused into a single word. A plain
            # space (never a newline) leaves the paragraph structure untouched.
            self._parts.append(" ")


def html_to_text(html: str, *, max_chars: int = 300_000) -> str:
    """Convert filing HTML to plain text with normalized whitespace.

    Runs of spaces, tabs and non-breaking spaces collapse to one space, three or more
    consecutive line breaks collapse to a single blank line, and every line is stripped.

    Args:
        html: the HTML document.
        max_chars: the result is truncated to this many characters.

    Returns:
        The cleaned text, at most ``max_chars`` characters long.
    """
    p = _Stripper()
    p.feed(html)
    # Flush anything the parser is still buffering (e.g. trailing text containing '&').
    p.close()
    text = "".join(p._parts)
    text = re.sub(r"[ \t\u00a0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines())
    text = text.strip()
    return text[:max_chars]
|
||||
@@ -0,0 +1,72 @@
|
||||
"""The §4.2 claim-extraction prompt. Prompt engineering is ours (§13.3); the schema is finalized.
|
||||
|
||||
Discipline encoded here (the whole point of the system, §2/§4.2):
|
||||
- Extract at the level of the PROPOSITION; emit ZERO when there is no substantive claim.
|
||||
- Separate topic from stance: capture stance-vs-consensus explicitly, never as a bull/bear label.
|
||||
- thesis_seam is a TAG, not a filter — off-thesis and anti-thesis claims are still extracted.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
# Hybrid topic vocabulary (§4.2): a small SEEDED controlled list. The model reuses one when it
# fits and proposes a concise snake_case topic otherwise; emergent topics are merged on a schedule.
# build_messages lists these, comma-separated, in the user prompt.
SEED_TOPICS = [
    # energy <-> compute
    "ai_compute_demand", "ai_power_constraint", "datacenter_buildout", "grid_interconnect",
    "transformers_equipment", "nuclear_power", "natural_gas_power", "uranium_supply",
    "cooling_infrastructure", "miner_flexible_load", "mining_ai_pivot",
    # debasement <-> bitcoin
    "bitcoin_reserve_asset", "bitcoin_collateral_credit", "bitcoin_treasury_strategy",
    "btc_custody_regulation", "sovereign_bitcoin_adoption",
    # ai <-> data ownership
    "ai_data_ownership", "confidential_inference", "ai_commoditization",
    # macro
    "fed_policy", "fiscal_debasement", "stablecoins_cbdc",
]
|
||||
|
||||
_SYSTEM = """You are the claim-extraction component of an investment signal engine. You read a passage \
|
||||
(an SEC filing excerpt or a podcast/earnings-call transcript) and extract structured CLAIM UNITS.
|
||||
|
||||
A CLAIM UNIT is a single normalized proposition that someone asserts — a forward-looking prediction, \
|
||||
an interpretive or causal judgment, or a stance taken against a prevailing view. It must be specific \
|
||||
enough to later be checked against the world.
|
||||
|
||||
CRITICAL DISCIPLINE — be willing to extract NOTHING:
|
||||
- Most passages contain ZERO claim units. Boilerplate, legal disclaimers, ad reads, pleasantries, \
|
||||
generic descriptions, routine financial line-items, and recitations of well-known news are NOT claims.
|
||||
- Do NOT invent claims. Do NOT emit one claim per paragraph to seem thorough. If the passage has no \
|
||||
substantive proposition, return {"claims": []}. A precise empty answer is the correct, valued output.
|
||||
- Extract at the level of the PROPOSITION: one normalized subject-assertion-object sentence each. A \
|
||||
single rich passage may yield several; a long dull one yields none.
|
||||
|
||||
For EACH claim unit, output these fields:
|
||||
- "proposition": one normalized sentence (subject-assertion-object), self-contained.
|
||||
- "topic_canonical": a concise snake_case topic for clustering. REUSE one of the provided seed topics \
|
||||
when it fits; otherwise propose a new concise snake_case label. Normalize synonyms (Fed/FOMC/rates → fed_policy).
|
||||
- "topic_raw": the topic as actually phrased in the passage.
|
||||
- "claimant": who asserts it (speaker name or the filing company). Use "unknown" if unclear.
|
||||
- "claim_type": one of interpretive | predictive | descriptive | reactive. (interpretive/predictive = \
|
||||
insight; descriptive/reactive = news echo — extract those only if clearly salient.)
|
||||
- "time_horizon": one of near | medium | long | unspecified (for predictive claims especially).
|
||||
- "confidence": the claimant's apparent conviction — one of low | med | high.
|
||||
- "engages_consensus": true ONLY if the claim explicitly argues against a stated mainstream view.
|
||||
- "counters_position": the mainstream position it argues against, or null.
|
||||
- "thesis_seam": one of energy_compute | debasement_bitcoin | ai_data_ownership | none. This is a TAG \
|
||||
for relevance only — tag off-thesis claims "none" and STILL extract them.
|
||||
- "salience": central | secondary | aside (how central the claim is to the passage).
|
||||
|
||||
Return ONLY a JSON object: {"claims": [ {...}, ... ]}. No prose, no markdown."""
|
||||
|
||||
|
||||
def build_messages(text: str, *, source_name: str, source_cluster: str | None,
                   date: str | None, kind: str) -> list[dict[str, str]]:
    """Assemble the two-message chat (system prompt + user context) for one passage.

    The user message carries a source line (name, cluster, type, date — with 'unknown' /
    'n/a' placeholders for missing values), the seed-topic list, and then the passage.
    """
    name_label = source_name or "unknown"
    cluster_label = source_cluster or "n/a"
    date_label = date or "n/a"

    source_line = f"Source: {name_label} (cluster: {cluster_label}, type: {kind}, date: {date_label}).\n"
    topics_line = f"Seed topics to reuse when they fit: {', '.join(SEED_TOPICS)}.\n\n"
    passage_block = f"PASSAGE:\n{text}"

    system_message = {"role": "system", "content": _SYSTEM}
    user_message = {"role": "user", "content": source_line + topics_line + passage_block}
    return [system_message, user_message]
|
||||
@@ -0,0 +1,69 @@
|
||||
"""Extraction worker — drains 'extract' jobs from the backfill queue (§4.2, §13.4).
|
||||
|
||||
Single sequential worker by design: extraction is the heavier serial load on the one LLM GPU.
|
||||
For each job: load the document, get its text (fetch+strip filing HTML, or read a stored transcript),
|
||||
chunk it, run the §4.2 extractor per chunk, persist 0..N claims, complete the job.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from ..backfill import queue
|
||||
from . import claims as claims_mod
|
||||
from .html_text import html_to_text
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _document_text(doc, *, user_agent: str) -> str:
    """Return the plain text of a document row.

    A stored transcript (``transcript_path``) takes precedence; otherwise a filing with a
    URL is downloaded and its HTML stripped to text.

    Args:
        doc: document row/mapping with ``doc_id``, ``transcript_path``, ``kind``, ``url``.
        user_agent: value sent as the ``User-Agent`` header when fetching a filing.

    Raises:
        ValueError: the document has neither a transcript path nor a fetchable filing URL.
        requests.HTTPError: the filing request returned an error status.
    """
    if doc["transcript_path"]:
        # Decode explicitly: the default encoding depends on the host locale, so the same
        # transcript could read differently (or fail) from one machine to the next.
        # Undecodable bytes are replaced instead of failing the whole job.
        return Path(doc["transcript_path"]).read_text(encoding="utf-8", errors="replace")
    if doc["kind"] == "filing" and doc["url"]:
        r = requests.get(doc["url"], headers={"User-Agent": user_agent}, timeout=90)
        r.raise_for_status()
        return html_to_text(r.text)
    raise ValueError(f"no text source for {doc['doc_id']} (kind={doc['kind']}, url={doc['url']})")
|
||||
|
||||
|
||||
def run_extract(conn, sc, cfg, *, limit: int = 10, max_chunks_per_doc: int = 4,
                chunk_chars: int = 18_000, lease_seconds: int = 900,
                worker_id: str = "extract-1") -> dict:
    """Drain up to ``limit`` 'extract' jobs from the queue, one at a time.

    For each leased job: load the document and its source, obtain the text, chunk it,
    extract and persist claims for the first ``max_chunks_per_doc`` chunks, stamp
    ``processed_at`` and complete the job. A missing document skips the job; any exception
    while processing marks the job failed via the queue and the loop moves on.

    Args:
        conn: SQLite connection shared with the queue.
        sc: Spark Control client handed to the backend factory.
        cfg: settings object (backend selection, ``edgar_user_agent``).
        limit: maximum number of jobs to lease in this run.
        max_chunks_per_doc: cap on chunks sent to the extractor per document.
        chunk_chars: character budget per chunk.
        lease_seconds: lease duration requested for each job.
        worker_id: identity recorded on the lease.

    Returns:
        ``{"jobs_processed": ..., "claims_written": ...}``; ``jobs_processed`` counts every
        leased job, including skipped and failed ones.
    """
    from .backends import from_config as backend_from_config

    backend = backend_from_config(cfg, sc)
    log.info("extraction backend: %s", backend.name)
    claims_mod.register_seed_topics(conn)
    processed = total_claims = 0
    while processed < limit:
        job = queue.lease_next(conn, worker_id=worker_id, job_types=["extract"], lease_seconds=lease_seconds)
        if job is None:
            break
        processed += 1
        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
        if doc is None:
            queue.skip(conn, job["job_id"], "document missing")
            continue
        src = conn.execute("SELECT * FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
        try:
            text = _document_text(doc, user_agent=cfg.edgar_user_agent)
            all_chunks = claims_mod.chunk_text(text, chunk_chars)
            chunks = all_chunks[:max_chunks_per_doc]
            if len(all_chunks) > len(chunks):
                # The cap drops the tail of long documents; leave a trace of how much.
                log.info("%s: extracting only the first %d of %d chunks",
                         doc["doc_id"], len(chunks), len(all_chunks))
            doc_claims = 0
            for idx, chunk in enumerate(chunks):
                cl = claims_mod.extract_claims_from_text(
                    backend, chunk,
                    source_name=src["name"] if src else "",
                    source_cluster=src["source_cluster"] if src else None,
                    date=doc["date"], kind=doc["kind"],
                )
                doc_claims += claims_mod.persist_claims(conn, doc=doc, source=src, claims=cl, chunk_idx=idx)
            conn.execute("UPDATE documents SET processed_at=datetime('now') WHERE doc_id=?", (doc["doc_id"],))
            conn.commit()
            queue.complete(conn, job["job_id"], output_ref=f"{doc_claims} claims / {len(chunks)} chunks")
            total_claims += doc_claims
            log.info("extracted %d claims from %s (%d chunks)", doc_claims, doc["doc_id"], len(chunks))
        except Exception as e:  # noqa: BLE001
            # Worker boundary: one bad document must not stop the drain loop.
            state = queue.fail(conn, job["job_id"], e)
            log.warning("extract failed for %s: %s (→ %s)", job["target_id"], e, state)
    return {"jobs_processed": processed, "claims_written": total_claims}
|
||||
Reference in New Issue
Block a user