Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,6 @@
|
||||
"""The scoring brain (build blueprint).
|
||||
|
||||
Stats/geometry NOMINATE candidates; the frontier model only judges/expands a pre-filtered shortlist
|
||||
(§5.1). Every count that feeds a score routes through the independence primitive (EISC), never a raw
|
||||
source count (§4.5). Every scorer reads `visible_claims` (as-of filtered), never `claims` directly.
|
||||
"""
|
||||
@@ -0,0 +1,43 @@
|
||||
"""As-of harness (§6.6 look-ahead guard).
|
||||
|
||||
Every scorer reads the `visible_claims` TEMP VIEW, never `claims` directly: at nomination time only
|
||||
claims dated <= as_of are visible, so the backtest can't reward noticing what already happened. The
|
||||
view also resolves merged canonical topics (topics.status='merged') to a stable `topic_id`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
|
||||
|
||||
class Scorer:
    """Context manager that binds a run to an as_of date and exposes `visible_claims`.

    On entry it (re)creates the `visible_claims` TEMP VIEW: every column of `claims` for rows that
    have a matching `documents` row, a non-NULL `date` and `date <= as_of`, plus a `topic_id` column
    (the topic's `merged_into` when a `topics` row with status='merged' exists for the claim's
    `topic_canonical`, otherwise the claim's own `topic_canonical`). On exit the view is dropped.

    mode='backtest' enforces strict as-of discipline; 'forward' is the live pilot. This class only
    stores `mode`; it does not branch on it.

    as_of must be an ISO date (YYYY-MM-DD). Views can't take bound parameters, so the value is
    inlined into the view DDL — it is therefore validated on entry and a ValueError is raised for
    anything that is not a strict, zero-padded calendar date.
    """

    def __init__(self, conn: sqlite3.Connection, as_of: str, *, mode: str = "backtest") -> None:
        self.conn = conn
        self.as_of = as_of
        self.mode = mode

    def _validated_as_of(self) -> str:
        """Return as_of as text after checking it is exactly YYYY-MM-DD; raise ValueError otherwise."""
        from datetime import datetime  # local import: this module's top level only imports sqlite3

        text = str(self.as_of)
        try:
            parsed = datetime.strptime(text, "%Y-%m-%d")
        except ValueError as err:
            raise ValueError(f"as_of must be an ISO date (YYYY-MM-DD), got {text!r}") from err
        # strptime tolerates non-padded fields ("2023-1-5"); those would break the lexical
        # `c.date <= as_of` comparison, so require the canonical zero-padded spelling.
        if parsed.date().isoformat() != text:
            raise ValueError(f"as_of must be an ISO date (YYYY-MM-DD), got {text!r}")
        return text

    def __enter__(self) -> "Scorer":
        as_of = self._validated_as_of()  # the only value interpolated into the script below
        self.conn.executescript(
            f"""
            DROP VIEW IF EXISTS visible_claims;
            CREATE TEMP VIEW visible_claims AS
            SELECT c.*,
                   COALESCE((SELECT t.merged_into FROM topics t
                             WHERE t.topic_canonical = c.topic_canonical AND t.status='merged'),
                            c.topic_canonical) AS topic_id
            FROM claims c
            JOIN documents d ON d.doc_id = c.doc_id
            WHERE c.date IS NOT NULL AND c.date <= '{as_of}';
            """
        )
        return self

    def __exit__(self, *exc) -> None:
        # Always remove the view; exceptions from the with-body are not suppressed (returns None).
        self.conn.execute("DROP VIEW IF EXISTS visible_claims")

    def count_visible(self) -> int:
        """Number of rows currently exposed by `visible_claims` (only valid inside the with-block)."""
        return self.conn.execute("SELECT COUNT(*) FROM visible_claims").fetchone()[0]
|
||||
@@ -0,0 +1,49 @@
|
||||
"""The quantitative bar (§5.1, §6.6) — the single gate between nomination and the frontier judge.
|
||||
|
||||
Two tiers:
|
||||
- evidence bar → clears hard gates → WRITE A LEDGER ROW (the denominator, §6.6), even if never judged.
|
||||
- promotion bar → also clears the score threshold → goes to the frontier judge.
|
||||
|
||||
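THE GLOBAL META-RULE (intended for every scorer): no candidate clears on a single source or single
cluster — EISC_adj >= 2.0 AND K_eff >= 2. This is the §2.1 anti-lonely-outlier law, enforced once.
Exception: the Job B `under_acted` gate enforces only the EISC floor; cross-cluster spread
(K_eff >= 2) raises its score but is not a hard gate there (see `_under_acted`).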
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
# Hard floors of the evidence bar.
EISC_FLOOR = 2.0
KEFF_FLOOR = 2

# Defaults; overridable via the score_thresholds table (so the backtest can sweep without code edits).
DEFAULT_MIN_SCORE = {
    "under_acted": 0.3,
    "emergence": 2.0,
    "contrarian": 1.5,
    "convergence": 2.5,
    "intersection": 2.0,
}


def _min_score(conn, scorer: str) -> float:
    """Promotion threshold for `scorer`.

    A non-NULL `min_score` in the `score_thresholds` row for this scorer wins; otherwise (no
    connection, no row, or NULL value) the DEFAULT_MIN_SCORE entry is used, and 0.0 for a scorer
    that has no default.
    """
    fallback = DEFAULT_MIN_SCORE.get(scorer, 0.0)
    if conn is None:
        return fallback
    override = conn.execute(
        "SELECT min_score FROM score_thresholds WHERE scorer=?", (scorer,)
    ).fetchone()
    if not override or override[0] is None:
        return fallback
    return float(override[0])
|
||||
|
||||
|
||||
def evaluate(scorer: str, result: dict, *, conn=None) -> tuple[bool, bool]:
    """Returns (cleared_evidence_bar, cleared_promotion_bar).

    Only the "under_acted" scorer is gated here; any other scorer name yields (False, False).
    `conn`, when given, is used to look up a per-scorer promotion threshold override.
    """
    if scorer != "under_acted":
        # Job A scorers wired with the forward pilot
        return (False, False)
    threshold = _min_score(conn, scorer)
    return _under_acted(result, threshold)
|
||||
|
||||
|
||||
def _under_acted(result: dict, min_score: float) -> tuple[bool, bool]:
    """Apply the under_acted (Job B) gate to one scored node.

    Reads `result["inputs"]` and `result["score"]`. Returns (evidence, promotion):
    evidence  = corroborated AND conviction ok AND exposure ok;
    promotion = evidence AND score >= min_score.
    A thesis-breaker node (`is_breaker`) bypasses the conviction and exposure checks, but not the
    corroboration check.
    """
    inputs = result["inputs"]
    is_breaker = bool(inputs.get("is_breaker"))

    # §4.4 Job B = "rising INDEPENDENT corroboration". EISC>=2.0 enforces independence (shared-guest +
    # same-cluster discounting), so this is NOT an isolated point or one-guest echo (§2.1). Cross-cluster
    # (k_eff>=2) is the §4.5 GOLD for Job A DISCOVERY — NOT a hard gate for Job B corroboration: N
    # independent energy companies confirming a power thesis is real corroboration. Cross-cluster still
    # BOOSTS the score (eisc_corrob = eisc_adj includes the xcluster_mult) so cross-cluster ranks first.
    corroborated = (
        inputs.get("n_confirmed", 0) >= 4
        and inputs.get("n_src", 0) >= 2
        and inputs.get("eisc_corrob", 0.0) >= EISC_FLOOR
        and inputs.get("a_corrob", 0.0) > 0
    )

    # med-high / high conviction, unless the node is a thesis breaker
    conviction_ok = True if is_breaker else inputs.get("conviction_weight", 0.0) >= 0.7
    # genuine exposure gap, unless the node is a thesis breaker
    exposure_ok = True if is_breaker else inputs.get("exposure") in ("none", "lt2")

    cleared_evidence = corroborated and conviction_ok and exposure_ok
    cleared_promotion = cleared_evidence and result["score"] >= min_score
    return cleared_evidence, cleared_promotion
|
||||
@@ -0,0 +1,86 @@
|
||||
"""Pre-registered confusion matrix on the §7.1 derivatives (DESIGN_v2 §1.3).
|
||||
|
||||
Measures PRECISION and RECALL, not recall alone. Uses the engine's already-stored candidate_scores
|
||||
(cleared_date + whisper_date) × the pre-registered external repricing (resolution.K2023.yaml). Reports
|
||||
the matrix at BOTH the cleared level (what the engine fired) and the whisper level (what it saw before
|
||||
the independence floor) — the delta is the empirical answer to the gate debate.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
import yaml
|
||||
|
||||
from .external import basket_index, fetch_eod, resolve_reprice, runway_at_signal
|
||||
|
||||
|
||||
def _engine_dates(conn) -> dict[str, dict]:
    """For each under_acted node: earliest cleared as_of and earliest whisper as_of (n_conf>=4, a>0).

    Reads every `candidate_scores` row with scorer='under_acted' (rows are accessed by column name,
    so the connection needs a name-addressable row factory such as sqlite3.Row). Rows are keyed by
    `node_id`, falling back to `conviction_id`. Returns {key: {"cleared": date|None,
    "whisper": date|None}}; dates are compared as strings, so the earliest is the lexical minimum.

    `inputs_json` that is NULL, not valid JSON (the writer caps it at 8000 characters, which can
    cut the document mid-token) or not a JSON object is treated as "no inputs": the row can still
    set `cleared`, but cannot qualify as a whisper. One damaged row no longer aborts the whole pass.
    """
    rows = conn.execute(
        "SELECT node_id, conviction_id, as_of, cleared_evidence_bar ev, inputs_json "
        "FROM candidate_scores WHERE scorer='under_acted'"
    ).fetchall()
    out: dict[str, dict] = {}
    for r in rows:
        key = r["node_id"] or r["conviction_id"]
        try:
            inputs = json.loads(r["inputs_json"])
        except (TypeError, ValueError):  # NULL column / truncated or malformed JSON
            inputs = {}
        if not isinstance(inputs, dict):
            inputs = {}

        dates = out.setdefault(key, {"cleared": None, "whisper": None})
        as_of = r["as_of"]
        if r["ev"] and (dates["cleared"] is None or as_of < dates["cleared"]):
            dates["cleared"] = as_of
        # `or 0` also covers a stored JSON null, which would otherwise raise on the comparison.
        if (inputs.get("n_confirmed") or 0) >= 4 and (inputs.get("a_corrob") or 0) > 0:
            if dates["whisper"] is None or as_of < dates["whisper"]:
                dates["whisper"] = as_of
    return out
|
||||
|
||||
|
||||
def _lead_days(repricing_date: str, signal_date: str | None) -> int | None:
    """Whole days from the signal to the repricing (positive = the signal came first).

    Both dates are YYYY-MM-DD strings; returns None when either one is missing or empty.
    """
    if not (signal_date and repricing_date):
        return None
    fmt = "%Y-%m-%d"
    repriced_on = datetime.strptime(repricing_date, fmt)
    signalled_on = datetime.strptime(signal_date, fmt)
    return (repriced_on - signalled_on).days
|
||||
|
||||
|
||||
def run_confusion(conn, cfg, spec_path: str) -> dict:
    """Build the pre-registered confusion matrix for the baskets named in a YAML spec.

    The spec at `spec_path` supplies `window` (start/end), `rule` (threshold_pct, hold_pct,
    hold_days) and `baskets` ({node: [symbols]}). For every basket this fetches EOD prices (each
    symbol once, using `cfg.fmp_api_key`), builds the basket index, applies the repricing rule and
    pairs the outcome with the engine's earliest cleared / whisper dates for that node.

    Returns a dict with:
      rows     - one dict per basket (outcome, engine dates, lead days and runway; the lead/runway
                 fields are None unless the repricing was confirmed)
      cleared  - (counts, precision, recall) at the cleared level
      whisper  - (counts, precision, recall) at the whisper level
      classify - the row-level function mapping (row, level) to "TP" / "FP" / "FN" / "TN"
    Precision / recall are None when their denominator is zero.
    """
    # Context manager: the spec file handle is closed even if YAML parsing raises.
    with open(spec_path, encoding="utf-8") as fh:
        spec = yaml.safe_load(fh)
    w, rule = spec["window"], spec["rule"]
    engine = _engine_dates(conn)
    price_cache: dict[str, list] = {}  # symbol -> series, so a symbol shared by baskets is fetched once

    rows = []
    for node, basket in spec["baskets"].items():
        prices = {}
        for sym in basket:
            if sym not in price_cache:
                price_cache[sym] = fetch_eod(cfg.fmp_api_key, sym, w["start"], w["end"])
            prices[sym] = price_cache[sym]
        missing = [s for s in basket if not prices[s]]  # symbols with no price data in the window
        idx = basket_index(prices)
        res = resolve_reprice(idx, threshold_pct=rule["threshold_pct"], hold_pct=rule["hold_pct"],
                              hold_days=rule["hold_days"])
        ed = engine.get(node, {"cleared": None, "whisper": None})
        confirmed = res["confirmed"]
        rows.append({
            "node": node, "basket": basket, "missing": missing,
            "confirmed": confirmed, "repricing_date": res["repricing_date"], "peak_pct": res["peak_pct"],
            "cleared_date": ed["cleared"], "whisper_date": ed["whisper"],
            "lead_cleared": _lead_days(res["repricing_date"], ed["cleared"]) if confirmed else None,
            "lead_whisper": _lead_days(res["repricing_date"], ed["whisper"]) if confirmed else None,
            # DESIGN_v2.1 Correction A: runway = fraction of the durable move still ahead at signal
            "runway_cleared": runway_at_signal(idx, ed["cleared"]) if confirmed else None,
            "runway_whisper": runway_at_signal(idx, ed["whisper"]) if confirmed else None,
        })

    def classify(r, level):
        """Label one row at `level` ('cleared' or 'whisper'): fired = has a date, real = confirmed."""
        fired = bool(r[f"{level}_date"])
        real = r["confirmed"]
        return "TP" if (fired and real) else "FP" if (fired and not real) else "FN" if real else "TN"

    def matrix(level):
        """Tally the four cells over all rows; returns (counts, precision, recall)."""
        c = {"TP": 0, "FP": 0, "FN": 0, "TN": 0}
        for r in rows:
            c[classify(r, level)] += 1
        p = c["TP"] / (c["TP"] + c["FP"]) if (c["TP"] + c["FP"]) else None
        rec = c["TP"] / (c["TP"] + c["FN"]) if (c["TP"] + c["FN"]) else None
        return c, p, rec

    return {"rows": rows, "cleared": matrix("cleared"), "whisper": matrix("whisper"),
            "classify": classify}
|
||||
@@ -0,0 +1,96 @@
|
||||
"""External-confirmation data for the resolver (DESIGN_v2 §1). Price series via FMP (already paid for).
|
||||
|
||||
This is the *resolving* leg (§6.2): real-world repricing, not discourse. Kept deliberately simple and
|
||||
transparent — the resolution rule is pre-registered, so the code here only fetches + applies it.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import requests
|
||||
|
||||
_FMP = "https://financialmodelingprep.com"
|
||||
|
||||
|
||||
def fetch_eod(api_key: str, symbol: str, start: str, end: str) -> list[tuple[str, float]]:
    """Daily (date, close) for a symbol. Tries the FMP 'stable' then legacy 'v3' price endpoints.

    Best-effort: a non-200 response, a request/JSON failure, or a payload with no usable rows
    moves on to the next endpoint. Returns the first non-empty series, sorted ascending, with the
    date cut to its first 10 characters and `close` preferred over `adjClose`; returns [] when no
    endpoint yields data.
    """
    attempts = [
        (f"{_FMP}/stable/historical-price-eod/full", {"symbol": symbol, "from": start, "to": end}),
        (f"{_FMP}/api/v3/historical-price-full/{symbol}", {"from": start, "to": end}),
    ]
    # Use the session as a context manager so its pooled connections are released on every exit
    # path (early return included) instead of being left open.
    with requests.Session() as s:
        for url, params in attempts:
            try:
                r = s.get(url, params={**params, "apikey": api_key}, timeout=40)
                if r.status_code != 200:
                    continue
                j = r.json()
            except Exception:  # noqa: BLE001
                continue
            # A dict payload carries its rows under "historical"; any other payload is used as the rows.
            rows = j.get("historical") if isinstance(j, dict) else j
            if not rows:
                continue
            out = [(x["date"][:10], x.get("close") or x.get("adjClose")) for x in rows
                   if x.get("date") and (x.get("close") or x.get("adjClose"))]
            if out:
                return sorted(out)
    return []
|
||||
|
||||
|
||||
def basket_index(prices_by_symbol: dict[str, list[tuple[str, float]]]) -> list[tuple[str, float]]:
    """Equal-weight basket index, as a date-sorted list of (date, value).

    Each symbol is rebased to its own first close; the index value on a date is the plain mean of
    the rebased values of the symbols that have data on that date. (Symbols that IPO'd mid-window
    enter at 1.0 when they start — flagged by the caller.) Symbols with an empty series, or whose
    first close is falsy, contribute nothing.
    """
    rebased_by_symbol = {}
    for symbol, closes in prices_by_symbol.items():
        if not closes:
            continue
        first_close = closes[0][1]
        rebased = {}
        if first_close:
            for day, close in closes:
                rebased[day] = close / first_close
        rebased_by_symbol[symbol] = rebased

    all_days = set()
    for rebased in rebased_by_symbol.values():
        all_days.update(rebased)

    index = []
    for day in sorted(all_days):
        present = [rebased[day] for rebased in rebased_by_symbol.values() if day in rebased]
        if present:
            index.append((day, sum(present) / len(present)))
    return index
|
||||
|
||||
|
||||
def index_value_at(index: list[tuple[str, float]], date: str | None) -> float | None:
    """Latest index value on or before `date` (baseline if the signal predates the data).

    "Latest" is the last entry in list order whose date is <= `date`. Returns None when the index
    is empty or `date` is missing.
    """
    if not (index and date):
        return None
    matched = False
    latest = None
    for day, value in index:
        if day <= date:
            matched = True
            latest = value
    # No entry on or before `date`: fall back to the first (baseline) value.
    return latest if matched else index[0][1]
|
||||
|
||||
|
||||
def runway_at_signal(index: list[tuple[str, float]], signal_date: str | None) -> float | None:
    """Fraction of the durable move STILL AHEAD at the signal date (DESIGN_v2.1 Correction A).

    Computed as (peak - value_at_signal) / (peak - baseline), floored at 0.0 and rounded to two
    decimals, where baseline is the first index value and peak the maximum over the whole index.
    1.0 = whole move ahead (signal before it); 0.0 = signal at the peak. The right metric for a
    long-duration holder — a modestly-late signal with most of the move ahead is still actionable.
    Returns None when the index or signal date is missing, or when the peak does not exceed the
    baseline.
    """
    if not index or not signal_date:
        return None
    start_value = index[0][1]
    top_value = max(value for _, value in index)
    at_signal = index_value_at(index, signal_date)
    no_move = top_value <= start_value
    if no_move or at_signal is None:
        return None
    remaining = (top_value - at_signal) / (top_value - start_value)
    return round(max(0.0, remaining), 2)
|
||||
|
||||
|
||||
def resolve_reprice(index: list[tuple[str, float]], *, threshold_pct: float, hold_pct: float,
                    hold_days: int) -> dict:
    """Apply the pre-registered rule: first date the index is ≥ +threshold% vs baseline AND still
    ≥ +hold% `hold_days` later. Returns {confirmed, repricing_date, peak_pct}.

    Baseline is the first index value. "`hold_days` later" means the first entry, in list order,
    dated on or after candidate date + hold_days; a candidate with no such entry is not confirmed.
    (This presumes the index is date-sorted, as `basket_index` returns it.) `peak_pct` is the
    maximum index value as a percent gain over baseline, rounded to one decimal. With an empty
    index, or a zero/missing baseline (percent moves undefined), nothing is confirmed and
    `peak_pct` is None.
    """
    from datetime import datetime, timedelta

    if not index:
        return {"confirmed": False, "repricing_date": None, "peak_pct": None}
    base = index[0][1]
    if not base:
        # Guard the divisions below: a zero baseline previously raised ZeroDivisionError.
        return {"confirmed": False, "repricing_date": None, "peak_pct": None}

    thr = 1.0 + threshold_pct / 100.0
    hold = 1.0 + hold_pct / 100.0
    peak = max(v for _, v in index)
    peak_pct = round((peak / base - 1) * 100, 1)
    hold_window = timedelta(days=hold_days)

    for d, v in index:
        if v / base >= thr:
            target = (datetime.strptime(d, "%Y-%m-%d") + hold_window).strftime("%Y-%m-%d")
            # Only the first entry on/after the target date matters; stop scanning once found.
            held = next((vv for dd, vv in index if dd >= target), None)
            if held is not None and held / base >= hold:
                return {"confirmed": True, "repricing_date": d, "peak_pct": peak_pct}
    return {"confirmed": False, "repricing_date": None, "peak_pct": peak_pct}
|
||||
@@ -0,0 +1,113 @@
|
||||
"""Effective Independent Source Count (EISC) — the system's differentiator (§4.5).
|
||||
|
||||
Discount convergence by source connectedness. Five shows that "independently converge" but share one
|
||||
guest must count as ~one voice; three shows across macro/energy/ai with no shared guests are gold.
|
||||
|
||||
Method (resolved in the design panel): noisy-OR connectedness matrix + inverse-row-sum EISC.
|
||||
- symmetric & order-independent (unlike a sequential pairwise-penalty walk)
|
||||
- each source's contribution is individually explainable ("counts 0.31 because connected to 3 others")
|
||||
- collapses correctly: 5 clones -> ~1.0 ; 5 cross-cluster independents -> ~5.0 (raw)
|
||||
- no eigensolve (unstable at n=2..4, our common case)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Coupling per edge type: a voiceprint-confirmed shared guest is near-total redundancy on a topic.
KAPPA = {"shared_guest": 0.85, "citation": 0.45, "community": 0.60}
# Same-cluster baseline correlation (sources in the same world are partly redundant even w/o an edge).
CLUSTER_COUPLING = {"bitcoin": 0.55, "vc_consensus": 0.35}
SAME_CLUSTER_DEFAULT = 0.25
EDGE_CLAMP = 0.95  # cap kappa*weight so a heavily-weighted edge can't exceed near-total
CAP_VALUE = 0.25  # §4.5: bitcoin / capped sources contribute at most 0.25 of a voice
CLUSTER_MIN_CONTRIB = 0.5  # a cluster must add >= half an independent voice to count toward K_eff


def effective_independent_N(srcs: list[tuple], edges: list[tuple], *, mode: str = "live") -> dict:
    """srcs: [(source_id, source_cluster, cluster_capped_low[, own_network])]; edges: [(a,b,type,weight)].

    mode='live' (default) DROPS own_network sources (Ten31's own orbit — listening to ourselves, §v2.1);
    mode='test' keeps them (the reflexivity test fixture). Returns {eisc_adj, eisc_raw, k_eff, ...}.

    Method: build a symmetric connectedness matrix C (diagonal 1.0) where each pair's entry is the
    noisy-OR of its edge channel and its same-cluster baseline; a source contributes
    cap / rowsum(C) (cap = CAP_VALUE for capped or bitcoin-cluster sources, else 1.0); `eisc_raw`
    is the sum of contributions. `k_eff` counts clusters whose NON-capped contributions sum to at
    least CLUSTER_MIN_CONTRIB; `xcluster_mult` = max(1.0, 1 + 0.5 * (k_eff - 1)) and
    `eisc_adj` = xcluster_mult * eisc_raw. `per_source_contrib` is rounded to 4 decimals.

    Input hygiene: a source_id listed more than once is one voice (first occurrence wins), and an
    edge's kappa*weight is clamped to [0, EDGE_CLAMP] so a negative weight cannot produce negative
    connectedness (and hence a contribution above one voice). An edge with weight None uses 1.0;
    edges with an endpoint outside `srcs`, self-edges and unknown edge types add no coupling.
    """
    if mode == "live":
        srcs = [s for s in srcs if not (len(s) > 3 and s[3])]

    # De-duplicate by source_id: counting the same source twice would inflate the independent count.
    by_id: dict = {}
    for s in srcs:
        by_id.setdefault(s[0], s)
    ids = list(by_id)
    n = len(ids)
    if n == 0:
        return {"eisc_adj": 0.0, "eisc_raw": 0.0, "k_eff": 0, "xcluster_mult": 1.0, "per_source_contrib": {}}
    cluster = {sid: s[1] for sid, s in by_id.items()}
    capped = {sid: (bool(s[2]) or s[1] == "bitcoin") for sid, s in by_id.items()}

    # edge channel: combine all edges between a pair by noisy-OR product of (1 - kappa*weight)
    pair_factor: dict = defaultdict(lambda: 1.0)
    for a, b, etype, w in edges:
        if a in by_id and b in by_id and a != b:
            strength = KAPPA.get(etype, 0.0) * (w if w is not None else 1.0)
            term = max(0.0, min(EDGE_CLAMP, strength))
            pair_factor[frozenset((a, b))] *= (1.0 - term)

    C = np.eye(n)
    for i in range(n):
        for j in range(i + 1, n):
            a, b = ids[i], ids[j]
            e = 1.0 - pair_factor[frozenset((a, b))]  # 0 if no edge
            ci, cj = cluster[a], cluster[b]
            clust = (CLUSTER_COUPLING.get(ci, SAME_CLUSTER_DEFAULT)
                     if (ci is not None and ci == cj) else 0.0)
            c = 1.0 - (1.0 - e) * (1.0 - clust)
            C[i, j] = C[j, i] = c

    rowsum = C.sum(axis=1)  # includes the diagonal 1.0
    contrib, eisc_raw = {}, 0.0
    cluster_mass: dict = defaultdict(float)
    for i, sid in enumerate(ids):
        cap = CAP_VALUE if capped[sid] else 1.0
        contrib[sid] = cap * (1.0 / rowsum[i])
        eisc_raw += contrib[sid]
        if not capped[sid] and cluster[sid]:
            cluster_mass[cluster[sid]] += contrib[sid]

    # cross-cluster bonus: count NON-capped clusters that genuinely contribute an independent voice
    # (summed contribution >= half a voice). This stops "one guest across many clusters" from earning
    # the gold multiplier — the raw EISC already collapses that guest to ~1, and k_eff must agree.
    k_eff = sum(1 for m in cluster_mass.values() if m >= CLUSTER_MIN_CONTRIB)
    xmult = max(1.0, 1.0 + 0.5 * (k_eff - 1))  # 1clu->1.0, 2->1.5, 3->2.0 (gold)
    return {
        "eisc_adj": xmult * eisc_raw,
        "eisc_raw": eisc_raw,
        "k_eff": k_eff,
        "xcluster_mult": xmult,
        "per_source_contrib": {k: round(v, 4) for k, v in contrib.items()},
    }
|
||||
|
||||
|
||||
# --- DB helpers (the brain only READS the graph; edges are produced upstream by the voiceprint lib) ---
|
||||
def load_source_meta(conn, ids: list[str]) -> list[tuple]:
    """Fetch (source_id, source_cluster, cluster_capped_low, own_network) for the given ids.

    Ids are de-duplicated (order kept) before querying `sources`; a NULL own_network is returned
    as 0. Ids with no row are simply absent from the result; an empty id list returns [].
    """
    wanted = list(dict.fromkeys(ids))
    if not wanted:
        return []
    placeholders = ",".join("?" for _ in wanted)
    sql = (
        "SELECT source_id, source_cluster, cluster_capped_low, COALESCE(own_network,0) "
        f"FROM sources WHERE source_id IN ({placeholders})"
    )
    fetched = conn.execute(sql, wanted).fetchall()
    return [(row[0], row[1], row[2], row[3]) for row in fetched]
|
||||
|
||||
|
||||
def load_edges(conn, ids: list[str]) -> list[tuple]:
    """Fetch the (src_a, src_b, edge_type, weight) rows of `source_edges` whose BOTH endpoints are
    in `ids`. Ids are de-duplicated (order kept) first; an empty id list returns []."""
    wanted = list(dict.fromkeys(ids))
    if not wanted:
        return []
    placeholders = ",".join("?" for _ in wanted)
    sql = f"SELECT src_a, src_b, edge_type, weight FROM source_edges WHERE src_a IN ({placeholders}) AND src_b IN ({placeholders})"
    # The id list is bound twice: once for each IN (...) clause.
    fetched = conn.execute(sql, wanted + wanted).fetchall()
    return [(row[0], row[1], row[2], row[3]) for row in fetched]
|
||||
|
||||
|
||||
def eisc_for(conn, source_ids: list[str], *, mode: str = "live") -> dict:
    """Convenience: EISC for a set of source_ids, loading cluster/cap/own_network + edges from SQLite.

    mode='live' drops own_network sources; mode='test' keeps them (§v2.1 condition 1). Duplicate
    ids are collapsed before loading.
    """
    unique_ids = list(dict.fromkeys(source_ids))
    source_rows = load_source_meta(conn, unique_ids)
    edge_rows = load_edges(conn, unique_ids)
    return effective_independent_N(source_rows, edge_rows, mode=mode)
|
||||
@@ -0,0 +1,49 @@
|
||||
"""Ledger + candidate_scores writers. Log EVERY bar-clearer from day one (§6.6 denominator).
|
||||
|
||||
date_logged = as_of (backtest rows carry historical dates so lead-time math is correct). The
|
||||
discourse_metric JSON is FROZEN here at log time — the resolver (separate forward pass) never edits it.
|
||||
Grant's rating lives in human_evaluations; the model never reads it pre-log (§6.7).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
|
||||
|
||||
def _sig_id(scorer: str, key: str, as_of: str) -> str:
    """Deterministic ledger signal id: "sig_" + the first 16 hex chars of SHA-1("scorer|key|as_of")."""
    material = "{}|{}|{}".format(scorer, key, as_of).encode()
    digest = hashlib.sha1(material).hexdigest()
    return "sig_" + digest[:16]
|
||||
|
||||
|
||||
def _score_id(scorer: str, key: str, as_of: str) -> str:
    """Deterministic candidate_scores id: the full SHA-1 hex digest of "cs|scorer|key|as_of"."""
    material = "cs|{}|{}|{}".format(scorer, key, as_of).encode()
    return hashlib.sha1(material).hexdigest()
|
||||
|
||||
|
||||
def record_candidate_score(conn, result: dict, as_of: str, evidence: bool, promotion: bool) -> None:
    """Upsert one `candidate_scores` row for a scored candidate and commit.

    The row id is derived from (scorer, key, as_of), where key is the first non-empty of node_id,
    conviction_id, topic_canonical (else ""); re-scoring the same candidate for the same as_of
    therefore REPLACES the earlier row. The two bar flags are stored as 0/1.
    """
    key = (
        result.get("node_id")
        or result.get("conviction_id")
        or result.get("topic_canonical")
        or ""
    )
    scorer = result["scorer"]
    row_id = _score_id(scorer, key, as_of)
    # NOTE(review): slicing the serialized JSON at 8000 chars can cut it mid-token, leaving text
    # that is no longer valid JSON for readers that json.loads() this column — confirm the cap.
    inputs_blob = json.dumps(result["inputs"])[:8000]
    row = (
        row_id, scorer, as_of,
        result.get("topic_canonical"), result.get("node_id"), result.get("conviction_id"),
        result["score"], int(evidence), int(promotion), inputs_blob,
    )
    conn.execute(
        """INSERT OR REPLACE INTO candidate_scores
        (score_id, scorer, as_of, topic_canonical, node_id, conviction_id, score,
        cleared_evidence_bar, cleared_promotion_bar, inputs_json)
        VALUES (?,?,?,?,?,?,?,?,?,?)""",
        row,
    )
    conn.commit()
|
||||
|
||||
|
||||
def log_candidate(conn, *, scorer: str, as_of: str, ledger_type: str, proposition: str,
                  discourse_metric: dict, origin_conviction_id=None, origin_node_id=None) -> str:
    """Insert a ledger row for a bar-clearing candidate (if not already logged), commit, and return
    its signal_id.

    The signal_id is derived from (scorer, key, as_of), key being the first non-empty of
    origin_node_id, origin_conviction_id, proposition. INSERT OR IGNORE means an existing row with
    that id is left untouched — the stored discourse_metric stays frozen at first log.
    date_logged is as_of; model_confidence is written as NULL. The proposition is capped at 1000
    characters and the serialized discourse_metric (with "scorer" added) at 8000.
    """
    identity = origin_node_id or origin_conviction_id or proposition
    signal_id = _sig_id(scorer, identity, as_of)
    frozen_metric = dict(discourse_metric)
    frozen_metric["scorer"] = scorer
    row = (
        signal_id, ledger_type, proposition[:1000], as_of, json.dumps(frozen_metric)[:8000], None,
        origin_conviction_id, origin_node_id,
    )
    conn.execute(
        """INSERT OR IGNORE INTO ledger
        (signal_id, type, proposition, date_logged, discourse_metric, model_confidence,
        origin_conviction_id, origin_node_id)
        VALUES (?,?,?,?,?,?,?,?)""",
        row,
    )
    conn.commit()
    return signal_id
|
||||
@@ -0,0 +1,80 @@
|
||||
"""Local-LLM scoring helpers (§4.4). Bounded labeling passes over PRE-FILTERED candidates only —
|
||||
never nomination from the raw corpus (§5.1). JSON mode, temp 0, no thinking → deterministic.
|
||||
|
||||
Helper #2 (derivative-relevance) is built first — it's the one the §7.1 backtest needs. Helper #1
|
||||
(stance-folding for Job A contrarian) comes with the forward pilot.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
_REL_SYS = (
|
||||
"You assess whether claims corroborate a specific investment hypothesis (a 2nd/3rd-order "
|
||||
"derivative of a thesis). For EACH claim decide: does it provide real-world evidence that the "
|
||||
"hypothesis is PLAYING OUT (corroborates), and the direction. 'affirms' = supports the hypothesis; "
|
||||
"'contradicts' = is evidence against it; 'tangential' = same topic words but not actually about the "
|
||||
"hypothesis (e.g. 'transformers' the ML architecture vs the electrical-grid kind). Be strict: a "
|
||||
"passing mention is tangential, not corroboration. "
|
||||
"TWO HARD RULES (these are the difference between catching a real signal and being fooled):\n"
|
||||
"1) REALIZED-ONLY. The hypothesis must be PLAYING OUT in fact. Announcements, plans, intentions, "
|
||||
"forecasts, targets, and 'may/will/expects/poised-to/aims-to/up-to' language are NOT corroboration — "
|
||||
"they are 'tangential' unless the claim states the thing has ACTUALLY HAPPENED / been DEPLOYED / "
|
||||
"closed. A $2B program 'announced' or capital 'made available' is NOT capital deployed. A company "
|
||||
"that 'may consider' or 'expects' something has not done it.\n"
|
||||
"2) ROLE-MATCH. The actor in the claim must occupy the role the hypothesis is about. If the "
|
||||
"hypothesis is that capital PROVIDERS are funding/supplying something, then a BORROWER or USER on the "
|
||||
"demand side (e.g. a firm posting an asset AS collateral to RECEIVE a loan) is the wrong side of the "
|
||||
"transaction → 'tangential' to that hypothesis, not 'affirms'. "
|
||||
'Return ONLY JSON: {"results":[{"claim_id":"...","corroborates":true|false,'
|
||||
'"direction":"affirms"|"contradicts"|"tangential"}]}.'
|
||||
)
|
||||
|
||||
|
||||
def _parse(raw: str) -> list[dict]:
    """Extract the usable result records from a model reply.

    Tries the reply as JSON; if that fails, retries on the span from the first "{" to the last
    "}" (to shed any surrounding chatter). Returns the dict entries of the top-level "results"
    list that carry a non-empty "claim_id"; returns [] when nothing parses or the top level is
    not a JSON object.
    """
    try:
        payload = json.loads(raw)
    except Exception:
        start = raw.find("{")
        end = raw.rfind("}")
        if start < 0 or end < 0:
            return []
        try:
            payload = json.loads(raw[start:end + 1])
        except Exception:
            return []

    if not isinstance(payload, dict):
        return []
    records = []
    for item in payload.get("results", []):
        if isinstance(item, dict) and item.get("claim_id"):
            records.append(item)
    return records
|
||||
|
||||
|
||||
def derivative_relevance(backend, derivative: str, claims: list[dict]) -> dict[str, dict]:
    """claims: [{claim_id, proposition}]. Returns {claim_id: {corroborates, direction}}.

    Filters retrieval near-misses; it cannot ADD claims search didn't return (not a nominator).
    Makes at most two `backend.complete_json` calls (the second only if the first reply yields no
    parsable records). An empty `claims` list returns {} without calling the backend; a claim the
    model did not return is absent from the result. A missing direction defaults to "tangential".
    """
    if not claims:
        return {}

    bullet_lines = [f"- [{c['claim_id']}] {c['proposition']}" for c in claims]
    listing = "\n".join(bullet_lines)
    user = (f"HYPOTHESIS (derivative): {derivative}\n\nCLAIMS:\n{listing}\n\n"
            f"Judge each claim id.")
    messages = [
        {"role": "system", "content": _REL_SYS},
        {"role": "user", "content": user},
    ]

    # Output is ~one JSON record per claim (claim_id + corroborates + direction ≈ 70-100 tokens). At
    # top_k=60 that's ~5k tokens — a fixed 3000 budget truncated mid-array → empty parse → a node
    # silently zeroed (the source of the unstable 5-affirm/0-affirm flip). Size the budget to the batch.
    budget = max(3000, 120 * len(claims) + 500)

    records = []
    for attempt_no in (1, 2):  # one retry — a gateway-under-load truncation shouldn't zero out a node
        raw = backend.complete_json(messages, max_tokens=budget)
        records = _parse(raw)
        if records:
            break
        log.warning("derivative_relevance empty parse (attempt %d) for %r; raw[:160]=%r",
                    attempt_no, derivative[:50], raw[:160])

    # The listing presents ids as `- [{claim_id}] ...`; the model INCONSISTENTLY echoes the id back with
    # the surrounding brackets ("[edgar:...]") — which then misses the bracket-less lookup key and the
    # whole node reads as 0/(missing). Normalize the brackets+whitespace so matching is robust either way.
    verdicts = {}
    for record in records:
        claim_id = str(record["claim_id"]).strip().strip("[]").strip()
        verdicts[claim_id] = {
            "corroborates": bool(record.get("corroborates")),
            "direction": record.get("direction", "tangential"),
        }
    return verdicts
|
||||
@@ -0,0 +1,27 @@
|
||||
"""Resolver — the SEPARATE forward pass that closes the loop (§6.2, §6.3).
|
||||
|
||||
ARCHITECTURALLY ISOLATED from the scorers: it has no shared write path with them. Scorers write
|
||||
candidate_scores + ledger rows with outcome columns NULL and a FROZEN discourse_metric. The resolver
|
||||
runs later (larger as_of), reads ledger rows whose date_logged < as_of_now, and writes ONLY
|
||||
resolution_date / discourse_outcome / external_outcome / lead_time_days. It is FORBIDDEN from touching
|
||||
discourse_metric — that is the structural reason the ledger can't reward noticing what already happened.
|
||||
|
||||
Implementation note: real resolutions need forward time (the clock can't be backfilled). For the
|
||||
backtest, the discourse leg can be resolved by re-running the discourse metric forward from date_logged;
|
||||
the external leg (price/filings/human check, §6.5) is filled as that evidence arrives. Stubbed now to
|
||||
lock the architecture; filled out for the forward pilot.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
def resolve_discourse_leg(conn, sc, cfg, *, as_of_now: str) -> int:
    """For each ledger row logged before as_of_now without a resolution, re-measure discourse forward
    and set discourse_outcome + lead_time. (Forward-only; never reads/edits discourse_metric.)

    Returns count resolved. STUB — implemented for the forward pilot: today it only selects the
    pending rows and always returns 0 without writing anything; `sc` and `cfg` are unused.
    """
    pending_sql = (
        "SELECT signal_id, date_logged FROM ledger WHERE resolution_date IS NULL AND date_logged < ?"
    )
    _pending = conn.execute(pending_sql, (as_of_now,)).fetchall()
    # TODO(forward-pilot): re-run windowed independence from date_logged→as_of_now for each row's
    # origin derivative; set discourse_outcome in {up_cross_cluster,up_single_cluster,flat,down}.
    return 0
|
||||
@@ -0,0 +1,81 @@
|
||||
"""Scoring orchestrator. For Job B / the §7.1 backtest: march as_of dates, score every conviction +
|
||||
fan-out derivative, gate, log the denominator, promote nodes.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from ..extract.backends import from_config as backend_from_config
|
||||
from . import bar, under_acted
|
||||
from .asof import Scorer
|
||||
from .ledger_writer import log_candidate, record_candidate_score
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _nodes_for(conn, as_of, mode, conviction_ids):
    """List the nodes to score: every selected conviction first, then every fan-out derivative.

    Each node is a dict {conviction_id, node_id, derivative, level, exposure, breaker}. Conviction
    rows have node_id None and use their thematic proposition as the derivative; fan-out rows take
    level / exposure / breaker from their parent conviction. A non-empty `conviction_ids` restricts
    both queries to those convictions (None or empty selects all). Only in mode 'forward' are
    fan-out nodes additionally limited to created_at <= as_of.
    """
    def as_node(conviction_id, node_id, derivative, level, exposure, breaker):
        return {"conviction_id": conviction_id, "node_id": node_id, "derivative": derivative,
                "level": level, "exposure": exposure, "breaker": bool(breaker)}

    selected = list(conviction_ids) if conviction_ids else []
    placeholders = ",".join("?" for _ in selected)

    # 1) the convictions themselves
    conviction_sql = (
        "SELECT conviction_id, thematic_proposition, conviction_level, current_exposure, is_thesis_breaker "
        "FROM conviction_log"
    )
    if selected:
        conviction_sql += f" WHERE conviction_id IN ({placeholders})"
    nodes = [
        as_node(cid, None, proposition, level, exposure, breaker)
        for cid, proposition, level, exposure, breaker in conn.execute(conviction_sql, list(selected))
    ]

    # 2) their fan-out derivatives
    fanout_sql = (
        "SELECT f.node_id, f.parent_conviction_id, f.derivative_proposition, c.conviction_level, "
        "c.current_exposure, c.is_thesis_breaker FROM fanout_nodes f "
        "JOIN conviction_log c ON c.conviction_id = f.parent_conviction_id"
    )
    filters, fanout_params = [], []
    if selected:
        filters.append(f"f.parent_conviction_id IN ({placeholders})")
        fanout_params.extend(selected)
    if mode == "forward":  # backtest uses the seeded tree as the as-of-2023 hypothesis (no created_at leak)
        filters.append("f.created_at <= ?")
        fanout_params.append(as_of)
    if filters:
        fanout_sql += " WHERE " + " AND ".join(filters)
    for node_id, parent_id, proposition, level, exposure, breaker in conn.execute(fanout_sql, fanout_params):
        nodes.append(as_node(parent_id, node_id, proposition, level, exposure, breaker))
    return nodes
|
||||
|
||||
|
||||
def run_under_acted(conn, sc, cfg, *, as_of, mode="backtest", conviction_ids=None, window_days=28) -> list[dict]:
    """Score, gate and log every selected node for one as_of date.

    Inside a `Scorer` context bound to `as_of`, each node from `_nodes_for` is scored with
    `under_acted.score_node` and passed through `bar.evaluate`; its candidate_scores row is always
    recorded. A node that clears the evidence bar also gets a ledger row and, when it is a fan-out
    node, its `fanout_nodes.status` is set to "signal" (promotion cleared) or "corroborated".

    Returns one {"node", "result", "evidence", "promotion"} dict per node, in scoring order.
    """
    backend = backend_from_config(cfg, sc)
    scored = []
    with Scorer(conn, as_of, mode=mode):
        for node in _nodes_for(conn, as_of, mode, conviction_ids):
            node_id = node["node_id"]
            conviction_id = node["conviction_id"]
            result = under_acted.score_node(
                conn, sc, backend,
                as_of=as_of,
                derivative=node["derivative"],
                conviction_id=conviction_id,
                node_id=node_id,
                conviction_level=node["level"],
                exposure=node["exposure"],
                is_breaker=node["breaker"],
                window_days=window_days,
            )
            evidence, promotion = bar.evaluate("under_acted", result, conn=conn)
            record_candidate_score(conn, result, as_of, evidence, promotion)
            if evidence:
                log_candidate(
                    conn,
                    scorer="under_acted",
                    as_of=as_of,
                    ledger_type="under_acted_conviction",
                    proposition=node["derivative"],
                    discourse_metric=result["inputs"],
                    origin_conviction_id=conviction_id,
                    origin_node_id=node_id,
                )
                if node_id:
                    # Only fan-out nodes carry a status; conviction-level rows have no node_id.
                    new_status = "signal" if promotion else "corroborated"
                    conn.execute("UPDATE fanout_nodes SET status=? WHERE node_id=?",
                                 (new_status, node_id))
                    conn.commit()
            scored.append({"node": node, "result": result, "evidence": evidence, "promotion": promotion})
    return scored
|
||||
|
||||
|
||||
def run_backtest(conn, sc, cfg, *, conviction_id, dates, window_days=90) -> list[tuple]:
    """March one conviction through `dates`, running the under_acted pass in backtest mode at each.

    Returns [(as_of, results)] in the order of `dates`, where `results` is what `run_under_acted`
    returned for that date. Logs, per date, how many nodes cleared the evidence bar.
    """
    timeline = []
    for as_of in dates:
        results = run_under_acted(
            conn, sc, cfg,
            as_of=as_of,
            mode="backtest",
            conviction_ids=[conviction_id],
            window_days=window_days,
        )
        timeline.append((as_of, results))
        cleared = sum(1 for item in results if item["evidence"])
        log.info("as_of %s: %d/%d nodes cleared evidence bar", as_of, cleared, len(results))
    return timeline
|
||||
@@ -0,0 +1,105 @@
|
||||
"""Two-sided net-corroboration (DESIGN_v2.1 H5 + condition 3) — the instrument for the adversarial cases.
|
||||
|
||||
For a derivative, track the INDEPENDENCE-WEIGHTED affirms MINUS denies over time. This is the right
|
||||
output for Strike/Battery (where the question is "did the engine distinguish real adoption from
|
||||
narrative, and catch the contradiction?"), not runway:
|
||||
- STRIKE (reflexivity): a PASS = net stays low/quiet in LIVE mode (own_network dropped) while it
|
||||
would have fired in TEST mode (own_network kept) → the engine refuses the intra-cluster echo.
|
||||
- BATTERY (timing): the DEMAND derivative's net rises while the SUPPLY derivative's net stays flat →
|
||||
"half-confirmed, the load-bearing half isn't moving" = the eroding-conviction signal.
|
||||
Reuses the §4.6 relevance helper, which already returns direction affirms|contradicts|tangential.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from .independence import eisc_for
|
||||
from .llm_helpers import derivative_relevance
|
||||
from .windows import window_bounds
|
||||
|
||||
|
||||
def classify_corpus(sc, backend, derivative: str, as_of: str, *, top_k: int = 60) -> list[dict]:
    """Retrieve (as-of filtered) + LLM-classify each claim's direction toward the derivative.

    Args:
        sc: search client; `sc.search(...)` is expected to return a dict with a "data" list of
            hits, each carrying a "payload" dict (claim_id, date, proposition, source_id).
        backend: LLM backend handed to `derivative_relevance`.
        derivative: the proposition being tracked.
        as_of: ISO date (YYYY-MM-DD); hits dated after it are discarded.
        top_k: number of hits requested from the search.

    Returns:
        Claims classified as "affirms" or "contradicts", each a dict with claim_id, proposition,
        date (truncated to YYYY-MM-DD), source_id and direction. Tangential claims are dropped.

    Search errors are deliberately NOT swallowed here: an empty result would be
    indistinguishable from "no corroboration", which is itself a meaningful outcome.
    """
    res = sc.search(derivative, collection="propositions", top_k=top_k, rerank=True)
    # Tolerate a non-dict response and an explicit `"data": None`.
    hits = (res.get("data") or []) if isinstance(res, dict) else []
    cand = []
    for h in hits:
        # Same hit-shape guard as the under-acted scorer: one malformed hit must not abort the run.
        pl = (h.get("payload") or {}) if isinstance(h, dict) else {}
        d = pl.get("date")
        # The date must be a non-empty string to be sliced/compared; anything else is unusable.
        if not pl.get("claim_id") or not isinstance(d, str) or not d or d[:10] > as_of:
            continue
        cand.append({"claim_id": pl["claim_id"], "proposition": pl.get("proposition", ""),
                     "date": d[:10], "source_id": pl.get("source_id")})
    if not cand:
        return []
    rel = derivative_relevance(backend, derivative,
                               [{"claim_id": c["claim_id"], "proposition": c["proposition"]} for c in cand])
    out = []
    for c in cand:
        # Claims the helper did not return a verdict for are treated as tangential.
        direction = rel.get(c["claim_id"], {}).get("direction", "tangential")
        if direction in ("affirms", "contradicts"):
            out.append({**c, "direction": direction})
    return out
|
||||
|
||||
|
||||
# DESIGN_v2 ADOPT #1 (claim-type weighting): a node "resolves" on REALIZED, descriptive disclosure —
# not on forecasts/intent. A source counts toward the net only if it carries a HARD (realized-fact)
# claim on this side; predictive/interpretive claims (forecasts, opinion, 'may consider', 'expects')
# are the exact material that fooled the supply axis on Battery, so they don't qualify a source alone.
_HARD_CLAIM_TYPES = ("descriptive", "reactive")


def _hard_sources(conn, claim_ids: list[str]) -> set:
    """Sources that contributed at least one realized-fact (descriptive/reactive) claim among claim_ids.

    Args:
        conn: DB connection exposing `execute`; the `claims` table is queried on
            claim_id / claim_type / source_id.
        claim_ids: claim ids to inspect; duplicates are ignored.

    Returns:
        The set of distinct `source_id` values of matching rows (empty for empty input).
    """
    ids = list(dict.fromkeys(claim_ids))  # de-duplicate while keeping a stable order
    if not ids:
        return set()
    type_marks = ",".join("?" * len(_HARD_CLAIM_TYPES))
    # SQLite caps the number of bound parameters per statement, so query in bounded batches
    # instead of binding one placeholder per id in a single IN (...) list.
    batch_size = 500
    found = set()
    for lo in range(0, len(ids), batch_size):
        batch = ids[lo:lo + batch_size]
        id_marks = ",".join("?" * len(batch))
        rows = conn.execute(
            f"SELECT DISTINCT source_id FROM claims WHERE claim_id IN ({id_marks}) AND claim_type IN ({type_marks})",
            [*batch, *_HARD_CLAIM_TYPES],
        ).fetchall()
        found.update(row[0] for row in rows)
    return found
|
||||
|
||||
|
||||
def net_at(conn, classified: list[dict], as_of: str, *, window_days: int = 90, mode: str = "live",
           require_hard_evidence: bool = True) -> dict:
    """Net independence-weighted corroboration over the single trailing window ending at as_of.

    Claims dated in (start, end] are split into affirming and contradicting sides. With
    require_hard_evidence (the default) a source is kept on a side only if `_hard_sources`
    reports a realized-fact claim for it among that side's claims — forecasts/intent alone do
    not qualify it. Each side's kept sources are weighted with `eisc_for(..., mode=mode)`.

    Returns a dict with as_of, affirms_eisc, denies_eisc, net (affirms minus denies), raw claim
    counts per side (n_affirm / n_deny), hard_affirm_src (affirming sources kept — all of them
    when require_hard_evidence is False), soft_affirm_src_dropped, and own_network_affirm_src
    (kept affirming sources flagged own_network in `sources`).
    """
    _, start, end = window_bounds(as_of, n=1, days=window_days)[0]
    in_window = [c for c in classified if start < c["date"] <= end]

    def side(direction):
        return [c for c in in_window if c["direction"] == direction]

    def weight(source_ids):
        # An empty side contributes nothing and skips the independence computation.
        if not source_ids:
            return 0.0
        return eisc_for(conn, source_ids, mode=mode)["eisc_adj"]

    affirming = side("affirms")
    denying = side("contradicts")
    affirm_sources = {c["source_id"] for c in affirming}
    deny_sources = {c["source_id"] for c in denying}

    if require_hard_evidence:
        affirm_kept = list(affirm_sources & _hard_sources(conn, [c["claim_id"] for c in affirming]))
        deny_kept = list(deny_sources & _hard_sources(conn, [c["claim_id"] for c in denying]))
    else:
        affirm_kept = list(affirm_sources)
        deny_kept = list(deny_sources)

    affirm_weight = weight(affirm_kept)
    deny_weight = weight(deny_kept)

    own_network_count = 0
    if affirm_kept:
        marks = ",".join("?" * len(affirm_kept))
        row = conn.execute(
            f"SELECT COUNT(*) FROM sources WHERE source_id IN ({marks}) AND COALESCE(own_network,0)=1",
            affirm_kept,
        ).fetchone()
        own_network_count = row[0]

    return {
        "as_of": as_of,
        "affirms_eisc": round(affirm_weight, 2),
        "denies_eisc": round(deny_weight, 2),
        "net": round(affirm_weight - deny_weight, 2),
        "n_affirm": len(affirming),
        "n_deny": len(denying),
        "hard_affirm_src": len(affirm_kept),
        "soft_affirm_src_dropped": len(affirm_sources) - len(affirm_kept),
        "own_network_affirm_src": own_network_count,
    }
|
||||
|
||||
|
||||
def trajectory(conn, sc, backend, derivative: str, as_of_dates: list[str], *,
               window_days: int = 90, mode: str = "live", top_k: int = 60) -> list[dict]:
    """Net-corroboration curve for `derivative`: one `net_at` result per date in as_of_dates.

    For every date the corpus is re-retrieved and re-classified (`classify_corpus`) before the
    window net is computed. Run twice (mode='live' vs 'test') to see what the own_network
    quarantine removes — the reflexivity measurement.
    """
    return [
        net_at(
            conn,
            classify_corpus(sc, backend, derivative, when, top_k=top_k),
            when,
            window_days=window_days,
            mode=mode,
        )
        for when in as_of_dates
    ]
|
||||
@@ -0,0 +1,75 @@
|
||||
"""Under-acted-conviction scorer — Job B, the §7.1 backtest target.
|
||||
|
||||
score = conviction_weight x exposure_gap x rising_independent_corroboration
|
||||
|
||||
Fires when Ten31 believes something (high conviction), has little/no position (exposure gap), and the
|
||||
world is beginning to corroborate it or a derivative of it — independently and with acceleration. This
|
||||
is the signal that should have flagged "size up power-infra picks-and-shovels" in 2023.
|
||||
|
||||
Exposure is joined LOCALLY (never crosses the frontier boundary, §4.6). Corroboration is RETRIEVED
|
||||
(stats nominate), then an LLM helper only FILTERS retrieval near-misses (§5.1) — it cannot add claims.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from .llm_helpers import derivative_relevance
|
||||
from .windows import windowed_independence
|
||||
|
||||
# Score multiplier per conviction level; score_node falls back to 0.4 for an unknown level.
CONVICTION_WEIGHT = {"low": 0.15, "med": 0.4, "med-high": 0.7, "high": 1.0}
|
||||
# Score multiplier per exposure bucket (a smaller existing position gives a larger gap); score_node
# falls back to 0.6 for an unknown bucket. Bucket names presumably denote % position size — TODO confirm.
EXPOSURE_GAP = {"none": 1.0, "lt2": 0.8, "2to10": 0.4, "gt10": 0.1, "unset": 0.6}
|
||||
|
||||
|
||||
def score_node(conn, sc, backend, *, as_of: str, derivative: str, conviction_id: str,
               node_id: str | None, conviction_level: str, exposure: str,
               is_breaker: bool = False, top_k: int = 40, window_days: int = 28) -> dict:
    """Under-acted score for one derivative node as of `as_of`.

    score = conviction_weight x exposure_gap x corroboration, where corroboration is the positive
    part of the independence-weighted acceleration times the current-window EISC. Breaker nodes
    skip the conviction/exposure multipliers and score on corroboration alone.

    Returns the `_result` dict (scorer, conviction_id, node_id, score, inputs). A failed search
    or an empty retrieval yields a zero score with a "reason" recorded in inputs.
    """
    cw = CONVICTION_WEIGHT.get(conviction_level, 0.4)
    eg = EXPOSURE_GAP.get(exposure, 0.6)

    def finish(score, inputs):
        return _result(conviction_id, node_id, score, inputs, cw, eg, exposure, is_breaker)

    # 1. RETRIEVE (stats nominate): hybrid search over embedded propositions; as-of post-filter.
    try:
        res = sc.search(derivative, collection="propositions", top_k=top_k, rerank=True)
    except Exception as e:  # noqa: BLE001
        return finish(0.0, {"reason": f"search_failed:{str(e)[:60]}"})

    hits = res.get("data", []) if isinstance(res, dict) else []
    retrieved = []
    for hit in hits:
        payload = (hit.get("payload") or {}) if isinstance(hit, dict) else {}
        stamp = payload.get("date")
        # Qdrant can't date-filter; keep only identified, dated claims not after as_of.
        if payload.get("claim_id") and stamp and stamp[:10] <= as_of:
            retrieved.append({
                "claim_id": payload["claim_id"],
                "proposition": payload.get("proposition", ""),
                "date": stamp,
                "source_id": payload.get("source_id"),
            })
    if not retrieved:
        return finish(0.0, {"reason": "no_retrieval", "n_retrieved": 0})

    # 2. FILTER near-misses with the LLM (affirms-only). Not a nominator — can't add claims.
    rel = derivative_relevance(
        backend,
        derivative,
        [{"claim_id": c["claim_id"], "proposition": c["proposition"]} for c in retrieved],
    )

    def is_affirming(claim_id):
        verdict = rel.get(claim_id, {})
        return bool(verdict.get("corroborates")) and verdict.get("direction") == "affirms"

    confirmed = [c for c in retrieved if is_affirming(c["claim_id"])]
    distinct_sources = {c["source_id"] for c in confirmed}

    # 3. CORROBORATION = independence-weighted acceleration over the confirmed set (treat as a topic).
    # window_days matches corpus cadence: ~90d for quarterly filings/earnings, ~28d for weekly podcasts.
    dated_sources = [(c["date"], c["source_id"]) for c in confirmed]
    wi = windowed_independence(conn, dated_sources, as_of, days=window_days)
    a_corrob = wi["acceleration"]
    eisc_corrob = wi["eisc0"]
    corroboration = max(0.0, a_corrob) * eisc_corrob

    if is_breaker:
        score = corroboration
    else:
        score = cw * eg * corroboration

    inputs = {
        "as_of": as_of,
        "derivative": derivative,
        "n_retrieved": len(retrieved),
        "n_confirmed": len(confirmed),
        "n_src": len(distinct_sources),
        "a_corrob": a_corrob,
        "eisc_corrob": eisc_corrob,
        "k_eff0": wi["k_eff0"],
        "window_counts": wi["counts"],
        "window_eisc": wi["eisc"],
        "corroboration": round(corroboration, 3),
        "confirmed_claim_ids": [c["claim_id"] for c in confirmed][:50],
    }
    return finish(score, inputs)
|
||||
|
||||
|
||||
def _result(conviction_id, node_id, score, inputs, cw, eg, exposure, is_breaker) -> dict:
    """Assemble the scorer result record.

    The caller's `inputs` is copied (not mutated) and extended with the conviction weight,
    exposure gap, exposure bucket and breaker flag; the score is stored as a float rounded to
    4 decimals.
    """
    enriched = dict(inputs)
    enriched.update(
        conviction_weight=cw,
        exposure_gap=eg,
        exposure=exposure,
        is_breaker=is_breaker,
    )
    return {
        "scorer": "under_acted",
        "conviction_id": conviction_id,
        "node_id": node_id,
        "score": round(float(score), 4),
        "inputs": enriched,
    }
|
||||
@@ -0,0 +1,53 @@
|
||||
"""Temporal windows + windowed independence (the single temporal layer, §4.4).
|
||||
|
||||
28-day non-overlapping windows anchored at as_of (W0 ends at as_of, then back). Non-overlapping
|
||||
avoids autocorrelation faking significance. The signal is the discrete 2nd derivative of the
|
||||
INDEPENDENCE-WEIGHTED flow (EISC per window), never the raw count — so a topic that "accelerates"
|
||||
only because one show booked the same guest three times has flat N(W).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from .independence import eisc_for
|
||||
|
||||
# Default window length in days for window_bounds / windowed_independence.
WINDOW_DAYS = 28
|
||||
# Default number of trailing windows; the acceleration term (N0 - 2*N1 + N2) needs three.
N_WINDOWS = 3
|
||||
|
||||
|
||||
def _d(s: str) -> datetime:
    """Parse the leading YYYY-MM-DD of `s` into a naive datetime; anything past 10 chars is ignored."""
    date_part = s[:10]
    return datetime.strptime(date_part, "%Y-%m-%d")
|
||||
|
||||
|
||||
def window_bounds(as_of: str, *, n: int = N_WINDOWS, days: int = WINDOW_DAYS) -> list[tuple[int, str, str]]:
    """Back-to-back windows walking backward from as_of.

    Returns [(idx, start_iso, end_iso)] for idx in 0..n-1, where window 0 ends at as_of and each
    further window ends where the previous one starts. Dates are formatted YYYY-MM-DD.
    """
    anchor = _d(as_of)

    def iso(steps_back: int) -> str:
        return (anchor - timedelta(days=steps_back * days)).strftime("%Y-%m-%d")

    return [(k, iso(k + 1), iso(k)) for k in range(n)]
|
||||
|
||||
|
||||
def windowed_independence(conn, rows: list[tuple], as_of: str, *, n: int = N_WINDOWS,
                          days: int = WINDOW_DAYS) -> dict:
    """Per-window raw counts and independence-weighted counts (EISC_adj) for dated source rows.

    Args:
        conn: DB connection forwarded to `eisc_for`.
        rows: [(date_iso, source_id)]; rows with a falsy date are ignored.
        as_of: ISO date the most recent window (W0) ends at.
        n: number of trailing windows.
        days: window length in days.

    Returns:
        {counts: [c0..], eisc: [N0..] (rounded), k_eff: [...], acceleration, eisc0, k_eff0,
         sources0, n_total}, where index 0 is the most recent window and
        acceleration = N0 - 2*N1 + N2 (independence-weighted 2nd derivative), 0.0 when n < 3.
        With n < 1 there are no windows: lists are empty and eisc0/k_eff0/sources0 are 0.0/0/[].
    """
    counts, eiscs, keffs, src_sets = [], [], [], []
    for _idx, start, end in window_bounds(as_of, n=n, days=days):
        # Half-open window (start, end]: a claim dated exactly on a boundary lands in one window only.
        win = [r for r in rows if r[0] and start < r[0][:10] <= end]
        # First-seen order instead of set order: set iteration over strings varies with hash
        # randomisation, which made the source order (and `sources0`) differ between runs.
        srcs = list(dict.fromkeys(r[1] for r in win))
        e = eisc_for(conn, srcs) if srcs else {"eisc_adj": 0.0, "k_eff": 0}
        counts.append(len(win))
        eiscs.append(e["eisc_adj"])
        keffs.append(e["k_eff"])
        src_sets.append(srcs)
    accel = eiscs[0] - 2 * eiscs[1] + eiscs[2] if n >= 3 else 0.0
    return {
        "counts": counts, "eisc": [round(x, 3) for x in eiscs], "k_eff": keffs,
        "acceleration": round(accel, 3),
        # Guarded so a degenerate n < 1 yields an empty-shaped result instead of IndexError.
        "eisc0": round(eiscs[0], 3) if eiscs else 0.0,
        "k_eff0": keffs[0] if keffs else 0,
        "sources0": src_sets[0] if src_sets else [],
        "n_total": sum(counts),
    }
|
||||
Reference in New Issue
Block a user