Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,80 @@
|
||||
"""Local-LLM scoring helpers (§4.4). Bounded labeling passes over PRE-FILTERED candidates only —
never nomination from the raw corpus (§5.1). JSON mode, temp 0, no thinking → deterministic.

Helper #2 (derivative-relevance) is built first — it's the one the §7.1 backtest needs. Helper #1
(stance-folding for Job A contrarian) comes with the forward pilot.
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# System prompt for the derivative-relevance labeling pass. It is assembled from three sections so
# each can be read (and diffed) on its own; the concatenation below is the exact text sent to the
# model, so any edit to a literal changes model behavior.

# Section 1 — the task and the meaning of the three direction labels.
_REL_SYS_TASK = (
    "You assess whether claims corroborate a specific investment hypothesis (a 2nd/3rd-order "
    "derivative of a thesis). For EACH claim decide: does it provide real-world evidence that the "
    "hypothesis is PLAYING OUT (corroborates), and the direction. 'affirms' = supports the hypothesis; "
    "'contradicts' = is evidence against it; 'tangential' = same topic words but not actually about the "
    "hypothesis (e.g. 'transformers' the ML architecture vs the electrical-grid kind). Be strict: a "
    "passing mention is tangential, not corroboration. "
)

# Section 2 — the two hard rules: only realized events count, and the actor must be on the side
# of the transaction the hypothesis is about.
_REL_SYS_RULES = (
    "TWO HARD RULES (these are the difference between catching a real signal and being fooled):\n"
    "1) REALIZED-ONLY. The hypothesis must be PLAYING OUT in fact. Announcements, plans, intentions, "
    "forecasts, targets, and 'may/will/expects/poised-to/aims-to/up-to' language are NOT corroboration — "
    "they are 'tangential' unless the claim states the thing has ACTUALLY HAPPENED / been DEPLOYED / "
    "closed. A $2B program 'announced' or capital 'made available' is NOT capital deployed. A company "
    "that 'may consider' or 'expects' something has not done it.\n"
    "2) ROLE-MATCH. The actor in the claim must occupy the role the hypothesis is about. If the "
    "hypothesis is that capital PROVIDERS are funding/supplying something, then a BORROWER or USER on the "
    "demand side (e.g. a firm posting an asset AS collateral to RECEIVE a loan) is the wrong side of the "
    "transaction → 'tangential' to that hypothesis, not 'affirms'. "
)

# Section 3 — the JSON shape the reply must have (one record per claim id).
_REL_SYS_FORMAT = (
    'Return ONLY JSON: {"results":[{"claim_id":"...","corroborates":true|false,'
    '"direction":"affirms"|"contradicts"|"tangential"}]}.'
)

_REL_SYS = _REL_SYS_TASK + _REL_SYS_RULES + _REL_SYS_FORMAT
|
||||
|
||||
|
||||
def _parse(raw: str) -> list[dict]:
    """Extract the ``results`` records from a model reply, tolerating text around the JSON.

    The reply is first parsed as-is. If that fails, the span from the first ``{`` to the last
    ``}`` is parsed instead, which recovers a JSON object wrapped in prose or code fences.

    Args:
        raw: The model's reply text. A non-string value that ``json.loads`` rejects (for example
            ``None``) is treated as an unparseable reply rather than an error.

    Returns:
        The entries of the top-level ``"results"`` list that are dicts with a truthy
        ``claim_id``. Returns ``[]`` when the reply is not JSON, is not an object, or its
        ``"results"`` value is missing or not a list.
    """
    try:
        obj = json.loads(raw)
    except (TypeError, ValueError, RecursionError):
        # TypeError: raw is not str/bytes. ValueError covers JSONDecodeError and bad encodings.
        if not isinstance(raw, str):
            return []
        start, end = raw.find("{"), raw.rfind("}")
        if start < 0 or end < start:
            return []
        try:
            obj = json.loads(raw[start:end + 1])
        except (ValueError, RecursionError):
            return []

    results = obj.get("results") if isinstance(obj, dict) else None
    # A null or scalar "results" must not be iterated — it would raise instead of returning [].
    if not isinstance(results, list):
        return []
    return [rec for rec in results if isinstance(rec, dict) and rec.get("claim_id")]
|
||||
|
||||
|
||||
def derivative_relevance(backend, derivative: str, claims: list[dict]) -> dict[str, dict]:
    """Label pre-filtered claims as corroborating (or not) a derivative hypothesis.

    Filters retrieval near-misses; it cannot ADD claims search didn't return (not a nominator):
    results whose id does not match an input claim are discarded.

    Args:
        backend: Object exposing ``complete_json(messages, max_tokens=...)``. Its return value is
            handed to ``_parse``; presumably the model's raw reply text — confirm against the
            backend implementation.
        derivative: The hypothesis text the claims are judged against.
        claims: ``[{claim_id, proposition}, ...]``.

    Returns:
        ``{claim_id: {"corroborates": bool, "direction": str}}``, keyed by ``str(claim_id)`` of
        the matching input claim. ``direction`` is one of ``"affirms"``, ``"contradicts"``,
        ``"tangential"``; a missing or unrecognized value becomes ``"tangential"``. Claims the
        model did not label are absent. Returns ``{}`` for an empty ``claims`` list or when both
        attempts yield no parseable records.
    """
    if not claims:
        return {}

    listing = "\n".join(f"- [{c['claim_id']}] {c['proposition']}" for c in claims)
    user = (f"HYPOTHESIS (derivative): {derivative}\n\nCLAIMS:\n{listing}\n\n"
            "Judge each claim id.")
    messages = [{"role": "system", "content": _REL_SYS}, {"role": "user", "content": user}]

    # Output is ~one JSON record per claim (claim_id + corroborates + direction ≈ 70-100 tokens). At
    # top_k=60 that's ~5k tokens — a fixed 3000 budget truncated mid-array → empty parse → a node
    # silently zeroed (the source of the unstable 5-affirm/0-affirm flip). Size the budget to the batch.
    budget = max(3000, 120 * len(claims) + 500)

    parsed = []
    for attempt in range(2):  # one retry — a gateway-under-load truncation shouldn't zero out a node
        raw = backend.complete_json(messages, max_tokens=budget)
        parsed = _parse(raw)
        if parsed:
            break
        # str() so a non-string reply (e.g. None) is logged instead of raising on the slice.
        log.warning("derivative_relevance empty parse (attempt %d) for %r; raw[:160]=%r",
                    attempt + 1, derivative[:50], str(raw)[:160])

    def _bare(cid) -> str:
        """Strip surrounding whitespace and list-style brackets from an id."""
        return str(cid).strip().strip("[]").strip()

    # The listing presents ids as `- [{claim_id}] ...`; the model INCONSISTENTLY echoes the id back
    # with the surrounding brackets ("[edgar:...]"). Map every accepted spelling back to the id as
    # supplied by the caller. Exact spellings are registered first so an id that itself ends in a
    # bracket (e.g. "x[1]") is never shadowed or mangled by the bracket-stripped form.
    known: dict[str, str] = {}
    for c in claims:
        cid = str(c["claim_id"])
        known.setdefault(cid.strip(), cid)
    for c in claims:
        cid = str(c["claim_id"])
        known.setdefault(_bare(cid), cid)

    directions = ("affirms", "contradicts", "tangential")
    out: dict[str, dict] = {}
    dropped = 0
    for rec in parsed:
        echoed = str(rec["claim_id"]).strip()
        cid = known.get(echoed)
        if cid is None:
            cid = known.get(_bare(echoed))
        if cid is None:
            # Id not in this batch: the model invented or garbled it. Not a nominator — skip.
            dropped += 1
            continue

        flag = rec.get("corroborates")
        if isinstance(flag, str):
            # bool("false") is True; read stringified booleans by value instead.
            flag = flag.strip().lower() in ("true", "yes", "1")

        direction = rec.get("direction")
        direction = direction.strip().lower() if isinstance(direction, str) else ""
        if direction not in directions:
            direction = "tangential"

        out[cid] = {"corroborates": bool(flag), "direction": direction}

    if dropped:
        log.warning("derivative_relevance dropped %d result(s) with ids not in the batch for %r",
                    dropped, derivative[:50])
    return out
|
||||
Reference in New Issue
Block a user