Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

This commit is contained in:
Keysat
2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
+11
View File
@@ -0,0 +1,11 @@
"""Ten31 Signal Engine — pilot.
A recurring pipeline that ingests audio + text, extracts structured propositions
locally, and surfaces signal over time. The discipline that separates signal from
plausible-sounding noise (handoff §5): statistics & graph structure NOMINATE
candidates; the frontier model only JUDGES and FANS OUT a pre-filtered shortlist.
See README.md for the architecture and ten31-signal-engine-handoff.md for the spec.
"""
__version__ = "0.1.0"
+4
View File
@@ -0,0 +1,4 @@
# Entry-point shim: run the CLI when this module is executed as a script.
from .cli import main
if __name__ == "__main__":
    # main() returns an int status; SystemExit turns it into the process exit code.
    raise SystemExit(main())
+1
View File
@@ -0,0 +1 @@
"""Client-side backfill queue (§13.4). Producers enqueue; ONE worker drains sequentially."""
+123
View File
@@ -0,0 +1,123 @@
"""Backfill job queue over the `backfill_jobs` table (§13.4).
Model the corpus backfill as a managed GPU-hours queue, not a real-time fan-out. Producers
(ingestion) enqueue lightweight job descriptors; a SINGLE worker leases and drains them one at a
time so audio never goes parallel (→ 503). Jobs are:
- idempotent: UNIQUE(job_type, input_hash); re-enqueue of seen content is a no-op.
- crash-safe: leases expire, so a dead worker's job returns to the pool automatically.
- prioritized: lower `priority` runs first (backtest corpus + filings jump ahead).
This is plain SQLite so the whole queue is `SELECT * FROM backfill_jobs`.
"""
from __future__ import annotations
import sqlite3
from typing import Any, Optional, Sequence
# Default lease duration, in seconds; used as the default `lease_seconds` of `lease_next`.
LEASE_SECONDS_DEFAULT = 600
def enqueue(
    conn: sqlite3.Connection,
    *,
    job_type: str,
    target_id: str,
    input_hash: str,
    parent_doc_id: str | None = None,
    priority: int = 100,
    max_attempts: int = 5,
) -> Optional[int]:
    """Queue one job in the 'pending' state and commit.

    The statement is INSERT OR IGNORE, so a row rejected by a table constraint (the module
    docstring names UNIQUE(job_type, input_hash)) is skipped silently instead of raising.

    Returns the new row id, or None when nothing was inserted (idempotent skip — §13.4).
    """
    insert_sql = """INSERT OR IGNORE INTO backfill_jobs
        (job_type, target_id, parent_doc_id, priority, max_attempts, input_hash, state)
        VALUES (?,?,?,?,?,?, 'pending')"""
    values = (job_type, target_id, parent_doc_id, priority, max_attempts, input_hash)
    cursor = conn.execute(insert_sql, values)
    conn.commit()
    # rowcount is 0 when the insert was ignored; lastrowid is only meaningful otherwise.
    if not cursor.rowcount:
        return None
    return cursor.lastrowid
def lease_next(
    conn: sqlite3.Connection,
    *,
    worker_id: str,
    job_types: Sequence[str] | None = None,
    lease_seconds: int = LEASE_SECONDS_DEFAULT,
) -> Optional[sqlite3.Row]:
    """Claim the highest-priority eligible job and return its row, or None if there is none.

    Eligible = pending, OR a running/leased job whose lease has expired (crash recovery).
    Claiming sets state='running', records `worker_id` and a new lease expiry, and increments
    `attempts`.

    The claim is guarded: the UPDATE repeats the eligibility predicate, so if another connection
    claimed the candidate between our SELECT and UPDATE, zero rows change and we move on to the
    next candidate instead of handing the same job to two workers.

    Args:
        conn: connection whose rows support access by column name (e.g. sqlite3.Row factory).
        worker_id: stored in `lease_owner`.
        job_types: optional whitelist of job types; None/empty means any type.
        lease_seconds: lease duration from now; must not be negative.

    Raises:
        ValueError: if `lease_seconds` is negative.
    """
    seconds = int(lease_seconds)
    if seconds < 0:
        # A negative value would build the modifier "+-N seconds", which SQLite is not expected
        # to accept; a NULL expiry would leave the job 'running' and never eligible again.
        raise ValueError(f"lease_seconds must be >= 0, got {lease_seconds!r}")

    eligible = (
        "(state = 'pending'"
        " OR (state IN ('leased','running')"
        " AND lease_expires_at IS NOT NULL"
        " AND lease_expires_at < datetime('now')))"
    )
    params: list[Any] = []
    type_filter = ""
    if job_types:
        type_filter = f" AND job_type IN ({','.join('?' * len(job_types))})"
        params.extend(job_types)

    select_sql = f"""SELECT job_id FROM backfill_jobs
        WHERE {eligible}
        {type_filter}
        ORDER BY priority ASC, job_id ASC
        LIMIT 1"""
    claim_sql = f"""UPDATE backfill_jobs
        SET state='running', lease_owner=?, lease_expires_at=datetime('now', ?),
            attempts=attempts+1, updated_at=datetime('now')
        WHERE job_id=? AND {eligible}"""
    lease_modifier = f"+{seconds} seconds"

    # NOTE(review): expired leases are reclaimed regardless of attempts vs max_attempts; only
    # fail() dead-letters. Confirm whether a job that keeps killing its worker should be capped.
    while True:
        candidate = conn.execute(select_sql, params).fetchone()
        if candidate is None:
            return None
        job_id = candidate["job_id"]
        claimed = conn.execute(claim_sql, (worker_id, lease_modifier, job_id))
        conn.commit()
        if claimed.rowcount:
            return conn.execute(
                "SELECT * FROM backfill_jobs WHERE job_id=?", (job_id,)
            ).fetchone()
        # Lost the race for this candidate; it is no longer eligible, so the next SELECT
        # returns a different job (or None) and the loop terminates.
def complete(
    conn: sqlite3.Connection,
    job_id: int,
    *,
    output_ref: str | None = None,
    gpu_seconds: float | None = None,
) -> None:
    """Mark a job 'done', store its output reference and GPU time, clear `error`, and commit."""
    done_sql = """UPDATE backfill_jobs SET state='done', output_ref=?, gpu_seconds=?, error=NULL,
        updated_at=datetime('now') WHERE job_id=?"""
    bindings = (output_ref, gpu_seconds, job_id)
    conn.execute(done_sql, bindings)
    conn.commit()
def fail(conn: sqlite3.Connection, job_id: int, error: Any) -> str:
    """Record a failure and release the lease; returns the state the job was moved to.

    The job goes back to 'pending' while attempts remain, and to 'failed' (dead-letter) once
    `attempts` has reached `max_attempts`. The stored error text is capped at 2000 characters.
    """
    counters = conn.execute(
        "SELECT attempts, max_attempts FROM backfill_jobs WHERE job_id=?", (job_id,)
    ).fetchone()
    if counters and counters["attempts"] >= counters["max_attempts"]:
        outcome = "failed"
    else:
        outcome = "pending"
    message = str(error)[:2000]
    conn.execute(
        """UPDATE backfill_jobs SET state=?, error=?, lease_owner=NULL, lease_expires_at=NULL,
        updated_at=datetime('now') WHERE job_id=?""",
        (outcome, message, job_id),
    )
    conn.commit()
    return outcome
def skip(conn: sqlite3.Connection, job_id: int, reason: str | None = None) -> None:
    """Move a job to the 'skipped' state, store `reason` in the `error` column, and commit.

    This is the terminal state for an intentionally dropped job, as opposed to 'done' (e.g. a
    chunk that produced zero claims is still 'done').
    """
    skip_sql = (
        "UPDATE backfill_jobs SET state='skipped', error=?, updated_at=datetime('now') WHERE job_id=?"
    )
    conn.execute(skip_sql, (reason, job_id))
    conn.commit()
def stats(conn: sqlite3.Connection) -> dict[str, dict[str, int]]:
    """Count jobs per (job_type, state); returns {job_type: {state: count}}.

    Rows are read by column name, so the connection needs a name-addressable row factory.
    """
    counts: dict[str, dict[str, int]] = {}
    grouped = conn.execute(
        "SELECT job_type, state, COUNT(*) AS n FROM backfill_jobs GROUP BY job_type, state"
    ).fetchall()
    for record in grouped:
        job_type = record["job_type"]
        if job_type not in counts:
            counts[job_type] = {}
        counts[job_type][record["state"]] = record["n"]
    return counts
+619
View File
@@ -0,0 +1,619 @@
"""Pilot CLI. Subcommands map to the build order in handoff §11.
Foundation commands: init-db, seed-convictions, spark-status, db-tables.
Later-stage commands (ingest, transcribe, extract, embed, search, backtest, confusion-matrix,
serve, ...) are registered in `build_parser` as they're built.
"""
from __future__ import annotations
import argparse
import logging
import sys
from pathlib import Path
from .config import load_config
from .store import db
from .store.seed import load_convictions, load_fanout
from .store.sources import load_source_edges, load_sources, update_feeds
# Default seed-file locations (relative paths); used as `--file` defaults in `build_parser`.
DEFAULT_CONVICTION_SEED = Path("seeds/conviction_log.seed.yaml")
DEFAULT_SOURCES_SEED = Path("seeds/sources.seed.yaml")
DEFAULT_FEEDS_SEED = Path("seeds/podcast_feeds.resolved.yaml")
def _setup_logging(level: str) -> None:
    """Configure root logging; an unrecognised level name falls back to INFO."""
    numeric_level = getattr(logging, level.upper(), logging.INFO)
    line_format = "%(asctime)s %(levelname)s %(name)s: %(message)s"
    logging.basicConfig(level=numeric_level, format=line_format)
def cmd_init_db(args: argparse.Namespace) -> int:
    """`init-db`: create the schema at the configured DB path and list the tables/views."""
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    print(f"Initialized DB at {settings.db_path}")
    names = ", ".join(db.table_names(connection))
    print("Tables/views:", names)
    return 0
def cmd_seed_convictions(args: argparse.Namespace) -> int:
    """`seed-convictions`: upsert the conviction log from `--file`, then list thesis-breakers."""
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)  # ensure schema exists
    seed_path = Path(args.file)
    upserted = load_convictions(connection, seed_path)
    print(f"Upserted {upserted} convictions from {seed_path}")
    thesis_breakers = connection.execute(
        "SELECT conviction_id, thematic_proposition FROM conviction_log WHERE is_thesis_breaker = 1"
    ).fetchall()
    if not thesis_breakers:
        return 0
    print("Thesis-breakers loaded (engine must surface these AGAINST the thesis, §5.7):")
    for breaker in thesis_breakers:
        print(f" {breaker['conviction_id']}: {breaker['thematic_proposition'][:80]}...")
    return 0
def cmd_seed_sources(args: argparse.Namespace) -> int:
    """`seed-sources`: upsert the source registry from `--file` and print counts per kind."""
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    upserted = load_sources(connection, Path(args.file))
    kind_counts = connection.execute(
        "SELECT kind, COUNT(*) n FROM sources GROUP BY kind ORDER BY kind"
    ).fetchall()
    print(f"Upserted {upserted} sources from {args.file}")
    for entry in kind_counts:
        print(f" {entry['kind']}: {entry['n']}")
    return 0
def cmd_seed_edges(args: argparse.Namespace) -> int:
    """`seed-edges`: load source edges from `--file` and report new vs. total edge counts."""
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    inserted = load_source_edges(connection, Path(args.file))
    (edge_total,) = connection.execute("SELECT COUNT(*) FROM source_edges").fetchone()[:1]
    print(f"Inserted {inserted} new edges from {args.file} ({edge_total} edges total)")
    return 0
def cmd_load_feeds(args: argparse.Namespace) -> int:
    """`load-feeds`: apply podcast feed updates from `--file`, then summarise backtest reach."""
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    updated = update_feeds(connection, Path(args.file))
    print(f"updated {updated} podcast feeds")
    reach_sql = (
        "SELECT backtest_2022_2023, COUNT(*) c FROM sources WHERE kind='podcast' "
        "GROUP BY backtest_2022_2023 ORDER BY c DESC"
    )
    reach_rows = connection.execute(reach_sql).fetchall()
    print("backtest 2022-2023 reach:")
    for reach in reach_rows:
        label = reach['backtest_2022_2023'] or 'unset'
        print(f" {label}: {reach['c']}")
    return 0
def cmd_ingest_edgar(args: argparse.Namespace) -> int:
    """`ingest-edgar`: fetch SEC filings for `--ticker` and queue extract jobs.

    `--forms` is a comma list; blank entries are dropped, and an empty/absent list falls back to
    10-K, 10-Q, 8-K. The source row is resolved (or lazily created) through the shared
    `_resolve_source_id` helper so this command and `ingest-earnings` create identical rows for
    the same ticker.
    """
    from .ingest.edgar import EdgarClient, ingest_filings
    cfg = load_config()
    conn = db.connect(cfg.db_path)
    db.init_db(conn)
    client = EdgarClient(cfg.edgar_user_agent)
    requested = (f.strip() for f in args.forms.split(",")) if args.forms else ()
    # "10-K," or "10-K,,8-K" must not turn into an empty-string form type.
    forms = tuple(f for f in requested if f) or ("10-K", "10-Q", "8-K")
    # resolve source_id from ticker (create a lightweight source row if not seeded)
    source_id = _resolve_source_id(conn, args.ticker)
    n_docs, n_jobs = ingest_filings(conn, client, source_id=source_id, ticker=args.ticker,
                                    since=args.since, until=args.until, forms=forms)
    print(f"{args.ticker}: +{n_docs} filing documents, +{n_jobs} extract jobs queued "
          f"(forms={','.join(forms)}, since={args.since}, until={args.until})")
    return 0
def _resolve_source_id(conn, ticker: str, kind: str = "filing") -> str:
    """Return the source_id for `ticker` (case-insensitive match), creating a row if needed.

    A new row gets source_id "co-<ticker lowercased>" with the upper-cased ticker as both name
    and ticker; the insert is INSERT OR IGNORE and is committed.
    """
    existing = conn.execute(
        "SELECT source_id FROM sources WHERE upper(ticker)=upper(?)", (ticker,)
    ).fetchone()
    if not existing:
        symbol = ticker.upper()
        new_id = f"co-{ticker.lower()}"
        conn.execute(
            "INSERT OR IGNORE INTO sources (source_id, name, kind, ticker) VALUES (?,?,?,?)",
            (new_id, symbol, kind, symbol),
        )
        conn.commit()
        return new_id
    return existing["source_id"]
def cmd_ingest_doc(args: argparse.Namespace) -> int:
    """`ingest-doc`: ingest one document by URL; the title defaults to the URL."""
    from .ingest.docs import ingest_one
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    new_doc_id = ingest_one(
        connection,
        settings,
        source_id=args.source,
        url=args.url,
        title=args.title or args.url,
        date=args.date,
        method=args.method,
    )
    if new_doc_id:
        print(f"ingested: {new_doc_id}")
    else:
        print("no new doc (duplicate / too short / fetch failed)")
    return 0
def cmd_ingest_feed_text(args: argparse.Namespace) -> int:
    """`ingest-feed-text`: ingest article documents from the RSS feed at `--url`."""
    from .ingest.docs import ingest_feed_text
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    ingested = ingest_feed_text(
        connection,
        settings,
        source_id=args.source,
        rss_url=args.url,
        since=args.since,
        until=args.until,
        limit=args.limit,
    )
    print(f"ingested {ingested} article docs from feed for {args.source}")
    return 0
def cmd_ingest_doc_manifest(args: argparse.Namespace) -> int:
    """`ingest-doc-manifest`: batch-ingest the manifest at `--file` and print the tallies."""
    from .ingest.docs import ingest_manifest
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    manifest_path = Path(args.file)
    tally = ingest_manifest(connection, settings, manifest_path)
    print(f"manifest: ingested={tally['ingested']} skipped={tally['skipped']} missing_source={tally['missing_source']}")
    return 0
def cmd_ingest_earnings(args: argparse.Namespace) -> int:
    """`ingest-earnings`: fetch FMP earnings transcripts for `--ticker`.

    Returns 1 (message on stderr) when no FMP API key is configured, otherwise 0.
    """
    from .ingest.earnings import FMPClient, ingest_for_ticker
    settings = load_config()
    api_key = settings.fmp_api_key
    if not api_key:
        print("FMP_API_KEY not set", file=sys.stderr)
        return 1
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    fmp_client = FMPClient(api_key)
    resolved_source = _resolve_source_id(connection, args.ticker)
    docs_added, jobs_added = ingest_for_ticker(
        connection,
        fmp_client,
        source_id=resolved_source,
        symbol=args.ticker.upper(),
        data_dir=settings.data_dir,
        since=args.since,
        until=args.until,
        limit=args.limit,
    )
    print(f"{args.ticker}: +{docs_added} earnings transcripts, +{jobs_added} extract jobs (since={args.since}, until={args.until})")
    return 0
def cmd_embed_claims(args: argparse.Namespace) -> int:
    """`embed-claims`: ensure the Qdrant collection exists, then embed + upsert pending rows.

    `--no-sparse` skips constructing the sparse embedder (dense-only).
    """
    from .spark import from_config
    from .embedstore.qdrant_store import get_client, ensure_collection, upsert_pending
    from .embedstore.embedder import SparseEmbedder
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    spark = from_config(settings)
    qdrant = get_client(args.qdrant_url)
    if ensure_collection(qdrant):
        print("collection created")
    else:
        print("collection exists")
    sparse_embedder = None if args.no_sparse else SparseEmbedder()
    upserted = upsert_pending(connection, spark, qdrant, sparse_embedder)
    sparse_state = 'on' if sparse_embedder and sparse_embedder.available else 'off'
    print(f"embedded + upserted {upserted} propositions (sparse={sparse_state})")
    return 0
def cmd_search(args: argparse.Namespace) -> int:
    """`search`: query the "propositions" collection through the gateway and print the hits.

    Prints the first 2500 characters of the JSON-formatted hits. The hits are taken from the
    response's "results" key, else "hits", else the whole response.
    """
    # `json` is not imported at module level in this file; without this import the command
    # fails with NameError at the json.dumps call below.
    import json
    from .spark import from_config
    cfg = load_config()
    sc = from_config(cfg)
    res = sc.search(args.query, collection="propositions", top_k=args.top_k, rerank=not args.no_rerank)
    hits = res.get("results") or res.get("hits") or res
    print(json.dumps(hits, indent=2)[:2500])
    return 0
def cmd_ingest_podcast(args: argparse.Namespace) -> int:
    """`ingest-podcast`: register episodes for `--source` via RSS or YouTube.

    With `--via auto`, YouTube is chosen only when the source is flagged "youtube_only" for the
    2022-2023 backtest AND `--since` is given; otherwise RSS. Returns 1 for an unknown source.
    """
    from .ingest.podcasts import ingest_rss, ingest_youtube
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    source_row = connection.execute(
        "SELECT * FROM sources WHERE source_id=?", (args.source,)
    ).fetchone()
    if not source_row:
        print(f"unknown source {args.source}", file=sys.stderr)
        return 1
    channel = args.via
    if channel == "auto":
        youtube_only = source_row["backtest_2022_2023"] == "youtube_only"
        channel = "youtube" if (youtube_only and args.since) else "rss"
    if channel == "youtube":
        ingest = ingest_youtube
    else:
        ingest = ingest_rss
    docs_added, jobs_added = ingest(
        connection, source_row, since=args.since, until=args.until, limit=args.limit
    )
    print(f"{source_row['name']} via {channel}: +{docs_added} episodes, +{jobs_added} transcribe jobs")
    return 0
def cmd_run_transcribe(args: argparse.Namespace) -> int:
    """`run-transcribe`: run the transcription worker and print how many jobs it processed."""
    from .spark import from_config
    from .ingest.transcribe_worker import run_transcribe
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    spark = from_config(settings)
    summary = run_transcribe(
        connection, spark, settings, limit=args.limit, max_chunks=args.max_chunks
    )
    processed = summary['jobs_processed']
    print(f"transcription: {processed} jobs processed")
    return 0
def cmd_run_transcribe_gemini(args: argparse.Namespace) -> int:
    """`run-transcribe-gemini`: run the Gemini transcription worker and print a cost estimate.

    The estimate prices all prompt tokens at the audio-in rate and divides by max(done, 1) for
    the per-episode figure.
    """
    from .ingest.gemini_transcribe import run_transcribe_gemini
    settings = load_config()
    connection = db.connect(settings.db_path)
    report = run_transcribe_gemini(
        connection, settings, limit=args.limit, concurrency=args.concurrency
    )
    tok_in = report["prompt_tokens"]
    tok_out = report["output_tokens"]
    # Gemini 2.5 Flash list price: ~$0.30/1M text-in, audio-in ~$1.00/1M, $2.50/1M out. Audio dominates in.
    est = tok_in / 1_000_000 * 1.00 + tok_out / 1_000_000 * 2.50
    per_episode = est / max(report['done'], 1)
    print(f"gemini transcribe: done={report['done']} failed={report['failed']} | "
          f"tokens in={tok_in:,} out={tok_out:,} | ~${est:.2f} this run (≈${per_episode:.3f}/ep)")
    return 0
def cmd_run_extract(args: argparse.Namespace) -> int:
    """`run-extract`: run the extraction worker and print jobs processed / claims written."""
    from .spark import from_config
    from .extract.worker import run_extract
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    spark = from_config(settings)
    summary = run_extract(
        connection, spark, settings, limit=args.limit, max_chunks_per_doc=args.max_chunks
    )
    jobs_done = summary['jobs_processed']
    claims = summary['claims_written']
    print(f"extraction: {jobs_done} jobs, {claims} claims written")
    return 0
def cmd_queue_status(args: argparse.Namespace) -> int:
    """`queue-status`: print backfill job counts per job type and state, sorted by name."""
    from .backfill import queue
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    by_type = queue.stats(connection)
    if not by_type:
        print("queue empty")
        return 0
    for job_type in sorted(by_type):
        state_counts = sorted(by_type[job_type].items())
        parts = ", ".join(f"{state}={count}" for state, count in state_counts)
        print(f" {job_type}: {parts}")
    return 0
def cmd_feed_peek(args: argparse.Namespace) -> int:
    """`feed-peek`: fetch the feed at `--url` and print status plus the first `--limit` episodes.

    The "oldest"/"newest" line reads the last and first episode records respectively.
    """
    from .ingest.feeds import fetch_feed, episode_records
    feed = fetch_feed(args.url)
    http_status = getattr(feed, "status", None)
    episodes = episode_records(feed)
    print(f"status={http_status} bozo={getattr(feed, 'bozo', None)} episodes_with_audio={len(episodes)}")
    for episode in episodes[: args.limit]:
        print(f" [{episode['published']}] {str(episode['title'])[:70]}")
    if not episodes:
        return 0
    print(f"oldest in feed: {episodes[-1]['published']} newest: {episodes[0]['published']}")
    return 0
def cmd_serve(args: argparse.Namespace) -> int:
    """`serve`: run the web UI with uvicorn on all interfaces.

    The port is `--port` when given (and non-zero), otherwise the configured `ui_port`.
    """
    import uvicorn
    from .ui.app import create_app
    settings = load_config()
    listen_port = args.port if args.port else settings.ui_port
    print(f"serving corpus UI on http://0.0.0.0:{listen_port}")
    application = create_app()
    uvicorn.run(application, host="0.0.0.0", port=listen_port)
    return 0
def cmd_seed_fanout(args: argparse.Namespace) -> int:
    """`seed-fanout`: load fan-out nodes from `--file` and print how many were seeded."""
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    seed_path = Path(args.file)
    seeded = load_fanout(connection, seed_path)
    print(f"seeded {seeded} fan-out derivative nodes")
    return 0
def cmd_backtest(args: argparse.Namespace) -> int:
    """`backtest`: march `as_of` from `--start` to `--end` and report node score trajectories.

    Steps every `--step-days` days (inclusive of the start date), runs `run_backtest` over those
    dates, then prints each node's peak score and first-cleared date, followed by the detailed
    trajectory and verdict for the `--headline` node if it appears in the results.

    Returns 1 (message on stderr) when `--step-days` is not positive, since a non-positive step
    would never advance past `--end`; otherwise 0.
    """
    from .spark import from_config
    from .signals.run import run_backtest
    from datetime import datetime, timedelta
    if args.step_days <= 0:
        print(f"--step-days must be a positive integer (got {args.step_days})", file=sys.stderr)
        return 1
    cfg = load_config()
    conn = db.connect(cfg.db_path)
    db.init_db(conn)
    sc = from_config(cfg)
    # as_of march: start, start+step, ... up to and including end
    start = datetime.strptime(args.start, "%Y-%m-%d")
    end = datetime.strptime(args.end, "%Y-%m-%d")
    step = timedelta(days=args.step_days)
    dates, d = [], start
    while d <= end:
        dates.append(d.strftime("%Y-%m-%d"))
        d = d + step
    # The two dates need a separator; they were previously printed fused together.
    print(f"§7.1 backtest: conviction={args.conviction}, as_of march {args.start}→{args.end} ({len(dates)} points)")
    timeline = run_backtest(conn, sc, cfg, conviction_id=args.conviction, dates=dates, window_days=args.window_days)
    # report: per-node first-clear date + score trajectory; highlight the headline derivative
    print("\n=== node trajectories (score by as_of; ★=cleared evidence bar) ===")
    nodes: dict = {}
    for as_of, res in timeline:
        for r in res:
            key = r["node"]["node_id"] or r["node"]["conviction_id"]
            nodes.setdefault(key, []).append(
                (as_of, r["result"]["score"], r["evidence"], r["promotion"], r["result"]["inputs"])
            )
    for key, traj in sorted(nodes.items()):
        first = next((t for t in traj if t[2]), None)
        peak = max(traj, key=lambda t: t[1])
        mark = f"first-cleared {first[0]}" if first else "never cleared"
        print(f" {key:28} peak={peak[1]:.2f} {mark}")
    head = nodes.get(args.headline)
    if head:
        print(f"\n=== HEADLINE derivative: {args.headline} ===")
        for as_of, score, ev, pr, inp in head:
            # Always one marker character so columns line up: ★ cleared (per the legend
            # above), · scored but not cleared, blank otherwise.
            star = "★" if ev else ("·" if score > 0 else " ")
            print(f" {as_of} {star} score={score:.2f} corrob={inp.get('corroboration',0)} "
                  f"n_conf={inp.get('n_confirmed',0)} eisc={inp.get('eisc_corrob',0)} "
                  f"a={inp.get('a_corrob',0)} k_eff={inp.get('k_eff0',0)}")
        firstclear = next((t for t in head if t[2]), None)
        print(f"\n VERDICT: headline power-infra derivative "
              f"{'SURFACED at ' + firstclear[0] if firstclear else 'did NOT surface'} "
              f"(bar = under_acted ≥ {0.3})")
    return 0
def cmd_two_sided(args: argparse.Namespace) -> int:
    """Two-sided net-corroboration trajectory (DESIGN_v2.1 H5) for the adversarial cases.

    BATTERY: demand-net should rise while supply-net stays flat. STRIKE: net stays quiet in
    live, fires in test.

    For every fan-out node of `--conviction` (optionally narrowed by the `--nodes` substrings,
    matched case-insensitively against node_id) and every `--modes` entry, prints one line per
    `--dates` point.
    """
    from .spark import from_config as spark_from_config
    from .extract.backends import from_config as backend_from_config
    from .signals.two_sided import trajectory
    settings = load_config()
    connection = db.connect(settings.db_path)
    spark = spark_from_config(settings)
    extractor = backend_from_config(settings, spark)
    fanout_rows = connection.execute(
        "SELECT node_id, derivative_proposition FROM fanout_nodes WHERE parent_conviction_id=? ORDER BY node_id",
        (args.conviction,),
    ).fetchall()
    as_of_dates = [piece.strip() for piece in args.dates.split(",")]
    if args.nodes:
        wanted = [piece for piece in args.nodes.split(",") if piece]
    else:
        wanted = []
    for node in fanout_rows:
        node_id = node["node_id"]
        if wanted:
            lowered = node_id.lower()
            if not any(fragment.lower() in lowered for fragment in wanted):
                continue
        for mode in [piece.strip() for piece in args.modes.split(",")]:
            points = trajectory(connection, spark, extractor, node["derivative_proposition"],
                                as_of_dates, window_days=args.window_days, mode=mode)
            print(f"\n### {node['node_id']} [mode={mode}, window={args.window_days}d] ###")
            for pt in points:
                print(f" {pt['as_of']}: net={pt['net']:+.2f} "
                      f"affirm(eisc={pt['affirms_eisc']}, hard_src={pt.get('hard_affirm_src','?')}, "
                      f"n_claims={pt['n_affirm']}, soft_dropped={pt.get('soft_affirm_src_dropped','?')}) "
                      f"deny(eisc={pt['denies_eisc']}, n={pt['n_deny']}) "
                      f"own_net={pt['own_network_affirm_src']}")
    return 0
def cmd_confusion(args: argparse.Namespace) -> int:
    """`confusion-matrix`: run the resolver on `--spec` and print the per-node table + totals.

    Each row shows the node, whether it was confirmed, peak %, whisper/cleared dates and
    runways ('-' when absent), and its classification at the cleared/whisper levels; the
    summary prints TP/FP/FN/TN with precision and recall for both levels.
    """
    from .signals.confusion import run_confusion
    settings = load_config()
    connection = db.connect(settings.db_path)
    db.init_db(connection)
    report = run_confusion(connection, settings, args.spec)
    classify = report["classify"]

    def _or_dash(value) -> str:
        # Only None becomes '-', so a runway of 0 is still printed.
        return str(value if value is not None else '-')

    print("=== PRE-REGISTERED confusion matrix (DESIGN_v2 §1) — precision AND recall; RUNWAY = frac of move still ahead at signal ===")
    print(f"{'derivative':26} {'reprice?':8} {'peak%':>6} {'whisper':>9} {'run_wh':>6} {'cleared':>9} {'run_cl':>6} cl/wh")
    for entry in report["rows"]:
        cl = classify(entry, "cleared")
        wh = classify(entry, "whisper")
        if entry["missing"]:
            miss = f" (no px:{','.join(entry['missing'])})"
        else:
            miss = ""
        verdict = 'REAL' if entry['confirmed'] else 'no'
        peak = str(entry['peak_pct'])
        whisper_date = str(entry['whisper_date'] or '-')
        whisper_runway = _or_dash(entry['runway_whisper'])
        cleared_date = str(entry['cleared_date'] or '-')
        cleared_runway = _or_dash(entry['runway_cleared'])
        print(f"{entry['node']:26} {verdict:8} {peak:>6} "
              f"{whisper_date:>9} {whisper_runway:>6} "
              f"{cleared_date:>9} {cleared_runway:>6} "
              f"{cl}/{wh}{miss}")
    for level in ("cleared", "whisper"):
        counts, precision, recall = report[level]
        shown_precision = precision if precision is None else round(precision, 2)
        shown_recall = recall if recall is None else round(recall, 2)
        print(f"\n{level.upper()} level: TP={counts['TP']} FP={counts['FP']} FN={counts['FN']} TN={counts['TN']} | "
              f"precision={shown_precision} recall={shown_recall}")
    print("\nlead_* = days the repricing came AFTER the signal (positive = engine was early).")
    print("The cleared→whisper delta = what the independence floor cost in lead time / recall.")
    return 0
def cmd_provenance(args: argparse.Namespace) -> int:
    """The processing log — what's been ingested/processed, so we never reprocess silently.

    Prints processed/total document counts per kind, the number of dedup_key groups with more
    than one document, and how many documents lack a dedup_key.

    With `--backfill-hashes`, fills in missing values and commits once at the end:
      - dedup_key: `audio_dedup_key(title, date)` for podcast/youtube documents, otherwise the
        document's external_id (skipped when that value is empty);
      - content_hash: SHA-256 of the transcript file, when the file exists.
    """
    cfg = load_config()
    conn = db.connect(cfg.db_path)
    db.init_db(conn)
    print("processed documents (the durable log):")
    for r in conn.execute(
        "SELECT kind, COUNT(*) total, SUM(CASE WHEN processed_at IS NOT NULL THEN 1 ELSE 0 END) proc "
        "FROM documents GROUP BY kind ORDER BY kind"
    ):
        print(f" {r['kind']:14} {r['proc']}/{r['total']} processed")
    print("dedup model: (1) UNIQUE(source_id, external_id) = robust pre-GPU guard; "
          "(2) dedup_key = cross-mirror (title+date); content_hash = audit only.")
    dups = conn.execute(
        "SELECT dedup_key, COUNT(*) c FROM documents WHERE dedup_key IS NOT NULL "
        "GROUP BY dedup_key HAVING c > 1"
    ).fetchall()
    print(f"cross-mirror dedup_key groups (same episode via >1 feed): {len(dups)}")
    miss = conn.execute("SELECT COUNT(*) FROM documents WHERE dedup_key IS NULL").fetchone()[0]
    if miss:
        print(f" ({miss} docs missing dedup_key — run `provenance --backfill-hashes`)")
    if args.backfill_hashes:
        import hashlib
        from .util import audio_dedup_key
        ndk = nch = 0
        # Materialise the rows first: updating `documents` while a SELECT over it is still
        # being stepped on the same connection has undefined visibility in SQLite.
        docs = conn.execute(
            "SELECT doc_id, kind, title, date, external_id, transcript_path, dedup_key, content_hash FROM documents"
        ).fetchall()
        for r in docs:
            updates: dict = {}
            if not r["dedup_key"]:
                key = (audio_dedup_key(r["title"], r["date"])
                       if r["kind"] in ("podcast", "youtube") else r["external_id"])
                # Don't count (or write) a key that is still empty.
                if key:
                    updates["dedup_key"] = key
                    ndk += 1
            if not r["content_hash"] and r["transcript_path"]:
                transcript = Path(r["transcript_path"])
                if transcript.is_file():
                    # read_bytes() closes the file; the bare open(...).read() leaked the handle.
                    updates["content_hash"] = hashlib.sha256(transcript.read_bytes()).hexdigest()
                    nch += 1
            if updates:
                # Column names come from the fixed keys above, never from user input.
                sets = ", ".join(f"{k}=?" for k in updates)
                conn.execute(f"UPDATE documents SET {sets} WHERE doc_id=?", (*updates.values(), r["doc_id"]))
        conn.commit()
        print(f"backfilled {ndk} dedup_keys, {nch} content hashes (audit)")
    return 0
def cmd_db_tables(args: argparse.Namespace) -> int:
    """`db-tables`: print one table/view name per line."""
    settings = load_config()
    connection = db.connect(settings.db_path)
    names = db.table_names(connection)
    for name in names:
        print(name)
    return 0
def cmd_spark_status(args: argparse.Namespace) -> int:
    """`spark-status`: print the gateway's status and endpoints.

    Returns 0 on success; on any exception prints the failure to stderr and returns 1.
    """
    from .spark import from_config
    settings = load_config()
    spark = from_config(settings)
    try:
        print("status:", spark.status())
        print("endpoints:", spark.endpoints())
    except Exception as exc:  # noqa: BLE001 — health probe; surface, don't crash
        print(f"Spark Control unreachable at {settings.spark_control_url}: {exc}", file=sys.stderr)
        return 1
    else:
        return 0
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level parser; every subcommand stores its handler in `func`.

    A subcommand is required. Subparsers are registered in a fixed order, which is also the
    order they appear in `--help`.
    """
    parser = argparse.ArgumentParser(prog="signal_engine", description="Ten31 Signal Engine (pilot)")
    commands = parser.add_subparsers(dest="command", required=True)

    # --- schema + seeds ---
    init_db_p = commands.add_parser("init-db", help="Create the SQLite schema")
    init_db_p.set_defaults(func=cmd_init_db)

    convictions_p = commands.add_parser("seed-convictions", help="Load the conviction log (§3.1)")
    convictions_p.add_argument("--file", default=str(DEFAULT_CONVICTION_SEED))
    convictions_p.set_defaults(func=cmd_seed_convictions)

    sources_p = commands.add_parser("seed-sources", help="Load the source registry (§7.3/§7.4)")
    sources_p.add_argument("--file", default=str(DEFAULT_SOURCES_SEED))
    sources_p.set_defaults(func=cmd_seed_sources)

    edges_p = commands.add_parser("seed-edges", help="Seed EISC connectedness edges (priors) idempotently")
    edges_p.add_argument("--file", default="seeds/source_edges.bitcoin.seed.yaml")
    edges_p.set_defaults(func=cmd_seed_edges)

    feeds_p = commands.add_parser("load-feeds", help="Apply resolved/verified podcast feed URLs + backtest reach")
    feeds_p.add_argument("--file", default=str(DEFAULT_FEEDS_SEED))
    feeds_p.set_defaults(func=cmd_load_feeds)

    fanout_p = commands.add_parser("seed-fanout", help="Load the hand-written fan-out tree (§7.1 backtest)")
    fanout_p.add_argument("--file", default="seeds/fanout.K2023.seed.yaml")
    fanout_p.set_defaults(func=cmd_seed_fanout)

    # --- backtest ---
    backtest_p = commands.add_parser("backtest", help="Run the §7.1 under-acted-conviction backtest (as-of march)")
    backtest_p.add_argument("--conviction", default="K2023")
    backtest_p.add_argument("--start", default="2023-01-01")
    backtest_p.add_argument("--end", default="2024-06-01")
    backtest_p.add_argument("--step-days", type=int, default=30)
    backtest_p.add_argument("--window-days", type=int, default=90, help="~quarterly for filings/earnings cadence")
    backtest_p.add_argument("--headline", default="K2023-picks-and-shovels")
    backtest_p.set_defaults(func=cmd_backtest)

    # --- text ingestion ---
    edgar_p = commands.add_parser("ingest-edgar", help="Fetch SEC filings for a ticker → documents + extract jobs")
    edgar_p.add_argument("--ticker", required=True)
    edgar_p.add_argument("--since", help="ISO date lower bound, e.g. 2022-01-01")
    edgar_p.add_argument("--until", help="ISO date upper bound, e.g. 2023-12-31")
    edgar_p.add_argument("--forms", help="comma list, default 10-K,10-Q,8-K")
    edgar_p.set_defaults(func=cmd_ingest_edgar)

    doc_p = commands.add_parser("ingest-doc", help="Fetch one text doc (HTML/PDF) → document + extract job (Battery corpus)")
    doc_p.add_argument("--source", required=True, help="source_id (must exist)")
    doc_p.add_argument("--url", required=True)
    doc_p.add_argument("--title")
    doc_p.add_argument("--date", help="ISO date of the document")
    doc_p.add_argument("--method", choices=["auto", "html", "pdf"], default="auto")
    doc_p.set_defaults(func=cmd_ingest_doc)

    manifest_p = commands.add_parser("ingest-doc-manifest", help="Batch-ingest a YAML doc manifest (Battery corpus)")
    manifest_p.add_argument("--file", default="seeds/battery_docs.manifest.yaml")
    manifest_p.set_defaults(func=cmd_ingest_doc_manifest)

    feed_text_p = commands.add_parser("ingest-feed-text", help="Ingest article bodies behind a text RSS feed (blog/press)")
    feed_text_p.add_argument("--source", required=True)
    feed_text_p.add_argument("--url", required=True, help="RSS feed URL")
    feed_text_p.add_argument("--since")
    feed_text_p.add_argument("--until")
    feed_text_p.add_argument("--limit", type=int, default=50)
    feed_text_p.set_defaults(func=cmd_ingest_feed_text)

    earnings_p = commands.add_parser("ingest-earnings", help="Fetch FMP earnings transcripts → documents + extract jobs")
    earnings_p.add_argument("--ticker", required=True)
    earnings_p.add_argument("--since", help="ISO date lower bound (uses transcript date)")
    earnings_p.add_argument("--until", help="ISO date upper bound")
    earnings_p.add_argument("--limit", type=int, default=8)
    earnings_p.set_defaults(func=cmd_ingest_earnings)

    # --- signals / retrieval ---
    two_sided_p = commands.add_parser("two-sided", help="Two-sided net-corroboration trajectory (Strike/Battery adversarial cases)")
    two_sided_p.add_argument("--conviction", default="BATTERY2022")
    two_sided_p.add_argument("--nodes", default="", help="comma substrings to filter fan-out nodes, e.g. demand,supply")
    two_sided_p.add_argument("--dates", default="2022-12-31,2023-06-30,2023-12-31,2024-06-30,2024-12-31")
    two_sided_p.add_argument("--modes", default="live", help="comma list: live,test")
    two_sided_p.add_argument("--window-days", type=int, default=365)
    two_sided_p.set_defaults(func=cmd_two_sided)

    embed_p = commands.add_parser("embed-claims", help="Embed pending propositions → Qdrant hybrid collection (§4.3)")
    embed_p.add_argument("--qdrant-url", default="http://192.168.1.87:6333")
    embed_p.add_argument("--no-sparse", action="store_true", help="dense-only (skip BM25)")
    embed_p.set_defaults(func=cmd_embed_claims)

    search_p = commands.add_parser("search", help="Hybrid search the proposition store via the gateway")
    search_p.add_argument("--query", required=True)
    search_p.add_argument("--top-k", type=int, default=8)
    search_p.add_argument("--no-rerank", action="store_true")
    search_p.set_defaults(func=cmd_search)

    # --- audio ingestion + workers ---
    podcast_p = commands.add_parser("ingest-podcast", help="Register podcast episodes → transcribe jobs (RSS or YouTube)")
    podcast_p.add_argument("--source", required=True, help="source_id, e.g. pod-dwarkesh")
    podcast_p.add_argument("--via", choices=["auto", "rss", "youtube"], default="auto")
    podcast_p.add_argument("--since")
    podcast_p.add_argument("--until")
    podcast_p.add_argument("--limit", type=int, default=20)
    podcast_p.set_defaults(func=cmd_ingest_podcast)

    transcribe_p = commands.add_parser("run-transcribe", help="Drain 'transcribe' jobs → speaker-attributed transcripts + voiceprints")
    transcribe_p.add_argument("--limit", type=int, default=5)
    transcribe_p.add_argument("--max-chunks", type=int, default=999)
    transcribe_p.set_defaults(func=cmd_run_transcribe)

    gemini_p = commands.add_parser("run-transcribe-gemini",
                                   help="One-time backfill: drain 'transcribe' jobs via Gemini (off the Spark GPU)")
    gemini_p.add_argument("--limit", type=int, default=5)
    gemini_p.add_argument("--concurrency", type=int, default=4)
    gemini_p.set_defaults(func=cmd_run_transcribe_gemini)

    extract_p = commands.add_parser("run-extract", help="Drain 'extract' jobs → claims via the local LLM (§4.2)")
    extract_p.add_argument("--limit", type=int, default=5, help="max jobs to process this run")
    extract_p.add_argument("--max-chunks", type=int, default=4, help="max chunks per document")
    extract_p.set_defaults(func=cmd_run_extract)

    # --- status / utilities ---
    queue_p = commands.add_parser("queue-status", help="Backfill queue counts by type/state")
    queue_p.set_defaults(func=cmd_queue_status)

    peek_p = commands.add_parser("feed-peek", help="Parse an RSS feed and show episode coverage")
    peek_p.add_argument("--url", required=True)
    peek_p.add_argument("--limit", type=int, default=5)
    peek_p.set_defaults(func=cmd_feed_peek)

    serve_p = commands.add_parser("serve", help="Run the corpus-management web UI (FastAPI)")
    serve_p.add_argument("--port", type=int, default=None)
    serve_p.set_defaults(func=cmd_serve)

    confusion_p = commands.add_parser("confusion-matrix", help="Pre-registered precision/recall on the §7.1 derivatives (resolver)")
    confusion_p.add_argument("--spec", default="seeds/resolution.K2023.yaml")
    confusion_p.set_defaults(func=cmd_confusion)

    provenance_p = commands.add_parser("provenance", help="Processing log: what's ingested/processed (dedup-safe)")
    provenance_p.add_argument("--backfill-hashes", action="store_true", help="compute content_hash for older transcripts")
    provenance_p.set_defaults(func=cmd_provenance)

    tables_p = commands.add_parser("db-tables", help="List tables/views")
    tables_p.set_defaults(func=cmd_db_tables)

    spark_p = commands.add_parser("spark-status", help="Probe Spark Control health")
    spark_p.set_defaults(func=cmd_spark_status)
    return parser
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse `argv`, configure logging, and run the chosen subcommand.

    Arguments are parsed before configuration is loaded. Returns the handler's int status.
    """
    parser = build_parser()
    parsed = parser.parse_args(argv)
    settings = load_config()
    _setup_logging(settings.log_level)
    handler = parsed.func
    return handler(parsed)
# Script entry: exit the process with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
+101
View File
@@ -0,0 +1,101 @@
"""Environment-driven configuration (handoff §10, §13).
All config flows through env vars so the SAME code runs as a plain process now and, later, as a
StartOS s9pk daemon (which injects these via the daemon's `exec.env` from a `store.json` FileModel).
A local `.env` (gitignored) is loaded for convenience during the pilot.
Live values confirmed against the operator's gateway 2026-06-07 (GET /api/status,/api/endpoints):
gateway = https://192.168.1.72:62419 (self-signed → SPARK_VERIFY_TLS=false)
LLM = RedHatAI/Qwen3.6-35B-A3B-NVFP4
embed = BAAI/bge-m3 (1024-d) rerank = BAAI/bge-reranker-v2-m3
ASR = nvidia/parakeet-tdt-0.6b-v3 diarizer = nvidia/diar_sortformer_4spk-v1
"""
from __future__ import annotations
import os
from dataclasses import dataclass
from pathlib import Path
def _load_dotenv(path: str = ".env") -> None:
    """Minimal .env loader (no dependency): KEY=VALUE lines populate os.environ if not already set.

    Rules:
      - a missing file (or a path that is not a regular file) is a no-op;
      - blank lines, `#` comment lines, and lines without `=` are ignored;
      - an optional leading `export ` (shell style) is dropped from the key;
      - lines whose key is empty (e.g. `=value`) are skipped, because an empty name is not a
        valid environment variable;
      - surrounding double/single quotes are stripped from the value;
      - existing environment variables always win (setdefault).

    The file is read as UTF-8 rather than the platform's locale encoding.
    """
    p = Path(path)
    if not p.is_file():
        return
    export_prefix = "export "
    for raw in p.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, val = line.partition("=")
        key = key.strip()
        if key.startswith(export_prefix):
            key = key[len(export_prefix):].strip()
        if not key:
            continue
        os.environ.setdefault(key, val.strip().strip('"').strip("'"))
def _env(key: str, default: str | None = None) -> str | None:
    """Return the environment variable `key`, or `default` when it is not set.

    A variable that is set to the empty string is returned as "" (not replaced by `default`).
    """
    if key in os.environ:
        return os.environ[key]
    return default
@dataclass(frozen=True)
class Config:
    """Immutable runtime configuration; build it with `Config.from_env()`.

    Every field is read from an environment variable of the upper-cased field name (with the
    defaults shown in `from_env`); `db_path` is derived from `database_url` / `data_dir`.
    """

    spark_control_url: str
    spark_verify_tls: bool
    spark_timeout_s: float
    audio_concurrency: int  # global in-flight cap across BOTH parakeet audio endpoints (sit at 2, ceiling 3)
    local_llm_model: str
    embed_model: str
    transcribe_model: str
    anthropic_api_key: str | None
    frontier_model: str
    # Extraction backend: 'local' (Qwen via Spark Control, default) | 'gemini' (batch overflow/fallback, §scaling)
    extraction_backend: str
    gemini_api_key: str | None
    gemini_model: str
    fmp_api_key: str | None
    edgar_user_agent: str
    data_dir: Path
    database_url: str
    audio_cache_dir: Path
    ui_port: int
    log_level: str

    @classmethod
    def from_env(cls) -> "Config":
        """Load `.env` (without overriding existing variables) and build a Config from os.environ.

        Raises:
            ValueError: if a numeric variable (SPARK_TIMEOUT_S, AUDIO_CONCURRENCY, UI_PORT) is
                set to something that does not parse as a number.
        """
        _load_dotenv()
        data_dir = Path(_env("DATA_DIR", "./data") or "./data")
        # An empty AUDIO_CACHE_DIR is treated like an unset one: the cache lives under data_dir.
        # (It used to fall back to a CWD-relative "audio-cache" in the empty-string case only.)
        audio_cache_dir = Path(_env("AUDIO_CACHE_DIR") or data_dir / "audio-cache")
        return cls(
            spark_control_url=_env("SPARK_CONTROL_URL", "https://192.168.1.72:62419") or "",
            # Only the literal "true" (any case) enables TLS verification.
            spark_verify_tls=(_env("SPARK_VERIFY_TLS", "false") or "false").lower() == "true",
            spark_timeout_s=float(_env("SPARK_TIMEOUT_S", "180") or "180"),
            # Clamped to the range 1..3.
            audio_concurrency=min(3, max(1, int(_env("AUDIO_CONCURRENCY", "2") or "2"))),
            local_llm_model=_env("LOCAL_LLM_MODEL", "RedHatAI/Qwen3.6-35B-A3B-NVFP4") or "",
            embed_model=_env("EMBED_MODEL", "BAAI/bge-m3") or "",
            transcribe_model=_env("TRANSCRIBE_MODEL", "nvidia/parakeet-tdt-0.6b-v3") or "",
            anthropic_api_key=_env("ANTHROPIC_API_KEY"),
            frontier_model=_env("FRONTIER_MODEL", "claude-opus-4-8") or "",
            extraction_backend=_env("EXTRACTION_BACKEND", "local") or "local",
            gemini_api_key=_env("GEMINI_API_KEY"),
            gemini_model=_env("GEMINI_MODEL", "gemini-2.5-flash") or "",
            fmp_api_key=_env("FMP_API_KEY"),
            edgar_user_agent=_env("EDGAR_USER_AGENT", "Ten31 Research grant@ten31.xyz") or "",
            data_dir=data_dir,
            database_url=_env("DATABASE_URL", "") or "",
            audio_cache_dir=audio_cache_dir,
            ui_port=int(_env("UI_PORT", "8000") or "8000"),
            log_level=_env("LOG_LEVEL", "INFO") or "INFO",
        )

    @property
    def db_path(self) -> Path:
        """SQLite file path: the part of `database_url` after "sqlite:///", else data_dir/signal.db."""
        prefix = "sqlite:///"
        if self.database_url.startswith(prefix):
            return Path(self.database_url[len(prefix):])
        return self.data_dir / "signal.db"
def load_config() -> Config:
    """Convenience entry point: build the Config from the current environment."""
    cfg = Config.from_env()
    return cfg
+6
View File
@@ -0,0 +1,6 @@
"""Embedding + vector storage (§4.3).
Embed DISTILLED PROPOSITIONS (not raw chunks) into a Qdrant HYBRID collection: dense bge-m3
(via the gateway) + BM25 sparse (client-side), so entity-heavy propositions (MSTR/Strategy/
Microstrategy) match on the lexical leg too. Retrieval goes through the gateway's /api/search.
"""
+36
View File
@@ -0,0 +1,36 @@
"""Proposition embedding: dense (bge-m3 via gateway) + optional BM25 sparse (client-side)."""
from __future__ import annotations
import logging
log = logging.getLogger(__name__)
def dense_embed(sc, texts: list[str]) -> list[list[float]]:
    """Dense bge-m3 (1024-d) via the gateway /v1/embeddings (§4.3).

    Calls ``sc.embed(texts)`` and returns one vector per input, ordered by each item's
    ``index`` field (items without one sort as index 0).

    Returns ``[]`` without calling the gateway when ``texts`` is empty.
    Raises ``ValueError`` if the response does not contain exactly one embedding per text,
    so a short response can never be silently paired with the wrong inputs.
    """
    if not texts:
        return []
    resp = sc.embed(texts)
    items = sorted(resp["data"], key=lambda d: d.get("index", 0))
    if len(items) != len(texts):
        raise ValueError(f"embedding count mismatch: sent {len(texts)} texts, got {len(items)} vectors")
    return [d["embedding"] for d in items]
class SparseEmbedder:
    """BM25 sparse vectors via FastEmbed `Qdrant/bm25` (the operator's CRM uses this exact model,
    with the collection's `modifier: idf`). Degrades gracefully to dense-only if fastembed is absent."""

    def __init__(self, model_name: str = "Qdrant/bm25") -> None:
        # Start in the degraded state; only flip to available once the model is loaded.
        self._model = None
        self.available = False
        try:
            from fastembed import SparseTextEmbedding

            self._model = SparseTextEmbedding(model_name=model_name)
            self.available = True
        except Exception as e:  # noqa: BLE001
            log.warning("fastembed sparse unavailable (%s) — upserting dense-only; add sparse later", e)

    def embed(self, texts: list[str]) -> list[dict | None]:
        """Return one ``{"indices", "values"}`` dict per text, or ``None`` per text when unavailable."""
        if self._model is None or not self.available:
            return [None] * len(texts)
        return [
            {"indices": vec.indices.tolist(), "values": vec.values.tolist()}
            for vec in self._model.embed(texts)
        ]
+79
View File
@@ -0,0 +1,79 @@
"""Qdrant hybrid collection: create + upsert distilled propositions (§4.3).
Collection mgmt + upserts go DIRECT to Qdrant (§13.2 "(Qdrant direct) :6333"); retrieval goes
through the gateway's /api/search. Named dense vector `bge_m3` (1024-d cosine) + sparse `bm25`
(modifier IDF). Point id is a deterministic UUID5 of claim_id, so re-upsert is idempotent.
"""
from __future__ import annotations
import logging
import sqlite3
import uuid
from qdrant_client import QdrantClient, models
from .embedder import SparseEmbedder, dense_embed
log = logging.getLogger(__name__)
COLLECTION = "propositions"
DENSE = "bge_m3"
SPARSE = "bm25"
_NS = uuid.UUID("5f9b7e10-0000-4000-8000-000000000001")
# Filterable payload (§4.3): stance/topic/cluster/date for stance distributions, time-windowed
# consensus, corroboration lookups. NEVER infer stance from vector distance (§2.2/§5.3).
_PAYLOAD_FIELDS = (
"claim_id", "doc_id", "source_id", "source_cluster", "topic_canonical", "date",
"claim_type", "time_horizon", "confidence", "rel_polarity", "engages_consensus",
"counters_position", "thesis_seam", "salience", "claimant", "proposition",
)
def get_client(qdrant_url: str) -> QdrantClient:
    """Build a Qdrant client for ``qdrant_url`` (gRPC not preferred, 60 s timeout)."""
    options = {"url": qdrant_url, "prefer_grpc": False, "timeout": 60}
    return QdrantClient(**options)
def ensure_collection(client: QdrantClient, *, dim: int = 1024) -> bool:
    """Create the hybrid collection if it does not exist yet.

    Returns True when the collection was created by this call, False when it was already there.
    The collection has one named dense vector (cosine, ``dim`` wide) and one sparse vector
    with the IDF modifier.
    """
    existing = {c.name for c in client.get_collections().collections}
    if COLLECTION not in existing:
        dense_cfg = {DENSE: models.VectorParams(size=dim, distance=models.Distance.COSINE)}
        sparse_cfg = {SPARSE: models.SparseVectorParams(modifier=models.Modifier.IDF)}
        client.create_collection(
            collection_name=COLLECTION,
            vectors_config=dense_cfg,
            sparse_vectors_config=sparse_cfg,
        )
        log.info("created Qdrant collection %r (dense %s %dd + sparse %s/idf)", COLLECTION, DENSE, dim, SPARSE)
        return True
    return False
def _point_id(claim_id: str) -> str:
    """Deterministic point id: the UUID5 of ``claim_id`` under the module namespace, as a string."""
    point_uuid = uuid.uuid5(_NS, claim_id)
    return str(point_uuid)
def upsert_pending(conn: sqlite3.Connection, sc, client: QdrantClient,
                   sparse: SparseEmbedder | None = None, *, batch: int = 64) -> int:
    """Embed + upsert every claim that has no qdrant_point_id yet; back-link the id into SQLite.

    Claims are processed in batches of ``batch`` rows. Each batch is embedded, upserted, and
    committed before the next starts, so a failure only loses the in-flight batch.

    Returns the number of claims upserted.
    Raises ``ValueError`` if ``batch`` < 1, and ``RuntimeError`` if an embedder returns a different
    number of vectors than texts. Without that check ``zip`` would drop the tail of the batch while
    the rows were still marked as upserted.
    """
    if batch < 1:
        raise ValueError("batch must be >= 1")
    rows = conn.execute("SELECT * FROM claims WHERE qdrant_point_id IS NULL").fetchall()
    if not rows:
        return 0
    total = 0
    for start in range(0, len(rows), batch):
        chunk = rows[start:start + batch]
        texts = [r["proposition"] for r in chunk]
        dvecs = dense_embed(sc, texts)
        svecs = sparse.embed(texts) if sparse else [None] * len(texts)
        if len(dvecs) != len(chunk) or len(svecs) != len(chunk):
            raise RuntimeError(
                f"embedding count mismatch for batch at offset {start}: "
                f"{len(chunk)} claims, {len(dvecs)} dense, {len(svecs)} sparse"
            )
        ids = [_point_id(r["claim_id"]) for r in chunk]
        points = []
        for r, pid, dv, sv in zip(chunk, ids, dvecs, svecs):
            vectors: dict = {DENSE: dv}
            if sv is not None:
                vectors[SPARSE] = models.SparseVector(indices=sv["indices"], values=sv["values"])
            payload = {f: r[f] for f in _PAYLOAD_FIELDS}
            points.append(models.PointStruct(id=pid, vector=vectors, payload=payload))
        client.upsert(collection_name=COLLECTION, points=points)
        # Back-link only after the upsert call returned without raising.
        for r, pid in zip(chunk, ids):
            conn.execute("UPDATE claims SET qdrant_point_id=? WHERE claim_id=?", (pid, r["claim_id"]))
        conn.commit()
        total += len(chunk)
    return total
+6
View File
@@ -0,0 +1,6 @@
"""Extraction (§4.2) — local LLM → structured claim units. The cost & quality center.
Emits at the level of the PROPOSITION: a passage may yield 0..N claims, and MOST passages yield
zero. An extractor that dutifully emits a claim per chunk reintroduces exactly the noise the rest
of the system is designed to remove.
"""
+64
View File
@@ -0,0 +1,64 @@
"""Pluggable extraction backends (§scaling).
The §4.2 extractor calls a backend that turns chat messages into a JSON string. Default is the
LOCAL Qwen via Spark Control (the ~95%-local design). The Gemini backend is the documented
overflow/fallback for bulk back-cataloging at scale, or if the Sparks are unavailable — used for
the PUBLIC corpus only, never conviction/exposure data (sovereignty boundary, §4.6).
A backend exposes: complete_json(messages, max_tokens) -> str (a JSON object string).
"""
from __future__ import annotations
import logging
log = logging.getLogger(__name__)
class LocalQwenBackend:
    """Default backend: forwards the chat messages to the client object ``sc`` and returns the reply text."""

    name = "local"

    def __init__(self, sc) -> None:
        self.sc = sc

    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
        """Request a JSON-object completion (temperature 0, thinking disabled) and return its content."""
        options = {
            "json_object": True,
            "temperature": 0,
            "enable_thinking": False,
            "max_tokens": max_tokens,
        }
        response = self.sc.chat(messages, **options)
        first_choice = response["choices"][0]
        return first_choice["message"]["content"]
class GeminiBackend:
    """Gemini fallback/overflow. Implemented against the `google-genai` SDK. NOTE: untested until a
    key is provided — validate end-to-end before relying on it for a real backfill. The async BATCH
    API is the eventual scale path; this synchronous form is the drop-in fallback."""

    name = "gemini"

    def __init__(self, api_key: str, model: str = "gemini-2.5-flash") -> None:
        from google import genai  # guarded import; pip install google-genai

        self._genai = genai
        self.client = genai.Client(api_key=api_key)
        self.model = model

    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
        """Send the messages as one request and return the response text (``"{}"`` if empty).

        System-role messages are joined into the system instruction; every other message is
        joined into the single ``contents`` string.
        """
        from google.genai import types

        system_parts: list[str] = []
        other_parts: list[str] = []
        for m in messages:
            bucket = system_parts if m["role"] == "system" else other_parts
            bucket.append(m["content"])
        system_text = "\n\n".join(system_parts)
        user_text = "\n\n".join(other_parts)

        gen_config = types.GenerateContentConfig(
            system_instruction=system_text or None,
            temperature=0,
            max_output_tokens=max_tokens,
            response_mime_type="application/json",
        )
        resp = self.client.models.generate_content(
            model=self.model,
            contents=user_text,
            config=gen_config,
        )
        return resp.text or "{}"
def from_config(cfg, sc) -> "LocalQwenBackend | GeminiBackend":
    """Pick the extraction backend named by ``cfg.extraction_backend``.

    Gemini is used only when it is selected AND a key is configured; a missing key logs a
    warning and falls back to the local backend, which is also the default.
    """
    wants_gemini = cfg.extraction_backend == "gemini"
    if wants_gemini and cfg.gemini_api_key:
        return GeminiBackend(cfg.gemini_api_key, cfg.gemini_model)
    if wants_gemini:
        log.warning("EXTRACTION_BACKEND=gemini but GEMINI_API_KEY missing — falling back to local")
    return LocalQwenBackend(sc)
+117
View File
@@ -0,0 +1,117 @@
"""Claim extraction: text → 0..N claim units → SQLite (§4.2)."""
from __future__ import annotations
import json
import logging
import sqlite3
from typing import Any
from .prompt import SEED_TOPICS, build_messages
log = logging.getLogger(__name__)
_ENUMS = {
"claim_type": {"interpretive", "predictive", "descriptive", "reactive"},
"time_horizon": {"near", "medium", "long", "unspecified"},
"confidence": {"low", "med", "high"},
"thesis_seam": {"energy_compute", "debasement_bitcoin", "ai_data_ownership", "none"},
"salience": {"central", "secondary", "aside"},
}
def register_seed_topics(conn: sqlite3.Connection) -> None:
    """Pre-load the controlled half of the hybrid topic vocabulary (§4.2).

    Each seed topic is upserted with status 'controlled' (an existing row is promoted),
    then the transaction is committed.
    """
    upsert_sql = (
        "INSERT INTO topics (topic_canonical, status) VALUES (?, 'controlled') "
        "ON CONFLICT(topic_canonical) DO UPDATE SET status='controlled'"
    )
    conn.executemany(upsert_sql, [(topic,) for topic in SEED_TOPICS])
    conn.commit()
def _split_oversized(para: str, max_chars: int) -> list[str]:
    """Cut one paragraph longer than ``max_chars`` into pieces of at most ``max_chars``.

    Prefers the last newline/space inside each window so words stay intact; falls back to a
    hard cut when the window contains no whitespace.
    """
    pieces: list[str] = []
    rest = para.strip()
    while len(rest) > max_chars:
        window = rest[:max_chars]
        cut = max(window.rfind("\n"), window.rfind(" "))
        if cut <= 0:
            cut = max_chars  # no usable whitespace: hard cut
        piece = rest[:cut].rstrip()
        if piece:
            pieces.append(piece)
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def chunk_text(text: str, max_chars: int) -> list[str]:
    """Split on paragraph boundaries into windows that fit the model context alongside the prompt.

    Paragraphs (separated by blank lines) are packed greedily into chunks of at most
    ``max_chars`` characters. A single paragraph longer than ``max_chars`` (for example a
    transcript with no blank lines) is itself split on whitespace, so no returned chunk exceeds
    the limit.

    Returns ``[]`` for blank input. Raises ``ValueError`` if ``max_chars`` < 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be >= 1")
    text = text.strip()
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    chunks: list[str] = []
    cur: list[str] = []
    size = 0  # length of "\n\n".join(cur) plus one trailing separator
    for para in text.split("\n\n"):
        if len(para) > max_chars:
            # Flush what we have, then emit the oversized paragraph as its own bounded pieces.
            if cur:
                chunks.append("\n\n".join(cur))
                cur, size = [], 0
            chunks.extend(_split_oversized(para, max_chars))
            continue
        if cur and size + len(para) > max_chars:
            chunks.append("\n\n".join(cur))
            cur, size = [], 0
        cur.append(para)
        size += len(para) + 2
    if cur:
        chunks.append("\n\n".join(cur))
    return chunks
def _parse_claims(content: str) -> list[dict]:
try:
obj = json.loads(content)
except Exception:
i, j = content.find("{"), content.rfind("}")
if i < 0 or j < 0:
return []
try:
obj = json.loads(content[i:j + 1])
except Exception:
return []
claims = obj.get("claims", []) if isinstance(obj, dict) else []
return [c for c in claims if isinstance(c, dict) and c.get("proposition")]
def extract_claims_from_text(backend, text: str, *, source_name: str, source_cluster: str | None,
                             date: str | None, kind: str) -> list[dict]:
    """Run the extraction prompt over ``text`` and return the parsed claim dicts.

    `backend` is any object with .complete_json(messages, max_tokens) -> str
    (see extract.backends: LocalQwenBackend | GeminiBackend).
    """
    prompt_messages = build_messages(
        text,
        source_name=source_name,
        source_cluster=source_cluster,
        date=date,
        kind=kind,
    )
    return _parse_claims(backend.complete_json(prompt_messages, max_tokens=4000))
def _enum(c: dict, field: str, default: str) -> str:
    """Return ``c[field]`` if it is one of the allowed values for ``field``, else ``default``.

    The value is compared case-insensitively after trimming whitespace. Non-string values
    (including lists/dicts, which are unhashable and would raise on set membership) yield
    ``default``.
    """
    v = c.get(field)
    if not isinstance(v, str):
        return default
    v = v.strip().lower()
    return v if v in _ENUMS[field] else default
def persist_claims(conn: sqlite3.Connection, *, doc: sqlite3.Row, source: sqlite3.Row | None,
                   claims: list[dict], chunk_idx: int) -> int:
    """Insert the claims extracted from one chunk of ``doc`` and commit.

    Claim ids are ``{doc_id}:{chunk_idx}:{position}``, so re-running the same chunk is a no-op
    (``INSERT OR IGNORE``). Enum fields are validated with ``_enum``. ``rel_polarity`` is always
    written as ``"none"`` here.

    Returns the number of rows actually inserted. Rows ignored as duplicates and entries without
    a proposition are not counted.
    """
    def _bindable(value: Any) -> Any:
        # SQLite cannot bind lists/dicts; one malformed model field must not abort the document.
        if isinstance(value, (list, dict)):
            return json.dumps(value, ensure_ascii=False)
        return value

    n = 0
    cluster = source["source_cluster"] if source else None
    for i, c in enumerate(claims):
        proposition = c.get("proposition")
        if not proposition:
            continue
        seam = _enum(c, "thesis_seam", "none")
        raw_topic = c.get("topic_canonical")
        # A topic must be a non-blank string to be usable as a key; anything else is stored as NULL.
        topic = (raw_topic.strip() or None) if isinstance(raw_topic, str) else None
        if topic:
            # register emergent topics BEFORE the claim (claims.topic_canonical is a FK → topics)
            conn.execute(
                "INSERT OR IGNORE INTO topics (topic_canonical, status, seam) VALUES (?, 'emergent', ?)",
                (topic, seam),
            )
        claim_id = f"{doc['doc_id']}:{chunk_idx}:{i}"
        cur = conn.execute(
            """INSERT OR IGNORE INTO claims
            (claim_id, doc_id, source_id, proposition, topic_canonical, topic_raw, claimant,
            source_cluster, date, claim_type, time_horizon, confidence, rel_polarity,
            engages_consensus, counters_position, thesis_seam, salience)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
            (
                claim_id, doc["doc_id"], doc["source_id"], str(proposition)[:1000],
                topic, _bindable(c.get("topic_raw")),
                _bindable(c.get("claimant")) or (source["name"] if source else None),
                cluster, doc["date"],
                _enum(c, "claim_type", "descriptive"), _enum(c, "time_horizon", "unspecified"),
                _enum(c, "confidence", "med"), "none",
                1 if c.get("engages_consensus") else 0, _bindable(c.get("counters_position")),
                seam, _enum(c, "salience", "secondary"),
            ),
        )
        # rowcount is 0 when OR IGNORE skipped an existing claim_id.
        if cur.rowcount > 0:
            n += 1
    conn.commit()
    return n
+47
View File
@@ -0,0 +1,47 @@
"""SEC filing HTML → plain text. Stdlib only (boring, inspectable).
Drops script/style/head and inline-XBRL hidden blocks (10-Ks embed a huge <ix:hidden> section of
numeric facts that would otherwise swamp the extractor), and collapses whitespace.
"""
from __future__ import annotations
import re
from html.parser import HTMLParser
_SKIP_TAGS = {"script", "style", "head"}
_SKIP_PREFIXES = ("ix:hidden",)  # inline-XBRL hidden fact dump
_BLOCK_TAGS = {"p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6", "table"}


class _Stripper(HTMLParser):
    """Collects visible text into ``_parts``, skipping script/style/head and ix:hidden subtrees."""

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._skip_depth = 0  # > 0 while inside a skipped element
        self._parts: list[str] = []

    def handle_starttag(self, tag: str, attrs) -> None:
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            self._skip_depth += 1
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            # Clamp at 0 so a stray closing tag cannot push the depth negative.
            self._skip_depth = max(0, self._skip_depth - 1)
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        if self._skip_depth:
            return
        if data.strip():
            self._parts.append(data)
        elif data:
            # Whitespace between inline elements ("<b>a</b> <i>b</i>") separates words; keep one
            # space so they are not glued together. Runs are collapsed in html_to_text.
            self._parts.append(" ")


def html_to_text(html: str, *, max_chars: int = 300_000) -> str:
    """Convert HTML to plain text, truncated to ``max_chars`` characters.

    Block-level tags become line breaks; runs of spaces/tabs/non-breaking spaces collapse to
    one space; three or more consecutive line breaks collapse to one blank line; every line and
    the whole result are stripped.
    """
    p = _Stripper()
    p.feed(html)
    # close() flushes text the parser is still buffering (e.g. trailing text with a bare "&").
    p.close()
    text = "".join(p._parts)
    text = re.sub(r"[ \t\u00a0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines())
    text = text.strip()
    return text[:max_chars]
+72
View File
@@ -0,0 +1,72 @@
"""The §4.2 claim-extraction prompt. Prompt engineering is ours (§13.3); the schema is finalized.
Discipline encoded here (the whole point of the system, §2/§4.2):
- Extract at the level of the PROPOSITION; emit ZERO when there is no substantive claim.
- Separate topic from stance: capture stance-vs-consensus explicitly, never as a bull/bear label.
- thesis_seam is a TAG, not a filter — off-thesis and anti-thesis claims are still extracted.
"""
from __future__ import annotations
# Hybrid topic vocabulary (§4.2): a small SEEDED controlled list. The model reuses one when it
# fits and proposes a concise snake_case topic otherwise; emergent topics are merged on a schedule.
SEED_TOPICS = [
# energy <-> compute
"ai_compute_demand", "ai_power_constraint", "datacenter_buildout", "grid_interconnect",
"transformers_equipment", "nuclear_power", "natural_gas_power", "uranium_supply",
"cooling_infrastructure", "miner_flexible_load", "mining_ai_pivot",
# debasement <-> bitcoin
"bitcoin_reserve_asset", "bitcoin_collateral_credit", "bitcoin_treasury_strategy",
"btc_custody_regulation", "sovereign_bitcoin_adoption",
# ai <-> data ownership
"ai_data_ownership", "confidential_inference", "ai_commoditization",
# macro
"fed_policy", "fiscal_debasement", "stablecoins_cbdc",
]
_SYSTEM = """You are the claim-extraction component of an investment signal engine. You read a passage \
(an SEC filing excerpt or a podcast/earnings-call transcript) and extract structured CLAIM UNITS.
A CLAIM UNIT is a single normalized proposition that someone asserts — a forward-looking prediction, \
an interpretive or causal judgment, or a stance taken against a prevailing view. It must be specific \
enough to later be checked against the world.
CRITICAL DISCIPLINE — be willing to extract NOTHING:
- Most passages contain ZERO claim units. Boilerplate, legal disclaimers, ad reads, pleasantries, \
generic descriptions, routine financial line-items, and recitations of well-known news are NOT claims.
- Do NOT invent claims. Do NOT emit one claim per paragraph to seem thorough. If the passage has no \
substantive proposition, return {"claims": []}. A precise empty answer is the correct, valued output.
- Extract at the level of the PROPOSITION: one normalized subject-assertion-object sentence each. A \
single rich passage may yield several; a long dull one yields none.
For EACH claim unit, output these fields:
- "proposition": one normalized sentence (subject-assertion-object), self-contained.
- "topic_canonical": a concise snake_case topic for clustering. REUSE one of the provided seed topics \
when it fits; otherwise propose a new concise snake_case label. Normalize synonyms (Fed/FOMC/rates → fed_policy).
- "topic_raw": the topic as actually phrased in the passage.
- "claimant": who asserts it (speaker name or the filing company). Use "unknown" if unclear.
- "claim_type": one of interpretive | predictive | descriptive | reactive. (interpretive/predictive = \
insight; descriptive/reactive = news echo — extract those only if clearly salient.)
- "time_horizon": one of near | medium | long | unspecified (for predictive claims especially).
- "confidence": the claimant's apparent conviction — one of low | med | high.
- "engages_consensus": true ONLY if the claim explicitly argues against a stated mainstream view.
- "counters_position": the mainstream position it argues against, or null.
- "thesis_seam": one of energy_compute | debasement_bitcoin | ai_data_ownership | none. This is a TAG \
for relevance only — tag off-thesis claims "none" and STILL extract them.
- "salience": central | secondary | aside (how central the claim is to the passage).
Return ONLY a JSON object: {"claims": [ {...}, ... ]}. No prose, no markdown."""
def build_messages(text: str, *, source_name: str, source_cluster: str | None,
                   date: str | None, kind: str) -> list[dict[str, str]]:
    """Build the two-message chat prompt: the fixed system prompt plus a user message.

    The user message carries the source context line, the seed topics, and the passage itself.
    Missing source name / cluster / date are rendered as 'unknown' / 'n/a' / 'n/a'.
    """
    seed = ", ".join(SEED_TOPICS)
    header = (
        f"Source: {source_name or 'unknown'} (cluster: {source_cluster or 'n/a'}, type: {kind}, "
        f"date: {date or 'n/a'}).\n"
    )
    seed_line = f"Seed topics to reuse when they fit: {seed}.\n\n"
    passage = f"PASSAGE:\n{text}"

    system_msg = {"role": "system", "content": _SYSTEM}
    user_msg = {"role": "user", "content": header + seed_line + passage}
    return [system_msg, user_msg]
+69
View File
@@ -0,0 +1,69 @@
"""Extraction worker — drains 'extract' jobs from the backfill queue (§4.2, §13.4).
Single sequential worker by design: extraction is the heavier serial load on the one LLM GPU.
For each job: load the document, get its text (fetch+strip filing HTML, or read a stored transcript),
chunk it, run the §4.2 extractor per chunk, persist 0..N claims, complete the job.
"""
from __future__ import annotations
import logging
from pathlib import Path
import requests
from ..backfill import queue
from . import claims as claims_mod
from .html_text import html_to_text
log = logging.getLogger(__name__)
def _document_text(doc, *, user_agent: str) -> str:
if doc["transcript_path"]:
return Path(doc["transcript_path"]).read_text()
if doc["kind"] == "filing" and doc["url"]:
r = requests.get(doc["url"], headers={"User-Agent": user_agent}, timeout=90)
r.raise_for_status()
return html_to_text(r.text)
raise ValueError(f"no text source for {doc['doc_id']} (kind={doc['kind']}, url={doc['url']})")
def run_extract(conn, sc, cfg, *, limit: int = 10, max_chunks_per_doc: int = 4,
                chunk_chars: int = 18_000, lease_seconds: int = 900,
                worker_id: str = "extract-1") -> dict:
    """Lease and process up to ``limit`` 'extract' jobs, one at a time.

    For each job: load its document, get the text, cut it into chunks of ``chunk_chars`` (only the
    first ``max_chunks_per_doc`` are used), extract + persist claims per chunk, stamp
    ``processed_at`` and complete the job. A job whose document row is missing is skipped; any
    exception while processing a document is reported via ``queue.fail`` and the loop continues.

    Returns ``{"jobs_processed": int, "claims_written": int}``.
    """
    from .backends import from_config as backend_from_config

    backend = backend_from_config(cfg, sc)
    log.info("extraction backend: %s", backend.name)
    claims_mod.register_seed_topics(conn)

    jobs_done = 0
    claims_written = 0
    while jobs_done < limit:
        job = queue.lease_next(conn, worker_id=worker_id, job_types=["extract"],
                               lease_seconds=lease_seconds)
        if job is None:
            # Queue drained.
            break
        jobs_done += 1

        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
        if doc is None:
            queue.skip(conn, job["job_id"], "document missing")
            continue
        src = conn.execute("SELECT * FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()

        try:
            full_text = _document_text(doc, user_agent=cfg.edgar_user_agent)
            pieces = claims_mod.chunk_text(full_text, chunk_chars)[:max_chunks_per_doc]
            written = 0
            for piece_idx, piece in enumerate(pieces):
                extracted = claims_mod.extract_claims_from_text(
                    backend, piece,
                    source_name=src["name"] if src else "",
                    source_cluster=src["source_cluster"] if src else None,
                    date=doc["date"], kind=doc["kind"],
                )
                written += claims_mod.persist_claims(
                    conn, doc=doc, source=src, claims=extracted, chunk_idx=piece_idx,
                )
            conn.execute("UPDATE documents SET processed_at=datetime('now') WHERE doc_id=?", (doc["doc_id"],))
            conn.commit()
            queue.complete(conn, job["job_id"], output_ref=f"{written} claims / {len(pieces)} chunks")
            claims_written += written
            log.info("extracted %d claims from %s (%d chunks)", written, doc["doc_id"], len(pieces))
        except Exception as exc:  # noqa: BLE001
            outcome = queue.fail(conn, job["job_id"], exc)
            log.warning("extract failed for %s: %s (→ %s)", job["target_id"], exc, outcome)

    return {"jobs_processed": jobs_done, "claims_written": claims_written}
+5
View File
@@ -0,0 +1,5 @@
"""Ingestion layer (§4.1) — the biggest greenfield piece.
Spark Control transcribes audio you hand it; it does NOT fetch. Everything here is fetch/schedule:
RSS + YouTube + EDGAR + FMP earnings, long-audio chunking, and cross-chunk speaker stitching.
"""
+36
View File
@@ -0,0 +1,36 @@
"""Long-audio chunking (§4.1, §13.4).
Podcasts run 1-3 h; the diarizer caps at 4 speakers/chunk and Spark 2 is a single GPU, so we cut
long audio into ~2-3 min pieces sent SEQUENTIALLY (parallel audio → 503 FFT race). Each chunk is
diarized independently and re-stitched across chunks by voiceprint (see speaker_stitch.py).
Requires ffmpeg/ffprobe.
"""
from __future__ import annotations
import subprocess
from pathlib import Path
CHUNK_SECONDS_DEFAULT = 150  # 2.5 min, within the ~2-3 min guidance
def duration_seconds(src: str | Path) -> float:
    """Return the container duration of ``src`` in seconds, as reported by ffprobe.

    Raises ``subprocess.CalledProcessError`` if ffprobe exits non-zero and ``ValueError`` if
    its output is not a number.
    """
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        str(src),
    ]
    result = subprocess.run(probe_cmd, check=True, capture_output=True, text=True)
    reported = result.stdout.strip()
    return float(reported)
def chunk_audio(src: str | Path, out_dir: str | Path, *, chunk_seconds: int = CHUNK_SECONDS_DEFAULT) -> list[Path]:
    """Split into fixed-length WAV chunks using ffmpeg's segment muxer (no re-encode of timing).

    Output is 16 kHz mono, written as ``out_dir/chunk_NNNN.wav`` with timestamps reset per chunk.
    Any ``chunk_*.wav`` already in ``out_dir`` is removed first. Otherwise leftovers from a longer
    previous input would be returned as if they belonged to this one.

    Returns chunk paths in order. Order matters: the queue sends them sequentially.
    Raises ``ValueError`` if ``chunk_seconds`` < 1 and ``subprocess.CalledProcessError`` if ffmpeg fails.
    """
    if chunk_seconds < 1:
        raise ValueError("chunk_seconds must be >= 1")
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    for stale in out_dir.glob("chunk_*.wav"):
        stale.unlink()
    pattern = str(out_dir / "chunk_%04d.wav")
    subprocess.run(
        ["ffmpeg", "-y", "-i", str(src), "-f", "segment", "-segment_time", str(chunk_seconds),
         "-ar", "16000", "-ac", "1", "-reset_timestamps", "1", pattern],
        check=True, capture_output=True,
    )
    return sorted(out_dir.glob("chunk_*.wav"))
+159
View File
@@ -0,0 +1,159 @@
"""Text-document fetcher for the Battery (bitcoin-collateralized lending) corpus and any non-filing,
non-audio source: policy primaries (SEC SABs, OCC/FDIC/Fed), lender/issuer blogs, credit-market data.
Unlike EDGAR (CIK-driven) and the podcast path (audio→transcribe), these are dated HTML pages, PDFs, or
article RSS feeds. We fetch ONCE, extract clean text (HTML via html_to_text, PDF via pypdf), save it, and
point documents.transcript_path at the saved text so the extract worker reads it directly (it already
supports transcript_path) — this also lets PDFs work, which the worker's on-demand html_to_text fetch can't.
A source row must exist first (FK). Lineage/axis live on the source's cluster/notes (set in the seed);
policy sources are axis=context and must NOT feed the supply resolver (weight 0) — enforced downstream.
"""
from __future__ import annotations
import hashlib
import io
import logging
import sqlite3
from pathlib import Path
import requests
from ..backfill import queue
from ..extract.html_text import html_to_text
from .feeds import fetch_feed
log = logging.getLogger(__name__)
DEFAULT_UA = "ten31-signal-engine/1.0 (research; contact ops@ten31.xyz)"
def _pdf_to_text(data: bytes, *, max_chars: int) -> str:
    """Extract text from PDF bytes page by page, stopping once more than ``max_chars`` is collected.

    Pages are joined with newlines and the result is truncated to ``max_chars``. A page with no
    extractable text contributes an empty string.
    """
    import pypdf

    pages = pypdf.PdfReader(io.BytesIO(data)).pages
    collected: list[str] = []
    running = 0
    for pg in pages:
        page_text = pg.extract_text() or ""
        collected.append(page_text)
        running += len(page_text)
        if running > max_chars:
            # Enough text already; skip the remaining pages.
            break
    joined = "\n".join(collected)
    return joined[:max_chars]
def fetch_clean_text(url: str, *, method: str = "auto", ua: str = DEFAULT_UA,
                     timeout: int = 90, max_chars: int = 300_000) -> str:
    """Fetch a URL once and return clean text. Auto-detects PDF vs HTML by content-type + magic bytes.

    ``method="pdf"`` forces PDF parsing. Anything not detected as PDF is treated as HTML.
    Raises the ``requests`` exception on network failure or a non-2xx status.
    """
    r = requests.get(url, headers={"User-Agent": ua}, timeout=timeout)
    r.raise_for_status()
    ctype = r.headers.get("Content-Type", "").lower()
    is_pdf = method == "pdf" or "application/pdf" in ctype or r.content[:5] == b"%PDF-"
    if is_pdf:
        return _pdf_to_text(r.content, max_chars=max_chars)
    # requests decodes text/* bodies as ISO-8859-1 when no charset is declared, which garbles
    # UTF-8 pages. Try strict UTF-8 first in that case, then fall back to requests' decoding.
    if "charset" not in ctype:
        try:
            return html_to_text(r.content.decode("utf-8"), max_chars=max_chars)
        except UnicodeDecodeError:
            pass
    return html_to_text(r.text, max_chars=max_chars)
_BLOCK_MARKERS = (
"aggressive automated scraping", "request access", "access denied", "are you a robot",
"enable javascript", "captcha", "verify you are human", "rate limit exceeded",
"403 forbidden", "unusual traffic", "checking your browser",
)
def _looks_blocked(text: str) -> bool:
"""Anti-scraping interstitials return 200 + a short access-denied body. Detect so we don't ingest
a block page as if it were the document (a real policy/blog doc is long and has no such markers)."""
low = text[:2500].lower()
return any(m in low for m in _BLOCK_MARKERS)
def _doc_id(source_id: str, url: str) -> str:
return f"doc:{source_id}:{hashlib.sha256(url.encode()).hexdigest()[:12]}"
def ingest_one(conn: sqlite3.Connection, cfg, *, source_id: str, url: str, title: str,
               date: str | None, method: str = "auto", prompt_version: str = "extract-v0",
               min_chars: int = 400) -> str | None:
    """Fetch+store one text document and enqueue extraction. Idempotent on (source_id, url).

    The cleaned text is saved under ``<data_dir>/docs/`` and the document row points at it through
    ``transcript_path``; an 'extract' job keyed on (doc_id, prompt_version) is then enqueued.

    Returns doc_id if newly ingested, else None (duplicate, too-short, blocked page, or fetch
    error → logged).
    """
    doc_id = _doc_id(source_id, url)
    already_known = conn.execute("SELECT 1 FROM documents WHERE doc_id=?", (doc_id,)).fetchone()
    if already_known:
        return None

    agent = getattr(cfg, "user_agent", None) or DEFAULT_UA
    try:
        body = fetch_clean_text(url, method=method, ua=agent)
    except Exception as exc:  # noqa: BLE001
        log.warning("doc fetch failed %s: %s", url, exc)
        return None

    if not body or len(body) < min_chars:
        log.warning("doc too short (%d chars), skipping %s", len(body or ""), url)
        return None
    if _looks_blocked(body):
        log.warning("blocked/anti-scrape page detected, skipping %s", url)
        return None

    # Persist the text first so the row never points at a file that does not exist.
    file_stem = doc_id.replace(":", "_")
    text_path = Path(cfg.data_dir) / "docs" / f"{file_stem}.txt"
    text_path.parent.mkdir(parents=True, exist_ok=True)
    text_path.write_text(body)

    body_hash = hashlib.sha256(body.encode()).hexdigest()
    shown_title = title[:300] if title else url
    row = (doc_id, source_id, "filing", url, url, shown_title, date, str(text_path), body_hash)
    conn.execute(
        """INSERT OR IGNORE INTO documents
        (doc_id, source_id, kind, external_id, url, title, date, transcript_path, content_hash, processed_at)
        VALUES (?,?,?,?,?,?,?,?,?,datetime('now'))""",
        row,
    )
    conn.commit()

    job_hash = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
    queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=job_hash,
                  parent_doc_id=doc_id, priority=50)
    conn.commit()
    log.info("ingested doc %s (%d chars) for %s", doc_id, len(body), source_id)
    return doc_id
def ingest_manifest(conn: sqlite3.Connection, cfg, path) -> dict:
    """Batch-ingest the docs listed in a YAML manifest ({docs:[{source,url,title,date,method}]}).

    Each source must already exist (FK); entries whose source is missing are counted under
    ``missing_source``. Malformed entries (not a mapping, or without a ``url``) are logged and
    counted as skipped instead of aborting the rest of the batch. An empty or ``null`` ``docs``
    key is an empty manifest.

    Returns {ingested, skipped, missing_source}.
    Raises ``ValueError`` if the manifest's top level is not a mapping.
    """
    import yaml

    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    if not isinstance(data, dict):
        raise ValueError(f"manifest {path} must be a mapping with a 'docs' list")
    docs = data.get("docs") or []
    ingested = skipped = missing = 0
    for d in docs:
        if not isinstance(d, dict) or not d.get("url"):
            log.warning("manifest entry has no url — skipping %r", d)
            skipped += 1
            continue
        src = d.get("source")
        if not conn.execute("SELECT 1 FROM sources WHERE source_id=?", (src,)).fetchone():
            log.warning("manifest doc references missing source %r — skipping %s", src, d.get("url"))
            missing += 1
            continue
        doc_id = ingest_one(conn, cfg, source_id=src, url=d["url"], title=d.get("title", d["url"]),
                            date=d.get("date"), method=d.get("method", "auto"))
        if doc_id:
            ingested += 1
        else:
            skipped += 1
    return {"ingested": ingested, "skipped": skipped, "missing_source": missing}
def ingest_feed_text(conn: sqlite3.Connection, cfg, *, source_id: str, rss_url: str,
                     since: str | None = None, until: str | None = None, limit: int = 50) -> int:
    """Ingest the ARTICLE bodies behind a text RSS feed (blog/press feed).

    Each item's link is fetched and stored as a dated text document via ``ingest_one``. Items
    without a link are ignored; items whose date falls before ``since`` or after ``until`` are
    filtered out (undated items pass the filter). Stops once ``limit`` new documents were ingested.

    Returns count of newly-ingested docs.
    """
    from .feeds import _published_iso

    agent = getattr(cfg, "user_agent", None) or DEFAULT_UA
    feed = fetch_feed(rss_url, user_agent=agent)
    new_docs = 0
    for item in feed.entries:
        if new_docs >= limit:
            break
        link = item.get("link")
        if not link:
            continue
        published = _published_iso(item)
        too_old = bool(since and published and published < since)
        too_new = bool(until and published and published > until)
        if too_old or too_new:
            continue
        created = ingest_one(conn, cfg, source_id=source_id, url=link,
                             title=item.get("title", link), date=published)
        if created:
            new_docs += 1
    return new_docs
+61
View File
@@ -0,0 +1,61 @@
"""Audio acquisition (§4.1). Spark Control transcribes audio you fetch — this fetches it.
- Podcast enclosures: a plain streaming download that follows the Podtrac/Megaphone redirects to the
final signed CDN object (download immediately; resolved URLs carry short-lived params).
- YouTube: yt-dlp (audio-only → 16 kHz mono WAV). NOTE: 2026 YouTube enforces PO Tokens broadly — run
the `bgutil-ytdlp-pot-provider` sidecar or pulls will 403. yt-dlp is treated as a LAST resort; prefer
the RSS enclosure where a show publishes both (ToS: downloading YT audio violates YouTube ToS).
"""
from __future__ import annotations
import subprocess
from pathlib import Path
import requests
DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"


def download_enclosure(url: str, dest: str | Path, *, user_agent: str = DEFAULT_UA, timeout: int = 120) -> Path:
    """Stream ``url`` to ``dest`` (following redirects) and return ``dest`` as a Path.

    The body is written to ``<dest>.part`` and renamed onto ``dest`` only after the download
    finished, so an interrupted transfer never leaves a truncated file at ``dest``. The partial
    file is removed on failure.

    Raises the ``requests`` exception on network failure or a non-2xx status.
    """
    dest = Path(dest)
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + ".part")
    try:
        with requests.get(url, stream=True, allow_redirects=True,
                          headers={"User-Agent": user_agent}, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 16):
                    if chunk:
                        f.write(chunk)
        partial.replace(dest)
    finally:
        # After a successful replace the partial no longer exists; otherwise clean it up.
        if partial.exists():
            partial.unlink()
    return dest
def to_wav_16k_mono(src: str | Path, dst: str | Path) -> Path:
    """Normalize any audio to 16 kHz mono PCM WAV (what the ASR endpoint wants). Requires ffmpeg.

    Creates the destination directory if needed, overwrites ``dst``, and returns it as a Path.
    Raises ``subprocess.CalledProcessError`` if ffmpeg exits non-zero.
    """
    target = Path(dst)
    target.parent.mkdir(parents=True, exist_ok=True)
    convert_cmd = ["ffmpeg", "-y", "-i", str(src), "-ar", "16000", "-ac", "1", "-f", "wav", str(target)]
    subprocess.run(convert_cmd, check=True, capture_output=True)
    return target
def download_youtube_audio(url: str, out_dir: str | Path, *, archive_file: str | Path | None = None) -> Path:
    """Audio-only via yt-dlp → 16 kHz mono WAV. `archive_file` (yt-dlp --download-archive) is the
    canonical 'only-new' dedup for channel/playlist back-catalog pulls.

    Returns the newest wav that THIS call created or rewrote in ``out_dir``. Wavs that were already
    there and untouched are never returned, so an archive-skipped URL cannot hand back an unrelated
    older download.

    Raises ``RuntimeError`` if the run produced no new wav and ``subprocess.CalledProcessError`` if
    yt-dlp exits non-zero.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    cmd = [
        "yt-dlp", "-f", "bestaudio/best", "-x", "--audio-format", "wav",
        "--postprocessor-args", "ffmpeg:-ar 16000 -ac 1",
        "-o", str(out_dir / "%(id)s.%(ext)s"),
        "--no-progress",
    ]
    if archive_file:
        cmd += ["--download-archive", str(archive_file)]
    cmd.append(url)
    # Snapshot existing wavs so we can tell what this run actually produced.
    before = {p: p.stat().st_mtime_ns for p in out_dir.glob("*.wav")}
    subprocess.run(cmd, check=True, capture_output=True)
    # yt-dlp names the file by video id; keep only files that are new or were rewritten.
    fresh = [p for p in out_dir.glob("*.wav") if before.get(p) != p.stat().st_mtime_ns]
    if not fresh:
        if before:
            raise RuntimeError("yt-dlp produced no new wav (already in the download archive, or a "
                               "PO-token/cookies issue? see module docstring)")
        raise RuntimeError("yt-dlp produced no wav (PO-token/cookies issue? see module docstring)")
    return max(fresh, key=lambda p: p.stat().st_mtime)
+127
View File
@@ -0,0 +1,127 @@
"""Earnings-call transcripts via Financial Modeling Prep (§4.1, §12 — decision: FMP).
Audio isn't reliably fetchable for large-caps (no uniform feed; ~30-90d replay expiry breaks
backfill), so FMP's transcript API is the backbone and EDGAR filings remain the durable core. FMP
also exposes an earnings *calendar* to trigger ingestion on the day a call drops.
Endpoint paths/params are marked TODO(contract): confirm against the FMP 'stable' docs for the
account tier at integration. Needs config.fmp_api_key.
"""
from __future__ import annotations
import hashlib
import sqlite3
from pathlib import Path
from typing import Any
import requests
FMP_BASE = "https://financialmodelingprep.com/stable"


class FMPClient:
    """Small Financial Modeling Prep client: one shared HTTP session, API key sent as a query param."""

    def __init__(self, api_key: str, *, base: str = FMP_BASE, timeout: int = 30) -> None:
        # Fail fast — every request made by this client needs the key.
        if not api_key:
            raise ValueError("FMP_API_KEY is required for earnings-call transcripts")
        self.s = requests.Session()
        self.timeout = timeout
        self.base = base
        self.api_key = api_key

    def _get(self, path: str, **params: Any) -> Any:
        """GET `{base}/{path}` with `params` plus `apikey`; raise on HTTP error, return parsed JSON."""
        query = {**params, "apikey": self.api_key}
        response = self.s.get(f"{self.base}/{path}", params=query, timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    # Confirmed against FMP 'stable' 2026-06-07 (v3 is legacy/403). Note singular "earning".
    def transcript_dates(self, symbol: str) -> Any:
        """Index of transcripts available for `symbol`: [{quarter, fiscalYear, date}, ...]."""
        endpoint = "earning-call-transcript-dates"
        return self._get(endpoint, symbol=symbol)

    def transcript(self, symbol: str, *, year: int, quarter: int) -> Any:
        """Fetch one transcript: [{symbol, period, year, date, content}].

        Treat the returned `date` as the document date — FMP's year/quarter labels are fiscal and
        can be offset from the call date.
        """
        selector = {"symbol": symbol, "year": year, "quarter": quarter}
        return self._get("earning-call-transcript", **selector)

    def earnings_calendar(self, *, from_date: str, to_date: str) -> Any:
        """Earnings calendar used as the ingestion trigger: [{symbol, date, epsActual, ...}, ...]."""
        # `from` is a Python keyword, so the window is passed as a dict, not as keyword arguments.
        window = {"from": from_date, "to": to_date}
        return self._get("earnings-calendar", **window)
def ingest_transcript(
    conn: sqlite3.Connection,
    *,
    source_id: str,
    symbol: str,
    year: int,
    quarter: int,
    content: str,
    date: str | None,
    data_dir: Path,
    prompt_version: str = "extract-v0",
) -> tuple[bool, bool]:
    """Store one transcript (content written to disk → transcript_path) and enqueue an 'extract'
    job. Idempotent. Returns (new_document, new_job).

    The transcript is written to `<data_dir>/transcripts/<symbol>-<year>Q<quarter>.txt` as UTF-8,
    and `content_hash` is the SHA-256 of exactly those bytes. If a document with the same doc_id
    already exists the INSERT is ignored and (False, False) is returned without enqueueing.
    """
    from ..backfill import queue

    external_id = f"{symbol}-{year}Q{quarter}"
    doc_id = f"earnings:{external_id}"
    tdir = Path(data_dir) / "transcripts"
    tdir.mkdir(parents=True, exist_ok=True)
    tpath = tdir / f"{external_id}.txt"
    # Encode once and use the SAME bytes for the file and the hash. write_text() without an
    # explicit encoding uses the locale encoding (and translates newlines on some platforms), so
    # the stored file could disagree with content_hash or fail on non-ASCII transcripts.
    payload = content.encode("utf-8")
    tpath.write_bytes(payload)
    content_hash = hashlib.sha256(payload).hexdigest()
    cur = conn.execute(
        """INSERT OR IGNORE INTO documents
           (doc_id, source_id, kind, external_id, title, date, transcript_path, content_hash, processed_at)
           VALUES (?,?,?,?,?,?,?,?, datetime('now'))""",
        (doc_id, source_id, "earnings_call", external_id, f"{symbol} {year} Q{quarter} call",
         date, str(tpath), content_hash),
    )
    conn.commit()
    if not cur.rowcount:
        # Row already existed → nothing new to extract.
        return (False, False)
    # earnings-call Q&A is the highest-yield text source (§4.1) → priority 40, ahead of filings (50).
    # The job hash includes prompt_version, so a new prompt version yields a distinct job identity.
    h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
    new_job = queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
                            parent_doc_id=doc_id, priority=40) is not None
    return (True, new_job)
def ingest_for_ticker(
    conn: sqlite3.Connection,
    fmp: FMPClient,
    *,
    source_id: str,
    symbol: str,
    data_dir: Path,
    since: str | None = None,
    until: str | None = None,
    limit: int = 8,
) -> tuple[int, int]:
    """List `symbol`'s transcripts from the dates index, keep those dated within [since, until],
    fetch at most `limit` of them and ingest each one that has content.

    Index entries with no `date` are kept (they cannot be filtered). The stored document date is
    the transcript's own `date`, falling back to the index date (FMP fiscal labels are offset).
    Returns (docs, jobs): counts of newly created documents and newly enqueued jobs.
    """
    listing = fmp.transcript_dates(symbol)
    if not isinstance(listing, list):
        listing = []

    def _in_window(entry) -> bool:
        when = entry.get("date")
        if since and when and when < since:
            return False
        if until and when and when > until:
            return False
        return True

    selected = [entry for entry in listing if _in_window(entry)]

    docs_created = 0
    jobs_created = 0
    for entry in selected[:limit]:
        fetched = fmp.transcript(symbol, year=entry["fiscalYear"], quarter=entry["quarter"])
        # The endpoint answers with a one-element list; tolerate a bare object or an empty reply.
        if isinstance(fetched, list) and fetched:
            item = fetched[0]
        else:
            item = fetched
        item = item or {}
        body = item.get("content") or ""
        if not body:
            continue
        made_doc, made_job = ingest_transcript(
            conn, source_id=source_id, symbol=symbol, year=entry["fiscalYear"],
            quarter=entry["quarter"], content=body,
            date=item.get("date") or entry.get("date"), data_dir=data_dir,
        )
        docs_created += int(made_doc)
        jobs_created += int(made_job)
    return docs_created, jobs_created
+148
View File
@@ -0,0 +1,148 @@
"""SEC EDGAR ingestion (§4.1).
Hits the official data.sec.gov / www.sec.gov APIs directly (free, keyless, full history).
Two hard requirements:
- a descriptive User-Agent (SEC 403s requests without one) — from config.edgar_user_agent.
- ≤10 requests/sec aggregate — enforced by a min-interval throttle here.
Supports an explicit date range AND historical shards (filings.files[]), so the §7.1 backtest can
reach 2022–2023 filings, not just the most-recent ~1000.
"""
from __future__ import annotations
import hashlib
import sqlite3
import time
from typing import Iterator
import requests
_FILING_COLS = ("accessionNumber", "form", "filingDate", "primaryDocument", "primaryDocDescription")


class EdgarClient:
    """Throttled HTTP client for the SEC submissions JSON and filing archive."""

    BASE_DATA = "https://data.sec.gov"
    BASE_WWW = "https://www.sec.gov"

    def __init__(self, user_agent: str, *, min_interval: float = 0.12) -> None:
        """`user_agent` must be non-empty and contain '@'; `min_interval` is the minimum number of
        seconds enforced between consecutive requests made through this client."""
        has_contact = bool(user_agent) and "@" in user_agent
        if not has_contact:
            raise ValueError("EDGAR requires a descriptive User-Agent with contact email (config.edgar_user_agent)")
        session = requests.Session()
        session.headers.update({"User-Agent": user_agent, "Accept-Encoding": "gzip, deflate"})
        self.s = session
        self.min_interval = min_interval
        self._last = 0.0  # time.monotonic() reading taken after the most recent throttle
        self._tickers: dict[str, int] | None = None  # ticker → CIK cache, filled lazily

    def _throttle(self) -> None:
        """Sleep just long enough that requests are spaced at least `min_interval` apart."""
        elapsed = time.monotonic() - self._last
        if elapsed < self.min_interval:
            time.sleep(self.min_interval - elapsed)
        self._last = time.monotonic()

    def _get(self, url: str) -> requests.Response:
        """Throttled GET with a 30 s timeout; raises on a non-2xx response."""
        self._throttle()
        response = self.s.get(url, timeout=30)
        response.raise_for_status()
        return response

    # ---- ticker → CIK ----
    def ticker_map(self) -> dict[str, int]:
        """Upper-cased ticker → integer CIK, downloaded once and cached on the instance."""
        if self._tickers is not None:
            return self._tickers
        payload = self._get(f"{self.BASE_WWW}/files/company_tickers.json").json()
        self._tickers = {row["ticker"].upper(): int(row["cik_str"]) for row in payload.values()}
        return self._tickers

    def cik_for(self, ticker: str) -> int | None:
        """CIK for `ticker` (case-insensitive), or None when the ticker is not in the map."""
        return self.ticker_map().get(ticker.upper())

    # ---- filings ----
    def _iter_array(self, block: dict, forms, since, until) -> Iterator[dict]:
        """Turn the column-oriented filings `block` into row dicts, keeping only rows whose form is
        in `forms` (when given) and whose filing date lies within [since, until]."""
        columns = (block.get(name, []) for name in _FILING_COLS)
        for accession, form, filed, primary_doc, description in zip(*columns):
            rejected = (
                (forms and form not in forms)
                or (since and filed < since)
                or (until and filed > until)
            )
            if rejected:
                continue
            yield {
                "accession": accession,
                "form": form,
                "filing_date": filed,
                "primary_document": primary_doc,
                "description": description,
            }

    def iter_filings(
        self,
        cik: int,
        *,
        forms: tuple[str, ...] = ("10-K", "10-Q", "8-K"),
        since: str | None = None,
        until: str | None = None,
    ) -> Iterator[dict]:
        """Yield filing descriptors (each with `cik` and `url` attached).

        Rows from the inline 'recent' block come first; then every historical shard whose
        filingFrom/filingTo range overlaps [since, until] is fetched and filtered the same way —
        required to reach the backtest era for active filers.
        """
        submissions = self._get(f"{self.BASE_DATA}/submissions/CIK{cik:010d}.json").json()
        recent_block = submissions.get("filings", {}).get("recent", {})
        yield from (self._with_url(cik, row)
                    for row in self._iter_array(recent_block, forms, since, until))

        for shard in submissions.get("filings", {}).get("files", []):
            # A shard lying wholly after `until` or wholly before `since` is never downloaded.
            if (until and shard.get("filingFrom", "") > until) or (
                since and shard.get("filingTo", "9999") < since
            ):
                continue
            shard_block = self._get(f"{self.BASE_DATA}/submissions/{shard['name']}").json()
            yield from (self._with_url(cik, row)
                        for row in self._iter_array(shard_block, forms, since, until))

    def _with_url(self, cik: int, f: dict) -> dict:
        """Add `cik` and the primary-document archive `url` to `f` in place and return it."""
        folder = f["accession"].replace("-", "")  # archive folder name is the accession sans dashes
        f.update(
            cik=cik,
            url=f"{self.BASE_WWW}/Archives/edgar/data/{cik}/{folder}/{f['primary_document']}",
        )
        return f

    def fetch_html(self, filing: dict) -> str:
        """Download the filing's primary document and return the response text."""
        response = self._get(filing["url"])
        return response.text
# Domestic annual/quarterly + foreign-private-issuer equivalents. 20-F (foreign annual, e.g. TSM/IREN),
# 40-F (Canadian annual, e.g. CCJ). 8-K/6-K (current reports) excluded by default — low claim yield.
HIGH_YIELD_FORMS = ("10-K", "10-Q", "20-F", "40-F")


def ingest_filings(
    conn: sqlite3.Connection,
    client: EdgarClient,
    *,
    source_id: str,
    ticker: str,
    since: str | None = None,
    until: str | None = None,
    forms: tuple[str, ...] = HIGH_YIELD_FORMS,
    prompt_version: str = "extract-v0",
) -> tuple[int, int]:
    """Register `ticker`'s filings as documents and queue an 'extract' job for each new one.

    Only the filing URL and metadata are stored here; no filing body is downloaded by this
    function. A filing whose INSERT is ignored (row already present) is skipped entirely, which
    makes re-runs idempotent. Returns (new_documents, new_jobs).

    Raises ValueError when `ticker` has no CIK.
    """
    from ..backfill import queue

    cik = client.cik_for(ticker)
    if cik is None:
        raise ValueError(f"No CIK found for ticker {ticker!r}")

    created_docs = 0
    created_jobs = 0
    for filing in client.iter_filings(cik, forms=forms, since=since, until=until):
        doc_id = f"edgar:{filing['accession']}"
        row = (
            doc_id, source_id, "filing", filing["accession"], filing["url"],
            f"{ticker} {filing['form']} {filing['filing_date']}", filing["filing_date"],
        )
        cursor = conn.execute(
            """INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date)
               VALUES (?,?,?,?,?,?,?)""",
            row,
        )
        conn.commit()
        if cursor.rowcount:
            created_docs += 1
            job_hash = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
            # priority 50: filings are high-info-density (§4.1) → ahead of podcasts (100)
            job_id = queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=job_hash,
                                   parent_doc_id=doc_id, priority=50)
            if job_id is not None:
                created_jobs += 1
    return created_docs, created_jobs
+65
View File
@@ -0,0 +1,65 @@
"""Podcast RSS ingestion (§4.1).
feedparser + conditional GET (ETag/Last-Modified) for efficient incremental polling, with a
composite (feed_url, guid) dedup discipline. Many podcast CDNs send no validators and some feeds
truncate to recent episodes — for the §7.1 backtest, older episodes may need the show's full
archive feed (some hosts expose `?limit=` / a separate archive URL) or a YouTube back-catalog.
"""
from __future__ import annotations
import hashlib
import time
from typing import Any
import feedparser
DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"


def fetch_feed(url: str, *, etag: str | None = None, modified: str | None = None,
               user_agent: str = DEFAULT_UA) -> feedparser.FeedParserDict:
    """Parse the feed at `url`, forwarding `etag`/`modified` so the request is a conditional GET.

    On HTTP 304 the result has .status == 304 and .entries == [] → the caller should skip it.
    """
    validators = {"etag": etag, "modified": modified}
    return feedparser.parse(url, agent=user_agent, **validators)
def _published_iso(entry: Any) -> str | None:
    """Format the entry's parsed publish time (or, failing that, its update time) as YYYY-MM-DD.

    Returns None when the entry carries neither `published_parsed` nor `updated_parsed`.
    """
    parsed = entry.get("published_parsed") or entry.get("updated_parsed")
    return time.strftime("%Y-%m-%d", parsed) if parsed else None
def _enclosure_audio_url(entry: Any) -> str | None:
    """Return the first usable audio URL attached to a feed entry, or None if there is none.

    Looks at `enclosures` whose `type` starts with "audio" (taking `href`, else `url`), then at
    `links` with rel="enclosure" and an audio type. An audio-typed item that carries no URL is
    skipped rather than ending the search, so a later enclosure or link can still supply one.
    """
    for enc in entry.get("enclosures", []) or []:
        if str(enc.get("type", "")).startswith("audio"):
            href = enc.get("href") or enc.get("url")
            if href:
                return href
    # some feeds put audio only in links rel=enclosure
    for link in entry.get("links", []) or []:
        if link.get("rel") == "enclosure" and str(link.get("type", "")).startswith("audio"):
            href = link.get("href")
            if href:
                return href
    return None
def _guid(entry: Any) -> str:
    """Stable identifier for a feed entry.

    Uses the entry's `id`, else its `link`; when both are missing, falls back to
    "sha1:" + SHA-1 of "<title>|<published>" so the same entry still maps to the same key.
    """
    explicit = entry.get("id") or entry.get("link")
    if not explicit:
        title, published = entry.get("title", ""), entry.get("published", "")
        digest = hashlib.sha1(f"{title}|{published}".encode()).hexdigest()
        return "sha1:" + digest
    return str(explicit)
def episode_records(parsed: feedparser.FeedParserDict) -> list[dict]:
    """Convert parsed feed entries into episode dicts (guid, title, audio_url, link, published).

    Entries for which no audio enclosure URL can be found are left out.
    """
    records: list[dict] = []
    for entry in parsed.entries:
        audio_url = _enclosure_audio_url(entry)
        if audio_url:
            records.append(dict(
                guid=_guid(entry),
                title=entry.get("title"),
                audio_url=audio_url,
                link=entry.get("link"),
                published=_published_iso(entry),
            ))
    return records
+195
View File
@@ -0,0 +1,195 @@
"""One-time backfill path: transcribe podcast episodes via the Gemini multimodal API instead of the
local Spark Parakeet+diarizer pipeline. Used to take a bulk backfill OFF the shared Spark GPU (which
contends with production) — it is NOT the steady-state transcriber (local Parakeet remains the default).
Scope/guardrail: podcast audio is PUBLIC data, so sending it to the frontier does NOT trip the
exposure/positioning-data rule (that guardrail is about Ten31's conviction/exposure data, never public
audio). Output is written in the SAME 'Speaker: text' transcript format the extractor consumes, so the
downstream extract→embed stages are agnostic to which transcriber produced the file.
Tradeoff vs local: Gemini yields speaker-LABELED text, not voiceprint fingerprints — so no voiceprint
auto-edges. We rely on the hand-seeded EISC edges + name-based attribution instead (acceptable for a
bounded backfill).
"""
from __future__ import annotations
import hashlib
import logging
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from ..backfill import queue
from .download import download_enclosure
log = logging.getLogger(__name__)
# Instruction text sent to the model alongside each uploaded audio file. It is a str.format template:
# `{host}` is the only placeholder, so any literal brace added here later must be doubled.
# The output shape it asks for (`Name: spoken text`, one line per turn, nothing else) is the
# 'Speaker: text' transcript format described in the module docstring.
_PROMPT = (
    "You are a precise podcast transcriptionist. Transcribe this audio VERBATIM as a speaker-diarized "
    "transcript.\n"
    "RULES:\n"
    "- One line per speaker turn, formatted exactly as `Name: spoken text` (a colon and one space).\n"
    "- The host of this show is {host} — label every host turn with exactly `{host}` (the person's "
    "name, never the show's name).\n"
    "- When the host introduces a guest by name (e.g. 'welcome X to the show', 'I'm joined by X'), use "
    "that real first name (or full name) as the guest's label for the WHOLE transcript. Only fall back "
    "to `Guest` (or `Guest 2`, `Guest 3`) if a name is never stated. Do not invent names.\n"
    "- Do NOT include timestamps, ad-reads markers, summaries, headings, markdown, or any commentary. "
    "Only the transcript lines.\n"
    "- Transcribe the entire episode from start to finish. Do not stop early or summarize.\n"
)
def _host_person(source_name: str) -> str:
    """Guess the host's personal name from a show/source name, so claims aren't attributed to the show.

    A parenthesised part wins: 'What Bitcoin Did (Peter McCormack)' -> 'Peter McCormack'.
    Otherwise a leading 'The ' and a trailing 'Podcast'/'Show' (any case) are dropped:
    'Stephan Livera Podcast' -> 'Stephan Livera'; 'The Kevin Rooke Show' -> 'Kevin Rooke'.
    A missing/empty name gives ''.
    """
    name = source_name or ""
    in_parens = re.search(r"\(([^)]+)\)", name)
    if in_parens is not None:
        return in_parens.group(1).strip()
    without_article = re.sub(r"^The\s+", "", name).strip()
    return re.sub(r"\s+(Podcast|Show)$", "", without_article, flags=re.I).strip()
def _sniff_audio_mime(path: Path) -> str:
    """Guess the audio MIME type from the first bytes of the file.

    The downloaded enclosure has a generic `.src` extension, so the Files API can't infer the type
    and rejects the upload without an explicit mime_type. Anything unrecognised is reported as
    "audio/mpeg".
    """
    with open(path, "rb") as fh:
        head = fh.read(16)

    # MP3: either an ID3 tag or a frame-sync pattern (0xFF followed by a byte with the top 3 bits set).
    has_id3_tag = head.startswith(b"ID3")
    has_frame_sync = len(head) > 1 and head[0] == 0xFF and (head[1] & 0xE0) == 0xE0
    if has_id3_tag or has_frame_sync:
        return "audio/mpeg"
    if head[4:8] == b"ftyp":
        return "audio/mp4"  # m4a/aac
    by_magic = {b"OggS": "audio/ogg", b"RIFF": "audio/wav", b"fLaC": "audio/flac"}
    return by_magic.get(head[:4], "audio/mpeg")  # podcast default
def _upload_and_wait(client, audio_path: Path, *, poll_s: float = 2.0, timeout_s: float = 300.0):
    """Upload `audio_path` through `client.files` and poll until it leaves the PROCESSING state.

    The file is re-fetched every `poll_s` seconds until roughly `timeout_s` seconds of polling have
    accumulated. Returns the file handle once its state is ACTIVE; raises RuntimeError for any other
    final state (including still PROCESSING when the time budget is spent).
    """
    from google.genai import types

    def _state_name(handle) -> str:
        # state may be an enum-like object (use its .name) or something only str() can describe
        return getattr(handle.state, "name", str(handle.state))

    mime = _sniff_audio_mime(audio_path)
    upload_cfg = types.UploadFileConfig(mime_type=mime)
    handle = client.files.upload(file=str(audio_path), config=upload_cfg)

    elapsed = 0.0
    while _state_name(handle) == "PROCESSING" and elapsed < timeout_s:
        time.sleep(poll_s)
        elapsed += poll_s
        handle = client.files.get(name=handle.name)

    final_state = _state_name(handle)
    if final_state != "ACTIVE":
        raise RuntimeError(f"Gemini file not ACTIVE (state={final_state}) for {audio_path.name}")
    return handle
def transcribe_one(client, model: str, audio_path: Path, host_name: str, *,
                   max_output_tokens: int = 65536) -> tuple[str, dict]:
    """Upload one audio file, ask `model` for a transcript, and return (transcript_text, usage).

    `usage` holds prompt_tokens, output_tokens and finish_reason (as a string; "" when the response
    has no candidates). The uploaded file is always deleted afterwards on a best-effort basis.
    Network/CPU only; no DB.
    """
    from google.genai import types

    uploaded = _upload_and_wait(client, audio_path)
    try:
        prompt = _PROMPT.format(host=host_name or "the host")
        gen_cfg = types.GenerateContentConfig(temperature=0, max_output_tokens=max_output_tokens)
        resp = client.models.generate_content(model=model, contents=[uploaded, prompt], config=gen_cfg)
        transcript = (resp.text or "").strip()
        meta = getattr(resp, "usage_metadata", None)
        if resp.candidates:
            finish = str(getattr(resp.candidates[0], "finish_reason", ""))
        else:
            finish = ""
        usage = {
            "prompt_tokens": getattr(meta, "prompt_token_count", 0) or 0,
            "output_tokens": getattr(meta, "candidates_token_count", 0) or 0,
            "finish_reason": finish,
        }
        return transcript, usage
    finally:
        # Remove the remote copy whether or not generation succeeded.
        try:
            client.files.delete(name=uploaded.name)
        except Exception as e:  # noqa: BLE001 — best-effort cleanup
            log.debug("file cleanup failed for %s: %s", uploaded.name, e)
def _fetch_and_transcribe(client, model: str, cfg, doc, host_name: str) -> dict:
    """Worker-thread unit: download enclosure → Gemini transcribe → write transcript file. No DB writes.

    Returns a dict with doc_id, ok=True, transcript_path, n_lines, content_hash and usage.
    `content_hash` is the SHA-256 of the exact UTF-8 bytes written to the transcript file.
    Raises RuntimeError when the transcript is empty or shorter than 40 characters; any download
    or API error propagates. The cached audio is removed in every case.
    """
    cache = Path(cfg.audio_cache_dir)
    cache.mkdir(parents=True, exist_ok=True)
    safe = doc["doc_id"].replace(":", "_")  # ':' is not safe in file names on every platform
    src = cache / f"{safe}.src"
    audio = src
    try:
        # Inside the try so that a download which fails part-way is cleaned up as well.
        audio = download_enclosure(doc["url"], src)
        text, usage = transcribe_one(client, model, audio, host_name)
        if not text or len(text) < 40:
            raise RuntimeError(f"empty/short transcript ({len(text)} chars)")
        tpath = Path(cfg.data_dir) / "transcripts" / f"{safe}.txt"
        tpath.parent.mkdir(parents=True, exist_ok=True)
        # Write the same bytes that are hashed: write_text() without an encoding would use the
        # locale encoding and could fail on, or re-encode, non-ASCII names in the transcript.
        payload = text.encode("utf-8")
        tpath.write_bytes(payload)
        return {
            "doc_id": doc["doc_id"], "ok": True, "transcript_path": str(tpath),
            "n_lines": text.count("\n") + 1, "content_hash": hashlib.sha256(payload).hexdigest(),
            "usage": usage,
        }
    finally:
        # Best-effort removal of the cached audio; `src` is included in case the download raised
        # before returning and left a partial file behind.
        for leftover in {src, Path(audio)}:
            try:
                leftover.unlink()
            except FileNotFoundError:
                pass
            except OSError as e:
                log.debug("audio cleanup failed for %s: %s", leftover, e)
def run_transcribe_gemini(conn, cfg, *, limit: int = 5, concurrency: int = 4,
                          lease_seconds: int = 7200, worker_id: str = "gemini-transcribe") -> dict:
    """Lease pending transcribe jobs and transcribe them via Gemini in parallel. DB writes stay on the
    main thread; only download+API run in the pool. Reports token usage for cost accounting.

    Up to `limit` jobs of type "transcribe" are leased (each for `lease_seconds`) before any work
    starts; a job whose document row is missing is skipped via queue.skip and does not count
    toward `limit`. Up to `concurrency` documents are processed at once.

    For each finished document: on success the documents row is updated, an 'extract' job is
    enqueued and the transcribe job is completed; on any exception the job is handed to queue.fail.

    Returns {"done", "failed", "prompt_tokens", "output_tokens"}.
    Raises RuntimeError when cfg.gemini_api_key is not set.
    """
    from google import genai

    if not cfg.gemini_api_key:
        raise RuntimeError("GEMINI_API_KEY not configured")
    client = genai.Client(api_key=cfg.gemini_api_key)
    model = cfg.gemini_model or "gemini-2.5-flash"  # fallback when no model is configured

    # Lease the batch up front (main thread); resolve docs + host names.
    leased: list[tuple] = []
    while len(leased) < limit:
        job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
        if job is None:
            break  # nothing left to lease
        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
        if doc is None:
            queue.skip(conn, job["job_id"], "document missing")
            continue
        host = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
        # Host label is derived from the source's name; "" when the source row is missing.
        leased.append((job, doc, _host_person(host["name"]) if host else ""))

    done = failed = prompt_tok = out_tok = 0
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        # Map each future back to its (job, doc) so the result can be recorded on this thread.
        futs = {pool.submit(_fetch_and_transcribe, client, model, cfg, doc, host): (job, doc)
                for (job, doc, host) in leased}
        for fut in as_completed(futs):
            job, doc = futs[fut]
            try:
                r = fut.result()  # re-raises whatever the worker raised
                conn.execute(
                    "UPDATE documents SET transcript_path=?, content_hash=?, processed_at=datetime('now') "
                    "WHERE doc_id=?", (r["transcript_path"], r["content_hash"], doc["doc_id"]),
                )
                # NOTE(review): the extract prompt version is hard-coded as "extract-v0" here.
                h = hashlib.sha256(f"{doc['doc_id']}|extract-v0".encode()).hexdigest()
                queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
                              parent_doc_id=doc["doc_id"], priority=100)
                queue.complete(conn, job["job_id"], output_ref=f"gemini {r['n_lines']} lines")
                conn.commit()
                done += 1
                prompt_tok += r["usage"]["prompt_tokens"]
                out_tok += r["usage"]["output_tokens"]
                fr = r["usage"]["finish_reason"]
                # A MAX_TOKENS finish is only flagged in the log line; the job is already completed above.
                log.info("gemini transcribed %s (%d lines, %d in/%d out tok%s)", doc["doc_id"],
                         r["n_lines"], r["usage"]["prompt_tokens"], r["usage"]["output_tokens"],
                         ", TRUNCATED" if "MAX_TOKENS" in fr else "")
            except Exception as e:  # noqa: BLE001
                # Covers worker failures and errors from the bookkeeping above.
                state = queue.fail(conn, job["job_id"], e)
                conn.commit()
                failed += 1
                log.warning("gemini transcribe failed for %s: %s (→ %s)", doc["doc_id"], e, state)
    return {"done": done, "failed": failed, "prompt_tokens": prompt_tok, "output_tokens": out_tok}
+45
View File
@@ -0,0 +1,45 @@
"""Speaker-name identification (§4.5 enhancement).
In a 1-on-1 interview the host introduces the guest by name at the top. Reading the transcript head
with the LLM, we attach a real NAME to each diarized speaker → voiceprints.person_label. This gives
the independence graph a SECOND, orthogonal overlap signal: the same NAMED guest across two shows is
a shared_guest edge even when the voiceprints don't cluster (different mic/codec/room). It complements
voiceprint cosine matching and is robust to fingerprint drift — exactly the case the operator flagged.
"""
from __future__ import annotations
import json
import logging
log = logging.getLogger(__name__)
_SYS = (
    'You identify the speakers in a podcast/interview transcript. Each line is "LABEL: text". '
    "Using the introduction and context, determine each LABEL's real full name and role. In an "
    "interview the host normally introduces themselves and the guest within the first minute. Only "
    "assert a name you can actually support from the text — if you cannot tell, use null. "
    'Return ONLY JSON: {"speakers": {"<LABEL>": {"name": "Full Name" or null, '
    '"role": "host"|"guest"|"panelist"|"unknown", "confidence": "low"|"med"|"high"}}}.'
)


def identify_speakers(backend, transcript_head: str, *, source_name: str, host_hint: str | None = None) -> dict:
    """Ask `backend` to name the speakers in the opening of a transcript.

    Returns {label: {name, role, confidence}} taken from the reply's "speakers" object. Returns {}
    when the reply is not parseable JSON (even after trimming to the outermost braces) or when
    "speakers" is absent or not an object. `backend` is any extract.backends backend.
    """
    pieces = [f"Show: {source_name}."]
    if host_hint:
        pieces.append(f" The show's usual host is {host_hint}.")
    pieces.append("\n\nTRANSCRIPT (beginning):\n" + transcript_head)
    conversation = [
        {"role": "system", "content": _SYS},
        {"role": "user", "content": "".join(pieces)},
    ]
    raw = backend.complete_json(conversation, max_tokens=600)

    def _decode(text):
        """Parse `text` as JSON; on failure retry on the outermost {...} span. None if both fail."""
        try:
            return json.loads(text)
        except Exception:
            pass
        start, stop = text.find("{"), text.rfind("}")
        if start < 0 or stop < 0:
            return None
        try:
            return json.loads(text[start:stop + 1])
        except Exception:
            return None

    decoded = _decode(raw)
    speakers = decoded.get("speakers", {}) if isinstance(decoded, dict) else {}
    return speakers if isinstance(speakers, dict) else {}
+111
View File
@@ -0,0 +1,111 @@
"""Podcast ingestion → documents + 'transcribe' jobs (§4.1).
RSS path: parse the feed, take episodes in [since, until], register documents pointing at the audio
enclosure. YouTube path: enumerate a channel's videos in the date window via yt-dlp (the back-catalog
route for the ~9 shows whose RSS is a truncated rolling window — see seeds/podcast_feeds.resolved.yaml).
The transcribe worker downloads + processes either kind identically.
"""
from __future__ import annotations
import hashlib
import json
import logging
import sqlite3
import subprocess
from ..backfill import queue
from ..util import audio_dedup_key
from .feeds import episode_records, fetch_feed
log = logging.getLogger(__name__)
def _enqueue_doc(conn, *, source_id, kind, external_id, url, title, date) -> tuple[int, int]:
    """Register one episode as a document and, unless it duplicates processed content, enqueue a
    'transcribe' job for it.

    Returns (new_docs, new_jobs):
      (0, 0) — the INSERT was ignored, i.e. the row already exists;
      (1, 0) — inserted, but a processed document with the same dedup key exists, so the new row is
               marked processed with raw_path "dup_of:<doc_id>" and nothing is enqueued;
      (1, 1) / (1, 0) — inserted and the job was / was not newly enqueued.
    """
    # doc_id = "pod:<source_id>:<first 12 hex chars of sha1(external_id)>"
    doc_id = f"pod:{source_id}:{hashlib.sha1(external_id.encode()).hexdigest()[:12]}"
    dkey = audio_dedup_key(title, date)
    # Cross-mirror dedup (pre-GPU): if this same episode was already processed (any source/feed),
    # record the sighting for provenance but DON'T re-transcribe. (external_id UNIQUE already covers
    # same-feed re-ingest; this covers the same episode via a different feed/YouTube mirror.)
    # The lookup runs BEFORE the insert and only matches rows that already have processed_at set.
    dup = conn.execute(
        "SELECT doc_id FROM documents WHERE dedup_key=? AND processed_at IS NOT NULL LIMIT 1", (dkey,)
    ).fetchone()
    cur = conn.execute(
        """INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date, dedup_key)
           VALUES (?,?,?,?,?,?,?,?)""",
        (doc_id, source_id, kind, external_id, url, title, date, dkey),
    )
    conn.commit()
    if not cur.rowcount:
        return (0, 0)  # same (source_id, external_id) already known
    if dup:
        # presumably conn uses a mapping-style row factory (dup["doc_id"]) — confirm at connection setup
        conn.execute(
            "UPDATE documents SET processed_at=datetime('now'), raw_path=? WHERE doc_id=?",
            (f"dup_of:{dup['doc_id']}", doc_id),
        )
        conn.commit()
        log.info("skip transcribe for %s — duplicate content of %s", doc_id, dup["doc_id"])
        return (1, 0)
    # Job identity is derived from the doc id plus the fixed tag "audio-v0".
    h = hashlib.sha256(f"{doc_id}|audio-v0".encode()).hexdigest()
    job = queue.enqueue(conn, job_type="transcribe", target_id=doc_id, input_hash=h,
                        parent_doc_id=doc_id, priority=100)
    return (1, 1 if job is not None else 0)
def ingest_rss(conn: sqlite3.Connection, source: sqlite3.Row, *, since=None, until=None, limit=20):
    """Register up to `limit` episodes from the source's RSS feed as documents + transcribe jobs.

    Episodes are taken in feed order. One whose publish date falls outside [since, until] is passed
    over without using up the limit; an episode with no publish date is never filtered out.
    Returns (new_docs, new_jobs). Raises ValueError when the source has no rss_url.
    """
    feed_url = source["rss_url"]
    if not feed_url:
        raise ValueError(f"{source['source_id']} has no rss_url")

    episodes = episode_records(fetch_feed(feed_url))
    docs_created = 0
    jobs_created = 0
    taken = 0
    for episode in episodes:
        published = episode["published"]
        outside_window = (since and published and published < since) or (
            until and published and published > until
        )
        if outside_window:
            continue
        if taken >= limit:
            break
        taken += 1
        made_docs, made_jobs = _enqueue_doc(
            conn, source_id=source["source_id"], kind="podcast",
            external_id=episode["guid"], url=episode["audio_url"],
            title=episode["title"], date=published,
        )
        docs_created += made_docs
        jobs_created += made_jobs
    return docs_created, jobs_created
def ingest_youtube(conn: sqlite3.Connection, source: sqlite3.Row, *, since=None, until=None,
                   limit=20, max_scan=800):
    """List a channel's (or playlist's) videos with yt-dlp and register up to `limit` of them as
    documents + transcribe jobs. Nothing is downloaded here (`--skip-download`).

    yt-dlp runs NON-flat so upload_date is populated (flat mode returns NA). Videos come
    newest-first; --dateafter/--datebefore select the window and --break-match-filters STOPS the
    scan at the first video older than `since`, so the whole channel history is not walked.
    At most `max_scan` playlist items are examined. yt-dlp's exit status is not checked; only the
    lines it printed are parsed. Returns (new_docs, new_jobs). Raises ValueError when the source
    has no channel_url.
    """
    channel = source["channel_url"]
    if not channel:
        raise ValueError(f"{source['source_id']} has no channel_url")

    listing_url = channel.rstrip("/")
    already_a_listing = "/playlist" in listing_url or listing_url.endswith("/videos")
    if not already_a_listing:
        listing_url = listing_url + "/videos"

    window_args = []
    if since:
        floor = since.replace("-", "")
        window_args += ["--dateafter", floor, "--break-match-filters", f"upload_date>={floor}"]
    if until:
        window_args += ["--datebefore", until.replace("-", "")]

    cmd = [
        "yt-dlp", "--no-warnings", "--ignore-errors", "--skip-download",
        "--print", "%(id)s\t%(upload_date)s\t%(title)s", "--playlist-end", str(max_scan),
        *window_args,
        listing_url,
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=900)

    docs_created = 0
    jobs_created = 0
    taken = 0
    for line in proc.stdout.splitlines():
        fields = line.split("\t")
        if len(fields) < 2:
            continue
        video_id, upload_date = fields[0], fields[1]
        if not video_id or upload_date in ("NA", ""):
            continue  # no id or no usable upload date
        title = fields[2] if len(fields) > 2 else video_id
        if len(upload_date) == 8:
            date = f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}"  # YYYYMMDD → YYYY-MM-DD
        else:
            date = None
        if taken >= limit:
            break
        taken += 1
        made_docs, made_jobs = _enqueue_doc(
            conn, source_id=source["source_id"], kind="youtube",
            external_id=video_id, url=f"https://www.youtube.com/watch?v={video_id}",
            title=title, date=date,
        )
        docs_created += made_docs
        jobs_created += made_jobs
    return docs_created, jobs_created
+60
View File
@@ -0,0 +1,60 @@
"""Cross-chunk speaker stitching + the voiceprint library (§4.1, §4.5).
diarize-chunk returns a 192-d TitaNet voiceprint per speaker per chunk. Because each chunk is
diarized independently, "Speaker 1" in chunk 3 is not the same label as "Speaker 1" in chunk 7 —
we re-cluster by cosine similarity (~0.7 distance threshold) so one person gets one identity across
the whole episode. The SAME library then matches a guest ACROSS shows by voice (the independence
graph's hardest edge, §4.5).
"""
from __future__ import annotations
import numpy as np
DISTANCE_THRESHOLD = 0.7  # cosine DISTANCE (1 - cosine similarity); §4.1


def _unit(v: np.ndarray) -> np.ndarray:
    """Scale `v` to unit length; a zero-norm vector is returned unchanged (no division by zero)."""
    norm = np.linalg.norm(v)
    if norm:
        return v / norm
    return v


def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
    """1 − cosine similarity of `a` and `b`, computed in float64 and returned as a Python float.

    If either vector has zero norm the dot product is 0, so the distance is 1.0.
    """
    unit_a = _unit(np.asarray(a, dtype=float))
    unit_b = _unit(np.asarray(b, dtype=float))
    return float(1.0 - np.dot(unit_a, unit_b))
def stitch_chunks(chunk_voiceprints: list[np.ndarray], *, threshold: float = DISTANCE_THRESHOLD) -> list[int]:
    """Assign each voiceprint a cluster id in a single greedy pass.

    Input: a flat list of voiceprint vectors (one per chunk-speaker, in encounter order).
    Output: a parallel list of cluster ids, numbered from 0 in order of first appearance.

    Each vector is compared with the running-mean centroid of every existing cluster. It joins the
    nearest one when that cosine distance is < threshold (and the centroid is updated); otherwise it
    opens a new cluster. Results depend on input order.
    """
    centroids: list[np.ndarray] = []
    counts: list[int] = []
    labels: list[int] = []
    for raw in chunk_voiceprints:
        vec = np.asarray(raw, dtype=float)
        home = None
        if centroids:
            dists = [cosine_distance(vec, centre) for centre in centroids]
            nearest = int(np.argmin(dists))
            if dists[nearest] < threshold:
                home = nearest
        if home is None:
            # Nothing close enough (or nothing seen yet): this vector seeds a new cluster.
            centroids.append(vec.copy())
            counts.append(1)
            labels.append(len(centroids) - 1)
        else:
            # Fold the vector into the running mean of the cluster it joined.
            centroids[home] = (centroids[home] * counts[home] + vec) / (counts[home] + 1)
            counts[home] += 1
            labels.append(home)
    return labels
def match_library(vp: np.ndarray, library: list[tuple[str, np.ndarray]], *,
                  threshold: float = DISTANCE_THRESHOLD) -> str | None:
    """Find the library voiceprint nearest to `vp`.

    Returns the voiceprint_id of the entry with the smallest cosine distance strictly below
    `threshold` (the earliest entry on ties), or None when no entry is that close — in which case
    the caller treats `vp` as a new speaker and mints a new library id.
    """
    scored = ((cosine_distance(vp, lib_vec), vid) for vid, lib_vec in library)
    close_enough = [(dist, vid) for dist, vid in scored if dist < threshold]
    if not close_enough:
        return None
    # min() keeps the first of equally distant entries
    _, best_id = min(close_enough, key=lambda pair: pair[0])
    return best_id
+308
View File
@@ -0,0 +1,308 @@
"""Audio → speaker-attributed transcript + voiceprint library (§4.1, §4.5).
Per chunk (sequential — audio lock): diarize-chunk (192-d TitaNet fingerprints + timed speaker
segments) + transcribe (word timestamps). Align words to speakers by time, stitch speakers ACROSS
chunks by fingerprint cosine, then match the persisted voiceprint library so the SAME guest is
recognized ACROSS shows by voice — the highest-leverage input to the source-independence graph.
"""
from __future__ import annotations
import logging
import time
import uuid
from pathlib import Path
import numpy as np
from ..backfill import queue
from .chunker import chunk_audio
from .download import download_enclosure, download_youtube_audio, to_wav_16k_mono
from .speaker_stitch import DISTANCE_THRESHOLD, match_library, stitch_chunks
log = logging.getLogger(__name__)
# ---------- alignment ----------
def _speaker_at(segments: list[dict], t: float) -> str:
    """Speaker label for time `t` (seconds).

    The first segment whose [start_s, end_s] contains `t` wins. If none contains it, the segment
    with the nearest boundary is used (the earliest on ties). With no segments at all the label is
    "Speaker_0".
    """
    if not segments:
        return "Speaker_0"
    covering = next((seg for seg in segments if seg["start_s"] <= t <= seg["end_s"]), None)
    if covering is not None:
        return covering["speaker"]

    def _gap(seg: dict) -> float:
        return min(abs(seg["start_s"] - t), abs(seg["end_s"] - t))

    return min(segments, key=_gap)["speaker"]
def align_words(words: list[dict], segments: list[dict]) -> list[dict]:
    """Merge word-level transcription into speaker turns.

    Each word is attributed to the speaker found at its temporal midpoint. Consecutive words with
    the same speaker are joined with single spaces into one turn {speaker, start, end, text}, where
    start is the first word's start and end is the last word's end.
    """
    turns: list[dict] = []
    for word in words:
        midpoint = (word["start"] + word["end"]) / 2
        speaker = _speaker_at(segments, midpoint)
        if turns and turns[-1]["speaker"] == speaker:
            # same speaker keeps talking → extend the open turn
            open_turn = turns[-1]
            open_turn["text"] += " " + word["text"]
            open_turn["end"] = word["end"]
            continue
        turns.append({"speaker": speaker, "start": word["start"], "end": word["end"], "text": word["text"]})
    return turns
# ---------- per-document audio processing ----------
def diarize_transcribe_chunks(sc, chunks: list[Path], *, concurrency: int = 2):
    """Returns (chunk_turns, chunk_speakers): turns per chunk + (chunk_idx, local_spk, fingerprint).

    Drives up to `concurrency` chunks in flight (the original design note says the client's global
    audio semaphore is the hard cap across both parakeet endpoints). Results are reassembled in
    chunk order regardless of completion order.

    A single chunk's failure is non-fatal: it is logged and skipped. The whole job raises
    RuntimeError, so it can be retried later, when either
      * no chunk succeeded at all, or
      * at least max(3, len(chunks) // 2) chunks failed.
    The first rule matters for episodes of one or two chunks, where the count rule alone can never
    trigger and a fully failed episode would otherwise come back as an empty transcript.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    def _one(idx: int, ch: Path):
        # One chunk: diarize, transcribe, then align the words to the diarized speaker segments.
        dia = sc.diarize_chunk(str(ch))
        tr = sc.transcribe(str(ch))
        turns = align_words(tr.get("words", []), dia.get("segments", []))
        spks = [(idx, spk, np.asarray(vec, dtype=np.float32))
                for spk, vec in (dia.get("fingerprints") or {}).items()]
        return idx, turns, spks

    results: dict[int, tuple] = {}
    failed = 0
    with ThreadPoolExecutor(max_workers=max(1, concurrency)) as pool:
        futs = {pool.submit(_one, i, ch): i for i, ch in enumerate(chunks)}
        for fut in as_completed(futs):
            try:
                idx, turns, spks = fut.result()
                results[idx] = (turns, spks)
            except Exception as e:  # noqa: BLE001 — one contended chunk shouldn't kill the episode
                failed += 1
                log.warning("chunk %d/%d failed (%s) — skipping", futs[fut], len(chunks), str(e)[:90])
    # `not results` catches total failure on short episodes, which the count threshold misses.
    if chunks and (not results or failed >= max(3, len(chunks) // 2)):
        raise RuntimeError(f"{failed}/{len(chunks)} chunks failed — backend contended; will retry later")
    chunk_turns = [(idx, results[idx][0]) for idx in sorted(results)]
    chunk_speakers = [s for idx in sorted(results) for s in results[idx][1]]
    return chunk_turns, chunk_speakers
def stitch_and_centroids(chunk_speakers, *, threshold: float = DISTANCE_THRESHOLD):
    """Group every (chunk, local speaker) fingerprint into within-episode global speakers.

    Returns ``(keymap, centroids)``: ``keymap`` maps ``(chunk_idx, local_spk)`` to the cluster label
    assigned by ``stitch_chunks``; ``centroids`` maps each label to the mean of its member vectors.
    Both are empty dicts when there are no fingerprints.
    """
    if not chunk_speakers:
        return {}, {}

    fingerprints = [vec for _idx, _spk, vec in chunk_speakers]
    assigned = stitch_chunks(fingerprints, threshold=threshold)

    keymap: dict[tuple[int, str], int] = {}
    members: dict[int, list[np.ndarray]] = {}
    for (chunk_idx, local_spk, fingerprint), label in zip(chunk_speakers, assigned):
        keymap[(chunk_idx, local_spk)] = label
        if label not in members:
            members[label] = []
        members[label].append(fingerprint)

    centroids = {label: np.mean(vectors, axis=0) for label, vectors in members.items()}
    return keymap, centroids
def _load_library(conn) -> list[tuple[str, np.ndarray]]:
    """Read every persisted voiceprint as ``(voiceprint_id, float32 vector)`` pairs.

    Vectors are decoded from the stored blob with ``np.frombuffer``; rows are accessed by column
    name, so the connection must use a name-addressable row factory.
    """
    library: list[tuple[str, np.ndarray]] = []
    cursor = conn.execute("SELECT voiceprint_id, vector, person_label FROM voiceprints")
    for row in cursor.fetchall():
        vector = np.frombuffer(row["vector"], dtype=np.float32)
        library.append((row["voiceprint_id"], vector))
    return library
def _label_for(conn, vpid: str) -> str:
    """Return the display label for a voiceprint.

    Uses the stored ``person_label`` when one is set; otherwise falls back to ``SPK:`` plus the
    first eight characters of the voiceprint id.
    """
    row = conn.execute(
        "SELECT person_label FROM voiceprints WHERE voiceprint_id=?", (vpid,)
    ).fetchone()
    if row and row["person_label"]:
        return row["person_label"]
    return f"SPK:{vpid[:8]}"
def resolve_voiceprints(conn, doc, centroids: dict[int, np.ndarray], *, threshold: float = DISTANCE_THRESHOLD):
    """Map each within-episode speaker cluster to a persisted voiceprint id.

    For every centroid, reuse the library voiceprint returned by ``match_library`` or mint a new
    ``vp_…`` row; an observation row is recorded for this document either way. Afterwards, for each
    resolved voiceprint that is also observed in a document from a DIFFERENT source, a
    ``shared_guest`` edge between the two sources is inserted or its weight incremented (§4.5).

    Returns ``{cluster_label: voiceprint_id}``.
    """
    known = _load_library(conn)
    assignments: dict[int, str] = {}

    for cluster, centroid in centroids.items():
        vpid = match_library(centroid, known, threshold=threshold)
        if vpid is None:
            # No library match: mint a fresh voiceprint seeded with this episode's centroid.
            vpid = "vp_" + uuid.uuid4().hex[:16]
            conn.execute(
                "INSERT INTO voiceprints (voiceprint_id, vector, first_doc_id) VALUES (?,?,?)",
                (vpid, centroid.astype(np.float32).tobytes(), doc["doc_id"]),
            )
            # Make it matchable by later clusters of the same episode.
            known.append((vpid, centroid))
        conn.execute(
            "INSERT INTO voiceprint_observations (voiceprint_id, doc_id, chunk_idx) VALUES (?,?,?)",
            (vpid, doc["doc_id"], None),
        )
        assignments[cluster] = vpid
    conn.commit()

    # independence graph (§4.5): if this voice appears in a DIFFERENT source, that's a shared guest.
    for vpid in set(assignments.values()):
        elsewhere = conn.execute(
            """SELECT DISTINCT d.source_id FROM voiceprint_observations o
               JOIN documents d ON d.doc_id = o.doc_id
               WHERE o.voiceprint_id=? AND d.source_id != ?""",
            (vpid, doc["source_id"]),
        ).fetchall()
        for other in elsewhere:
            # Edges are stored with the endpoint pair in sorted order.
            src_a, src_b = sorted([doc["source_id"], other["source_id"]])
            conn.execute(
                """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
                   VALUES (?,?,'shared_guest',1.0,?)
                   ON CONFLICT(src_a, src_b, edge_type)
                   DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence""",
                (src_a, src_b, vpid),
            )
    conn.commit()
    return assignments
def _labeled(chunk_turns, keymap, label_by_cluster: dict) -> str:
    """Render turns as ``"<label>: <text>"`` lines joined by newlines.

    Each turn's chunk-local speaker is mapped through ``keymap`` to a cluster, then through
    ``label_by_cluster`` to a display label; a turn without a mapping keeps its local speaker tag.
    """
    def _render(chunk_idx, turn) -> str:
        cluster = keymap.get((chunk_idx, turn["speaker"]))
        shown = label_by_cluster.get(cluster, turn["speaker"])
        return f"{shown}: {turn['text']}"

    return "\n".join(
        _render(chunk_idx, turn)
        for chunk_idx, turns in chunk_turns
        for turn in turns
    )
def build_transcript(conn, chunk_turns, keymap, cluster_to_vpid) -> str:
    """Render the final transcript, labelling each cluster via its voiceprint's display label."""
    display: dict = {}
    for cluster, vpid in cluster_to_vpid.items():
        display[cluster] = _label_for(conn, vpid)
    return _labeled(chunk_turns, keymap, display)
def apply_names(conn, cluster_to_vpid: dict, idmap: dict) -> dict:
    """Attach confident names to the voiceprint library (person_label). Returns {cluster: name}.

    ``idmap`` is looked up under ``"Speaker <n>"`` and then ``"<n>"`` (n = cluster label + 1). A name
    is applied only when it is a non-blank string and the entry's ``confidence`` is ``"med"`` or
    ``"high"``. Malformed entries (non-dict info, non-string name) are skipped individually so that
    one bad entry does not abort naming for the remaining speakers.
    """
    named: dict[int, str] = {}
    for lab, vpid in cluster_to_vpid.items():
        info = idmap.get(f"Speaker {lab + 1}") or idmap.get(str(lab + 1)) or {}
        if not isinstance(info, dict):
            continue
        raw_name = info.get("name")
        # The map comes from a model; a list/number here must not raise on .strip().
        name = raw_name.strip() if isinstance(raw_name, str) else ""
        if name and info.get("confidence") in ("med", "high"):
            conn.execute("UPDATE voiceprints SET person_label=? WHERE voiceprint_id=?", (name, vpid))
            named[lab] = name
    conn.commit()
    return named
def add_name_edges(conn, doc, cluster_to_vpid: dict) -> int:
    """Add name-based shared_guest edges and return how many edge upserts were issued.

    For each distinct voiceprint of this document that carries a ``person_label``, every OTHER source
    with an observation of a voiceprint bearing the same label gets a ``shared_guest`` edge to this
    document's source — even if the voiceprints themselves did not cluster (drift-robust complement
    to voiceprint matching, §4.5). An existing edge has its weight incremented instead.
    """
    upserts = 0
    for vpid in set(cluster_to_vpid.values()):
        row = conn.execute(
            "SELECT person_label FROM voiceprints WHERE voiceprint_id=?", (vpid,)
        ).fetchone()
        person = row["person_label"] if row else None
        if not person:
            continue  # unnamed voice: nothing to link by name

        elsewhere = conn.execute(
            """SELECT DISTINCT d.source_id FROM voiceprints v
               JOIN voiceprint_observations o ON o.voiceprint_id = v.voiceprint_id
               JOIN documents d ON d.doc_id = o.doc_id
               WHERE v.person_label = ? AND d.source_id != ?""",
            (person, doc["source_id"]),
        ).fetchall()
        for other in elsewhere:
            src_a, src_b = sorted([doc["source_id"], other["source_id"]])
            conn.execute(
                """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
                   VALUES (?,?,'shared_guest',1.0,?)
                   ON CONFLICT(src_a, src_b, edge_type)
                   DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence""",
                (src_a, src_b, f"name:{person}"),
            )
            upserts += 1
    conn.commit()
    return upserts
def _download_audio(doc, cfg) -> Path:
    """Fetch the document's audio into the cache directory and return the local path.

    A cached ``<doc_id>.wav`` (colons replaced by underscores) is reused when present. YouTube
    documents — by ``kind`` or by URL host — go through ``download_youtube_audio``; everything else
    is downloaded as an enclosure to ``<doc_id>.src`` and converted with ``to_wav_16k_mono``.
    """
    cache_dir = Path(cfg.audio_cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)

    stem = doc["doc_id"].replace(":", "_")
    wav_path = cache_dir / f"{stem}.wav"
    if wav_path.exists():
        return wav_path

    url = doc["url"]
    is_youtube = doc["kind"] == "youtube" or (url and ("youtube.com" in url or "youtu.be" in url))
    if is_youtube:
        return download_youtube_audio(url, cache_dir, archive_file=cache_dir / "yt-archive.txt")

    source_file = download_enclosure(url, cache_dir / f"{stem}.src")
    return to_wav_16k_mono(source_file, wav_path)
def process_document(conn, sc, cfg, doc, *, max_chunks: int, chunk_seconds: int = 150,
                     keep_audio: bool = False) -> int:
    """Run the full audio pipeline for one document and enqueue its extraction job.

    Steps: download audio → chunk (at most ``max_chunks`` chunks of ``chunk_seconds``) →
    diarize + transcribe → stitch speakers → resolve voiceprints → best-effort speaker naming →
    name-based edges → write the labelled transcript → update the ``documents`` row → enqueue an
    ``extract`` job → delete the audio unless ``keep_audio``.

    Returns the number of chunks that produced turns. Exceptions from the download, chunking,
    transcription and persistence steps propagate to the caller; only speaker naming is
    best-effort.
    """
    import hashlib

    safe_id = doc["doc_id"].replace(":", "_")
    audio = _download_audio(doc, cfg)
    chunkdir = Path(cfg.audio_cache_dir) / f"chunks_{safe_id}"
    chunks = chunk_audio(audio, chunkdir, chunk_seconds=chunk_seconds)[:max_chunks]
    chunk_turns, chunk_speakers = diarize_transcribe_chunks(
        sc, chunks, concurrency=getattr(cfg, "audio_concurrency", 2))
    keymap, centroids = stitch_and_centroids(chunk_speakers)
    cluster_to_vpid = resolve_voiceprints(conn, doc, centroids)

    # Name the speakers (§4.5): host introduces guest in 1-on-1 → attach person_label, then a
    # name-based shared_guest edge that survives voiceprint drift across shows.
    src = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
    try:
        from ..extract.backends import from_config as backend_from_config
        from .identify import identify_speakers
        backend = backend_from_config(cfg, sc)
        draft = _labeled(chunk_turns, keymap, {lab: f"Speaker {lab + 1}" for lab in cluster_to_vpid})
        # Only the first 6000 characters of the draft are sent for identification.
        idmap = identify_speakers(backend, draft[:6000], source_name=src["name"] if src else "")
        named = apply_names(conn, cluster_to_vpid, idmap)
        if named:
            log.info("named speakers in %s: %s", doc["doc_id"], ", ".join(named.values()))
    except Exception as e:  # noqa: BLE001 — naming is best-effort enrichment
        log.warning("speaker identification failed for %s: %s", doc["doc_id"], e)
    add_name_edges(conn, doc, cluster_to_vpid)

    transcript = build_transcript(conn, chunk_turns, keymap, cluster_to_vpid)
    tpath = Path(cfg.data_dir) / "transcripts" / f"{safe_id}.txt"
    tpath.parent.mkdir(parents=True, exist_ok=True)
    # Write as UTF-8 explicitly: the content hash below is computed over the UTF-8 bytes, and the
    # platform default encoding may be unable to encode speaker names or transcript text.
    tpath.write_text(transcript, encoding="utf-8")
    content_hash = hashlib.sha256(transcript.encode()).hexdigest()
    conn.execute(
        "UPDATE documents SET transcript_path=?, duration_sec=?, content_hash=?, processed_at=datetime('now') WHERE doc_id=?",
        # duration is approximated as processed chunk count x nominal chunk length
        (str(tpath), len(chunks) * chunk_seconds, content_hash, doc["doc_id"]),
    )
    conn.commit()

    h = hashlib.sha256(f"{doc['doc_id']}|extract-v0".encode()).hexdigest()
    queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
                  parent_doc_id=doc["doc_id"], priority=100)
    if not keep_audio:
        _cleanup_audio(audio, chunkdir)
    return len(chunk_turns)
def _cleanup_audio(audio: Path, chunkdir: Path) -> None:
    """Delete the downloaded audio, its sibling ``.src`` file and the chunk directory.

    Audio is large and disposable once transcribed (the transcript + voiceprints are what we keep);
    backfilling hundreds of 1-3 hr episodes would otherwise be tens of GB. Failures are logged as a
    warning and never raised.
    """
    import shutil

    try:
        for leftover in (audio, audio.with_suffix(".src")):
            if leftover.exists():
                leftover.unlink()
        if chunkdir.exists():
            shutil.rmtree(chunkdir, ignore_errors=True)
    except Exception as e:  # noqa: BLE001
        log.warning("audio cleanup failed for %s: %s", audio, e)
def run_transcribe(conn, sc, cfg, *, limit: int = 5, max_chunks: int = 999,
                   lease_seconds: int = 3600, worker_id: str = "transcribe-1") -> dict:
    """Lease and process up to ``limit`` ``transcribe`` jobs, one at a time.

    Each leased job counts toward the limit, including jobs skipped because their document row is
    missing. A failure inside processing is reported to the queue via ``queue.fail`` and logged;
    it does not stop the loop. Returns ``{"jobs_processed": <leased job count>}``.
    """
    handled = 0
    for _ in range(limit):
        job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"],
                               lease_seconds=lease_seconds)
        if job is None:
            break  # queue drained
        handled += 1

        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
        if doc is None:
            queue.skip(conn, job["job_id"], "document missing")
            continue

        try:
            n = process_document(conn, sc, cfg, doc, max_chunks=max_chunks)
            queue.complete(conn, job["job_id"], output_ref=f"{n} chunks")
            log.info("transcribed %s (%d chunks)", doc["doc_id"], n)
        except Exception as e:  # noqa: BLE001
            state = queue.fail(conn, job["job_id"], e)
            log.warning("transcribe failed for %s: %s (→ %s)", job["target_id"], e, state)
    return {"jobs_processed": handled}
+6
View File
@@ -0,0 +1,6 @@
"""The scoring brain (build blueprint).
Stats/geometry NOMINATE candidates; the frontier model only judges/expands a pre-filtered shortlist
(§5.1). Every count that feeds a score routes through the independence primitive (EISC), never a raw
source count (§4.5). Every scorer reads `visible_claims` (as-of filtered), never `claims` directly.
"""
+43
View File
@@ -0,0 +1,43 @@
"""As-of harness (§6.6 look-ahead guard).
Every scorer reads the `visible_claims` TEMP VIEW, never `claims` directly: at nomination time only
claims dated <= as_of are visible, so the backtest can't reward noticing what already happened. The
view also resolves merged canonical topics (topics.status='merged') to a stable `topic_id`.
"""
from __future__ import annotations
import sqlite3
class Scorer:
    """Context manager that binds a run to an as_of date and exposes `visible_claims`.

    mode='backtest' enforces strict as-of discipline; 'forward' is the live pilot. as_of is a
    controlled ISO date (YYYY-MM-DD). Views can't take bound parameters, so the date is inlined into
    the view DDL as a SQL string literal; embedded single quotes are doubled so the value can never
    terminate the literal early.

    NOTE: entering the context uses ``executescript``, which per the sqlite3 documentation commits
    any pending transaction on the connection before running the script.
    """

    def __init__(self, conn: sqlite3.Connection, as_of: str, *, mode: str = "backtest") -> None:
        self.conn = conn
        self.as_of = as_of
        self.mode = mode  # stored for callers; the view definition itself does not depend on it

    def __enter__(self) -> "Scorer":
        """(Re)create the TEMP VIEW `visible_claims` limited to claims dated <= as_of."""
        # Standard SQL escaping: ' -> '' keeps the whole value inside one string literal.
        as_of_literal = str(self.as_of).replace("'", "''")
        self.conn.executescript(
            f"""
            DROP VIEW IF EXISTS visible_claims;
            CREATE TEMP VIEW visible_claims AS
            SELECT c.*,
            COALESCE((SELECT t.merged_into FROM topics t
            WHERE t.topic_canonical = c.topic_canonical AND t.status='merged'),
            c.topic_canonical) AS topic_id
            FROM claims c
            JOIN documents d ON d.doc_id = c.doc_id
            WHERE c.date IS NOT NULL AND c.date <= '{as_of_literal}';
            """
        )
        return self

    def __exit__(self, *exc) -> None:
        """Drop the view; exceptions from the body are not suppressed."""
        self.conn.execute("DROP VIEW IF EXISTS visible_claims")

    def count_visible(self) -> int:
        """Number of rows currently exposed by `visible_claims`."""
        return self.conn.execute("SELECT COUNT(*) FROM visible_claims").fetchone()[0]
+49
View File
@@ -0,0 +1,49 @@
"""The quantitative bar (§5.1, §6.6) — the single gate between nomination and the frontier judge.
Two tiers:
- evidence bar → clears hard gates → WRITE A LEDGER ROW (the denominator, §6.6), even if never judged.
- promotion bar → also clears the score threshold → goes to the frontier judge.
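THE GLOBAL META-RULE (applied to every scorer): no candidate clears on a single source or single
cluster — EISC_adj >= 2.0 AND K_eff >= 2. This is the §2.1 anti-lonely-outlier law, enforced once.
Exception as implemented below: the Job B `under_acted` gate requires n_src >= 2 and the EISC floor
but deliberately does NOT hard-gate on K_eff (cross-cluster only boosts its score).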
"""
from __future__ import annotations
# Independence floor: `_under_acted` requires eisc_corrob >= EISC_FLOOR to clear the evidence bar.
EISC_FLOOR = 2.0
# Cross-cluster floor named by the meta-rule. NOTE(review): no gate visible in this module reads
# KEFF_FLOOR (the Job B gate intentionally skips the k_eff check) — presumably reserved for the
# Job A scorers; confirm before removing.
KEFF_FLOOR = 2
# Defaults; overridable via the score_thresholds table (so the backtest can sweep without code edits).
# Scorers missing from this dict fall back to 0.0 in `_min_score`.
DEFAULT_MIN_SCORE = {"under_acted": 0.3, "emergence": 2.0, "contrarian": 1.5,
                     "convergence": 2.5, "intersection": 2.0}
def _min_score(conn, scorer: str) -> float:
    """Return the promotion threshold for ``scorer``.

    A non-NULL ``min_score`` row in ``score_thresholds`` wins; otherwise (or when ``conn`` is None)
    the value from ``DEFAULT_MIN_SCORE`` is used, defaulting to 0.0 for unknown scorers.
    """
    fallback = DEFAULT_MIN_SCORE.get(scorer, 0.0)
    if conn is None:
        return fallback
    override = conn.execute(
        "SELECT min_score FROM score_thresholds WHERE scorer=?", (scorer,)
    ).fetchone()
    if not override or override[0] is None:
        return fallback
    return float(override[0])
def evaluate(scorer: str, result: dict, *, conn=None) -> tuple[bool, bool]:
    """Gate one scorer result. Returns (cleared_evidence_bar, cleared_promotion_bar).

    Only ``under_acted`` is wired; every other scorer name yields ``(False, False)``.
    """
    if scorer != "under_acted":
        return (False, False)  # Job A scorers wired with the forward pilot
    threshold = _min_score(conn, scorer)
    return _under_acted(result, threshold)
def _under_acted(result: dict, min_score: float) -> tuple[bool, bool]:
    """Apply the Job B gates to one result; returns ``(evidence, promotion)``.

    Evidence requires independent corroboration plus — unless the result is flagged ``is_breaker`` —
    a conviction weight of at least 0.7 and an exposure of ``"none"`` or ``"lt2"``. Promotion
    additionally requires ``result["score"] >= min_score``.
    """
    inputs = result["inputs"]
    is_breaker = bool(inputs.get("is_breaker"))

    # §4.4 Job B = "rising INDEPENDENT corroboration". EISC>=2.0 enforces independence (shared-guest +
    # same-cluster discounting), so this is NOT an isolated point or one-guest echo (§2.1). Cross-cluster
    # (k_eff>=2) is the §4.5 GOLD for Job A DISCOVERY — NOT a hard gate for Job B corroboration: N
    # independent energy companies confirming a power thesis is real corroboration. Cross-cluster still
    # BOOSTS the score (eisc_corrob = eisc_adj includes the xcluster_mult) so cross-cluster ranks first.
    # The generator keeps the checks lazy and in their original order.
    corroborated = all(check() for check in (
        lambda: inputs.get("n_confirmed", 0) >= 4,
        lambda: inputs.get("n_src", 0) >= 2,
        lambda: inputs.get("eisc_corrob", 0.0) >= EISC_FLOOR,
        lambda: inputs.get("a_corrob", 0.0) > 0,
    ))
    conviction_ok = is_breaker or inputs.get("conviction_weight", 0.0) >= 0.7  # med-high / high
    exposure_ok = is_breaker or inputs.get("exposure") in ("none", "lt2")  # genuine exposure gap

    evidence = corroborated and conviction_ok and exposure_ok
    # The score is only consulted once the evidence bar has been cleared.
    promotion = (result["score"] >= min_score) if evidence else False
    return evidence, promotion
+86
View File
@@ -0,0 +1,86 @@
"""Pre-registered confusion matrix on the §7.1 derivatives (DESIGN_v2 §1.3).
Measures PRECISION and RECALL, not recall alone. Uses the engine's already-stored candidate_scores
(cleared_date + whisper_date) × the pre-registered external repricing (resolution.K2023.yaml). Reports
the matrix at BOTH the cleared level (what the engine fired) and the whisper level (what it saw before
the independence floor) — the delta is the empirical answer to the gate debate.
"""
from __future__ import annotations
import json
from datetime import datetime
import yaml
from .external import basket_index, fetch_eod, resolve_reprice, runway_at_signal
def _engine_dates(conn) -> dict[str, dict]:
    """For each under_acted node: earliest cleared as_of and earliest whisper as_of (n_conf>=4, a>0).

    Rows are keyed by ``node_id``, falling back to ``conviction_id``. A row whose ``inputs_json`` is
    NULL, not valid JSON, or not a JSON object still counts toward ``cleared`` but cannot produce a
    ``whisper`` date; it no longer aborts the whole report.
    """
    rows = conn.execute(
        "SELECT node_id, conviction_id, as_of, cleared_evidence_bar ev, inputs_json "
        "FROM candidate_scores WHERE scorer='under_acted'"
    ).fetchall()
    out: dict[str, dict] = {}
    for r in rows:
        k = r["node_id"] or r["conviction_id"]
        try:
            i = json.loads(r["inputs_json"])
        except (TypeError, ValueError):
            # NULL column or length-capped JSON written upstream: treat the inputs as unknown.
            i = {}
        if not isinstance(i, dict):
            i = {}
        d = out.setdefault(k, {"cleared": None, "whisper": None})
        # as_of values are ISO date strings, so string comparison orders them chronologically.
        if r["ev"] and (d["cleared"] is None or r["as_of"] < d["cleared"]):
            d["cleared"] = r["as_of"]
        if i.get("n_confirmed", 0) >= 4 and i.get("a_corrob", 0) > 0:
            if d["whisper"] is None or r["as_of"] < d["whisper"]:
                d["whisper"] = r["as_of"]
    return out
def _lead_days(repricing_date: str, signal_date: str | None) -> int | None:
    """Days from the signal to the repricing (positive = the signal came first).

    Both dates are parsed as ``YYYY-MM-DD``; returns None when either one is missing.
    """
    if not (signal_date and repricing_date):
        return None
    fmt = "%Y-%m-%d"
    repriced = datetime.strptime(repricing_date, fmt)
    signalled = datetime.strptime(signal_date, fmt)
    return (repriced - signalled).days
def run_confusion(conn, cfg, spec_path: str) -> dict:
    """Build the pre-registered confusion matrix from a YAML resolution spec.

    The spec supplies ``window`` (start/end), ``rule`` (threshold_pct/hold_pct/hold_days) and
    ``baskets`` (node -> list of symbols). For each node the basket index is resolved against the
    rule and joined with the engine's earliest cleared / whisper dates.

    Returns ``{"rows", "cleared", "whisper", "classify"}`` where ``cleared`` and ``whisper`` are
    ``(counts, precision, recall)`` tuples (precision/recall are None when undefined) and
    ``classify`` is the row classifier function itself.
    """
    # Context manager so the spec file handle is closed deterministically.
    with open(spec_path, encoding="utf-8") as fh:
        spec = yaml.safe_load(fh)
    w, rule = spec["window"], spec["rule"]
    engine = _engine_dates(conn)
    price_cache: dict[str, list] = {}  # one fetch per symbol even if it appears in several baskets
    rows = []
    for node, basket in spec["baskets"].items():
        prices = {}
        for sym in basket:
            if sym not in price_cache:
                price_cache[sym] = fetch_eod(cfg.fmp_api_key, sym, w["start"], w["end"])
            prices[sym] = price_cache[sym]
        missing = [s for s in basket if not prices[s]]
        idx = basket_index(prices)
        res = resolve_reprice(idx, threshold_pct=rule["threshold_pct"], hold_pct=rule["hold_pct"],
                              hold_days=rule["hold_days"])
        ed = engine.get(node, {"cleared": None, "whisper": None})
        rows.append({
            "node": node, "basket": basket, "missing": missing,
            "confirmed": res["confirmed"], "repricing_date": res["repricing_date"], "peak_pct": res["peak_pct"],
            "cleared_date": ed["cleared"], "whisper_date": ed["whisper"],
            "lead_cleared": _lead_days(res["repricing_date"], ed["cleared"]) if res["confirmed"] else None,
            "lead_whisper": _lead_days(res["repricing_date"], ed["whisper"]) if res["confirmed"] else None,
            # DESIGN_v2.1 Correction A: runway = fraction of the durable move still ahead at signal
            "runway_cleared": runway_at_signal(idx, ed["cleared"]) if res["confirmed"] else None,
            "runway_whisper": runway_at_signal(idx, ed["whisper"]) if res["confirmed"] else None,
        })

    def classify(r, level):
        # fired = the engine produced a date at this level; real = the external rule confirmed.
        fired = bool(r[f"{level}_date"])
        real = r["confirmed"]
        return "TP" if (fired and real) else "FP" if (fired and not real) else "FN" if real else "TN"

    def matrix(level):
        c = {"TP": 0, "FP": 0, "FN": 0, "TN": 0}
        for r in rows:
            c[classify(r, level)] += 1
        p = c["TP"] / (c["TP"] + c["FP"]) if (c["TP"] + c["FP"]) else None
        rec = c["TP"] / (c["TP"] + c["FN"]) if (c["TP"] + c["FN"]) else None
        return c, p, rec

    return {"rows": rows, "cleared": matrix("cleared"), "whisper": matrix("whisper"),
            "classify": classify}
+96
View File
@@ -0,0 +1,96 @@
"""External-confirmation data for the resolver (DESIGN_v2 §1). Price series via FMP (already paid for).
This is the *resolving* leg (§6.2): real-world repricing, not discourse. Kept deliberately simple and
transparent — the resolution rule is pre-registered, so the code here only fetches + applies it.
"""
from __future__ import annotations
import requests
_FMP = "https://financialmodelingprep.com"
def fetch_eod(api_key: str, symbol: str, start: str, end: str) -> list[tuple[str, float]]:
    """Daily (date, close) for a symbol. Tries the FMP 'stable' then legacy 'v3' price endpoints.

    Returns the first non-empty series, sorted ascending, with ``adjClose`` substituted when
    ``close`` is missing or falsy. Returns ``[]`` when both endpoints fail or yield nothing:
    network errors, non-200 responses and undecodable bodies are treated as "try the next
    endpoint", never raised.
    """
    attempts = [
        (f"{_FMP}/stable/historical-price-eod/full", {"symbol": symbol, "from": start, "to": end}),
        (f"{_FMP}/api/v3/historical-price-full/{symbol}", {"from": start, "to": end}),
    ]
    # The session is a context manager so its pooled connections are released on every exit path.
    with requests.Session() as s:
        for url, params in attempts:
            try:
                r = s.get(url, params={**params, "apikey": api_key}, timeout=40)
                if r.status_code != 200:
                    continue
                j = r.json()
            except Exception:  # noqa: BLE001
                continue
            # Either an object wrapping the rows under "historical" or a bare list of rows.
            rows = j.get("historical") if isinstance(j, dict) else j
            if not isinstance(rows, list) or not rows:
                continue
            out = [(x["date"][:10], x.get("close") or x.get("adjClose")) for x in rows
                   if isinstance(x, dict) and x.get("date") and (x.get("close") or x.get("adjClose"))]
            if out:
                return sorted(out)
    return []
def basket_index(prices_by_symbol: dict[str, list[tuple[str, float]]]) -> list[tuple[str, float]]:
    """Equal-weight index of a basket, as ``(date, value)`` pairs sorted by date.

    Every symbol is normalized to its own first close, and the index on each date is the plain
    average over the symbols that have data that day. (Symbols that IPO'd mid-window enter at 1.0
    when they start — flagged by the caller.) Symbols with no series, or a falsy first close,
    contribute nothing.
    """
    normalized: dict[str, dict[str, float]] = {}
    for symbol, series in prices_by_symbol.items():
        if not series:
            continue
        first_close = series[0][1]
        normalized[symbol] = {day: close / first_close for day, close in series if first_close}

    all_days = sorted({day for per_symbol in normalized.values() for day in per_symbol})

    index: list[tuple[str, float]] = []
    for day in all_days:
        todays = [per_symbol[day] for per_symbol in normalized.values() if day in per_symbol]
        if todays:
            index.append((day, sum(todays) / len(todays)))
    return index
def index_value_at(index: list[tuple[str, float]], date: str | None) -> float | None:
    """Value of the last index entry dated on or before ``date``.

    Falls back to the first entry's value (the baseline) when the signal predates the data, and
    returns None when the index is empty or no date is given.
    """
    if not (index and date):
        return None
    found = False
    latest = None
    for day, value in index:
        if day <= date:
            found, latest = True, value
    return latest if found else index[0][1]
def runway_at_signal(index: list[tuple[str, float]], signal_date: str | None) -> float | None:
    """Fraction of the durable move STILL AHEAD at the signal date (DESIGN_v2.1 Correction A).

    1.0 = whole move ahead (signal before it); 0.0 = signal at the peak. The right metric for a
    long-duration holder — a modestly-late signal with most of the move ahead is still actionable.
    Returns None without an index or signal date, or when the index never rises above its first
    value. The result is rounded to 2 decimals and floored at 0.0.

    NOTE(review): the peak is taken over the WHOLE index, including dates before the signal, and
    the result has no upper clamp — confirm this matches the pre-registered definition.
    """
    if not (index and signal_date):
        return None
    value_at_signal = index_value_at(index, signal_date)
    baseline = index[0][1]
    peak = max(value for _day, value in index)
    if peak <= baseline or value_at_signal is None:
        return None
    remaining = (peak - value_at_signal) / (peak - baseline)
    return round(max(0.0, remaining), 2)
def resolve_reprice(index: list[tuple[str, float]], *, threshold_pct: float, hold_pct: float,
                    hold_days: int) -> dict:
    """Apply the pre-registered rule: first date the index is ≥ +threshold% vs baseline AND still
    ≥ +hold% `hold_days` later. Returns {confirmed, repricing_date, peak_pct}.

    The baseline is the first index value. "Later" means the first index entry dated on or after
    the crossing date plus ``hold_days`` calendar days; a crossing with no such entry cannot
    confirm. ``peak_pct`` is the maximum gain over baseline across the whole index, rounded to one
    decimal. An empty index, or a zero/None baseline (no meaningful percentage move), yields an
    unconfirmed result with ``peak_pct`` None.
    """
    from datetime import datetime, timedelta

    unresolved = {"confirmed": False, "repricing_date": None, "peak_pct": None}
    if not index:
        return unresolved
    base = index[0][1]
    if not base:
        # Guard the divisions below instead of raising ZeroDivisionError / TypeError.
        return unresolved

    thr = 1.0 + threshold_pct / 100.0
    hold = 1.0 + hold_pct / 100.0
    peak_pct = round((max(v for _, v in index) / base - 1) * 100, 1)

    for d, v in index:
        if v / base >= thr:
            target = (datetime.strptime(d, "%Y-%m-%d") + timedelta(days=hold_days)).strftime("%Y-%m-%d")
            # First observation on/after the hold horizon (ISO dates compare chronologically).
            later = next((vv for dd, vv in index if dd >= target), None)
            if later is not None and (later / base) >= hold:
                return {"confirmed": True, "repricing_date": d, "peak_pct": peak_pct}
    return {"confirmed": False, "repricing_date": None, "peak_pct": peak_pct}
+113
View File
@@ -0,0 +1,113 @@
"""Effective Independent Source Count (EISC) — the system's differentiator (§4.5).
Discount convergence by source connectedness. Five shows that "independently converge" but share one
guest must count as ~one voice; three shows across macro/energy/ai with no shared guests are gold.
Method (resolved in the design panel): noisy-OR connectedness matrix + inverse-row-sum EISC.
- symmetric & order-independent (unlike a sequential pairwise-penalty walk)
- each source's contribution is individually explainable ("counts 0.31 because connected to 3 others")
- collapses correctly: 5 clones -> ~1.0 ; 5 cross-cluster independents -> ~5.0 (raw)
- no eigensolve (unstable at n=2..4, our common case)
"""
from __future__ import annotations
from collections import defaultdict
import numpy as np
# Coupling per edge type: a voiceprint-confirmed shared guest is near-total redundancy on a topic.
# Edge types not listed here contribute zero coupling (looked up with a 0.0 default).
KAPPA = {"shared_guest": 0.85, "citation": 0.45, "community": 0.60}
# Same-cluster baseline correlation (sources in the same world are partly redundant even w/o an edge).
# Clusters not listed here use SAME_CLUSTER_DEFAULT; sources in different clusters get no baseline.
CLUSTER_COUPLING = {"bitcoin": 0.55, "vc_consensus": 0.35}
SAME_CLUSTER_DEFAULT = 0.25
EDGE_CLAMP = 0.95 # cap kappa*weight so a heavily-weighted edge can't exceed near-total
CAP_VALUE = 0.25 # §4.5: bitcoin / capped sources contribute at most 0.25 of a voice
CLUSTER_MIN_CONTRIB = 0.5 # a cluster must add >= half an independent voice to count toward K_eff
def effective_independent_N(srcs: list[tuple], edges: list[tuple], *, mode: str = "live") -> dict:
    """Effective independent source count for a set of sources.

    srcs: [(source_id, source_cluster, cluster_capped_low[, own_network])]; edges: [(a,b,type,weight)].
    mode='live' (default) DROPS own_network sources (Ten31's own orbit — listening to ourselves, §v2.1);
    mode='test' keeps them (the reflexivity test fixture).

    Returns ``{eisc_adj, eisc_raw, k_eff, xcluster_mult, per_source_contrib}``. Each source
    contributes ``cap / row-sum`` of a pairwise connectedness matrix; ``eisc_adj`` is the raw sum
    times the cross-cluster multiplier.
    """
    if mode == "live":
        srcs = [s for s in srcs if len(s) <= 3 or not s[3]]

    ids = [s[0] for s in srcs]
    n = len(ids)
    if not n:
        return {"eisc_adj": 0.0, "eisc_raw": 0.0, "k_eff": 0, "xcluster_mult": 1.0, "per_source_contrib": {}}

    known = set(ids)
    cluster_of = {s[0]: s[1] for s in srcs}
    is_capped = {s[0]: (bool(s[2]) or s[1] == "bitcoin") for s in srcs}

    # edge channel: combine all edges between a pair by noisy-OR product of (1 - kappa*weight)
    independence: dict = {}
    for a, b, etype, w in edges:
        if a == b or a not in known or b not in known:
            continue
        weight = 1.0 if w is None else w
        term = min(EDGE_CLAMP, KAPPA.get(etype, 0.0) * weight)
        pair = frozenset((a, b))
        independence[pair] = independence.get(pair, 1.0) * (1.0 - term)

    # Symmetric connectedness matrix; the diagonal (self-coupling) stays 1.0.
    C = np.eye(n)
    for i, a in enumerate(ids):
        for j in range(i + 1, n):
            b = ids[j]
            edge_coupling = 1.0 - independence.get(frozenset((a, b)), 1.0)  # 0 if no edge
            ci, cj = cluster_of[a], cluster_of[b]
            same_cluster = ci is not None and ci == cj
            cluster_coupling = CLUSTER_COUPLING.get(ci, SAME_CLUSTER_DEFAULT) if same_cluster else 0.0
            C[i, j] = C[j, i] = 1.0 - (1.0 - edge_coupling) * (1.0 - cluster_coupling)

    rowsum = C.sum(axis=1)  # includes the diagonal 1.0
    contrib: dict = {}
    eisc_raw = 0.0
    cluster_mass: dict = defaultdict(float)
    for i, sid in enumerate(ids):
        cap = CAP_VALUE if is_capped[sid] else 1.0
        contrib[sid] = cap * (1.0 / rowsum[i])
        eisc_raw += contrib[sid]
        if not is_capped[sid] and cluster_of[sid]:
            cluster_mass[cluster_of[sid]] += contrib[sid]

    # cross-cluster bonus: count NON-capped clusters that genuinely contribute an independent voice
    # (summed contribution >= half a voice). This stops "one guest across many clusters" from earning
    # the gold multiplier — the raw EISC already collapses that guest to ~1, and k_eff must agree.
    k_eff = sum(1 for mass in cluster_mass.values() if mass >= CLUSTER_MIN_CONTRIB)
    xmult = max(1.0, 1.0 + 0.5 * (k_eff - 1))  # 1clu->1.0, 2->1.5, 3->2.0 (gold)
    return {
        "eisc_adj": xmult * eisc_raw,
        "eisc_raw": eisc_raw,
        "k_eff": k_eff,
        "xcluster_mult": xmult,
        "per_source_contrib": {sid: round(value, 4) for sid, value in contrib.items()},
    }
# --- DB helpers (the brain only READS the graph; edges are produced upstream by the voiceprint lib) ---
def load_source_meta(conn, ids: list[str]) -> list[tuple]:
    """Fetch ``(source_id, source_cluster, cluster_capped_low, own_network)`` for the given ids.

    Ids are de-duplicated first; a NULL ``own_network`` is read as 0; ids with no ``sources`` row
    are simply absent from the result. Returns ``[]`` for an empty id list.
    """
    unique_ids = list(dict.fromkeys(ids))
    if not unique_ids:
        return []
    ph = ",".join(["?"] * len(unique_ids))
    cursor = conn.execute(
        f"SELECT source_id, source_cluster, cluster_capped_low, COALESCE(own_network,0) "
        f"FROM sources WHERE source_id IN ({ph})", unique_ids
    )
    return [(row[0], row[1], row[2], row[3]) for row in cursor.fetchall()]
def load_edges(conn, ids: list[str]) -> list[tuple]:
    """Fetch ``(src_a, src_b, edge_type, weight)`` for edges whose BOTH endpoints are in ``ids``.

    Ids are de-duplicated first; returns ``[]`` for an empty id list.
    """
    unique_ids = list(dict.fromkeys(ids))
    if not unique_ids:
        return []
    ph = ",".join(["?"] * len(unique_ids))
    # The id list is bound twice: once per IN (...) clause.
    cursor = conn.execute(
        f"SELECT src_a, src_b, edge_type, weight FROM source_edges WHERE src_a IN ({ph}) AND src_b IN ({ph})",
        unique_ids + unique_ids,
    )
    return [(row[0], row[1], row[2], row[3]) for row in cursor.fetchall()]
def eisc_for(conn, source_ids: list[str], *, mode: str = "live") -> dict:
    """Convenience: EISC for a set of source_ids, loading cluster/cap/own_network + edges from SQLite.

    mode='live' drops own_network sources; mode='test' keeps them (§v2.1 condition 1).
    """
    unique_ids = list(dict.fromkeys(source_ids))
    source_rows = load_source_meta(conn, unique_ids)
    edge_rows = load_edges(conn, unique_ids)
    return effective_independent_N(source_rows, edge_rows, mode=mode)
+49
View File
@@ -0,0 +1,49 @@
"""Ledger + candidate_scores writers. Log EVERY bar-clearer from day one (§6.6 denominator).
date_logged = as_of (backtest rows carry historical dates so lead-time math is correct). The
discourse_metric JSON is FROZEN here at log time — the resolver (separate forward pass) never edits it.
Grant's rating lives in human_evaluations; the model never reads it pre-log (§6.7).
"""
from __future__ import annotations
import hashlib
import json
def _sig_id(scorer: str, key: str, as_of: str) -> str:
    """Deterministic ledger signal id: ``sig_`` + first 16 hex chars of SHA-1 over ``scorer|key|as_of``."""
    material = "{}|{}|{}".format(scorer, key, as_of)
    digest = hashlib.sha1(material.encode()).hexdigest()
    return "sig_" + digest[:16]
def _score_id(scorer: str, key: str, as_of: str) -> str:
    """Deterministic candidate_scores id: full SHA-1 hex digest of ``cs|scorer|key|as_of``."""
    material = "cs|{}|{}|{}".format(scorer, key, as_of)
    return hashlib.sha1(material.encode()).hexdigest()
def record_candidate_score(conn, result: dict, as_of: str, evidence: bool, promotion: bool) -> None:
    """Upsert one scorer result into ``candidate_scores`` and commit.

    The row id is derived from (scorer, key, as_of), where key is the first non-empty of
    ``node_id`` / ``conviction_id`` / ``topic_canonical``, so re-scoring the same candidate at the
    same as_of replaces the earlier row. ``result["inputs"]`` is stored as a complete JSON document.
    """
    key = result.get("node_id") or result.get("conviction_id") or result.get("topic_canonical") or ""
    # Stored whole: slicing the serialized text at a fixed length yields invalid JSON for large
    # inputs, which then breaks every reader that json.loads() this column.
    inputs_json = json.dumps(result["inputs"])
    conn.execute(
        """INSERT OR REPLACE INTO candidate_scores
           (score_id, scorer, as_of, topic_canonical, node_id, conviction_id, score,
            cleared_evidence_bar, cleared_promotion_bar, inputs_json)
           VALUES (?,?,?,?,?,?,?,?,?,?)""",
        (_score_id(result["scorer"], key, as_of), result["scorer"], as_of,
         result.get("topic_canonical"), result.get("node_id"), result.get("conviction_id"),
         result["score"], int(evidence), int(promotion), inputs_json),
    )
    conn.commit()
def log_candidate(conn, *, scorer: str, as_of: str, ledger_type: str, proposition: str,
                  discourse_metric: dict, origin_conviction_id=None, origin_node_id=None) -> str:
    """Write a bar-clearing candidate to the ledger (idempotently) and return its signal id.

    The signal id is derived from (scorer, key, as_of), where key is the first non-empty of
    ``origin_node_id`` / ``origin_conviction_id`` / ``proposition``. ``INSERT OR IGNORE`` keeps an
    already-logged row untouched, so the frozen ``discourse_metric`` is never overwritten. The
    proposition text is capped at 1000 characters; the metric is stored as a complete JSON document
    with the scorer name added; ``model_confidence`` is written as NULL.
    """
    key = origin_node_id or origin_conviction_id or proposition
    signal_id = _sig_id(scorer, key, as_of)
    dm = {**discourse_metric, "scorer": scorer}
    # Stored whole: slicing serialized JSON at a fixed length would freeze an unparseable metric.
    dm_json = json.dumps(dm)
    conn.execute(
        """INSERT OR IGNORE INTO ledger
           (signal_id, type, proposition, date_logged, discourse_metric, model_confidence,
            origin_conviction_id, origin_node_id)
           VALUES (?,?,?,?,?,?,?,?)""",
        (signal_id, ledger_type, proposition[:1000], as_of, dm_json, None,
         origin_conviction_id, origin_node_id),
    )
    conn.commit()
    return signal_id
+80
View File
@@ -0,0 +1,80 @@
"""Local-LLM scoring helpers (§4.4). Bounded labeling passes over PRE-FILTERED candidates only —
never nomination from the raw corpus (§5.1). JSON mode, temp 0, no thinking → deterministic.
Helper #2 (derivative-relevance) is built first — it's the one the §7.1 backtest needs. Helper #1
(stance-folding for Job A contrarian) comes with the forward pilot.
"""
from __future__ import annotations
import json
import logging
log = logging.getLogger(__name__)
# System prompt for `derivative_relevance`: asks the model to label each listed claim as
# affirms / contradicts / tangential against one hypothesis and to reply with JSON only, in the
# {"results": [...]} shape that `_parse` expects. The text is sent to the model verbatim.
_REL_SYS = (
    "You assess whether claims corroborate a specific investment hypothesis (a 2nd/3rd-order "
    "derivative of a thesis). For EACH claim decide: does it provide real-world evidence that the "
    "hypothesis is PLAYING OUT (corroborates), and the direction. 'affirms' = supports the hypothesis; "
    "'contradicts' = is evidence against it; 'tangential' = same topic words but not actually about the "
    "hypothesis (e.g. 'transformers' the ML architecture vs the electrical-grid kind). Be strict: a "
    "passing mention is tangential, not corroboration. "
    "TWO HARD RULES (these are the difference between catching a real signal and being fooled):\n"
    "1) REALIZED-ONLY. The hypothesis must be PLAYING OUT in fact. Announcements, plans, intentions, "
    "forecasts, targets, and 'may/will/expects/poised-to/aims-to/up-to' language are NOT corroboration — "
    "they are 'tangential' unless the claim states the thing has ACTUALLY HAPPENED / been DEPLOYED / "
    "closed. A $2B program 'announced' or capital 'made available' is NOT capital deployed. A company "
    "that 'may consider' or 'expects' something has not done it.\n"
    "2) ROLE-MATCH. The actor in the claim must occupy the role the hypothesis is about. If the "
    "hypothesis is that capital PROVIDERS are funding/supplying something, then a BORROWER or USER on the "
    "demand side (e.g. a firm posting an asset AS collateral to RECEIVE a loan) is the wrong side of the "
    "transaction → 'tangential' to that hypothesis, not 'affirms'. "
    'Return ONLY JSON: {"results":[{"claim_id":"...","corroborates":true|false,'
    '"direction":"affirms"|"contradicts"|"tangential"}]}.'
)
def _parse(raw: str) -> list[dict]:
    """Extract the ``results`` records from a model reply, tolerating wrapper text.

    Tries the whole reply as JSON first, then the span from the first ``{`` to the last ``}``.
    Returns only dict records carrying a truthy ``claim_id``. Anything unusable — a missing reply,
    unparseable text, a non-object payload, or a ``results`` value that is not a list — yields
    ``[]`` instead of raising, so the caller's retry-on-empty logic can take over.
    """
    if not isinstance(raw, (str, bytes, bytearray)):
        return []
    try:
        obj = json.loads(raw)
    except (ValueError, RecursionError):
        if not isinstance(raw, str):
            return []
        start, end = raw.find("{"), raw.rfind("}")
        if start < 0 or end < start:
            return []
        try:
            obj = json.loads(raw[start:end + 1])
        except (ValueError, RecursionError):
            return []
    results = obj.get("results", []) if isinstance(obj, dict) else []
    if not isinstance(results, list):
        # e.g. {"results": null} — iterating it would raise
        return []
    return [r for r in results if isinstance(r, dict) and r.get("claim_id")]
def derivative_relevance(backend, derivative: str, claims: list[dict]) -> dict[str, dict]:
    """Label pre-filtered claims against one derivative hypothesis.

    claims: [{claim_id, proposition}]. Returns {claim_id: {corroborates, direction}}.
    Filters retrieval near-misses; it cannot ADD claims search didn't return (not a nominator).
    An empty claim list returns ``{}`` without calling the backend; a reply that parses to nothing
    is retried once and then also yields ``{}``.
    """
    if not claims:
        return {}

    bullet_lines = [f"- [{c['claim_id']}] {c['proposition']}" for c in claims]
    listing = "\n".join(bullet_lines)
    user = (f"HYPOTHESIS (derivative): {derivative}\n\nCLAIMS:\n{listing}\n\n"
            f"Judge each claim id.")
    messages = [
        {"role": "system", "content": _REL_SYS},
        {"role": "user", "content": user},
    ]

    # Output is ~one JSON record per claim (claim_id + corroborates + direction ≈ 70-100 tokens). At
    # top_k=60 that's ~5k tokens — a fixed 3000 budget truncated mid-array → empty parse → a node
    # silently zeroed (the source of the unstable 5-affirm/0-affirm flip). Size the budget to the batch.
    budget = max(3000, 120 * len(claims) + 500)

    records: list[dict] = []
    for attempt_no in (1, 2):  # one retry — a gateway-under-load truncation shouldn't zero out a node
        raw = backend.complete_json(messages, max_tokens=budget)
        records = _parse(raw)
        if records:
            break
        log.warning("derivative_relevance empty parse (attempt %d) for %r; raw[:160]=%r",
                    attempt_no, derivative[:50], raw[:160])

    # The listing presents ids as `- [{claim_id}] ...`; the model INCONSISTENTLY echoes the id back with
    # the surrounding brackets ("[edgar:...]") — which then misses the bracket-less lookup key and the
    # whole node reads as 0/(missing). Normalize the brackets+whitespace so matching is robust either way.
    return {
        str(rec["claim_id"]).strip().strip("[]").strip(): {
            "corroborates": bool(rec.get("corroborates")),
            "direction": rec.get("direction", "tangential"),
        }
        for rec in records
    }
+27
View File
@@ -0,0 +1,27 @@
"""Resolver — the SEPARATE forward pass that closes the loop (§6.2, §6.3).
ARCHITECTURALLY ISOLATED from the scorers: it has no shared write path with them. Scorers write
candidate_scores + ledger rows with outcome columns NULL and a FROZEN discourse_metric. The resolver
runs later (larger as_of), reads ledger rows whose date_logged < as_of_now, and writes ONLY
resolution_date / discourse_outcome / external_outcome / lead_time_days. It is FORBIDDEN from touching
discourse_metric — that is the structural reason the ledger can't reward noticing what already happened.
Implementation note: real resolutions need forward time (the clock can't be backfilled). For the
backtest, the discourse leg can be resolved by re-running the discourse metric forward from date_logged;
the external leg (price/filings/human check, §6.5) is filled as that evidence arrives. Stubbed now to
lock the architecture; filled out for the forward pilot.
"""
from __future__ import annotations
def resolve_discourse_leg(conn, sc, cfg, *, as_of_now: str) -> int:
    """Resolve the discourse leg of unresolved ledger rows logged before ``as_of_now``.

    STUB: today it only selects the candidate rows (``resolution_date IS NULL`` and
    ``date_logged < as_of_now``) and returns 0; nothing is written. The query never reads or
    edits ``discourse_metric``. ``sc`` and ``cfg`` are accepted but not used yet.

    Returns:
        Count of rows resolved — always 0 until the forward pilot fills this in.
    """
    unresolved_sql = (
        "SELECT signal_id, date_logged FROM ledger "
        "WHERE resolution_date IS NULL AND date_logged < ?"
    )
    cursor = conn.execute(unresolved_sql, (as_of_now,))
    _pending = cursor.fetchall()  # fetched so the query is exercised; consumed by the TODO below
    # TODO(forward-pilot): re-run windowed independence from date_logged→as_of_now for each row's
    # origin derivative; set discourse_outcome in {up_cross_cluster,up_single_cluster,flat,down}.
    return 0
+81
View File
@@ -0,0 +1,81 @@
"""Scoring orchestrator. For Job B / the §7.1 backtest: march as_of dates, score every conviction +
fan-out derivative, gate, log the denominator, promote nodes.
"""
from __future__ import annotations
import logging
from ..extract.backends import from_config as backend_from_config
from . import bar, under_acted
from .asof import Scorer
from .ledger_writer import log_candidate, record_candidate_score
log = logging.getLogger(__name__)
def _nodes_for(conn, as_of, mode, conviction_ids):
    """Collect the scoring targets: conviction rows first, then their fan-out derivative nodes.

    Each target is a dict with keys ``conviction_id``, ``node_id`` (None for a conviction itself),
    ``derivative``, ``level``, ``exposure`` and ``breaker``. A falsy ``conviction_ids`` means "no
    filter". Only ``mode == "forward"`` restricts fan-out nodes to ``created_at <= as_of``.
    """
    ids = list(conviction_ids) if conviction_ids else []
    marks = ",".join("?" * len(ids))

    conviction_sql = (
        "SELECT conviction_id, thematic_proposition, conviction_level, current_exposure, is_thesis_breaker "
        "FROM conviction_log"
    )
    if ids:
        conviction_sql += f" WHERE conviction_id IN ({marks})"
    targets = [
        {"conviction_id": row[0], "node_id": None, "derivative": row[1],
         "level": row[2], "exposure": row[3], "breaker": bool(row[4])}
        for row in conn.execute(conviction_sql, ids)
    ]

    fanout_sql = ("SELECT f.node_id, f.parent_conviction_id, f.derivative_proposition, c.conviction_level, "
                  "c.current_exposure, c.is_thesis_breaker FROM fanout_nodes f "
                  "JOIN conviction_log c ON c.conviction_id = f.parent_conviction_id")
    clauses = []
    fanout_params = list(ids)
    if ids:
        clauses.append(f"f.parent_conviction_id IN ({marks})")
    if mode == "forward":  # backtest uses the seeded tree as the as-of-2023 hypothesis (no created_at leak)
        clauses.append("f.created_at <= ?")
        fanout_params.append(as_of)
    if clauses:
        fanout_sql += " WHERE " + " AND ".join(clauses)

    targets.extend(
        {"conviction_id": row[1], "node_id": row[0], "derivative": row[2],
         "level": row[3], "exposure": row[4], "breaker": bool(row[5])}
        for row in conn.execute(fanout_sql, fanout_params)
    )
    return targets
def run_under_acted(conn, sc, cfg, *, as_of, mode="backtest", conviction_ids=None, window_days=28) -> list[dict]:
    """Score every conviction / fan-out node at ``as_of`` with the under-acted scorer.

    For each node: score it, evaluate the bar, record the candidate score, and — when the evidence
    bar is cleared — log a ledger candidate and set the fan-out node's status to ``"signal"``
    (promotion cleared too) or ``"corroborated"``. The connection is committed after every node.

    Returns:
        One dict per node: ``{"node", "result", "evidence", "promotion"}``.
    """
    backend = backend_from_config(cfg, sc)
    results = []
    with Scorer(conn, as_of, mode=mode):
        for node in _nodes_for(conn, as_of, mode, conviction_ids):
            node_id = node["node_id"]
            conviction_id = node["conviction_id"]
            derivative = node["derivative"]

            scored = under_acted.score_node(
                conn, sc, backend, as_of=as_of, derivative=derivative,
                conviction_id=conviction_id, node_id=node_id,
                conviction_level=node["level"], exposure=node["exposure"], is_breaker=node["breaker"],
                window_days=window_days,
            )
            evidence, promotion = bar.evaluate("under_acted", scored, conn=conn)
            record_candidate_score(conn, scored, as_of, evidence, promotion)

            if evidence:
                log_candidate(conn, scorer="under_acted", as_of=as_of,
                              ledger_type="under_acted_conviction", proposition=derivative,
                              discourse_metric=scored["inputs"], origin_conviction_id=conviction_id,
                              origin_node_id=node_id)
                if node_id:  # convictions themselves have no fan-out row to update
                    new_status = "signal" if promotion else "corroborated"
                    conn.execute("UPDATE fanout_nodes SET status=? WHERE node_id=?",
                                 (new_status, node_id))
            conn.commit()
            results.append({"node": node, "result": scored, "evidence": evidence, "promotion": promotion})
    return results
def run_backtest(conn, sc, cfg, *, conviction_id, dates, window_days=90) -> list[tuple]:
    """March ``dates`` in order, scoring one conviction's tree at each as-of date in backtest mode.

    Returns:
        ``[(as_of, results)]`` where ``results`` is what ``run_under_acted`` returned for that date.
    """
    timeline = []
    for as_of in dates:
        per_date = run_under_acted(conn, sc, cfg, as_of=as_of, mode="backtest",
                                   conviction_ids=[conviction_id], window_days=window_days)
        timeline.append((as_of, per_date))
        n_fired = sum(1 for item in per_date if item["evidence"])
        log.info("as_of %s: %d/%d nodes cleared evidence bar", as_of, n_fired, len(per_date))
    return timeline
+105
View File
@@ -0,0 +1,105 @@
"""Two-sided net-corroboration (DESIGN_v2.1 H5 + condition 3) — the instrument for the adversarial cases.
For a derivative, track the INDEPENDENCE-WEIGHTED affirms MINUS denies over time. This is the right
output for Strike/Battery (where the question is "did the engine distinguish real adoption from
narrative, and catch the contradiction?"), not runway:
- STRIKE (reflexivity): a PASS = net stays low/quiet in LIVE mode (own_network dropped) while it
would have fired in TEST mode (own_network kept) → the engine refuses the intra-cluster echo.
- BATTERY (timing): the DEMAND derivative's net rises while the SUPPLY derivative's net stays flat →
"half-confirmed, the load-bearing half isn't moving" = the eroding-conviction signal.
Reuses the §4.6 relevance helper, which already returns direction affirms|contradicts|tangential.
"""
from __future__ import annotations
from .independence import eisc_for
from .llm_helpers import derivative_relevance
from .windows import window_bounds
def classify_corpus(sc, backend, derivative: str, as_of: str, *, top_k: int = 60) -> list[dict]:
    """Retrieve (as-of filtered) + LLM-classify each claim's direction toward the derivative.

    Args:
        sc: search client exposing ``search(query, collection=..., top_k=..., rerank=...)``.
        backend: LLM backend handed to ``derivative_relevance``.
        derivative: hypothesis text used both as the search query and the judging target.
        as_of: ISO date; hits whose ``date[:10]`` sorts after it are dropped.
        top_k: number of hits requested from search.

    Returns:
        ``[{claim_id, proposition, date, source_id, direction}]`` restricted to direction
        ``affirms`` / ``contradicts`` (tangential dropped). Empty when nothing usable is retrieved.
    """
    res = sc.search(derivative, collection="propositions", top_k=top_k, rerank=True)
    # A missing/None "data" field or a non-dict response means "no hits", not a crash.
    hits = (res.get("data") or []) if isinstance(res, dict) else []
    cand = []
    for h in hits:
        # Same hit-shape guard as the under-acted scorer: a malformed hit is skipped rather than
        # aborting the whole trajectory.
        pl = (h.get("payload") or {}) if isinstance(h, dict) else {}
        if not isinstance(pl, dict):
            continue
        d = pl.get("date")
        # Dates are compared lexically as ISO strings, so a non-string date cannot be as-of filtered.
        if not pl.get("claim_id") or not isinstance(d, str) or not d or d[:10] > as_of:
            continue
        cand.append({"claim_id": pl["claim_id"], "proposition": pl.get("proposition", ""),
                     "date": d[:10], "source_id": pl.get("source_id")})
    if not cand:
        return []
    rel = derivative_relevance(backend, derivative,
                               [{"claim_id": c["claim_id"], "proposition": c["proposition"]} for c in cand])
    out = []
    for c in cand:
        # Claims the model did not return a verdict for are treated as tangential.
        direction = rel.get(c["claim_id"], {}).get("direction", "tangential")
        if direction in ("affirms", "contradicts"):
            out.append({**c, "direction": direction})
    return out
# DESIGN_v2 ADOPT #1 (claim-type weighting): a node "resolves" on REALIZED, descriptive disclosure —
# not on forecasts/intent. A source counts toward the net only if it carries a HARD (realized-fact)
# claim on this side; predictive/interpretive claims (forecasts, opinion, 'may consider', 'expects')
# are the exact material that fooled the supply axis on Battery, so they don't qualify a source alone.
_HARD_CLAIM_TYPES = ("descriptive", "reactive")


def _hard_sources(conn, claim_ids: list[str]) -> set:
    """Return the source_ids that own at least one hard (descriptive/reactive) claim in ``claim_ids``.

    An empty ``claim_ids`` short-circuits to an empty set without touching the database.
    """
    if not claim_ids:
        return set()
    id_marks = ",".join("?" * len(claim_ids))
    type_marks = ",".join("?" * len(_HARD_CLAIM_TYPES))
    bindings = [*claim_ids, *_HARD_CLAIM_TYPES]
    cursor = conn.execute(
        f"SELECT DISTINCT source_id FROM claims WHERE claim_id IN ({id_marks}) AND claim_type IN ({type_marks})",
        bindings,
    )
    return {row[0] for row in cursor.fetchall()}
def net_at(conn, classified: list[dict], as_of: str, *, window_days: int = 90, mode: str = "live",
           require_hard_evidence: bool = True) -> dict:
    """Net independence-weighted corroboration in the single trailing window ending at ``as_of``.

    Claims are kept when ``start < date <= end`` and split by ``direction``. With
    ``require_hard_evidence`` (default), a source only counts on a side if ``_hard_sources`` finds a
    realized-fact claim for it among that side's claims — forecasts/intent alone don't qualify it.

    Returns:
        dict with ``as_of``, ``affirms_eisc``, ``denies_eisc``, ``net`` (all rounded to 2 places),
        raw claim counts ``n_affirm`` / ``n_deny``, ``hard_affirm_src``,
        ``soft_affirm_src_dropped`` and ``own_network_affirm_src``.
    """
    _, start, end = window_bounds(as_of, n=1, days=window_days)[0]
    in_window = [c for c in classified if start < c["date"] <= end]
    affirming = [c for c in in_window if c["direction"] == "affirms"]
    denying = [c for c in in_window if c["direction"] == "contradicts"]

    affirm_sources_all = {c["source_id"] for c in affirming}
    deny_sources_all = {c["source_id"] for c in denying}

    if not require_hard_evidence:
        affirm_sources = list(affirm_sources_all)
        deny_sources = list(deny_sources_all)
    else:
        hard_affirm = _hard_sources(conn, [c["claim_id"] for c in affirming])
        hard_deny = _hard_sources(conn, [c["claim_id"] for c in denying])
        affirm_sources = list(affirm_sources_all & hard_affirm)
        deny_sources = list(deny_sources_all & hard_deny)

    affirm_eisc = 0.0
    if affirm_sources:
        affirm_eisc = eisc_for(conn, affirm_sources, mode=mode)["eisc_adj"]
    deny_eisc = 0.0
    if deny_sources:
        deny_eisc = eisc_for(conn, deny_sources, mode=mode)["eisc_adj"]

    # How many of the qualifying affirming sources are flagged own_network in the sources table.
    own_count = 0
    if affirm_sources:
        marks = ",".join("?" * len(affirm_sources))
        own_row = conn.execute(
            f"SELECT COUNT(*) FROM sources WHERE source_id IN ({marks}) AND COALESCE(own_network,0)=1",
            affirm_sources,
        ).fetchone()
        own_count = own_row[0]

    return {
        "as_of": as_of,
        "affirms_eisc": round(affirm_eisc, 2),
        "denies_eisc": round(deny_eisc, 2),
        "net": round(affirm_eisc - deny_eisc, 2),
        "n_affirm": len(affirming),
        "n_deny": len(denying),
        "hard_affirm_src": len(affirm_sources),
        "soft_affirm_src_dropped": len(affirm_sources_all) - len(affirm_sources),
        "own_network_affirm_src": own_count,
    }
def trajectory(conn, sc, backend, derivative: str, as_of_dates: list[str], *,
               window_days: int = 90, mode: str = "live", top_k: int = 60) -> list[dict]:
    """Build the net-corroboration curve: one ``net_at`` result per date in ``as_of_dates``.

    The corpus is re-retrieved and re-classified for every date. Run it twice
    (``mode='live'`` vs ``'test'``) to see what the own_network quarantine removes — the
    reflexivity measurement.
    """
    return [
        net_at(
            conn,
            classify_corpus(sc, backend, derivative, as_of, top_k=top_k),
            as_of,
            window_days=window_days,
            mode=mode,
        )
        for as_of in as_of_dates
    ]
+75
View File
@@ -0,0 +1,75 @@
"""Under-acted-conviction scorer — Job B, the §7.1 backtest target.
score = conviction_weight x exposure_gap x rising_independent_corroboration
Fires when Ten31 believes something (high conviction), has little/no position (exposure gap), and the
world is beginning to corroborate it or a derivative of it — independently and with acceleration. This
is the signal that should have flagged "size up power-infra picks-and-shovels" in 2023.
Exposure is joined LOCALLY (never crosses the frontier boundary, §4.6). Corroboration is RETRIEVED
(stats nominate), then an LLM helper only FILTERS retrieval near-misses (§5.1) — it cannot add claims.
"""
from __future__ import annotations
from .llm_helpers import derivative_relevance
from .windows import windowed_independence
# Multiplier applied per conviction level; score_node falls back to 0.4 for a level not listed here.
CONVICTION_WEIGHT = {"low": 0.15, "med": 0.4, "med-high": 0.7, "high": 1.0}
# Multiplier per exposure bucket (less exposure → larger gap); score_node falls back to 0.6 for an
# unlisted bucket, the same value as "unset".
EXPOSURE_GAP = {"none": 1.0, "lt2": 0.8, "2to10": 0.4, "gt10": 0.1, "unset": 0.6}
def score_node(conn, sc, backend, *, as_of: str, derivative: str, conviction_id: str,
               node_id: str | None, conviction_level: str, exposure: str,
               is_breaker: bool = False, top_k: int = 40, window_days: int = 28) -> dict:
    """Score one conviction / derivative node: conviction_weight x exposure_gap x corroboration.

    Pipeline: retrieve propositions for ``derivative`` (as-of post-filtered), keep only claims the
    LLM helper marks as corroborating AND affirming, then take
    ``max(0, acceleration) * eisc0`` from ``windowed_independence`` over the confirmed set.
    Thesis-breaker nodes use the raw corroboration, skipping the conviction/exposure weights.

    Returns:
        The ``_result`` dict. Score is 0.0 with a ``reason`` input when search fails
        (``search_failed:...``) or nothing usable is retrieved (``no_retrieval``).
    """
    cw = CONVICTION_WEIGHT.get(conviction_level, 0.4)
    eg = EXPOSURE_GAP.get(exposure, 0.6)

    def _emit(score, inputs):
        return _result(conviction_id, node_id, score, inputs, cw, eg, exposure, is_breaker)

    # 1. RETRIEVE (stats nominate): hybrid search over embedded propositions; as-of post-filter.
    try:
        response = sc.search(derivative, collection="propositions", top_k=top_k, rerank=True)
    except Exception as exc:  # noqa: BLE001
        return _emit(0.0, {"reason": f"search_failed:{str(exc)[:60]}"})

    hits = response.get("data", []) if isinstance(response, dict) else []
    retrieved = []
    for hit in hits:
        payload = {}
        if isinstance(hit, dict):
            payload = hit.get("payload") or {}
        stamp = payload.get("date")
        # Qdrant can't date-filter; do it here
        if payload.get("claim_id") and stamp and not stamp[:10] > as_of:
            retrieved.append({"claim_id": payload["claim_id"],
                              "proposition": payload.get("proposition", ""),
                              "date": stamp, "source_id": payload.get("source_id")})
    if not retrieved:
        return _emit(0.0, {"reason": "no_retrieval", "n_retrieved": 0})

    # 2. FILTER near-misses with the LLM (affirms-only). Not a nominator — can't add claims.
    verdicts = derivative_relevance(
        backend, derivative,
        [{"claim_id": c["claim_id"], "proposition": c["proposition"]} for c in retrieved],
    )
    confirmed = []
    for claim in retrieved:
        verdict = verdicts.get(claim["claim_id"], {})
        if verdict.get("corroborates") and verdict.get("direction") == "affirms":
            confirmed.append(claim)
    n_src = len({c["source_id"] for c in confirmed})

    # 3. CORROBORATION = independence-weighted acceleration over the confirmed set (treat as a topic).
    #    window_days matches corpus cadence: ~90d for quarterly filings/earnings, ~28d for weekly podcasts.
    dated_sources = [(c["date"], c["source_id"]) for c in confirmed]
    wi = windowed_independence(conn, dated_sources, as_of, days=window_days)
    a_corrob = wi["acceleration"]
    eisc_corrob = wi["eisc0"]
    corroboration = max(0.0, a_corrob) * eisc_corrob

    if is_breaker:
        score = corroboration
    else:
        score = cw * eg * corroboration

    inputs = {
        "as_of": as_of,
        "derivative": derivative,
        "n_retrieved": len(retrieved),
        "n_confirmed": len(confirmed),
        "n_src": n_src,
        "a_corrob": a_corrob,
        "eisc_corrob": eisc_corrob,
        "k_eff0": wi["k_eff0"],
        "window_counts": wi["counts"],
        "window_eisc": wi["eisc"],
        "corroboration": round(corroboration, 3),
        "confirmed_claim_ids": [c["claim_id"] for c in confirmed][:50],
    }
    return _emit(score, inputs)
def _result(conviction_id, node_id, score, inputs, cw, eg, exposure, is_breaker) -> dict:
    """Assemble the scorer's result record.

    Copies ``inputs`` (the caller's dict is not mutated), adds the weighting context
    (``conviction_weight``, ``exposure_gap``, ``exposure``, ``is_breaker``) and rounds the score
    to 4 decimal places.
    """
    enriched = dict(inputs)
    enriched.update(conviction_weight=cw, exposure_gap=eg, exposure=exposure, is_breaker=is_breaker)
    record = {"scorer": "under_acted", "conviction_id": conviction_id, "node_id": node_id}
    record["score"] = round(float(score), 4)
    record["inputs"] = enriched
    return record
+53
View File
@@ -0,0 +1,53 @@
"""Temporal windows + windowed independence (the single temporal layer, §4.4).
28-day non-overlapping windows anchored at as_of (W0 ends at as_of, then back). Non-overlapping
avoids autocorrelation faking significance. The signal is the discrete 2nd derivative of the
INDEPENDENCE-WEIGHTED flow (EISC per window), never the raw count — so a topic that "accelerates"
only because one show booked the same guest three times has flat N(W).
"""
from __future__ import annotations
from datetime import datetime, timedelta
from .independence import eisc_for
WINDOW_DAYS = 28
N_WINDOWS = 3


def _d(s: str) -> datetime:
    """Parse the leading ``YYYY-MM-DD`` of ``s``; anything after the 10th character is ignored."""
    date_part = s[:10]
    return datetime.strptime(date_part, "%Y-%m-%d")


def window_bounds(as_of: str, *, n: int = N_WINDOWS, days: int = WINDOW_DAYS) -> list[tuple[int, str, str]]:
    """Return ``[(idx, start_iso, end_iso)]`` for ``n`` back-to-back windows of ``days`` days.

    W0 ends at ``as_of``; each later index sits one window further back, so window ``idx`` ends
    where window ``idx + 1`` starts. Nothing extends past ``as_of``.
    """
    anchor = _d(as_of)

    def _iso(days_back: int) -> str:
        return (anchor - timedelta(days=days_back)).strftime("%Y-%m-%d")

    return [(idx, _iso((idx + 1) * days), _iso(idx * days)) for idx in range(n)]
def windowed_independence(conn, rows: list[tuple], as_of: str, *, n: int = N_WINDOWS,
                          days: int = WINDOW_DAYS) -> dict:
    """Per-window raw counts and EISC for ``rows`` = ``[(date_iso, source_id)]``.

    A row belongs to a window when ``start < date[:10] <= end``; rows with a falsy date are
    ignored. EISC is computed over the window's distinct sources (0.0 / k_eff 0 when empty).

    Returns:
        ``{counts, eisc, k_eff, acceleration, eisc0, k_eff0, sources0, n_total}`` where
        ``acceleration = N0 - 2*N1 + N2`` over the per-window EISC (the independence-weighted
        2nd derivative), or 0.0 when fewer than three windows are requested.
    """
    per_window = []
    for _idx, start, end in window_bounds(as_of, n=n, days=days):
        members = [r for r in rows if r[0] and start < r[0][:10] <= end]
        sources = list({r[1] for r in members})
        if sources:
            stats = eisc_for(conn, sources)
        else:
            stats = {"eisc_adj": 0.0, "k_eff": 0}
        per_window.append((len(members), stats["eisc_adj"], stats["k_eff"], sources))

    counts = [w[0] for w in per_window]
    eiscs = [w[1] for w in per_window]
    keffs = [w[2] for w in per_window]
    src_sets = [w[3] for w in per_window]

    if n >= 3:
        accel = eiscs[0] - 2 * eiscs[1] + eiscs[2]
    else:
        accel = 0.0

    return {
        "counts": counts,
        "eisc": [round(value, 3) for value in eiscs],
        "k_eff": keffs,
        "acceleration": round(accel, 3),
        "eisc0": round(eiscs[0], 3),
        "k_eff0": keffs[0],
        "sources0": src_sets[0],
        "n_total": sum(counts),
    }
+9
View File
@@ -0,0 +1,9 @@
"""Spark Control gateway client — the SINGLE chokepoint for all gateway HTTP (§13).
No other module in the engine knows the gateway URL. Everything local-compute
(transcription, diarization, the local LLM, embeddings, rerank, hybrid search, and the
scrub/rehydrate sovereignty boundary) goes through here.
"""
from .client import SparkControl, SparkControlError, from_config
__all__ = ["SparkControl", "SparkControlError", "from_config"]
+242
View File
@@ -0,0 +1,242 @@
"""Spark Control HTTP client (handoff §13.2 endpoint table).
Enforces the two operational invariants from §4.1 / §13.4 (revised per infra guidance 2026-06-09):
1. AUDIO concurrency is CAPPED at 2 in-flight (hard ceiling 3), GLOBAL across both parakeet
endpoints (/v1/audio/transcriptions + /api/audio/diarize*) — they share ONE serial GPU. A
process-wide BoundedSemaphore enforces it. Going wider buys ZERO throughput (requests queue and
hold the GPU); 2 just keeps the GPU continuously fed with no idle gap = full throughput.
2. Transient unresponsiveness is NORMAL, not failure: when the GPU stays continuously busy the
/health and in-flight requests can briefly (1-4s) stop responding. Timeouts / 503s /
connection-resets are "busy, retry" — handled by short exponential backoff, never treated as work loss.
NOTE: request/response *shapes* for the non-OpenAI endpoints (/api/audio/*, /scrub,
/rehydrate, /api/search) are provisional and marked TODO(contract) — confirm against the
live gateway's /api/endpoints. The OpenAI-compatible routes (/v1/*) follow the standard.
"""
from __future__ import annotations
import logging
import threading
import time
from pathlib import Path
from typing import Any
import requests
log = logging.getLogger(__name__)
# Process-wide AUDIO in-flight cap, GLOBAL across both parakeet endpoints. Single serial GPU shared
# with the operator's production app → concurrency only deepens the queue + lengthens transient
# busy-blips; sit at 2 (full throughput, ~2-3s busy windows), hard ceiling 3.
_AUDIO_MAX = 3
_AUDIO_SEM = threading.BoundedSemaphore(2)


def _set_audio_concurrency(n: int) -> None:
    """Replace the global audio semaphore with one sized ``n``, clamped to ``[1, _AUDIO_MAX]``.

    This REBINDS ``_AUDIO_SEM`` rather than resizing it in place; it is called at client init from
    config, before any worker threads start, so the rebind is not racing in-flight acquirers.
    """
    global _AUDIO_SEM
    wanted = int(n)
    at_least_one = max(1, wanted)
    capped = min(_AUDIO_MAX, at_least_one)
    _AUDIO_SEM = threading.BoundedSemaphore(capped)
class SparkControlError(RuntimeError):
    """Raised for Spark Control gateway failures (a 503 response, or a POST that exhausted its retries)."""
class SparkControl:
    """HTTP client for the Spark Control gateway.

    Wraps one ``requests.Session`` and exposes a method per gateway route used by the engine:
    chat, embeddings, rerank, hybrid search, the three audio routes, and scrub/rehydrate. Every
    POST goes through ``_post`` (retry with exponential backoff). Audio calls additionally hold
    the module-level ``_AUDIO_SEM`` for their whole duration, retries included.
    """

    def __init__(
        self,
        base_url: str,
        *,
        verify_tls: bool = False,
        timeout: float = 120.0,
        llm_model: str = "",
        embed_model: str = "",
        transcribe_model: str = "",
        audio_concurrency: int = 2,
    ) -> None:
        """Store connection settings and open the shared session.

        ``audio_concurrency`` re-sizes the process-wide audio semaphore via
        ``_set_audio_concurrency``, so it affects every client in the process.
        """
        self.base = base_url.rstrip("/")
        self.verify = verify_tls
        self.timeout = timeout
        self.llm_model = llm_model
        self.embed_model = embed_model
        self.transcribe_model = transcribe_model
        _set_audio_concurrency(audio_concurrency)
        self._session = requests.Session()
        if not verify_tls:
            # same-LAN self-signed cert (§13): suppress the per-request InsecureRequestWarning noise.
            import urllib3
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # ---------- low-level ----------
    @staticmethod
    def _upload_marks(files: Any) -> list[tuple[Any, int]]:
        """Return ``(stream, offset)`` for every seekable file object in a ``files=`` payload.

        ``files`` may be a mapping or a list of ``(field, spec)`` pairs, where ``spec`` is either
        a bare file object or a ``(filename, fileobj, ...)`` tuple. Payloads that cannot report a
        position (bytes, str, unseekable streams) are skipped.
        """
        if not files:
            return []
        entries = files.items() if hasattr(files, "items") else files
        marks: list[tuple[Any, int]] = []
        for _field, spec in entries:
            stream = spec[1] if isinstance(spec, (tuple, list)) and len(spec) > 1 else spec
            try:
                marks.append((stream, stream.tell()))
            except (AttributeError, OSError, ValueError):
                continue  # nothing to rewind for this part
        return marks

    def _post(
        self,
        path: str,
        *,
        json: Any = None,
        files: Any = None,
        data: Any = None,
        retries: int = 4,
        backoff: float = 5.0,
    ) -> Any:
        """POST to ``path`` and return the decoded JSON body, retrying transient failures.

        A 503 and any ``requests.RequestException`` (timeouts, connection resets, and the HTTP
        errors raised by ``raise_for_status``) are retried up to ``retries`` times, sleeping
        ``backoff * 2**attempt`` seconds between attempts.

        Raises:
            SparkControlError: when the final attempt also fails (chained to the last error).
        """
        url = f"{self.base}{path}"
        # Sending a multipart upload consumes the file object. Remember where each stream started
        # so a retry re-sends the same bytes instead of an empty / truncated file.
        marks = self._upload_marks(files)
        for attempt in range(retries + 1):
            if attempt:
                for stream, offset in marks:
                    stream.seek(offset)
            try:
                r = self._session.post(
                    url, json=json, files=files, data=data,
                    timeout=self.timeout, verify=self.verify,
                )
                if r.status_code == 503:
                    raise SparkControlError("503 from Spark Control (GPU busy / cold start)")
                r.raise_for_status()
                return r.json()
            except (requests.RequestException, SparkControlError) as e:
                if attempt < retries:
                    sleep = backoff * (2 ** attempt)
                    log.warning("Spark Control POST %s failed (%s); retry %d/%d in %.0fs",
                                path, e, attempt + 1, retries, sleep)
                    time.sleep(sleep)
                else:
                    raise SparkControlError(f"POST {path} failed after {retries} retries: {e}") from e

    def _get(self, path: str) -> Any:
        """GET ``path`` once (no retry) and return the decoded JSON body.

        Unlike ``_post``, failures surface as the underlying ``requests`` exception.
        """
        r = self._session.get(f"{self.base}{path}", timeout=self.timeout, verify=self.verify)
        r.raise_for_status()
        return r.json()

    # ---------- health / discovery (§13.2) ----------
    def status(self) -> Any:
        """Return the gateway's ``/api/status`` document."""
        return self._get("/api/status")

    def endpoints(self) -> Any:
        """Return the gateway's ``/api/endpoints`` listing."""
        return self._get("/api/endpoints")

    # ---------- local LLM: extraction + scoring helpers (§4.2) ----------
    def chat(
        self,
        messages: list[dict[str, str]],
        *,
        json_object: bool = True,
        temperature: float = 0.0,
        enable_thinking: bool = False,
        max_tokens: int | None = None,
    ) -> Any:
        """Deterministic, no-chain-of-thought extraction per §4.2 (temp 0, thinking off,
        JSON mode for guaranteed-valid JSON).

        ``max_tokens`` is only sent when truthy, so ``None`` and ``0`` both leave the gateway
        default in place.
        """
        body: dict[str, Any] = {
            "model": self.llm_model,
            "messages": messages,
            "temperature": temperature,
            "chat_template_kwargs": {"enable_thinking": enable_thinking},
        }
        if json_object:
            body["response_format"] = {"type": "json_object"}
        if max_tokens:
            body["max_tokens"] = max_tokens
        return self._post("/v1/chat/completions", json=body)

    # ---------- embeddings / rerank / hybrid search (§4.3) ----------
    def embed(self, inputs: list[str]) -> Any:
        """Embed DISTILLED PROPOSITIONS, not raw chunks (§4.3)."""
        return self._post("/v1/embeddings", json={"model": self.embed_model, "input": inputs})

    def rerank(self, query: str, documents: list[str], *, top_n: int | None = None) -> Any:
        """Rerank ``documents`` against ``query``; ``top_n`` is only sent when truthy."""
        body: dict[str, Any] = {"query": query, "documents": documents}
        if top_n:
            body["top_n"] = top_n
        return self._post("/v1/rerank", json=body)

    def search(
        self,
        query: str,
        *,
        collection: str,
        top_k: int = 10,
        retrieve_n: int | None = None,
        rerank: bool = True,
        filter: dict[str, Any] | None = None,
        with_payload: bool = True,
        min_score: float | None = None,
        dense_vector_name: str = "bge_m3",
        sparse_vector_name: str = "bm25",
        text_field: str = "proposition",
    ) -> Any:
        """Hybrid dense+sparse retrieval (RRF) + optional rerank over a Qdrant collection (§4.3).

        The gateway defaults vector names to 'dense'/'sparse'; our `propositions` collection uses
        named vectors bge_m3/bm25, so they must be passed explicitly (confirmed live).
        ``retrieve_n``, ``filter`` and ``min_score`` are omitted from the request when None.
        """
        body: dict[str, Any] = {
            "query": query, "collection": collection, "top_k": top_k,
            "rerank": rerank, "with_payload": with_payload,
            "dense_vector_name": dense_vector_name,
            "sparse_vector_name": sparse_vector_name,
            "text_field": text_field,
        }
        if retrieve_n is not None:
            body["retrieve_n"] = retrieve_n
        if filter is not None:
            body["filter"] = filter
        if min_score is not None:
            body["min_score"] = min_score
        return self._post("/api/search", json=body)

    # ---------- audio: capped at 2 in-flight GLOBAL (semaphore), short busy-retry ----------
    # backoff=1.5 → ~1.5/3/6/12/24s: tuned to ride out the 1-4s busy-blips, not the old 5-40s.
    def transcribe(self, audio_path: str | Path, *, response_format: str = "verbose_json") -> Any:
        """Upload ``audio_path`` to ``/v1/audio/transcriptions`` while holding the audio semaphore."""
        with _AUDIO_SEM, open(audio_path, "rb") as f:
            return self._post(
                "/v1/audio/transcriptions",
                files={"file": f},
                data={"model": self.transcribe_model, "response_format": response_format},
                retries=5, backoff=1.5,
            )

    def diarize_chunk(self, audio_path: str | Path) -> Any:
        """Upload ``audio_path`` to ``/api/audio/diarize-chunk`` while holding the audio semaphore."""
        # TODO(contract): confirm /api/audio/diarize-chunk response shape (segments + 192-d voiceprint).
        with _AUDIO_SEM, open(audio_path, "rb") as f:
            return self._post("/api/audio/diarize-chunk", files={"file": f}, retries=5, backoff=1.5)

    def transcribe_with_speakers(self, audio_path: str | Path) -> Any:
        """Upload ``audio_path`` to ``/api/audio/transcribe-with-speakers`` under the audio semaphore."""
        with _AUDIO_SEM, open(audio_path, "rb") as f:
            return self._post("/api/audio/transcribe-with-speakers", files={"file": f}, retries=5, backoff=1.5)

    # ---------- frontier sovereignty boundary (§4.6) ----------
    # Confirmed contract (gateway /openapi.json):
    #   /scrub:     task_id*, items*, known_entities, actor, tier1_action, bucket, ner, map_handle
    #   /rehydrate: task_id*, map_handle*, items*, actor, strict
    # De-identifies IDENTITIES into stable placeholders; the de-anon map stays on the box and is
    # referenced by `map_handle`. Exposure/position data must NEVER be sent here at all (§4.6).
    def scrub(
        self,
        items: list[Any],
        *,
        task_id: str,
        known_entities: dict[str, str] | None = None,
        actor: str | None = None,
        ner: bool = True,
    ) -> Any:
        """Returns the scrubbed items + a `map_handle` to pass to rehydrate. `known_entities` is the
        caller-supplied dictionary (Strike→[FUND_1]); `ner` toggles the local-Qwen NER backstop.
        ``known_entities`` and ``actor`` are omitted from the request when None."""
        body: dict[str, Any] = {"task_id": task_id, "items": items, "ner": ner}
        if known_entities is not None:
            body["known_entities"] = known_entities
        if actor is not None:
            body["actor"] = actor
        return self._post("/scrub", json=body)

    def rehydrate(self, items: list[Any], *, task_id: str, map_handle: str, strict: bool = False) -> Any:
        """Restore real identities in the frontier's output locally, using the scrub `map_handle`."""
        return self._post("/rehydrate", json={
            "task_id": task_id, "map_handle": map_handle, "items": items, "strict": strict,
        })
def from_config(cfg: Any) -> SparkControl:
    """Build a ``SparkControl`` client from a config object's attributes.

    ``audio_concurrency`` is optional on ``cfg`` and defaults to 2; every other attribute read
    here must exist.
    """
    base_url = cfg.spark_control_url
    options = {
        "verify_tls": cfg.spark_verify_tls,
        "timeout": cfg.spark_timeout_s,
        "llm_model": cfg.local_llm_model,
        "embed_model": cfg.embed_model,
        "transcribe_model": cfg.transcribe_model,
        "audio_concurrency": getattr(cfg, "audio_concurrency", 2),
    }
    return SparkControl(base_url, **options)
+4
View File
@@ -0,0 +1,4 @@
"""Persistence layer: SQLite (metadata, ledger, conviction log, graph, queue).
Qdrant (vectors) is reached via the Spark Control gateway; see signal_engine.spark.
"""
+81
View File
@@ -0,0 +1,81 @@
"""SQLite connection + schema initialization. Boring and inspectable (§5)."""
from __future__ import annotations
import sqlite3
from pathlib import Path
SCHEMA_FILE = Path(__file__).with_name("schema.sql")
def connect(db_path: Path) -> sqlite3.Connection:
    """Open (creating parent directories as needed) the SQLite database at ``db_path``.

    The connection uses ``sqlite3.Row`` rows, enforces foreign keys, and waits up to 30 s on a
    locked database instead of failing.
    """
    target = Path(db_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    connection = sqlite3.connect(str(target), timeout=30)
    connection.row_factory = sqlite3.Row
    for pragma in (
        "PRAGMA foreign_keys = ON",
        "PRAGMA busy_timeout = 30000",  # wait, don't fail, under concurrent backfill writers
    ):
        connection.execute(pragma)
    return connection
# Additive migrations for DBs created before a column existed (CREATE IF NOT EXISTS won't add columns).
# Shape: {table name: {column name: SQL type}}. _migrate adds each column that PRAGMA table_info
# does not already report; both names are interpolated into SQL, so keep them trusted literals.
_MIGRATIONS = {
"documents": {"content_hash": "TEXT", "processed_at": "TEXT", "dedup_key": "TEXT"},
# DESIGN_v2.1 condition 1: own_network = the Ten31 orbit (Odell/Bent partners etc.) — listening to
# ourselves. Quarantined: a TEST FIXTURE for the reflexivity case, DROPPED in live EISC scoring.
"sources": {"backtest_2022_2023": "TEXT", "own_network": "INTEGER"},
# DESIGN_v2.1: tag derivatives by distance-from-edge for TRIAGE — surfaced, NEVER used as a filter
# (an engine that pre-filters to in-mandate reproduces the AI/compute mandate-expansion miss).
"fanout_nodes": {"distance_from_edge": "TEXT"},
}
def _widen_cluster_check(conn: sqlite3.Connection) -> None:
    """Add 'banks'/'credit'/'fintech' to sources.source_cluster's CHECK.

    SQLite can't ALTER a CHECK, so the (tiny) table is rebuilt via the standard table-swap:
    create ``sources_new`` from the stored DDL with the widened list, copy rows by value, drop
    the old table, rename. Idempotent: a no-op when the table is missing, already lists
    ``'banks'``, or has no ``source_cluster IN (...)`` clause to widen.

    The swap runs in ONE transaction, so a failure rolls back to the original table instead of
    leaving a half-built ``sources_new`` behind. Foreign keys are switched OFF around the swap
    (DROP would otherwise fail on inbound FKs) and back ON afterwards; the PRAGMA only takes
    effect outside a transaction, which is why the rollback happens before it is re-enabled.
    busy_timeout (set in connect) lets it wait out concurrent backfill writers.
    """
    import re

    row = conn.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name='sources'").fetchone()
    if not row or "'banks'" in row[0]:
        return
    new_list = ("('macro','ai_tech','energy','bitcoin','vc_consensus','generalist',"
                "'banks','credit','fintech')")
    new_ddl, replaced = re.subn(r"source_cluster IN\s*\([^)]*\)", f"source_cluster IN {new_list}",
                                row[0], count=1)
    if not replaced:
        return  # no CHECK on source_cluster: rebuilding the table would change nothing
    new_ddl = new_ddl.replace("CREATE TABLE sources", "CREATE TABLE sources_new", 1)

    conn.commit()  # close any implicit txn before toggling FK pragma
    conn.execute("PRAGMA foreign_keys=OFF")
    try:
        if not conn.in_transaction:
            conn.execute("BEGIN")  # make CREATE + copy + DROP + RENAME all-or-nothing
        # A crashed earlier run may have left the scratch table behind; start clean.
        conn.execute("DROP TABLE IF EXISTS sources_new")
        conn.execute(new_ddl)
        conn.execute("INSERT INTO sources_new SELECT * FROM sources")
        conn.execute("DROP TABLE sources")
        conn.execute("ALTER TABLE sources_new RENAME TO sources")
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.execute("PRAGMA foreign_keys=ON")
def _migrate(conn: sqlite3.Connection) -> None:
    """Apply the additive migrations: missing columns, their indexes, then the CHECK widening.

    Columns listed in ``_MIGRATIONS`` are added only when ``PRAGMA table_info`` does not already
    report them, so re-running is harmless.
    """
    for table, wanted in _MIGRATIONS.items():
        present = {info[1] for info in conn.execute(f"PRAGMA table_info({table})")}
        missing = [(col, typ) for col, typ in wanted.items() if col not in present]
        for col, typ in missing:
            conn.execute(f"ALTER TABLE {table} ADD COLUMN {col} {typ}")

    # indexes on migrated columns (created here so they work on DBs predating the column)
    for index_sql in (
        "CREATE INDEX IF NOT EXISTS idx_documents_content_hash ON documents(content_hash)",
        "CREATE INDEX IF NOT EXISTS idx_documents_dedup_key ON documents(dedup_key)",
    ):
        conn.execute(index_sql)
    conn.commit()
    _widen_cluster_check(conn)
def init_db(conn: sqlite3.Connection) -> None:
    """Idempotent: CREATE ... IF NOT EXISTS + additive column migrations.

    Runs the whole of ``schema.sql`` as a script, commits, then applies ``_migrate``.
    """
    # The schema file contains non-ASCII characters (—, §); decode it explicitly as UTF-8 instead of
    # relying on the platform's locale encoding.
    conn.executescript(SCHEMA_FILE.read_text(encoding="utf-8"))
    conn.commit()
    _migrate(conn)
def table_names(conn: sqlite3.Connection) -> list[str]:
    """Return the names of all tables and views in the database, sorted by name."""
    query = "SELECT name FROM sqlite_master WHERE type IN ('table','view') ORDER BY name"
    return [entry[0] for entry in conn.execute(query).fetchall()]
+280
View File
@@ -0,0 +1,280 @@
-- Ten31 Signal Engine — SQLite schema (pilot)
-- Source of truth: ten31-signal-engine-handoff.md §4 (pipeline layers), §6.7 (ledger),
-- §3.1 (conviction log), §13.4 (backfill queue).
-- Design principle (§5, §10): boring, inspectable tables. The whole system state is a SELECT away.
-- Connection/database settings applied every time the schema script runs.
-- NOTE(review): in SQLite, foreign_keys is a per-connection setting, so this line only affects the
-- connection that executes this script — confirm db.connect() enables it for every connection.
PRAGMA journal_mode = WAL;
PRAGMA foreign_keys = ON;
-- ============================================================================
-- CANONICAL TOPIC VOCABULARY (§4.2) — HYBRID (operator decision):
-- seeded controlled list + emergent topics merged in on a schedule.
-- ============================================================================
-- One row per canonical topic label; claims.topic_canonical references this table.
CREATE TABLE IF NOT EXISTS topics (
topic_canonical TEXT PRIMARY KEY,
status TEXT CHECK (status IN ('controlled','emergent','merged')) DEFAULT 'emergent',
merged_into TEXT REFERENCES topics(topic_canonical), -- self-reference; presumably the surviving topic when status='merged' — confirm
seam TEXT, -- NOTE(review): free text here (no CHECK), unlike claims.thesis_seam — confirm the intended vocabulary
created_at TEXT DEFAULT (datetime('now'))
);
-- ============================================================================
-- SOURCES & DOCUMENTS (§4.1)
-- ============================================================================
-- One row per corpus source (company or show). The source_cluster list below must stay in sync with
-- the widened list that db._widen_cluster_check writes into pre-existing databases.
-- NOTE(review): the source-registry loader also writes an `own_network` column that is not declared
-- here — presumably added by the column migrations in db._migrate; confirm a fresh DB gets it.
CREATE TABLE IF NOT EXISTS sources (
source_id TEXT PRIMARY KEY,
name TEXT NOT NULL,
kind TEXT NOT NULL CHECK (kind IN ('podcast','youtube','filing','earnings_call')),
source_cluster TEXT CHECK (source_cluster IN
('macro','ai_tech','energy','bitcoin','vc_consensus','generalist','banks','credit','fintech')),
role TEXT CHECK (role IN ('CB','IND','DX','none')) DEFAULT 'none', -- §7.4
rss_url TEXT,
channel_url TEXT,
ticker TEXT,
-- §8 credibility: neutral prior that DECAYS in favor of earned track record from the ledger.
bootstrap_prior REAL DEFAULT 1.0,
earned_credibility REAL,
cluster_capped_low INTEGER DEFAULT 0, -- §4.5 bitcoin cluster deliberately under-weighted
backtest_2022_2023 TEXT, -- §7.1 reach: rss_full | rss_2023_only | youtube_only | launched_later | unavailable
notes TEXT,
created_at TEXT DEFAULT (datetime('now'))
);
-- One row per ingested item (episode / video / filing / call), owned by a source.
-- NOTE(review): `kind` has no CHECK here, unlike sources.kind — the allowed values are only listed
-- in the column comment.
CREATE TABLE IF NOT EXISTS documents (
doc_id TEXT PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES sources(source_id),
kind TEXT NOT NULL, -- podcast|youtube|filing|earnings_call
external_id TEXT, -- rss guid / yt video id / EDGAR accession / transcript id
url TEXT,
title TEXT,
date TEXT, -- ISO publication/filing date
duration_sec REAL,
raw_path TEXT, -- downloaded audio / raw filing
transcript_path TEXT,
-- DEDUP MODEL (layered):
-- (1) UNIQUE(source_id, external_id) below = the ROBUST guard. external_id is the stable item id
-- (RSS GUID / YouTube video id / EDGAR accession). Checked at ingest, BEFORE any GPU work.
-- (2) dedup_key = normalized title+date → catches the SAME episode arriving via a different
-- feed/mirror (different external_id). Computed pre-transcription. NOT from the transcript.
-- content_hash is ONLY an audit fingerprint of the transcript (did a re-run change?) — it is NOT
-- a dedup key (ASR is non-deterministic, so one differing word flips the hash).
dedup_key TEXT,
content_hash TEXT,
processed_at TEXT, -- set when transcription/extraction completes
ingested_at TEXT DEFAULT (datetime('now')),
-- NOTE(review): SQLite treats NULLs as distinct in UNIQUE constraints, so rows whose external_id is
-- NULL are NOT deduplicated by the guard below — only dedup_key can catch those.
UNIQUE (source_id, external_id) -- idempotent ingest (§13.4 dedup)
);
-- indexes for dedup_key / content_hash are created in db._migrate (after columns exist on older DBs).
-- ============================================================================
-- CLAIMS / PROPOSITIONS (§4.2) — the atomic unit of the whole system.
-- One passage emits 0..N claims; MOST of a podcast hour is 0 (§4.2). The
-- extractor must be willing to find nothing.
-- NOTE: thesis_seam is a TAG, never a hard filter (§5.7) — off-thesis &
-- anti-thesis claims MUST survive.
-- ============================================================================
-- One row per extracted proposition. source_id is stored on the claim as well as on its document,
-- so per-source queries do not need to join through documents.
CREATE TABLE IF NOT EXISTS claims (
claim_id TEXT PRIMARY KEY,
doc_id TEXT NOT NULL REFERENCES documents(doc_id),
source_id TEXT NOT NULL REFERENCES sources(source_id),
proposition TEXT NOT NULL, -- normalized subject-assertion-object
topic_canonical TEXT REFERENCES topics(topic_canonical),
topic_raw TEXT,
claimant TEXT,
source_cluster TEXT,
date TEXT,
claim_type TEXT CHECK (claim_type IN ('interpretive','predictive','descriptive','reactive')),
time_horizon TEXT CHECK (time_horizon IN ('near','medium','long','unspecified')),
confidence TEXT CHECK (confidence IN ('low','med','high')),
-- §4.2 relation: stance is EXTRACTED, never inferred from vector distance (§2.2/§5.3).
rel_target_claim_id TEXT REFERENCES claims(claim_id),
rel_polarity TEXT CHECK (rel_polarity IN ('affirms','denies','qualifies','none')) DEFAULT 'none',
engages_consensus INTEGER DEFAULT 0,
counters_position TEXT,
thesis_seam TEXT CHECK (thesis_seam IN
('energy_compute','debasement_bitcoin','ai_data_ownership','none')) DEFAULT 'none',
salience TEXT CHECK (salience IN ('central','secondary','aside')) DEFAULT 'secondary',
qdrant_point_id TEXT, -- link to the embedded proposition vector (§4.3)
extracted_at TEXT DEFAULT (datetime('now'))
);
CREATE INDEX IF NOT EXISTS idx_claims_topic ON claims(topic_canonical);
CREATE INDEX IF NOT EXISTS idx_claims_date ON claims(date);
CREATE INDEX IF NOT EXISTS idx_claims_seam ON claims(thesis_seam);
CREATE INDEX IF NOT EXISTS idx_claims_type ON claims(claim_type);
-- The web UI counts and lists claims per source (one lookup per source on the corpus page);
-- without this index each of those lookups is a full scan of claims.
CREATE INDEX IF NOT EXISTS idx_claims_source ON claims(source_id);
-- ============================================================================
-- SOURCE-INDEPENDENCE GRAPH (§4.5) — discount convergence by connectedness.
-- Cross-cluster convergence = gold; within-cluster = near-noise.
-- ============================================================================
-- One row per (pair of sources, edge type). The edge is treated as undirected by convention: the
-- seed loader stores the pair in sorted order so a reversed duplicate cannot be created. The schema
-- itself does not enforce that ordering.
CREATE TABLE IF NOT EXISTS source_edges (
src_a TEXT NOT NULL REFERENCES sources(source_id),
src_b TEXT NOT NULL REFERENCES sources(source_id),
edge_type TEXT NOT NULL CHECK (edge_type IN ('shared_guest','citation','community')),
weight REAL DEFAULT 1.0,
evidence TEXT, -- voiceprint_id / show-note ref / url
updated_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY (src_a, src_b, edge_type)
);
-- ============================================================================
-- VOICEPRINT LIBRARY (§4.5, §4.1) — same-guest-across-shows BY VOICE.
-- 192-dim TitaNet voiceprints; cosine ~0.7 distance threshold for same speaker.
-- This is the highest-leverage automated input to the independence graph.
-- ============================================================================
-- One row per distinct voiceprint; individual sightings live in voiceprint_observations.
CREATE TABLE IF NOT EXISTS voiceprints (
voiceprint_id TEXT PRIMARY KEY,
vector BLOB NOT NULL, -- 192 x float32
person_label TEXT, -- resolved name if known
first_doc_id TEXT REFERENCES documents(doc_id), -- presumably the document where this voice was first enrolled — confirm
first_seen TEXT DEFAULT (datetime('now'))
);
-- One row per sighting of a known voiceprint inside a document.
-- NOTE(review): segment_start/segment_end are presumably offsets in seconds and chunk_idx the audio
-- chunk number — confirm against the transcribe worker. No index on voiceprint_id or doc_id.
CREATE TABLE IF NOT EXISTS voiceprint_observations (
obs_id INTEGER PRIMARY KEY AUTOINCREMENT,
voiceprint_id TEXT NOT NULL REFERENCES voiceprints(voiceprint_id),
doc_id TEXT NOT NULL REFERENCES documents(doc_id),
chunk_idx INTEGER,
segment_start REAL,
segment_end REAL
);
-- ============================================================================
-- CONVICTION LOG (§3.1) — human-owned seed nodes for Job B.
-- Structural rule (§3.1): separate the TRACKABLE thematic proposition (corpus
-- can corroborate) from TEAM conviction (context only). The engine must NEVER
-- present theme corroboration as validation of the team bet beneath it.
-- Exposure scored as coarse NAV bands (operator decision): none | lt2 | 2to10 | gt10 | unset.
-- ============================================================================
-- One row per conviction. Rows are upserted from the human-edited YAML seed, keyed on conviction_id;
-- the loader overwrites every column below except conviction_id and refreshes updated_at.
CREATE TABLE IF NOT EXISTS conviction_log (
conviction_id TEXT PRIMARY KEY, -- R1, E1, A1, B1 ...
seam TEXT, -- root|energy_compute|debasement_bitcoin|ai_data_ownership
thematic_proposition TEXT NOT NULL, -- the TRACKABLE half
team_conviction_note TEXT, -- context ONLY, never scored as theme validation
conviction_level TEXT CHECK (conviction_level IN ('low','med','med-high','high')),
current_exposure TEXT CHECK (current_exposure IN ('none','lt2','2to10','gt10','unset')) DEFAULT 'unset',
exposure_note TEXT, -- original §3.1 prose ("pervasive", "MED-HIGH") pending NAV-band finalization
disconfirming_signal TEXT,
is_thesis_breaker INTEGER DEFAULT 0, -- §3.1 B1-B3: engine must surface these AGAINST the thesis (§5.7)
updated_at TEXT DEFAULT (datetime('now'))
);
-- Conviction fan-out tree (§4.6). A derivative is a HYPOTHESIS until independent
-- corpus corroboration AND the exposure gap both clear the bar — then 'signal'.
-- One row per derivative node. The seed loader inserts new nodes as 'hypothesis' and never
-- overwrites status (or depth) when a node_id already exists.
-- NOTE(review): the seed loader also writes a `distance_from_edge` column that is not declared
-- here — presumably added by the column migrations in db._migrate; confirm a fresh DB gets it.
CREATE TABLE IF NOT EXISTS fanout_nodes (
node_id TEXT PRIMARY KEY,
parent_conviction_id TEXT REFERENCES conviction_log(conviction_id),
parent_node_id TEXT REFERENCES fanout_nodes(node_id),
derivative_proposition TEXT NOT NULL,
depth INTEGER DEFAULT 1,
status TEXT CHECK (status IN ('hypothesis','corroborated','signal')) DEFAULT 'hypothesis',
created_at TEXT DEFAULT (datetime('now'))
);
-- ============================================================================
-- DUAL-EVALUATION LEDGER (§4.7, §6) — START DAY ONE; the clock can't be backfilled.
-- Log EVERY candidate that clears the quantitative bar (§6.6 — you need a denominator).
-- ============================================================================
-- One row per logged candidate signal. Only type, proposition and date_logged are NOT NULL; the
-- outcome columns carry a CHECK but may stay NULL.
CREATE TABLE IF NOT EXISTS ledger (
signal_id TEXT PRIMARY KEY,
type TEXT NOT NULL CHECK (type IN ('theme','event','under_acted_conviction')),
proposition TEXT NOT NULL,
date_logged TEXT NOT NULL DEFAULT (datetime('now')),
discourse_metric TEXT, -- JSON: acceleration, cross-cluster source set, independence-discounted count
external_check TEXT, -- JSON: resolution spec / nested clean events the model proposed (§6.5)
resolution_date TEXT,
discourse_outcome TEXT CHECK (discourse_outcome IN
('up_cross_cluster','up_single_cluster','flat','down')),
external_outcome TEXT CHECK (external_outcome IN
('correct','partial','wrong','unresolved_expired','too_early')),
lead_time_days INTEGER, -- §6.3 THE alpha measurement (to the DERIVATIVE node for Job B)
model_confidence REAL, -- §6.7 logged ONLY to measure its uselessness — NEVER fed into scoring
origin_conviction_id TEXT REFERENCES conviction_log(conviction_id), -- Job B traceability
origin_node_id TEXT REFERENCES fanout_nodes(node_id)
);
CREATE INDEX IF NOT EXISTS idx_ledger_type ON ledger(type);
CREATE INDEX IF NOT EXISTS idx_ledger_logged ON ledger(date_logged);
-- Human eval on a SEPARATE write path (§6.7): "keep them in separate columns and do not let the
-- model see Grant's rating before it logs its prediction." The model-facing code reads `ledger`;
-- ONLY the eval UI writes here. A separate table makes that separation structural, not a convention.
-- At most one human rating per signal: signal_id is both the PRIMARY KEY and a reference to ledger.
-- NOTE(review): grant_rating and non_obvious have no CHECK, so the ranges in the comments below are
-- not enforced by the schema.
CREATE TABLE IF NOT EXISTS human_evaluations (
signal_id TEXT PRIMARY KEY REFERENCES ledger(signal_id),
grant_rating INTEGER, -- "non-obvious and relevant to me?" (e.g. 1-5)
non_obvious INTEGER, -- 0/1
notes TEXT,
rated_at TEXT DEFAULT (datetime('now'))
);
-- Reporting view — the valuable cell is DISAGREEMENT (§6.7). Used for analysis, NOT by the model path.
-- Every ledger row with its human rating alongside. LEFT JOIN keeps unrated signals, for which the
-- human columns (grant_rating, non_obvious, grant_notes, rated_at) are NULL.
CREATE VIEW IF NOT EXISTS v_ledger_eval AS
SELECT l.*, h.grant_rating, h.non_obvious, h.notes AS grant_notes, h.rated_at
FROM ledger l LEFT JOIN human_evaluations h ON h.signal_id = l.signal_id;
-- ============================================================================
-- BACKFILL QUEUE (§13.4) — client-side, measured in GPU-HOURS.
-- Extraction (one LLM pass per chunk over the whole corpus) is the HEAVIER serial load.
-- Audio is SEQUENTIAL (parallel → 503). Leases give crash-safe resumability.
-- ============================================================================
-- One row per unit of backfill work. UNIQUE(job_type, input_hash) is the idempotency guard: the
-- same content under the same job type cannot be queued twice.
-- NOTE(review): idx_jobs_state_priority covers (state, priority, job_id) — presumably the order the
-- worker's lease query scans in; confirm against the queue module.
CREATE TABLE IF NOT EXISTS backfill_jobs (
job_id INTEGER PRIMARY KEY AUTOINCREMENT,
job_type TEXT NOT NULL CHECK (job_type IN ('transcribe','diarize','extract','embed')),
target_id TEXT NOT NULL, -- doc_id or chunk id
parent_doc_id TEXT,
state TEXT NOT NULL CHECK (state IN
('pending','leased','running','done','failed','skipped')) DEFAULT 'pending',
priority INTEGER DEFAULT 100, -- lower = sooner (backtest corpus jumps the queue, §7.1)
attempts INTEGER DEFAULT 0,
max_attempts INTEGER DEFAULT 5,
lease_owner TEXT,
lease_expires_at TEXT,
input_hash TEXT NOT NULL, -- hash(content + model/prompt version) — idempotency
output_ref TEXT,
gpu_seconds REAL, -- measured per job → self-calibrating GPU-hours estimate
error TEXT,
created_at TEXT DEFAULT (datetime('now')),
updated_at TEXT DEFAULT (datetime('now')),
UNIQUE (job_type, input_hash)
);
CREATE INDEX IF NOT EXISTS idx_jobs_state_priority ON backfill_jobs(state, priority, job_id);
-- ============================================================================
-- SCORING BRAIN state (the "brain", build blueprint). Candidate state lands here +
-- ledger + fanout_nodes.status; existing tables unchanged.
-- ============================================================================
-- Temporal layer: one row per (topic, as_of, window). 28d non-overlapping windows.
-- Per-topic counts for one time window, keyed by (topic_canonical, as_of, window_idx).
-- NOTE(review): topic_canonical has no REFERENCES topics(...) here, unlike claims.topic_canonical.
-- NOTE(review): n_interp_pred / n_descr_react presumably count claims by claim_type pair
-- (interpretive+predictive vs descriptive+reactive), n_distinct_src / n_distinct_clu distinct
-- sources / clusters — confirm against the scorer that fills this table.
CREATE TABLE IF NOT EXISTS topic_window_stats (
topic_canonical TEXT NOT NULL,
as_of TEXT NOT NULL,
window_idx INTEGER NOT NULL, -- 0 = window ending at as_of, 1 = prior, 2 = baseline
window_start TEXT NOT NULL,
window_end TEXT NOT NULL,
n_interp_pred INTEGER NOT NULL DEFAULT 0,
n_descr_react INTEGER NOT NULL DEFAULT 0,
n_distinct_src INTEGER NOT NULL DEFAULT 0,
n_distinct_clu INTEGER NOT NULL DEFAULT 0,
PRIMARY KEY (topic_canonical, as_of, window_idx)
);
-- Audit trail: one row per (scorer, key, as_of). Deterministic score_id → re-run reproduces.
-- One scored candidate per row. topic_canonical, node_id and conviction_id are all nullable and
-- carry no foreign keys.
-- NOTE(review): presumably each scorer fills only the key column(s) relevant to it — confirm.
CREATE TABLE IF NOT EXISTS candidate_scores (
score_id TEXT PRIMARY KEY,
scorer TEXT NOT NULL, -- emergence|contrarian|intersection|convergence|under_acted
as_of TEXT NOT NULL,
topic_canonical TEXT,
node_id TEXT,
conviction_id TEXT,
score REAL NOT NULL,
cleared_evidence_bar INTEGER NOT NULL DEFAULT 0, -- tier 1: logged to ledger (the denominator)
cleared_promotion_bar INTEGER NOT NULL DEFAULT 0, -- tier 2: sent to frontier judge
inputs_json TEXT NOT NULL, -- every term that produced the score (full audit)
computed_at TEXT DEFAULT (datetime('now'))
);
CREATE INDEX IF NOT EXISTS idx_cs_asof ON candidate_scores(scorer, as_of, cleared_promotion_bar);
-- Tunable bar config so the backtest can sweep thresholds without code edits.
-- At most one threshold row per scorer (scorer is the PRIMARY KEY).
-- NOTE(review): the shape of gates_json is not defined in this file — see the scoring code.
CREATE TABLE IF NOT EXISTS score_thresholds (
scorer TEXT PRIMARY KEY,
min_score REAL,
gates_json TEXT,
version TEXT
);
+74
View File
@@ -0,0 +1,74 @@
"""Load human-owned seed data (conviction log, §3.1) into SQLite.
The conviction log is the highest-leverage Job B input (§3.1) and is HUMAN-OWNED:
Grant edits the YAML seed files; this loader upserts them. Re-running is idempotent.
"""
from __future__ import annotations
import sqlite3
from pathlib import Path
from typing import Any
import yaml
# Columns written by the conviction_log upsert, in INSERT order. Each name doubles as a :named
# placeholder, so it must also be a key of the dict built by _row(). updated_at is not listed here
# because the upsert SQL sets it to datetime('now') itself.
_CONVICTION_COLS = (
"conviction_id",
"seam",
"thematic_proposition",
"team_conviction_note",
"conviction_level",
"current_exposure",
"exposure_note",
"disconfirming_signal",
"is_thesis_breaker",
)
def _row(c: dict[str, Any]) -> dict[str, Any]:
return {
"conviction_id": c["id"],
"seam": c.get("seam"),
"thematic_proposition": c["thematic_proposition"],
"team_conviction_note": c.get("team_conviction_note"),
"conviction_level": c.get("conviction_level"),
"current_exposure": c.get("current_exposure", "unset"),
"exposure_note": c.get("exposure_note"),
"disconfirming_signal": c.get("disconfirming_signal"),
"is_thesis_breaker": 1 if c.get("is_thesis_breaker") else 0,
}
def load_fanout(conn: sqlite3.Connection, path: Path) -> int:
    """Load a hand-written fan-out tree (§7.1 backtest). Idempotent on node_id.

    The YAML file must have a top-level ``parent_conviction_id`` and may have a ``nodes`` list.
    New nodes are inserted with status 'hypothesis'. For an existing node_id only the proposition,
    parent conviction and distance_from_edge are refreshed — status and depth are left as stored.

    Returns the number of nodes read from the file.
    Raises KeyError if ``parent_conviction_id`` (or a node's ``node_id`` /
    ``derivative_proposition``) is missing.
    """
    # Seed files are human-edited prose; decode as UTF-8 rather than the platform locale.
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    parent = data["parent_conviction_id"]
    # `nodes:` with nothing under it parses to None, which must behave like an empty list.
    nodes = data.get("nodes") or []
    for n in nodes:
        conn.execute(
            """INSERT INTO fanout_nodes
            (node_id, parent_conviction_id, derivative_proposition, depth, status, distance_from_edge)
            VALUES (?,?,?,?, 'hypothesis', ?)
            ON CONFLICT(node_id) DO UPDATE SET derivative_proposition=excluded.derivative_proposition,
            parent_conviction_id=excluded.parent_conviction_id,
            distance_from_edge=excluded.distance_from_edge""",
            (n["node_id"], parent, n["derivative_proposition"], n.get("depth", 1), n.get("distance_from_edge")),
        )
    conn.commit()
    return len(nodes)
def load_convictions(conn: sqlite3.Connection, path: Path) -> int:
    """Upsert the conviction log from a YAML seed file. Re-running is idempotent.

    Reads the top-level ``convictions`` list; each entry is inserted, or — when its conviction_id
    already exists — every other column is overwritten and ``updated_at`` is refreshed.

    Returns the number of entries read from the file.
    """
    # Seed files are human-edited prose; decode as UTF-8 rather than the platform locale.
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    # `convictions:` with nothing under it parses to None, which must behave like an empty list.
    rows = data.get("convictions") or []
    cols = ", ".join(_CONVICTION_COLS)
    placeholders = ", ".join(f":{c}" for c in _CONVICTION_COLS)
    updates = ", ".join(f"{c}=excluded.{c}" for c in _CONVICTION_COLS if c != "conviction_id")
    sql = (
        f"INSERT INTO conviction_log ({cols}, updated_at) "
        f"VALUES ({placeholders}, datetime('now')) "
        f"ON CONFLICT(conviction_id) DO UPDATE SET {updates}, updated_at=datetime('now')"
    )
    for c in rows:
        conn.execute(sql, _row(c))
    conn.commit()
    return len(rows)
+90
View File
@@ -0,0 +1,90 @@
"""Load the source registry (companies + podcasts, §7.3/§7.4) into SQLite. Idempotent upsert."""
from __future__ import annotations
import sqlite3
from pathlib import Path
from typing import Any
import yaml
# Columns written by the sources upsert, in INSERT order. Each name doubles as a :named placeholder,
# so it must also be a key of the dict built by _row().
# NOTE(review): own_network is not declared in the schema script's CREATE TABLE sources —
# presumably it is added by the column migrations in db._migrate; confirm.
_COLS = ("source_id", "name", "kind", "source_cluster", "role", "rss_url",
"channel_url", "ticker", "cluster_capped_low", "own_network", "backtest_2022_2023", "notes")
def _row(s: dict[str, Any]) -> dict[str, Any]:
return {
"source_id": s["id"],
"name": s["name"],
"kind": s["kind"],
"source_cluster": s.get("cluster"),
"role": s.get("role", "none"),
"rss_url": s.get("rss_url"),
"channel_url": s.get("channel_url"),
"ticker": s.get("ticker"),
"cluster_capped_low": 1 if s.get("cluster_capped_low") else 0,
"own_network": 1 if s.get("own_network") else 0,
"backtest_2022_2023": s.get("backtest_2022_2023"),
"notes": s.get("notes"),
}
def update_feeds(conn: sqlite3.Connection, path: Path) -> int:
    """Apply resolved/verified podcast feed URLs + backtest-reach to existing source rows.

    Reads the top-level ``feeds`` list and UPDATEs the matching ``sources`` row for each entry.
    rss_url, channel_url and backtest_2022_2023 are overwritten with the entry's values (None when
    the entry omits them); notes is replaced only when the entry has a ``note``.

    Returns the number of feed entries read from the file — not the number of rows changed: an
    entry whose id matches no source updates nothing and is still counted.
    """
    try:
        conn.execute("ALTER TABLE sources ADD COLUMN backtest_2022_2023 TEXT")
        conn.commit()
    except sqlite3.OperationalError as exc:
        # "Column already exists" is the only expected failure here; anything else (locked
        # database, missing table, ...) must surface instead of being swallowed.
        if "duplicate column name" not in str(exc).lower():
            raise
    # Seed files are human-edited prose; decode as UTF-8 rather than the platform locale.
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    # `feeds:` with nothing under it parses to None, which must behave like an empty list.
    rows = data.get("feeds") or []
    for f in rows:
        conn.execute(
            """UPDATE sources
            SET rss_url=:rss_url, channel_url=:youtube_channel_url,
            backtest_2022_2023=:backtest_2022_2023, notes=COALESCE(:note, notes)
            WHERE source_id=:id""",
            {
                "id": f["id"], "rss_url": f.get("rss_url"),
                "youtube_channel_url": f.get("youtube_channel_url"),
                "backtest_2022_2023": f.get("backtest_2022_2023"), "note": f.get("note"),
            },
        )
    conn.commit()
    return len(rows)
def load_source_edges(conn: sqlite3.Connection, path: Path) -> int:
    """Seed EISC connectedness edges (priors) idempotently. Stores src_a,src_b in sorted order to
    match the transcribe_worker's convention (sorted([a,b]) + ON CONFLICT weight+=1) so real detections
    accumulate on the same PK instead of creating a reversed duplicate. DO NOTHING on conflict → a
    re-run won't inflate, and won't clobber a stronger auto-detected weight.

    Reads the top-level ``edges`` list; each entry needs ``a``, ``b`` and ``type`` (KeyError if
    absent), with optional ``weight`` (default 1.0) and ``evidence``.

    Returns the number of edges actually inserted (existing edges are skipped and not counted).
    """
    # Seed files are human-edited prose; decode as UTF-8 rather than the platform locale.
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    # `edges:` with nothing under it parses to None, which must behave like an empty list.
    rows = data.get("edges") or []
    applied = 0
    for e in rows:
        a, b = sorted([e["a"], e["b"]])
        cur = conn.execute(
            """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
            VALUES (?,?,?,?,?)
            ON CONFLICT(src_a, src_b, edge_type) DO NOTHING""",
            (a, b, e["type"], float(e.get("weight", 1.0)), e.get("evidence")),
        )
        # rowcount is 1 for a fresh insert and 0 when the conflict clause skipped the row.
        applied += cur.rowcount
    conn.commit()
    return applied
def load_sources(conn: sqlite3.Connection, path: Path) -> int:
    """Upsert the source registry from a YAML seed file. Re-running is idempotent.

    Reads the top-level ``sources`` list; each entry is inserted, or — when its source_id already
    exists — every other column in ``_COLS`` is overwritten with the seed's value (created_at is
    kept).

    Returns the number of entries read from the file.
    """
    # Seed files are human-edited prose; decode as UTF-8 rather than the platform locale.
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    # `sources:` with nothing under it parses to None, which must behave like an empty list.
    rows = data.get("sources") or []
    cols = ", ".join(_COLS)
    placeholders = ", ".join(f":{c}" for c in _COLS)
    updates = ", ".join(f"{c}=excluded.{c}" for c in _COLS if c != "source_id")
    sql = (
        f"INSERT INTO sources ({cols}, created_at) VALUES ({placeholders}, datetime('now')) "
        f"ON CONFLICT(source_id) DO UPDATE SET {updates}"
    )
    for s in rows:
        conn.execute(sql, _row(s))
    conn.commit()
    return len(rows)
+5
View File
@@ -0,0 +1,5 @@
"""Web UI (FastAPI) — corpus management + (later) the human-eval rating interface (§4.7/§6.7).
This is the app the StartOS s9pk exposes on its `ui` interface. Server-rendered HTML, no template
engine / JS framework — boring and inspectable, like the rest of the system.
"""
+179
View File
@@ -0,0 +1,179 @@
"""Corpus-management web UI (FastAPI).
Pages:
/ dashboard — corpus + pipeline counts at a glance
/corpus full source selection (companies + podcasts) + "add source" form
/corpus/add POST handler (manual urlencoded parse → no python-multipart dependency)
/source/{id} per-source detail: documents + extracted claims (inspect the signal)
"""
from __future__ import annotations
import html
import re
import sqlite3
from urllib.parse import parse_qs
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, RedirectResponse
from ..config import load_config
from ..store import db
# Single inline stylesheet shared by every page; _page() embeds it verbatim in a <style> tag.
_CSS = """
body{font:14px/1.5 -apple-system,Segoe UI,Roboto,sans-serif;margin:0;background:#0f1115;color:#e6e6e6}
header{background:#161a22;padding:12px 20px;border-bottom:1px solid #2a2f3a}
header a{color:#7aa2f7;text-decoration:none;margin-right:18px;font-weight:600}
main{padding:20px;max-width:1100px;margin:0 auto}
h1{font-size:20px}h2{font-size:16px;margin-top:28px;color:#9aa5b1}
table{border-collapse:collapse;width:100%;margin:10px 0}
th,td{text-align:left;padding:6px 10px;border-bottom:1px solid #232833;font-size:13px}
th{color:#9aa5b1;font-weight:600}
tr:hover td{background:#161a22}
.tag{display:inline-block;padding:1px 7px;border-radius:10px;background:#232833;font-size:11px;color:#aab}
.cards{display:flex;gap:14px;flex-wrap:wrap}
.card{background:#161a22;border:1px solid #2a2f3a;border-radius:8px;padding:14px 18px;min-width:130px}
.card .n{font-size:24px;font-weight:700;color:#7aa2f7}.card .l{color:#9aa5b1;font-size:12px}
form{background:#161a22;border:1px solid #2a2f3a;border-radius:8px;padding:16px;margin:14px 0}
label{display:block;margin:8px 0 2px;color:#9aa5b1;font-size:12px}
input,select{background:#0f1115;border:1px solid #2a2f3a;color:#e6e6e6;border-radius:5px;padding:6px 8px;width:240px}
button{background:#7aa2f7;color:#0f1115;border:0;border-radius:6px;padding:8px 16px;font-weight:700;cursor:pointer;margin-top:12px}
a{color:#7aa2f7}.muted{color:#6b7280;font-size:12px}
"""
_CLUSTERS = ["macro", "ai_tech", "energy", "bitcoin", "vc_consensus", "generalist"]
_KINDS = ["podcast", "youtube", "filing", "earnings_call"]
_ROLES = ["none", "CB", "IND", "DX"]
def _page(title: str, body: str) -> HTMLResponse:
    """Wrap pre-rendered *body* HTML in the shared page shell (stylesheet + nav header).

    *title* is HTML-escaped here; *body* is inserted as-is, so callers must escape any untrusted
    text before passing it in.
    """
    nav = ('<header><a href="/">Dashboard</a><a href="/corpus">Corpus</a>'
           '<span class="muted">Ten31 Signal Engine</span></header>')
    pieces = [
        "<!doctype html><html><head><meta charset=utf-8>",
        f"<title>{html.escape(title)}</title>",
        f"<style>{_CSS}</style></head>",
        f"<body>{nav}<main>{body}</main></body></html>",
    ]
    return HTMLResponse("".join(pieces))
def _slug(s: str) -> str:
return re.sub(r"[^a-z0-9]+", "-", s.lower()).strip("-")[:40] or "src"
def create_app() -> FastAPI:
    """Build the corpus-management FastAPI app (dashboard, corpus list/add, per-source detail).

    Configuration is loaded once, here. Every request opens its own SQLite connection and closes
    it in a ``finally`` so a failing query cannot leak the handle. All text that comes out of the
    database is HTML-escaped before being interpolated into a page.
    """
    cfg = load_config()
    app = FastAPI(title="Ten31 Signal Engine")

    def conn() -> sqlite3.Connection:
        """Open a connection to the configured DB and run the idempotent schema init."""
        c = db.connect(cfg.db_path)
        db.init_db(c)
        return c

    def esc(value: object) -> str:
        """HTML-escape a database value for display; NULL renders as an empty string."""
        return "" if value is None else html.escape(str(value))

    @app.get("/", response_class=HTMLResponse)
    def dashboard() -> HTMLResponse:
        """Render headline counts plus claim-type, thesis-seam and backfill-queue breakdowns."""
        c = conn()
        try:
            def scalar(q, *a):
                r = c.execute(q, a).fetchone()
                return r[0] if r else 0

            def rows(q):
                # Two-column result: label (escaped) and count.
                return "".join(
                    f"<tr><td>{html.escape(str(a))}</td><td>{b}</td></tr>" for a, b in c.execute(q)
                )

            cards = {
                "Sources": scalar("SELECT COUNT(*) FROM sources"),
                "Documents": scalar("SELECT COUNT(*) FROM documents"),
                "Claims": scalar("SELECT COUNT(*) FROM claims"),
                "Embedded": scalar("SELECT COUNT(*) FROM claims WHERE qdrant_point_id IS NOT NULL"),
                "Convictions": scalar("SELECT COUNT(*) FROM conviction_log"),
                "Ledger": scalar("SELECT COUNT(*) FROM ledger"),
            }
            claims_by_type = rows("SELECT claim_type, COUNT(*) FROM claims GROUP BY claim_type ORDER BY 2 DESC")
            claims_by_seam = rows("SELECT thesis_seam, COUNT(*) FROM claims GROUP BY thesis_seam ORDER BY 2 DESC")
            queue = rows("SELECT job_type||' / '||state, COUNT(*) FROM backfill_jobs GROUP BY 1 ORDER BY 1")
        finally:
            c.close()
        cards_html = "".join(
            f'<div class="card"><div class="n">{v}</div><div class="l">{k}</div></div>'
            for k, v in cards.items()
        )
        body = f"""<h1>Dashboard</h1><div class="cards">{cards_html}</div>
<h2>Claims by type</h2><table><tr><th>type</th><th>n</th></tr>{claims_by_type or '<tr><td class=muted colspan=2>none yet</td></tr>'}</table>
<h2>Claims by thesis seam</h2><table><tr><th>seam</th><th>n</th></tr>{claims_by_seam or '<tr><td class=muted colspan=2>none yet</td></tr>'}</table>
<h2>Backfill queue</h2><table><tr><th>type / state</th><th>n</th></tr>{queue or '<tr><td class=muted colspan=2>empty</td></tr>'}</table>"""
        return _page("Dashboard", body)

    @app.get("/corpus", response_class=HTMLResponse)
    def corpus() -> HTMLResponse:
        """List every source with its document/claim counts, above the "add source" form."""
        c = conn()
        try:
            srcs = c.execute("""
                SELECT s.*,
                (SELECT COUNT(*) FROM documents d WHERE d.source_id=s.source_id) docs,
                (SELECT COUNT(*) FROM claims cl WHERE cl.source_id=s.source_id) claims
                FROM sources s ORDER BY s.kind, s.source_id""").fetchall()
        finally:
            c.close()

        def row(s):
            extra = s["ticker"] or s["backtest_2022_2023"] or ""
            return (f"<tr><td><a href='/source/{html.escape(s['source_id'])}'>{html.escape(s['name'])}</a></td>"
                    f"<td><span class=tag>{esc(s['kind'])}</span></td><td>{esc(s['source_cluster'])}</td>"
                    f"<td>{esc(s['role'])}</td><td>{html.escape(str(extra))}</td>"
                    f"<td>{s['docs']}</td><td>{s['claims']}</td></tr>")

        def opt(xs):
            return "".join(f"<option>{x}</option>" for x in xs)

        table = "".join(row(s) for s in srcs)
        form = f"""<form method=post action="/corpus/add">
<strong>Add to corpus</strong>
<label>Name</label><input name=name required placeholder="NVIDIA / Odd Lots">
<label>Kind</label><select name=kind>{opt(_KINDS)}</select>
<label>Cluster</label><select name=cluster>{opt(_CLUSTERS)}</select>
<label>Role</label><select name=role>{opt(_ROLES)}</select>
<label>Ticker (companies)</label><input name=ticker placeholder="NVDA">
<label>RSS URL (podcasts)</label><input name=rss_url placeholder="https://...">
<label>YouTube channel</label><input name=channel_url placeholder="https://youtube.com/@...">
<button type=submit>Add source</button>
</form>"""
        body = f"""<h1>Corpus ({len(srcs)} sources)</h1>{form}
<table><tr><th>name</th><th>kind</th><th>cluster</th><th>role</th><th>ticker / backtest</th><th>docs</th><th>claims</th></tr>{table}</table>"""
        return _page("Corpus", body)

    @app.post("/corpus/add")
    async def corpus_add(request: Request):
        """Insert a source from the urlencoded form body, then redirect back to /corpus.

        Blank fields are dropped. An existing source_id, or a value rejected by a CHECK constraint,
        leaves the table unchanged (INSERT OR IGNORE) and still redirects.
        """
        raw = (await request.body()).decode()
        f = {k: v[0].strip() for k, v in parse_qs(raw).items() if v and v[0].strip()}
        name = f.get("name")
        if not name:
            return RedirectResponse("/corpus", status_code=303)
        kind = f.get("kind", "podcast")
        ticker = f.get("ticker")
        # Companies are keyed by ticker; shows by a slug of their name.
        sid = f"co-{ticker.lower()}" if ticker else f"{'pod' if kind in ('podcast','youtube') else kind}-{_slug(name)}"
        c = conn()
        try:
            c.execute("""INSERT OR IGNORE INTO sources
                (source_id, name, kind, source_cluster, role, ticker, rss_url, channel_url)
                VALUES (?,?,?,?,?,?,?,?)""",
                      (sid, name, kind, f.get("cluster"), f.get("role", "none"),
                       ticker.upper() if ticker else None, f.get("rss_url"), f.get("channel_url")))
            c.commit()
        finally:
            c.close()
        return RedirectResponse("/corpus", status_code=303)

    @app.get("/source/{source_id}", response_class=HTMLResponse)
    def source_detail(source_id: str) -> HTMLResponse:
        """Show one source's metadata and its 200 most recent claims; 404 if the id is unknown."""
        c = conn()
        try:
            s = c.execute("SELECT * FROM sources WHERE source_id=?", (source_id,)).fetchone()
            claims = [] if s is None else c.execute(
                """SELECT proposition, claim_type, time_horizon, thesis_seam, topic_canonical,
                engages_consensus, date FROM claims WHERE source_id=?
                ORDER BY date DESC LIMIT 200""", (source_id,)).fetchall()
        finally:
            c.close()
        if s is None:
            not_found = _page("Not found", "<h1>Source not found</h1>")
            not_found.status_code = 404
            return not_found

        def crow(cl):
            # Marker explained in the table heading: "⚔ = engages consensus".
            star = " ⚔" if cl["engages_consensus"] else ""
            return (f"<tr><td>{esc(cl['date'])}</td><td><span class=tag>{esc(cl['claim_type'])}</span></td>"
                    f"<td>{esc(cl['thesis_seam'])}</td><td>{esc(cl['topic_canonical'])}</td>"
                    f"<td>{html.escape(cl['proposition'])}{star}</td></tr>")

        rows = "".join(crow(cl) for cl in claims) or '<tr><td class=muted colspan=5>no claims extracted yet</td></tr>'
        meta = (f"<span class=tag>{esc(s['kind'])}</span> cluster={esc(s['source_cluster']) or '-'}"
                f" role={esc(s['role']) or '-'}")
        # ticker is free text typed into the add-source form, so it must be escaped.
        if s["ticker"]:
            meta += f" ticker={esc(s['ticker'])}"
        if s["backtest_2022_2023"]:
            meta += f" · backtest={esc(s['backtest_2022_2023'])}"
        body = f"""<h1>{html.escape(s['name'])}</h1><p>{meta}</p>
<p class=muted>{html.escape(s['notes'] or '')}</p>
<h2>Claims ({len(claims)}) <span class=muted>⚔ = engages consensus</span></h2>
<table><tr><th>date</th><th>type</th><th>seam</th><th>topic</th><th>proposition</th></tr>{rows}</table>"""
        return _page(s["name"], body)

    return app
+28
View File
@@ -0,0 +1,28 @@
"""Small shared utilities (normalization, dedup keys)."""
from __future__ import annotations
import re
# Trailing show/episode suffix: from a separator (| - – —) to the end of the title, when the text
# after that separator contains "podcast", "show", or "ep"/"episode" followed by a number.
# There is no word boundary, so "show" also matches inside longer words such as "shows".
_SHOW_SUFFIX = re.compile(r"\s*[|\-–—]\s*[^|\-–—]*(podcast|show|ep(isode)?\s*\d+).*$", re.I)
# Leading episode marker: "Ep 42:", "Episode 7 -", "#12:", or a bare number followed by ':' or '-'.
_EP_PREFIX = re.compile(r"^\s*(ep(isode)?\.?\s*\d+\s*[:\-]|#\s*\d+\s*[:\-]|\d+\s*[:\-])\s*", re.I)
# Any run of characters other than lowercase ASCII letters and digits (apply to lowercased text).
_NONALNUM = re.compile(r"[^a-z0-9]+")
def slugify(s: str, *, maxlen: int = 60) -> str:
    """Return a lowercase dash-separated slug of *s*, cut to *maxlen* characters, or "x" if empty.

    None/empty input is treated as "". Leading/trailing dashes are stripped before the cut.
    """
    lowered = (s or "").lower()
    dashed = _NONALNUM.sub("-", lowered).strip("-")
    clipped = dashed[:maxlen]
    return clipped if clipped else "x"
def normalize_title(title: str) -> str:
    """Reduce an episode title to a comparable core so the SAME episode matches across feeds and
    mirrors despite cosmetic differences ('Ep 42: Foo' vs 'Foo | The Show').

    Strips a trailing show/episode suffix, then a leading episode marker, lowercases, and collapses
    every non-alphanumeric run to a single space. Best-effort — a safety net, not the primary key.
    """
    text = title or ""
    # Order matters: the suffix is removed before the prefix.
    for pattern in (_SHOW_SUFFIX, _EP_PREFIX):
        text = pattern.sub("", text)
    spaced = _NONALNUM.sub(" ", text.lower())
    return spaced.strip()
def audio_dedup_key(title: str | None, date: str | None) -> str:
    """Build the cross-mirror dedup key for audio: ``"<normalized title>|<date>"``.

    Computed BEFORE transcription so that a duplicate episode (same content arriving through a
    different feed/mirror) can be skipped without spending GPU time. Deliberately NOT derived from
    the transcript: ASR is non-deterministic, so a transcript hash would be brittle.
    A missing title or date contributes an empty string to its half of the key.
    """
    title_part = normalize_title(title or "")
    date_part = date or ""
    return f"{title_part}|{date_part}"