Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,11 @@
|
||||
"""Ten31 Signal Engine — pilot.
|
||||
|
||||
A recurring pipeline that ingests audio + text, extracts structured propositions
|
||||
locally, and surfaces signal over time. The discipline that separates signal from
|
||||
plausible-sounding noise (handoff §5): statistics & graph structure NOMINATE
|
||||
candidates; the frontier model only JUDGES and FANS OUT a pre-filtered shortlist.
|
||||
|
||||
See README.md for the architecture and ten31-signal-engine-handoff.md for the spec.
|
||||
"""
|
||||
|
||||
# Package version string.
__version__ = "0.1.0"
|
||||
@@ -0,0 +1,4 @@
|
||||
from .cli import main
|
||||
|
||||
# When this module is executed as the main program, run the CLI and exit with the status code
# that main() returns.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1 @@
|
||||
"""Client-side backfill queue (§13.4). Producers enqueue; ONE worker drains sequentially."""
|
||||
@@ -0,0 +1,123 @@
|
||||
"""Backfill job queue over the `backfill_jobs` table (§13.4).
|
||||
|
||||
Model the corpus backfill as a managed GPU-hours queue, not a real-time fan-out. Producers
|
||||
(ingestion) enqueue lightweight job descriptors; a SINGLE worker leases and drains them one at a
|
||||
time so audio never goes parallel (→ 503). Jobs are:
|
||||
- idempotent: UNIQUE(job_type, input_hash); re-enqueue of seen content is a no-op.
|
||||
- crash-safe: leases expire, so a dead worker's job returns to the pool automatically.
|
||||
- prioritized: lower `priority` runs first (backtest corpus + filings jump ahead).
|
||||
|
||||
This is plain SQLite so the whole queue is `SELECT * FROM backfill_jobs`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from typing import Any, Optional, Sequence
|
||||
|
||||
# Default lease duration in seconds; used by lease_next() when the caller passes no lease_seconds.
LEASE_SECONDS_DEFAULT = 600
|
||||
|
||||
|
||||
def enqueue(
|
||||
conn: sqlite3.Connection,
|
||||
*,
|
||||
job_type: str,
|
||||
target_id: str,
|
||||
input_hash: str,
|
||||
parent_doc_id: str | None = None,
|
||||
priority: int = 100,
|
||||
max_attempts: int = 5,
|
||||
) -> Optional[int]:
|
||||
"""Insert a job. Returns job_id, or None if this (job_type, input_hash) is already queued/done
|
||||
(idempotent skip — §13.4)."""
|
||||
cur = conn.execute(
|
||||
"""INSERT OR IGNORE INTO backfill_jobs
|
||||
(job_type, target_id, parent_doc_id, priority, max_attempts, input_hash, state)
|
||||
VALUES (?,?,?,?,?,?, 'pending')""",
|
||||
(job_type, target_id, parent_doc_id, priority, max_attempts, input_hash),
|
||||
)
|
||||
conn.commit()
|
||||
return cur.lastrowid if cur.rowcount else None
|
||||
|
||||
|
||||
def lease_next(
    conn: sqlite3.Connection,
    *,
    worker_id: str,
    job_types: Sequence[str] | None = None,
    lease_seconds: int = LEASE_SECONDS_DEFAULT,
) -> Optional[sqlite3.Row]:
    """Claim the highest-priority eligible job for `worker_id` and return its row.

    Eligible = pending, OR a running/leased job whose lease has expired (crash recovery).
    Candidates are ordered by `priority` ascending, then `job_id` ascending. Claiming sets
    state='running', records the lease owner and expiry, and increments `attempts`.

    The claim is a compare-and-swap: the UPDATE repeats the eligibility predicate, so if
    another connection leased the same job between our SELECT and UPDATE, zero rows change
    and we move on to the next candidate instead of double-leasing.

    Args:
        conn: Connection to the database holding `backfill_jobs`.
        worker_id: Stored in `lease_owner` on the claimed row.
        job_types: Optional whitelist of job types; None or empty means any type.
        lease_seconds: Lease duration from now, in seconds (coerced with int()).

    Returns:
        The claimed row as fetched after the update, or None when nothing is eligible.
    """
    eligible = """(state = 'pending'
                   OR (state IN ('leased','running')
                       AND lease_expires_at IS NOT NULL
                       AND lease_expires_at < datetime('now')))"""
    type_filter = ""
    type_params: list[Any] = []
    if job_types:
        # One placeholder per requested type; values are bound, never interpolated.
        type_filter = f" AND job_type IN ({','.join('?' * len(job_types))})"
        type_params.extend(job_types)
    lease_modifier = f"+{int(lease_seconds)} seconds"

    while True:
        candidate = conn.execute(
            f"""SELECT job_id FROM backfill_jobs
                WHERE {eligible}
                {type_filter}
                ORDER BY priority ASC, job_id ASC
                LIMIT 1""",
            type_params,
        ).fetchone()
        if candidate is None:
            return None
        job_id = candidate[0]
        claimed = conn.execute(
            f"""UPDATE backfill_jobs
                SET state='running', lease_owner=?, lease_expires_at=datetime('now', ?),
                    attempts=attempts+1, updated_at=datetime('now')
                WHERE job_id=? AND {eligible}""",
            (worker_id, lease_modifier, job_id),
        )
        conn.commit()
        if claimed.rowcount == 1:
            return conn.execute(
                "SELECT * FROM backfill_jobs WHERE job_id=?", (job_id,)
            ).fetchone()
        # Lost the race for this job: it is no longer eligible, so the next SELECT skips it.
|
||||
|
||||
|
||||
def complete(conn: sqlite3.Connection, job_id: int, *, output_ref: str | None = None,
             gpu_seconds: float | None = None) -> None:
    """Mark job `job_id` as 'done' and commit.

    Stores `output_ref` and `gpu_seconds` exactly as given (NULL when omitted), clears any
    previously recorded `error`, and refreshes `updated_at`.
    """
    mark_done_sql = (
        "UPDATE backfill_jobs SET state='done', output_ref=?, gpu_seconds=?, error=NULL, "
        "updated_at=datetime('now') WHERE job_id=?"
    )
    bindings = (output_ref, gpu_seconds, job_id)
    conn.execute(mark_done_sql, bindings)
    conn.commit()
|
||||
|
||||
|
||||
def fail(conn: sqlite3.Connection, job_id: int, error: Any) -> str:
    """Record a failure for `job_id`, release its lease, and commit.

    The job goes back to 'pending' while attempts remain; once `attempts` has reached
    `max_attempts` it is dead-lettered as 'failed'. `error` is stringified and truncated to
    2000 characters before being stored.

    Returns:
        The state that was written ('pending' or 'failed'). A job_id with no matching row
        yields 'pending' and updates nothing.
    """
    counters = conn.execute(
        "SELECT attempts, max_attempts FROM backfill_jobs WHERE job_id=?", (job_id,)
    ).fetchone()
    if counters and counters["attempts"] >= counters["max_attempts"]:
        new_state = "failed"
    else:
        new_state = "pending"
    message = str(error)[:2000]
    conn.execute(
        """UPDATE backfill_jobs SET state=?, error=?, lease_owner=NULL, lease_expires_at=NULL,
           updated_at=datetime('now') WHERE job_id=?""",
        (new_state, message, job_id),
    )
    conn.commit()
    return new_state
|
||||
|
||||
|
||||
def skip(conn: sqlite3.Connection, job_id: int, reason: str | None = None) -> None:
    """Put job `job_id` into the 'skipped' state and commit.

    This is the non-error way to drop a job on purpose (a chunk that merely produced zero
    claims is still 'done'). `reason` is stored in the `error` column, NULL when omitted.
    """
    skip_sql = "UPDATE backfill_jobs SET state='skipped', error=?, updated_at=datetime('now') WHERE job_id=?"
    conn.execute(skip_sql, (reason, job_id))
    conn.commit()
|
||||
|
||||
|
||||
def stats(conn: sqlite3.Connection) -> dict[str, dict[str, int]]:
    """Count queued jobs grouped by type and state.

    Returns:
        A nested mapping `{job_type: {state: count}}`; empty when the table has no rows.
        Rows are read by column name, so `conn` must produce name-addressable rows.
    """
    query = "SELECT job_type, state, COUNT(*) AS n FROM backfill_jobs GROUP BY job_type, state"
    counts: dict[str, dict[str, int]] = {}
    for rec in conn.execute(query).fetchall():
        per_type = counts.get(rec["job_type"])
        if per_type is None:
            per_type = counts[rec["job_type"]] = {}
        per_type[rec["state"]] = rec["n"]
    return counts
|
||||
@@ -0,0 +1,619 @@
|
||||
"""Pilot CLI. Subcommands map to the build order in handoff §11.
|
||||
|
||||
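Implemented so far: schema/seeding (init-db, seed-*, load-feeds), ingestion (ingest-*, feed-peek),
processing (run-transcribe, run-transcribe-gemini, run-extract, embed-claims), analysis (search,
backtest, two-sided, confusion-matrix) and ops (queue-status, provenance, serve, db-tables, spark-status).
Later stages (e.g. judge, eval-ui) are added as they're built.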
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from .config import load_config
|
||||
from .store import db
|
||||
from .store.seed import load_convictions, load_fanout
|
||||
from .store.sources import load_source_edges, load_sources, update_feeds
|
||||
|
||||
# Default seed-file paths; used as the `--file` defaults of the seed-convictions, seed-sources
# and load-feeds subcommands respectively.
DEFAULT_CONVICTION_SEED = Path("seeds/conviction_log.seed.yaml")
DEFAULT_SOURCES_SEED = Path("seeds/sources.seed.yaml")
DEFAULT_FEEDS_SEED = Path("seeds/podcast_feeds.resolved.yaml")
|
||||
|
||||
|
||||
def _setup_logging(level: str) -> None:
|
||||
logging.basicConfig(level=getattr(logging, level.upper(), logging.INFO),
|
||||
format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
||||
|
||||
|
||||
def cmd_init_db(args: argparse.Namespace) -> int:
    """`init-db`: create the schema in the configured database and list its tables/views.

    `args` is unused. Always returns 0.
    """
    config = load_config()
    connection = db.connect(config.db_path)
    db.init_db(connection)
    print(f"Initialized DB at {config.db_path}")
    listing = ", ".join(db.table_names(connection))
    print("Tables/views:", listing)
    return 0
|
||||
|
||||
|
||||
def cmd_seed_convictions(args: argparse.Namespace) -> int:
    """`seed-convictions`: upsert the conviction log from `args.file`, then list thesis-breakers.

    After loading, every conviction flagged `is_thesis_breaker = 1` is printed with the first
    80 characters of its proposition. Always returns 0.
    """
    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)  # the schema must exist before seeding
    seed_path = Path(args.file)
    loaded = load_convictions(conn, seed_path)
    print(f"Upserted {loaded} convictions from {seed_path}")
    breaker_rows = conn.execute(
        "SELECT conviction_id, thematic_proposition FROM conviction_log WHERE is_thesis_breaker = 1"
    ).fetchall()
    if not breaker_rows:
        return 0
    print("Thesis-breakers loaded (engine must surface these AGAINST the thesis, §5.7):")
    for breaker in breaker_rows:
        print(f" {breaker['conviction_id']}: {breaker['thematic_proposition'][:80]}...")
    return 0
|
||||
|
||||
|
||||
def cmd_seed_sources(args: argparse.Namespace) -> int:
    """`seed-sources`: upsert the source registry from `args.file` and print counts per kind.

    Always returns 0.
    """
    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)
    loaded = load_sources(conn, Path(args.file))
    kind_counts = conn.execute(
        "SELECT kind, COUNT(*) n FROM sources GROUP BY kind ORDER BY kind"
    ).fetchall()
    print(f"Upserted {loaded} sources from {args.file}")
    for entry in kind_counts:
        print(f" {entry['kind']}: {entry['n']}")
    return 0
|
||||
|
||||
|
||||
def cmd_seed_edges(args: argparse.Namespace) -> int:
    """`seed-edges`: load source edges from `args.file`; report new and total edge counts.

    Always returns 0.
    """
    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)
    inserted = load_source_edges(conn, Path(args.file))
    (edge_total,) = conn.execute("SELECT COUNT(*) FROM source_edges").fetchone()[:1]
    print(f"Inserted {inserted} new edges from {args.file} ({edge_total} edges total)")
    return 0
|
||||
|
||||
|
||||
def cmd_load_feeds(args: argparse.Namespace) -> int:
    """`load-feeds`: apply podcast feed data from `args.file`, then summarize backtest reach.

    The summary counts podcast sources per `backtest_2022_2023` value (NULL/empty shown as
    'unset'), largest group first. Always returns 0.
    """
    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)
    updated = update_feeds(conn, Path(args.file))
    print(f"updated {updated} podcast feeds")
    reach_sql = (
        "SELECT backtest_2022_2023, COUNT(*) c FROM sources WHERE kind='podcast' "
        "GROUP BY backtest_2022_2023 ORDER BY c DESC"
    )
    reach_rows = conn.execute(reach_sql).fetchall()
    print("backtest 2022-2023 reach:")
    for reach in reach_rows:
        print(f" {reach['backtest_2022_2023'] or 'unset'}: {reach['c']}")
    return 0
|
||||
|
||||
|
||||
def cmd_ingest_edgar(args: argparse.Namespace) -> int:
    """`ingest-edgar`: fetch SEC filings for `args.ticker` and queue extract jobs.

    `args.forms` is a comma-separated list of form types; blank entries (e.g. from a
    trailing comma) are dropped, and an empty or missing list falls back to 10-K, 10-Q, 8-K.
    The source row is resolved through `_resolve_source_id`, the same helper the earnings
    command uses, so a source created on the fly gets identical `name`/`ticker` values
    (upper-cased ticker) whichever command creates it first.

    Always returns 0.
    """
    from .ingest.edgar import EdgarClient, ingest_filings

    cfg = load_config()
    conn = db.connect(cfg.db_path)
    db.init_db(conn)
    client = EdgarClient(cfg.edgar_user_agent)

    requested = tuple(f.strip() for f in (args.forms or "").split(",") if f.strip())
    forms = requested or ("10-K", "10-Q", "8-K")

    # resolve source_id from ticker (creates a lightweight source row if not seeded)
    source_id = _resolve_source_id(conn, args.ticker)

    n_docs, n_jobs = ingest_filings(conn, client, source_id=source_id, ticker=args.ticker,
                                    since=args.since, until=args.until, forms=forms)
    print(f"{args.ticker}: +{n_docs} filing documents, +{n_jobs} extract jobs queued "
          f"(forms={','.join(forms)}, since={args.since}, until={args.until})")
    return 0
|
||||
|
||||
|
||||
def _resolve_source_id(conn, ticker: str, kind: str = "filing") -> str:
|
||||
row = conn.execute("SELECT source_id FROM sources WHERE upper(ticker)=upper(?)", (ticker,)).fetchone()
|
||||
if row:
|
||||
return row["source_id"]
|
||||
source_id = f"co-{ticker.lower()}"
|
||||
conn.execute("INSERT OR IGNORE INTO sources (source_id, name, kind, ticker) VALUES (?,?,?,?)",
|
||||
(source_id, ticker.upper(), kind, ticker.upper()))
|
||||
conn.commit()
|
||||
return source_id
|
||||
|
||||
|
||||
def cmd_ingest_doc(args: argparse.Namespace) -> int:
    """`ingest-doc`: ingest one text document from `args.url` for source `args.source`.

    The title defaults to the URL when `--title` is not given. Prints the new doc id, or a
    note when `ingest_one` returned nothing. Always returns 0.
    """
    from .ingest.docs import ingest_one

    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)
    title = args.title or args.url
    doc_id = ingest_one(conn, config, source_id=args.source, url=args.url,
                        title=title, date=args.date, method=args.method)
    if doc_id:
        print(f"ingested: {doc_id}")
    else:
        print("no new doc (duplicate / too short / fetch failed)")
    return 0
|
||||
|
||||
|
||||
def cmd_ingest_feed_text(args: argparse.Namespace) -> int:
    """`ingest-feed-text`: ingest the articles behind the RSS feed `args.url` for `args.source`.

    `since`, `until` and `limit` are passed through unchanged. Always returns 0.
    """
    from .ingest.docs import ingest_feed_text

    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)
    ingested = ingest_feed_text(
        conn,
        config,
        source_id=args.source,
        rss_url=args.url,
        since=args.since,
        until=args.until,
        limit=args.limit,
    )
    print(f"ingested {ingested} article docs from feed for {args.source}")
    return 0
|
||||
|
||||
|
||||
def cmd_ingest_doc_manifest(args: argparse.Namespace) -> int:
    """`ingest-doc-manifest`: batch-ingest the manifest at `args.file` and print the tallies.

    Reports the `ingested`, `skipped` and `missing_source` entries of the result. Always
    returns 0.
    """
    from .ingest.docs import ingest_manifest

    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)
    manifest_path = Path(args.file)
    r = ingest_manifest(conn, config, manifest_path)
    print(f"manifest: ingested={r['ingested']} skipped={r['skipped']} missing_source={r['missing_source']}")
    return 0
|
||||
|
||||
|
||||
def cmd_ingest_earnings(args: argparse.Namespace) -> int:
    """`ingest-earnings`: fetch earnings transcripts for `args.ticker` and queue extract jobs.

    Returns 1 (with a message on stderr) when no FMP API key is configured, before touching
    the database; otherwise 0.
    """
    from .ingest.earnings import FMPClient, ingest_for_ticker

    config = load_config()
    api_key = config.fmp_api_key
    if not api_key:
        print("FMP_API_KEY not set", file=sys.stderr)
        return 1

    conn = db.connect(config.db_path)
    db.init_db(conn)
    fmp = FMPClient(config.fmp_api_key)
    source_id = _resolve_source_id(conn, args.ticker)
    symbol = args.ticker.upper()
    n_docs, n_jobs = ingest_for_ticker(
        conn, fmp, source_id=source_id, symbol=symbol,
        data_dir=config.data_dir, since=args.since, until=args.until, limit=args.limit,
    )
    print(f"{args.ticker}: +{n_docs} earnings transcripts, +{n_jobs} extract jobs (since={args.since}, until={args.until})")
    return 0
|
||||
|
||||
|
||||
def cmd_embed_claims(args: argparse.Namespace) -> int:
    """`embed-claims`: embed pending propositions and upsert them into Qdrant.

    Ensures the collection exists at `args.qdrant_url`, reports whether it was created, and
    builds a sparse embedder unless `--no-sparse` was given. Always returns 0.
    """
    from .spark import from_config
    from .embedstore.qdrant_store import get_client, ensure_collection, upsert_pending
    from .embedstore.embedder import SparseEmbedder

    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)
    spark = from_config(config)
    qdrant = get_client(args.qdrant_url)
    if ensure_collection(qdrant):
        print("collection created")
    else:
        print("collection exists")
    sparse = None if args.no_sparse else SparseEmbedder()
    upserted = upsert_pending(conn, spark, qdrant, sparse)
    print(f"embedded + upserted {upserted} propositions (sparse={'on' if sparse and sparse.available else 'off'})")
    return 0
|
||||
|
||||
|
||||
def cmd_search(args: argparse.Namespace) -> int:
    """`search`: query the `propositions` collection through the gateway and print the hits.

    The hits are taken from the response's `results` key, else `hits`, else the whole
    response, and printed as indented JSON truncated to 2500 characters. Always returns 0.
    """
    # `json` is not imported at module level; import it here like the other lazy imports,
    # otherwise json.dumps below raises NameError.
    import json

    from .spark import from_config

    cfg = load_config()
    sc = from_config(cfg)
    res = sc.search(args.query, collection="propositions", top_k=args.top_k, rerank=not args.no_rerank)
    hits = res.get("results") or res.get("hits") or res
    print(json.dumps(hits, indent=2)[:2500])
    return 0
|
||||
|
||||
|
||||
def cmd_ingest_podcast(args: argparse.Namespace) -> int:
    """`ingest-podcast`: register episodes for source `args.source` and queue transcribe jobs.

    With `--via auto`, YouTube is chosen only when the source's `backtest_2022_2023` is
    'youtube_only' and `--since` was given; otherwise RSS. Returns 1 (message on stderr)
    when the source id is unknown, else 0.
    """
    from .ingest.podcasts import ingest_rss, ingest_youtube

    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)
    src = conn.execute("SELECT * FROM sources WHERE source_id=?", (args.source,)).fetchone()
    if not src:
        print(f"unknown source {args.source}", file=sys.stderr)
        return 1

    if args.via != "auto":
        via = args.via
    elif src["backtest_2022_2023"] == "youtube_only" and args.since:
        via = "youtube"
    else:
        via = "rss"
    ingest = ingest_youtube if via == "youtube" else ingest_rss
    n_docs, n_jobs = ingest(conn, src, since=args.since, until=args.until, limit=args.limit)
    print(f"{src['name']} via {via}: +{n_docs} episodes, +{n_jobs} transcribe jobs")
    return 0
|
||||
|
||||
|
||||
def cmd_run_transcribe(args: argparse.Namespace) -> int:
    """`run-transcribe`: run the transcription worker and print how many jobs it processed.

    `args.limit` and `args.max_chunks` are forwarded to the worker. Always returns 0.
    """
    from .spark import from_config
    from .ingest.transcribe_worker import run_transcribe

    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)
    spark = from_config(config)
    outcome = run_transcribe(conn, spark, config, limit=args.limit, max_chunks=args.max_chunks)
    print(f"transcription: {outcome['jobs_processed']} jobs processed")
    return 0
|
||||
|
||||
|
||||
def cmd_run_transcribe_gemini(args: argparse.Namespace) -> int:
    """`run-transcribe-gemini`: run the Gemini transcription backfill and print a cost estimate.

    The estimate prices all input tokens at the audio rate and divides by the number of
    finished episodes (at least 1) for the per-episode figure. Always returns 0.
    """
    from .ingest.gemini_transcribe import run_transcribe_gemini

    config = load_config()
    conn = db.connect(config.db_path)
    r = run_transcribe_gemini(conn, config, limit=args.limit, concurrency=args.concurrency)
    tok_in = r["prompt_tokens"]
    tok_out = r["output_tokens"]
    # Gemini 2.5 Flash list price: ~$0.30/1M text-in, audio-in ~$1.00/1M, $2.50/1M out. Audio dominates in.
    est = tok_in / 1_000_000 * 1.00 + tok_out / 1_000_000 * 2.50
    print(
        f"gemini transcribe: done={r['done']} failed={r['failed']} | "
        f"tokens in={tok_in:,} out={tok_out:,} | ~${est:.2f} this run (≈${est/max(r['done'],1):.3f}/ep)"
    )
    return 0
|
||||
|
||||
|
||||
def cmd_run_extract(args: argparse.Namespace) -> int:
    """`run-extract`: run the extraction worker and print jobs processed and claims written.

    `args.limit` caps the jobs; `args.max_chunks` is passed as `max_chunks_per_doc`.
    Always returns 0.
    """
    from .spark import from_config
    from .extract.worker import run_extract

    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)
    spark = from_config(config)
    outcome = run_extract(conn, spark, config, limit=args.limit, max_chunks_per_doc=args.max_chunks)
    print(f"extraction: {outcome['jobs_processed']} jobs, {outcome['claims_written']} claims written")
    return 0
|
||||
|
||||
|
||||
def cmd_queue_status(args: argparse.Namespace) -> int:
    """`queue-status`: print backfill job counts per type and state, both sorted by name.

    Prints "queue empty" when there are no jobs. Always returns 0.
    """
    from .backfill import queue

    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)
    summary = queue.stats(conn)
    if not summary:
        print("queue empty")
        return 0
    for job_type in sorted(summary):
        state_counts = summary[job_type]
        parts = ", ".join(f"{state}={state_counts[state]}" for state in sorted(state_counts))
        print(f" {job_type}: {parts}")
    return 0
|
||||
|
||||
|
||||
def cmd_feed_peek(args: argparse.Namespace) -> int:
    """`feed-peek`: fetch the feed at `args.url` and print a short coverage summary.

    Shows the parse status, the number of episode records, the first `args.limit` records,
    and the `published` values of the last and first records. Always returns 0.
    """
    from .ingest.feeds import fetch_feed, episode_records

    parsed = fetch_feed(args.url)
    status = getattr(parsed, "status", None)
    episodes = episode_records(parsed)
    print(f"status={status} bozo={getattr(parsed, 'bozo', None)} episodes_with_audio={len(episodes)}")
    for episode in episodes[: args.limit]:
        print(f" [{episode['published']}] {str(episode['title'])[:70]}")
    if not episodes:
        return 0
    print(f"oldest in feed: {episodes[-1]['published']} newest: {episodes[0]['published']}")
    return 0
|
||||
|
||||
|
||||
def cmd_serve(args: argparse.Namespace) -> int:
    """`serve`: run the corpus web UI with uvicorn on all interfaces (0.0.0.0).

    The port is `args.port` when truthy, otherwise the configured `ui_port`. Returns 0
    after the server stops.
    """
    import uvicorn

    from .ui.app import create_app

    config = load_config()
    port = args.port if args.port else config.ui_port
    print(f"serving corpus UI on http://0.0.0.0:{port}")
    app = create_app()
    uvicorn.run(app, host="0.0.0.0", port=port)
    return 0
|
||||
|
||||
|
||||
def cmd_seed_fanout(args: argparse.Namespace) -> int:
    """`seed-fanout`: load fan-out derivative nodes from `args.file` and print the count.

    Always returns 0.
    """
    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)
    fanout_path = Path(args.file)
    seeded = load_fanout(conn, fanout_path)
    print(f"seeded {seeded} fan-out derivative nodes")
    return 0
|
||||
|
||||
|
||||
def cmd_backtest(args: argparse.Namespace) -> int:
    """`backtest`: march an as-of date from `--start` to `--end` and report node trajectories.

    One evaluation date is generated every `--step-days` days (both ends inclusive when they
    line up). For each node the report prints its peak score and the first as-of date whose
    `evidence` flag was truthy; the node named by `--headline` additionally gets a
    per-date breakdown and a surfaced / did-not-surface verdict.

    Returns:
        0 on success; 1 (message on stderr) when `--step-days` is less than 1.
    """
    from datetime import datetime, timedelta

    from .spark import from_config
    from .signals.run import run_backtest

    # A step of 0 or less never advances past `end`: the loop below would spin forever
    # while the date list grows without bound.
    if args.step_days < 1:
        print(f"--step-days must be >= 1 (got {args.step_days})", file=sys.stderr)
        return 1

    cfg = load_config()
    conn = db.connect(cfg.db_path)
    db.init_db(conn)
    sc = from_config(cfg)

    # as_of march: one date every `step_days` (monthly with the default of 30)
    start = datetime.strptime(args.start, "%Y-%m-%d")
    end = datetime.strptime(args.end, "%Y-%m-%d")
    step = timedelta(days=args.step_days)
    dates = []
    d = start
    while d <= end:
        dates.append(d.strftime("%Y-%m-%d"))
        d += step
    print(f"§7.1 backtest: conviction={args.conviction}, as_of march {args.start}→{args.end} ({len(dates)} points)")
    timeline = run_backtest(conn, sc, cfg, conviction_id=args.conviction, dates=dates, window_days=args.window_days)

    # report: per-node first-clear date + score trajectory; highlight the headline derivative
    print("\n=== node trajectories (score by as_of; ★=cleared evidence bar) ===")
    nodes = {}
    for as_of, res in timeline:
        for r in res:
            # fan-out nodes are keyed by node_id; fall back to conviction_id when it is empty
            key = r["node"]["node_id"] or r["node"]["conviction_id"]
            nodes.setdefault(key, []).append(
                (as_of, r["result"]["score"], r["evidence"], r["promotion"], r["result"]["inputs"])
            )
    for key, traj in sorted(nodes.items()):
        first = next((t for t in traj if t[2]), None)
        peak = max(traj, key=lambda t: t[1])
        mark = f"first-cleared {first[0]}" if first else "never cleared"
        print(f" {key:28} peak={peak[1]:.2f} {mark}")

    head = nodes.get(args.headline)
    if head:
        print(f"\n=== HEADLINE derivative: {args.headline} ===")
        for as_of, score, ev, pr, inp in head:
            star = "★" if ev else ("·" if score > 0 else " ")
            print(f" {as_of} {star} score={score:.2f} corrob={inp.get('corroboration',0)} "
                  f"n_conf={inp.get('n_confirmed',0)} eisc={inp.get('eisc_corrob',0)} "
                  f"a={inp.get('a_corrob',0)} k_eff={inp.get('k_eff0',0)}")
        firstclear = next((t for t in head if t[2]), None)
        print(f"\n VERDICT: headline power-infra derivative "
              f"{'SURFACED at ' + firstclear[0] if firstclear else 'did NOT surface'} "
              f"(bar = under_acted ≥ {0.3})")
    return 0
|
||||
|
||||
|
||||
def cmd_two_sided(args: argparse.Namespace) -> int:
    """Two-sided net-corroboration trajectory (DESIGN_v2.1 H5) for the adversarial cases.

    BATTERY: demand-net should rise while supply-net stays flat. STRIKE: net stays quiet in live, fires in test.

    For every fan-out node of `args.conviction` (optionally filtered by `--nodes`, a comma
    list of case-insensitive substrings of the node id) and every mode in `--modes`, prints
    one line per as-of date in `--dates`. All three comma lists are whitespace-stripped and
    blank entries are dropped, so "demand, supply" filters the same as "demand,supply".

    Always returns 0.
    """
    from .spark import from_config as spark_from_config
    from .extract.backends import from_config as backend_from_config
    from .signals.two_sided import trajectory

    cfg = load_config()
    conn = db.connect(cfg.db_path)
    sc = spark_from_config(cfg)
    backend = backend_from_config(cfg, sc)
    nodes = conn.execute(
        "SELECT node_id, derivative_proposition FROM fanout_nodes WHERE parent_conviction_id=? ORDER BY node_id",
        (args.conviction,),
    ).fetchall()

    def _split(raw: str | None) -> list[str]:
        """Split a comma list, trimming whitespace and discarding empty entries."""
        return [part.strip() for part in (raw or "").split(",") if part.strip()]

    dates = _split(args.dates)
    # lower-case once here; the node id is lower-cased per comparison below
    filt = [k.lower() for k in _split(args.nodes)]
    modes = _split(args.modes)  # loop-invariant: parsed once, not per node

    for r in nodes:
        node_id = r["node_id"]
        if filt and not any(k in node_id.lower() for k in filt):
            continue
        for mode in modes:
            traj = trajectory(conn, sc, backend, r["derivative_proposition"], dates,
                              window_days=args.window_days, mode=mode)
            print(f"\n### {r['node_id']} [mode={mode}, window={args.window_days}d] ###")
            for pt in traj:
                print(f" {pt['as_of']}: net={pt['net']:+.2f} "
                      f"affirm(eisc={pt['affirms_eisc']}, hard_src={pt.get('hard_affirm_src','?')}, "
                      f"n_claims={pt['n_affirm']}, soft_dropped={pt.get('soft_affirm_src_dropped','?')}) "
                      f"deny(eisc={pt['denies_eisc']}, n={pt['n_deny']}) "
                      f"own_net={pt['own_network_affirm_src']}")
    return 0
|
||||
|
||||
|
||||
def cmd_confusion(args: argparse.Namespace) -> int:
    """`confusion-matrix`: run the resolver on `args.spec` and print the confusion report.

    Prints one table row per derivative (with its classification at the 'cleared' and
    'whisper' levels), then TP/FP/FN/TN with precision and recall for each level, rounded
    to two decimals when not None. Always returns 0.
    """
    from .signals.confusion import run_confusion

    config = load_config()
    conn = db.connect(config.db_path)
    db.init_db(conn)
    out = run_confusion(conn, config, args.spec)
    classify = out["classify"]

    print("=== PRE-REGISTERED confusion matrix (DESIGN_v2 §1) — precision AND recall; RUNWAY = frac of move still ahead at signal ===")
    print(f"{'derivative':26} {'reprice?':8} {'peak%':>6} {'whisper':>9} {'run_wh':>6} {'cleared':>9} {'run_cl':>6} cl/wh")
    for r in out["rows"]:
        cl = classify(r, "cleared")
        wh = classify(r, "whisper")
        if r["missing"]:
            miss = f" (no px:{','.join(r['missing'])})"
        else:
            miss = ""
        print(
            f"{r['node']:26} {('REAL' if r['confirmed'] else 'no'):8} {str(r['peak_pct']):>6} "
            f"{str(r['whisper_date'] or '-'):>9} {str(r['runway_whisper'] if r['runway_whisper'] is not None else '-'):>6} "
            f"{str(r['cleared_date'] or '-'):>9} {str(r['runway_cleared'] if r['runway_cleared'] is not None else '-'):>6} "
            f"{cl}/{wh}{miss}"
        )

    for level in ("cleared", "whisper"):
        c, p, rec = out[level]
        print(
            f"\n{level.upper()} level: TP={c['TP']} FP={c['FP']} FN={c['FN']} TN={c['TN']} | "
            f"precision={p if p is None else round(p,2)} recall={rec if rec is None else round(rec,2)}"
        )

    print("\nlead_* = days the repricing came AFTER the signal (positive = engine was early).")
    print("The cleared→whisper delta = what the independence floor cost in lead time / recall.")
    return 0
|
||||
|
||||
|
||||
def cmd_provenance(args: argparse.Namespace) -> int:
    """The processing log — what's been ingested/processed, so we never reprocess silently.

    Prints processed/total document counts per kind, the number of `dedup_key` values shared
    by more than one document, and how many documents lack a `dedup_key`.

    With `--backfill-hashes`, fills in missing values: `dedup_key` (from title+date for
    podcast/youtube documents, else the document's `external_id`) and `content_hash`
    (SHA-256 of the transcript file, when the file exists). A document whose computed key is
    empty is left untouched and not counted.

    Always returns 0.
    """
    cfg = load_config()
    conn = db.connect(cfg.db_path)
    db.init_db(conn)
    print("processed documents (the durable log):")
    for r in conn.execute(
        "SELECT kind, COUNT(*) total, SUM(CASE WHEN processed_at IS NOT NULL THEN 1 ELSE 0 END) proc "
        "FROM documents GROUP BY kind ORDER BY kind"
    ):
        print(f" {r['kind']:14} {r['proc']}/{r['total']} processed")
    print("dedup model: (1) UNIQUE(source_id, external_id) = robust pre-GPU guard; "
          "(2) dedup_key = cross-mirror (title+date); content_hash = audit only.")
    dups = conn.execute(
        "SELECT dedup_key, COUNT(*) c FROM documents WHERE dedup_key IS NOT NULL "
        "GROUP BY dedup_key HAVING c > 1"
    ).fetchall()
    print(f"cross-mirror dedup_key groups (same episode via >1 feed): {len(dups)}")
    miss = conn.execute("SELECT COUNT(*) FROM documents WHERE dedup_key IS NULL").fetchone()[0]
    if miss:
        print(f" ({miss} docs missing dedup_key — run `provenance --backfill-hashes`)")

    if args.backfill_hashes:
        import hashlib
        import os

        from .util import audio_dedup_key

        ndk = nch = 0
        # Read every row up front: the loop UPDATEs `documents`, and modifying a table while
        # a SELECT over it is still being stepped has unspecified results in SQLite.
        docs = conn.execute(
            "SELECT doc_id, kind, title, date, external_id, transcript_path, dedup_key, content_hash FROM documents"
        ).fetchall()
        for r in docs:
            updates: dict = {}
            if not r["dedup_key"]:
                key = (audio_dedup_key(r["title"], r["date"])
                       if r["kind"] in ("podcast", "youtube") else r["external_id"])
                # Only write (and count) a real key; an empty one would not backfill anything.
                if key:
                    updates["dedup_key"] = key
                    ndk += 1
            transcript = r["transcript_path"]
            if not r["content_hash"] and transcript and os.path.exists(transcript):
                # `with` closes the handle promptly instead of leaking one per document
                with open(transcript, "rb") as fh:
                    updates["content_hash"] = hashlib.sha256(fh.read()).hexdigest()
                nch += 1
            if updates:
                # column names come from the fixed keys above; values are bound parameters
                sets = ", ".join(f"{k}=?" for k in updates)
                conn.execute(f"UPDATE documents SET {sets} WHERE doc_id=?", (*updates.values(), r["doc_id"]))
        conn.commit()
        print(f"backfilled {ndk} dedup_keys, {nch} content hashes (audit)")
    return 0
|
||||
|
||||
|
||||
def cmd_db_tables(args: argparse.Namespace) -> int:
    """`db-tables`: print each table/view name of the configured database on its own line.

    Always returns 0.
    """
    config = load_config()
    connection = db.connect(config.db_path)
    names = db.table_names(connection)
    for name in names:
        print(name)
    return 0
|
||||
|
||||
|
||||
def cmd_spark_status(args: argparse.Namespace) -> int:
    """`spark-status`: probe Spark Control and print its status and endpoints.

    Returns 0 when both calls succeed; on any exception prints the failure (with the
    configured URL) to stderr and returns 1.
    """
    from .spark import from_config

    config = load_config()
    spark = from_config(config)
    try:
        print("status:", spark.status())
        print("endpoints:", spark.endpoints())
    except Exception as exc:  # noqa: BLE001 — health probe; surface, don't crash
        print(f"Spark Control unreachable at {config.spark_control_url}: {exc}", file=sys.stderr)
        return 1
    return 0
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level `signal_engine` argument parser.

    Each subcommand stores its handler in `func` via `set_defaults`, so the caller can
    dispatch with `args.func(args)`. A subcommand is required.
    """
    p = argparse.ArgumentParser(prog="signal_engine", description="Ten31 Signal Engine (pilot)")
    sub = p.add_subparsers(dest="command", required=True)

    # --- schema + seeds ---
    sub.add_parser("init-db", help="Create the SQLite schema").set_defaults(func=cmd_init_db)

    sp = sub.add_parser("seed-convictions", help="Load the conviction log (§3.1)")
    sp.add_argument("--file", default=str(DEFAULT_CONVICTION_SEED))
    sp.set_defaults(func=cmd_seed_convictions)

    ss = sub.add_parser("seed-sources", help="Load the source registry (§7.3/§7.4)")
    ss.add_argument("--file", default=str(DEFAULT_SOURCES_SEED))
    ss.set_defaults(func=cmd_seed_sources)

    sde = sub.add_parser("seed-edges", help="Seed EISC connectedness edges (priors) idempotently")
    sde.add_argument("--file", default="seeds/source_edges.bitcoin.seed.yaml")
    sde.set_defaults(func=cmd_seed_edges)

    lf = sub.add_parser("load-feeds", help="Apply resolved/verified podcast feed URLs + backtest reach")
    lf.add_argument("--file", default=str(DEFAULT_FEEDS_SEED))
    lf.set_defaults(func=cmd_load_feeds)

    sf = sub.add_parser("seed-fanout", help="Load the hand-written fan-out tree (§7.1 backtest)")
    sf.add_argument("--file", default="seeds/fanout.K2023.seed.yaml")
    sf.set_defaults(func=cmd_seed_fanout)

    # --- analysis ---
    bt = sub.add_parser("backtest", help="Run the §7.1 under-acted-conviction backtest (as-of march)")
    bt.add_argument("--conviction", default="K2023")
    bt.add_argument("--start", default="2023-01-01")
    bt.add_argument("--end", default="2024-06-01")
    bt.add_argument("--step-days", type=int, default=30)
    bt.add_argument("--window-days", type=int, default=90, help="~quarterly for filings/earnings cadence")
    bt.add_argument("--headline", default="K2023-picks-and-shovels")
    bt.set_defaults(func=cmd_backtest)

    # --- ingestion ---
    ie = sub.add_parser("ingest-edgar", help="Fetch SEC filings for a ticker → documents + extract jobs")
    ie.add_argument("--ticker", required=True)
    ie.add_argument("--since", help="ISO date lower bound, e.g. 2022-01-01")
    ie.add_argument("--until", help="ISO date upper bound, e.g. 2023-12-31")
    ie.add_argument("--forms", help="comma list, default 10-K,10-Q,8-K")
    ie.set_defaults(func=cmd_ingest_edgar)

    idoc = sub.add_parser("ingest-doc", help="Fetch one text doc (HTML/PDF) → document + extract job (Battery corpus)")
    idoc.add_argument("--source", required=True, help="source_id (must exist)")
    idoc.add_argument("--url", required=True)
    idoc.add_argument("--title")
    idoc.add_argument("--date", help="ISO date of the document")
    idoc.add_argument("--method", choices=["auto", "html", "pdf"], default="auto")
    idoc.set_defaults(func=cmd_ingest_doc)

    idm = sub.add_parser("ingest-doc-manifest", help="Batch-ingest a YAML doc manifest (Battery corpus)")
    idm.add_argument("--file", default="seeds/battery_docs.manifest.yaml")
    idm.set_defaults(func=cmd_ingest_doc_manifest)

    ift = sub.add_parser("ingest-feed-text", help="Ingest article bodies behind a text RSS feed (blog/press)")
    ift.add_argument("--source", required=True)
    ift.add_argument("--url", required=True, help="RSS feed URL")
    ift.add_argument("--since")
    ift.add_argument("--until")
    ift.add_argument("--limit", type=int, default=50)
    ift.set_defaults(func=cmd_ingest_feed_text)

    ge = sub.add_parser("ingest-earnings", help="Fetch FMP earnings transcripts → documents + extract jobs")
    ge.add_argument("--ticker", required=True)
    ge.add_argument("--since", help="ISO date lower bound (uses transcript date)")
    ge.add_argument("--until", help="ISO date upper bound")
    ge.add_argument("--limit", type=int, default=8)
    ge.set_defaults(func=cmd_ingest_earnings)

    ts = sub.add_parser("two-sided", help="Two-sided net-corroboration trajectory (Strike/Battery adversarial cases)")
    ts.add_argument("--conviction", default="BATTERY2022")
    ts.add_argument("--nodes", default="", help="comma substrings to filter fan-out nodes, e.g. demand,supply")
    ts.add_argument("--dates", default="2022-12-31,2023-06-30,2023-12-31,2024-06-30,2024-12-31")
    ts.add_argument("--modes", default="live", help="comma list: live,test")
    ts.add_argument("--window-days", type=int, default=365)
    ts.set_defaults(func=cmd_two_sided)

    # --- embedding + search ---
    ec = sub.add_parser("embed-claims", help="Embed pending propositions → Qdrant hybrid collection (§4.3)")
    ec.add_argument("--qdrant-url", default="http://192.168.1.87:6333")
    ec.add_argument("--no-sparse", action="store_true", help="dense-only (skip BM25)")
    ec.set_defaults(func=cmd_embed_claims)

    se = sub.add_parser("search", help="Hybrid search the proposition store via the gateway")
    se.add_argument("--query", required=True)
    se.add_argument("--top-k", type=int, default=8)
    se.add_argument("--no-rerank", action="store_true")
    se.set_defaults(func=cmd_search)

    # --- podcasts + workers ---
    ip = sub.add_parser("ingest-podcast", help="Register podcast episodes → transcribe jobs (RSS or YouTube)")
    ip.add_argument("--source", required=True, help="source_id, e.g. pod-dwarkesh")
    ip.add_argument("--via", choices=["auto", "rss", "youtube"], default="auto")
    ip.add_argument("--since")
    ip.add_argument("--until")
    ip.add_argument("--limit", type=int, default=20)
    ip.set_defaults(func=cmd_ingest_podcast)

    rt = sub.add_parser("run-transcribe", help="Drain 'transcribe' jobs → speaker-attributed transcripts + voiceprints")
    rt.add_argument("--limit", type=int, default=5)
    rt.add_argument("--max-chunks", type=int, default=999)
    rt.set_defaults(func=cmd_run_transcribe)

    rtg = sub.add_parser("run-transcribe-gemini",
                         help="One-time backfill: drain 'transcribe' jobs via Gemini (off the Spark GPU)")
    rtg.add_argument("--limit", type=int, default=5)
    rtg.add_argument("--concurrency", type=int, default=4)
    rtg.set_defaults(func=cmd_run_transcribe_gemini)

    # named `rx`, not `re`, so the local does not shadow the stdlib module name
    rx = sub.add_parser("run-extract", help="Drain 'extract' jobs → claims via the local LLM (§4.2)")
    rx.add_argument("--limit", type=int, default=5, help="max jobs to process this run")
    rx.add_argument("--max-chunks", type=int, default=4, help="max chunks per document")
    rx.set_defaults(func=cmd_run_extract)

    # --- ops ---
    sub.add_parser("queue-status", help="Backfill queue counts by type/state").set_defaults(func=cmd_queue_status)

    fp = sub.add_parser("feed-peek", help="Parse an RSS feed and show episode coverage")
    fp.add_argument("--url", required=True)
    fp.add_argument("--limit", type=int, default=5)
    fp.set_defaults(func=cmd_feed_peek)

    sv = sub.add_parser("serve", help="Run the corpus-management web UI (FastAPI)")
    sv.add_argument("--port", type=int, default=None)
    sv.set_defaults(func=cmd_serve)

    cm = sub.add_parser("confusion-matrix", help="Pre-registered precision/recall on the §7.1 derivatives (resolver)")
    cm.add_argument("--spec", default="seeds/resolution.K2023.yaml")
    cm.set_defaults(func=cmd_confusion)

    pv = sub.add_parser("provenance", help="Processing log: what's ingested/processed (dedup-safe)")
    # The flag fills in both columns; the old help text mentioned only content_hash.
    pv.add_argument("--backfill-hashes", action="store_true",
                    help="backfill missing dedup_key values and content_hash for older documents")
    pv.set_defaults(func=cmd_provenance)

    sub.add_parser("db-tables", help="List tables/views").set_defaults(func=cmd_db_tables)
    sub.add_parser("spark-status", help="Probe Spark Control health").set_defaults(func=cmd_spark_status)
    return p
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, set up logging, dispatch to the chosen subcommand.

    ``argv`` is forwarded unchanged to the parser's ``parse_args``. The return value is whatever
    the selected subcommand handler (stored on the namespace as ``func``) returns.
    """
    parser = build_parser()
    namespace = parser.parse_args(argv)
    # Arguments are parsed before configuration is loaded.
    config = load_config()
    _setup_logging(config.log_level)
    handler = namespace.func
    return handler(namespace)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -0,0 +1,101 @@
|
||||
"""Environment-driven configuration (handoff §10, §13).
|
||||
|
||||
All config flows through env vars so the SAME code runs as a plain process now and, later, as a
|
||||
StartOS s9pk daemon (which injects these via the daemon's `exec.env` from a `store.json` FileModel).
|
||||
A local `.env` (gitignored) is loaded for convenience during the pilot.
|
||||
|
||||
Live values confirmed against the operator's gateway 2026-06-07 (GET /api/status,/api/endpoints):
|
||||
gateway = https://192.168.1.72:62419 (self-signed → SPARK_VERIFY_TLS=false)
|
||||
LLM = RedHatAI/Qwen3.6-35B-A3B-NVFP4
|
||||
embed = BAAI/bge-m3 (1024-d) rerank = BAAI/bge-reranker-v2-m3
|
||||
ASR = nvidia/parakeet-tdt-0.6b-v3 diarizer = nvidia/diar_sortformer_4spk-v1
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _load_dotenv(path: str = ".env") -> None:
    """Minimal .env loader (no dependency): KEY=VALUE lines populate os.environ if not already set.

    Skipped silently: a missing file (or a path that is not a regular file), blank lines,
    ``#`` comment lines, lines without ``=``, and lines whose key is empty. Values already
    present in the environment always win. Surrounding double/single quotes are stripped
    from values. The file is read as UTF-8 regardless of the platform locale.
    """
    p = Path(path)
    if not p.is_file():
        return
    for raw in p.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, val = line.partition("=")
        key = key.strip()
        if not key:
            # A stray "=value" line has no name; os.environ rejects an empty key, and one bad
            # line must not abort configuration loading.
            continue
        os.environ.setdefault(key, val.strip().strip('"').strip("'"))
|
||||
|
||||
|
||||
def _env(key: str, default: str | None = None) -> str | None:
    """Look up ``key`` in the process environment; return ``default`` when it is not set.

    A variable that is set to the empty string is returned as ``""`` (not replaced by the default).
    """
    if key in os.environ:
        return os.environ[key]
    return default
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Config:
    """Immutable runtime configuration; build it with ``Config.from_env()``.

    Every field is sourced from an environment variable (optionally pre-populated from a local
    ``.env`` file), with the defaults spelled out in ``from_env``.
    """

    spark_control_url: str
    spark_verify_tls: bool
    spark_timeout_s: float
    audio_concurrency: int  # global in-flight cap across BOTH parakeet audio endpoints (sit at 2, ceiling 3)

    local_llm_model: str
    embed_model: str
    transcribe_model: str

    anthropic_api_key: str | None
    frontier_model: str

    # Extraction backend: 'local' (Qwen via Spark Control, default) | 'gemini' (batch overflow/fallback, §scaling)
    extraction_backend: str
    gemini_api_key: str | None
    gemini_model: str

    fmp_api_key: str | None
    edgar_user_agent: str

    data_dir: Path
    database_url: str
    audio_cache_dir: Path

    ui_port: int
    log_level: str

    @classmethod
    def from_env(cls) -> "Config":
        """Read configuration from the environment (after loading ``.env`` if present).

        An environment variable that is set but empty is treated like an unset one for every
        field that has a default. Raises ``ValueError`` if a numeric variable
        (``SPARK_TIMEOUT_S``, ``AUDIO_CONCURRENCY``, ``UI_PORT``) does not parse.
        """
        _load_dotenv()
        data_dir = Path(_env("DATA_DIR", "./data") or "./data")
        # The audio cache lives under data_dir by default — including when AUDIO_CACHE_DIR is
        # set but empty (previously that case fell back to a CWD-relative "audio-cache").
        default_audio_cache = str(data_dir / "audio-cache")
        # Accept the usual truthy spellings so SPARK_VERIFY_TLS=1 does not silently leave
        # certificate verification switched off.
        verify_tls = (_env("SPARK_VERIFY_TLS", "false") or "false").strip().lower()
        return cls(
            spark_control_url=_env("SPARK_CONTROL_URL", "https://192.168.1.72:62419") or "",
            spark_verify_tls=verify_tls in ("1", "true", "yes", "on"),
            spark_timeout_s=float(_env("SPARK_TIMEOUT_S", "180") or "180"),
            # clamped to the 1..3 range
            audio_concurrency=min(3, max(1, int(_env("AUDIO_CONCURRENCY", "2") or "2"))),
            local_llm_model=_env("LOCAL_LLM_MODEL", "RedHatAI/Qwen3.6-35B-A3B-NVFP4") or "",
            embed_model=_env("EMBED_MODEL", "BAAI/bge-m3") or "",
            transcribe_model=_env("TRANSCRIBE_MODEL", "nvidia/parakeet-tdt-0.6b-v3") or "",
            anthropic_api_key=_env("ANTHROPIC_API_KEY"),
            frontier_model=_env("FRONTIER_MODEL", "claude-opus-4-8") or "",
            extraction_backend=_env("EXTRACTION_BACKEND", "local") or "local",
            gemini_api_key=_env("GEMINI_API_KEY"),
            gemini_model=_env("GEMINI_MODEL", "gemini-2.5-flash") or "",
            fmp_api_key=_env("FMP_API_KEY"),
            edgar_user_agent=_env("EDGAR_USER_AGENT", "Ten31 Research grant@ten31.xyz") or "",
            data_dir=data_dir,
            database_url=_env("DATABASE_URL", "") or "",
            audio_cache_dir=Path(_env("AUDIO_CACHE_DIR", default_audio_cache) or default_audio_cache),
            ui_port=int(_env("UI_PORT", "8000") or "8000"),
            log_level=_env("LOG_LEVEL", "INFO") or "INFO",
        )

    @property
    def db_path(self) -> Path:
        """Filesystem path of the SQLite database.

        Taken from ``database_url`` when it starts with ``sqlite:///`` (everything after the
        prefix), otherwise ``<data_dir>/signal.db``.
        """
        prefix = "sqlite:///"
        if self.database_url.startswith(prefix):
            return Path(self.database_url[len(prefix):])
        return self.data_dir / "signal.db"
|
||||
|
||||
|
||||
def load_config() -> Config:
    """Return a fresh ``Config`` built from the current environment (wrapper over ``Config.from_env``)."""
    config = Config.from_env()
    return config
|
||||
@@ -0,0 +1,6 @@
|
||||
"""Embedding + vector storage (§4.3).
|
||||
|
||||
Embed DISTILLED PROPOSITIONS (not raw chunks) into a Qdrant HYBRID collection: dense bge-m3
|
||||
(via the gateway) + BM25 sparse (client-side), so entity-heavy propositions (MSTR/Strategy/
|
||||
Microstrategy) match on the lexical leg too. Retrieval goes through the gateway's /api/search.
|
||||
"""
|
||||
@@ -0,0 +1,36 @@
|
||||
"""Proposition embedding: dense (bge-m3 via gateway) + optional BM25 sparse (client-side)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def dense_embed(sc, texts: list[str]) -> list[list[float]]:
    """Return one dense embedding per text, obtained from ``sc.embed`` (bge-m3 via the gateway, §4.3).

    The response's ``data`` items are re-ordered by their ``index`` field (missing index counts
    as 0) before the ``embedding`` vectors are extracted, so the output lines up with ``texts``.
    """
    response = sc.embed(texts)
    ordered = list(response["data"])
    ordered.sort(key=lambda item: item.get("index", 0))
    vectors = []
    for item in ordered:
        vectors.append(item["embedding"])
    return vectors
|
||||
|
||||
|
||||
class SparseEmbedder:
    """BM25 sparse vectors via FastEmbed `Qdrant/bm25` (the operator's CRM uses this exact model,
    with the collection's `modifier: idf`). Degrades gracefully to dense-only if fastembed is absent."""

    def __init__(self, model_name: str = "Qdrant/bm25") -> None:
        """Try to load the FastEmbed sparse model; on any failure log a warning and stay unavailable."""
        self._model = None
        self.available = False
        try:
            from fastembed import SparseTextEmbedding
            self._model = SparseTextEmbedding(model_name=model_name)
        except Exception as e:  # noqa: BLE001
            log.warning("fastembed sparse unavailable (%s) — upserting dense-only; add sparse later", e)
        else:
            self.available = True

    def embed(self, texts: list[str]) -> list[dict | None]:
        """Return one ``{"indices": [...], "values": [...]}`` dict per text.

        When the model is unavailable, returns a list of ``None`` of the same length instead.
        """
        model = self._model
        if model is None or not self.available:
            return [None] * len(texts)
        return [
            {"indices": vec.indices.tolist(), "values": vec.values.tolist()}
            for vec in model.embed(texts)
        ]
|
||||
@@ -0,0 +1,79 @@
|
||||
"""Qdrant hybrid collection: create + upsert distilled propositions (§4.3).
|
||||
|
||||
Collection mgmt + upserts go DIRECT to Qdrant (§13.2 "(Qdrant direct) :6333"); retrieval goes
|
||||
through the gateway's /api/search. Named dense vector `bge_m3` (1024-d cosine) + sparse `bm25`
|
||||
(modifier IDF). Point id is a deterministic UUID5 of claim_id, so re-upsert is idempotent.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
import uuid
|
||||
|
||||
from qdrant_client import QdrantClient, models
|
||||
|
||||
from .embedder import SparseEmbedder, dense_embed
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
COLLECTION = "propositions"
|
||||
DENSE = "bge_m3"
|
||||
SPARSE = "bm25"
|
||||
_NS = uuid.UUID("5f9b7e10-0000-4000-8000-000000000001")
|
||||
|
||||
# Filterable payload (§4.3): stance/topic/cluster/date for stance distributions, time-windowed
|
||||
# consensus, corroboration lookups. NEVER infer stance from vector distance (§2.2/§5.3).
|
||||
_PAYLOAD_FIELDS = (
|
||||
"claim_id", "doc_id", "source_id", "source_cluster", "topic_canonical", "date",
|
||||
"claim_type", "time_horizon", "confidence", "rel_polarity", "engages_consensus",
|
||||
"counters_position", "thesis_seam", "salience", "claimant", "proposition",
|
||||
)
|
||||
|
||||
|
||||
def get_client(qdrant_url: str) -> QdrantClient:
    """Construct a ``QdrantClient`` for ``qdrant_url`` with ``prefer_grpc=False`` and ``timeout=60``."""
    options = {"url": qdrant_url, "prefer_grpc": False, "timeout": 60}
    return QdrantClient(**options)
|
||||
|
||||
|
||||
def ensure_collection(client: QdrantClient, *, dim: int = 1024) -> bool:
    """Create the hybrid propositions collection when it is not present yet.

    The collection gets one named dense vector (``DENSE``, cosine, ``dim`` dimensions) and one
    named sparse vector (``SPARSE``, IDF modifier). Returns True if this call created the
    collection, False if a collection of that name already existed.
    """
    existing = {c.name for c in client.get_collections().collections}
    if COLLECTION not in existing:
        dense_cfg = {DENSE: models.VectorParams(size=dim, distance=models.Distance.COSINE)}
        sparse_cfg = {SPARSE: models.SparseVectorParams(modifier=models.Modifier.IDF)}
        client.create_collection(
            collection_name=COLLECTION,
            vectors_config=dense_cfg,
            sparse_vectors_config=sparse_cfg,
        )
        log.info("created Qdrant collection %r (dense %s %dd + sparse %s/idf)", COLLECTION, DENSE, dim, SPARSE)
        return True
    return False
|
||||
|
||||
|
||||
def _point_id(claim_id: str) -> str:
    """Deterministic point id: the UUID5 of ``claim_id`` in the module namespace ``_NS``, as a string."""
    point_uuid = uuid.uuid5(_NS, claim_id)
    return str(point_uuid)
|
||||
|
||||
|
||||
def upsert_pending(conn: sqlite3.Connection, sc, client: QdrantClient,
                   sparse: SparseEmbedder | None = None, *, batch: int = 64) -> int:
    """Embed + upsert every claim that has no qdrant_point_id yet; back-link the id into SQLite.

    Claims are processed in batches of ``batch`` rows. For each batch the propositions are
    embedded (dense always, sparse when ``sparse`` is given and yields vectors), upserted into
    ``COLLECTION``, and only then marked in SQLite, with one commit per batch.

    Rows are accessed by column name, so ``conn`` is assumed to use a mapping-style row factory
    (e.g. ``sqlite3.Row``) — NOTE(review): confirm against the connection helper.

    Returns:
        The number of claims upserted.

    Raises:
        ValueError: if ``batch`` is smaller than 1.
        RuntimeError: if an embedder returns a different number of vectors than texts sent.
            Without this check ``zip`` would silently drop the tail of the batch while every
            row was still marked as upserted.
    """
    if batch < 1:
        raise ValueError(f"batch must be >= 1, got {batch}")
    rows = conn.execute("SELECT * FROM claims WHERE qdrant_point_id IS NULL").fetchall()
    if not rows:
        return 0
    total = 0
    for i in range(0, len(rows), batch):
        chunk = rows[i:i + batch]
        texts = [r["proposition"] for r in chunk]
        dvecs = dense_embed(sc, texts)
        svecs = sparse.embed(texts) if sparse else [None] * len(texts)
        if len(dvecs) != len(chunk) or len(svecs) != len(chunk):
            raise RuntimeError(
                f"embedding count mismatch: {len(chunk)} claims, "
                f"{len(dvecs)} dense / {len(svecs)} sparse vectors"
            )
        ids = [_point_id(r["claim_id"]) for r in chunk]
        points = []
        for r, pid, dv, sv in zip(chunk, ids, dvecs, svecs):
            vectors: dict = {DENSE: dv}
            if sv is not None:
                vectors[SPARSE] = models.SparseVector(indices=sv["indices"], values=sv["values"])
            payload = {f: r[f] for f in _PAYLOAD_FIELDS}
            points.append(models.PointStruct(id=pid, vector=vectors, payload=payload))
        client.upsert(collection_name=COLLECTION, points=points)
        # Back-link only after the upsert call returned, so a failed upsert leaves rows pending.
        conn.executemany(
            "UPDATE claims SET qdrant_point_id=? WHERE claim_id=?",
            [(pid, r["claim_id"]) for r, pid in zip(chunk, ids)],
        )
        conn.commit()
        total += len(chunk)
    return total
|
||||
@@ -0,0 +1,6 @@
|
||||
"""Extraction (§4.2) — local LLM → structured claim units. The cost & quality center.
|
||||
|
||||
Emits at the level of the PROPOSITION: a passage may yield 0..N claims, and MOST passages yield
|
||||
zero. An extractor that dutifully emits a claim per chunk reintroduces exactly the noise the rest
|
||||
of the system is designed to remove.
|
||||
"""
|
||||
@@ -0,0 +1,64 @@
|
||||
"""Pluggable extraction backends (§scaling).
|
||||
|
||||
The §4.2 extractor calls a backend that turns chat messages into a JSON string. Default is the
|
||||
LOCAL Qwen via Spark Control (the ~95%-local design). The Gemini backend is the documented
|
||||
overflow/fallback for bulk back-cataloging at scale, or if the Sparks are unavailable — used for
|
||||
the PUBLIC corpus only, never conviction/exposure data (sovereignty boundary, §4.6).
|
||||
|
||||
A backend exposes: complete_json(messages, max_tokens) -> str (a JSON object string).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LocalQwenBackend:
    """Extraction backend that sends the chat messages to the local LLM through the ``sc`` client."""

    name = "local"

    def __init__(self, sc) -> None:
        self.sc = sc

    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
        """Run one chat completion and return the first choice's message content.

        The request asks for a JSON object, uses temperature 0 and disables thinking.
        """
        request = {
            "json_object": True,
            "temperature": 0,
            "enable_thinking": False,
            "max_tokens": max_tokens,
        }
        response = self.sc.chat(messages, **request)
        first_choice = response["choices"][0]
        return first_choice["message"]["content"]
|
||||
|
||||
|
||||
class GeminiBackend:
    """Gemini fallback/overflow. Implemented against the `google-genai` SDK. NOTE: untested until a
    key is provided — validate end-to-end before relying on it for a real backfill. The async BATCH
    API is the eventual scale path; this synchronous form is the drop-in fallback."""

    name = "gemini"

    def __init__(self, api_key: str, model: str = "gemini-2.5-flash") -> None:
        from google import genai  # guarded import; pip install google-genai

        self._genai = genai
        self.client = genai.Client(api_key=api_key)
        self.model = model

    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
        """Send the messages as one generate_content call and return the response text.

        System-role messages are joined into the system instruction; all other messages are
        joined (blank-line separated) into the user content. Falls back to ``"{}"`` when the
        response has no text.
        """
        from google.genai import types

        separator = "\n\n"
        system = separator.join(m["content"] for m in messages if m["role"] == "system")
        user = separator.join(m["content"] for m in messages if m["role"] != "system")
        generation_config = types.GenerateContentConfig(
            system_instruction=system or None,
            temperature=0,
            max_output_tokens=max_tokens,
            response_mime_type="application/json",
        )
        resp = self.client.models.generate_content(
            model=self.model,
            contents=user,
            config=generation_config,
        )
        text = resp.text
        return text if text else "{}"
|
||||
|
||||
|
||||
def from_config(cfg, sc) -> "LocalQwenBackend | GeminiBackend":
    """Select the extraction backend named by ``cfg.extraction_backend``.

    ``"gemini"`` with an API key yields a ``GeminiBackend``; ``"gemini"`` without a key logs a
    warning and falls back to the local backend; anything else yields the local backend.
    """
    wants_gemini = cfg.extraction_backend == "gemini"
    if wants_gemini and cfg.gemini_api_key:
        return GeminiBackend(cfg.gemini_api_key, cfg.gemini_model)
    if wants_gemini:
        log.warning("EXTRACTION_BACKEND=gemini but GEMINI_API_KEY missing — falling back to local")
    return LocalQwenBackend(sc)
|
||||
@@ -0,0 +1,117 @@
|
||||
"""Claim extraction: text → 0..N claim units → SQLite (§4.2)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
from typing import Any
|
||||
|
||||
from .prompt import SEED_TOPICS, build_messages
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
_ENUMS = {
|
||||
"claim_type": {"interpretive", "predictive", "descriptive", "reactive"},
|
||||
"time_horizon": {"near", "medium", "long", "unspecified"},
|
||||
"confidence": {"low", "med", "high"},
|
||||
"thesis_seam": {"energy_compute", "debasement_bitcoin", "ai_data_ownership", "none"},
|
||||
"salience": {"central", "secondary", "aside"},
|
||||
}
|
||||
|
||||
|
||||
def register_seed_topics(conn: sqlite3.Connection) -> None:
    """Pre-load the controlled half of the hybrid topic vocabulary (§4.2).

    Every seed topic is upserted with status 'controlled' (an existing row of the same name is
    switched to 'controlled'); the connection is committed afterwards.
    """
    upsert_sql = (
        "INSERT INTO topics (topic_canonical, status) VALUES (?, 'controlled') "
        "ON CONFLICT(topic_canonical) DO UPDATE SET status='controlled'"
    )
    conn.executemany(upsert_sql, [(topic,) for topic in SEED_TOPICS])
    conn.commit()
|
||||
|
||||
|
||||
def _split_oversize(para: str, max_chars: int) -> list[str]:
    """Cut one over-long paragraph into pieces of at most ``max_chars``, preferring whitespace breaks."""
    pieces: list[str] = []
    rest = para
    while len(rest) > max_chars:
        window_end = max_chars + 1
        # Last newline/space that still leaves the piece within the limit; hard cut if none.
        cut = max(rest.rfind("\n", 0, window_end), rest.rfind(" ", 0, window_end))
        if cut <= 0:
            cut = max_chars
        piece = rest[:cut].rstrip()
        if piece:
            pieces.append(piece)
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def chunk_text(text: str, max_chars: int) -> list[str]:
    """Split on paragraph boundaries into windows that fit the model context alongside the prompt.

    Paragraphs (separated by a blank line) are packed greedily into chunks of at most
    ``max_chars`` characters. A single paragraph longer than ``max_chars`` — e.g. a transcript
    with no blank lines at all — is itself cut at whitespace (or hard-cut when it has none), so
    no returned chunk exceeds ``max_chars``.

    Returns an empty list for empty/whitespace-only text.

    Raises:
        ValueError: if ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    text = text.strip()
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]
    chunks: list[str] = []
    cur: list[str] = []
    size = 0  # length of "\n\n".join(cur) plus one trailing separator
    for para in text.split("\n\n"):
        if len(para) > max_chars:
            if cur:
                chunks.append("\n\n".join(cur))
            pieces = _split_oversize(para, max_chars)
            chunks.extend(pieces[:-1])
            # The (possibly short) tail seeds the next window instead of becoming a tiny chunk.
            cur = pieces[-1:]
            size = len(cur[0]) + 2 if cur else 0
            continue
        if size + len(para) > max_chars and cur:
            chunks.append("\n\n".join(cur))
            cur, size = [], 0
        cur.append(para)
        size += len(para) + 2
    if cur:
        chunks.append("\n\n".join(cur))
    return chunks
|
||||
|
||||
|
||||
def _parse_claims(content: str) -> list[dict]:
    """Parse the model's reply into a list of claim dicts; never raises on malformed output.

    The reply is parsed as JSON; if that fails, the span from the first ``{`` to the last ``}``
    is tried instead (tolerates prose or code fences around the object). Only dict entries of
    the top-level ``"claims"`` list that carry a non-empty ``"proposition"`` are kept.

    Returns ``[]`` for non-string content (e.g. ``None``), unparseable text, a non-object
    top level, or a ``"claims"`` value that is missing, ``null`` or not a list.
    """
    if not isinstance(content, str):
        return []
    try:
        obj = json.loads(content)
    except (ValueError, RecursionError):
        start, end = content.find("{"), content.rfind("}")
        if start < 0 or end < start:
            return []
        try:
            obj = json.loads(content[start:end + 1])
        except (ValueError, RecursionError):
            return []
    claims = obj.get("claims") if isinstance(obj, dict) else None
    if not isinstance(claims, list):
        return []
    return [c for c in claims if isinstance(c, dict) and c.get("proposition")]
|
||||
|
||||
|
||||
def extract_claims_from_text(backend, text: str, *, source_name: str, source_cluster: str | None,
                             date: str | None, kind: str) -> list[dict]:
    """Run the extraction prompt over ``text`` and return the parsed claim dicts.

    `backend` is any object with .complete_json(messages, max_tokens) -> str
    (see extract.backends: LocalQwenBackend | GeminiBackend). The reply is requested with
    ``max_tokens=4000`` and handed to ``_parse_claims``.
    """
    context = {
        "source_name": source_name,
        "source_cluster": source_cluster,
        "date": date,
        "kind": kind,
    }
    prompt_messages = build_messages(text, **context)
    raw_reply = backend.complete_json(prompt_messages, max_tokens=4000)
    return _parse_claims(raw_reply)
|
||||
|
||||
|
||||
def _enum(c: dict, field: str, default: str) -> str:
    """Return ``c[field]`` if it is one of the allowed values for ``field``, else ``default``.

    The value is compared after trimming whitespace and lower-casing, so ``" High "`` is
    accepted as ``"high"``. Non-string values (``None``, lists, dicts, numbers) yield
    ``default`` — an unhashable value would otherwise raise on the set-membership test.

    Raises ``KeyError`` if ``field`` has no entry in ``_ENUMS``.
    """
    allowed = _ENUMS[field]
    v = c.get(field)
    if not isinstance(v, str):
        return default
    v = v.strip().lower()
    return v if v in allowed else default
|
||||
|
||||
|
||||
def _bindable(value):
    """Pass str/int/float through for SQLite binding; map anything else (list, dict, None) to None."""
    return value if isinstance(value, (str, int, float)) else None


def persist_claims(conn: sqlite3.Connection, *, doc: sqlite3.Row, source: sqlite3.Row | None,
                   claims: list[dict], chunk_idx: int) -> int:
    """Insert the claims extracted from one chunk of ``doc`` and commit.

    The claim id is ``"<doc_id>:<chunk_idx>:<position in claims>"`` and rows are written with
    ``INSERT OR IGNORE``, so re-running the same chunk does not duplicate claims. A claim's
    topic is registered in ``topics`` (status 'emergent') before the claim row is written.
    Enum-like fields go through ``_enum``; ``rel_polarity`` is always stored as ``"none"``;
    the proposition is truncated to 1000 characters. Free-form fields supplied by the model as
    a list/dict are stored as NULL rather than failing the parameter binding.

    Returns:
        The number of claim rows actually inserted (ignored duplicates are not counted).
    """
    n = 0
    cluster = source["source_cluster"] if source else None
    fallback_claimant = source["name"] if source else None
    for i, c in enumerate(claims):
        seam = _enum(c, "thesis_seam", "none")
        topic = _bindable(c.get("topic_canonical")) or None
        if topic:
            # register emergent topics BEFORE the claim (claims.topic_canonical is a FK → topics)
            conn.execute(
                "INSERT OR IGNORE INTO topics (topic_canonical, status, seam) VALUES (?, 'emergent', ?)",
                (topic, seam),
            )
        claim_id = f"{doc['doc_id']}:{chunk_idx}:{i}"
        cur = conn.execute(
            """INSERT OR IGNORE INTO claims
            (claim_id, doc_id, source_id, proposition, topic_canonical, topic_raw, claimant,
            source_cluster, date, claim_type, time_horizon, confidence, rel_polarity,
            engages_consensus, counters_position, thesis_seam, salience)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
            (
                claim_id, doc["doc_id"], doc["source_id"], str(c["proposition"])[:1000],
                topic, _bindable(c.get("topic_raw")),
                _bindable(c.get("claimant")) or fallback_claimant,
                cluster, doc["date"],
                _enum(c, "claim_type", "descriptive"), _enum(c, "time_horizon", "unspecified"),
                _enum(c, "confidence", "med"), "none",
                1 if c.get("engages_consensus") else 0, _bindable(c.get("counters_position")),
                seam, _enum(c, "salience", "secondary"),
            ),
        )
        # rowcount is 0 when OR IGNORE skipped an already-present claim_id.
        if cur.rowcount > 0:
            n += 1
    conn.commit()
    return n
|
||||
@@ -0,0 +1,47 @@
|
||||
"""SEC filing HTML → plain text. Stdlib only (boring, inspectable).
|
||||
|
||||
Drops script/style/head and inline-XBRL hidden blocks (10-Ks embed a huge <ix:hidden> section of
|
||||
numeric facts that would otherwise swamp the extractor), and collapses whitespace.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
|
||||
_SKIP_TAGS = {"script", "style", "head"}
|
||||
_SKIP_PREFIXES = ("ix:hidden",) # inline-XBRL hidden fact dump
|
||||
_BLOCK_TAGS = {"p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6", "table"}
|
||||
|
||||
|
||||
class _Stripper(HTMLParser):
    """HTML → text collector: keeps visible text, drops skipped subtrees, marks block boundaries.

    Text inside ``_SKIP_TAGS`` / ``_SKIP_PREFIXES`` elements is discarded (tracked with a nesting
    counter); every start or end of a ``_BLOCK_TAGS`` element contributes a newline. The
    collected fragments accumulate in ``_parts``.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._skip_depth = 0
        self._parts: list[str] = []

    def handle_starttag(self, tag: str, attrs) -> None:
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            self._skip_depth += 1
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            # clamp at 0 so a stray closing tag cannot unbalance the counter
            self._skip_depth = max(0, self._skip_depth - 1)
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        if self._skip_depth:
            return
        if data.strip():
            self._parts.append(data)
        elif data:
            # Whitespace-only text between inline elements (`<b>Total</b> <i>revenue</i>`) is a
            # word separator; dropping it glued the neighbours together. One space is enough —
            # runs are collapsed by the caller.
            self._parts.append(" ")
|
||||
|
||||
|
||||
def html_to_text(html: str, *, max_chars: int = 300_000) -> str:
    """Convert an HTML document to plain text, truncated to ``max_chars`` characters.

    Markup is stripped with ``_Stripper``; then runs of spaces, tabs and non-breaking spaces
    collapse to one space, three or more line breaks (with only whitespace between them)
    collapse to a single blank line, every line is trimmed, and the whole text is trimmed.
    """
    p = _Stripper()
    p.feed(html)
    # Flush anything the parser is still buffering (e.g. trailing text after an unterminated
    # "&"); without close() that tail never reaches handle_data.
    p.close()
    text = "".join(p._parts)
    # \xa0 is spelled as an escape: a literal NBSP in the class is indistinguishable from a space.
    text = re.sub(r"[ \t\xa0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines())
    text = text.strip()
    return text[:max_chars]
|
||||
@@ -0,0 +1,72 @@
|
||||
"""The §4.2 claim-extraction prompt. Prompt engineering is ours (§13.3); the schema is finalized.
|
||||
|
||||
Discipline encoded here (the whole point of the system, §2/§4.2):
|
||||
- Extract at the level of the PROPOSITION; emit ZERO when there is no substantive claim.
|
||||
- Separate topic from stance: capture stance-vs-consensus explicitly, never as a bull/bear label.
|
||||
- thesis_seam is a TAG, not a filter — off-thesis and anti-thesis claims are still extracted.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
# Hybrid topic vocabulary (§4.2): a small SEEDED controlled list. The model reuses one when it
|
||||
# fits and proposes a concise snake_case topic otherwise; emergent topics are merged on a schedule.
|
||||
SEED_TOPICS = [
|
||||
# energy <-> compute
|
||||
"ai_compute_demand", "ai_power_constraint", "datacenter_buildout", "grid_interconnect",
|
||||
"transformers_equipment", "nuclear_power", "natural_gas_power", "uranium_supply",
|
||||
"cooling_infrastructure", "miner_flexible_load", "mining_ai_pivot",
|
||||
# debasement <-> bitcoin
|
||||
"bitcoin_reserve_asset", "bitcoin_collateral_credit", "bitcoin_treasury_strategy",
|
||||
"btc_custody_regulation", "sovereign_bitcoin_adoption",
|
||||
# ai <-> data ownership
|
||||
"ai_data_ownership", "confidential_inference", "ai_commoditization",
|
||||
# macro
|
||||
"fed_policy", "fiscal_debasement", "stablecoins_cbdc",
|
||||
]
|
||||
|
||||
_SYSTEM = """You are the claim-extraction component of an investment signal engine. You read a passage \
|
||||
(an SEC filing excerpt or a podcast/earnings-call transcript) and extract structured CLAIM UNITS.
|
||||
|
||||
A CLAIM UNIT is a single normalized proposition that someone asserts — a forward-looking prediction, \
|
||||
an interpretive or causal judgment, or a stance taken against a prevailing view. It must be specific \
|
||||
enough to later be checked against the world.
|
||||
|
||||
CRITICAL DISCIPLINE — be willing to extract NOTHING:
|
||||
- Most passages contain ZERO claim units. Boilerplate, legal disclaimers, ad reads, pleasantries, \
|
||||
generic descriptions, routine financial line-items, and recitations of well-known news are NOT claims.
|
||||
- Do NOT invent claims. Do NOT emit one claim per paragraph to seem thorough. If the passage has no \
|
||||
substantive proposition, return {"claims": []}. A precise empty answer is the correct, valued output.
|
||||
- Extract at the level of the PROPOSITION: one normalized subject-assertion-object sentence each. A \
|
||||
single rich passage may yield several; a long dull one yields none.
|
||||
|
||||
For EACH claim unit, output these fields:
|
||||
- "proposition": one normalized sentence (subject-assertion-object), self-contained.
|
||||
- "topic_canonical": a concise snake_case topic for clustering. REUSE one of the provided seed topics \
|
||||
when it fits; otherwise propose a new concise snake_case label. Normalize synonyms (Fed/FOMC/rates → fed_policy).
|
||||
- "topic_raw": the topic as actually phrased in the passage.
|
||||
- "claimant": who asserts it (speaker name or the filing company). Use "unknown" if unclear.
|
||||
- "claim_type": one of interpretive | predictive | descriptive | reactive. (interpretive/predictive = \
|
||||
insight; descriptive/reactive = news echo — extract those only if clearly salient.)
|
||||
- "time_horizon": one of near | medium | long | unspecified (for predictive claims especially).
|
||||
- "confidence": the claimant's apparent conviction — one of low | med | high.
|
||||
- "engages_consensus": true ONLY if the claim explicitly argues against a stated mainstream view.
|
||||
- "counters_position": the mainstream position it argues against, or null.
|
||||
- "thesis_seam": one of energy_compute | debasement_bitcoin | ai_data_ownership | none. This is a TAG \
|
||||
for relevance only — tag off-thesis claims "none" and STILL extract them.
|
||||
- "salience": central | secondary | aside (how central the claim is to the passage).
|
||||
|
||||
Return ONLY a JSON object: {"claims": [ {...}, ... ]}. No prose, no markdown."""
|
||||
|
||||
|
||||
def build_messages(text: str, *, source_name: str, source_cluster: str | None,
                   date: str | None, kind: str) -> list[dict[str, str]]:
    """Assemble the two-message chat prompt for claim extraction.

    The system message is the fixed ``_SYSTEM`` prompt. The user message carries a one-line
    source header (falsy name → 'unknown', falsy cluster/date → 'n/a'), the seed-topic list,
    and the passage itself.
    """
    header = (
        f"Source: {source_name or 'unknown'} (cluster: {source_cluster or 'n/a'}, type: {kind}, "
        f"date: {date or 'n/a'}).\n"
    )
    seed_line = f"Seed topics to reuse when they fit: {', '.join(SEED_TOPICS)}.\n\n"
    passage = f"PASSAGE:\n{text}"
    system_msg = {"role": "system", "content": _SYSTEM}
    user_msg = {"role": "user", "content": header + seed_line + passage}
    return [system_msg, user_msg]
|
||||
@@ -0,0 +1,69 @@
|
||||
"""Extraction worker — drains 'extract' jobs from the backfill queue (§4.2, §13.4).
|
||||
|
||||
Single sequential worker by design: extraction is the heavier serial load on the one LLM GPU.
|
||||
For each job: load the document, get its text (fetch+strip filing HTML, or read a stored transcript),
|
||||
chunk it, run the §4.2 extractor per chunk, persist 0..N claims, complete the job.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from ..backfill import queue
|
||||
from . import claims as claims_mod
|
||||
from .html_text import html_to_text
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _document_text(doc, *, user_agent: str) -> str:
    """Return the raw text for a document row.

    A stored ``transcript_path`` wins and is read from disk. Otherwise, a document of kind
    ``"filing"`` with a URL is downloaded (sending ``user_agent``) and converted with
    ``html_to_text``. Any other document has no text source and raises ``ValueError``; HTTP
    errors propagate from ``raise_for_status``.
    """
    stored_path = doc["transcript_path"]
    if stored_path:
        return Path(stored_path).read_text()
    if not (doc["kind"] == "filing" and doc["url"]):
        raise ValueError(f"no text source for {doc['doc_id']} (kind={doc['kind']}, url={doc['url']})")
    response = requests.get(doc["url"], headers={"User-Agent": user_agent}, timeout=90)
    response.raise_for_status()
    return html_to_text(response.text)
|
||||
|
||||
|
||||
def run_extract(conn, sc, cfg, *, limit: int = 10, max_chunks_per_doc: int = 4,
                chunk_chars: int = 18_000, lease_seconds: int = 900,
                worker_id: str = "extract-1") -> dict:
    """Drain up to ``limit`` 'extract' jobs from the backfill queue, one at a time.

    For each leased job: load the target document, obtain its text, split it into chunks of
    ``chunk_chars`` characters, run the extractor on the first ``max_chunks_per_doc`` chunks,
    persist the claims, stamp ``documents.processed_at`` and complete the job. A job whose
    document row is missing is skipped; any exception while processing marks the job failed
    via ``queue.fail`` and the loop moves on.

    Returns:
        ``{"jobs_processed": <jobs leased, including skipped/failed>, "claims_written": <sum of
        persist_claims results for completed jobs>}``.
    """
    # Imported here rather than at module top (kept as in the original).
    from .backends import from_config as backend_from_config
    backend = backend_from_config(cfg, sc)
    log.info("extraction backend: %s", backend.name)
    claims_mod.register_seed_topics(conn)
    processed = total_claims = 0
    while processed < limit:
        job = queue.lease_next(conn, worker_id=worker_id, job_types=["extract"], lease_seconds=lease_seconds)
        if job is None:
            # queue has no leasable 'extract' job left
            break
        # Counted as soon as it is leased, so skipped and failed jobs also use up `limit`.
        processed += 1
        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
        if doc is None:
            queue.skip(conn, job["job_id"], "document missing")
            continue
        # `src` may be None; the calls below fall back to ""/None for name and cluster.
        src = conn.execute("SELECT * FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
        try:
            text = _document_text(doc, user_agent=cfg.edgar_user_agent)
            # Only the first max_chunks_per_doc chunks are extracted; later text is not processed.
            chunks = claims_mod.chunk_text(text, chunk_chars)[:max_chunks_per_doc]
            doc_claims = 0
            for idx, chunk in enumerate(chunks):
                cl = claims_mod.extract_claims_from_text(
                    backend, chunk,
                    source_name=src["name"] if src else "",
                    source_cluster=src["source_cluster"] if src else None,
                    date=doc["date"], kind=doc["kind"],
                )
                doc_claims += claims_mod.persist_claims(conn, doc=doc, source=src, claims=cl, chunk_idx=idx)
            conn.execute("UPDATE documents SET processed_at=datetime('now') WHERE doc_id=?", (doc["doc_id"],))
            conn.commit()
            queue.complete(conn, job["job_id"], output_ref=f"{doc_claims} claims / {len(chunks)} chunks")
            total_claims += doc_claims
            log.info("extracted %d claims from %s (%d chunks)", doc_claims, doc["doc_id"], len(chunks))
        except Exception as e:  # noqa: BLE001
            # Best-effort worker: record the failure on the job and keep draining.
            state = queue.fail(conn, job["job_id"], e)
            log.warning("extract failed for %s: %s (→ %s)", job["target_id"], e, state)
    return {"jobs_processed": processed, "claims_written": total_claims}
|
||||
@@ -0,0 +1,5 @@
|
||||
"""Ingestion layer (§4.1) — the biggest greenfield piece.
|
||||
|
||||
Spark Control transcribes audio you hand it; it does NOT fetch. Everything here is fetch/schedule:
|
||||
RSS + YouTube + EDGAR + FMP earnings, long-audio chunking, and cross-chunk speaker stitching.
|
||||
"""
|
||||
@@ -0,0 +1,36 @@
|
||||
"""Long-audio chunking (§4.1, §13.4).
|
||||
|
||||
Podcasts run 1–3 h; the diarizer caps at 4 speakers/chunk and Spark 2 is a single GPU, so we cut
|
||||
long audio into ~2–3 min pieces sent SEQUENTIALLY (parallel audio → 503 FFT race). Each chunk is
|
||||
diarized independently and re-stitched across chunks by voiceprint (see speaker_stitch.py).
|
||||
Requires ffmpeg/ffprobe.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
CHUNK_SECONDS_DEFAULT = 150 # 2.5 min, within the ~2–3 min guidance
|
||||
|
||||
|
||||
def duration_seconds(src: str | Path) -> float:
    """Return the duration of ``src`` in seconds as reported by ``ffprobe`` (format=duration).

    Raises ``subprocess.CalledProcessError`` when ffprobe exits non-zero and ``ValueError``
    when its output is not a number.
    """
    command = [
        "ffprobe", "-v", "error", "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1", str(src),
    ]
    probe = subprocess.run(command, check=True, capture_output=True, text=True)
    reported = probe.stdout.strip()
    return float(reported)
|
||||
|
||||
|
||||
def chunk_audio(src: str | Path, out_dir: str | Path, *, chunk_seconds: int = CHUNK_SECONDS_DEFAULT) -> list[Path]:
    """Split ``src`` into fixed-length WAV chunks using ffmpeg's segment muxer.

    The audio is converted to 16 kHz mono (``-ar 16000 -ac 1``) and written as
    ``chunk_0000.wav``, ``chunk_0001.wav``, … into ``out_dir`` (created if needed), each
    ``chunk_seconds`` long with timestamps reset per chunk. Any ``chunk_*.wav`` already in
    ``out_dir`` is removed first, so leftovers from an earlier, longer input cannot end up in
    the result.

    Returns chunk paths in order. Order matters: the queue sends them sequentially.
    Raises ``subprocess.CalledProcessError`` when ffmpeg exits non-zero.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    # ffmpeg -y overwrites the indices it writes, but not higher-numbered files left by a
    # previous run; the glob below would return those stale chunks as part of this audio.
    for stale in out_dir.glob("chunk_*.wav"):
        stale.unlink()
    pattern = str(out_dir / "chunk_%04d.wav")
    subprocess.run(
        ["ffmpeg", "-y", "-i", str(src), "-f", "segment", "-segment_time", str(chunk_seconds),
         "-ar", "16000", "-ac", "1", "-reset_timestamps", "1", pattern],
        check=True, capture_output=True,
    )
    return sorted(out_dir.glob("chunk_*.wav"))
|
||||
@@ -0,0 +1,159 @@
|
||||
"""Text-document fetcher for the Battery (bitcoin-collateralized lending) corpus and any non-filing,
|
||||
non-audio source: policy primaries (SEC SABs, OCC/FDIC/Fed), lender/issuer blogs, credit-market data.
|
||||
|
||||
Unlike EDGAR (CIK-driven) and the podcast path (audio→transcribe), these are dated HTML pages, PDFs, or
|
||||
article RSS feeds. We fetch ONCE, extract clean text (HTML via html_to_text, PDF via pypdf), save it, and
|
||||
point documents.transcript_path at the saved text so the extract worker reads it directly (it already
|
||||
supports transcript_path) — this also lets PDFs work, which the worker's on-demand html_to_text fetch can't.
|
||||
|
||||
A source row must exist first (FK). Lineage/axis live on the source's cluster/notes (set in the seed);
|
||||
policy sources are axis=context and must NOT feed the supply resolver (weight 0) — enforced downstream.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import io
|
||||
import logging
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from ..backfill import queue
|
||||
from ..extract.html_text import html_to_text
|
||||
from .feeds import fetch_feed
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_UA = "ten31-signal-engine/1.0 (research; contact ops@ten31.xyz)"
|
||||
|
||||
|
||||
def _pdf_to_text(data: bytes, *, max_chars: int) -> str:
    """Extract text from PDF bytes with pypdf, page by page, truncated to ``max_chars``.

    Pages are read in order and joined with newlines; reading stops once the accumulated page
    text exceeds ``max_chars``. A page with no extractable text contributes an empty string.
    """
    import pypdf

    reader = pypdf.PdfReader(io.BytesIO(data))
    collected: list[str] = []
    seen_chars = 0
    for page in reader.pages:
        page_text = page.extract_text() or ""
        collected.append(page_text)
        seen_chars += len(page_text)
        if seen_chars > max_chars:
            break
    joined = "\n".join(collected)
    return joined[:max_chars]
|
||||
|
||||
|
||||
def fetch_clean_text(url: str, *, method: str = "auto", ua: str = DEFAULT_UA,
                     timeout: int = 90, max_chars: int = 300_000) -> str:
    """Fetch a URL once and return clean text. Auto-detects PDF vs HTML by content-type + magic bytes.

    The body is treated as a PDF when ``method == "pdf"``, when the Content-Type mentions
    ``application/pdf``, or when it starts with ``%PDF-``; otherwise it is converted from HTML.
    HTTP errors propagate from ``raise_for_status``.
    """
    response = requests.get(url, headers={"User-Agent": ua}, timeout=timeout)
    response.raise_for_status()
    content_type = response.headers.get("Content-Type", "").lower()
    if method != "pdf" and "application/pdf" not in content_type and response.content[:5] != b"%PDF-":
        return html_to_text(response.text, max_chars=max_chars)
    return _pdf_to_text(response.content, max_chars=max_chars)
|
||||
|
||||
|
||||
_BLOCK_MARKERS = (
|
||||
"aggressive automated scraping", "request access", "access denied", "are you a robot",
|
||||
"enable javascript", "captcha", "verify you are human", "rate limit exceeded",
|
||||
"403 forbidden", "unusual traffic", "checking your browser",
|
||||
)
|
||||
|
||||
|
||||
def _looks_blocked(text: str) -> bool:
|
||||
"""Anti-scraping interstitials return 200 + a short access-denied body. Detect so we don't ingest
|
||||
a block page as if it were the document (a real policy/blog doc is long and has no such markers)."""
|
||||
low = text[:2500].lower()
|
||||
return any(m in low for m in _BLOCK_MARKERS)
|
||||
|
||||
|
||||
def _doc_id(source_id: str, url: str) -> str:
|
||||
return f"doc:{source_id}:{hashlib.sha256(url.encode()).hexdigest()[:12]}"
|
||||
|
||||
|
||||
def ingest_one(conn: sqlite3.Connection, cfg, *, source_id: str, url: str, title: str,
               date: str | None, method: str = "auto", prompt_version: str = "extract-v0",
               min_chars: int = 400) -> str | None:
    """Fetch+store one text document and enqueue extraction. Idempotent on (source_id, url).

    Args:
        conn: open SQLite connection holding the ``documents`` table.
        cfg: config object; reads ``data_dir`` and, if present, ``user_agent``.
        source_id: id of the owning source row.
        url: document URL; also stored as the row's ``external_id``.
        title: display title (truncated to 300 chars; falls back to ``url`` when empty).
        date: document date string, or None.
        method: passed through to ``fetch_clean_text`` ("auto" or "pdf").
        prompt_version: folded into the extract job's ``input_hash``.
        min_chars: documents with less cleaned text than this are skipped.

    Returns:
        doc_id if newly ingested, else None (duplicate, too-short, blocked page, or fetch
        error → logged).
    """
    doc_id = _doc_id(source_id, url)
    if conn.execute("SELECT 1 FROM documents WHERE doc_id=?", (doc_id,)).fetchone():
        return None
    ua = getattr(cfg, "user_agent", None) or DEFAULT_UA
    try:
        text = fetch_clean_text(url, method=method, ua=ua)
    except Exception as e:  # noqa: BLE001 — any fetch/parse failure just skips this doc
        log.warning("doc fetch failed %s: %s", url, e)
        return None
    if not text or len(text) < min_chars:
        log.warning("doc too short (%d chars), skipping %s", len(text or ""), url)
        return None
    if _looks_blocked(text):
        log.warning("blocked/anti-scrape page detected, skipping %s", url)
        return None
    safe = doc_id.replace(":", "_")
    tpath = Path(cfg.data_dir) / "docs" / f"{safe}.txt"
    tpath.parent.mkdir(parents=True, exist_ok=True)
    # Write UTF-8 explicitly so the file on disk uses the same encoding content_hash is taken over
    # (str.encode() defaults to UTF-8; write_text() without encoding uses the locale's).
    tpath.write_text(text, encoding="utf-8")
    content_hash = hashlib.sha256(text.encode()).hexdigest()
    cur = conn.execute(
        """INSERT OR IGNORE INTO documents
           (doc_id, source_id, kind, external_id, url, title, date, transcript_path, content_hash, processed_at)
           VALUES (?,?,?,?,?,?,?,?,?,datetime('now'))""",
        (doc_id, source_id, "filing", url, url, title[:300] if title else url, date, str(tpath), content_hash),
    )
    if not cur.rowcount:
        # The insert was ignored (row appeared since the pre-check, or another uniqueness rule
        # fired): report "not newly ingested" instead of enqueueing work for it.
        conn.commit()
        log.info("doc %s not inserted (already present), skipping %s", doc_id, url)
        return None
    h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
    queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
                  parent_doc_id=doc_id, priority=50)
    # Single commit AFTER the enqueue: the document row and its extract job land together, so a
    # crash in between cannot leave a document that the duplicate pre-check would skip forever.
    conn.commit()
    log.info("ingested doc %s (%d chars) for %s", doc_id, len(text), source_id)
    return doc_id
|
||||
|
||||
|
||||
def ingest_manifest(conn: sqlite3.Connection, cfg, path) -> dict:
    """Batch-ingest the docs listed in a YAML manifest ({docs:[{source,url,title,date,method}]}).

    Each entry's source must already exist in ``sources``; entries whose source is missing are
    counted under ``missing_source``. Entries without a ``url`` are logged and counted as skipped
    rather than aborting the rest of the batch. An empty manifest or ``docs: null`` yields zeros.

    Returns:
        {"ingested": int, "skipped": int, "missing_source": int}
    """
    import yaml
    from pathlib import Path as _Path

    data = yaml.safe_load(_Path(path).read_text(encoding="utf-8")) or {}
    docs = data.get("docs") or []  # tolerate an explicit `docs:` with no value
    ingested = skipped = missing = 0
    for d in docs:
        src = d.get("source")
        if not conn.execute("SELECT 1 FROM sources WHERE source_id=?", (src,)).fetchone():
            log.warning("manifest doc references missing source %r — skipping %s", src, d.get("url"))
            missing += 1
            continue
        url = d.get("url")
        if not url:
            # One malformed entry must not abort the whole batch.
            log.warning("manifest doc for source %r has no url — skipping", src)
            skipped += 1
            continue
        doc_id = ingest_one(conn, cfg, source_id=src, url=url, title=d.get("title", url),
                            date=d.get("date"), method=d.get("method", "auto"))
        if doc_id:
            ingested += 1
        else:
            skipped += 1
    return {"ingested": ingested, "skipped": skipped, "missing_source": missing}
|
||||
|
||||
|
||||
def ingest_feed_text(conn: sqlite3.Connection, cfg, *, source_id: str, rss_url: str,
                     since: str | None = None, until: str | None = None, limit: int = 50) -> int:
    """Ingest the article bodies linked from a text RSS feed (blog/press feed).

    Every entry with a link is handed to ``ingest_one`` as a dated text document. Entries whose
    published date falls outside [since, until] are skipped; entries without a date are not
    filtered. Stops once ``limit`` new documents have been ingested.

    Returns:
        Count of newly-ingested documents.
    """
    from .feeds import _published_iso

    agent = getattr(cfg, "user_agent", None) or DEFAULT_UA
    feed = fetch_feed(rss_url, user_agent=agent)
    ingested = 0
    for item in feed.entries:
        if ingested >= limit:
            break
        href = item.get("link")
        if not href:
            continue
        published = _published_iso(item)
        if published:
            # ISO dates compare correctly as plain strings.
            if (since and published < since) or (until and published > until):
                continue
        new_id = ingest_one(conn, cfg, source_id=source_id, url=href,
                            title=item.get("title", href), date=published)
        if new_id:
            ingested += 1
    return ingested
|
||||
@@ -0,0 +1,61 @@
|
||||
"""Audio acquisition (§4.1). Spark Control transcribes audio you fetch — this fetches it.
|
||||
|
||||
- Podcast enclosures: a plain streaming download that follows the Podtrac/Megaphone redirects to the
|
||||
final signed CDN object (download immediately; resolved URLs carry short-lived params).
|
||||
- YouTube: yt-dlp (audio-only → 16 kHz mono WAV). NOTE: 2026 YouTube enforces PO Tokens broadly — run
|
||||
the `bgutil-ytdlp-pot-provider` sidecar or pulls will 403. yt-dlp is treated as a LAST resort; prefer
|
||||
the RSS enclosure where a show publishes both (ToS: downloading YT audio violates YouTube ToS).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
|
||||
|
||||
|
||||
def download_enclosure(url: str, dest: str | Path, *, user_agent: str = DEFAULT_UA, timeout: int = 120) -> Path:
    """Stream ``url`` (following redirects) to ``dest`` and return ``dest`` as a Path.

    The body is first written to a sibling ``<dest>.part`` file and only renamed onto ``dest``
    once the stream completed, so an HTTP error or a connection dropped mid-download never leaves
    a truncated file at ``dest``. The partial file is removed on any failure.

    Raises:
        requests.RequestException: on connection problems or an HTTP error status.
        OSError: if the file cannot be written or renamed.
    """
    dest = Path(dest)
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + ".part")
    try:
        with requests.get(url, stream=True, allow_redirects=True,
                          headers={"User-Agent": user_agent}, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 16):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        partial.replace(dest)  # rename within the same directory
    except BaseException:
        # Includes KeyboardInterrupt: never leave a half-written download behind.
        partial.unlink(missing_ok=True)
        raise
    return dest
|
||||
|
||||
|
||||
def to_wav_16k_mono(src: str | Path, dst: str | Path) -> Path:
    """Convert ``src`` to a 16 kHz mono WAV at ``dst`` via ffmpeg (what the ASR endpoint wants).

    Creates the destination directory if needed and overwrites an existing ``dst`` (``-y``).
    Raises ``subprocess.CalledProcessError`` when ffmpeg exits non-zero. Requires ffmpeg on PATH.
    """
    out_path = Path(dst)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    ffmpeg_cmd = ["ffmpeg", "-y", "-i", str(src), "-ar", "16000", "-ac", "1", "-f", "wav", str(out_path)]
    subprocess.run(ffmpeg_cmd, check=True, capture_output=True)
    return out_path
|
||||
|
||||
|
||||
def download_youtube_audio(url: str, out_dir: str | Path, *, archive_file: str | Path | None = None) -> Path:
    """Audio-only via yt-dlp → 16 kHz mono WAV. `archive_file` (yt-dlp --download-archive) is the
    canonical 'only-new' dedup for channel/playlist back-catalog pulls.

    Returns the WAV that THIS invocation created or rewrote. The directory is snapshotted before
    the run so that a WAV left behind by an earlier, unrelated download is never returned when
    yt-dlp skips the URL (e.g. because it is already listed in the archive file).

    Raises:
        subprocess.CalledProcessError: yt-dlp exited non-zero.
        RuntimeError: yt-dlp succeeded but produced no new/changed WAV in ``out_dir``.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    def _snapshot() -> dict:
        """Map each existing WAV to an (mtime_ns, size) signature."""
        snap = {}
        for p in out_dir.glob("*.wav"):
            st = p.stat()
            snap[p] = (st.st_mtime_ns, st.st_size)
        return snap

    before = _snapshot()
    cmd = [
        "yt-dlp", "-f", "bestaudio/best", "-x", "--audio-format", "wav",
        "--postprocessor-args", "ffmpeg:-ar 16000 -ac 1",
        "-o", str(out_dir / "%(id)s.%(ext)s"),
        "--no-progress",
    ]
    if archive_file:
        cmd += ["--download-archive", str(archive_file)]
    cmd.append(url)
    subprocess.run(cmd, check=True, capture_output=True)
    # yt-dlp names the file by video id; keep only files that are new or changed since the snapshot.
    after = _snapshot()
    fresh = [p for p, sig in after.items() if before.get(p) != sig]
    if not fresh:
        raise RuntimeError(
            "yt-dlp produced no new wav (already in --download-archive, or PO-token/cookies issue? "
            "see module docstring)"
        )
    # Several new files (playlist URL): keep the original "newest by mtime" choice among them.
    return max(fresh, key=lambda p: after[p][0])
|
||||
@@ -0,0 +1,127 @@
|
||||
"""Earnings-call transcripts via Financial Modeling Prep (§4.1, §12 — decision: FMP).
|
||||
|
||||
Audio isn't reliably fetchable for large-caps (no uniform feed; ~30–90d replay expiry breaks
|
||||
backfill), so FMP's transcript API is the backbone and EDGAR filings remain the durable core. FMP
|
||||
also exposes an earnings *calendar* to trigger ingestion on the day a call drops.
|
||||
|
||||
Endpoint paths/params are marked TODO(contract): confirm against the FMP 'stable' docs for the
|
||||
account tier at integration. Needs config.fmp_api_key.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
FMP_BASE = "https://financialmodelingprep.com/stable"


class FMPClient:
    """Thin client for the Financial Modeling Prep 'stable' REST API (earnings-call transcripts).

    Every request carries the API key as the ``apikey`` query parameter. Because that puts the key
    into the request URL, exception messages raised by requests are redacted before they propagate.
    """

    def __init__(self, api_key: str, *, base: str = FMP_BASE, timeout: int = 30) -> None:
        """Store credentials and open a reusable HTTP session.

        Raises:
            ValueError: if ``api_key`` is empty.
        """
        if not api_key:
            raise ValueError("FMP_API_KEY is required for earnings-call transcripts")
        self.api_key = api_key
        self.base = base
        self.timeout = timeout
        self.s = requests.Session()

    def _get(self, path: str, **params: Any) -> Any:
        """GET ``{base}/{path}`` with ``params`` plus the API key; return the decoded JSON.

        Raises:
            requests.RequestException: same exception type requests raised (HTTP error status,
                connection failure, timeout), but with the API key masked in its message.
        """
        params["apikey"] = self.api_key
        try:
            r = self.s.get(f"{self.base}/{path}", params=params, timeout=self.timeout)
            r.raise_for_status()
        except requests.RequestException as e:
            from urllib.parse import quote

            # requests embeds the full URL (including ?apikey=...) in its error text; mask both
            # the raw and the percent-encoded form so the secret never reaches logs.
            msg = str(e)
            for secret in {self.api_key, quote(self.api_key, safe="")}:
                msg = msg.replace(secret, "***")
            # `from None` drops the chained original, whose message still contains the key.
            raise type(e)(msg, request=e.request, response=e.response) from None
        return r.json()

    # Confirmed against FMP 'stable' 2026-06-07 (v3 is legacy/403). Note singular "earning".
    def transcript_dates(self, symbol: str) -> Any:
        """List available transcripts: [{quarter, fiscalYear, date}, ...]."""
        return self._get("earning-call-transcript-dates", symbol=symbol)

    def transcript(self, symbol: str, *, year: int, quarter: int) -> Any:
        """One transcript: [{symbol, period, year, date, content}]. Use the `date` field as the
        document date — FMP's year/quarter labels are fiscal and can be offset from the call date."""
        return self._get("earning-call-transcript", symbol=symbol, year=year, quarter=quarter)

    def earnings_calendar(self, *, from_date: str, to_date: str) -> Any:
        """Earnings calendar (ingestion trigger): [{symbol, date, epsActual, ...}, ...]."""
        # "from" is a Python keyword, so the params are passed through a dict.
        return self._get("earnings-calendar", **{"from": from_date, "to": to_date})
|
||||
|
||||
|
||||
def ingest_transcript(
|
||||
conn: sqlite3.Connection,
|
||||
*,
|
||||
source_id: str,
|
||||
symbol: str,
|
||||
year: int,
|
||||
quarter: int,
|
||||
content: str,
|
||||
date: str | None,
|
||||
data_dir: Path,
|
||||
prompt_version: str = "extract-v0",
|
||||
) -> tuple[bool, bool]:
|
||||
"""Store one transcript (content written to disk → transcript_path) and enqueue an 'extract'
|
||||
job. Idempotent. Returns (new_document, new_job)."""
|
||||
from ..backfill import queue
|
||||
|
||||
external_id = f"{symbol}-{year}Q{quarter}"
|
||||
doc_id = f"earnings:{external_id}"
|
||||
tdir = Path(data_dir) / "transcripts"
|
||||
tdir.mkdir(parents=True, exist_ok=True)
|
||||
tpath = tdir / f"{external_id}.txt"
|
||||
tpath.write_text(content)
|
||||
content_hash = hashlib.sha256(content.encode()).hexdigest()
|
||||
cur = conn.execute(
|
||||
"""INSERT OR IGNORE INTO documents
|
||||
(doc_id, source_id, kind, external_id, title, date, transcript_path, content_hash, processed_at)
|
||||
VALUES (?,?,?,?,?,?,?,?, datetime('now'))""",
|
||||
(doc_id, source_id, "earnings_call", external_id, f"{symbol} {year} Q{quarter} call",
|
||||
date, str(tpath), content_hash),
|
||||
)
|
||||
conn.commit()
|
||||
if not cur.rowcount:
|
||||
return (False, False)
|
||||
# earnings-call Q&A is the highest-yield text source (§4.1) → priority 40, ahead of filings (50).
|
||||
h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
|
||||
new_job = queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
|
||||
parent_doc_id=doc_id, priority=40) is not None
|
||||
return (True, new_job)
|
||||
|
||||
|
||||
def ingest_for_ticker(
    conn: sqlite3.Connection,
    fmp: FMPClient,
    *,
    source_id: str,
    symbol: str,
    data_dir: Path,
    since: str | None = None,
    until: str | None = None,
    limit: int = 8,
) -> tuple[int, int]:
    """Ingest up to ``limit`` earnings-call transcripts for ``symbol`` dated within [since, until].

    The dates index from ``fmp.transcript_dates`` is filtered by each row's ``date`` (rows without
    a date are kept), then each selected transcript is fetched and passed to ``ingest_transcript``.
    The stored document date prefers the transcript's own ``date`` over the index row's.

    Returns:
        (new_documents, new_jobs)
    """
    index = fmp.transcript_dates(symbol)
    if not isinstance(index, list):
        index = []  # anything but a list is treated as "no transcripts"

    def _in_window(row) -> bool:
        when = row.get("date")
        if not when:
            return True
        if since and when < since:
            return False
        if until and when > until:
            return False
        return True

    selected = [row for row in index if _in_window(row)]
    docs_added = jobs_added = 0
    for row in selected[:limit]:
        fetched = fmp.transcript(symbol, year=row["fiscalYear"], quarter=row["quarter"])
        # The endpoint returns a one-element list; tolerate a bare object or an empty reply.
        if isinstance(fetched, list) and fetched:
            item = fetched[0]
        else:
            item = fetched
        item = item or {}
        body = item.get("content") or ""
        if not body:
            continue
        is_new_doc, is_new_job = ingest_transcript(
            conn, source_id=source_id, symbol=symbol, year=row["fiscalYear"], quarter=row["quarter"],
            content=body, date=item.get("date") or row.get("date"), data_dir=data_dir,
        )
        docs_added += int(is_new_doc)
        jobs_added += int(is_new_job)
    return docs_added, jobs_added
|
||||
@@ -0,0 +1,148 @@
|
||||
"""SEC EDGAR ingestion (§4.1).
|
||||
|
||||
Hits the official data.sec.gov / www.sec.gov APIs directly (free, keyless, full history).
|
||||
Two hard requirements:
|
||||
- a descriptive User-Agent (SEC 403s requests without one) — from config.edgar_user_agent.
|
||||
- ≤10 requests/sec aggregate — enforced by a min-interval throttle here.
|
||||
|
||||
Supports an explicit date range AND historical shards (filings.files[]), so the §7.1 backtest can
|
||||
reach 2022–2023 filings, not just the most-recent ~1000.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import sqlite3
|
||||
import time
|
||||
from typing import Iterator
|
||||
|
||||
import requests
|
||||
|
||||
# Parallel column arrays of an EDGAR submissions block, in the order _iter_array unpacks them.
_FILING_COLS = (
    "accessionNumber",
    "form",
    "filingDate",
    "primaryDocument",
    "primaryDocDescription",
)


class EdgarClient:
    """Throttled client for the SEC EDGAR JSON/archive endpoints.

    Every request goes through one ``requests.Session`` carrying the configured User-Agent, and
    is spaced at least ``min_interval`` seconds after the previous one.
    """

    BASE_DATA = "https://data.sec.gov"
    BASE_WWW = "https://www.sec.gov"

    def __init__(self, user_agent: str, *, min_interval: float = 0.12) -> None:
        """Create the session. Raises ValueError unless ``user_agent`` is non-empty and has an '@'."""
        has_contact = bool(user_agent) and "@" in user_agent
        if not has_contact:
            raise ValueError("EDGAR requires a descriptive User-Agent with contact email (config.edgar_user_agent)")
        session = requests.Session()
        session.headers.update({"User-Agent": user_agent, "Accept-Encoding": "gzip, deflate"})
        self.s = session
        self.min_interval = min_interval
        self._last = 0.0  # monotonic timestamp of the most recent request slot
        self._tickers: dict[str, int] | None = None  # lazily-filled ticker → CIK cache

    def _throttle(self) -> None:
        """Sleep just long enough to keep ``min_interval`` seconds between consecutive requests."""
        elapsed = time.monotonic() - self._last
        if elapsed < self.min_interval:
            time.sleep(self.min_interval - elapsed)
        self._last = time.monotonic()

    def _get(self, url: str) -> requests.Response:
        """Throttled GET with a 30 s timeout; raises on HTTP error status."""
        self._throttle()
        response = self.s.get(url, timeout=30)
        response.raise_for_status()
        return response

    # ---- ticker → CIK ----
    def ticker_map(self) -> dict[str, int]:
        """Return the upper-cased ticker → CIK mapping, downloading it on first use only."""
        if self._tickers is not None:
            return self._tickers
        payload = self._get(f"{self.BASE_WWW}/files/company_tickers.json").json()
        self._tickers = {entry["ticker"].upper(): int(entry["cik_str"]) for entry in payload.values()}
        return self._tickers

    def cik_for(self, ticker: str) -> int | None:
        """Look up the CIK for ``ticker`` (case-insensitive); None when the ticker is unknown."""
        mapping = self.ticker_map()
        return mapping.get(ticker.upper())

    # ---- filings ----
    def _iter_array(self, block: dict, forms, since, until) -> Iterator[dict]:
        """Turn a column-oriented filings block into row dicts, filtered by form and date window.

        An empty ``forms`` / ``since`` / ``until`` disables the respective filter.
        """
        columns = [block.get(col, []) for col in _FILING_COLS]
        for accession, form, filed, primary_doc, description in zip(*columns):
            if forms and form not in forms:
                continue
            if (since and filed < since) or (until and filed > until):
                continue
            yield {
                "accession": accession,
                "form": form,
                "filing_date": filed,
                "primary_document": primary_doc,
                "description": description,
            }

    def iter_filings(
        self,
        cik: int,
        *,
        forms: tuple[str, ...] = ("10-K", "10-Q", "8-K"),
        since: str | None = None,
        until: str | None = None,
    ) -> Iterator[dict]:
        """Yield filing descriptors (each with ``cik`` and ``url`` added) for one company.

        Covers the inline 'recent' block first, then every historical shard listed under
        ``filings.files`` whose [filingFrom, filingTo] range overlaps [since, until]; shards
        entirely outside the window are not fetched.
        """
        submissions = self._get(f"{self.BASE_DATA}/submissions/CIK{cik:010d}.json").json()
        recent_block = submissions.get("filings", {}).get("recent", {})
        yield from (self._with_url(cik, row) for row in self._iter_array(recent_block, forms, since, until))
        for shard in submissions.get("filings", {}).get("files", []):
            starts_after_window = until and shard.get("filingFrom", "") > until
            if starts_after_window:
                continue
            ends_before_window = since and shard.get("filingTo", "9999") < since
            if ends_before_window:
                continue
            shard_block = self._get(f"{self.BASE_DATA}/submissions/{shard['name']}").json()
            yield from (self._with_url(cik, row) for row in self._iter_array(shard_block, forms, since, until))

    def _with_url(self, cik: int, f: dict) -> dict:
        """Add ``cik`` and the primary-document archive ``url`` to ``f`` in place and return it."""
        folder = f["accession"].replace("-", "")  # archive folder name is the accession without dashes
        f["cik"] = cik
        f["url"] = f"{self.BASE_WWW}/Archives/edgar/data/{cik}/{folder}/{f['primary_document']}"
        return f

    def fetch_html(self, filing: dict) -> str:
        """Download the filing's primary document and return the response text."""
        response = self._get(filing["url"])
        return response.text
|
||||
|
||||
|
||||
# Domestic annual/quarterly + foreign-private-issuer equivalents. 20-F (foreign annual, e.g. TSM/IREN),
|
||||
# 40-F (Canadian annual, e.g. CCJ). 8-K/6-K (current reports) excluded by default — low claim yield.
|
||||
HIGH_YIELD_FORMS = ("10-K", "10-Q", "20-F", "40-F")


def ingest_filings(
    conn: sqlite3.Connection,
    client: EdgarClient,
    *,
    source_id: str,
    ticker: str,
    since: str | None = None,
    until: str | None = None,
    forms: tuple[str, ...] = HIGH_YIELD_FORMS,
    prompt_version: str = "extract-v0",
) -> tuple[int, int]:
    """Insert filing documents and enqueue 'extract' jobs. Filings are text → no transcription;
    they go straight to extraction (the extract worker fetches + strips the HTML later). Default
    forms cover both domestic (10-K/10-Q) and foreign-private-issuer (20-F/40-F) filers.

    Each filing's document row and its extract job are committed together, so an interruption
    between the two cannot leave a document that later runs skip (already present) without ever
    enqueueing its job.

    Returns (new_documents, new_jobs). Idempotent per accession number (doc_id is
    ``edgar:<accession>``).

    Raises:
        ValueError: if ``ticker`` has no CIK in the client's ticker map.
    """
    cik = client.cik_for(ticker)
    if cik is None:
        raise ValueError(f"No CIK found for ticker {ticker!r}")

    from ..backfill import queue

    n_docs = n_jobs = 0
    for f in client.iter_filings(cik, forms=forms, since=since, until=until):
        doc_id = f"edgar:{f['accession']}"
        cur = conn.execute(
            """INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date)
               VALUES (?,?,?,?,?,?,?)""",
            (doc_id, source_id, "filing", f["accession"], f["url"],
             f"{ticker} {f['form']} {f['filing_date']}", f["filing_date"]),
        )
        if not cur.rowcount:
            conn.commit()  # nothing inserted; just close the implicit transaction
            continue
        n_docs += 1
        h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
        # priority 50: filings are high-info-density (§4.1) → ahead of podcasts (100)
        if queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
                         parent_doc_id=doc_id, priority=50) is not None:
            n_jobs += 1
        conn.commit()  # document + job persisted as one unit
    return n_docs, n_jobs
|
||||
@@ -0,0 +1,65 @@
|
||||
"""Podcast RSS ingestion (§4.1).
|
||||
|
||||
feedparser + conditional GET (ETag/Last-Modified) for efficient incremental polling, with a
|
||||
composite (feed_url, guid) dedup discipline. Many podcast CDNs send no validators and some feeds
|
||||
truncate to recent episodes — for the §7.1 backtest, older episodes may need the show's full
|
||||
archive feed (some hosts expose `?limit=` / a separate archive URL) or a YouTube back-catalog.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import feedparser
|
||||
|
||||
DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"


def fetch_feed(url: str, *, etag: str | None = None, modified: str | None = None,
               user_agent: str = DEFAULT_UA) -> feedparser.FeedParserDict:
    """Parse the feed at ``url`` as a conditional GET.

    ``etag`` / ``modified`` are the validators from a previous fetch. On HTTP 304 the result has
    .status == 304 and .entries == [] → the caller can skip it.
    """
    conditional = {"etag": etag, "modified": modified, "agent": user_agent}
    return feedparser.parse(url, **conditional)
|
||||
|
||||
|
||||
def _published_iso(entry: Any) -> str | None:
|
||||
t = entry.get("published_parsed") or entry.get("updated_parsed")
|
||||
if not t:
|
||||
return None
|
||||
return time.strftime("%Y-%m-%d", t)
|
||||
|
||||
|
||||
def _enclosure_audio_url(entry: Any) -> str | None:
|
||||
for enc in entry.get("enclosures", []) or []:
|
||||
if str(enc.get("type", "")).startswith("audio"):
|
||||
return enc.get("href") or enc.get("url")
|
||||
# some feeds put audio only in links rel=enclosure
|
||||
for link in entry.get("links", []) or []:
|
||||
if link.get("rel") == "enclosure" and str(link.get("type", "")).startswith("audio"):
|
||||
return link.get("href")
|
||||
return None
|
||||
|
||||
|
||||
def _guid(entry: Any) -> str:
|
||||
g = entry.get("id") or entry.get("link")
|
||||
if g:
|
||||
return str(g)
|
||||
basis = f"{entry.get('title','')}|{entry.get('published','')}"
|
||||
return "sha1:" + hashlib.sha1(basis.encode()).hexdigest()
|
||||
|
||||
|
||||
def episode_records(parsed: feedparser.FeedParserDict) -> list[dict]:
    """Convert parsed feed entries into episode dicts, dropping entries without audio.

    Each record has the keys ``guid``, ``title``, ``audio_url``, ``link`` and ``published``
    (``YYYY-MM-DD`` or None).
    """
    records: list[dict] = []
    for entry in parsed.entries:
        audio_url = _enclosure_audio_url(entry)
        if audio_url:
            record = {
                "guid": _guid(entry),
                "title": entry.get("title"),
                "audio_url": audio_url,
                "link": entry.get("link"),
                "published": _published_iso(entry),
            }
            records.append(record)
    return records
|
||||
@@ -0,0 +1,195 @@
|
||||
"""One-time backfill path: transcribe podcast episodes via the Gemini multimodal API instead of the
|
||||
local Spark Parakeet+diarizer pipeline. Used to take a bulk backfill OFF the shared Spark GPU (which
|
||||
contends with production) — it is NOT the steady-state transcriber (local Parakeet remains the default).
|
||||
|
||||
Scope/guardrail: podcast audio is PUBLIC data, so sending it to the frontier does NOT trip the
|
||||
exposure/positioning-data rule (that guardrail is about Ten31's conviction/exposure data, never public
|
||||
audio). Output is written in the SAME 'Speaker: text' transcript format the extractor consumes, so the
|
||||
downstream extract→embed stages are agnostic to which transcriber produced the file.
|
||||
|
||||
Tradeoff vs local: Gemini yields speaker-LABELED text, not voiceprint fingerprints — so no voiceprint
|
||||
auto-edges. We rely on the hand-seeded EISC edges + name-based attribution instead (acceptable for a
|
||||
bounded backfill).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
||||
from ..backfill import queue
|
||||
from .download import download_enclosure
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
_PROMPT = (
|
||||
"You are a precise podcast transcriptionist. Transcribe this audio VERBATIM as a speaker-diarized "
|
||||
"transcript.\n"
|
||||
"RULES:\n"
|
||||
"- One line per speaker turn, formatted exactly as `Name: spoken text` (a colon and one space).\n"
|
||||
"- The host of this show is {host} — label every host turn with exactly `{host}` (the person's "
|
||||
"name, never the show's name).\n"
|
||||
"- When the host introduces a guest by name (e.g. 'welcome X to the show', 'I'm joined by X'), use "
|
||||
"that real first name (or full name) as the guest's label for the WHOLE transcript. Only fall back "
|
||||
"to `Guest` (or `Guest 2`, `Guest 3`) if a name is never stated. Do not invent names.\n"
|
||||
"- Do NOT include timestamps, ad-reads markers, summaries, headings, markdown, or any commentary. "
|
||||
"Only the transcript lines.\n"
|
||||
"- Transcribe the entire episode from start to finish. Do not stop early or summarize.\n"
|
||||
)
|
||||
|
||||
|
||||
def _host_person(source_name: str) -> str:
|
||||
"""Derive the host's PERSON name from a source/show name so claimant attribution isn't the show.
|
||||
'What Bitcoin Did (Peter McCormack)' -> 'Peter McCormack'; 'Stephan Livera Podcast' -> 'Stephan
|
||||
Livera'; 'The Kevin Rooke Show' -> 'Kevin Rooke'; 'The Anita Posch Show' -> 'Anita Posch'."""
|
||||
m = re.search(r"\(([^)]+)\)", source_name or "")
|
||||
if m:
|
||||
return m.group(1).strip()
|
||||
s = re.sub(r"^The\s+", "", source_name or "").strip()
|
||||
s = re.sub(r"\s+(Podcast|Show)$", "", s, flags=re.I).strip()
|
||||
return s
|
||||
|
||||
|
||||
def _sniff_audio_mime(path: Path) -> str:
|
||||
"""Determine audio MIME from the file header — the downloaded enclosure has a generic `.src`
|
||||
extension, so the Files API can't infer it and rejects the upload without an explicit mime_type."""
|
||||
with open(path, "rb") as fh:
|
||||
head = fh.read(16)
|
||||
if head[:3] == b"ID3" or (len(head) > 1 and head[0] == 0xFF and (head[1] & 0xE0) == 0xE0):
|
||||
return "audio/mpeg"
|
||||
if head[4:8] == b"ftyp":
|
||||
return "audio/mp4" # m4a/aac
|
||||
if head[:4] == b"OggS":
|
||||
return "audio/ogg"
|
||||
if head[:4] == b"RIFF":
|
||||
return "audio/wav"
|
||||
if head[:4] == b"fLaC":
|
||||
return "audio/flac"
|
||||
return "audio/mpeg" # podcast default
|
||||
|
||||
|
||||
def _upload_and_wait(client, audio_path: Path, *, poll_s: float = 2.0, timeout_s: float = 300.0):
    """Upload ``audio_path`` through ``client.files`` and poll until it stops PROCESSING.

    The MIME type is sniffed from the file header and passed explicitly. Polling sleeps ``poll_s``
    between checks and gives up once the summed sleep time reaches ``timeout_s``.

    Returns:
        The file handle once its state is ACTIVE.

    Raises:
        RuntimeError: if the final state is anything other than ACTIVE.
    """
    from google.genai import types

    def _state_name(handle) -> str:
        # state is normally an enum with .name; fall back to its string form otherwise
        return getattr(handle.state, "name", str(handle.state))

    sniffed = _sniff_audio_mime(audio_path)
    handle = client.files.upload(file=str(audio_path), config=types.UploadFileConfig(mime_type=sniffed))
    slept = 0.0
    while _state_name(handle) == "PROCESSING" and slept < timeout_s:
        time.sleep(poll_s)
        slept += poll_s
        handle = client.files.get(name=handle.name)
    final_state = _state_name(handle)
    if final_state != "ACTIVE":
        raise RuntimeError(f"Gemini file not ACTIVE (state={final_state}) for {audio_path.name}")
    return handle
|
||||
|
||||
|
||||
def transcribe_one(client, model: str, audio_path: Path, host_name: str, *,
                   max_output_tokens: int = 65536) -> tuple[str, dict]:
    """Transcribe one audio file with the given model. Network only; no DB access.

    The file is uploaded, sent together with the transcription prompt (host label filled in,
    'the host' when ``host_name`` is empty), and the uploaded copy is deleted afterwards on a
    best-effort basis, whether or not generation succeeded.

    Returns:
        (transcript_text, usage) where usage has ``prompt_tokens``, ``output_tokens`` and
        ``finish_reason`` (empty string when the response has no candidates).
    """
    from google.genai import types

    uploaded = _upload_and_wait(client, audio_path)
    try:
        prompt = _PROMPT.format(host=host_name or "the host")
        gen_cfg = types.GenerateContentConfig(temperature=0, max_output_tokens=max_output_tokens)
        resp = client.models.generate_content(model=model, contents=[uploaded, prompt], config=gen_cfg)
        transcript = (resp.text or "").strip()
        meta = getattr(resp, "usage_metadata", None)
        prompt_tokens = getattr(meta, "prompt_token_count", 0) or 0
        output_tokens = getattr(meta, "candidates_token_count", 0) or 0
        if resp.candidates:
            finish = str(getattr(resp.candidates[0], "finish_reason", ""))
        else:
            finish = ""
        usage = {"prompt_tokens": prompt_tokens, "output_tokens": output_tokens, "finish_reason": finish}
        return transcript, usage
    finally:
        try:
            client.files.delete(name=uploaded.name)
        except Exception as e:  # noqa: BLE001 — best-effort cleanup
            log.debug("file cleanup failed for %s: %s", uploaded.name, e)
|
||||
|
||||
|
||||
def _fetch_and_transcribe(client, model: str, cfg, doc, host_name: str) -> dict:
    """Worker-thread unit: download enclosure → Gemini transcribe → write transcript file. No DB writes.

    Args:
        client: Gemini client passed through to ``transcribe_one``.
        model: model name passed through to ``transcribe_one``.
        cfg: config object; reads ``audio_cache_dir`` and ``data_dir``.
        doc: mapping with at least ``doc_id`` and ``url`` (the audio enclosure URL).
        host_name: host label for the transcription prompt.

    Returns:
        dict with ``doc_id``, ``ok``, ``transcript_path``, ``n_lines``, ``content_hash``, ``usage``.

    Raises:
        RuntimeError: if the transcript is empty or shorter than 40 characters.
        Any exception raised by the download or the transcription call propagates.

    The cached audio file is removed on every exit path, including a failed or partial download.
    """
    cache = Path(cfg.audio_cache_dir)
    cache.mkdir(parents=True, exist_ok=True)
    safe = doc["doc_id"].replace(":", "_")
    src = cache / f"{safe}.src"
    audio = src
    try:
        # Download inside the try so a download that dies half-way is cleaned up as well.
        audio = download_enclosure(doc["url"], src)
        text, usage = transcribe_one(client, model, audio, host_name)
        if not text or len(text) < 40:
            raise RuntimeError(f"empty/short transcript ({len(text)} chars)")
        tpath = Path(cfg.data_dir) / "transcripts" / f"{safe}.txt"
        tpath.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: same encoding the content_hash below is computed over.
        tpath.write_text(text, encoding="utf-8")
        return {
            "doc_id": doc["doc_id"], "ok": True, "transcript_path": str(tpath),
            "n_lines": text.count("\n") + 1, "content_hash": hashlib.sha256(text.encode()).hexdigest(),
            "usage": usage,
        }
    finally:
        for leftover in {src, audio}:
            try:
                leftover.unlink(missing_ok=True)
            except OSError as e:  # best-effort cleanup: record it, never mask the real outcome
                log.debug("audio cache cleanup failed for %s: %s", leftover, e)
|
||||
|
||||
|
||||
def run_transcribe_gemini(conn, cfg, *, limit: int = 5, concurrency: int = 4,
                          lease_seconds: int = 7200, worker_id: str = "gemini-transcribe",
                          prompt_version: str = "extract-v0") -> dict:
    """Lease pending transcribe jobs and transcribe them via Gemini in parallel. DB writes stay on the
    main thread; only download+API run in the pool. Reports token usage for cost accounting.

    Args:
        conn: SQLite connection; rows are accessed by column name.
        cfg: config object; reads ``gemini_api_key`` and ``gemini_model`` here.
        limit: maximum number of jobs to lease for this run.
        concurrency: worker threads; values below 1 are treated as 1.
        lease_seconds: lease duration requested for each job.
        worker_id: lease owner label.
        prompt_version: folded into the follow-up extract job's ``input_hash`` (the default
            reproduces the previously hard-coded ``extract-v0``).

    Returns:
        {"done", "failed", "prompt_tokens", "output_tokens"}

    Raises:
        RuntimeError: if no Gemini API key is configured.
    """
    from google import genai
    if not cfg.gemini_api_key:
        raise RuntimeError("GEMINI_API_KEY not configured")
    client = genai.Client(api_key=cfg.gemini_api_key)
    model = cfg.gemini_model or "gemini-2.5-flash"

    # Lease the batch up front (main thread); resolve docs + host names.
    leased: list[tuple] = []
    while len(leased) < limit:
        job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
        if job is None:
            break
        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
        if doc is None:
            queue.skip(conn, job["job_id"], "document missing")
            continue
        host = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
        leased.append((job, doc, _host_person(host["name"]) if host else ""))

    done = failed = prompt_tok = out_tok = 0
    # ThreadPoolExecutor rejects max_workers <= 0; clamp rather than crash after leasing jobs.
    with ThreadPoolExecutor(max_workers=max(1, concurrency)) as pool:
        futs = {pool.submit(_fetch_and_transcribe, client, model, cfg, doc, host): (job, doc)
                for (job, doc, host) in leased}
        for fut in as_completed(futs):
            job, doc = futs[fut]
            try:
                r = fut.result()  # re-raises whatever the worker thread raised
                conn.execute(
                    "UPDATE documents SET transcript_path=?, content_hash=?, processed_at=datetime('now') "
                    "WHERE doc_id=?", (r["transcript_path"], r["content_hash"], doc["doc_id"]),
                )
                h = hashlib.sha256(f"{doc['doc_id']}|{prompt_version}".encode()).hexdigest()
                queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
                              parent_doc_id=doc["doc_id"], priority=100)
                queue.complete(conn, job["job_id"], output_ref=f"gemini {r['n_lines']} lines")
                conn.commit()
                done += 1
                prompt_tok += r["usage"]["prompt_tokens"]
                out_tok += r["usage"]["output_tokens"]
                fr = r["usage"]["finish_reason"]
                log.info("gemini transcribed %s (%d lines, %d in/%d out tok%s)", doc["doc_id"],
                         r["n_lines"], r["usage"]["prompt_tokens"], r["usage"]["output_tokens"],
                         ", TRUNCATED" if "MAX_TOKENS" in fr else "")
            except Exception as e:  # noqa: BLE001 — one bad episode must not stop the batch
                state = queue.fail(conn, job["job_id"], e)
                conn.commit()
                failed += 1
                log.warning("gemini transcribe failed for %s: %s (→ %s)", doc["doc_id"], e, state)
    return {"done": done, "failed": failed, "prompt_tokens": prompt_tok, "output_tokens": out_tok}
|
||||
@@ -0,0 +1,45 @@
|
||||
"""Speaker-name identification (§4.5 enhancement).
|
||||
|
||||
In a 1-on-1 interview the host introduces the guest by name at the top. Reading the transcript head
|
||||
with the LLM, we attach a real NAME to each diarized speaker → voiceprints.person_label. This gives
|
||||
the independence graph a SECOND, orthogonal overlap signal: the same NAMED guest across two shows is
|
||||
a shared_guest edge even when the voiceprints don't cluster (different mic/codec/room). It complements
|
||||
voiceprint cosine matching and is robust to fingerprint drift — exactly the case the operator flagged.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# System prompt for the speaker-naming call; the model must answer with a single JSON object.
_SYS = (
    'You identify the speakers in a podcast/interview transcript. Each line is "LABEL: text". '
    "Using the introduction and context, determine each LABEL's real full name and role. In an "
    "interview the host normally introduces themselves and the guest within the first minute. Only "
    "assert a name you can actually support from the text — if you cannot tell, use null. "
    'Return ONLY JSON: {"speakers": {"<LABEL>": {"name": "Full Name" or null, '
    '"role": "host"|"guest"|"panelist"|"unknown", "confidence": "low"|"med"|"high"}}}.'
)


def identify_speakers(backend, transcript_head: str, *, source_name: str, host_hint: str | None = None) -> dict:
    """Returns {label: {name, role, confidence}}. `backend` is any extract.backends backend.

    Args:
        backend: object exposing ``complete_json(messages, max_tokens=...)``.
        transcript_head: beginning of the transcript, one ``LABEL: text`` line per turn.
        source_name: show name, passed to the model as context.
        host_hint: optional name of the show's usual host; added to the context when truthy.

    Returns:
        The dict found under ``"speakers"`` in the model reply, or ``{}`` when the reply is
        missing, is not parseable as JSON, or carries no dict under that key.
    """
    ctx = f"Show: {source_name}."
    if host_hint:
        ctx += f" The show's usual host is {host_hint}."
    ctx += "\n\nTRANSCRIPT (beginning):\n" + transcript_head
    messages = [{"role": "system", "content": _SYS}, {"role": "user", "content": ctx}]
    raw = backend.complete_json(messages, max_tokens=600)

    if isinstance(raw, dict):
        # Backend already decoded the JSON for us.
        obj = raw
    elif isinstance(raw, (bytes, bytearray, str)):
        if not isinstance(raw, str):
            raw = bytes(raw).decode("utf-8", errors="replace")
        try:
            obj = json.loads(raw)
        except ValueError:
            # Models sometimes wrap the object in prose or code fences: retry on the outermost
            # brace pair before giving up.
            i, j = raw.find("{"), raw.rfind("}")
            if i < 0 or j < i:
                return {}
            try:
                obj = json.loads(raw[i:j + 1])
            except ValueError:
                return {}
    else:
        # None or any other non-text reply: naming is best-effort, so report "nothing found".
        return {}

    spk = obj.get("speakers", {}) if isinstance(obj, dict) else {}
    return spk if isinstance(spk, dict) else {}
|
||||
@@ -0,0 +1,111 @@
|
||||
"""Podcast ingestion → documents + 'transcribe' jobs (§4.1).
|
||||
|
||||
RSS path: parse the feed, take episodes in [since, until], register documents pointing at the audio
|
||||
enclosure. YouTube path: enumerate a channel's videos in the date window via yt-dlp (the back-catalog
|
||||
route for the ~9 shows whose RSS is a truncated rolling window — see seeds/podcast_feeds.resolved.yaml).
|
||||
The transcribe worker downloads + processes either kind identically.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
import subprocess
|
||||
|
||||
from ..backfill import queue
|
||||
from ..util import audio_dedup_key
|
||||
from .feeds import episode_records, fetch_feed
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _enqueue_doc(conn, *, source_id, kind, external_id, url, title, date) -> tuple[int, int]:
    """Register one episode as a document and queue a transcribe job unless it is a duplicate.

    Returns ``(docs_added, jobs_added)``:
      * ``(0, 0)`` — the INSERT was ignored (same (source_id, external_id) already known);
      * ``(1, 0)`` — new row, but an already-processed document shares its dedup key, so the row
        is marked processed with ``raw_path='dup_of:<doc_id>'`` and nothing is queued;
      * ``(1, 1)`` / ``(1, 0)`` — new row, transcribe job enqueued / enqueue returned None.
    """
    external_digest = hashlib.sha1(external_id.encode()).hexdigest()
    doc_id = f"pod:{source_id}:{external_digest[:12]}"
    dedup_key = audio_dedup_key(title, date)

    # Cross-mirror dedup (pre-GPU): look for an already-processed copy of the same episode from any
    # source/feed BEFORE inserting, so the new row can never match itself. (external_id UNIQUE
    # already covers same-feed re-ingest; this covers a different feed/YouTube mirror.)
    already_processed = conn.execute(
        "SELECT doc_id FROM documents WHERE dedup_key=? AND processed_at IS NOT NULL LIMIT 1",
        (dedup_key,),
    ).fetchone()

    inserted = conn.execute(
        """INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date, dedup_key)
           VALUES (?,?,?,?,?,?,?,?)""",
        (doc_id, source_id, kind, external_id, url, title, date, dedup_key),
    ).rowcount
    conn.commit()
    if not inserted:
        return (0, 0)  # same (source_id, external_id) already known

    if already_processed:
        # Keep the sighting for provenance but DON'T re-transcribe.
        conn.execute(
            "UPDATE documents SET processed_at=datetime('now'), raw_path=? WHERE doc_id=?",
            (f"dup_of:{already_processed['doc_id']}", doc_id),
        )
        conn.commit()
        log.info("skip transcribe for %s — duplicate content of %s", doc_id, already_processed["doc_id"])
        return (1, 0)

    job_hash = hashlib.sha256(f"{doc_id}|audio-v0".encode()).hexdigest()
    job = queue.enqueue(conn, job_type="transcribe", target_id=doc_id, input_hash=job_hash,
                        parent_doc_id=doc_id, priority=100)
    jobs_added = 0 if job is None else 1
    return (1, jobs_added)
|
||||
|
||||
|
||||
def ingest_rss(conn: sqlite3.Connection, source: sqlite3.Row, *, since=None, until=None, limit=20):
    """Ingest up to `limit` episodes of an RSS feed whose publish date lies in [since, until].

    Episodes with no publish date are never filtered out by the window. Returns
    ``(n_docs, n_jobs)`` summed over the per-episode `_enqueue_doc` results.

    Raises:
        ValueError: the source row has no ``rss_url``.
    """
    feed_url = source["rss_url"]
    if not feed_url:
        raise ValueError(f"{source['source_id']} has no rss_url")

    episodes = episode_records(fetch_feed(feed_url))
    docs_added = jobs_added = 0
    accepted = 0
    for episode in episodes:
        published = episode["published"]
        # Window test only applies when the episode actually carries a date.
        out_of_window = bool(published) and (
            (since and published < since) or (until and published > until)
        )
        if out_of_window:
            continue
        if accepted >= limit:
            break
        accepted += 1
        new_docs, new_jobs = _enqueue_doc(
            conn,
            source_id=source["source_id"],
            kind="podcast",
            external_id=episode["guid"],
            url=episode["audio_url"],
            title=episode["title"],
            date=published,
        )
        docs_added += new_docs
        jobs_added += new_jobs
    return docs_added, jobs_added
|
||||
|
||||
|
||||
def ingest_youtube(conn: sqlite3.Connection, source: sqlite3.Row, *, since=None, until=None,
                   limit=20, max_scan=800):
    """Enumerate channel videos in the date window via yt-dlp (NON-flat, so upload_date is populated —
    flat mode returns NA). Videos come newest-first, so we use --dateafter/--datebefore to select the
    window and --break-match-filters to STOP scanning once we drop below `since` (avoids walking the
    entire channel history). The transcribe worker downloads audio on demand.

    Args:
        conn: database connection handed to `_enqueue_doc`.
        source: row with ``source_id`` and ``channel_url``.
        since / until: optional ``YYYY-MM-DD`` bounds (dashes are stripped for yt-dlp).
        limit: maximum number of videos registered per call.
        max_scan: passed to ``--playlist-end`` to cap how far yt-dlp walks.

    Returns:
        ``(n_docs, n_jobs)`` summed over the per-video `_enqueue_doc` results.

    Raises:
        ValueError: the source row has no ``channel_url``.
        subprocess.TimeoutExpired: yt-dlp ran longer than 900 s.
    """
    if not source["channel_url"]:
        raise ValueError(f"{source['source_id']} has no channel_url")
    url = source["channel_url"].rstrip("/")
    if "/playlist" not in url and not url.endswith("/videos"):
        url = url + "/videos"
    cmd = ["yt-dlp", "--no-warnings", "--ignore-errors", "--skip-download",
           "--print", "%(id)s\t%(upload_date)s\t%(title)s", "--playlist-end", str(max_scan)]
    if since:
        s = since.replace("-", "")
        cmd += ["--dateafter", s, "--break-match-filters", f"upload_date>={s}"]
    if until:
        cmd += ["--datebefore", until.replace("-", "")]
    cmd.append(url)
    out = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
    # A non-zero exit is normal when a break filter stops the scan, so only complain when the run
    # produced nothing at all — otherwise a broken yt-dlp install looks like "0 new episodes".
    if out.returncode != 0 and not out.stdout.strip():
        log.warning("yt-dlp exited %d with no output for %s: %s",
                    out.returncode, url, (out.stderr or "").strip()[-300:])

    n_docs = n_jobs = count = 0
    for line in out.stdout.splitlines():
        # maxsplit=2: the title is the last field and may itself contain tabs.
        parts = line.split("\t", 2)
        if len(parts) < 2 or not parts[0] or parts[1] in ("NA", ""):
            continue
        vid, upd = parts[0], parts[1]
        title = parts[2] if len(parts) > 2 else vid
        date = f"{upd[:4]}-{upd[4:6]}-{upd[6:8]}" if len(upd) == 8 else None
        if count >= limit:
            break
        count += 1
        nd, nj = _enqueue_doc(conn, source_id=source["source_id"], kind="youtube",
                              external_id=vid, url=f"https://www.youtube.com/watch?v={vid}",
                              title=title, date=date)
        n_docs += nd
        n_jobs += nj
    return n_docs, n_jobs
|
||||
@@ -0,0 +1,60 @@
|
||||
"""Cross-chunk speaker stitching + the voiceprint library (§4.1, §4.5).
|
||||
|
||||
diarize-chunk returns a 192-d TitaNet voiceprint per speaker per chunk. Because each chunk is
|
||||
diarized independently, "Speaker 1" in chunk 3 is not the same label as "Speaker 1" in chunk 7 —
|
||||
we re-cluster by cosine similarity (~0.7 distance threshold) so one person gets one identity across
|
||||
the whole episode. The SAME library then matches a guest ACROSS shows by voice (the independence
|
||||
graph's hardest edge, §4.5).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Default cut-off for "same speaker": stitch_chunks() and match_library() take it as their default
# `threshold` and accept a voiceprint only when its cosine distance is strictly below this value.
DISTANCE_THRESHOLD = 0.7 # cosine DISTANCE (1 - cosine similarity); §4.1
|
||||
|
||||
|
||||
def _unit(v: np.ndarray) -> np.ndarray:
|
||||
n = np.linalg.norm(v)
|
||||
return v / n if n else v
|
||||
|
||||
|
||||
def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
    """Return ``1 - cosine_similarity(a, b)`` as a plain float (inputs are coerced to float arrays)."""
    unit_a = _unit(np.asarray(a, dtype=float))
    unit_b = _unit(np.asarray(b, dtype=float))
    similarity = np.dot(unit_a, unit_b)
    return float(1.0 - similarity)
|
||||
|
||||
|
||||
def stitch_chunks(chunk_voiceprints: list[np.ndarray], *, threshold: float = DISTANCE_THRESHOLD) -> list[int]:
    """Greedy online clustering of per-(chunk,speaker) voiceprints into stable speaker ids.

    Input:  a flat list of voiceprint vectors (one per chunk-speaker, in encounter order).
    Output: a parallel list of cluster ids. Each vector is compared with every existing centroid;
            it joins the nearest one when that distance is strictly below `threshold` (the centroid
            becomes the running mean of its members), otherwise it founds a new cluster.
    """
    centroids: list[np.ndarray] = []
    sizes: list[int] = []
    assignments: list[int] = []
    for raw in chunk_voiceprints:
        vec = np.asarray(raw, dtype=float)
        target = -1  # -1 means "no existing cluster is close enough"
        if centroids:
            distances = [cosine_distance(vec, centre) for centre in centroids]
            nearest = int(np.argmin(distances))
            if distances[nearest] < threshold:
                target = nearest
        if target < 0:
            target = len(centroids)
            centroids.append(vec.copy())
            sizes.append(1)
        else:
            members = sizes[target]
            centroids[target] = (centroids[target] * members + vec) / (members + 1)
            sizes[target] = members + 1
        assignments.append(target)
    return assignments
|
||||
|
||||
|
||||
def match_library(vp: np.ndarray, library: list[tuple[str, np.ndarray]], *,
                  threshold: float = DISTANCE_THRESHOLD) -> str | None:
    """Find the library voiceprint closest to `vp`.

    Returns the voiceprint_id of the nearest entry whose cosine distance is strictly below
    `threshold`; None when no entry qualifies (a new speaker → caller mints a new library id).
    On equal distances the earliest entry wins.
    """
    winner: str | None = None
    closest = threshold
    for entry_id, entry_vec in library:
        distance = cosine_distance(vp, entry_vec)
        if distance < closest:
            closest = distance
            winner = entry_id
    return winner
|
||||
@@ -0,0 +1,308 @@
|
||||
"""Audio → speaker-attributed transcript + voiceprint library (§4.1, §4.5).
|
||||
|
||||
Per chunk (sequential — audio lock): diarize-chunk (192-d TitaNet fingerprints + timed speaker
|
||||
segments) + transcribe (word timestamps). Align words to speakers by time, stitch speakers ACROSS
|
||||
chunks by fingerprint cosine, then match the persisted voiceprint library so the SAME guest is
|
||||
recognized ACROSS shows by voice — the highest-leverage input to the source-independence graph.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..backfill import queue
|
||||
from .chunker import chunk_audio
|
||||
from .download import download_enclosure, download_youtube_audio, to_wav_16k_mono
|
||||
from .speaker_stitch import DISTANCE_THRESHOLD, match_library, stitch_chunks
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------- alignment ----------
|
||||
def _speaker_at(segments: list[dict], t: float) -> str:
|
||||
for s in segments:
|
||||
if s["start_s"] <= t <= s["end_s"]:
|
||||
return s["speaker"]
|
||||
if not segments:
|
||||
return "Speaker_0"
|
||||
return min(segments, key=lambda s: min(abs(s["start_s"] - t), abs(s["end_s"] - t)))["speaker"]
|
||||
|
||||
|
||||
def align_words(words: list[dict], segments: list[dict]) -> list[dict]:
    """Group word-level transcription into speaker turns using the diarization segments.

    Each word is attributed to the speaker found at its temporal midpoint; consecutive words with
    the same speaker are merged into one turn ``{speaker, start, end, text}``.
    """
    turns: list[dict] = []
    for word in words:
        midpoint = (word["start"] + word["end"]) / 2
        speaker = _speaker_at(segments, midpoint)
        if turns and turns[-1]["speaker"] == speaker:
            # Same speaker keeps talking: extend the open turn.
            open_turn = turns[-1]
            open_turn["text"] += " " + word["text"]
            open_turn["end"] = word["end"]
            continue
        turns.append({"speaker": speaker, "start": word["start"], "end": word["end"], "text": word["text"]})
    return turns
|
||||
|
||||
|
||||
# ---------- per-document audio processing ----------
|
||||
def diarize_transcribe_chunks(sc, chunks: list[Path], *, concurrency: int = 2):
    """Returns (chunk_turns, chunk_speakers): turns per chunk + (chunk_idx, local_spk, fingerprint).

    Drives up to `concurrency` chunks in flight — the client's global audio SEMAPHORE is the hard cap
    across both parakeet endpoints (sit at 2: keeps the single serial GPU continuously fed = full
    throughput, no idle gap). A single chunk's failure is non-fatal (skip; the client already
    busy-retries transient blips). The whole job raises — so it is retried later instead of emitting
    an empty or half-empty transcript — when EVERY chunk failed, or when at least
    ``max(3, len(chunks) // 2)`` chunks failed. Results are reassembled in chunk order.

    Raises:
        RuntimeError: too many (or all) chunks failed, per the rule above.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    def _one(idx: int, ch: Path):
        dia = sc.diarize_chunk(str(ch))
        tr = sc.transcribe(str(ch))
        turns = align_words(tr.get("words", []), dia.get("segments", []))
        spks = [(idx, spk, np.asarray(vec, dtype=np.float32))
                for spk, vec in (dia.get("fingerprints") or {}).items()]
        return idx, turns, spks

    results: dict[int, tuple] = {}
    failed = 0
    with ThreadPoolExecutor(max_workers=max(1, concurrency)) as pool:
        futs = {pool.submit(_one, i, ch): i for i, ch in enumerate(chunks)}
        for fut in as_completed(futs):
            try:
                idx, turns, spks = fut.result()
            except Exception as e:  # noqa: BLE001 — one contended chunk shouldn't kill the episode
                failed += 1
                log.warning("chunk %d/%d failed (%s) — skipping", futs[fut], len(chunks), str(e)[:90])
            else:
                results[idx] = (turns, spks)

    total = len(chunks)
    # `failed == total` covers short episodes (1–2 chunks) where the max(3, …) floor can never be
    # reached: without it an all-failed episode would yield an empty transcript marked as processed.
    if total and (failed == total or failed >= max(3, total // 2)):
        raise RuntimeError(f"{failed}/{len(chunks)} chunks failed — backend contended; will retry later")

    ordered = sorted(results)
    chunk_turns = [(idx, results[idx][0]) for idx in ordered]
    chunk_speakers = [s for idx in ordered for s in results[idx][1]]
    return chunk_turns, chunk_speakers
|
||||
|
||||
|
||||
def stitch_and_centroids(chunk_speakers, *, threshold: float = DISTANCE_THRESHOLD):
    """Cluster all (chunk,speaker) fingerprints into within-episode global speakers.

    Returns ``(keymap, centroids)``: `keymap` maps ``(chunk_idx, local_speaker)`` to a cluster id,
    `centroids` maps each cluster id to the mean of its member fingerprints. Both are empty when
    there are no fingerprints.
    """
    if not chunk_speakers:
        return {}, {}

    cluster_ids = stitch_chunks([vec for _, _, vec in chunk_speakers], threshold=threshold)

    keymap: dict[tuple[int, str], int] = {}
    members: dict[int, list[np.ndarray]] = {}
    for (chunk_idx, local_spk, vec), cluster in zip(chunk_speakers, cluster_ids):
        keymap[(chunk_idx, local_spk)] = cluster
        if cluster not in members:
            members[cluster] = []
        members[cluster].append(vec)

    centroids = {}
    for cluster, vecs in members.items():
        centroids[cluster] = np.mean(vecs, axis=0)
    return keymap, centroids
|
||||
|
||||
|
||||
def _load_library(conn) -> list[tuple[str, np.ndarray]]:
|
||||
rows = conn.execute("SELECT voiceprint_id, vector, person_label FROM voiceprints").fetchall()
|
||||
return [(r["voiceprint_id"], np.frombuffer(r["vector"], dtype=np.float32)) for r in rows]
|
||||
|
||||
|
||||
def _label_for(conn, vpid: str) -> str:
|
||||
r = conn.execute("SELECT person_label FROM voiceprints WHERE voiceprint_id=?", (vpid,)).fetchone()
|
||||
return (r["person_label"] if r and r["person_label"] else f"SPK:{vpid[:8]}")
|
||||
|
||||
|
||||
def resolve_voiceprints(conn, doc, centroids: dict[int, np.ndarray], *, threshold: float = DISTANCE_THRESHOLD):
    """Map every within-episode speaker cluster to a persisted voiceprint id.

    A centroid that matches the library (cross-show identity) reuses that id; otherwise a new
    ``vp_<16 hex>`` id is minted and stored. One observation row is recorded per cluster, and a
    shared_guest edge is upserted for each OTHER source in which the same voice was observed.
    Returns ``{cluster_id: voiceprint_id}``.
    """
    insert_voiceprint = "INSERT INTO voiceprints (voiceprint_id, vector, first_doc_id) VALUES (?,?,?)"
    insert_observation = "INSERT INTO voiceprint_observations (voiceprint_id, doc_id, chunk_idx) VALUES (?,?,?)"
    other_sources = """SELECT DISTINCT d.source_id FROM voiceprint_observations o
                       JOIN documents d ON d.doc_id = o.doc_id
                       WHERE o.voiceprint_id=? AND d.source_id != ?"""
    upsert_edge = """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
                     VALUES (?,?,'shared_guest',1.0,?)
                     ON CONFLICT(src_a, src_b, edge_type)
                     DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence"""

    known = _load_library(conn)
    assigned: dict[int, str] = {}
    for cluster, centroid in centroids.items():
        voiceprint_id = match_library(centroid, known, threshold=threshold)
        if voiceprint_id is None:
            voiceprint_id = "vp_" + uuid.uuid4().hex[:16]
            conn.execute(
                insert_voiceprint,
                (voiceprint_id, centroid.astype(np.float32).tobytes(), doc["doc_id"]),
            )
            # Later clusters of this same episode can match the freshly minted entry.
            known.append((voiceprint_id, centroid))
        conn.execute(insert_observation, (voiceprint_id, doc["doc_id"], None))
        assigned[cluster] = voiceprint_id
    conn.commit()

    # independence graph (§4.5): if this voice appears in a DIFFERENT source, that's a shared guest.
    for voiceprint_id in set(assigned.values()):
        elsewhere = conn.execute(other_sources, (voiceprint_id, doc["source_id"])).fetchall()
        for hit in elsewhere:
            # Edge endpoints are stored in sorted order so (a, b) and (b, a) share one row.
            first, second = sorted([doc["source_id"], hit["source_id"]])
            conn.execute(upsert_edge, (first, second, voiceprint_id))
    conn.commit()
    return assigned
|
||||
|
||||
|
||||
def _labeled(chunk_turns, keymap, label_by_cluster: dict) -> str:
|
||||
lines: list[str] = []
|
||||
for idx, turns in chunk_turns:
|
||||
for t in turns:
|
||||
lab = keymap.get((idx, t["speaker"]))
|
||||
label = label_by_cluster.get(lab, t["speaker"])
|
||||
lines.append(f"{label}: {t['text']}")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def build_transcript(conn, chunk_turns, keymap, cluster_to_vpid) -> str:
    """Render the final transcript, labelling each cluster with its voiceprint's display label."""
    display_labels = {}
    for cluster, voiceprint_id in cluster_to_vpid.items():
        display_labels[cluster] = _label_for(conn, voiceprint_id)
    return _labeled(chunk_turns, keymap, display_labels)
|
||||
|
||||
|
||||
def apply_names(conn, cluster_to_vpid: dict, idmap: dict) -> dict:
    """Attach confident names to the voiceprint library (person_label). Returns {cluster: name}.

    The identification entry for cluster ``k`` is looked up under ``"Speaker <k+1>"`` and then
    ``"<k+1>"``. A name is written only when it is non-blank and the entry's confidence is
    "med" or "high".
    """
    accepted_confidence = ("med", "high")
    named: dict[int, str] = {}
    for cluster, voiceprint_id in cluster_to_vpid.items():
        ordinal = cluster + 1
        entry = idmap.get(f"Speaker {ordinal}") or idmap.get(str(ordinal)) or {}
        if not isinstance(entry, dict):
            continue
        candidate = (entry.get("name") or "").strip()
        if not candidate:
            continue
        if entry.get("confidence") not in accepted_confidence:
            continue
        conn.execute("UPDATE voiceprints SET person_label=? WHERE voiceprint_id=?", (candidate, voiceprint_id))
        named[cluster] = candidate
    conn.commit()
    return named
|
||||
|
||||
|
||||
def add_name_edges(conn, doc, cluster_to_vpid: dict) -> int:
    """Name-based shared_guest edges: same person_label seen in a DIFFERENT source → independence edge,
    even if the voiceprints didn't cluster (drift-robust complement to voiceprint matching, §4.5).

    Returns the number of edge upserts issued (one per (named voiceprint, other source) pair).
    """
    label_sql = "SELECT person_label FROM voiceprints WHERE voiceprint_id=?"
    other_sources_sql = """SELECT DISTINCT d.source_id FROM voiceprints v
                           JOIN voiceprint_observations o ON o.voiceprint_id = v.voiceprint_id
                           JOIN documents d ON d.doc_id = o.doc_id
                           WHERE v.person_label = ? AND d.source_id != ?"""
    upsert_sql = """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
                    VALUES (?,?,'shared_guest',1.0,?)
                    ON CONFLICT(src_a, src_b, edge_type)
                    DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence"""

    upserts = 0
    for voiceprint_id in set(cluster_to_vpid.values()):
        row = conn.execute(label_sql, (voiceprint_id,)).fetchone()
        label = row["person_label"] if row else None
        if not label:
            continue  # unnamed voiceprints contribute no name edge
        elsewhere = conn.execute(other_sources_sql, (label, doc["source_id"])).fetchall()
        for hit in elsewhere:
            first, second = sorted([doc["source_id"], hit["source_id"]])
            conn.execute(upsert_sql, (first, second, f"name:{label}"))
            upserts += 1
    conn.commit()
    return upserts
|
||||
|
||||
|
||||
def _download_audio(doc, cfg) -> Path:
|
||||
cache = Path(cfg.audio_cache_dir)
|
||||
cache.mkdir(parents=True, exist_ok=True)
|
||||
wav = cache / f"{doc['doc_id'].replace(':', '_')}.wav"
|
||||
if wav.exists():
|
||||
return wav
|
||||
url = doc["url"]
|
||||
if doc["kind"] == "youtube" or (url and ("youtube.com" in url or "youtu.be" in url)):
|
||||
return download_youtube_audio(url, cache, archive_file=cache / "yt-archive.txt")
|
||||
raw = download_enclosure(url, cache / f"{doc['doc_id'].replace(':', '_')}.src")
|
||||
return to_wav_16k_mono(raw, wav)
|
||||
|
||||
|
||||
def process_document(conn, sc, cfg, doc, *, max_chunks: int, chunk_seconds: int = 150,
                     keep_audio: bool = False) -> int:
    """Turn one document's audio into a speaker-labelled transcript and queue it for extraction.

    Steps: download → chunk (at most `max_chunks`) → diarize + transcribe → stitch speakers across
    chunks → resolve against the voiceprint library → best-effort speaker naming → write the
    transcript under ``<cfg.data_dir>/transcripts`` → update the documents row → enqueue an
    'extract' job → delete the audio unless `keep_audio`.

    Returns:
        Number of chunks that produced turns (``len(chunk_turns)``).

    Raises:
        Whatever the download / chunking / diarization steps raise; the naming step alone is
        wrapped and only logged on failure.
    """
    import hashlib

    audio = _download_audio(doc, cfg)
    chunkdir = Path(cfg.audio_cache_dir) / f"chunks_{doc['doc_id'].replace(':', '_')}"
    chunks = chunk_audio(audio, chunkdir, chunk_seconds=chunk_seconds)[:max_chunks]
    chunk_turns, chunk_speakers = diarize_transcribe_chunks(
        sc, chunks, concurrency=getattr(cfg, "audio_concurrency", 2))
    keymap, centroids = stitch_and_centroids(chunk_speakers)
    cluster_to_vpid = resolve_voiceprints(conn, doc, centroids)

    # Name the speakers (§4.5): host introduces guest in 1-on-1 → attach person_label, then a
    # name-based shared_guest edge that survives voiceprint drift across shows.
    src = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
    try:
        from ..extract.backends import from_config as backend_from_config
        from .identify import identify_speakers
        backend = backend_from_config(cfg, sc)
        draft = _labeled(chunk_turns, keymap, {lab: f"Speaker {lab + 1}" for lab in cluster_to_vpid})
        idmap = identify_speakers(backend, draft[:6000], source_name=src["name"] if src else "")
        named = apply_names(conn, cluster_to_vpid, idmap)
        if named:
            log.info("named speakers in %s: %s", doc["doc_id"], ", ".join(named.values()))
    except Exception as e:  # noqa: BLE001 — naming is best-effort enrichment
        log.warning("speaker identification failed for %s: %s", doc["doc_id"], e)
    add_name_edges(conn, doc, cluster_to_vpid)

    transcript = build_transcript(conn, chunk_turns, keymap, cluster_to_vpid)
    tpath = Path(cfg.data_dir) / "transcripts" / f"{doc['doc_id'].replace(':', '_')}.txt"
    tpath.parent.mkdir(parents=True, exist_ok=True)
    # Write and hash the SAME UTF-8 bytes: with the locale default encoding the file could fail to
    # write (or differ from content_hash) on a non-UTF-8 platform.
    tpath.write_text(transcript, encoding="utf-8")
    content_hash = hashlib.sha256(transcript.encode("utf-8")).hexdigest()
    conn.execute(
        "UPDATE documents SET transcript_path=?, duration_sec=?, content_hash=?, processed_at=datetime('now') WHERE doc_id=?",
        # duration is an upper estimate: processed chunk count × nominal chunk length
        (str(tpath), len(chunks) * chunk_seconds, content_hash, doc["doc_id"]),
    )
    conn.commit()
    h = hashlib.sha256(f"{doc['doc_id']}|extract-v0".encode()).hexdigest()
    queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
                  parent_doc_id=doc["doc_id"], priority=100)
    if not keep_audio:
        _cleanup_audio(audio, chunkdir)
    return len(chunk_turns)
|
||||
|
||||
|
||||
def _cleanup_audio(audio: Path, chunkdir: Path) -> None:
|
||||
"""Audio files are large and disposable once transcribed — reclaim the disk (the transcript +
|
||||
voiceprints are what we keep). Backfilling hundreds of 1-3 hr episodes would otherwise be tens of GB."""
|
||||
import shutil
|
||||
try:
|
||||
if audio.exists():
|
||||
audio.unlink()
|
||||
src = audio.with_suffix(".src")
|
||||
if src.exists():
|
||||
src.unlink()
|
||||
if chunkdir.exists():
|
||||
shutil.rmtree(chunkdir, ignore_errors=True)
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("audio cleanup failed for %s: %s", audio, e)
|
||||
|
||||
|
||||
def run_transcribe(conn, sc, cfg, *, limit: int = 5, max_chunks: int = 999,
                   lease_seconds: int = 3600, worker_id: str = "transcribe-1") -> dict:
    """Lease and run up to `limit` 'transcribe' jobs, one at a time.

    Stops early when the queue has nothing to lease. Every leased job counts towards the total,
    whether it completed, failed, or was skipped because its document row is missing.
    Returns ``{"jobs_processed": <count>}``.
    """
    leased = 0
    for _ in range(limit):
        job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
        if job is None:
            break
        leased += 1

        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
        if doc is None:
            queue.skip(conn, job["job_id"], "document missing")
            continue

        try:
            n_chunks = process_document(conn, sc, cfg, doc, max_chunks=max_chunks)
            queue.complete(conn, job["job_id"], output_ref=f"{n_chunks} chunks")
            log.info("transcribed %s (%d chunks)", doc["doc_id"], n_chunks)
        except Exception as e:  # noqa: BLE001
            new_state = queue.fail(conn, job["job_id"], e)
            log.warning("transcribe failed for %s: %s (→ %s)", job["target_id"], e, new_state)
    return {"jobs_processed": leased}
|
||||
@@ -0,0 +1,6 @@
|
||||
"""The scoring brain (build blueprint).
|
||||
|
||||
Stats/geometry NOMINATE candidates; the frontier model only judges/expands a pre-filtered shortlist
|
||||
(§5.1). Every count that feeds a score routes through the independence primitive (EISC), never a raw
|
||||
source count (§4.5). Every scorer reads `visible_claims` (as-of filtered), never `claims` directly.
|
||||
"""
|
||||
@@ -0,0 +1,43 @@
|
||||
"""As-of harness (§6.6 look-ahead guard).
|
||||
|
||||
Every scorer reads the `visible_claims` TEMP VIEW, never `claims` directly: at nomination time only
|
||||
claims dated <= as_of are visible, so the backtest can't reward noticing what already happened. The
|
||||
view also resolves merged canonical topics (topics.status='merged') to a stable `topic_id`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
|
||||
|
||||
class Scorer:
    """Context manager that binds a run to an as_of date and exposes `visible_claims`.

    mode='backtest' enforces strict as-of discipline; 'forward' is the live pilot. as_of is a
    controlled ISO date (YYYY-MM-DD) and is inlined into the view DDL (views can't take params),
    so the constructor validates it rather than trusting the caller.
    """

    def __init__(self, conn: sqlite3.Connection, as_of: str, *, mode: str = "backtest") -> None:
        """Store the connection and the validated as-of date.

        Raises:
            ValueError: `as_of` is not a zero-padded ``YYYY-MM-DD`` calendar date.
        """
        import re
        from datetime import date

        text = str(as_of)
        # Strict shape first: the value is interpolated into SQL, and the view compares dates as
        # strings, so anything but zero-padded YYYY-MM-DD is either unsafe or silently mis-sorted.
        if not re.fullmatch(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", text):
            raise ValueError(f"as_of must be an ISO date (YYYY-MM-DD), got {as_of!r}")
        try:
            date.fromisoformat(text)
        except ValueError as err:
            raise ValueError(f"as_of is not a valid calendar date: {as_of!r}") from err
        self.conn = conn
        self.as_of = text
        self.mode = mode

    def __enter__(self) -> "Scorer":
        """(Re)create the TEMP VIEW `visible_claims`: dated claims with date <= as_of whose document
        row exists, plus a `topic_id` column that follows topics.merged_into for merged topics."""
        self.conn.executescript(
            f"""
            DROP VIEW IF EXISTS visible_claims;
            CREATE TEMP VIEW visible_claims AS
            SELECT c.*,
                   COALESCE((SELECT t.merged_into FROM topics t
                             WHERE t.topic_canonical = c.topic_canonical AND t.status='merged'),
                            c.topic_canonical) AS topic_id
            FROM claims c
            JOIN documents d ON d.doc_id = c.doc_id
            WHERE c.date IS NOT NULL AND c.date <= '{self.as_of}';
            """
        )
        return self

    def __exit__(self, *exc) -> None:
        """Drop the view; exceptions from the with-body propagate unchanged."""
        self.conn.execute("DROP VIEW IF EXISTS visible_claims")

    def count_visible(self) -> int:
        """Number of rows currently exposed by `visible_claims` (only valid inside the with-block)."""
        return self.conn.execute("SELECT COUNT(*) FROM visible_claims").fetchone()[0]
|
||||
@@ -0,0 +1,49 @@
|
||||
"""The quantitative bar (§5.1, §6.6) — the single gate between nomination and the frontier judge.
|
||||
|
||||
Two tiers:
|
||||
- evidence bar → clears hard gates → WRITE A LEDGER ROW (the denominator, §6.6), even if never judged.
|
||||
- promotion bar → also clears the score threshold → goes to the frontier judge.
|
||||
|
||||
THE GLOBAL META-RULE (applied to every scorer): no candidate clears on a single source or single
|
||||
cluster — EISC_adj >= 2.0 AND K_eff >= 2. This is the §2.1 anti-lonely-outlier law, enforced once.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
# Hard independence floors shared by the scorers.
EISC_FLOOR = 2.0
KEFF_FLOOR = 2

# Defaults; overridable via the score_thresholds table (so the backtest can sweep without code edits).
DEFAULT_MIN_SCORE = {
    "under_acted": 0.3,
    "emergence": 2.0,
    "contrarian": 1.5,
    "convergence": 2.5,
    "intersection": 2.0,
}
|
||||
|
||||
|
||||
def _min_score(conn, scorer: str) -> float:
|
||||
if conn is not None:
|
||||
row = conn.execute("SELECT min_score FROM score_thresholds WHERE scorer=?", (scorer,)).fetchone()
|
||||
if row and row[0] is not None:
|
||||
return float(row[0])
|
||||
return DEFAULT_MIN_SCORE.get(scorer, 0.0)
|
||||
|
||||
|
||||
def evaluate(scorer: str, result: dict, *, conn=None) -> tuple[bool, bool]:
    """Returns (cleared_evidence_bar, cleared_promotion_bar).

    Only the "under_acted" scorer is evaluated; every other scorer name yields (False, False).
    """
    if scorer != "under_acted":
        return (False, False)  # Job A scorers wired with the forward pilot
    threshold = _min_score(conn, scorer)
    return _under_acted(result, threshold)
|
||||
|
||||
|
||||
def _under_acted(result: dict, min_score: float) -> tuple[bool, bool]:
|
||||
i = result["inputs"]
|
||||
breaker = bool(i.get("is_breaker"))
|
||||
# §4.4 Job B = "rising INDEPENDENT corroboration". EISC>=2.0 enforces independence (shared-guest +
|
||||
# same-cluster discounting), so this is NOT an isolated point or one-guest echo (§2.1). Cross-cluster
|
||||
# (k_eff>=2) is the §4.5 GOLD for Job A DISCOVERY — NOT a hard gate for Job B corroboration: N
|
||||
# independent energy companies confirming a power thesis is real corroboration. Cross-cluster still
|
||||
# BOOSTS the score (eisc_corrob = eisc_adj includes the xcluster_mult) so cross-cluster ranks first.
|
||||
corroborated = (i.get("n_confirmed", 0) >= 4 and i.get("n_src", 0) >= 2
|
||||
and i.get("eisc_corrob", 0.0) >= EISC_FLOOR and i.get("a_corrob", 0.0) > 0)
|
||||
conv_ok = breaker or i.get("conviction_weight", 0.0) >= 0.7 # med-high / high
|
||||
expo_ok = breaker or i.get("exposure") in ("none", "lt2") # genuine exposure gap
|
||||
evidence = corroborated and conv_ok and expo_ok
|
||||
promotion = evidence and result["score"] >= min_score
|
||||
return evidence, promotion
|
||||
@@ -0,0 +1,86 @@
|
||||
"""Pre-registered confusion matrix on the §7.1 derivatives (DESIGN_v2 §1.3).
|
||||
|
||||
Measures PRECISION and RECALL, not recall alone. Uses the engine's already-stored candidate_scores
|
||||
(cleared_date + whisper_date) × the pre-registered external repricing (resolution.K2023.yaml). Reports
|
||||
the matrix at BOTH the cleared level (what the engine fired) and the whisper level (what it saw before
|
||||
the independence floor) — the delta is the empirical answer to the gate debate.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
import yaml
|
||||
|
||||
from .external import basket_index, fetch_eod, resolve_reprice, runway_at_signal
|
||||
|
||||
|
||||
def _engine_dates(conn) -> dict[str, dict]:
|
||||
"""For each under_acted node: earliest cleared as_of and earliest whisper as_of (n_conf>=4, a>0)."""
|
||||
rows = conn.execute(
|
||||
"SELECT node_id, conviction_id, as_of, cleared_evidence_bar ev, inputs_json "
|
||||
"FROM candidate_scores WHERE scorer='under_acted'"
|
||||
).fetchall()
|
||||
out: dict[str, dict] = {}
|
||||
for r in rows:
|
||||
k = r["node_id"] or r["conviction_id"]
|
||||
i = json.loads(r["inputs_json"])
|
||||
d = out.setdefault(k, {"cleared": None, "whisper": None})
|
||||
if r["ev"] and (d["cleared"] is None or r["as_of"] < d["cleared"]):
|
||||
d["cleared"] = r["as_of"]
|
||||
if i.get("n_confirmed", 0) >= 4 and i.get("a_corrob", 0) > 0:
|
||||
if d["whisper"] is None or r["as_of"] < d["whisper"]:
|
||||
d["whisper"] = r["as_of"]
|
||||
return out
|
||||
|
||||
|
||||
def _lead_days(repricing_date: str, signal_date: str | None) -> int | None:
|
||||
if not signal_date or not repricing_date:
|
||||
return None
|
||||
return (datetime.strptime(repricing_date, "%Y-%m-%d") - datetime.strptime(signal_date, "%Y-%m-%d")).days
|
||||
|
||||
|
||||
def run_confusion(conn, cfg, spec_path: str) -> dict:
    """Build the confusion matrix of engine signals against the pre-registered price repricing.

    Args:
        conn: database connection holding ``candidate_scores``.
        cfg: config object; ``cfg.fmp_api_key`` is used for the price fetches.
        spec_path: YAML spec with ``window`` (start/end), ``rule`` (threshold_pct, hold_pct,
            hold_days) and ``baskets`` (node → list of symbols).

    Returns:
        ``{"rows": [...], "cleared": (counts, precision, recall), "whisper": (...), "classify": fn}``
        where precision/recall are None when their denominator is zero.
    """
    # Context manager so the spec file handle is closed deterministically.
    with open(spec_path, encoding="utf-8") as fh:
        spec = yaml.safe_load(fh)
    w, rule = spec["window"], spec["rule"]
    engine = _engine_dates(conn)
    price_cache: dict[str, list] = {}  # one fetch per symbol even when baskets overlap

    rows = []
    for node, basket in spec["baskets"].items():
        prices = {}
        for sym in basket:
            if sym not in price_cache:
                price_cache[sym] = fetch_eod(cfg.fmp_api_key, sym, w["start"], w["end"])
            prices[sym] = price_cache[sym]
        missing = [s for s in basket if not prices[s]]
        idx = basket_index(prices)
        res = resolve_reprice(idx, threshold_pct=rule["threshold_pct"], hold_pct=rule["hold_pct"],
                              hold_days=rule["hold_days"])
        ed = engine.get(node, {"cleared": None, "whisper": None})
        rows.append({
            "node": node, "basket": basket, "missing": missing,
            "confirmed": res["confirmed"], "repricing_date": res["repricing_date"], "peak_pct": res["peak_pct"],
            "cleared_date": ed["cleared"], "whisper_date": ed["whisper"],
            "lead_cleared": _lead_days(res["repricing_date"], ed["cleared"]) if res["confirmed"] else None,
            "lead_whisper": _lead_days(res["repricing_date"], ed["whisper"]) if res["confirmed"] else None,
            # DESIGN_v2.1 Correction A: runway = fraction of the durable move still ahead at signal
            "runway_cleared": runway_at_signal(idx, ed["cleared"]) if res["confirmed"] else None,
            "runway_whisper": runway_at_signal(idx, ed["whisper"]) if res["confirmed"] else None,
        })

    def classify(r, level):
        """TP/FP/FN/TN for one row at `level` ('cleared' or 'whisper')."""
        fired = bool(r[f"{level}_date"])
        real = r["confirmed"]
        return "TP" if (fired and real) else "FP" if (fired and not real) else "FN" if real else "TN"

    def matrix(level):
        """Counts plus precision and recall (None when undefined) at `level`."""
        c = {"TP": 0, "FP": 0, "FN": 0, "TN": 0}
        for r in rows:
            c[classify(r, level)] += 1
        p = c["TP"] / (c["TP"] + c["FP"]) if (c["TP"] + c["FP"]) else None
        rec = c["TP"] / (c["TP"] + c["FN"]) if (c["TP"] + c["FN"]) else None
        return c, p, rec

    return {"rows": rows, "cleared": matrix("cleared"), "whisper": matrix("whisper"),
            "classify": classify}
|
||||
@@ -0,0 +1,96 @@
|
||||
"""External-confirmation data for the resolver (DESIGN_v2 §1). Price series via FMP (already paid for).
|
||||
|
||||
This is the *resolving* leg (§6.2): real-world repricing, not discourse. Kept deliberately simple and
|
||||
transparent — the resolution rule is pre-registered, so the code here only fetches + applies it.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import requests
|
||||
|
||||
_FMP = "https://financialmodelingprep.com"
|
||||
|
||||
|
||||
def fetch_eod(api_key: str, symbol: str, start: str, end: str) -> list[tuple[str, float]]:
    """Daily (date, close) for a symbol. Tries the FMP 'stable' then legacy 'v3' price endpoints.

    Best-effort: any request/decoding failure, non-200 status or empty/unusable payload
    moves on to the next endpoint. Returns the rows sorted ascending (dates truncated to
    ``YYYY-MM-DD``; ``close`` preferred, ``adjClose`` as fallback), or ``[]`` if no
    endpoint produced usable data.
    """
    attempts = [
        (f"{_FMP}/stable/historical-price-eod/full", {"symbol": symbol, "from": start, "to": end}),
        (f"{_FMP}/api/v3/historical-price-full/{symbol}", {"from": start, "to": end}),
    ]
    # The session is a context manager: its pooled connections are released on every exit
    # path (previously the session was created and never closed).
    with requests.Session() as s:
        for url, params in attempts:
            try:
                r = s.get(url, params={**params, "apikey": api_key}, timeout=40)
                if r.status_code != 200:
                    continue
                j = r.json()
            except Exception:  # noqa: BLE001 - deliberate best-effort; fall through to next endpoint
                continue
            # 'v3' wraps rows in {"historical": [...]}; 'stable' returns the list directly.
            rows = j.get("historical") if isinstance(j, dict) else j
            if not rows or not isinstance(rows, list):
                continue  # error object / scalar payload: nothing to iterate
            out = [(x["date"][:10], x.get("close") or x.get("adjClose")) for x in rows
                   if isinstance(x, dict) and x.get("date") and (x.get("close") or x.get("adjClose"))]
            if out:
                return sorted(out)
    return []
|
||||
|
||||
|
||||
def basket_index(prices_by_symbol: dict[str, list[tuple[str, float]]]) -> list[tuple[str, float]]:
    """Equal-weight basket index over the union of all dates that have data.

    Each symbol is rebased to its own first close; the index value on a date is the mean
    of the rebased values of the symbols quoted that day. (Symbols that IPO'd mid-window
    therefore enter at 1.0 when they start - flagged by the caller.) Symbols with an
    empty series or a falsy first close contribute nothing.
    """
    rebased = []
    for series in prices_by_symbol.values():
        if not series:
            continue
        first_close = series[0][1]
        if not first_close:
            continue  # cannot rebase against a zero/missing first close
        rebased.append({day: close / first_close for day, close in series})

    index = []
    for day in sorted(set().union(*rebased)):
        quoted = [curve[day] for curve in rebased if day in curve]
        index.append((day, sum(quoted) / len(quoted)))
    return index
|
||||
|
||||
|
||||
def index_value_at(index: list[tuple[str, float]], date: str | None) -> float | None:
|
||||
"""Latest index value on or before `date` (baseline if the signal predates the data)."""
|
||||
if not index or not date:
|
||||
return None
|
||||
vals = [v for d, v in index if d <= date]
|
||||
return vals[-1] if vals else index[0][1]
|
||||
|
||||
|
||||
def runway_at_signal(index: list[tuple[str, float]], signal_date: str | None) -> float | None:
    """Fraction of the durable move STILL AHEAD at the signal date (DESIGN_v2.1 Correction A).

    Computed as ``(peak - value_at_signal) / (peak - baseline)``, floored at 0.0 and rounded
    to two decimals: 1.0 = whole move ahead, 0.0 = signal at the peak. Returns ``None`` when
    the index or date is missing, or when the index never rises above its first value.
    NOTE(review): `peak` is the maximum over the whole index regardless of date, so a signal
    dated after the peak still reports a positive runway - confirm this is intended.
    """
    if not index or not signal_date:
        return None
    baseline = index[0][1]
    high = max(value for _, value in index)
    at_signal = index_value_at(index, signal_date)
    if high <= baseline or at_signal is None:
        return None
    remaining = (high - at_signal) / (high - baseline)
    return round(max(0.0, remaining), 2)
|
||||
|
||||
|
||||
def resolve_reprice(index: list[tuple[str, float]], *, threshold_pct: float, hold_pct: float,
                    hold_days: int) -> dict:
    """Apply the pre-registered rule: first date the index is >= +threshold% vs baseline AND still
    >= +hold% `hold_days` later. Returns {confirmed, repricing_date, peak_pct}.

    The baseline is the first index value. "`hold_days` later" is evaluated on the first
    index entry dated on or after the target calendar date; if no such entry exists the
    candidate date is not confirmed. ``peak_pct`` is the maximum gain over baseline in
    percent (one decimal); it is ``None`` when the index is empty or the baseline is zero.
    """
    from datetime import datetime, timedelta

    if not index or not index[0][1]:
        # Empty index, or a zero/missing baseline that would make every ratio undefined
        # (previously a ZeroDivisionError).
        return {"confirmed": False, "repricing_date": None, "peak_pct": None}
    base = index[0][1]
    thr = 1.0 + threshold_pct / 100.0
    hold = 1.0 + hold_pct / 100.0
    peak_pct = round((max(v for _, v in index) / base - 1) * 100, 1)
    for d, v in index:
        if v / base < thr:
            continue
        target = (datetime.strptime(d, "%Y-%m-%d") + timedelta(days=hold_days)).strftime("%Y-%m-%d")
        # First observation on/after the hold date; ISO dates compare correctly as strings.
        held = next((vv for dd, vv in index if dd >= target), None)
        if held is not None and held / base >= hold:
            return {"confirmed": True, "repricing_date": d, "peak_pct": peak_pct}
    return {"confirmed": False, "repricing_date": None, "peak_pct": peak_pct}
|
||||
@@ -0,0 +1,113 @@
|
||||
"""Effective Independent Source Count (EISC) — the system's differentiator (§4.5).
|
||||
|
||||
Discount convergence by source connectedness. Five shows that "independently converge" but share one
|
||||
guest must count as ~one voice; three shows across macro/energy/ai with no shared guests are gold.
|
||||
|
||||
Method (resolved in the design panel): noisy-OR connectedness matrix + inverse-row-sum EISC.
|
||||
- symmetric & order-independent (unlike a sequential pairwise-penalty walk)
|
||||
- each source's contribution is individually explainable ("counts 0.31 because connected to 3 others")
|
||||
- collapses correctly: 5 clones -> ~1.0 ; 5 cross-cluster independents -> ~5.0 (raw)
|
||||
- no eigensolve (unstable at n=2..4, our common case)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Coupling per edge type: a voiceprint-confirmed shared guest is near-total redundancy on a topic.
KAPPA = {"shared_guest": 0.85, "citation": 0.45, "community": 0.60}
# Same-cluster baseline correlation (sources in the same world are partly redundant even w/o an edge).
CLUSTER_COUPLING = {"bitcoin": 0.55, "vc_consensus": 0.35}
SAME_CLUSTER_DEFAULT = 0.25
EDGE_CLAMP = 0.95  # cap kappa*weight so a heavily-weighted edge can't exceed near-total
CAP_VALUE = 0.25  # §4.5: bitcoin / capped sources contribute at most 0.25 of a voice
CLUSTER_MIN_CONTRIB = 0.5  # a cluster must add >= half an independent voice to count toward K_eff


def effective_independent_N(srcs: list[tuple], edges: list[tuple], *, mode: str = "live") -> dict:
    """srcs: [(source_id, source_cluster, cluster_capped_low[, own_network])]; edges: [(a,b,type,weight)].

    mode='live' (default) DROPS own_network sources (Ten31's own orbit - listening to ourselves, §v2.1);
    mode='test' keeps them (the reflexivity test fixture).

    Method: build a symmetric connectedness matrix C (1.0 on the diagonal) where each pair's
    value is the noisy-OR of its edge couplings and the same-cluster baseline; a source then
    contributes ``cap / rowsum(C)``. A source_id listed more than once is counted once (first
    row wins), and a per-edge coupling is clamped to ``[0, EDGE_CLAMP]`` so an invalid negative
    weight cannot make two sources look *more* than independent.

    Returns {eisc_adj, eisc_raw, k_eff, xcluster_mult, per_source_contrib}.
    """
    if mode == "live":
        srcs = [s for s in srcs if not (len(s) > 3 and s[3])]
    # One voice per source_id: a repeated row must not inflate the count.
    first_seen: dict = {}
    for s in srcs:
        first_seen.setdefault(s[0], s)
    srcs = list(first_seen.values())
    ids = [s[0] for s in srcs]
    n = len(ids)
    if n == 0:
        return {"eisc_adj": 0.0, "eisc_raw": 0.0, "k_eff": 0, "xcluster_mult": 1.0, "per_source_contrib": {}}
    known = set(ids)
    cluster = {s[0]: s[1] for s in srcs}
    capped = {s[0]: (bool(s[2]) or s[1] == "bitcoin") for s in srcs}

    # edge channel: combine all edges between a pair by noisy-OR product of (1 - kappa*weight)
    pair_factor: dict = defaultdict(lambda: 1.0)
    for a, b, etype, w in edges:
        if a in known and b in known and a != b:
            strength = KAPPA.get(etype, 0.0) * (w if w is not None else 1.0)
            term = max(0.0, min(EDGE_CLAMP, strength))
            pair_factor[frozenset((a, b))] *= (1.0 - term)

    C = np.eye(n)
    for i in range(n):
        for j in range(i + 1, n):
            a, b = ids[i], ids[j]
            e = 1.0 - pair_factor[frozenset((a, b))]  # 0 if no edge
            ci, cj = cluster[a], cluster[b]
            clust = (CLUSTER_COUPLING.get(ci, SAME_CLUSTER_DEFAULT)
                     if (ci is not None and ci == cj) else 0.0)
            c = 1.0 - (1.0 - e) * (1.0 - clust)
            C[i, j] = C[j, i] = c

    rowsum = C.sum(axis=1)  # includes the diagonal 1.0
    contrib, eisc_raw = {}, 0.0
    cluster_mass: dict = defaultdict(float)
    for i, sid in enumerate(ids):
        cap = CAP_VALUE if capped[sid] else 1.0
        contrib[sid] = cap * (1.0 / float(rowsum[i]))
        eisc_raw += contrib[sid]
        if not capped[sid] and cluster[sid]:
            cluster_mass[cluster[sid]] += contrib[sid]

    # cross-cluster bonus: count NON-capped clusters that genuinely contribute an independent voice
    # (summed contribution >= half a voice). This stops "one guest across many clusters" from earning
    # the gold multiplier - the raw EISC already collapses that guest to ~1, and k_eff must agree.
    k_eff = sum(1 for m in cluster_mass.values() if m >= CLUSTER_MIN_CONTRIB)
    xmult = max(1.0, 1.0 + 0.5 * (k_eff - 1))  # 1clu->1.0, 2->1.5, 3->2.0 (gold)
    return {
        "eisc_adj": xmult * eisc_raw,
        "eisc_raw": eisc_raw,
        "k_eff": k_eff,
        "xcluster_mult": xmult,
        "per_source_contrib": {k: round(v, 4) for k, v in contrib.items()},
    }
|
||||
|
||||
|
||||
# --- DB helpers (the brain only READS the graph; edges are produced upstream by the voiceprint lib) ---
|
||||
def load_source_meta(conn, ids: list[str]) -> list[tuple]:
    """Fetch (source_id, source_cluster, cluster_capped_low, own_network) for the given ids.

    Ids are de-duplicated (first occurrence kept); ids with no row in ``sources`` are simply
    absent from the result. A NULL ``own_network`` is returned as 0.
    """
    unique_ids = list(dict.fromkeys(ids))
    if not unique_ids:
        return []
    ph = ",".join("?" * len(unique_ids))
    cursor = conn.execute(
        f"SELECT source_id, source_cluster, cluster_capped_low, COALESCE(own_network,0) "
        f"FROM sources WHERE source_id IN ({ph})", unique_ids
    )
    return [(sid, clu, capped_low, own) for sid, clu, capped_low, own in cursor.fetchall()]
|
||||
|
||||
|
||||
def load_edges(conn, ids: list[str]) -> list[tuple]:
    """Fetch (src_a, src_b, edge_type, weight) for edges whose BOTH endpoints are in `ids`.

    Ids are de-duplicated first; an empty id list short-circuits to ``[]``.
    """
    unique_ids = list(dict.fromkeys(ids))
    if not unique_ids:
        return []
    ph = ",".join("?" * len(unique_ids))
    # The id list is bound twice: once for each IN (...) clause.
    cursor = conn.execute(
        f"SELECT src_a, src_b, edge_type, weight FROM source_edges WHERE src_a IN ({ph}) AND src_b IN ({ph})",
        unique_ids + unique_ids,
    )
    return [(src_a, src_b, kind, weight) for src_a, src_b, kind, weight in cursor.fetchall()]
|
||||
|
||||
|
||||
def eisc_for(conn, source_ids: list[str], *, mode: str = "live") -> dict:
    """Convenience: EISC for a set of source_ids, loading cluster/cap/own_network + edges from SQLite.

    mode='live' drops own_network sources; mode='test' keeps them (§v2.1 condition 1).
    Returns whatever ``effective_independent_N`` returns for the loaded graph.
    """
    unique_ids = list(dict.fromkeys(source_ids))
    meta = load_source_meta(conn, unique_ids)
    edges = load_edges(conn, unique_ids)
    return effective_independent_N(meta, edges, mode=mode)
|
||||
@@ -0,0 +1,49 @@
|
||||
"""Ledger + candidate_scores writers. Log EVERY bar-clearer from day one (§6.6 denominator).
|
||||
|
||||
date_logged = as_of (backtest rows carry historical dates so lead-time math is correct). The
|
||||
discourse_metric JSON is FROZEN here at log time — the resolver (separate forward pass) never edits it.
|
||||
Grant's rating lives in human_evaluations; the model never reads it pre-log (§6.7).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
|
||||
|
||||
def _sig_id(scorer: str, key: str, as_of: str) -> str:
|
||||
return "sig_" + hashlib.sha1(f"{scorer}|{key}|{as_of}".encode()).hexdigest()[:16]
|
||||
|
||||
|
||||
def _score_id(scorer: str, key: str, as_of: str) -> str:
|
||||
return hashlib.sha1(f"cs|{scorer}|{key}|{as_of}".encode()).hexdigest()
|
||||
|
||||
|
||||
def record_candidate_score(conn, result: dict, as_of: str, evidence: bool, promotion: bool) -> None:
    """Upsert one scorer result into ``candidate_scores`` and commit.

    The row id is derived from (scorer, key, as_of), where key is the first non-empty of
    node_id / conviction_id / topic_canonical, so re-scoring the same key at the same
    as_of replaces the earlier row (INSERT OR REPLACE). `evidence` / `promotion` are stored
    as 0/1.
    NOTE(review): inputs_json is cut at 8000 characters; a longer payload would be stored
    as truncated (unparseable) JSON - confirm inputs stay below that size.
    """
    topic = result.get("topic_canonical")
    node_id = result.get("node_id")
    conviction_id = result.get("conviction_id")
    key = node_id or conviction_id or topic or ""
    scorer = result["scorer"]
    row = (
        _score_id(scorer, key, as_of), scorer, as_of,
        topic, node_id, conviction_id,
        result["score"], int(evidence), int(promotion),
        json.dumps(result["inputs"])[:8000],
    )
    conn.execute(
        """INSERT OR REPLACE INTO candidate_scores
           (score_id, scorer, as_of, topic_canonical, node_id, conviction_id, score,
            cleared_evidence_bar, cleared_promotion_bar, inputs_json)
           VALUES (?,?,?,?,?,?,?,?,?,?)""",
        row,
    )
    conn.commit()
|
||||
|
||||
|
||||
def log_candidate(conn, *, scorer: str, as_of: str, ledger_type: str, proposition: str,
                  discourse_metric: dict, origin_conviction_id=None, origin_node_id=None) -> str:
    """Insert a bar-clearing candidate into ``ledger`` (if not already there) and commit.

    The signal id is derived from (scorer, key, as_of), with key = origin_node_id, else
    origin_conviction_id, else the proposition text. INSERT OR IGNORE leaves an existing row
    untouched (presumably signal_id is the ledger's unique key - confirm against the
    schema). The stored discourse_metric is the given dict plus a ``scorer`` field;
    model_confidence is written as NULL and date_logged is `as_of`.
    NOTE(review): proposition is cut at 1000 chars and the metric JSON at 8000 chars; a
    longer metric would be stored as truncated JSON.

    Returns the signal id.
    """
    key = origin_node_id or origin_conviction_id or proposition
    signal_id = _sig_id(scorer, key, as_of)
    frozen_metric = json.dumps({**discourse_metric, "scorer": scorer})[:8000]
    row = (
        signal_id, ledger_type, proposition[:1000], as_of, frozen_metric, None,
        origin_conviction_id, origin_node_id,
    )
    conn.execute(
        """INSERT OR IGNORE INTO ledger
           (signal_id, type, proposition, date_logged, discourse_metric, model_confidence,
            origin_conviction_id, origin_node_id)
           VALUES (?,?,?,?,?,?,?,?)""",
        row,
    )
    conn.commit()
    return signal_id
|
||||
@@ -0,0 +1,80 @@
|
||||
"""Local-LLM scoring helpers (§4.4). Bounded labeling passes over PRE-FILTERED candidates only —
|
||||
never nomination from the raw corpus (§5.1). JSON mode, temp 0, no thinking → deterministic.
|
||||
|
||||
Helper #2 (derivative-relevance) is built first — it's the one the §7.1 backtest needs. Helper #1
|
||||
(stance-folding for Job A contrarian) comes with the forward pilot.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
_REL_SYS = (
|
||||
"You assess whether claims corroborate a specific investment hypothesis (a 2nd/3rd-order "
|
||||
"derivative of a thesis). For EACH claim decide: does it provide real-world evidence that the "
|
||||
"hypothesis is PLAYING OUT (corroborates), and the direction. 'affirms' = supports the hypothesis; "
|
||||
"'contradicts' = is evidence against it; 'tangential' = same topic words but not actually about the "
|
||||
"hypothesis (e.g. 'transformers' the ML architecture vs the electrical-grid kind). Be strict: a "
|
||||
"passing mention is tangential, not corroboration. "
|
||||
"TWO HARD RULES (these are the difference between catching a real signal and being fooled):\n"
|
||||
"1) REALIZED-ONLY. The hypothesis must be PLAYING OUT in fact. Announcements, plans, intentions, "
|
||||
"forecasts, targets, and 'may/will/expects/poised-to/aims-to/up-to' language are NOT corroboration — "
|
||||
"they are 'tangential' unless the claim states the thing has ACTUALLY HAPPENED / been DEPLOYED / "
|
||||
"closed. A $2B program 'announced' or capital 'made available' is NOT capital deployed. A company "
|
||||
"that 'may consider' or 'expects' something has not done it.\n"
|
||||
"2) ROLE-MATCH. The actor in the claim must occupy the role the hypothesis is about. If the "
|
||||
"hypothesis is that capital PROVIDERS are funding/supplying something, then a BORROWER or USER on the "
|
||||
"demand side (e.g. a firm posting an asset AS collateral to RECEIVE a loan) is the wrong side of the "
|
||||
"transaction → 'tangential' to that hypothesis, not 'affirms'. "
|
||||
'Return ONLY JSON: {"results":[{"claim_id":"...","corroborates":true|false,'
|
||||
'"direction":"affirms"|"contradicts"|"tangential"}]}.'
|
||||
)
|
||||
|
||||
|
||||
def _parse(raw: str) -> list[dict]:
|
||||
try:
|
||||
obj = json.loads(raw)
|
||||
except Exception:
|
||||
i, j = raw.find("{"), raw.rfind("}")
|
||||
if i < 0 or j < 0:
|
||||
return []
|
||||
try:
|
||||
obj = json.loads(raw[i:j + 1])
|
||||
except Exception:
|
||||
return []
|
||||
res = obj.get("results", []) if isinstance(obj, dict) else []
|
||||
return [r for r in res if isinstance(r, dict) and r.get("claim_id")]
|
||||
|
||||
|
||||
def derivative_relevance(backend, derivative: str, claims: list[dict]) -> dict[str, dict]:
    """claims: [{claim_id, proposition}]. Returns {claim_id: {corroborates, direction}}.

    Filters retrieval near-misses; it cannot ADD claims search didn't return (not a nominator).
    Calls ``backend.complete_json`` at most twice; if both replies parse to nothing the result
    is ``{}`` (each empty parse is logged as a warning). A record without ``direction``
    defaults to ``"tangential"``.
    """
    if not claims:
        return {}
    bullet_lines = [f"- [{c['claim_id']}] {c['proposition']}" for c in claims]
    listing = "\n".join(bullet_lines)
    user = (f"HYPOTHESIS (derivative): {derivative}\n\nCLAIMS:\n{listing}\n\n"
            f"Judge each claim id.")
    messages = [{"role": "system", "content": _REL_SYS}, {"role": "user", "content": user}]
    # Output is ~one JSON record per claim (claim_id + corroborates + direction ~ 70-100 tokens). At
    # top_k=60 that's ~5k tokens - a fixed 3000 budget truncated mid-array -> empty parse -> a node
    # silently zeroed (the source of the unstable 5-affirm/0-affirm flip). Size the budget to the batch.
    budget = max(3000, 120 * len(claims) + 500)
    parsed = []
    for attempt_no in (1, 2):  # one retry - a gateway-under-load truncation shouldn't zero out a node
        raw = backend.complete_json(messages, max_tokens=budget)
        parsed = _parse(raw)
        if parsed:
            break
        log.warning("derivative_relevance empty parse (attempt %d) for %r; raw[:160]=%r",
                    attempt_no, derivative[:50], raw[:160])
    # The listing presents ids as `- [{claim_id}] ...`; the model INCONSISTENTLY echoes the id back with
    # the surrounding brackets ("[edgar:...]") - which then misses the bracket-less lookup key and the
    # whole node reads as 0/(missing). Normalize the brackets+whitespace so matching is robust either way.
    return {
        str(rec["claim_id"]).strip().strip("[]").strip(): {
            "corroborates": bool(rec.get("corroborates")),
            "direction": rec.get("direction", "tangential"),
        }
        for rec in parsed
    }
|
||||
@@ -0,0 +1,27 @@
|
||||
"""Resolver — the SEPARATE forward pass that closes the loop (§6.2, §6.3).
|
||||
|
||||
ARCHITECTURALLY ISOLATED from the scorers: it has no shared write path with them. Scorers write
|
||||
candidate_scores + ledger rows with outcome columns NULL and a FROZEN discourse_metric. The resolver
|
||||
runs later (larger as_of), reads ledger rows whose date_logged < as_of_now, and writes ONLY
|
||||
resolution_date / discourse_outcome / external_outcome / lead_time_days. It is FORBIDDEN from touching
|
||||
discourse_metric — that is the structural reason the ledger can't reward noticing what already happened.
|
||||
|
||||
Implementation note: real resolutions need forward time (the clock can't be backfilled). For the
|
||||
backtest, the discourse leg can be resolved by re-running the discourse metric forward from date_logged;
|
||||
the external leg (price/filings/human check, §6.5) is filled as that evidence arrives. Stubbed now to
|
||||
lock the architecture; filled out for the forward pilot.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
def resolve_discourse_leg(conn, sc, cfg, *, as_of_now: str) -> int:
    """For each ledger row logged before as_of_now without a resolution, re-measure discourse forward
    and set discourse_outcome + lead_time. (Forward-only; never reads/edits discourse_metric.)

    Returns count resolved. STUB - implemented for the forward pilot: today it only selects
    the unresolved rows and returns 0 without writing anything.
    """
    unresolved_sql = "SELECT signal_id, date_logged FROM ledger WHERE resolution_date IS NULL AND date_logged < ?"
    pending = conn.execute(unresolved_sql, (as_of_now,)).fetchall()  # noqa: F841 - consumed by the TODO below
    # TODO(forward-pilot): re-run windowed independence from date_logged->as_of_now for each row's
    # origin derivative; set discourse_outcome in {up_cross_cluster,up_single_cluster,flat,down}.
    return 0
|
||||
@@ -0,0 +1,81 @@
|
||||
"""Scoring orchestrator. For Job B / the §7.1 backtest: march as_of dates, score every conviction +
|
||||
fan-out derivative, gate, log the denominator, promote nodes.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from ..extract.backends import from_config as backend_from_config
|
||||
from . import bar, under_acted
|
||||
from .asof import Scorer
|
||||
from .ledger_writer import log_candidate, record_candidate_score
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _nodes_for(conn, as_of, mode, conviction_ids):
|
||||
nodes = []
|
||||
where, params = "", []
|
||||
if conviction_ids:
|
||||
ph = ",".join("?" * len(conviction_ids))
|
||||
where = f" WHERE conviction_id IN ({ph})"
|
||||
params = list(conviction_ids)
|
||||
for c in conn.execute(
|
||||
f"SELECT conviction_id, thematic_proposition, conviction_level, current_exposure, is_thesis_breaker "
|
||||
f"FROM conviction_log{where}", params,
|
||||
):
|
||||
nodes.append({"conviction_id": c[0], "node_id": None, "derivative": c[1],
|
||||
"level": c[2], "exposure": c[3], "breaker": bool(c[4])})
|
||||
fq = ("SELECT f.node_id, f.parent_conviction_id, f.derivative_proposition, c.conviction_level, "
|
||||
"c.current_exposure, c.is_thesis_breaker FROM fanout_nodes f "
|
||||
"JOIN conviction_log c ON c.conviction_id = f.parent_conviction_id")
|
||||
conds, fparams = [], []
|
||||
if conviction_ids:
|
||||
conds.append(f"f.parent_conviction_id IN ({','.join('?' * len(conviction_ids))})")
|
||||
fparams += list(conviction_ids)
|
||||
if mode == "forward": # backtest uses the seeded tree as the as-of-2023 hypothesis (no created_at leak)
|
||||
conds.append("f.created_at <= ?")
|
||||
fparams.append(as_of)
|
||||
if conds:
|
||||
fq += " WHERE " + " AND ".join(conds)
|
||||
for f in conn.execute(fq, fparams):
|
||||
nodes.append({"conviction_id": f[1], "node_id": f[0], "derivative": f[2],
|
||||
"level": f[3], "exposure": f[4], "breaker": bool(f[5])})
|
||||
return nodes
|
||||
|
||||
|
||||
def run_under_acted(conn, sc, cfg, *, as_of, mode="backtest", conviction_ids=None, window_days=28) -> list[dict]:
    """Score every selected conviction / fan-out node at `as_of`, gate it, and persist the outcome.

    For each node: score with ``under_acted.score_node``, evaluate the bars, always record
    the candidate score; if the evidence bar is cleared, log the candidate to the ledger and,
    for fan-out nodes, set ``fanout_nodes.status`` to ``"signal"`` (promotion bar cleared) or
    ``"corroborated"``. Runs inside a ``Scorer(conn, as_of, mode=mode)`` context.

    Returns one ``{"node", "result", "evidence", "promotion"}`` dict per node.
    """
    backend = backend_from_config(cfg, sc)
    results = []
    with Scorer(conn, as_of, mode=mode):
        for node in _nodes_for(conn, as_of, mode, conviction_ids):
            node_id = node["node_id"]
            conviction_id = node["conviction_id"]
            scored = under_acted.score_node(
                conn, sc, backend, as_of=as_of, derivative=node["derivative"],
                conviction_id=conviction_id, node_id=node_id,
                conviction_level=node["level"], exposure=node["exposure"], is_breaker=node["breaker"],
                window_days=window_days,
            )
            cleared_evidence, cleared_promotion = bar.evaluate("under_acted", scored, conn=conn)
            # Every scored node is recorded, cleared or not (the denominator).
            record_candidate_score(conn, scored, as_of, cleared_evidence, cleared_promotion)
            if cleared_evidence:
                log_candidate(conn, scorer="under_acted", as_of=as_of,
                              ledger_type="under_acted_conviction", proposition=node["derivative"],
                              discourse_metric=scored["inputs"], origin_conviction_id=conviction_id,
                              origin_node_id=node_id)
                if node_id:
                    status = "signal" if cleared_promotion else "corroborated"
                    conn.execute("UPDATE fanout_nodes SET status=? WHERE node_id=?", (status, node_id))
                    conn.commit()
            results.append({"node": node, "result": scored,
                            "evidence": cleared_evidence, "promotion": cleared_promotion})
    return results
|
||||
|
||||
|
||||
def run_backtest(conn, sc, cfg, *, conviction_id, dates, window_days=90) -> list[tuple]:
    """Replay ``run_under_acted`` in backtest mode for one conviction across `dates`.

    Logs, per date, how many nodes cleared the evidence bar. Returns ``[(as_of, results)]``
    in the order of `dates`.
    """
    timeline = []
    for as_of in dates:
        results = run_under_acted(conn, sc, cfg, as_of=as_of, mode="backtest",
                                  conviction_ids=[conviction_id], window_days=window_days)
        timeline.append((as_of, results))
        n_fired = sum(1 for entry in results if entry["evidence"])
        log.info("as_of %s: %d/%d nodes cleared evidence bar", as_of, n_fired, len(results))
    return timeline
|
||||
@@ -0,0 +1,105 @@
|
||||
"""Two-sided net-corroboration (DESIGN_v2.1 H5 + condition 3) — the instrument for the adversarial cases.
|
||||
|
||||
For a derivative, track the INDEPENDENCE-WEIGHTED affirms MINUS denies over time. This is the right
|
||||
output for Strike/Battery (where the question is "did the engine distinguish real adoption from
|
||||
narrative, and catch the contradiction?"), not runway:
|
||||
- STRIKE (reflexivity): a PASS = net stays low/quiet in LIVE mode (own_network dropped) while it
|
||||
would have fired in TEST mode (own_network kept) → the engine refuses the intra-cluster echo.
|
||||
- BATTERY (timing): the DEMAND derivative's net rises while the SUPPLY derivative's net stays flat →
|
||||
"half-confirmed, the load-bearing half isn't moving" = the eroding-conviction signal.
|
||||
Reuses the §4.6 relevance helper, which already returns direction affirms|contradicts|tangential.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from .independence import eisc_for
|
||||
from .llm_helpers import derivative_relevance
|
||||
from .windows import window_bounds
|
||||
|
||||
|
||||
def classify_corpus(sc, backend, derivative: str, as_of: str, *, top_k: int = 60) -> list[dict]:
    """Retrieve (as-of filtered) + LLM-classify each claim's direction toward the derivative.

    Hits without a claim_id or date, or dated after `as_of`, are dropped before the LLM sees
    them; malformed (non-dict) hits are skipped, matching the guard in the under-acted scorer.
    Returns affirms/contradicts claims as ``{claim_id, proposition, date, source_id,
    direction}`` (tangential dropped); ``[]`` when nothing survives the as-of filter.
    """
    res = sc.search(derivative, collection="propositions", top_k=top_k, rerank=True)
    hits = res.get("data", []) if isinstance(res, dict) else []
    cand = []
    for h in hits:
        # A non-dict hit used to raise AttributeError here and abort the whole pass.
        pl = (h.get("payload") or {}) if isinstance(h, dict) else {}
        d = pl.get("date")
        if not pl.get("claim_id") or not d or d[:10] > as_of:
            continue
        cand.append({"claim_id": pl["claim_id"], "proposition": pl.get("proposition", ""),
                     "date": d[:10], "source_id": pl.get("source_id")})
    if not cand:
        return []
    rel = derivative_relevance(backend, derivative,
                               [{"claim_id": c["claim_id"], "proposition": c["proposition"]} for c in cand])
    out = []
    for c in cand:
        # Claims the model did not return a verdict for are treated as tangential.
        direction = rel.get(c["claim_id"], {}).get("direction", "tangential")
        if direction in ("affirms", "contradicts"):
            out.append({**c, "direction": direction})
    return out
|
||||
|
||||
|
||||
# DESIGN_v2 ADOPT #1 (claim-type weighting): a node "resolves" on REALIZED, descriptive disclosure —
|
||||
# not on forecasts/intent. A source counts toward the net only if it carries a HARD (realized-fact)
|
||||
# claim on this side; predictive/interpretive claims (forecasts, opinion, 'may consider', 'expects')
|
||||
# are the exact material that fooled the supply axis on Battery, so they don't qualify a source alone.
|
||||
_HARD_CLAIM_TYPES = ("descriptive", "reactive")
|
||||
|
||||
|
||||
def _hard_sources(conn, claim_ids: list[str]) -> set:
|
||||
"""Sources that contributed at least one realized-fact (descriptive/reactive) claim among claim_ids."""
|
||||
if not claim_ids:
|
||||
return set()
|
||||
ph = ",".join("?" * len(claim_ids))
|
||||
qph = ",".join("?" * len(_HARD_CLAIM_TYPES))
|
||||
rows = conn.execute(
|
||||
f"SELECT DISTINCT source_id FROM claims WHERE claim_id IN ({ph}) AND claim_type IN ({qph})",
|
||||
list(claim_ids) + list(_HARD_CLAIM_TYPES),
|
||||
).fetchall()
|
||||
return {r[0] for r in rows}
|
||||
|
||||
|
||||
def net_at(conn, classified: list[dict], as_of: str, *, window_days: int = 90, mode: str = "live",
           require_hard_evidence: bool = True) -> dict:
    """Net independence-weighted corroboration in the trailing window ending at as_of.

    Only claims with ``start < date <= end`` of the single window are considered. With
    require_hard_evidence (default), a source only counts on a side if it carries a
    realized-fact claim there - forecasts/intent alone don't qualify it (the
    announced-vs-deployed / opinion-vs-fact guard). Each side's sources are reduced to an
    adjusted EISC; ``net`` is affirms minus denies.

    Returns rounded EISC figures plus raw claim counts, the number of qualifying affirming
    sources, how many affirming sources were dropped as soft-only, and how many qualifying
    affirming sources are flagged own_network.
    """
    _, start, end = window_bounds(as_of, n=1, days=window_days)[0]
    affirming, denying = [], []
    for claim in classified:
        if not (start < claim["date"] <= end):
            continue
        if claim["direction"] == "affirms":
            affirming.append(claim)
        elif claim["direction"] == "contradicts":
            denying.append(claim)

    aff_src_all = {c["source_id"] for c in affirming}
    den_src_all = {c["source_id"] for c in denying}
    if require_hard_evidence:
        hard_aff = _hard_sources(conn, [c["claim_id"] for c in affirming])
        hard_den = _hard_sources(conn, [c["claim_id"] for c in denying])
        aff_src = list(aff_src_all & hard_aff)
        den_src = list(den_src_all & hard_den)
    else:
        aff_src = list(aff_src_all)
        den_src = list(den_src_all)

    aff_e = eisc_for(conn, aff_src, mode=mode)["eisc_adj"] if aff_src else 0.0
    den_e = eisc_for(conn, den_src, mode=mode)["eisc_adj"] if den_src else 0.0

    # Diagnostic only: how many qualifying affirming sources sit in our own network.
    own = 0
    if aff_src:
        ph = ",".join("?" * len(aff_src))
        own = conn.execute(
            f"SELECT COUNT(*) FROM sources WHERE source_id IN ({ph}) AND COALESCE(own_network,0)=1", aff_src
        ).fetchone()[0]

    return {
        "as_of": as_of,
        "affirms_eisc": round(aff_e, 2),
        "denies_eisc": round(den_e, 2),
        "net": round(aff_e - den_e, 2),
        "n_affirm": len(affirming),
        "n_deny": len(denying),
        "hard_affirm_src": len(aff_src),
        "soft_affirm_src_dropped": len(aff_src_all) - len(aff_src),
        "own_network_affirm_src": own,
    }
|
||||
|
||||
|
||||
def trajectory(conn, sc, backend, derivative: str, as_of_dates: list[str], *,
               window_days: int = 90, mode: str = "live", top_k: int = 60) -> list[dict]:
    """The net-corroboration curve over as_of_dates. Run twice (mode='live' vs 'test') to see what the
    own_network quarantine removes - the reflexivity measurement.

    For each date the corpus is re-retrieved and re-classified as of that date, then reduced
    with ``net_at``. Returns one ``net_at`` dict per date, in input order.
    """
    curve = []
    for as_of in as_of_dates:
        classified = classify_corpus(sc, backend, derivative, as_of, top_k=top_k)
        point = net_at(conn, classified, as_of, window_days=window_days, mode=mode)
        curve.append(point)
    return curve
|
||||
@@ -0,0 +1,75 @@
|
||||
"""Under-acted-conviction scorer — Job B, the §7.1 backtest target.
|
||||
|
||||
score = conviction_weight x exposure_gap x rising_independent_corroboration
|
||||
|
||||
Fires when Ten31 believes something (high conviction), has little/no position (exposure gap), and the
|
||||
world is beginning to corroborate it or a derivative of it — independently and with acceleration. This
|
||||
is the signal that should have flagged "size up power-infra picks-and-shovels" in 2023.
|
||||
|
||||
Exposure is joined LOCALLY (never crosses the frontier boundary, §4.6). Corroboration is RETRIEVED
|
||||
(stats nominate), then an LLM helper only FILTERS retrieval near-misses (§5.1) — it cannot add claims.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from .llm_helpers import derivative_relevance
|
||||
from .windows import windowed_independence
|
||||
|
||||
CONVICTION_WEIGHT = {"low": 0.15, "med": 0.4, "med-high": 0.7, "high": 1.0}
|
||||
EXPOSURE_GAP = {"none": 1.0, "lt2": 0.8, "2to10": 0.4, "gt10": 0.1, "unset": 0.6}
|
||||
|
||||
|
||||
def score_node(conn, sc, backend, *, as_of: str, derivative: str, conviction_id: str,
               node_id: str | None, conviction_level: str, exposure: str,
               is_breaker: bool = False, top_k: int = 40, window_days: int = 28) -> dict:
    """Score one conviction / fan-out node for under-acted conviction at `as_of`.

    Pipeline: retrieve propositions for `derivative`, drop hits dated after `as_of`, keep
    only claims the LLM helper marks as corroborating AND affirming, then measure
    independence-weighted acceleration over those claims. ``corroboration`` is
    ``max(0, acceleration) * eisc0``; the score is ``conviction_weight * exposure_gap *
    corroboration``, or the bare corroboration for thesis-breakers. Unknown conviction
    levels / exposures fall back to 0.4 / 0.6.

    Returns the ``_result`` dict; a failed search or empty retrieval yields score 0.0 with
    a ``reason`` in its inputs.
    """
    cw = CONVICTION_WEIGHT.get(conviction_level, 0.4)
    eg = EXPOSURE_GAP.get(exposure, 0.6)

    def zero(why: dict) -> dict:
        return _result(conviction_id, node_id, 0.0, why, cw, eg, exposure, is_breaker)

    # 1. RETRIEVE (stats nominate): hybrid search over embedded propositions; as-of post-filter.
    try:
        res = sc.search(derivative, collection="propositions", top_k=top_k, rerank=True)
    except Exception as e:  # noqa: BLE001
        return zero({"reason": f"search_failed:{str(e)[:60]}"})
    hits = res.get("data", []) if isinstance(res, dict) else []
    cand = []
    for hit in hits:
        payload = (hit.get("payload") or {}) if isinstance(hit, dict) else {}
        when = payload.get("date")
        # Qdrant can't date-filter; do it here
        if not payload.get("claim_id") or not when or when[:10] > as_of:
            continue
        cand.append({"claim_id": payload["claim_id"], "proposition": payload.get("proposition", ""),
                     "date": when, "source_id": payload.get("source_id")})
    if not cand:
        return zero({"reason": "no_retrieval", "n_retrieved": 0})

    # 2. FILTER near-misses with the LLM (affirms-only). Not a nominator - can't add claims.
    rel = derivative_relevance(backend, derivative,
                               [{"claim_id": c["claim_id"], "proposition": c["proposition"]} for c in cand])
    confirmed = []
    for c in cand:
        verdict = rel.get(c["claim_id"])
        if verdict and verdict.get("corroborates") and verdict.get("direction") == "affirms":
            confirmed.append(c)
    n_src = len({c["source_id"] for c in confirmed})

    # 3. CORROBORATION = independence-weighted acceleration over the confirmed set (treat as a topic).
    #    window_days matches corpus cadence: ~90d for quarterly filings/earnings, ~28d for weekly podcasts.
    dated_sources = [(c["date"], c["source_id"]) for c in confirmed]
    wi = windowed_independence(conn, dated_sources, as_of, days=window_days)
    a_corrob = wi["acceleration"]
    eisc_corrob = wi["eisc0"]
    corroboration = max(0.0, a_corrob) * eisc_corrob

    # Thesis-breakers bypass the conviction/exposure weighting.
    score = corroboration if is_breaker else cw * eg * corroboration
    inputs = {
        "as_of": as_of,
        "derivative": derivative,
        "n_retrieved": len(cand),
        "n_confirmed": len(confirmed),
        "n_src": n_src,
        "a_corrob": a_corrob,
        "eisc_corrob": eisc_corrob,
        "k_eff0": wi["k_eff0"],
        "window_counts": wi["counts"],
        "window_eisc": wi["eisc"],
        "corroboration": round(corroboration, 3),
        "confirmed_claim_ids": [c["claim_id"] for c in confirmed][:50],
    }
    return _result(conviction_id, node_id, score, inputs, cw, eg, exposure, is_breaker)
|
||||
|
||||
|
||||
def _result(conviction_id, node_id, score, inputs, cw, eg, exposure, is_breaker) -> dict:
|
||||
inputs = {**inputs, "conviction_weight": cw, "exposure_gap": eg, "exposure": exposure,
|
||||
"is_breaker": is_breaker}
|
||||
return {"scorer": "under_acted", "conviction_id": conviction_id, "node_id": node_id,
|
||||
"score": round(float(score), 4), "inputs": inputs}
|
||||
@@ -0,0 +1,53 @@
|
||||
"""Temporal windows + windowed independence (the single temporal layer, §4.4).
|
||||
|
||||
28-day non-overlapping windows anchored at as_of (W0 ends at as_of, then back). Non-overlapping
|
||||
avoids autocorrelation faking significance. The signal is the discrete 2nd derivative of the
|
||||
INDEPENDENCE-WEIGHTED flow (EISC per window), never the raw count — so a topic that "accelerates"
|
||||
only because one show booked the same guest three times has flat N(W).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from .independence import eisc_for
|
||||
|
||||
WINDOW_DAYS = 28
|
||||
N_WINDOWS = 3
|
||||
|
||||
|
||||
def _d(s: str) -> datetime:
|
||||
return datetime.strptime(s[:10], "%Y-%m-%d")
|
||||
|
||||
|
||||
def window_bounds(as_of: str, *, n: int = N_WINDOWS, days: int = WINDOW_DAYS) -> list[tuple[int, str, str]]:
    """Return [(idx, start_iso, end_iso)] for `n` back-to-back windows of `days` days.

    Window 0 ends at `as_of`; each later index sits one window further into the past, so the
    end of window k+1 equals the start of window k. Nothing extends past `as_of`.
    """
    anchor = _d(as_of)
    iso = "%Y-%m-%d"
    return [
        (
            idx,
            (anchor - timedelta(days=(idx + 1) * days)).strftime(iso),
            (anchor - timedelta(days=idx * days)).strftime(iso),
        )
        for idx in range(n)
    ]
|
||||
|
||||
|
||||
def windowed_independence(conn, rows: list[tuple], as_of: str, *, n: int = N_WINDOWS,
                          days: int = WINDOW_DAYS) -> dict:
    """Per-window raw count + EISC_adj for a set of dated observations.

    Args:
        conn: handed through unchanged to `eisc_for`.
        rows: [(date_iso, source_id)]. Rows with a falsy date are ignored; only the first 10
            characters of the date are compared.
        as_of: ISO date at which the newest window (index 0) ends.
        n: number of windows, newest first.
        days: window length in days.

    Returns:
        {counts:[c0..], eisc:[N0..], k_eff:[...], acceleration, eisc0, k_eff0, sources0, n_total}.
        acceleration = N0 - 2*N1 + N2 (independence-weighted 2nd derivative); 0.0 when n < 3.
        With no windows at all (n < 1) the window-0 fields fall back to 0.0 / 0 / [].
    """
    bounds = window_bounds(as_of, n=n, days=days)
    counts, eiscs, keffs, src_sets = [], [], [], []
    for _idx, start, end in bounds:
        # Half-open (start, end]: a date on a shared boundary lands in exactly one window.
        win = [r for r in rows if r[0] and start < r[0][:10] <= end]
        # De-duplicate in first-seen order. Iterating a set of strings can change order from run
        # to run (hash randomization), which made `sources0` non-reproducible.
        srcs = list(dict.fromkeys(r[1] for r in win))
        e = eisc_for(conn, srcs) if srcs else {"eisc_adj": 0.0, "k_eff": 0}
        counts.append(len(win))
        eiscs.append(e["eisc_adj"])
        keffs.append(e["k_eff"])
        src_sets.append(srcs)
    accel = eiscs[0] - 2 * eiscs[1] + eiscs[2] if n >= 3 else 0.0
    return {
        "counts": counts, "eisc": [round(x, 3) for x in eiscs], "k_eff": keffs,
        "acceleration": round(accel, 3),
        "eisc0": round(eiscs[0], 3) if eiscs else 0.0,
        "k_eff0": keffs[0] if keffs else 0,
        "sources0": src_sets[0] if src_sets else [],
        "n_total": sum(counts),
    }
|
||||
@@ -0,0 +1,9 @@
|
||||
"""Spark Control gateway client — the SINGLE chokepoint for all gateway HTTP (§13).
|
||||
|
||||
No other module in the engine knows the gateway URL. Everything local-compute
|
||||
(transcription, diarization, the local LLM, embeddings, rerank, hybrid search, and the
|
||||
scrub/rehydrate sovereignty boundary) goes through here.
|
||||
"""
|
||||
from .client import SparkControl, SparkControlError, from_config
|
||||
|
||||
__all__ = ["SparkControl", "SparkControlError", "from_config"]
|
||||
@@ -0,0 +1,242 @@
|
||||
"""Spark Control HTTP client (handoff §13.2 endpoint table).
|
||||
|
||||
Enforces the two operational invariants from §4.1 / §13.4 (revised per infra guidance 2026-06-09):
|
||||
1. AUDIO concurrency is CAPPED at 2 in-flight (hard ceiling 3), GLOBAL across both parakeet
|
||||
endpoints (/v1/audio/transcriptions + /api/audio/diarize*) — they share ONE serial GPU. A
|
||||
process-wide BoundedSemaphore enforces it. Going wider buys ZERO throughput (requests queue and
|
||||
hold the GPU); 2 just keeps the GPU continuously fed with no idle gap = full throughput.
|
||||
2. Transient unresponsiveness is NORMAL, not failure: when the GPU stays continuously busy the
|
||||
/health and in-flight requests can briefly (1-4s) stop responding. Timeouts / 503s /
|
||||
connection-resets are "busy, retry" — handled by short exponential backoff, never treated as work loss.
|
||||
|
||||
NOTE: request/response *shapes* for the non-OpenAI endpoints (/api/audio/*, /scrub,
|
||||
/rehydrate, /api/search) are provisional and marked TODO(contract) — confirm against the
|
||||
live gateway's /api/endpoints. The OpenAI-compatible routes (/v1/*) follow the standard.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Process-wide AUDIO in-flight cap, GLOBAL across both parakeet endpoints. Single serial GPU shared
|
||||
# with the operator's production app → concurrency only deepens the queue + lengthens transient
|
||||
# busy-blips; sit at 2 (full throughput, ~2-3s busy windows), hard ceiling 3.
|
||||
_AUDIO_MAX = 3
|
||||
_AUDIO_SEM = threading.BoundedSemaphore(2)
|
||||
|
||||
|
||||
def _set_audio_concurrency(n: int) -> None:
    """Replace the global audio semaphore with one sized to `n`, clamped to [1, _AUDIO_MAX].

    Invoked from the client constructor with the configured value. The module global is rebound
    to a brand-new BoundedSemaphore, so this is meant to run before any worker threads start
    acquiring it.
    """
    global _AUDIO_SEM
    permits = max(1, int(n))
    if permits > _AUDIO_MAX:
        permits = _AUDIO_MAX
    _AUDIO_SEM = threading.BoundedSemaphore(permits)
|
||||
|
||||
|
||||
class SparkControlError(RuntimeError):
    """Raised when a Spark Control request fails (a 503 response, or retries exhausted)."""
|
||||
|
||||
|
||||
class SparkControl:
    """HTTP client for the Spark Control gateway, built on one `requests.Session` per instance.

    Every POST goes through `_post`, which treats 503s and transport errors as "busy, retry"
    with exponential backoff. The three audio methods additionally hold the process-wide
    `_AUDIO_SEM` for the whole request so in-flight audio stays within the configured cap.
    """

    def __init__(
        self,
        base_url: str,
        *,
        verify_tls: bool = False,
        timeout: float = 120.0,
        llm_model: str = "",
        embed_model: str = "",
        transcribe_model: str = "",
        audio_concurrency: int = 2,
    ) -> None:
        """Store connection settings, size the global audio semaphore, and open a session.

        Args:
            base_url: gateway root; a trailing slash is stripped.
            verify_tls: passed as `verify=` on every request.
            timeout: per-request timeout in seconds.
            llm_model / embed_model / transcribe_model: model names sent in request bodies.
            audio_concurrency: requested audio in-flight cap (clamped by `_set_audio_concurrency`).
        """
        self.base = base_url.rstrip("/")
        self.verify = verify_tls
        self.timeout = timeout
        self.llm_model = llm_model
        self.embed_model = embed_model
        self.transcribe_model = transcribe_model
        _set_audio_concurrency(audio_concurrency)
        self._session = requests.Session()
        if not verify_tls:
            # same-LAN self-signed cert (§13): suppress the per-request InsecureRequestWarning noise.
            import urllib3
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # ---------- low-level ----------
    @staticmethod
    def _upload_positions(files: Any) -> list[tuple[Any, int]]:
        """Return (file_object, current_offset) for every seekable upload in a `files=` argument.

        Accepts the shapes `requests` accepts: a dict or a list of (field, value) pairs, where
        each value is either a file object or a (filename, fileobj, ...) tuple.
        """
        if not files:
            return []
        values = files.values() if isinstance(files, dict) else [item[1] for item in files]
        found: list[tuple[Any, int]] = []
        for value in values:
            fobj = value[1] if isinstance(value, (tuple, list)) and len(value) > 1 else value
            if hasattr(fobj, "seek") and hasattr(fobj, "tell"):
                try:
                    found.append((fobj, fobj.tell()))
                except (OSError, ValueError):
                    # Non-seekable stream: nothing to rewind, leave it as the caller passed it.
                    continue
        return found

    def _post(
        self,
        path: str,
        *,
        json: Any = None,
        files: Any = None,
        data: Any = None,
        retries: int = 4,
        backoff: float = 5.0,
    ) -> Any:
        """POST to `path` and return the decoded JSON body, retrying on 503 / request errors.

        Sleeps `backoff * 2**attempt` seconds between attempts.

        Raises:
            SparkControlError: once `retries` retries have been used up; the last error is chained.
        """
        url = f"{self.base}{path}"
        # The first attempt reads each uploaded file to EOF. Without rewinding, every retry
        # would send an empty file, so remember where each upload started.
        uploads = self._upload_positions(files)
        for attempt in range(retries + 1):
            for fobj, start in uploads:
                fobj.seek(start)
            try:
                r = self._session.post(
                    url, json=json, files=files, data=data,
                    timeout=self.timeout, verify=self.verify,
                )
                if r.status_code == 503:
                    raise SparkControlError("503 from Spark Control (GPU busy / cold start)")
                r.raise_for_status()
                return r.json()
            except (requests.RequestException, SparkControlError) as e:
                if attempt < retries:
                    sleep = backoff * (2 ** attempt)
                    log.warning("Spark Control POST %s failed (%s); retry %d/%d in %.0fs",
                                path, e, attempt + 1, retries, sleep)
                    time.sleep(sleep)
                else:
                    raise SparkControlError(f"POST {path} failed after {retries} retries: {e}") from e

    def _get(self, path: str) -> Any:
        """GET `path` and return the decoded JSON body. No retry; HTTP errors raise from `requests`."""
        r = self._session.get(f"{self.base}{path}", timeout=self.timeout, verify=self.verify)
        r.raise_for_status()
        return r.json()

    # ---------- health / discovery (§13.2) ----------
    def status(self) -> Any:
        """Return the JSON from GET /api/status."""
        return self._get("/api/status")

    def endpoints(self) -> Any:
        """Return the JSON from GET /api/endpoints."""
        return self._get("/api/endpoints")

    # ---------- local LLM: extraction + scoring helpers (§4.2) ----------
    def chat(
        self,
        messages: list[dict[str, str]],
        *,
        json_object: bool = True,
        temperature: float = 0.0,
        enable_thinking: bool = False,
        max_tokens: int | None = None,
    ) -> Any:
        """Deterministic, no-chain-of-thought extraction per §4.2 (temp 0, thinking off,
        JSON mode for guaranteed-valid JSON)."""
        body: dict[str, Any] = {
            "model": self.llm_model,
            "messages": messages,
            "temperature": temperature,
            "chat_template_kwargs": {"enable_thinking": enable_thinking},
        }
        if json_object:
            body["response_format"] = {"type": "json_object"}
        if max_tokens:
            body["max_tokens"] = max_tokens
        return self._post("/v1/chat/completions", json=body)

    # ---------- embeddings / rerank / hybrid search (§4.3) ----------
    def embed(self, inputs: list[str]) -> Any:
        """Embed DISTILLED PROPOSITIONS, not raw chunks (§4.3)."""
        return self._post("/v1/embeddings", json={"model": self.embed_model, "input": inputs})

    def rerank(self, query: str, documents: list[str], *, top_n: int | None = None) -> Any:
        """POST query + documents to /v1/rerank; `top_n` is sent only when truthy."""
        body: dict[str, Any] = {"query": query, "documents": documents}
        if top_n:
            body["top_n"] = top_n
        return self._post("/v1/rerank", json=body)

    def search(
        self,
        query: str,
        *,
        collection: str,
        top_k: int = 10,
        retrieve_n: int | None = None,
        rerank: bool = True,
        filter: dict[str, Any] | None = None,
        with_payload: bool = True,
        min_score: float | None = None,
        dense_vector_name: str = "bge_m3",
        sparse_vector_name: str = "bm25",
        text_field: str = "proposition",
    ) -> Any:
        """Hybrid dense+sparse retrieval (RRF) + optional rerank over a Qdrant collection (§4.3).
        The gateway defaults vector names to 'dense'/'sparse'; our `propositions` collection uses
        named vectors bge_m3/bm25, so they must be passed explicitly (confirmed live).

        `retrieve_n`, `filter` and `min_score` are added to the request only when not None."""
        body: dict[str, Any] = {
            "query": query, "collection": collection, "top_k": top_k,
            "rerank": rerank, "with_payload": with_payload,
            "dense_vector_name": dense_vector_name,
            "sparse_vector_name": sparse_vector_name,
            "text_field": text_field,
        }
        if retrieve_n is not None:
            body["retrieve_n"] = retrieve_n
        if filter is not None:
            body["filter"] = filter
        if min_score is not None:
            body["min_score"] = min_score
        return self._post("/api/search", json=body)

    # ---------- audio: capped at 2 in-flight GLOBAL (semaphore), short busy-retry ----------
    # backoff=1.5 → ~1.5/3/6/12/24s: tuned to ride out the 1-4s busy-blips, not the old 5-40s.
    def transcribe(self, audio_path: str | Path, *, response_format: str = "verbose_json") -> Any:
        """Upload `audio_path` to /v1/audio/transcriptions while holding the audio semaphore."""
        with _AUDIO_SEM, open(audio_path, "rb") as f:
            return self._post(
                "/v1/audio/transcriptions",
                files={"file": f},
                data={"model": self.transcribe_model, "response_format": response_format},
                retries=5, backoff=1.5,
            )

    def diarize_chunk(self, audio_path: str | Path) -> Any:
        """Upload `audio_path` to /api/audio/diarize-chunk while holding the audio semaphore."""
        # TODO(contract): confirm /api/audio/diarize-chunk response shape (segments + 192-d voiceprint).
        with _AUDIO_SEM, open(audio_path, "rb") as f:
            return self._post("/api/audio/diarize-chunk", files={"file": f}, retries=5, backoff=1.5)

    def transcribe_with_speakers(self, audio_path: str | Path) -> Any:
        """Upload `audio_path` to /api/audio/transcribe-with-speakers while holding the audio semaphore."""
        with _AUDIO_SEM, open(audio_path, "rb") as f:
            return self._post("/api/audio/transcribe-with-speakers", files={"file": f}, retries=5, backoff=1.5)

    # ---------- frontier sovereignty boundary (§4.6) ----------
    # Confirmed contract (gateway /openapi.json):
    #   /scrub:     task_id*, items*, known_entities, actor, tier1_action, bucket, ner, map_handle
    #   /rehydrate: task_id*, map_handle*, items*, actor, strict
    # De-identifies IDENTITIES into stable placeholders; the de-anon map stays on the box and is
    # referenced by `map_handle`. Exposure/position data must NEVER be sent here at all (§4.6).
    def scrub(
        self,
        items: list[Any],
        *,
        task_id: str,
        known_entities: dict[str, str] | None = None,
        actor: str | None = None,
        ner: bool = True,
    ) -> Any:
        """Returns the scrubbed items + a `map_handle` to pass to rehydrate. `known_entities` is the
        caller-supplied dictionary (Strike→[FUND_1]); `ner` toggles the local-Qwen NER backstop."""
        body: dict[str, Any] = {"task_id": task_id, "items": items, "ner": ner}
        if known_entities is not None:
            body["known_entities"] = known_entities
        if actor is not None:
            body["actor"] = actor
        return self._post("/scrub", json=body)

    def rehydrate(self, items: list[Any], *, task_id: str, map_handle: str, strict: bool = False) -> Any:
        """Restore real identities in the frontier's output locally, using the scrub `map_handle`."""
        return self._post("/rehydrate", json={
            "task_id": task_id, "map_handle": map_handle, "items": items, "strict": strict,
        })
|
||||
|
||||
|
||||
def from_config(cfg: Any) -> SparkControl:
    """Build a SparkControl client from a config object's attributes.

    `audio_concurrency` is optional on `cfg` and falls back to 2; every other attribute read
    here must exist.
    """
    base_url = cfg.spark_control_url
    options = {
        "verify_tls": cfg.spark_verify_tls,
        "timeout": cfg.spark_timeout_s,
        "llm_model": cfg.local_llm_model,
        "embed_model": cfg.embed_model,
        "transcribe_model": cfg.transcribe_model,
        "audio_concurrency": getattr(cfg, "audio_concurrency", 2),
    }
    return SparkControl(base_url, **options)
|
||||
@@ -0,0 +1,4 @@
|
||||
"""Persistence layer: SQLite (metadata, ledger, conviction log, graph, queue).
|
||||
|
||||
Qdrant (vectors) is reached via the Spark Control gateway; see signal_engine.spark.
|
||||
"""
|
||||
@@ -0,0 +1,81 @@
|
||||
"""SQLite connection + schema initialization. Boring and inspectable (§5)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
SCHEMA_FILE = Path(__file__).with_name("schema.sql")
|
||||
|
||||
|
||||
def connect(db_path: Path) -> sqlite3.Connection:
    """Open (creating parent directories as needed) the SQLite database at `db_path`.

    The connection returns `sqlite3.Row` rows, has foreign-key enforcement switched on, and
    waits up to 30s on a locked database instead of failing immediately.
    """
    target = Path(db_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(target), timeout=30)
    conn.row_factory = sqlite3.Row
    for pragma in (
        "PRAGMA foreign_keys = ON",
        "PRAGMA busy_timeout = 30000",  # wait, don't fail, under concurrent backfill writers
    ):
        conn.execute(pragma)
    return conn
|
||||
|
||||
|
||||
# Additive migrations for DBs created before a column existed (CREATE IF NOT EXISTS won't add columns).
|
||||
_MIGRATIONS = {
|
||||
"documents": {"content_hash": "TEXT", "processed_at": "TEXT", "dedup_key": "TEXT"},
|
||||
# DESIGN_v2.1 condition 1: own_network = the Ten31 orbit (Odell/Bent partners etc.) — listening to
|
||||
# ourselves. Quarantined: a TEST FIXTURE for the reflexivity case, DROPPED in live EISC scoring.
|
||||
"sources": {"backtest_2022_2023": "TEXT", "own_network": "INTEGER"},
|
||||
# DESIGN_v2.1: tag derivatives by distance-from-edge for TRIAGE — surfaced, NEVER used as a filter
|
||||
# (an engine that pre-filters to in-mandate reproduces the AI/compute mandate-expansion miss).
|
||||
"fanout_nodes": {"distance_from_edge": "TEXT"},
|
||||
}
|
||||
|
||||
|
||||
def _widen_cluster_check(conn: sqlite3.Connection) -> None:
|
||||
"""Add 'banks'/'credit'/'fintech' to sources.source_cluster's CHECK. SQLite can't ALTER a CHECK, so
|
||||
rebuild the (tiny) table via the standard table-swap. Idempotent: no-op once already widened. Toggles
|
||||
foreign_keys OFF around the swap (DROP would otherwise fail on inbound FKs); data copied by value so
|
||||
referential integrity holds. busy_timeout (set in connect) lets it wait out concurrent backfill writers."""
|
||||
import re
|
||||
row = conn.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name='sources'").fetchone()
|
||||
if not row or "'banks'" in row[0]:
|
||||
return
|
||||
new_list = ("('macro','ai_tech','energy','bitcoin','vc_consensus','generalist',"
|
||||
"'banks','credit','fintech')")
|
||||
new_ddl = re.sub(r"source_cluster IN\s*\([^)]*\)", f"source_cluster IN {new_list}", row[0], count=1)
|
||||
new_ddl = new_ddl.replace("CREATE TABLE sources", "CREATE TABLE sources_new", 1)
|
||||
conn.commit() # close any implicit txn before toggling FK pragma
|
||||
conn.execute("PRAGMA foreign_keys=OFF")
|
||||
try:
|
||||
conn.execute(new_ddl)
|
||||
conn.execute("INSERT INTO sources_new SELECT * FROM sources")
|
||||
conn.execute("DROP TABLE sources")
|
||||
conn.execute("ALTER TABLE sources_new RENAME TO sources")
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.execute("PRAGMA foreign_keys=ON")
|
||||
|
||||
|
||||
def _migrate(conn: sqlite3.Connection) -> None:
    """Apply the additive migrations: missing columns, their indexes, then the CHECK widening.

    For each table in `_MIGRATIONS`, any listed column not reported by `PRAGMA table_info` is
    added with ALTER TABLE. Safe to re-run: existing columns and indexes are left alone.
    """
    for table, wanted in _MIGRATIONS.items():
        present = {info[1] for info in conn.execute(f"PRAGMA table_info({table})")}
        missing = [(col, typ) for col, typ in wanted.items() if col not in present]
        for col, typ in missing:
            conn.execute(f"ALTER TABLE {table} ADD COLUMN {col} {typ}")
    # indexes on migrated columns (created here so they work on DBs predating the column)
    index_ddl = (
        "CREATE INDEX IF NOT EXISTS idx_documents_content_hash ON documents(content_hash)",
        "CREATE INDEX IF NOT EXISTS idx_documents_dedup_key ON documents(dedup_key)",
    )
    for statement in index_ddl:
        conn.execute(statement)
    conn.commit()
    _widen_cluster_check(conn)
|
||||
|
||||
|
||||
def init_db(conn: sqlite3.Connection) -> None:
    """Idempotent: CREATE ... IF NOT EXISTS + additive column migrations.

    schema.sql is decoded explicitly as UTF-8 (its comments contain non-ASCII characters), so
    loading it does not depend on the platform's locale encoding.
    """
    conn.executescript(SCHEMA_FILE.read_text(encoding="utf-8"))
    conn.commit()
    _migrate(conn)
|
||||
|
||||
|
||||
def table_names(conn: sqlite3.Connection) -> list[str]:
    """Return the names of all tables and views in the database, sorted by name."""
    query = "SELECT name FROM sqlite_master WHERE type IN ('table','view') ORDER BY name"
    return [name for (name,) in conn.execute(query)]
|
||||
@@ -0,0 +1,280 @@
|
||||
-- Ten31 Signal Engine — SQLite schema (pilot)
|
||||
-- Source of truth: ten31-signal-engine-handoff.md §4 (pipeline layers), §6.7 (ledger),
|
||||
-- §3.1 (conviction log), §13.4 (backfill queue).
|
||||
-- Design principle (§5, §10): boring, inspectable tables. The whole system state is a SELECT away.
|
||||
|
||||
PRAGMA journal_mode = WAL;
|
||||
PRAGMA foreign_keys = ON;
|
||||
|
||||
-- ============================================================================
|
||||
-- CANONICAL TOPIC VOCABULARY (§4.2) — HYBRID (operator decision):
|
||||
-- seeded controlled list + emergent topics merged in on a schedule.
|
||||
-- ============================================================================
|
||||
-- One row per canonical topic. New rows default to status 'emergent'.
CREATE TABLE IF NOT EXISTS topics (
  topic_canonical TEXT PRIMARY KEY,
  status TEXT CHECK (status IN ('controlled','emergent','merged')) DEFAULT 'emergent',
  merged_into TEXT REFERENCES topics(topic_canonical),  -- self-reference to another topics row
  seam TEXT,
  created_at TEXT DEFAULT (datetime('now'))
);
|
||||
|
||||
-- ============================================================================
|
||||
-- SOURCES & DOCUMENTS (§4.1)
|
||||
-- ============================================================================
|
||||
-- One row per ingested source (a show, channel, or filer).
CREATE TABLE IF NOT EXISTS sources (
  source_id TEXT PRIMARY KEY,
  name TEXT NOT NULL,
  kind TEXT NOT NULL CHECK (kind IN ('podcast','youtube','filing','earnings_call')),
  source_cluster TEXT CHECK (source_cluster IN
    ('macro','ai_tech','energy','bitcoin','vc_consensus','generalist','banks','credit','fintech')),
  role TEXT CHECK (role IN ('CB','IND','DX','none')) DEFAULT 'none', -- §7.4
  rss_url TEXT,
  channel_url TEXT,
  ticker TEXT,
  -- §8 credibility: neutral prior that DECAYS in favor of earned track record from the ledger.
  bootstrap_prior REAL DEFAULT 1.0,
  earned_credibility REAL,
  cluster_capped_low INTEGER DEFAULT 0, -- §4.5 bitcoin cluster deliberately under-weighted
  backtest_2022_2023 TEXT, -- §7.1 reach: rss_full | rss_2023_only | youtube_only | launched_later | unavailable
  notes TEXT,
  created_at TEXT DEFAULT (datetime('now')),
  -- Also added to older DBs by db._migrate; declared here so the schema file shows the full
  -- column set. Kept LAST so column order matches a migrated database.
  own_network INTEGER
);
|
||||
|
||||
-- One row per ingested item (episode, video, filing, call), tied to its source.
CREATE TABLE IF NOT EXISTS documents (
  doc_id TEXT PRIMARY KEY,
  source_id TEXT NOT NULL REFERENCES sources(source_id),
  kind TEXT NOT NULL, -- podcast|youtube|filing|earnings_call
  external_id TEXT, -- rss guid / yt video id / EDGAR accession / transcript id
  url TEXT,
  title TEXT,
  date TEXT, -- ISO publication/filing date
  duration_sec REAL,
  raw_path TEXT, -- downloaded audio / raw filing
  transcript_path TEXT,
  -- DEDUP MODEL (layered):
  -- (1) UNIQUE(source_id, external_id) below = the ROBUST guard. external_id is the stable item id
  --     (RSS GUID / YouTube video id / EDGAR accession). Checked at ingest, BEFORE any GPU work.
  -- (2) dedup_key = normalized title+date → catches the SAME episode arriving via a different
  --     feed/mirror (different external_id). Computed pre-transcription. NOT from the transcript.
  -- content_hash is ONLY an audit fingerprint of the transcript (did a re-run change?) — it is NOT
  -- a dedup key (ASR is non-deterministic, so one differing word flips the hash).
  dedup_key TEXT,
  content_hash TEXT,
  processed_at TEXT, -- set when transcription/extraction completes
  ingested_at TEXT DEFAULT (datetime('now')),
  UNIQUE (source_id, external_id) -- idempotent ingest (§13.4 dedup)
);
|
||||
-- indexes for dedup_key / content_hash are created in db._migrate (after columns exist on older DBs).
|
||||
|
||||
-- ============================================================================
|
||||
-- CLAIMS / PROPOSITIONS (§4.2) — the atomic unit of the whole system.
|
||||
-- One passage emits 0..N claims; MOST of a podcast hour is 0 (§4.2). The
|
||||
-- extractor must be willing to find nothing.
|
||||
-- NOTE: thesis_seam is a TAG, never a hard filter (§5.7) — off-thesis &
|
||||
-- anti-thesis claims MUST survive.
|
||||
-- ============================================================================
|
||||
-- One row per extracted claim; each claim belongs to exactly one document and one source.
CREATE TABLE IF NOT EXISTS claims (
  claim_id TEXT PRIMARY KEY,
  doc_id TEXT NOT NULL REFERENCES documents(doc_id),
  source_id TEXT NOT NULL REFERENCES sources(source_id),
  proposition TEXT NOT NULL, -- normalized subject-assertion-object
  topic_canonical TEXT REFERENCES topics(topic_canonical),
  topic_raw TEXT,
  claimant TEXT,
  source_cluster TEXT,
  date TEXT,
  claim_type TEXT CHECK (claim_type IN ('interpretive','predictive','descriptive','reactive')),
  time_horizon TEXT CHECK (time_horizon IN ('near','medium','long','unspecified')),
  confidence TEXT CHECK (confidence IN ('low','med','high')),
  -- §4.2 relation: stance is EXTRACTED, never inferred from vector distance (§2.2/§5.3).
  rel_target_claim_id TEXT REFERENCES claims(claim_id),
  rel_polarity TEXT CHECK (rel_polarity IN ('affirms','denies','qualifies','none')) DEFAULT 'none',
  engages_consensus INTEGER DEFAULT 0,
  counters_position TEXT,
  thesis_seam TEXT CHECK (thesis_seam IN
    ('energy_compute','debasement_bitcoin','ai_data_ownership','none')) DEFAULT 'none',
  salience TEXT CHECK (salience IN ('central','secondary','aside')) DEFAULT 'secondary',
  qdrant_point_id TEXT, -- link to the embedded proposition vector (§4.3)
  extracted_at TEXT DEFAULT (datetime('now'))
);
-- Single-column lookup indexes: topic, date, seam, claim type.
CREATE INDEX IF NOT EXISTS idx_claims_topic ON claims(topic_canonical);
CREATE INDEX IF NOT EXISTS idx_claims_date ON claims(date);
CREATE INDEX IF NOT EXISTS idx_claims_seam ON claims(thesis_seam);
CREATE INDEX IF NOT EXISTS idx_claims_type ON claims(claim_type);
|
||||
|
||||
-- ============================================================================
|
||||
-- SOURCE-INDEPENDENCE GRAPH (§4.5) — discount convergence by connectedness.
|
||||
-- Cross-cluster convergence = gold; within-cluster = near-noise.
|
||||
-- ============================================================================
|
||||
-- One row per (source pair, edge type); the composite primary key means a pair has at most
-- one edge of each type.
CREATE TABLE IF NOT EXISTS source_edges (
  src_a TEXT NOT NULL REFERENCES sources(source_id),
  src_b TEXT NOT NULL REFERENCES sources(source_id),
  edge_type TEXT NOT NULL CHECK (edge_type IN ('shared_guest','citation','community')),
  weight REAL DEFAULT 1.0,
  evidence TEXT, -- voiceprint_id / show-note ref / url
  updated_at TEXT DEFAULT (datetime('now')),
  PRIMARY KEY (src_a, src_b, edge_type)
);
|
||||
|
||||
-- ============================================================================
|
||||
-- VOICEPRINT LIBRARY (§4.5, §4.1) — same-guest-across-shows BY VOICE.
|
||||
-- 192-dim TitaNet voiceprints; cosine ~0.7 distance threshold for same speaker.
|
||||
-- This is the highest-leverage automated input to the independence graph.
|
||||
-- ============================================================================
|
||||
-- One row per stored voiceprint, with the document it was first recorded against.
CREATE TABLE IF NOT EXISTS voiceprints (
  voiceprint_id TEXT PRIMARY KEY,
  vector BLOB NOT NULL, -- 192 x float32
  person_label TEXT, -- resolved name if known
  first_doc_id TEXT REFERENCES documents(doc_id),
  first_seen TEXT DEFAULT (datetime('now'))
);
|
||||
-- One row per sighting of a voiceprint in a document, located by chunk index and segment
-- start/end.
CREATE TABLE IF NOT EXISTS voiceprint_observations (
  obs_id INTEGER PRIMARY KEY AUTOINCREMENT,
  voiceprint_id TEXT NOT NULL REFERENCES voiceprints(voiceprint_id),
  doc_id TEXT NOT NULL REFERENCES documents(doc_id),
  chunk_idx INTEGER,
  segment_start REAL,
  segment_end REAL
);
|
||||
|
||||
-- ============================================================================
|
||||
-- CONVICTION LOG (§3.1) — human-owned seed nodes for Job B.
|
||||
-- Structural rule (§3.1): separate the TRACKABLE thematic proposition (corpus
|
||||
-- can corroborate) from TEAM conviction (context only). The engine must NEVER
|
||||
-- present theme corroboration as validation of the team bet beneath it.
|
||||
-- Exposure scored as coarse NAV bands (operator decision): none | lt2 | 2to10 | gt10 | unset.
|
||||
-- ============================================================================
|
||||
-- One row per conviction. thematic_proposition is the only required text column;
-- current_exposure defaults to 'unset'.
CREATE TABLE IF NOT EXISTS conviction_log (
  conviction_id TEXT PRIMARY KEY, -- R1, E1, A1, B1 ...
  seam TEXT, -- root|energy_compute|debasement_bitcoin|ai_data_ownership
  thematic_proposition TEXT NOT NULL, -- the TRACKABLE half
  team_conviction_note TEXT, -- context ONLY, never scored as theme validation
  conviction_level TEXT CHECK (conviction_level IN ('low','med','med-high','high')),
  current_exposure TEXT CHECK (current_exposure IN ('none','lt2','2to10','gt10','unset')) DEFAULT 'unset',
  exposure_note TEXT, -- original §3.1 prose ("pervasive", "MED-HIGH") pending NAV-band finalization
  disconfirming_signal TEXT,
  is_thesis_breaker INTEGER DEFAULT 0, -- §3.1 B1-B3: engine must surface these AGAINST the thesis (§5.7)
  updated_at TEXT DEFAULT (datetime('now'))
);
|
||||
|
||||
-- Conviction fan-out tree (§4.6). A derivative is a HYPOTHESIS until independent
|
||||
-- corpus corroboration AND the exposure gap both clear the bar — then 'signal'.
|
||||
-- One row per derivative node; a node can reference its originating conviction and/or a
-- parent node. New nodes start as 'hypothesis'.
CREATE TABLE IF NOT EXISTS fanout_nodes (
  node_id TEXT PRIMARY KEY,
  parent_conviction_id TEXT REFERENCES conviction_log(conviction_id),
  parent_node_id TEXT REFERENCES fanout_nodes(node_id),
  derivative_proposition TEXT NOT NULL,
  depth INTEGER DEFAULT 1,
  status TEXT CHECK (status IN ('hypothesis','corroborated','signal')) DEFAULT 'hypothesis',
  created_at TEXT DEFAULT (datetime('now')),
  -- Also added to older DBs by db._migrate; declared here so the schema file shows the full
  -- column set. Kept LAST so column order matches a migrated database.
  distance_from_edge TEXT
);
|
||||
|
||||
-- ============================================================================
|
||||
-- DUAL-EVALUATION LEDGER (§4.7, §6) — START DAY ONE; the clock can't be backfilled.
|
||||
-- Log EVERY candidate that clears the quantitative bar (§6.6 — you need a denominator).
|
||||
-- ============================================================================
|
||||
-- One row per logged signal. The outcome columns are nullable, so a row can be written at
-- logging time without them.
CREATE TABLE IF NOT EXISTS ledger (
  signal_id TEXT PRIMARY KEY,
  type TEXT NOT NULL CHECK (type IN ('theme','event','under_acted_conviction')),
  proposition TEXT NOT NULL,
  date_logged TEXT NOT NULL DEFAULT (datetime('now')),
  discourse_metric TEXT, -- JSON: acceleration, cross-cluster source set, independence-discounted count
  external_check TEXT, -- JSON: resolution spec / nested clean events the model proposed (§6.5)
  resolution_date TEXT,
  discourse_outcome TEXT CHECK (discourse_outcome IN
    ('up_cross_cluster','up_single_cluster','flat','down')),
  external_outcome TEXT CHECK (external_outcome IN
    ('correct','partial','wrong','unresolved_expired','too_early')),
  lead_time_days INTEGER, -- §6.3 THE alpha measurement (to the DERIVATIVE node for Job B)
  model_confidence REAL, -- §6.7 logged ONLY to measure its uselessness — NEVER fed into scoring
  origin_conviction_id TEXT REFERENCES conviction_log(conviction_id), -- Job B traceability
  origin_node_id TEXT REFERENCES fanout_nodes(node_id)
);
-- Lookup indexes by signal type and by logging date.
CREATE INDEX IF NOT EXISTS idx_ledger_type ON ledger(type);
CREATE INDEX IF NOT EXISTS idx_ledger_logged ON ledger(date_logged);
|
||||
|
||||
-- Human eval on a SEPARATE write path (§6.7): "keep them in separate columns and do not let the
|
||||
-- model see Grant's rating before it logs its prediction." The model-facing code reads `ledger`;
|
||||
-- ONLY the eval UI writes here. A separate table makes that separation structural, not a convention.
|
||||
-- At most one human rating per ledger signal (signal_id is both primary key and foreign key).
CREATE TABLE IF NOT EXISTS human_evaluations (
  signal_id TEXT PRIMARY KEY REFERENCES ledger(signal_id),
  grant_rating INTEGER, -- "non-obvious and relevant to me?" (e.g. 1-5)
  non_obvious INTEGER, -- 0/1
  notes TEXT,
  rated_at TEXT DEFAULT (datetime('now'))
);
|
||||
|
||||
-- Reporting view — the valuable cell is DISAGREEMENT (§6.7). Used for analysis, NOT by the model path.
|
||||
-- Every ledger row plus its human rating columns; the LEFT JOIN keeps unrated signals, with
-- NULLs in the rating columns.
CREATE VIEW IF NOT EXISTS v_ledger_eval AS
SELECT l.*, h.grant_rating, h.non_obvious, h.notes AS grant_notes, h.rated_at
FROM ledger l LEFT JOIN human_evaluations h ON h.signal_id = l.signal_id;
|
||||
|
||||
-- ============================================================================
|
||||
-- BACKFILL QUEUE (§13.4) — client-side, measured in GPU-HOURS.
|
||||
-- Extraction (one LLM pass per chunk over the whole corpus) is the HEAVIER serial load.
|
||||
-- Audio is SEQUENTIAL (parallel → 503). Leases give crash-safe resumability.
|
||||
-- ============================================================================
|
||||
-- One row per queued unit of work. UNIQUE(job_type, input_hash) makes re-enqueueing the same
-- content a conflict rather than a duplicate row.
CREATE TABLE IF NOT EXISTS backfill_jobs (
  job_id INTEGER PRIMARY KEY AUTOINCREMENT,
  job_type TEXT NOT NULL CHECK (job_type IN ('transcribe','diarize','extract','embed')),
  target_id TEXT NOT NULL, -- doc_id or chunk id
  parent_doc_id TEXT,
  state TEXT NOT NULL CHECK (state IN
    ('pending','leased','running','done','failed','skipped')) DEFAULT 'pending',
  priority INTEGER DEFAULT 100, -- lower = sooner (backtest corpus jumps the queue, §7.1)
  attempts INTEGER DEFAULT 0,
  max_attempts INTEGER DEFAULT 5,
  lease_owner TEXT,
  lease_expires_at TEXT,
  input_hash TEXT NOT NULL, -- hash(content + model/prompt version) — idempotency
  output_ref TEXT,
  gpu_seconds REAL, -- measured per job → self-calibrating GPU-hours estimate
  error TEXT,
  created_at TEXT DEFAULT (datetime('now')),
  updated_at TEXT DEFAULT (datetime('now')),
  UNIQUE (job_type, input_hash)
);
-- Composite index on (state, priority, job_id).
CREATE INDEX IF NOT EXISTS idx_jobs_state_priority ON backfill_jobs(state, priority, job_id);
|
||||
|
||||
-- ============================================================================
|
||||
-- SCORING BRAIN state (the "brain", build blueprint). Candidate state lands here +
|
||||
-- ledger + fanout_nodes.status; existing tables unchanged.
|
||||
-- ============================================================================
|
||||
|
||||
-- Temporal layer: one row per (topic, as_of, window). 28d non-overlapping windows.
|
||||
-- Per-window counters for a topic at a given as_of; all counters default to 0.
CREATE TABLE IF NOT EXISTS topic_window_stats (
  topic_canonical TEXT NOT NULL,
  as_of TEXT NOT NULL,
  window_idx INTEGER NOT NULL, -- 0 = window ending at as_of, 1 = prior, 2 = baseline
  window_start TEXT NOT NULL,
  window_end TEXT NOT NULL,
  n_interp_pred INTEGER NOT NULL DEFAULT 0,
  n_descr_react INTEGER NOT NULL DEFAULT 0,
  n_distinct_src INTEGER NOT NULL DEFAULT 0,
  n_distinct_clu INTEGER NOT NULL DEFAULT 0,
  PRIMARY KEY (topic_canonical, as_of, window_idx)
);
|
||||
|
||||
-- Audit trail: one row per (scorer, key, as_of). Deterministic score_id → re-run reproduces.
|
||||
-- One row per computed score. topic_canonical / node_id / conviction_id are all nullable,
-- so a row can be keyed by whichever of them applies.
CREATE TABLE IF NOT EXISTS candidate_scores (
  score_id TEXT PRIMARY KEY,
  scorer TEXT NOT NULL, -- emergence|contrarian|intersection|convergence|under_acted
  as_of TEXT NOT NULL,
  topic_canonical TEXT,
  node_id TEXT,
  conviction_id TEXT,
  score REAL NOT NULL,
  cleared_evidence_bar INTEGER NOT NULL DEFAULT 0, -- tier 1: logged to ledger (the denominator)
  cleared_promotion_bar INTEGER NOT NULL DEFAULT 0, -- tier 2: sent to frontier judge
  inputs_json TEXT NOT NULL, -- every term that produced the score (full audit)
  computed_at TEXT DEFAULT (datetime('now'))
);
-- Composite index on (scorer, as_of, cleared_promotion_bar).
CREATE INDEX IF NOT EXISTS idx_cs_asof ON candidate_scores(scorer, as_of, cleared_promotion_bar);
|
||||
|
||||
-- Tunable bar config so the backtest can sweep thresholds without code edits.
|
||||
-- At most one threshold row per scorer (scorer is the primary key).
CREATE TABLE IF NOT EXISTS score_thresholds (
  scorer TEXT PRIMARY KEY,
  min_score REAL,
  gates_json TEXT,
  version TEXT
);
|
||||
@@ -0,0 +1,74 @@
|
||||
"""Load human-owned seed data (conviction log, §3.1) into SQLite.
|
||||
|
||||
The conviction log is the highest-leverage Job B input (§3.1) and is HUMAN-OWNED:
|
||||
Grant edits the YAML seed files; this loader upserts them. Re-running is idempotent.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
_CONVICTION_COLS = (
|
||||
"conviction_id",
|
||||
"seam",
|
||||
"thematic_proposition",
|
||||
"team_conviction_note",
|
||||
"conviction_level",
|
||||
"current_exposure",
|
||||
"exposure_note",
|
||||
"disconfirming_signal",
|
||||
"is_thesis_breaker",
|
||||
)
|
||||
|
||||
|
||||
def _row(c: dict[str, Any]) -> dict[str, Any]:
|
||||
return {
|
||||
"conviction_id": c["id"],
|
||||
"seam": c.get("seam"),
|
||||
"thematic_proposition": c["thematic_proposition"],
|
||||
"team_conviction_note": c.get("team_conviction_note"),
|
||||
"conviction_level": c.get("conviction_level"),
|
||||
"current_exposure": c.get("current_exposure", "unset"),
|
||||
"exposure_note": c.get("exposure_note"),
|
||||
"disconfirming_signal": c.get("disconfirming_signal"),
|
||||
"is_thesis_breaker": 1 if c.get("is_thesis_breaker") else 0,
|
||||
}
|
||||
|
||||
|
||||
def load_fanout(conn: sqlite3.Connection, path: Path) -> int:
    """Load a hand-written fan-out tree (§7.1 backtest). Idempotent on node_id.

    Reads ``parent_conviction_id`` and ``nodes`` from the YAML file at *path* and upserts one
    ``fanout_nodes`` row per node. New rows start with status 'hypothesis'. On a re-run the
    proposition, parent, depth and distance_from_edge are refreshed from the file, while the
    stored status is left untouched.

    The file is read as UTF-8, and all rows are written in one transaction that is rolled back
    if any statement fails. An empty file, or one with no nodes, writes nothing.

    Returns:
        The number of nodes listed in the file.

    Raises:
        KeyError: nodes are present but ``parent_conviction_id`` is missing, or a node lacks
            ``node_id`` / ``derivative_proposition``.
    """
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    nodes = data.get("nodes") or []  # tolerate an explicit `nodes: null`

    params: list[tuple[Any, ...]] = []
    if nodes:
        parent = data["parent_conviction_id"]
        # Build every parameter tuple up front so a malformed node fails before any write.
        params = [
            (
                n["node_id"],
                parent,
                n["derivative_proposition"],
                n.get("depth", 1),
                n.get("distance_from_edge"),
            )
            for n in nodes
        ]

    upsert = """INSERT INTO fanout_nodes
                  (node_id, parent_conviction_id, derivative_proposition, depth, status, distance_from_edge)
                VALUES (?,?,?,?, 'hypothesis', ?)
                ON CONFLICT(node_id) DO UPDATE SET
                  derivative_proposition=excluded.derivative_proposition,
                  parent_conviction_id=excluded.parent_conviction_id,
                  depth=excluded.depth,
                  distance_from_edge=excluded.distance_from_edge"""
    with conn:  # commit on success, roll back on error
        conn.executemany(upsert, params)
    return len(nodes)
|
||||
|
||||
|
||||
def load_convictions(conn: sqlite3.Connection, path: Path) -> int:
    """Upsert the conviction-log seed file at *path* into ``conviction_log``.

    Each entry under the top-level ``convictions`` key is mapped through ``_row`` and inserted.
    On a ``conviction_id`` conflict every other column in ``_CONVICTION_COLS`` is overwritten
    from the file and ``updated_at`` is set to now, so re-running is idempotent.

    The file is read as UTF-8. All entries are validated before the first write and written in
    one transaction, so a malformed entry leaves the table unchanged.

    Returns:
        The number of conviction entries in the file.

    Raises:
        KeyError: an entry lacks ``id`` or ``thematic_proposition``.
    """
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    rows = data.get("convictions") or []  # tolerate an explicit `convictions: null`

    cols = ", ".join(_CONVICTION_COLS)
    placeholders = ", ".join(f":{c}" for c in _CONVICTION_COLS)
    updates = ", ".join(f"{c}=excluded.{c}" for c in _CONVICTION_COLS if c != "conviction_id")
    sql = (
        f"INSERT INTO conviction_log ({cols}, updated_at) "
        f"VALUES ({placeholders}, datetime('now')) "
        f"ON CONFLICT(conviction_id) DO UPDATE SET {updates}, updated_at=datetime('now')"
    )

    params = [_row(c) for c in rows]  # raises before any write if an entry is malformed
    with conn:  # commit on success, roll back on error
        conn.executemany(sql, params)
    return len(rows)
|
||||
@@ -0,0 +1,90 @@
|
||||
"""Load the source registry (companies + podcasts, §7.3/§7.4) into SQLite. Idempotent upsert."""
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
_COLS = ("source_id", "name", "kind", "source_cluster", "role", "rss_url",
|
||||
"channel_url", "ticker", "cluster_capped_low", "own_network", "backtest_2022_2023", "notes")
|
||||
|
||||
|
||||
def _row(s: dict[str, Any]) -> dict[str, Any]:
|
||||
return {
|
||||
"source_id": s["id"],
|
||||
"name": s["name"],
|
||||
"kind": s["kind"],
|
||||
"source_cluster": s.get("cluster"),
|
||||
"role": s.get("role", "none"),
|
||||
"rss_url": s.get("rss_url"),
|
||||
"channel_url": s.get("channel_url"),
|
||||
"ticker": s.get("ticker"),
|
||||
"cluster_capped_low": 1 if s.get("cluster_capped_low") else 0,
|
||||
"own_network": 1 if s.get("own_network") else 0,
|
||||
"backtest_2022_2023": s.get("backtest_2022_2023"),
|
||||
"notes": s.get("notes"),
|
||||
}
|
||||
|
||||
|
||||
def update_feeds(conn: sqlite3.Connection, path: Path) -> int:
    """Apply resolved/verified podcast feed URLs + backtest-reach to existing source rows.

    Adds the ``backtest_2022_2023`` column to ``sources`` when it is not there yet, then, for
    each entry under the top-level ``feeds`` key, overwrites ``rss_url``, ``channel_url`` (from
    ``youtube_channel_url``) and ``backtest_2022_2023`` on the row whose ``source_id`` equals the
    entry's ``id``. ``notes`` is replaced only when the entry carries a ``note``. Entries whose
    id matches no row update nothing.

    The column check inspects the table instead of swallowing ``OperationalError``, so real
    failures (locked database, missing table) surface. The file is read as UTF-8 and all
    updates run in one transaction.

    Returns:
        The number of feed entries in the file (not the number of rows changed).

    Raises:
        KeyError: a feed entry lacks ``id``.
    """
    existing_cols = {col[1] for col in conn.execute("PRAGMA table_info(sources)")}
    if "backtest_2022_2023" not in existing_cols:
        conn.execute("ALTER TABLE sources ADD COLUMN backtest_2022_2023 TEXT")
        conn.commit()

    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    rows = data.get("feeds") or []  # tolerate an explicit `feeds: null`

    # Build parameters first so a malformed entry fails before any write.
    params = [
        {
            "id": f["id"],
            "rss_url": f.get("rss_url"),
            "youtube_channel_url": f.get("youtube_channel_url"),
            "backtest_2022_2023": f.get("backtest_2022_2023"),
            "note": f.get("note"),
        }
        for f in rows
    ]
    update = """UPDATE sources
                SET rss_url=:rss_url, channel_url=:youtube_channel_url,
                    backtest_2022_2023=:backtest_2022_2023, notes=COALESCE(:note, notes)
                WHERE source_id=:id"""
    with conn:  # commit on success, roll back on error
        conn.executemany(update, params)
    return len(rows)
|
||||
|
||||
|
||||
def load_source_edges(conn: sqlite3.Connection, path: Path) -> int:
    """Seed EISC connectedness edges (priors) idempotently.

    Stores src_a,src_b in sorted order to match the transcribe_worker's convention
    (sorted([a,b]) + ON CONFLICT weight+=1) so real detections accumulate on the same PK instead
    of creating a reversed duplicate. DO NOTHING on conflict → a re-run won't inflate, and won't
    clobber a stronger auto-detected weight.

    The file is read as UTF-8. All edges are validated before the first write and inserted in
    one transaction that is rolled back if any statement fails.

    Returns:
        The number of edges actually inserted (existing edges are skipped and not counted).

    Raises:
        KeyError: an edge lacks ``a``, ``b`` or ``type``.
        ValueError: an edge's ``weight`` cannot be converted to float.
    """
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    rows = data.get("edges") or []  # tolerate an explicit `edges: null`

    params = []
    for e in rows:
        a, b = sorted([e["a"], e["b"]])
        params.append((a, b, e["type"], float(e.get("weight", 1.0)), e.get("evidence")))

    insert = """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
                VALUES (?,?,?,?,?)
                ON CONFLICT(src_a, src_b, edge_type) DO NOTHING"""
    applied = 0
    with conn:  # commit on success, roll back on error
        for p in params:
            # rowcount is read after every statement so skipped conflicts are not counted
            applied += conn.execute(insert, p).rowcount
    return applied
|
||||
|
||||
|
||||
def load_sources(conn: sqlite3.Connection, path: Path) -> int:
    """Upsert the source registry file at *path* into ``sources``.

    Each entry under the top-level ``sources`` key is mapped through ``_row`` and inserted with
    ``created_at`` set to now. On a ``source_id`` conflict every other column in ``_COLS`` is
    overwritten from the file (``created_at`` is kept), so re-running is idempotent.

    NOTE(review): because the conflict branch overwrites ``rss_url``, ``channel_url``,
    ``backtest_2022_2023`` and ``notes``, values written by another step are replaced by
    whatever this file holds (None when absent) — confirm the intended run order.

    The file is read as UTF-8. All entries are validated before the first write and written in
    one transaction, so a malformed entry leaves the table unchanged.

    Returns:
        The number of source entries in the file.

    Raises:
        KeyError: an entry lacks ``id``, ``name`` or ``kind``.
    """
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    rows = data.get("sources") or []  # tolerate an explicit `sources: null`

    cols = ", ".join(_COLS)
    placeholders = ", ".join(f":{c}" for c in _COLS)
    updates = ", ".join(f"{c}=excluded.{c}" for c in _COLS if c != "source_id")
    sql = (
        f"INSERT INTO sources ({cols}, created_at) VALUES ({placeholders}, datetime('now')) "
        f"ON CONFLICT(source_id) DO UPDATE SET {updates}"
    )

    params = [_row(s) for s in rows]  # raises before any write if an entry is malformed
    with conn:  # commit on success, roll back on error
        conn.executemany(sql, params)
    return len(rows)
|
||||
@@ -0,0 +1,5 @@
|
||||
"""Web UI (FastAPI) — corpus management + (later) the human-eval rating interface (§4.7/§6.7).
|
||||
|
||||
This is the app the StartOS s9pk exposes on its `ui` interface. Server-rendered HTML, no template
|
||||
engine / JS framework — boring and inspectable, like the rest of the system.
|
||||
"""
|
||||
@@ -0,0 +1,179 @@
|
||||
"""Corpus-management web UI (FastAPI).
|
||||
|
||||
Pages:
|
||||
/ dashboard — corpus + pipeline counts at a glance
|
||||
/corpus full source selection (companies + podcasts) + "add source" form
|
||||
/corpus/add POST handler (manual urlencoded parse → no python-multipart dependency)
|
||||
/source/{id} per-source detail: documents + extracted claims (inspect the signal)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import html
|
||||
import re
|
||||
import sqlite3
|
||||
from urllib.parse import parse_qs
|
||||
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||
|
||||
from ..config import load_config
|
||||
from ..store import db
|
||||
|
||||
_CSS = """
|
||||
body{font:14px/1.5 -apple-system,Segoe UI,Roboto,sans-serif;margin:0;background:#0f1115;color:#e6e6e6}
|
||||
header{background:#161a22;padding:12px 20px;border-bottom:1px solid #2a2f3a}
|
||||
header a{color:#7aa2f7;text-decoration:none;margin-right:18px;font-weight:600}
|
||||
main{padding:20px;max-width:1100px;margin:0 auto}
|
||||
h1{font-size:20px}h2{font-size:16px;margin-top:28px;color:#9aa5b1}
|
||||
table{border-collapse:collapse;width:100%;margin:10px 0}
|
||||
th,td{text-align:left;padding:6px 10px;border-bottom:1px solid #232833;font-size:13px}
|
||||
th{color:#9aa5b1;font-weight:600}
|
||||
tr:hover td{background:#161a22}
|
||||
.tag{display:inline-block;padding:1px 7px;border-radius:10px;background:#232833;font-size:11px;color:#aab}
|
||||
.cards{display:flex;gap:14px;flex-wrap:wrap}
|
||||
.card{background:#161a22;border:1px solid #2a2f3a;border-radius:8px;padding:14px 18px;min-width:130px}
|
||||
.card .n{font-size:24px;font-weight:700;color:#7aa2f7}.card .l{color:#9aa5b1;font-size:12px}
|
||||
form{background:#161a22;border:1px solid #2a2f3a;border-radius:8px;padding:16px;margin:14px 0}
|
||||
label{display:block;margin:8px 0 2px;color:#9aa5b1;font-size:12px}
|
||||
input,select{background:#0f1115;border:1px solid #2a2f3a;color:#e6e6e6;border-radius:5px;padding:6px 8px;width:240px}
|
||||
button{background:#7aa2f7;color:#0f1115;border:0;border-radius:6px;padding:8px 16px;font-weight:700;cursor:pointer;margin-top:12px}
|
||||
a{color:#7aa2f7}.muted{color:#6b7280;font-size:12px}
|
||||
"""
|
||||
|
||||
_CLUSTERS = ["macro", "ai_tech", "energy", "bitcoin", "vc_consensus", "generalist"]
|
||||
_KINDS = ["podcast", "youtube", "filing", "earnings_call"]
|
||||
_ROLES = ["none", "CB", "IND", "DX"]
|
||||
|
||||
|
||||
def _page(title: str, body: str) -> HTMLResponse:
    """Wrap *body* in the shared HTML shell (doctype, inline CSS, nav header).

    *title* is HTML-escaped for the ``<title>`` element; *body* is inserted verbatim inside
    ``<main>``, so callers must pass already-safe markup.
    """
    parts = [
        "<!doctype html><html><head><meta charset=utf-8>",
        f"<title>{html.escape(title)}</title>",
        f"<style>{_CSS}</style></head><body>",
        '<header><a href="/">Dashboard</a><a href="/corpus">Corpus</a>',
        '<span class="muted">Ten31 Signal Engine</span></header>',
        f"<main>{body}</main></body></html>",
    ]
    return HTMLResponse("".join(parts))
|
||||
|
||||
|
||||
def _slug(s: str) -> str:
|
||||
return re.sub(r"[^a-z0-9]+", "-", s.lower()).strip("-")[:40] or "src"
|
||||
|
||||
|
||||
def create_app() -> FastAPI:
    """Build the corpus-management FastAPI application.

    Routes:
        ``/``                    dashboard: table counts plus claim/queue breakdowns
        ``/corpus``              source table and the "add source" form
        ``/corpus/add``          POST handler (manual urlencoded parse), then redirect to /corpus
        ``/source/{source_id}``  per-source detail with up to 200 most recent claims

    Each request opens its own SQLite connection and closes it even when a query raises. Every
    value read from the database is HTML-escaped before it goes into a page, because source
    fields can come straight from the unauthenticated add-source form.
    """
    # Function-scope imports: only this factory needs these names.
    from contextlib import closing
    from urllib.parse import quote

    cfg = load_config()
    app = FastAPI(title="Ten31 Signal Engine")

    def conn() -> sqlite3.Connection:
        """Open a connection to the configured database and make sure the schema exists."""
        c = db.connect(cfg.db_path)
        try:
            db.init_db(c)
        except Exception:
            c.close()  # don't leak the handle when schema init fails
            raise
        return c

    def esc(value: object) -> str:
        """HTML-escape any database value; NULL renders as an empty string."""
        return "" if value is None else html.escape(str(value))

    @app.get("/", response_class=HTMLResponse)
    def dashboard() -> HTMLResponse:
        with closing(conn()) as c:
            def scalar(q: str, *a: object) -> object:
                r = c.execute(q, a).fetchone()
                return r[0] if r else 0

            def rows(q: str) -> str:
                # Two-column (label, count) result rendered as table rows.
                return "".join(
                    f"<tr><td>{html.escape(str(a))}</td><td>{html.escape(str(b))}</td></tr>"
                    for a, b in c.execute(q)
                )

            cards = {
                "Sources": scalar("SELECT COUNT(*) FROM sources"),
                "Documents": scalar("SELECT COUNT(*) FROM documents"),
                "Claims": scalar("SELECT COUNT(*) FROM claims"),
                "Embedded": scalar("SELECT COUNT(*) FROM claims WHERE qdrant_point_id IS NOT NULL"),
                "Convictions": scalar("SELECT COUNT(*) FROM conviction_log"),
                "Ledger": scalar("SELECT COUNT(*) FROM ledger"),
            }
            claims_by_type = rows("SELECT claim_type, COUNT(*) FROM claims GROUP BY claim_type ORDER BY 2 DESC")
            claims_by_seam = rows("SELECT thesis_seam, COUNT(*) FROM claims GROUP BY thesis_seam ORDER BY 2 DESC")
            queue = rows("SELECT job_type||' / '||state, COUNT(*) FROM backfill_jobs GROUP BY 1 ORDER BY 1")

        cards_html = "".join(
            f'<div class="card"><div class="n">{esc(v)}</div><div class="l">{esc(k)}</div></div>'
            for k, v in cards.items()
        )
        body = f"""<h1>Dashboard</h1><div class="cards">{cards_html}</div>
<h2>Claims by type</h2><table><tr><th>type</th><th>n</th></tr>{claims_by_type or '<tr><td class=muted colspan=2>none yet</td></tr>'}</table>
<h2>Claims by thesis seam</h2><table><tr><th>seam</th><th>n</th></tr>{claims_by_seam or '<tr><td class=muted colspan=2>none yet</td></tr>'}</table>
<h2>Backfill queue</h2><table><tr><th>type / state</th><th>n</th></tr>{queue or '<tr><td class=muted colspan=2>empty</td></tr>'}</table>"""
        return _page("Dashboard", body)

    @app.get("/corpus", response_class=HTMLResponse)
    def corpus() -> HTMLResponse:
        with closing(conn()) as c:
            srcs = c.execute("""
                SELECT s.*,
                  (SELECT COUNT(*) FROM documents d WHERE d.source_id=s.source_id) docs,
                  (SELECT COUNT(*) FROM claims cl WHERE cl.source_id=s.source_id) claims
                FROM sources s ORDER BY s.kind, s.source_id""").fetchall()

        def row(s) -> str:
            extra = s["ticker"] or s["backtest_2022_2023"] or ""
            # Percent-encode the id for the URL path, then escape it for the attribute.
            href = html.escape("/source/" + quote(str(s["source_id"]), safe=""))
            return (f"<tr><td><a href='{href}'>{esc(s['name'])}</a></td>"
                    f"<td><span class=tag>{esc(s['kind'])}</span></td><td>{esc(s['source_cluster'])}</td>"
                    f"<td>{esc(s['role'])}</td><td>{esc(extra)}</td>"
                    f"<td>{esc(s['docs'])}</td><td>{esc(s['claims'])}</td></tr>")

        def opt(xs) -> str:
            return "".join(f"<option>{esc(x)}</option>" for x in xs)

        table = "".join(row(s) for s in srcs)
        form = f"""<form method=post action="/corpus/add">
<strong>Add to corpus</strong>
<label>Name</label><input name=name required placeholder="NVIDIA / Odd Lots">
<label>Kind</label><select name=kind>{opt(_KINDS)}</select>
<label>Cluster</label><select name=cluster>{opt(_CLUSTERS)}</select>
<label>Role</label><select name=role>{opt(_ROLES)}</select>
<label>Ticker (companies)</label><input name=ticker placeholder="NVDA">
<label>RSS URL (podcasts)</label><input name=rss_url placeholder="https://...">
<label>YouTube channel</label><input name=channel_url placeholder="https://youtube.com/@...">
<button type=submit>Add source</button>
</form>"""
        body = f"""<h1>Corpus ({len(srcs)} sources)</h1>{form}
<table><tr><th>name</th><th>kind</th><th>cluster</th><th>role</th><th>ticker / backtest</th><th>docs</th><th>claims</th></tr>{table}</table>"""
        return _page("Corpus", body)

    @app.post("/corpus/add")
    async def corpus_add(request: Request):
        # errors="replace": a body with invalid UTF-8 must not turn into a 500.
        raw = (await request.body()).decode("utf-8", errors="replace")
        # Keep the first non-blank value per field; blank fields are treated as absent.
        f = {k: v[0].strip() for k, v in parse_qs(raw).items() if v and v[0].strip()}
        name = f.get("name")
        if not name:
            return RedirectResponse("/corpus", status_code=303)
        kind = f.get("kind", "podcast")
        ticker = f.get("ticker")
        if ticker:
            sid = f"co-{ticker.lower()}"
        else:
            prefix = "pod" if kind in ("podcast", "youtube") else kind
            sid = f"{prefix}-{_slug(name)}"
        with closing(conn()) as c:
            # INSERT OR IGNORE: re-submitting an existing source_id is a silent no-op.
            c.execute("""INSERT OR IGNORE INTO sources
                           (source_id, name, kind, source_cluster, role, ticker, rss_url, channel_url)
                         VALUES (?,?,?,?,?,?,?,?)""",
                      (sid, name, kind, f.get("cluster"), f.get("role", "none"),
                       ticker.upper() if ticker else None, f.get("rss_url"), f.get("channel_url")))
            c.commit()
        return RedirectResponse("/corpus", status_code=303)

    @app.get("/source/{source_id}", response_class=HTMLResponse)
    def source_detail(source_id: str) -> HTMLResponse:
        with closing(conn()) as c:
            s = c.execute("SELECT * FROM sources WHERE source_id=?", (source_id,)).fetchone()
            if not s:
                missing = _page("Not found", "<h1>Source not found</h1>")
                missing.status_code = 404
                return missing
            claims = c.execute("""SELECT proposition, claim_type, time_horizon, thesis_seam, topic_canonical,
                                         engages_consensus, date FROM claims WHERE source_id=?
                                  ORDER BY date DESC LIMIT 200""", (source_id,)).fetchall()

        def crow(cl) -> str:
            star = " ⚔" if cl["engages_consensus"] else ""
            return (f"<tr><td>{esc(cl['date'])}</td><td><span class=tag>{esc(cl['claim_type'])}</span></td>"
                    f"<td>{esc(cl['thesis_seam'])}</td><td>{esc(cl['topic_canonical'])}</td>"
                    f"<td>{esc(cl['proposition'])}{star}</td></tr>")

        rows = "".join(crow(cl) for cl in claims) or '<tr><td class=muted colspan=5>no claims extracted yet</td></tr>'
        meta = (f"<span class=tag>{esc(s['kind'])}</span> "
                f"cluster={esc(s['source_cluster'] or '-')} role={esc(s['role'] or '-')}")
        if s["ticker"]:
            meta += f" ticker={esc(s['ticker'])}"
        if s["backtest_2022_2023"]:
            meta += f" · backtest={esc(s['backtest_2022_2023'])}"
        body = f"""<h1>{esc(s['name'])}</h1><p>{meta}</p>
<p class=muted>{esc(s['notes'])}</p>
<h2>Claims ({len(claims)}) <span class=muted>⚔ = engages consensus</span></h2>
<table><tr><th>date</th><th>type</th><th>seam</th><th>topic</th><th>proposition</th></tr>{rows}</table>"""
        return _page(s["name"], body)

    return app
|
||||
@@ -0,0 +1,28 @@
|
||||
"""Small shared utilities (normalization, dedup keys)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
_SHOW_SUFFIX = re.compile(r"\s*[|\-–—]\s*[^|\-–—]*(podcast|show|ep(isode)?\s*\d+).*$", re.I)
|
||||
_EP_PREFIX = re.compile(r"^\s*(ep(isode)?\.?\s*\d+\s*[:\-–]|#\s*\d+\s*[:\-–]|\d+\s*[:\-–])\s*", re.I)
|
||||
_NONALNUM = re.compile(r"[^a-z0-9]+")
|
||||
|
||||
|
||||
def slugify(s: str, *, maxlen: int = 60) -> str:
    """Return a lowercase, hyphen-separated slug of *s*, at most *maxlen* characters long.

    Runs of characters outside ``a-z0-9`` become a single hyphen and leading/trailing hyphens
    are stripped before truncation. A falsy *s*, or one with no usable characters, yields ``"x"``.
    """
    lowered = (s or "").lower()
    hyphenated = _NONALNUM.sub("-", lowered)
    slug = hyphenated.strip("-")[:maxlen]
    if slug:
        return slug
    return "x"
|
||||
|
||||
|
||||
def normalize_title(title: str) -> str:
    """Reduce an episode title to a comparable core so the SAME episode matches across
    feeds/mirrors despite cosmetic differences ('Ep 42: Foo' vs 'Foo | The Show').

    The show-name suffix is removed first, then the episode-number prefix; what remains is
    lowercased and every non-alphanumeric run becomes a single space. Best-effort — a safety
    net, not the primary key.
    """
    without_suffix = _SHOW_SUFFIX.sub("", title or "")
    core = _EP_PREFIX.sub("", without_suffix)
    return _NONALNUM.sub(" ", core.lower()).strip()
|
||||
|
||||
|
||||
def audio_dedup_key(title: str | None, date: str | None) -> str:
    """Build the cross-mirror dedup key for audio: ``<normalized title>|<date>``.

    Computed BEFORE transcription so a duplicate episode (same content via a different
    feed/mirror) is skipped without spending GPU. Deliberately NOT derived from the transcript:
    ASR is non-deterministic, so a transcript hash would be brittle. A missing title or date
    contributes an empty string.
    """
    title_part = normalize_title(title or "")
    date_part = date or ""
    return "|".join((title_part, date_part))
|
||||
Reference in New Issue
Block a user