From a6aec775066df327332eb82f52c7a842dcc2222f Mon Sep 17 00:00:00 2001 From: Keysat Date: Mon, 15 Jun 2026 09:24:29 -0500 Subject: [PATCH] Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds) --- .gitignore | 40 ++ BATTERY_corpus_scope.md | 36 ++ DESIGN_v2.md | 126 ++++ PILOT_BACKTEST_WRITEUP.md | 288 +++++++++ README.md | 135 ++++ requirements.txt | 27 + run_strike_pipeline.sh | 47 ++ scripts/sharpen.py | 34 + seeds/battery_docs.manifest.yaml | 34 + seeds/conviction_log.adversarial.seed.yaml | 23 + seeds/conviction_log.backtest-2023.seed.yaml | 21 + seeds/conviction_log.seed.yaml | 147 +++++ seeds/fanout.BATTERY2022.seed.yaml | 17 + seeds/fanout.K2023.seed.yaml | 22 + seeds/fanout.STRIKE2022.seed.yaml | 15 + seeds/podcast_feeds.resolved.yaml | 33 + seeds/resolution.BATTERY2022.yaml | 26 + seeds/resolution.K2023.yaml | 20 + seeds/resolution.STRIKE2022.yaml | 22 + seeds/resolution_outcomes.adversarial.yaml | 37 ++ seeds/river_docs.manifest.yaml | 13 + seeds/source_edges.bitcoin.seed.yaml | 44 ++ seeds/sources.battery.seed.yaml | 29 + seeds/sources.bitcoin.seed.yaml | 31 + seeds/sources.river.seed.yaml | 8 + seeds/sources.seed.yaml | 74 +++ signal_engine/__init__.py | 11 + signal_engine/__main__.py | 4 + signal_engine/backfill/__init__.py | 1 + signal_engine/backfill/queue.py | 123 ++++ signal_engine/cli.py | 619 +++++++++++++++++++ signal_engine/config.py | 101 +++ signal_engine/embedstore/__init__.py | 6 + signal_engine/embedstore/embedder.py | 36 ++ signal_engine/embedstore/qdrant_store.py | 79 +++ signal_engine/extract/__init__.py | 6 + signal_engine/extract/backends.py | 64 ++ signal_engine/extract/claims.py | 117 ++++ signal_engine/extract/html_text.py | 47 ++ signal_engine/extract/prompt.py | 72 +++ signal_engine/extract/worker.py | 69 +++ signal_engine/ingest/__init__.py | 5 + signal_engine/ingest/chunker.py | 36 ++ signal_engine/ingest/docs.py | 159 +++++ signal_engine/ingest/download.py | 61 ++ signal_engine/ingest/earnings.py | 127 
++++ signal_engine/ingest/edgar.py | 148 +++++ signal_engine/ingest/feeds.py | 65 ++ signal_engine/ingest/gemini_transcribe.py | 195 ++++++ signal_engine/ingest/identify.py | 45 ++ signal_engine/ingest/podcasts.py | 111 ++++ signal_engine/ingest/speaker_stitch.py | 60 ++ signal_engine/ingest/transcribe_worker.py | 308 +++++++++ signal_engine/signals/__init__.py | 6 + signal_engine/signals/asof.py | 43 ++ signal_engine/signals/bar.py | 49 ++ signal_engine/signals/confusion.py | 86 +++ signal_engine/signals/external.py | 96 +++ signal_engine/signals/independence.py | 113 ++++ signal_engine/signals/ledger_writer.py | 49 ++ signal_engine/signals/llm_helpers.py | 80 +++ signal_engine/signals/resolver.py | 27 + signal_engine/signals/run.py | 81 +++ signal_engine/signals/two_sided.py | 105 ++++ signal_engine/signals/under_acted.py | 75 +++ signal_engine/signals/windows.py | 53 ++ signal_engine/spark/__init__.py | 9 + signal_engine/spark/client.py | 242 ++++++++ signal_engine/store/__init__.py | 4 + signal_engine/store/db.py | 81 +++ signal_engine/store/schema.sql | 280 +++++++++ signal_engine/store/seed.py | 74 +++ signal_engine/store/sources.py | 90 +++ signal_engine/ui/__init__.py | 5 + signal_engine/ui/app.py | 179 ++++++ signal_engine/util.py | 28 + ten31-signal-engine-handoff.md | 384 ++++++++++++ 77 files changed, 6263 insertions(+) create mode 100644 .gitignore create mode 100644 BATTERY_corpus_scope.md create mode 100644 DESIGN_v2.md create mode 100644 PILOT_BACKTEST_WRITEUP.md create mode 100644 README.md create mode 100644 requirements.txt create mode 100755 run_strike_pipeline.sh create mode 100644 scripts/sharpen.py create mode 100644 seeds/battery_docs.manifest.yaml create mode 100644 seeds/conviction_log.adversarial.seed.yaml create mode 100644 seeds/conviction_log.backtest-2023.seed.yaml create mode 100644 seeds/conviction_log.seed.yaml create mode 100644 seeds/fanout.BATTERY2022.seed.yaml create mode 100644 seeds/fanout.K2023.seed.yaml create mode 100644 
seeds/fanout.STRIKE2022.seed.yaml create mode 100644 seeds/podcast_feeds.resolved.yaml create mode 100644 seeds/resolution.BATTERY2022.yaml create mode 100644 seeds/resolution.K2023.yaml create mode 100644 seeds/resolution.STRIKE2022.yaml create mode 100644 seeds/resolution_outcomes.adversarial.yaml create mode 100644 seeds/river_docs.manifest.yaml create mode 100644 seeds/source_edges.bitcoin.seed.yaml create mode 100644 seeds/sources.battery.seed.yaml create mode 100644 seeds/sources.bitcoin.seed.yaml create mode 100644 seeds/sources.river.seed.yaml create mode 100644 seeds/sources.seed.yaml create mode 100644 signal_engine/__init__.py create mode 100644 signal_engine/__main__.py create mode 100644 signal_engine/backfill/__init__.py create mode 100644 signal_engine/backfill/queue.py create mode 100644 signal_engine/cli.py create mode 100644 signal_engine/config.py create mode 100644 signal_engine/embedstore/__init__.py create mode 100644 signal_engine/embedstore/embedder.py create mode 100644 signal_engine/embedstore/qdrant_store.py create mode 100644 signal_engine/extract/__init__.py create mode 100644 signal_engine/extract/backends.py create mode 100644 signal_engine/extract/claims.py create mode 100644 signal_engine/extract/html_text.py create mode 100644 signal_engine/extract/prompt.py create mode 100644 signal_engine/extract/worker.py create mode 100644 signal_engine/ingest/__init__.py create mode 100644 signal_engine/ingest/chunker.py create mode 100644 signal_engine/ingest/docs.py create mode 100644 signal_engine/ingest/download.py create mode 100644 signal_engine/ingest/earnings.py create mode 100644 signal_engine/ingest/edgar.py create mode 100644 signal_engine/ingest/feeds.py create mode 100644 signal_engine/ingest/gemini_transcribe.py create mode 100644 signal_engine/ingest/identify.py create mode 100644 signal_engine/ingest/podcasts.py create mode 100644 signal_engine/ingest/speaker_stitch.py create mode 100644 
signal_engine/ingest/transcribe_worker.py create mode 100644 signal_engine/signals/__init__.py create mode 100644 signal_engine/signals/asof.py create mode 100644 signal_engine/signals/bar.py create mode 100644 signal_engine/signals/confusion.py create mode 100644 signal_engine/signals/external.py create mode 100644 signal_engine/signals/independence.py create mode 100644 signal_engine/signals/ledger_writer.py create mode 100644 signal_engine/signals/llm_helpers.py create mode 100644 signal_engine/signals/resolver.py create mode 100644 signal_engine/signals/run.py create mode 100644 signal_engine/signals/two_sided.py create mode 100644 signal_engine/signals/under_acted.py create mode 100644 signal_engine/signals/windows.py create mode 100644 signal_engine/spark/__init__.py create mode 100644 signal_engine/spark/client.py create mode 100644 signal_engine/store/__init__.py create mode 100644 signal_engine/store/db.py create mode 100644 signal_engine/store/schema.sql create mode 100644 signal_engine/store/seed.py create mode 100644 signal_engine/store/sources.py create mode 100644 signal_engine/ui/__init__.py create mode 100644 signal_engine/ui/app.py create mode 100644 signal_engine/util.py create mode 100644 ten31-signal-engine-handoff.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5d77011 --- /dev/null +++ b/.gitignore @@ -0,0 +1,40 @@ +# Python +__pycache__/ +*.py[cod] +.venv/ +venv/ +*.egg-info/ +.pytest_cache/ + +# Data / runtime +data/ +logs/ +*.db +*.db-wal +*.db-shm +audio-cache/ + +# StartOS s9pk build artifacts +*.s9pk +node_modules/ +javascript/ +docker-images/ + +# Secrets & local env +.env +.env.* +!.env.example +*.local + +# Claude Code — deny by default, allow-list shared wiring. +# .claude/ also accumulates worktrees, editor configs, and OS cruft; commit +# only the shared parts so new local scratch (or a stray secret) stays out. 
+.claude/* +!.claude/rules/ +!.claude/agents/ +!.claude/commands/ +!.claude/skills/ +!.claude/settings.json + +# OS cruft +.DS_Store diff --git a/BATTERY_corpus_scope.md b/BATTERY_corpus_scope.md new file mode 100644 index 0000000..0372f14 --- /dev/null +++ b/BATTERY_corpus_scope.md @@ -0,0 +1,36 @@ +# Battery corpus scope + the regulatory-timing finding (DESIGN_v2 §3-step-6, lineage-aware) + +Scoped via lineage-tagged research (workflow `w4pt3n0wq`). This is the first installment of the +lineage-aware corpus expansion — and it produced a sharper resolution of the Battery thesis than the +adoption-evidence labels alone. + +## The finding (CORRECTED by principal ground-truth, 2026-06-08): the supply failure was capital-provider WILLINGNESS, not regulation +> **Retraction.** An earlier version of this note (and the scoping agent) claimed the supply failure was "structurally regulatory" (SAB-121 blocked it). **That was wrong — a plausible-narrative over-fit, the exact failure mode the engine exists to catch — and is retracted.** + +The institutional-supply leg of D1 failed because **capital providers — legally free to fund Battery — were unwilling to provide dollars** for a novel product from a new company in an emerging structure. **Nothing *restricted* them.** +- Regulatory uncertainty plausibly dampened institutional risk appetite for the space *broadly*, but it was **not Battery's gate**. **Proof:** the 2025 regulatory unblock (SAB-122 Jan 2025, FDIC/OCC reversals) did **NOT** produce institutional dollars for Battery — **they remain unraised as of 2026.** If regulation had been the binding constraint, removing it would have moved the outcome. It didn't. +- So the binding constraint is **capital-provider willingness / risk-appetite for the product**, not regulatory permissibility. 
+ +**Why this matters for the engine (the sharpened S1 lesson):** an engine fanning the REGULATORY/enabler axis would have generated a FALSE "supply is unblocking in 2025" signal — a plausible-but-wrong causal model producing a *false positive*. The TRUE resolution is **actual committed/deployed capital to fund the loans at scale**, which stayed NO throughout (the frozen pre-registered criterion was right; only the causal gloss drifted). Enablers (regulation, a single bank's toe-in) are CONTEXT, never the resolver — they can resolve YES while the conviction fails. + +A correct two-sided read across 2022–2024+: **demand-net rising** (named lenders, real borrower appetite) while the **supply-net stays flat** measured as *actual capital committed to fund the loans* — and critically, NOT mistaking the 2025 regulatory enabler for supply arriving. + +## What resolves which axis (set this BEFORE listing the corpus, so policy can't creep back into the resolver) +- **SUPPLY axis (the binding constraint) resolves ONLY on actual committed/deployed dollars** funding BTC-collateralized loans at scale — Battery's own raise status, named lenders' *funding sources* (whose balance sheet, drawn vs. announced), and dollars actually deployed. This stays **NO** through the window and **NO post-2025-unblock** (Battery remains unraised as of 2026). +- **DEMAND axis resolves on borrower appetite + originated volume** — named originators, cumulative origination. This goes **YES**. +- **Policy / regulatory documents are CONTEXT, never the supply resolver.** They explain ambient institutional risk-appetite for the space; they do **not** measure whether capital reached Battery. The 2025 unblock is the proof: it flipped the policy axis YES while supply stayed NO. An engine that wired policy into the supply resolver would have printed a false "supply unblocking" signal. Tag policy `axis=context`, weight 0 into the supply score. 
+ +## The corpus, by lineage (tag at ingestion; the downstream tier is the same-event echo) +- **PRIMARY (original disclosures/events — ingest as dated documents):** **the supply-axis primaries that actually resolve it** — Battery's own raise/funding disclosures, named lenders' funding-source disclosures (Cantor/Maple/FalconX *who-funds-it*, Unchained/Ledn balance-sheet vs. flow), and deployed-dollar reporting; then SEC SABs, OCC Interpretive Letters, FDIC FILs, Fed SR letters **(policy — `axis=context`, NOT supply)**; GlobalCapital, Bloomberg, Galaxy Research crypto-lending reports, S&P/Moody's structured-finance (trade); Unchained/Ledn/Strike/SALT/debifi blogs (demand); Cantor PR (supply *announcement* — distinguish announced from funded); JPMorgan/Goldman/MS earnings + filings (incumbent-entry — **we already ingest these via EDGAR + FMP**). +- **INDEPENDENT (separate vantage, different incentives):** CSBS + GAO (opposing regulators/oversight — leading indicators); Deloitte/KPMG (audit-liability, real-time read of what the rule does to the books); top-tier bank-regulatory law firms (represent the banks deciding); 9fin, PitchBook LCD, Private Debt Investor, Asset Securitization Report (independent credit data); bank 10-K/10-Q. +- **DOWNSTREAM (same-event echo — tag + near-zero independence, like own_network):** the long tail of law-firm client alerts + Mondaq; Creditflux, SCI; crypto trade press (Ledger Insights, Banking Dive, The Block, CoinDesk, Decrypt). **Cantor's $2B program is flagged "THE ECHO-TRAP CASE"** — every outlet on it = ONE event, not corroboration. + +## Ingestion plan (the build to actually run Battery) — ordered by what resolves the supply axis +1. **Committed-capital evidence (the supply resolver — highest priority)** = Battery's own raise/funding disclosures + named lenders' *funding-source* disclosures (whose dollars, drawn vs. announced, deployed at scale). This is what the supply-net is measured on. 
Pull from company disclosures, PitchBook/9fin/credit-data, and the announced-vs-funded distinction on Cantor/Maple/FalconX. **Low volume, decisive.** +2. **Lender + borrower disclosures** = a handful of company blogs (RSS where available) for the demand-net (origination volume, named originators). +3. **Bank treatment** = already available (EDGAR filings + FMP earnings calls for JPM/GS/MS — extend the existing company corpus); read for *deployed* exposure, not stated intent. +4. **Policy primaries (`axis=context`, NOT supply)** = a small set of structured, dated, RSS-accessible document feeds (SEC/OCC/FDIC/Fed) — ingest as `filing`-like text docs (clean, no transcription). Cheap to pull; **weight 0 into the supply score** — present only to explain ambient risk-appetite and to demonstrate the 2025 policy-YES / supply-NO divergence. +5. **Independent credit-data + Big-4 + law-firm** = sample, lineage-tagged independent. +6. **Downstream tier** = ingest but tag `lineage=downstream` → near-zero independence in EISC (same mechanism as `own_network`). The Cantor echo must not read as corroboration. + +This needs a small **policy-document + blog fetcher** (different from RSS-audio and EDGAR) — the next build for the Battery leg. The Strike leg (bitcoin podcasts) is already transcribing. diff --git a/DESIGN_v2.md b/DESIGN_v2.md new file mode 100644 index 0000000..4b82c9b --- /dev/null +++ b/DESIGN_v2.md @@ -0,0 +1,126 @@ +# Ten31 Signal Engine — Design v2 (hypotheses + falsification protocol) + +**Status:** This is **not** a settled-decisions document. It is a set of design *hypotheses* with *falsification criteria*, plus the *evaluation protocol* that is supposed to prove them wrong. It exists because the dev, the implementing agent, and the principal converged in argument — and convergence among three reasoning processes that share priors is **not** evidence the design is right. 
The discipline from here is empirical: stop improving the design by argument; subject it to data that can falsify it. + +**The single most important correction from the v1 review:** all evaluation so far was **recall-only** ("given something we know happened, did the engine catch it?"). The question a fund actually cares about is **precision** ("of everything the engine fires on, what fraction pans out?"). You cannot adjudicate a precision/recall tradeoff — which is what the whole gate debate *is* — while measuring one side. Therefore the evaluation protocol (§1) is the first-class artifact; the design hypotheses (§2) are secondary and only matter once §1 can grade them. + +--- + +## 1. Evaluation protocol (the first-class section) + +### 1.1 Pre-registration (committed before any outcome data is pulled) +To avoid the same hindsight leakage that the hand-written fan-out had (we picked the derivatives we knew were right, phrased to match the corpus), the resolution criteria below are **fixed before** fetching any price/external data. A **single uniform rule** is applied to all derivatives — no per-derivative threshold tuning. + +**Resolution rule (uniform, pre-registered):** +- Each derivative maps to a **proxy basket** of liquid equities most directly exposed to it (committed in `seeds/resolution.K2023.yaml`). +- Build an **equal-weight, start-normalized basket index** over 2023-01-01 → 2025-06-30. +- **"Confirmed real"** iff the index first reaches **≥ +40% vs. its 2023-01 baseline** on some date `D`, **and** is still **≥ +25%** 90 days after `D` (anti-transient-spike). `D` = the **repricing date**. +- Otherwise **"not confirmed"** in-window. +- These thresholds (40% / 25% / 90d) and the baskets are the pre-registration. They are deliberately coarse and uniform so they cannot be gamed per-derivative. 
+ +### 1.2 The engine's two signal dates (to price what the gate costs) +For each derivative we record **both**: +- **cleared_date** — earliest as_of where it cleared the evidence bar (the engine's actual fire). +- **whisper_date** — earliest as_of where the engine saw *accelerating, multi-claim* corroboration **regardless of the independence floor** (`n_confirmed ≥ 4 AND a_corrob > 0`). This is the signal the gate may have suppressed. + +The gap between `whisper_date` and `cleared_date` is **the lead time the gate costs us**, measured, not argued. + +### 1.3 The confusion matrix (precision AND recall) +Per derivative, classify: +- **TP** = cleared **and** confirmed-real (report lead time = repricing_date − cleared_date). +- **FP** = cleared **and** not-confirmed (false alarm — the precision killer). +- **FN** = had a whisper / signal but never cleared, **and** confirmed-real (the gate ate a real signal — the uranium hypothesis). +- **TN** = no signal **and** not-confirmed. + +Report **precision = TP/(TP+FP)** and **recall = TP/(TP+FN)** — and the **whisper-level** matrix too (re-classify using whisper_date), so we can see what precision/recall would be *without* the independence floor. The delta between the two matrices is the empirical answer to the gate debate. + +### 1.4 Held-out precision case (pending the principal) +The 9-derivative matrix measures precision on one (known-winner) conviction. For a clean false-positive rate we need an **adversarial case: a conviction Ten31 held that *did not* pan out**, seeded and run the same way. *Open — Grant to provide a candidate failed/abandoned conviction.* Without it, precision is measured on a favorable sample and should be reported as such. + +### 1.5 The deeper thing the resolver tests +The lead-time **distribution** (not any single signal) tests the engine's *foundational assumption* — that discourse leads capital with usable, consistent lead time (§6.2). 
If the distribution is wide, near-zero, or sometimes negative, the leading-indicator thesis partially collapses. The resolver is therefore not just a grader of signals; it is a test of the premise. This is why it is built first. + +--- + +## 2. Design hypotheses (each with a falsification criterion) + +Adopted from the v1 review; **none are settled** until §1 grades them. + +- **S1 (STANDING RULE) — resolve on the OUTCOME (scaled substance), never on milestones or ENABLERS.** Two Battery lessons: + - *Milestone vs. substance:* the same reality resolved YES on "≥1 major institution enters" (Goldman, one loan, 2022-04) and NO on "≥$X capital *deployed* at scale" (none). + - *Enabler vs. outcome (the sharper one):* the regulatory ENABLER improved (SAB-121 → SAB-122, Jan 2025) yet the OUTCOME didn't move — Battery *still* could not raise institutional dollars post-unblock. **An engine fanning the enabler ("regulation is unblocking") would have falsely "confirmed" supply on the 2025 deregulation.** Battery's binding constraint was never regulatory *permissibility* (nothing legally barred a capital provider from funding it) — it was capital-provider *willingness* to fund a novel product from a new company, which persisted as NO even after the backdrop improved. + So: phrase derivatives as deployed/at-scale substance ("≥$X capital deployed", "sustained volume > Y"), never as checkbox milestones ("a bank entered") **or enablers** ("regulation permits it now"). Enablers/catalysts are CONTEXT, not resolution — they can resolve YES while the conviction fails. + +- **H1 — The contested gate is the EISC independence FLOOR (+ same-cluster discount), not cross-cluster.** v1 internal diagnosis: of 9 derivatives, the binding gate is overwhelmingly `EISC < 2.0` (uranium: 15 confirmations, 5 sources, highest acceleration in the run, suppressed by EISC=1.6; picks-and-shovels' early 2023-11 whisper suppressed by EISC=1.6, delayed 6 months). 
**Falsify:** if the confusion matrix shows the EISC-suppressed derivatives did **not** reprice early, the floor was right and v1's relaxation instinct is wrong. +- **Corrected gate framing.** Cross-cluster was always a *crude proxy for independence of information* — job-invariant, not a Job-A-only rule. Keep the **two-tier** resolution (log the single-cluster whisper to start the clock; act only on broader/independent confirmation) but as *independence-tiering*, not job-splitting. +- **H2 — Weight independence by `claim_type`.** Operational/descriptive disclosure from a self-interested source (order book, capex, interconnect queue) is more trustworthy than its predictive/opinion claims. **Falsify:** if down-weighting predictive self-interested claims does not improve precision on the held-out case. +- **H3 — Independence is lineage, not cluster** — an *edge-population* problem on the existing EISC graph (`source_edges.edge_type` already supports citation/community), not a rebuild. Cheap interim: **source-type independence priors** + `claim_type` (H2). **NOT temporal-precedence as a standalone discount** — the dev's catch: "later near-identical = discount" misclassifies independent re-derivation as echo (3 operators reporting queues = the corroboration we want). Precedence may only *set the direction of* an already-detected dependency edge, never stand alone. +- **H4 — Replace the 2nd-derivative with a persistence / level-crossing test** on the corroboration arrival rate ("crossed N and stayed non-decreasing across k windows; shrinkage at low n"), with **per-source-type window cadence** (filings quarterly, podcasts weekly). **Falsify:** re-run the backtest with the persistence estimator; if it does not reduce the sign-flip variance visible in Appendix A, the 2nd-derivative wasn't the problem.
+- **H5 — Two-sided Job B (a product feature, not just a defense).** Track **net independence-weighted corroboration (affirms − denies)**; *accelerating contradiction + over-exposure* = an "eroding-conviction / reduce" signal of equal value to under-acted-conviction. Also the structural reflexivity defense (Job B is confirmation-seeking by construction — fan out from beliefs, filter to affirms-only). +- **H6 — The frontier fan-out is the UNTESTED half, and it is the half that maps to the actual miss (§1.1).** The backtest hand-wrote the derivative tree (hindsight). Design the test: seed the 2023 conviction, give the model **2023-only** context, let it **propose** derivatives, score that tree's precision/recall vs. what actually repriced, and compare frontier-proposed vs. hand-written. **Grade on ALTITUDE, not just topical correctness (per S1):** "does it propose the derivative at the right altitude — scaled substance vs. first-instance milestone" — because that choice alone flips the verdict (Goldman vs. Cantor). A model that proposes topically-right but milestone-altitude derivatives will look prescient on headlines while missing the real conviction state. +- **Extraction fixes (low-risk):** Item-7 (MD&A) targeting for filings; relations populated at a **linking stage** over the embedded store, not by the extractor (a spec clarification the dev owns). + +--- + +## 3. Build sequence +1. **Resolver + the pre-registered confusion matrix on the 9 derivatives already run** (this measures precision, settles H1 empirically, and tests the lead-time premise). ← *building now.* +2. **Independence/estimator rework** (H2 claim_type + source-type priors; H4 persistence estimator) — in parallel; makes the *next* backtest trustworthy. +3. **Frontier-fan-out test** (H6) — right behind the estimator; cheap, strategically central. +4. **Two-sided Job B** (H5). +5. **Held-out failed-conviction case** (§1.4) — once provided. +6. 
**Broad corpus expansion — last, and lineage-aware** (toward independent vantage points: operator/expert-network, supply-chain OEMs/EPCs/ISOs, FERC/interconnection filings, policy — NOT more correlated sell-side/trade-press). + +## 4. Standing discipline +Every additional round of elegant consensus past this point has decreasing value and increasing risk of mistaking coherence for correctness. Changes to scoring parameters (the EISC floor especially) must be justified by the confusion matrix / held-out precision, **not** by making a known case clear. The resolution criteria in §1.1 are frozen; if they change, that is a pre-registration event to be logged and justified, not a quiet tune. + +### 4.1 Pre-registration change log (post-hoc normalizations — logged per §4) +Any judgment call made *after* seeing outcome data is a logged pre-registration event, with reasoning. Three entries on the adversarial cases — two normalizations (both correct, both post-outcome — logging them is what protects the labels from "we graded it how we wanted") and one retracted causal gloss: +1. **STRIKE-card-rail-disruption polarity normalization.** The verifier agent returned `verdict=confirmed`, but its evidence was *zero* bitcoin/Lightning erosion attribution across all five Visa/MC 10-Ks. The `confirm_iff` criterion was "erosion IS attributed," so the criterion was NOT met. Normalized to `played_out: no` (thesis failed). Reasoning: the agent used "confirmed" to mean "confirmed the no-erosion finding"; the pre-registered criterion is about whether disruption occurred, which it did not. Polarity, not substance. +2. **BATTERY-incumbent-entry milestone-vs-substance split.** The criterion as written ("≥1 major institution publicly entered") resolves YES (Goldman, one loan, 2022-04). Recorded as `played_out: token` with the explicit note that the *substance* (institutional capital at scale, the separate BATTERY-institutional-supply check) resolved NO.
Reasoning: the milestone phrasing is a weak proxy for the thesis; logging both verdicts on the same reality is the evidence for standing rule S1. **The criterion phrasing itself was the flaw**, which is now fixed forward by S1 — but the original label is preserved, not silently rewritten. +3. **BATTERY causal misattribution — CORRECTION (principal ground-truth, 2026-06-08).** The agent's corpus-scoping narrative claimed the institutional-supply failure was "structurally regulatory" (SAB-121 blocked it). **This was wrong and is retracted.** Principal correction: nothing legally restricted capital providers from funding Battery; the binding constraint was capital-provider *willingness* to fund a novel product / new company. Proof it was NOT regulatory: the 2025 regulatory unblock did NOT produce institutional dollars for Battery (still unraised as of 2026). The frozen pre-registered criterion (BATTERY-institutional-supply = "$X *deployed* at scale", `played_out: no`) was correct and unchanged; only the post-hoc causal gloss was wrong — a textbook instance of the plausible-narrative failure mode this engine exists to catch, logged here rather than buried. Policy/regulatory sources are demoted to CONTEXT for the Battery supply axis, not its resolver (see BATTERY_corpus_scope.md, corrected). + +--- + +## v2.1 amendments (post-confusion-matrix; two protocol corrections + two adversarial cases) + +The first confusion matrix (§5 below / the power-infra run) and the dev's review forced two protocol corrections *before this doc ossifies*, plus the two failed-conviction cases the precision axis was missing. + +### Correction A — measure RUNWAY, not lead-time-vs-first-tick +Lead-time (days before the first price tick) is a *trading* metric. Ten31 is a long-duration, often-private holder. 
The right metric is **remaining durable runway at signal** = the fraction of the eventual durable move still ahead when the signal fired: +`runway = (durable_peak − index_at_signal) / (durable_peak − baseline)`. +- High runway = the signal was actionable for a holder even if "late" by a trading clock. +- **Do not penalize a modestly-late public read** — public comps lead the private rounds we actually enter, so a signal a few weeks/a month after the first public tick still left a real window. This is a *lens on the public number*, not a private-markets dataset to source. Public markets remain the workhorse instrument. +- *Why it matters here:* the power-infra cells with "negative lead-time" (uranium −46d, picks −116d) had most of a +225%/+392% move still ahead at signal → high runway → valuable to a holder. Lead-time mis-scored them; runway corrects it. + +### Correction B — tag derivatives by DISTANCE-FROM-EDGE, never filter on it +The engine's job is to surface derivatives — *including* ones outside the current mandate (the original AI/compute miss *was* a mandate-expansion failure; an engine that pre-filters to in-mandate reproduces exactly that blindness). So every derivative carries `distance_from_edge ∈ {in_mandate, one_hop, two_hop}` for **triage only**. (Uranium = two_hop — the most dramatic power-infra cell but the least decision-relevant; the engine should still surface it, tagged, and let judgment decide.) Schema: `fanout_nodes.distance_from_edge`. **No scorer may filter on it.** + +### The two adversarial failed-conviction cases (the negatives — §1.4 satisfied) +Both are convictions Ten31 still holds where the *falsifiable thesis* mis-fired. Pre-registered criteria committed in `seeds/resolution.STRIKE2022.yaml` / `resolution.BATTERY2022.yaml` *before* outcome-labeling; seeds in `conviction_log.adversarial.seed.yaml` + `fanout.STRIKE2022/BATTERY2022.seed.yaml`. 
+ +- **STRIKE2022 (the single most valuable negative).** Failed thesis = Lightning-as-retail-payments-network displacing card rails. It was **narrative-driven** — the bitcoin-podcast cluster told this story loudly in 2022 = the exact single-cluster, reflexive, talk-our-own-book corroboration the relaxed gate would wave through. **The test:** if the engine *clears* the Lightning-payments derivatives on bitcoin-cluster chatter that never showed up in real retail-payment volume, that's the **false positive** that proves the cross-cluster/lineage discipline. Also a two-sided test: did *non-bitcoin* sources ever corroborate, or only the book-talkers? +- **BATTERY2022 (the two-sided / timing case).** Thesis = bitcoin-as-collateral credit goes mainstream (D1). Right on **demand**, early/wrong on **supply** (institutional lending capital didn't arrive at scale). **The test:** would the two-sided scorer (net independence-weighted affirms − denies) have surfaced that the *supply-side* corroboration was flat while *demand-side* rose — a genuine, non-hindsight "timing is early" disconfirmation? + +### Corpus implication (flagged, needed to RUN these — not yet ingested) +- The STRIKE false-positive test **requires the bitcoin-cluster Lightning-payments discourse to be in the corpus** — which is the orbit §7.4 *deliberately excluded* (TFTC / Bitcoin Alpha / the Odell-Bent network, as "Ten31's own priors"). **The test of the echo-chamber detector needs the echo chamber in the corpus.** That exclusion was right for Job A discovery and is exactly wrong for this precision test. +- Both cases resolve on **adoption-evidence indicators** (Lightning retail volume; institutional BTC-credit capital / SAB-121 / bank entry), **not equity-price baskets** — a different resolver leg than the power-infra prices. Gathering that evidence (pre-registered) + ingesting the 2022–2024 bitcoin/institutional corpus is the next build to actually run them. 
+ +--- + +## v2.2 (post-bitcoin-cases) — three conditions BUILT, outcomes verified, scope set + +### The three conditions (all built + unit-verified) +1. **own_network quarantine — a PERMANENT live-scoring rule, not backtest-only.** `sources.own_network` + `effective_independent_N(mode=...)` default to `mode='live'` everywhere, which DROPS own_network sources (the Odell/Bent partner orbit = Ten31 listening to itself). The only path that keeps them is `two_sided.trajectory(mode='test')` — the explicit reflexivity fixture. Confirmed standing: every forward bitcoin-thesis signal automatically excludes the own_network orbit. +2. **Resolution criteria tightened** to named/countable/dated (hostile-checker standard). +3. **Two-sided net-corroboration** (`signals/two_sided.py`) — independence-weighted affirms − denies — is the instrument for the adversarial cases (NOT runway). + +### Adversarial outcomes (hostile-verified, frozen — `seeds/resolution_outcomes.adversarial.yaml`) +- **STRIKE2022 = thesis DEAD** on all 3 (merchant integration <2-of-3 at scale; no material retail Lightning volume; zero card erosion in 5 Visa/MC 10-Ks). The clean false-positive test. +- **BATTERY2022** = demand REAL (≥4 named lenders); institutional supply FAILED in-window (Cantor $2B announced 2024-07, deployed 2025-05); incumbent-entry TOKEN (Goldman one loan 2022-04 — see S1); SAB-121 rescinded 2025-01-23 (edge). + +### Scope (the two cases want OPPOSITE things — do not conflate) +- **STRIKE: narrow and FROZEN.** 3 own_network + 3 independent shows, exactly. Purpose = prove the quarantine: clear in test mode (own_network visible), silent in live (own_network dropped). Widening muddies the contrast — narrow is the feature. 
+- **BATTERY: scoped SEPARATELY and broader, lineage discipline ON.** Resolution lives in institutional-credit / policy discourse (SAB-121 commentary, bank earnings-call digital-asset-lending treatment, private-credit trade press, named lenders' disclosures) — NOT bitcoin podcasts. Doubles as the first installment of the §3-step-6 lineage-aware expansion. TRAP: most institutional-credit press is downstream of the same few announcements (every outlet on the Cantor $2B headline = ONE event, not corroboration) — tag downstream/same-event at ingestion. + +### Standing line (unchanged) +These two cases prove the failure-mode machinery (quarantine catches the echo; two-sided catches the timing split). They are hindsight-known. The precision number that decides whether this is worth running comes only from FORWARD operation on signals nobody pre-selected. Once these run clean, the next move is LIVE, not another backtest. diff --git a/PILOT_BACKTEST_WRITEUP.md b/PILOT_BACKTEST_WRITEUP.md new file mode 100644 index 0000000..fcae6c4 --- /dev/null +++ b/PILOT_BACKTEST_WRITEUP.md @@ -0,0 +1,288 @@ +# Ten31 Signal Engine — Pilot Backtest Write-up + +**Author:** Claude (Claude Code), implementing dev +**For:** Grant + the dev who authored the handoff/scoping document +**Date:** 2026-06-08 +**Status:** Pilot build complete; §7.1 backtest executed end-to-end with a *qualified* result. This document is the honest assessment, the judgment calls I made, and the open questions for a second opinion. + +> **Read this as a peer review request, not a victory lap.** The engine works end-to-end and surfaced the right thesis, but the *signal quality* on the current corpus is coarse, and several design tensions in the handoff doc only became visible once there was real data flowing through. Those tensions — especially the cross-cluster gating question Grant raised — are the point of this write-up. + +--- + +## 1. 
Executive summary + +I built the full pilot per the handoff: ingestion (audio + text) → local claim extraction → hybrid vector store → the "scoring brain" (independence-discounted, as-of-disciplined nomination) → the §7.1 backtest → a dual-evaluation ledger. It runs against the operator's real local-compute stack (Spark Control) and a real ~6,600-claim corpus drawn from ~25 companies and a handful of podcasts. + +**The §7.1 backtest verdict is a qualified YES.** Seeded with the 2023 Kirkwood "power is the binding constraint" conviction and marched as-of across 2023–2024, the under-acted-conviction scorer: + +- **surfaced the root thesis cross-cluster in May 2023** (energy *and* AI sources, independent), and +- **surfaced the headline derivative ("size up the power-infra picks-and-shovels") in May 2024**, along with transformers and utilities-repriced. + +So the mechanism the project exists to build — *fan a held conviction to its derivatives and catch the world starting to corroborate them* — demonstrably works on real history. + +**But three honest caveats keep it from being a clean win**, and they drive the open questions: + +1. The signal is **noisy** (the acceleration metric swings between earnings seasons; there's visible run-to-run variance). +2. The cross-cluster breadth shows up at the **root** level, not the **derivative** level — the specific power-infra derivatives stay energy-cluster-corroborated. +3. The derivatives only clear because I **relaxed a cross-cluster gate for Job B** — a judgment call (§7 below) that is exactly what Grant wants to debate. + +The most important open question, in Grant's words: *is strict cross-cluster gating limiting our ability to pick up signal early — and is the real fix to dramatically broaden the cluster taxonomy and the corpus?* I think the answer is largely yes, and I lay out why in §8. + +--- + +## 2. What was built (architecture as implemented) + +3,347 lines of Python, 44 modules. 
Everything local-compute runs through the operator's existing **Spark Control** gateway (we call HTTP endpoints; we did not stand up vLLM/Whisper/Qdrant). The one external call is the bounded frontier step (not exercised in the backtest — see §7, deferred). + +| Layer (handoff §) | What's built | Notes | +|---|---|---| +| **Ingestion — text (§4.1)** | SEC EDGAR (10-K/10-Q/20-F/40-F), FMP earnings-call transcripts | Earnings-call *audio* proved unfetchable (no uniform feed, ~30–90d replay expiry) → FMP transcript API, per §12. Filings dedup on accession; earnings on symbol+quarter. | +| **Ingestion — audio (§4.1, §4.5)** | RSS + YouTube fetch, long-audio chunking (~2.5 min), **Parakeet transcribe + Sortformer diarize + 192-d TitaNet voiceprints**, cross-chunk speaker stitching, a persisted voiceprint library | Verified live: a real podcast → speaker-attributed transcript → claims. | +| **Speaker identity (§4.5)** | Voiceprint cosine matching across episodes/shows **+ LLM speaker-naming** (host/guest from the intro) → name-based independence edges | Grant's idea: name-based overlap is robust to voiceprint drift across shows. Both edge types feed the independence graph. | +| **Extraction (§4.2)** | Local Qwen, the finalized claim schema, JSON-mode, temp 0, "willing to emit zero" | Pluggable backend: **local Qwen (default) or Gemini batch** (validated, for overflow/scale; public corpus only). | +| **Embedding + store (§4.3)** | bge-m3 dense + BM25 sparse → Qdrant hybrid collection; retrieval + rerank via the gateway | Embeds distilled propositions, not raw chunks. | +| **Scoring brain (§4.4, §4.5)** | EISC independence primitive; as-of harness; windowed acceleration; **under-acted-conviction (Job B) scorer**; the quantitative bar; ledger writer; resolver (stub) | See §3. Job A scorers (emergence/stance/intersection) and the frontier judge/fan-out are **deferred** per the blueprint build-order — the backtest is Job B only. 
| +| **Backfill queue (§13.4)** | Client-side GPU-hours queue: idempotent, leased/crash-safe, prioritized | Extraction ran ~900 docs on one GPU as a serial job. Transcription on the other GPU in parallel. | +| **Provenance / dedup** | Layered: stable item-id (robust pre-GPU guard) + normalized title/date (cross-mirror) + content-hash (audit only) | Corrected after Grant flagged that a transcript hash is a brittle dedup key. | +| **Ledger (§4.7, §6)** | SQLite dual-evaluation ledger; logs every bar-clearer; resolution columns separated from scoring (look-ahead guard) | Live with its first entries. | +| **UI** | FastAPI corpus-management app (dashboard, add/view sources, inspect per-source claims) | The "menu" to grow and audit the corpus over time. | + +**Corpus the backtest ran on (snapshot):** 6,569 claims (5,129 embedded at backtest time), from 411 filings + 410 earnings transcripts + 82 podcast episodes (4 RSS-full shows for 2022–2023: Dwarkesh, Hidden Forces, All-In, Invest Like the Best; plus a partial Catalyst slice). Claim types: 2,780 predictive / 1,447 interpretive / 2,267 descriptive / 75 reactive. Clusters: **energy 3,135 · ai_tech 2,329 · bitcoin 765 · vc_consensus 139 · macro 103 · generalist 98.** 90 voiceprints (35 named), 10 shared-guest edges. + +**Note the cluster imbalance** — it's central to §8. The corpus is overwhelmingly company filings/earnings (two clusters, energy + ai_tech) with a thin podcast layer. That is not a balanced cross-cluster corpus. + +--- + +## 3. The scoring brain (how nomination works) + +This is the part where the handoff's hard constraints (§5) had to become concrete code. Design was done via a 3-way design panel (statistical / graph / pragmatic lenses) synthesized into one blueprint; I then built it. + +- **EISC — Effective Independent Source Count (the §4.5 differentiator).** Given the sources converging on a topic, discount by connectedness using a noisy-OR connectedness matrix + inverse-row-sum. 
Verified on synthetic cases: 5 identical clones → ~1.0 voice; 5 cross-cluster independents → ~5.0; all-bitcoin → floored ~0.4; "one guest doing the rounds" across many shows → ~1.0. (I improved the cross-cluster multiplier over the blueprint so a single guest spanning many clusters can't fake the gold-tier bonus.) **Every count that feeds a score routes through EISC — never a raw source count.** +- **As-of harness (§6.6).** Every scorer reads an `as_of`-filtered view; nothing reads the raw claims table. At nomination time only claims dated ≤ as_of are visible. This is what makes the backtest honest (no look-ahead). +- **Windowed acceleration (§4.4).** The signal is the discrete 2nd derivative of the EISC-weighted claim flow per topic — *not* raw size. Window length must match corpus cadence (90 days for quarterly filings; 28 for weekly podcasts). +- **Under-acted-conviction / Job B (§4.4).** `conviction_weight × exposure_gap × rising_independent_corroboration`. Corroboration = retrieve (hybrid search) → LLM filter to affirms-only → independence-weighted acceleration over the confirmed set. **Exposure is joined locally and never crosses the frontier boundary** (§4.6). +- **The quantitative bar (§5.1).** Two tiers: an *evidence bar* (clears hard gates → log a ledger row, the denominator) and a *promotion bar* (also clears a score threshold → would go to the frontier judge). Stats nominate; the model would only judge a pre-filtered shortlist. + +--- + +## 4. The §7.1 backtest — methodology + +Per the handoff (§7.1 is the headline pilot test), I ran it **before** any forward pilot. + +- **Seed:** the 2023 Kirkwood conviction `K2023` ("compute will ~1000x; energy becomes the binding constraint; interruptible load is the edge"), logged in the human-owned conviction log with high conviction / low exposure (`lt2`). 
+- **Fan-out (v1, hand-written):** Per the blueprint's build order, I **hand-wrote** the 2nd/3rd-order derivative tree (grid interconnect, transformers, substations, cooling, gas turbines, nuclear, uranium, utilities repriced, and the headline "size up power-infra picks-and-shovels"). *Why hand-written:* it removes the frontier from the first backtest and isolates the real question — **does the scoring surface the derivative once it exists?** — from the separate question of whether the frontier can *propose* the right derivatives. (That second question is untested; see §6.) +- **Run:** marched a quarterly `as_of` from 2023-03 to 2024-09 (7–9 points), 90-day windows. At each as_of, for each derivative: retrieve corroboration from the corpus, LLM-filter to genuine affirmations, compute independence-weighted acceleration, apply the bar, log every clearer to the ledger. +- **Look-ahead control:** all retrieval/scoring at as_of only sees claims dated ≤ as_of. The resolver (forward leg) is a separate, isolated pass (a stub for now — see §6). + +--- + +## 5. The §7.1 backtest — results + +I ran it twice: once on the company-only corpus (~4,500 claims), then a "sharpened" re-run after the cross-cluster podcast claims landed (~5,100 embedded). 
**Presenting both is deliberate — the differences between them are themselves a finding (run-to-run variance / noise).** + +### Run 1 — company corpus (~4,500 claims) +| Derivative | First cleared evidence bar | Evidence at clear | +|---|---|---| +| **Root: "power is the binding constraint"** | **2023-05-30** | EISC 3.0, 4 sources, **k_eff=2 (cross-cluster: energy+AI)**, accel +1.0 | +| **Headline: "picks-and-shovels"** | 2024-05-24 | EISC 2.0, 5 sources, k_eff=1, score 2.56 | +| Utilities repriced | 2024-05-24 | EISC 2.5, **8 sources**, k_eff=1, built steadily from 2023 (src 1→2→4→8) | +| nuclear / transformers / gas / uranium / cooling | peaked but did **not** clear | EISC or acceleration fell short in the cleared window | + +### Run 2 — + cross-cluster podcast claims (~5,100 embedded) +| Derivative | First cleared | Note | +|---|---|---| +| **Root** | **2023-05-30** | unchanged (cross-cluster) | +| **Headline: "picks-and-shovels"** | **2024-05-24** | peak 3.33; notably it *scored* 3.33 back at 2023-11 but EISC was 1.6, just under the 2.0 floor, so it logged-but-didn't-clear then | +| **Transformers** | **2024-05-24** | newly cleared (peak 4.80) | +| Uranium | did not clear | peak 7.04 (!) but never simultaneously cleared all gates | +| **Utilities repriced** | did **not** clear | cleared in Run 1, *not* in Run 2 — **this is the run-to-run variance / noise, exhibited directly** | + +**What the numbers say, honestly:** + +- The **root thesis is a genuinely clean result** — it cleared cross-cluster (k_eff=2) in May 2023 in both runs, *independent of the contested design call*. The system would have flagged "the world is starting to corroborate that power is the binding constraint, and Ten31 is under-exposed" in mid-2023. +- The **derivatives surface, but messily.** They clear mid-2024, mostly single-cluster, and *which* ones clear shifts between runs. 
The acceleration (2nd derivative) flips sign between earnings seasons (`+2.6 → −2.2 → +1.6 → −1.0`), so a derivative clears in whatever window the curvature happens to be positive. That is fragile. + +--- + +## 6. Honest assessment + +### What worked well +1. **The end-to-end machine is real and disciplined.** Ingest (text *and* audio) → local extraction → hybrid store → independence-discounted nomination → as-of-honest backtest → ledger. It runs on the operator's actual stack, on a real multi-thousand-claim corpus. +2. **The EISC independence primitive does its job.** "Five shows, one guest" collapses to ~1 voice; the bitcoin cluster is structurally floored; cross-cluster gets the bonus. This is the heart of §4.5 and it behaves correctly and auditably (every score is reconstructable from its inputs). +3. **Extraction discipline holds.** The extractor emits *zero* on boilerplate (8-Ks, 10-K front-matter) and rich, well-typed claims on earnings Q&A (~82% interpretive/predictive vs. descriptive). Earnings calls massively out-yield filings for signal — a concrete finding that confirms a §4.1 hypothesis. +4. **The root-thesis result is the real validation.** The single most important thing §7.1 asked — would the engine have surfaced this in time — is *yes* for the root conviction, cross-cluster, in 2023. +5. **The as-of discipline + the ledger are correct by construction.** Resolution is structurally separated from scoring; the denominator started day one; the model never sees a human rating before logging. The anti-self-deception machinery is in place. + +### Limitations & open questions (the important half) +1. **Noise on sparse, quarterly, single-domain data.** The 2nd-derivative acceleration is fragile when claims cluster in earnings seasons. The blueprint *deliberately deferred* the statistical smoothing (weighted-quadratic fits, significance gates, shrinkage) as premature at small n. 
**Open question:** with a bigger corpus, is raw 2nd-difference enough, or do we need that smoothing now? The run-to-run variance suggests we need *something*. +2. **Cross-cluster breadth is at the root, not the derivatives.** The diagnosis was concrete: in 2022–2023, AI-company *earnings* barely mentioned electricity as a constraint (that narrative hit 2024–25). So the niche power-infra derivatives are corroborated almost entirely by the *energy* cluster. The cross-domain early discussion lived in *specialist* discourse (energy/macro podcasts), which we under-sampled. **This is the crux — see §8.** +3. **The frontier fan-out is untested.** The backtest used a *hand-written* derivative tree. We have **not** validated whether the frontier model, given the seed conviction, would *propose* the right derivatives (grid/transformers/nuclear/…). That's a separate and important test (it's the other half of Job B). It's deferred, not done. +4. **No lead-time measured yet.** The resolver (external-confirmation leg) is a stub. We can say the engine *surfaced* the derivatives at specific dates, but we have not yet measured earliness against the *actual* repricing of power infrastructure (the alpha measurement, §6.3). That needs price/event data and forward time. +5. **Filing extraction targets the wrong thing.** It reads filings front-to-back; 10-K front-matter and risk-factors are low-yield. It should target Item 7 (MD&A). This skews filing claims toward boilerplate and likely costs us signal. +6. **Stance/relation extraction is thin.** The local extractor sees one chunk at a time, so it rarely wires the cross-document `relation` links the §4.2 schema assumes. The Job A contrarian scorer therefore needs a separate LLM stance-folding pass (designed, not built). **Worth flagging to the handoff author:** the schema implies relation-linking that is hard to populate at extraction time. + +--- + +## 7. 
Judgment calls I made (please scrutinize all of these) + +Every place I made a decision the handoff didn't fully specify, or where I diverged: + +1. **[BIGGEST] Relaxed the cross-cluster gate for Job B.** The design blueprint applied the §4.5 cross-cluster rule (`k_eff ≥ 2`) as a *universal* hard gate. I removed it as a *hard gate for the under-acted-conviction (Job B) scorer* — keeping EISC ≥ 2.0 (genuine independence) and a ≥2-source requirement, and letting cross-cluster *boost the score* instead of gating it. **Rationale:** the handoff §4.4 defines Job B as *"rising independent corroboration,"* whereas §4.5's cross-cluster-is-gold framing is about Job A *discovery* (avoiding echo chambers). N independent energy companies confirming a power thesis is corroboration, not an echo. **This is the difference between the derivatives clearing or not** — with the strict gate, *only the root clears* (cross-cluster, 2023). This is the #1 thing to debate (§8). +2. **Window length = 90 days for the backtest** (blueprint default was 28). 28-day windows are degenerate on quarterly filings/earnings (most windows empty). Made it configurable; 90d for filing-cadence corpora, 28d for weekly podcasts. *Open question: mixed-cadence corpora (filings + podcasts) want different windows simultaneously — currently one global value.* +3. **Improved the EISC cross-cluster multiplier.** Blueprint counted "distinct non-capped clusters present." I changed it to count only clusters that contribute ≥ 0.5 of an independent voice — so "one guest spanning 4 clusters" can't earn the gold multiplier. (A correctness fix, not a divergence in intent.) +4. **Hand-wrote the fan-out for v1** (per blueprint build-order). The derivative *phrasings* are mine, and the LLM relevance filter judges corroboration against those phrasings — so wording matters. A frontier-generated tree might phrase them to match the corpus better (or worse). Untested. +5. 
**Deferred the statistical-significance machinery** (Design 1's fitted curves / bootstraps / z-gates) as premature at pilot n — kept the hard minimum-evidence gates, not the smoothing. This is *why* the signal is noisy. Reconsider as the corpus grows (§6.1). +6. **Build order: Job B first; Job A (emergence/stance/intersection) and the frontier judge/fan-out deferred.** So the backtest tested Job B only, with no frontier in the loop. Faithful to the blueprint, but it means large parts of the §4 design are designed-not-built. +7. **Filings = 10-K/10-Q/20-F/40-F only** (skipped 8-K/6-K as low-yield current-reports). Earnings via FMP. Podcasts = the 4 RSS-full shows + a partial Catalyst slice. **I did not get full coverage of the specialist energy/macro podcasts** (Catalyst beyond that partial slice, Columbia Energy, Macro Voices, Odd Lots) for 2022–2023 — they're YouTube-only with slow date-windowed enumeration. This under-samples exactly the cluster breadth the derivatives needed. +8. **Local Qwen for all extraction + scoring LLM helpers.** Gemini validated as an overflow backend but not used in the backtest. + +--- + +## 8. The central debate: cross-cluster gating vs. corpus breadth + +This is the section to take into the brainstorm. Grant's framing (paraphrased): *strict cross-cluster gating may limit our ability to pick up signal early; perhaps the real fix is that the cluster list is too small and there isn't enough breadth within each cluster, so the corpus needs to be dramatically increased.* I think this is the right instinct, and here's the structured case. + +### The tension, precisely +- §4.5 is unambiguous and correct *for Job A discovery*: cross-cluster convergence is gold, within-cluster is near-noise (five bitcoin shows agreeing = the prior, not signal). +- But **Job B (derivatives / fan-out) has the opposite early-signal dynamic.** A niche derivative's *earliest* corroboration almost always comes from the single most-relevant cluster — the people closest to it.
Power-infra repricing showed up *first* in energy-company earnings and energy-specialist discourse, and only *later* spread to AI companies and generalist macro. **Requiring cross-cluster corroboration means you only fire once the signal has already spread — which is precisely when you've lost the lead time.** The backtest demonstrates this exactly: the cross-cluster version of the signal (the root) is real but broad; the *actionable derivative* corroboration is single-cluster and earlier. + +This is, I think, a genuine gap in the handoff: §4.5's "within-cluster is near-noise" was written with discovery in mind and is in tension with §4.4's "rising independent corroboration" for Job B. The implementation had to pick; I picked "relax for Job B." **The dev who wrote the spec should weigh in on whether that's the intended reading.** + +### Why this points at corpus breadth (Grant's hypothesis), and I agree +The reason single-cluster corroboration feels uncomfortable is the fear of an echo chamber (energy companies talking their book). **The principled fix isn't to demand cross-cluster — it's to make "independent within a domain" *mean something*, which requires breadth.** Right now: +- We have **6 coarse clusters**, and the corpus is dominated by **two** of them (energy, ai_tech), almost entirely **company filings/earnings**. Within "energy," CEG/VST/TLN/NEE are independent issuers but they're all *sell-side-of-their-own-demand* — partly correlated by construction. +- A handful of podcasts (4 shows) provide the only non-company voices, and the *specialist* energy/macro podcasts that would carry the early cross-domain signal weren't ingested for the backtest window. + +So the corpus is both **too narrow** (few clusters, two dominant) and **too shallow within clusters** (few genuinely independent voice-types per cluster). Two complementary directions: + +1. 
+1. **Finer cluster taxonomy.** "Energy" → {power utilities, grid/equipment, nuclear/uranium, gas, energy-specialist media}. "AI/tech" → {chips, hyperscalers, data-center REITs, AI-specialist media}. Add clusters the pilot omitted entirely: **sell-side research, trade press / industry newsletters, expert-network transcripts, specialist substacks, conference/earnings-adjacent commentary, policy/regulatory.** With a finer taxonomy, *cross-sub-cluster* convergence (e.g., a nuclear operator **and** a grid-equipment maker **and** an energy-trade newsletter) becomes a meaningful *early* signal — and the strict cross-cluster gate becomes defensible again because the clusters are now granular enough to converge early. +2. **Dramatically more breadth within each cluster.** More issuers, far more podcasts/media, and crucially the *specialist* sources where derivatives are discussed first. This is the difference between "4 energy companies" (correlated) and "20 independent energy-ecosystem voices of different types" (genuinely independent). + +### My recommendation for the debate (not a decision — a starting position) +- **Short term:** keep Job B's gate at *independence* (EISC ≥ 2, ≥2 sources) for the **evidence/logging tier** — so we *catch and log* early single-cluster corroboration and start the lead-time clock — and use **cross-cluster as the promotion/confidence tier** (the thing we'd actually act on). This preserves earliness *and* honesty: we log the early single-cluster whisper, but we don't treat it as high-confidence until it's broadened. +- **Medium term (the real fix, Grant's point):** broaden the cluster taxonomy and dramatically expand the corpus — especially the specialist/media sources and finer sub-clusters. This likely does more for signal quality than any scoring tweak, and it would let us *re-tighten* the cross-cluster requirement without losing earliness, because convergence would happen earlier across a richer cluster space.
+- **Either way:** build the **resolver / lead-time** measurement next, because *"did it clear the bar"* is far less interesting than *"how early did it clear vs. the actual repricing"* — and that number is what tells us whether the relaxed gate is finding alpha or just noise. + +--- + +## 9. Suggested agenda for the brainstorm with the handoff author + +1. **The §4.4-vs-§4.5 tension for Job B.** Is "rising independent corroboration" meant to allow single-cluster (independent-within-domain) corroboration, with cross-cluster as a confidence multiplier? Or is cross-cluster a hard requirement even for derivatives (accepting later signal)? *This is the load-bearing question.* +2. **Cluster taxonomy + corpus breadth.** How far to broaden clusters and sources? Which new source *types* matter most (sell-side, trade press, expert networks, specialist media)? What's the target corpus size for the cross-cluster signal to be early *and* honest? +3. **The temporal statistic.** Is raw 2nd-difference acceleration the right signal, or do we adopt the deferred smoothing now? The run-to-run variance argues for the latter. +4. **Frontier fan-out validation.** Design a test for whether the frontier *proposes* the right derivatives from a seed conviction (the untested half of Job B). +5. **Lead-time / resolution.** What external-confirmation data (price, signed deals, policy) feeds the resolver, and how do we grade earliness? +6. **Filing extraction → MD&A targeting**, and the relation/stance extraction gap (does the §4.2 schema's relation-linking need a dedicated pass?). + +--- + +## 10. Appendix + +**Corpus at backtest time:** 6,569 claims (5,129 embedded) · 411 filings + 410 earnings + 82 podcasts + 3 youtube · 47 sources · 90 voiceprints (35 named) · 10 shared-guest edges · 4 ledger rows · 81 candidate-score rows. 
+ +**Key parameters:** windows × 3 (28d → 84-day lookback at the blueprint default; 90d → 270-day lookback in the backtest); EISC floor 2.0; under-acted score floor 0.3; coupling κ {shared_guest 0.85, citation 0.45, community 0.60}; cluster coupling {bitcoin 0.55, vc_consensus 0.35, other-same 0.25}; bitcoin/capped contribution ≤ 0.25. + +**The contested gate, in code:** `signal_engine/signals/bar.py::_under_acted` — the `k_eff ≥ 2` requirement is commented out with the rationale; re-adding it reverts to "only the root clears." + +**Reproduce:** `python -m signal_engine backtest --conviction K2023 --start 2023-03-01 --end 2024-09-01 --step-days 90 --window-days 90`. Trajectories print per-derivative with the evidence at each as_of. + +**Module map:** `ingest/` (fetch + transcribe + diarize + identify), `extract/` (claims + backends), `embedstore/` (Qdrant hybrid), `signals/` (the scoring brain: independence, asof, windows, under_acted, bar, ledger_writer, resolver, run), `frontier/` (designed, deferred), `spark/` (the single gateway client), `store/` (schema + seeds), `ui/` (corpus app). + +--- + +*Bottom line for the brainstorm: the engine is built, disciplined, and it surfaced the right thesis on real history. The honest gap is signal quality, and the highest-leverage fix is almost certainly corpus breadth + a finer cluster taxonomy (Grant's instinct), which would also let us resolve the cross-cluster gating debate from a position of strength rather than scarcity.* + +--- + +> **Note on dates:** the quarterly as-of march is 2023-03, -05, -08, -11, 2024-02, -05, -08. The **2023-12 and 2024-03** columns are two ad-hoc single-date smoke runs (off the quarterly grid) that happen to be stored in the same table — included for completeness. The score for the SAME node at adjacent dates (e.g. 2023-11 vs 2023-12) swinging from 3.3 to 0 is itself a vivid illustration of the cadence-sensitivity problem.
+ +## Appendix A — Full score trajectories (the noise, concretely) + +Every under-acted-conviction node × every as-of date that was scored. `★` = cleared the evidence bar. The point of showing this: watch the score and the acceleration `a` swing between adjacent quarters — that is the noise the write-up (§6.1) describes. + +| derivative | 2023-03 | 2023-05 | 2023-08 | 2023-11 | 2023-12 | 2024-02 | 2024-03 | 2024-05 | 2024-08 | +|---|---|---|---|---|---|---|---|---|---| +| K2023 | 0.0 | 2.4★ | 0.0 | 0.0 | 1.6 | 0.8 | 0.8 | 0.0 | 0.0 | +| K2023-cooling | 0.8 | 0.0 | 0.0 | 0.0 | 0.0 | 1.6 | 1.6 | 0.0 | 0.0 | +| K2023-gas-turbines | 0.0 | 0.0 | 0.0 | 0.8 | 0.8 | 0.0 | 0.0 | 0.0 | 0.0 | +| K2023-grid-interconnect | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | +| K2023-nuclear | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.8 | 0.0 | 0.0 | 0.0 | +| K2023-picks-and-shovels | 0.0 | 0.0 | 0.0 | 3.3 | 0.0 | 0.0 | 0.0 | 2.6★ | 0.0 | +| K2023-transformers | 0.0 | 0.0 | 0.8 | 0.5 | 0.0 | 0.0 | 0.0 | 4.8★ | 0.0 | +| K2023-uranium | 0.0 | 0.0 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | +| K2023-utilities-repriced | 0.8 | 0.0 | 0.8 | 0.0 | 0.8 | 0.0 | 1.6 | 0.0 | 0.0 | + +### Detail — the acceleration sign-flips (why it's noisy) + +For the headline derivative and the root, the raw inputs at each as-of (conf=confirmed corroborating claims, src=distinct sources, eisc=independence-weighted count, a=acceleration/2nd-derivative, k_eff=distinct independent clusters): + + +**K2023** + +| as_of | score | cleared | conf | src | eisc | a | k_eff | +|---|---|---|---|---|---|---|---| +| 2023-03-01 | 0.00 | — | 0 | 0 | 0.0 | 0.0 | 0 | +| 2023-05-30 | 2.40 | YES | 6 | 4 | 3.0 | 1.0 | 2 | +| 2023-08-28 | 0.00 | — | 6 | 4 | 0.0 | -5.0 | 0 | +| 2023-11-26 | 0.00 | — | 6 | 4 | 0.0 | 3.0 | 0 | +| 2023-12-01 | 1.60 | — | 6 | 1 | 1.0 | 2.0 | 1 | +| 2024-02-24 | 0.80 | — | 7 | 4 | 1.0 | 1.0 | 1 | +| 2024-03-01 | 0.80 | — | 6 | 4 | 1.0 | 1.0 | 1 | +| 2024-05-24 | 0.00 | — | 9 | 6 | 1.6 | -0.4 | 1 | +| 
2024-08-22 | 0.00 | — | 10 | 7 | 1.0 | -1.2 | 1 | + +**K2023-picks-and-shovels** + +| as_of | score | cleared | conf | src | eisc | a | k_eff | +|---|---|---|---|---|---|---|---| +| 2023-03-01 | 0.00 | — | 0 | 0 | 0.0 | 0.0 | 0 | +| 2023-05-30 | 0.00 | — | 2 | 2 | 1.0 | -1.0 | 1 | +| 2023-08-28 | 0.00 | — | 2 | 2 | 0.0 | -1.0 | 0 | +| 2023-11-26 | 3.33 | — | 4 | 3 | 1.6 | 2.6 | 1 | +| 2023-12-01 | 0.00 | — | 0 | 0 | 0.0 | 0.0 | 0 | +| 2024-02-24 | 0.00 | — | 5 | 3 | 1.0 | -2.2 | 1 | +| 2024-03-01 | 0.00 | — | 0 | 0 | 0.0 | 0.0 | 0 | +| 2024-05-24 | 2.56 | YES | 10 | 5 | 2.0 | 1.6 | 1 | +| 2024-08-22 | 0.00 | — | 5 | 3 | 0.0 | -1.0 | 0 | + +**K2023-utilities-repriced** + +| as_of | score | cleared | conf | src | eisc | a | k_eff | +|---|---|---|---|---|---|---|---| +| 2023-03-01 | 0.80 | — | 1 | 1 | 1.0 | 1.0 | 1 | +| 2023-05-30 | 0.00 | — | 0 | 0 | 0.0 | 0.0 | 0 | +| 2023-08-28 | 0.80 | — | 1 | 1 | 1.0 | 1.0 | 1 | +| 2023-11-26 | 0.00 | — | 3 | 2 | 1.0 | -1.0 | 1 | +| 2023-12-01 | 0.77 | — | 4 | 2 | 1.6 | 0.6 | 1 | +| 2024-02-24 | 0.00 | — | 4 | 3 | 1.0 | 0.0 | 1 | +| 2024-03-01 | 1.60 | — | 7 | 4 | 2.0 | 1.0 | 1 | +| 2024-05-24 | 0.00 | — | 0 | 0 | 0.0 | 0.0 | 0 | +| 2024-08-22 | 0.00 | — | 16 | 7 | 2.286 | -1.714 | 1 | + +**K2023-nuclear** + +| as_of | score | cleared | conf | src | eisc | a | k_eff | +|---|---|---|---|---|---|---|---| +| 2023-03-01 | 0.00 | — | 6 | 4 | 1.0 | 0.0 | 1 | +| 2023-05-30 | 2.05 | — | 5 | 3 | 1.6 | 1.6 | 1 | +| 2023-08-28 | 0.00 | — | 10 | 7 | 1.0 | -7.0 | 1 | +| 2023-11-26 | 0.00 | — | 0 | 0 | 0.0 | 0.0 | 0 | +| 2023-12-01 | 0.00 | — | 0 | 0 | 0.0 | 0.0 | 0 | +| 2024-02-24 | 0.80 | — | 6 | 4 | 1.0 | 1.0 | 1 | +| 2024-03-01 | 0.00 | — | 2 | 2 | 0.0 | 0.0 | 0 | +| 2024-05-24 | 0.00 | — | 0 | 0 | 0.0 | 0.0 | 0 | +| 2024-08-22 | 0.00 | — | 12 | 4 | 1.0 | -2.0 | 1 | + +**K2023-transformers** + +| as_of | score | cleared | conf | src | eisc | a | k_eff | +|---|---|---|---|---|---|---|---| +| 2023-03-01 | 0.00 | — | 0 | 0 | 0.0 | 0.0 | 0 | 
+| 2023-05-30 | 0.00 | — | 0 | 0 | 0.0 | 0.0 | 0 | +| 2023-08-28 | 0.80 | — | 1 | 1 | 1.0 | 1.0 | 1 | +| 2023-11-26 | 0.48 | — | 4 | 2 | 1.0 | 0.6 | 1 | +| 2023-12-01 | 0.00 | — | 0 | 0 | 0.0 | 0.0 | 0 | +| 2024-02-24 | 0.00 | — | 4 | 2 | 0.0 | -1.0 | 0 | +| 2024-03-01 | 0.00 | — | 6 | 4 | 0.0 | -1.0 | 0 | +| 2024-05-24 | 4.80 | YES | 8 | 5 | 2.0 | 3.0 | 1 | +| 2024-08-22 | 0.00 | — | 8 | 5 | 1.6 | -1.6 | 1 | \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..b795651 --- /dev/null +++ b/README.md @@ -0,0 +1,135 @@ +# Ten31 Signal Engine (pilot) + +A recurring pipeline that ingests a growing corpus of audio (podcasts, YouTube) and text +(SEC filings, earnings-call transcripts), extracts structured **propositions**, and surfaces +**signal over time**, filtered through Ten31's thesis as a *relevance lens* — with every surfaced +signal logged as a **falsifiable prediction** so the system is scored against reality. + +**Spec / source of truth:** [`ten31-signal-engine-handoff.md`](ten31-signal-engine-handoff.md). It wins +over this README on any conflict. Section refs below (§) point into it. + +--- + +## The spine (do not violate — §2, §5) + +> **Statistics & graph structure NOMINATE candidates; the frontier model only JUDGES and FANS OUT a +> pre-filtered shortlist. The prediction ledger is the final arbiter.** + +- **Job A — Discovery (§1):** surface what Grant doesn't yet see — emergent themes (independent + cross-cluster *convergence*, scored on **acceleration** not size), credible contrarian minority + stances, and the prize: their **intersection** (a consensus about to flip). +- **Job B — Conviction-action gap (§1.1):** fan held convictions to 2nd/3rd-order derivatives and fire + `conviction(high) × exposure(low) × rising independent corroboration`. Countermeasure to the 2023 + "power is the binding constraint" call where the seed was held but derivatives under-acted. 
+- **Nominate-then-judge boundary:** geometry/stats emit a shortlist that already cleared a quantitative + bar; the frontier model judges/synthesizes/fans-out only. A fanned-out derivative is a **hypothesis, + not a signal**, until independent corpus corroboration confirms it (§2.3, §4.6). +- **The lens tags relevance; it must NOT gate truth (§5.7).** The engine must be able to surface an + accelerating signal *against* Ten31's thesis (the B1–B3 breakers). `thesis_seam` is a tag, never a filter. + +--- + +## Locked pilot decisions + +| Topic | Decision | Ref | +|---|---|---| +| **Packaging** | Ship as a **StartOS 0.4.0 s9pk** (operator call). Code is package-ready; TS wrapper generated once the daemon/UI/dependency contract stabilizes. | §10, §13 | +| **Earnings transcripts** | **FMP Ultimate** transcript API as backbone (audio isn't reliably fetchable: no uniform feed, ~30–90d replay expiry). EDGAR filings are the durable core. Self-transcribed audio deferred. | §4.1, §12 | +| **Conviction exposure** | Coarse **NAV bands**: `none / lt2 / 2to10 / gt10`. Grant sole editor, monthly review. | §3.1 | +| **Topic vocabulary** | **Hybrid**: seeded controlled list + emergent topics batch-merged on a schedule. | §4.2 | +| **Frontier model** | Default `claude-opus-4-8` (override via `FRONTIER_MODEL`). Scrubbed-vs-unscrubbed quality A/B is a pilot deliverable. 
| §4.6 | + +--- + +## Architecture (modules map to spec layers; in this commit the §4.5 independence and §4.7 ledger code live under `signals/`, and `frontier/` is not yet built) + +``` +ingest/ §4.1 RSS/YouTube/EDGAR/FMP fetch + scheduler; long-audio chunking; speaker stitch +extract/ §4.2 local LLM → 0..N structured claim units per chunk (willing to emit ZERO) +embedstore/ §4.3 embed DISTILLED propositions + client-side BM25 → Qdrant hybrid +signals/ §4.4 cluster acceleration · stance distributions · bridge edges · scoring +independence/ §4.5 source graph + voiceprint library → convergence discounting +frontier/ §4.6 judge · synthesis · conviction fan-out (scrub→frontier→rehydrate; SHORTLIST/SEEDS only) +ledger/ §4.7 dual-evaluation ledger + conviction log + earned credibility +spark/ §13 THE single chokepoint for all Spark Control HTTP (no other module knows the URL) +store/ SQLite schema + seed loaders +backfill/ §13.4 client-side GPU-hours queue (extraction = heavier serial load; audio sequential) +``` + +**Two invariants enforced in exactly one place each:** +- All gateway HTTP funnels through `spark/` — `spark/client.py` holds the base URL, the self-signed TLS + skip, 503 retry/backoff, and a process-wide **audio lock** (sequential audio, §4.1). +- The "model never sees Grant's rating first" rule (§6.7) is **structural**: ratings live in a separate + `human_evaluations` table; the model-facing code reads `ledger`, which has no `grant_rating` column. + +### Data stores +- **SQLite** (`store/schema.sql`): sources, documents, claims, topics, source_edges, voiceprints, + conviction_log, fanout_nodes, ledger, human_evaluations, backfill_jobs. The whole system state is a SELECT. +- **Qdrant** (via Spark Control): one `propositions` collection, **hybrid** dense `bge-m3` (1024-d) + + sparse BM25 (`Qdrant/bm25`, `modifier: idf`). Points are distilled propositions, never raw chunks.
+ +--- + +## Backfill queue (§13.4) — measured in GPU-hours, not real-time + +Scheduler jobs are **producers** (fetch/dedup/enqueue); a **single worker** drains the GPU queue one job +at a time → no parallel audio → no 503 by construction. Jobs are leased (crash-safe resume), idempotent on +`hash(content + prompt-version)`. Extraction is the binding load. Rough pilot estimate (~300 episodes + +25 companies): transcription ~6 GPU-h, **extraction ~90 GPU-h**, wall-clock ~4–6 days (audio on Spark 2 and +extraction on Spark 1 run in parallel). The queue self-calibrates from measured `gpu_seconds`. + +--- + +## Build order (§11) & status + +1. ✅ **Foundation** — config, `spark/` client, SQLite schema, conviction-log seed, **ledger scaffold live day one**. +2. ✅ **Ingestion + backfill queue** — backfill queue; EDGAR + FMP earnings; **audio path live-proven** (download → Parakeet transcribe + Sortformer diarize → align → voiceprint-stitch → speaker-attributed transcript); 20 podcast feeds resolved (`seeds/podcast_feeds.resolved.yaml`). Broad corpus ingested: **785 company docs (379 filings + 406 earnings), 802 extract jobs**. ⬜ Remaining: podcast audio backfill (RSS-full + dated YouTube pulls) + audio-cache cleanup; foreign-filer forms (20-F/6-K for CCJ/TSM/IREN). +3. ✅ **Extraction worker + §4.2 prompt** — **live** on text (earnings: 26 claims, insight-heavy) AND audio (Dwarkesh: 5 claims, correct speaker attribution). Backfill draining in background (`logs/extract-backfill.log`). ⬜ Refinement: target Item 7 MD&A for filings. +4. 🟡 **Embedding + storage** — ✅ Qdrant `propositions` hybrid collection (bge-m3 + BM25) **live**; hybrid search+rerank verified surfacing the power-infra theme. ⬜ embed the growing claim set; clustering. +5. 🟡 **Scoring brain** (`signal_engine/signals/`) — ✅ EISC independence primitive (verified), as-of harness, windowed acceleration, **under-acted-conviction (Job B)**, the quantitative bar, ledger writer, resolver stub, orchestrator. 
Speaker-name independence edges live. ⬜ Job A scorers (emergence/stance/intersection) deferred per blueprint. +6. ✅ **Source-independence graph** — voiceprint cosine + speaker-name edges feed EISC; "one guest doing the rounds" collapses to ~1 voice (verified). +7. 🟡 **§7.1 backtest** — runs end-to-end; **corpus-gated finding**: power-infra corroboration is currently single-cluster (energy companies), so it correctly won't clear the cross-cluster bar (k_eff≥2). Needs the cross-cluster podcast corpus (transcription backfill running). The scoring is disciplined, not broken. +8. ⬜ Frontier judge + synthesis + live fan-out (Anthropic key live; routed scrub→frontier→rehydrate) — deferred; v1 backtest uses a hand-written fan-out. +9. 🟡 **Web UI** (`signal_engine/ui/`, `serve`) — ✅ corpus management (dashboard, add/view sources, per-source claim inspection). ⬜ human-eval rating interface (§6.7) on the same app. +10. ⬜ **Run the §7.1 backtest FIRST**, then the forward pilot; disagreement analysis; scaling decision. + +**Scaling lever (post-pilot):** bulk extraction can offload to the Gemini batch API (operator can provide a key) while local Qwen stays default — public corpus only, never conviction/exposure data (§4). 
+ +--- + +## Running the foundation + +```bash +python3 -m venv .venv && .venv/bin/pip install -r requirements.txt +DATA_DIR=./data .venv/bin/python -m signal_engine init-db +DATA_DIR=./data .venv/bin/python -m signal_engine seed-convictions +DATA_DIR=./data .venv/bin/python -m signal_engine seed-convictions --file seeds/conviction_log.backtest-2023.seed.yaml +# When the gateway URL is provided: +SPARK_CONTROL_URL=https://SPARK_CONTROL_HOST .venv/bin/python -m signal_engine spark-status +``` + +**Pipeline + UI commands** (config from `.env`): +```bash +.venv/bin/python -m signal_engine seed-sources && .venv/bin/python -m signal_engine load-feeds +.venv/bin/python -m signal_engine ingest-edgar --ticker CEG --since 2023-01-01 --until 2023-12-31 +.venv/bin/python -m signal_engine ingest-earnings --ticker NVDA --since 2023-01-01 --until 2023-12-31 +.venv/bin/python -m signal_engine run-extract --limit 4 # local-LLM claim extraction +.venv/bin/python -m signal_engine embed-claims # → Qdrant hybrid collection +.venv/bin/python -m signal_engine search --query "power is the binding constraint on AI" +.venv/bin/python -m signal_engine serve # corpus UI → http://localhost:8000 +``` + +Config is all env-driven (`signal_engine/config.py`) so the same code runs as a plain process now and as a +StartOS daemon later (injected via the s9pk `store.json` FileModel). + +--- + +## Operator-provided items (block runtime, not the foundation) + +- **Spark Control LAN base URL + TLS cert** (same-LAN self-signed → `SPARK_VERIFY_TLS=false`). +- **Confirm the exact §13.2 model strings** (currently defaulted: `qwen3.6-35b-a3b-nvfp4`, `bge-m3`, `parakeet-tdt-0.6b`). +- **`ANTHROPIC_API_KEY`** (frontier step) and **`FMP_API_KEY`** (earnings transcripts) — or defer FMP and start filings-only. +- **§7.1 backtest corpus:** which 2023-era podcast archives + filings we can actually fetch (the gating risk).
+- **StartOS host CPU arch** (drives which `.s9pk` to install — the host, not the ARM Sparks) and Spark Control's + health-check IDs + outbound-gateway = clearnet (not Tor) — verified against a live box at packaging time. +``` diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3867bde --- /dev/null +++ b/requirements.txt @@ -0,0 +1,27 @@ +# Ten31 Signal Engine — pilot dependencies, grouped by pipeline layer. +# Foundation (init-db, seed, spark client) needs only: requests, PyYAML, numpy. + +# --- core / foundation --- +requests>=2.31 +PyYAML>=6.0 +numpy>=1.26 + +# --- ingestion (§4.1) --- +feedparser>=6.0 # podcast RSS (conditional GET via etag/modified) +yt-dlp>=2025.1 # YouTube audio (needs a PO-token provider sidecar; see README) +edgartools>=3.0 # SEC EDGAR filings (sets UA, throttles ≤10 rps) +# FMP earnings transcripts are plain REST via `requests` (no SDK) + +# --- scheduling + queue (§13.4) --- +APScheduler>=3.10 + +# --- embeddings/vectors (§4.3) — vectors live in Qdrant behind Spark Control --- +qdrant-client>=1.12 +fastembed>=0.4 # client-side BM25 sparse vectors (Qdrant/bm25, modifier: idf) + +# --- frontier (§4.6), bounded final step --- +anthropic>=0.40 + +# --- eval UI (§4.7 / human eval) --- +fastapi>=0.110 +uvicorn>=0.29 diff --git a/run_strike_pipeline.sh b/run_strike_pipeline.sh new file mode 100755 index 0000000..f3af723 --- /dev/null +++ b/run_strike_pipeline.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Queued Strike pipeline: waits for the 4-show transcription to finish, then extracts → embeds → +# runs the STRIKE2022 two-sided reflexivity test (live vs test). Robust: proceeds with whatever is +# transcribed if the worker dies, and the 24h cap is a backstop. All work persists in the DB, so a +# crash mid-run is resumable by re-running run-extract / embed-claims / two-sided by hand. 
+set -u +cd /Users/macpro/Projects/ten31-signal-engine +PY=.venv/bin/python +LOG=data/strike_pipeline.log +SHOWS="('pod-whatbitcoindid','pod-stephanlivera','pod-kevinrooke','pod-anitaposch')" +say(){ echo "[$(date '+%H:%M:%S')] $*" | tee -a "$LOG"; } + +say "Strike pipeline QUEUED — waiting for transcription of the 4 independent shows to finish." + +# 1) Wait for transcription completion (remaining=0) OR the transcribe worker dying. 24h backstop. +for i in $(seq 1 480); do + R=$(sqlite3 data/signal.db "SELECT COUNT(*) FROM backfill_jobs j JOIN documents d ON j.target_id=d.doc_id WHERE j.job_type='transcribe' AND j.state IN ('pending','running','leased') AND d.source_id IN $SHOWS;") + D=$(sqlite3 data/signal.db "SELECT COUNT(*) FROM backfill_jobs j JOIN documents d ON j.target_id=d.doc_id WHERE j.job_type='transcribe' AND j.state='done' AND d.source_id IN $SHOWS;") + ALIVE=$(ps aux | grep -cE "[r]un-transcribe") # matches run-transcribe AND run-transcribe-gemini + say "transcribe: remaining=$R done=$D worker_alive=$ALIVE (poll $i)" + if [ "$R" = "0" ]; then say "transcription COMPLETE."; break; fi + if [ "$ALIVE" = "0" ]; then say "transcribe worker not alive and work remains ($R) — proceeding with partial corpus."; break; fi + sleep 180 +done + +# 2) Prioritize bitcoin-cluster podcast extract jobs (the independent legs + the 19 TFTC for test-mode contrast) +say "prioritizing + extracting bitcoin-podcast claims (local Qwen on the now-free Spark)..." +sqlite3 data/signal.db "UPDATE backfill_jobs SET priority=8 WHERE job_type='extract' AND state='pending' AND parent_doc_id IN (SELECT d.doc_id FROM documents d JOIN sources s ON d.source_id=s.source_id WHERE s.source_cluster='bitcoin' AND s.kind='podcast');" + +# 3) Extract (priority-8 podcasts drain first). Loop in batches so a transient gateway hiccup doesn't end it. 
+for pass in 1 2 3 4 5 6; do + PEND=$(sqlite3 data/signal.db "SELECT COUNT(*) FROM backfill_jobs j JOIN documents d ON j.target_id=d.doc_id JOIN sources s ON d.source_id=s.source_id WHERE j.job_type='extract' AND j.state='pending' AND s.source_cluster='bitcoin' AND s.kind='podcast';") + say "extract pass $pass: $PEND bitcoin-podcast extract jobs pending" + [ "$PEND" = "0" ] && break + $PY -m signal_engine run-extract --limit 250 --max-chunks 4 2>&1 | grep -vE "httpx" | tail -3 | tee -a "$LOG" +done + +# 4) Embed all pending claims → Qdrant +say "embedding claims..." +$PY -m signal_engine embed-claims 2>&1 | grep -vE "httpx|HF_TOKEN|huggingface|show_warning|Fetching|files:" | tail -3 | tee -a "$LOG" + +# 5) STRIKE2022 two-sided: live (own_network TFTC/CD/RHR dropped) vs test (kept) — the reflexivity contrast +say "=== STRIKE2022 TWO-SIDED RESULT (live vs test) ===" +$PY -m signal_engine two-sided --conviction STRIKE2022 --modes live,test \ + --dates 2022-12-31,2023-06-30,2023-12-31 --window-days 180 2>&1 \ + | grep -vE "httpx|HF_TOKEN|huggingface|show_warning|Fetching|files:" | tee -a "$LOG" +say "Strike pipeline DONE." diff --git a/scripts/sharpen.py b/scripts/sharpen.py new file mode 100644 index 0000000..2f1e637 --- /dev/null +++ b/scripts/sharpen.py @@ -0,0 +1,34 @@ +"""Autonomous sharpening pass: wait for the cross-cluster podcast claims to extract, re-embed, then +re-run the §7.1 backtest. 
Run in the background; writes logs/backtest2.log for review.""" +import sqlite3 +import subprocess +import sys +import time + +DB = "data/signal.db" +PY = ".venv/bin/python" + + +def pending_podcast_extract() -> int: + return sqlite3.connect(DB).execute( + "SELECT COUNT(*) FROM backfill_jobs WHERE job_type='extract' AND state='pending' " + "AND target_id LIKE 'pod:%'" + ).fetchone()[0] + + +for i in range(60): # up to ~2h + p = pending_podcast_extract() + print(f"[sharpen] iter {i}: podcast extract pending={p}", flush=True) + if p <= 2: + break + time.sleep(120) + +print("[sharpen] embedding accumulated claims...", flush=True) +subprocess.run([PY, "-m", "signal_engine", "embed-claims"], stdout=sys.stdout, stderr=subprocess.STDOUT) + +print("[sharpen] re-running backtest...", flush=True) +with open("logs/backtest2.log", "w") as f: + subprocess.run([PY, "-m", "signal_engine", "backtest", "--conviction", "K2023", + "--start", "2023-03-01", "--end", "2024-09-01", "--step-days", "90", + "--window-days", "90"], stdout=f, stderr=subprocess.STDOUT) +print("[sharpen] DONE — see logs/backtest2.log", flush=True) diff --git a/seeds/battery_docs.manifest.yaml b/seeds/battery_docs.manifest.yaml new file mode 100644 index 0000000..64b00e4 --- /dev/null +++ b/seeds/battery_docs.manifest.yaml @@ -0,0 +1,34 @@ +# Battery corpus documents to fetch (verified FREE+FETCHABLE, manifest workflow w7559rp1x). +# Loaded by: signal_engine ingest-doc-manifest --file seeds/battery_docs.manifest.yaml +# Each → documents row (kind=filing) + extract job. method auto-detects html/pdf; override where needed. 
+docs: + # ===== SUPPLY: in-window deployed-dollar disclosures (the only true supply measures) ===== + - {source: bat-galaxy, method: html, date: "2024-01-01", title: "Galaxy Research — The State of Crypto Lending", url: "https://www.galaxy.com/insights/research/the-state-of-crypto-lending"} + - {source: bat-galaxy, method: pdf, date: "2024-01-01", title: "Galaxy Research — State of Crypto Lending (full PDF)", url: "https://assets.ctfassets.net/h62aj7eo1csj/4vkA9567QmK4pyYoPBtrQa/fb039fd97d657d8151dcf4d3e969e481/The_State_of_Crypto_Lending_-_Galaxy_Research.pdf"} + - {source: co-coin, method: html, date: "2024-02-15", title: "Coinbase Q4'23 Shareholder Letter ($399M loans / $546M credit)", url: "https://www.sec.gov/Archives/edgar/data/1679788/000167978824000019/shareholderletterq42023.htm"} + - {source: bat-galaxy, method: html, date: "2024-11-08", title: "Galaxy Q3'24 results (avg loan book $863M)", url: "https://www.newswire.ca/news-releases/galaxy-announces-third-quarter-2024-financial-results-837407031.html"} + - {source: bat-9fin, method: html, date: "2024-01-01", title: "9fin — Bitcoin-backed direct lending brings crypto to private credit", url: "https://www.9fin.com/insights/bitcoin--backed-direct-lending-brings-crypto-to-private"} + - {source: bat-coinspeaker, method: html, date: "2023-09-01", title: "Coinbase to Launch Crypto Lending for Institutions (>$57M, Reg D)", url: "https://www.coinspeaker.com/coinbase-crypto-lending-institutional/"} + # ===== SUPPLY: announced-not-funded boundary anchors (the binding negative) ===== + - {source: bat-cantor, method: html, date: "2024-07-27", title: "Cantor Fitzgerald to Launch Bitcoin Financing Business ($2B ANNOUNCED)", url: "https://www.prnewswire.com/news-releases/cantor-fitzgerald-to-launch-bitcoin-financing-business-302208093.html"} + - {source: bat-falconx, method: html, date: "2025-05-27", title: "FalconX Closes First Bitcoin-Backed Financing from Cantor (FUNDED, out-of-window)", url: 
"https://www.falconx.io/newsroom/falconx-closes-first-bitcoin-backed-financing-from-cantor"} + - {source: bat-maple, method: html, date: "2025-05-27", title: "Maple Closes First Cantor Tranche (FUNDED, out-of-window)", url: "https://maple.finance/insights/maple-cantor"} + - {source: bat-coindesk, method: html, date: "2025-05-27", title: "CoinDesk — Cantor Debuts Bitcoin Lending, First Tranches to FalconX/Maple", url: "https://www.coindesk.com/business/2025/05/27/wall-street-giant-cantor-debuts-bitcoin-lending-business-with-first-tranches-to-falconx-maple"} + - {source: bat-twoprime, method: html, date: "2023-11-08", title: "Two Prime sees $2B demand for bitcoin-backed loans (zero deployed disclosed)", url: "https://www.coindesk.com/markets/2023/11/08/investment-advisor-two-prime-sees-2b-in-demand-for-bitcoin-backed-loans"} + - {source: bat-battery, method: html, date: "2024-11-25", title: "Newmarket Launches Battery Finance (1 funded deal; no pooled committed capital)", url: "https://www.newswire.com/news/newmarket-launches-battery-finance-a-pioneering-asset-management-and-22469517"} + - {source: bat-bitcoinmag, method: html, date: "2024-11-25", title: "Bitcoin Magazine — Newmarket Launches Battery Finance (interview)", url: "https://bitcoinmagazine.com/business/newmarket-capital-launches-battery-finance-bitcoin-collateralized-loan-strategy-"} + # ===== DEMAND: originator disclosures ===== + - {source: bat-unchained, method: html, date: "2024-01-01", title: "Unchained — Announcing Institutional Lending (>$500M cumulative)", url: "https://www.unchained.com/blog/announcing-institutional-lending"} + - {source: bat-ledn, method: html, date: "2023-12-31", title: "Ledn Open Book — Dec 2023", url: "https://www.ledn.io/post/lednopenbook-dec23"} + # ===== MARKET DATA (independent) ===== + - {source: bat-asr, method: html, date: "2022-04-01", title: "Asset Securitization Report — Milo crypto-mortgage (BTC collateral)", url: 
"https://asreport.americanbanker.com/news/crypto-mortgage-product-allows-borrowers-to-post-bitcoin-as-collateral"} + # ===== POLICY CONTEXT (axis=context — weight 0 into supply) ===== + - {source: bat-sec, method: html, date: "2022-04-11", title: "SEC SAB-121 (govinfo full text)", url: "https://www.govinfo.gov/content/pkg/FR-2022-04-11/html/2022-07196.htm"} + - {source: bat-fed, method: html, date: "2022-08-16", title: "Fed SR 22-6 / CA 22-6 — crypto-asset activities (press release)", url: "https://www.federalreserve.gov/newsevents/pressreleases/bcreg20220816a.htm"} + - {source: bat-fed, method: pdf, date: "2022-08-16", title: "Fed SR 22-6 — letter PDF (loans collateralized by crypto-assets)", url: "https://www.federalreserve.gov/newsevents/pressreleases/files/bcreg20250424a3.pdf"} + - {source: bat-fed, method: pdf, date: "2023-08-08", title: "Fed SR 23-7 — Novel Activities Supervision Program (PDF)", url: "https://www.federalreserve.gov/newsevents/pressreleases/files/bcreg20250815a1.pdf"} + - {source: bat-fdic, method: html, date: "2022-04-07", title: "FDIC FIL-16-2022 — crypto prior-notification", url: "https://www.fdic.gov/news/inactive-financial-institution-letters/2022/fil22016.html"} + - {source: bat-occ, method: pdf, date: "2021-11-23", title: "OCC Interpretive Letter 1179 — supervisory non-objection gate", url: "https://www.occ.gov/topics/charters-and-licensing/interpretations-and-decisions/2021/int1179.pdf"} + - {source: bat-occ, method: pdf, date: "2020-07-22", title: "OCC Interpretive Letter 1170 — national-bank crypto custody", url: "https://www.occ.gov/topics/charters-and-licensing/interpretations-and-decisions/2020/int1170.pdf"} + # ===== Terminal-bracket policy (2025, out-of-window CONTEXT: the unblock that produced no capital) ===== + - {source: bat-sec, method: html, date: "2025-01-23", title: "SEC SAB-122 (rescission of SAB-121)", url: "https://www.federalregister.gov/documents/2025/01/30/2025-01864/staff-accounting-bulletin-no-122"} diff --git 
a/seeds/conviction_log.adversarial.seed.yaml b/seeds/conviction_log.adversarial.seed.yaml new file mode 100644 index 0000000..55adb4b --- /dev/null +++ b/seeds/conviction_log.adversarial.seed.yaml @@ -0,0 +1,23 @@ +# ADVERSARIAL failed-conviction cases (DESIGN_v2 §1.4) — the NEGATIVES the eval was missing. +# Both are convictions Ten31 ACTUALLY held where the FALSIFIABLE THESIS mis-fired (on mechanism / +# timing) even though Ten31 remains bullish on the companies. That is the most honest kind of negative +# — and the kind we are most tempted to grade leniently, which is why the resolution criteria are +# pre-registered (seeds/resolution.*.yaml) before any outcome-labeling. Let the criteria be ugly. +convictions: + - id: STRIKE2022 + seam: debasement_bitcoin + conviction_level: high + current_exposure: lt2 # TEST parameterization (so the scorer can fire) — NOT Strike's real exposure (~40% of book) + exposure_note: "test param; real Strike exposure is HIGH (largest position). The 2022 PAYMENTS thesis is the falsifiable negative." + thematic_proposition: "Bitcoin + the Lightning network becomes a retail payments network that materially disrupts the card rails (Visa/Mastercard/Amex); Lightning retail acceptance and merchant-payment volume scale." + team_conviction_note: "Ten31 led Strike's Series B (2022) on this thesis; STILL high conviction in Strike the company. But the 2022 payments-network thesis FAILED — Strike succeeded by becoming a bitcoin financial-services platform (exchange + BTC-collateralized lending), which the memo barely names. The failed thesis was NARRATIVE-driven (the whole bitcoin-podcast cluster told 'Lightning eats retail payments' loudly in 2022) — the exact single-cluster, reflexive, talk-our-own-book corroboration the relaxed gate is most likely to wave through." 
+ disconfirming_signal: "Retail Lightning payments fail to materialize at scale; card-network volumes show no erosion; Strike's growth comes from exchange/lending not merchant payments." + + - id: BATTERY2022 + seam: debasement_bitcoin + conviction_level: high + current_exposure: lt2 + exposure_note: "Battery Finance position" + thematic_proposition: "Bitcoin-as-collateral credit goes mainstream: institutional/incumbent capital funds bitcoin-collateralized lending at scale within 24-36 months; >=1 major traditional institution enters. (= conviction D1 + R3.)" + team_conviction_note: "Ten31 invested in Battery Finance (2022); STILL high conviction. The thesis was RIGHT on the demand side (borrower appetite for BTC-collateralized credit) and EARLY/wrong on the SUPPLY side (Battery could not raise institutional lending capital at scale). A direct, datable instance of D1/R3 not arriving on schedule — the instructive two-sided test (demand rising, supply flat)." + disconfirming_signal: "Institutional/bank capital for BTC-collateralized credit stays scarce; no major incumbent entry; the supply side stalls while demand rises." diff --git a/seeds/conviction_log.backtest-2023.seed.yaml b/seeds/conviction_log.backtest-2023.seed.yaml new file mode 100644 index 0000000..8312f0f --- /dev/null +++ b/seeds/conviction_log.backtest-2023.seed.yaml @@ -0,0 +1,21 @@ +# Backtest seed — handoff §7.1 (the HEADLINE pilot validation, run BEFORE the forward pilot). +# +# Seed ONLY the ~2023 Kirkwood conviction, run the pipeline over a period-correct ~2023 corpus, and +# check whether the under-acted-conviction signal surfaces the derivative: +# "size up the power-infrastructure picks-and-shovels of the buildout" +# (grid interconnect, transformers, substations, cooling, gas turbines, nuclear, uranium, public +# picks-and-shovels). A clear yes/no on that derivative is the strongest validation the system +# does the job Ten31 actually needs (§1.1, §7.1). 
+# +# CRITICAL (§6.6 look-ahead guard): consensus, embeddings, and corroboration must be computed +# AS-OF the log date — never with knowledge that the theme was real in absolute terms. + +convictions: + - id: K2023 + seam: energy_compute + conviction_level: high + current_exposure: lt2 # the historical reality: seed conviction held, derivatives under-acted + exposure_note: "Seed conviction held in ~2023; derivative branches NOT systematically sized into — the failure §1.1 exists to prevent." + thematic_proposition: "Bitcoin mining and AI are both 'distributed compute'; interruptible/flexible load is the differentiator; the world will need to ~1000x rack space over the decade, so power becomes the binding constraint." + team_conviction_note: "Jonathan Kirkwood, publicly articulated ~2023. Root call correct and early; the MISS was the derivative tree, not the prediction." + disconfirming_signal: "Compute demand growth stalls; power clears without becoming the binding constraint." diff --git a/seeds/conviction_log.seed.yaml b/seeds/conviction_log.seed.yaml new file mode 100644 index 0000000..37908b0 --- /dev/null +++ b/seeds/conviction_log.seed.yaml @@ -0,0 +1,147 @@ +# Conviction log seed — handoff §3.1. HUMAN-OWNED: Grant edits this file; `seed-convictions` upserts it. +# +# Structural rule (§3.1): `thematic_proposition` is the TRACKABLE half the corpus can corroborate and +# that gets fanned out and scored. `team_conviction_note` is context ONLY — the engine must never +# present theme corroboration as validation of the team bet beneath it. +# +# `current_exposure` is a coarse NAV band (operator decision): none | lt2 | 2to10 | gt10 | unset. +# The v1 levels below are the §3.1 draft; exposure is left `unset` with the original prose preserved in +# `exposure_note` — Grant to FINALIZE the NAV bands (§12 governance item). 
+ +convictions: + # ---------- ROOT (the forcing function) ---------- + - id: R1 + seam: root + conviction_level: high + current_exposure: unset + exposure_note: "pervasive" + thematic_proposition: "Sovereign debt keeps being monetized not repaid; fiat debasement persists; bitcoin is adopted as the neutral non-debasable reserve capital migrates to." + disconfirming_signal: "Durable fiscal surpluses + falling debt/GDP + no reserve diversification." + + - id: R2 + seam: root + conviction_level: high + current_exposure: unset + exposure_note: "thesis-wide" + thematic_proposition: "AI drives the marginal cost of the reproducible toward zero; value accrues to the scarce/verifiable; bitcoin gains relative share as the 'strongest horse'; pricing-in-bitcoin grows." + disconfirming_signal: "Scarce/verifiable assets earn no premium as AI content saturates." + + - id: R3 + seam: root + conviction_level: med-high + current_exposure: unset + exposure_note: "pervasive (esp. custody/credit names)" + thematic_proposition: "Strategic bitcoin reserves (US/nation-states), SAB-121 repeal enabling bank custody, and ETF/treasury inflows create a price-inelastic bid and invert allocator career risk." + disconfirming_signal: "Reserve plans stall or reverse; banks stay out; policy turns adversarial." + + # ---------- ENERGY <-> COMPUTE ---------- + - id: E1 + seam: energy_compute + conviction_level: high + current_exposure: unset + exposure_note: "MED-HIGH (Giga, Satoshi Energy)" + thematic_proposition: "Power, not chips, is the binding constraint on AI buildout through ~2027-28; the seam picks-and-shovels are under-priced." + team_conviction_note: "Giga, Satoshi Energy." + disconfirming_signal: "Chips/capital remain the bottleneck; interconnect clears fast." 
+ + - id: E2 + seam: energy_compute + conviction_level: high + current_exposure: unset + exposure_note: "MED (Giga power-market optimization, Satoshi)" + thematic_proposition: "The miner flexible-load playbook (demand response, behind-the-meter) goes mainstream for AI data centers and grids; mining fluency is a transferable underwriting edge." + disconfirming_signal: "Data centers reject flexible load; the fluency proves non-transferable." + + - id: E3 + seam: energy_compute + conviction_level: med + current_exposure: unset + exposure_note: "Giga (straddle) vs Upstream (mining-only)" + thematic_proposition: "Mining-native operators that pivot into / straddle AI/HPC capture the convergence; mining-only underperforms." + team_conviction_note: "Deliberately low-conviction seed — engine should help resolve." + disconfirming_signal: "Pure-play mining outperforms straddlers." + + # ---------- DEBASEMENT <-> BITCOIN ---------- + - id: D1 + seam: debasement_bitcoin + conviction_level: high + current_exposure: unset + exposure_note: "HIGH (Strike; Battery, Unchained, debifi, AnchorWatch)" + thematic_proposition: "Bitcoin-as-collateral goes mainstream: new BTC-collateralized credit products proliferate, spreads compress, and >=1 major traditional institution enters within 24-36 months. As products mature, holders borrow rather than sell, shrinking marginal supply." + disconfirming_signal: "Stays a crypto-native niche; no incumbent entry; spreads hold." + + - id: D2 + seam: debasement_bitcoin + conviction_level: high + current_exposure: unset + exposure_note: "portfolio-wide" + thematic_proposition: "Incumbents buy, not build: legacy finance/tech acquires bitcoin-natives rather than building in-house (the published exit thesis)." + disconfirming_signal: "Incumbents build in-house or via crypto-generalists; no strategic M&A." 
+ + - id: D3 + seam: debasement_bitcoin + conviction_level: med-high + current_exposure: unset + exposure_note: "enablers (Fold, AnchorWatch, Giga/Upstream)" + thematic_proposition: "Bitcoin commercialization of legacy operating businesses: compressed-multiple firms become structurally advantaged when rearchitected around bitcoin (treasury, settlement, self-hosted infra, stranded energy)." + disconfirming_signal: "Legacy adoption stalls; no margin advantage." + + - id: D4 + seam: debasement_bitcoin + conviction_level: high + current_exposure: unset + exposure_note: "HIGH (largest position, ~40%)" + thematic_proposition: "Strike re-rates as a bitcoin bank, not payments: market values it as exchange + major retail BTC-collateralized lender + global access (70+ jurisdictions), not legacy payments." + team_conviction_note: "Team conviction high; tracked SEPARATELY from the thematic re-rating." + disconfirming_signal: "Stays valued/stuck as payments; lending/exchange don't scale." + + # ---------- AI <-> DATA-OWNERSHIP (PRIME under-acted-conviction target) ---------- + - id: A1 + seam: ai_data_ownership + conviction_level: high + current_exposure: unset + exposure_note: "LOW (Start9, OpenSecret/Maple, maybe Primal; small checks)" + thematic_proposition: "Owned judgment is the last margin: AI commoditizes competence and profit on undifferentiated output erodes toward zero, so durable margin needs owned/protected proprietary data + judgment; demand grows for sovereign-root + confidential-inference infra." + disconfirming_signal: "Enterprises cede data/inference with no margin penalty." + + - id: A2 + seam: ai_data_ownership + conviction_level: med + current_exposure: unset + exposure_note: "LOW" + thematic_proposition: "The segment that can't cede (regulated, IP-sensitive, adversarial jurisdictions) adopts owned infra + confidential inference even as the majority cedes to convenience." + disconfirming_signal: "Even the IP-sensitive segment fully cedes." 
+ + - id: A3 + seam: ai_data_ownership + conviction_level: low + current_exposure: unset + exposure_note: "LOW" + thematic_proposition: "Start9 broadens beyond the bitcoiner niche (SaaS -> on-prem reversion)." + team_conviction_note: "Explicitly uncertain — team high, theme unproven ('maybe drinking our own koolaid, tbd'). Low-conviction seed the engine should help resolve." + disconfirming_signal: "Stays bitcoiner-niche." + + # ---------- MONITORED THESIS-BREAKERS (engine must surface these AGAINST the thesis, §5.7) ---------- + - id: B1 + seam: root + is_thesis_breaker: true + conviction_level: low + current_exposure: unset + thematic_proposition: "Quantum acceleration compresses CRQC timelines inside NIST 2035 before mitigations deploy (bitcoin-leg breaker)." + disconfirming_signal: "n/a — this is a breaker the engine monitors FOR, not against." + + - id: B2 + seam: energy_compute + is_thesis_breaker: true + conviction_level: low + current_exposure: unset + thematic_proposition: "AI permanently outbids mining for power, pushing mining to only truly-stranded margin (energy-leg breaker)." + disconfirming_signal: "n/a — breaker the engine monitors FOR." + + - id: B3 + seam: debasement_bitcoin + is_thesis_breaker: true + conviction_level: low + current_exposure: unset + thematic_proposition: "Stablecoins/CBDCs capture the neutral-reserve role, or bitcoin fails as the exit (tests the complementary-stablecoin view)." + disconfirming_signal: "n/a — breaker the engine monitors FOR." diff --git a/seeds/fanout.BATTERY2022.seed.yaml b/seeds/fanout.BATTERY2022.seed.yaml new file mode 100644 index 0000000..26ff0ae --- /dev/null +++ b/seeds/fanout.BATTERY2022.seed.yaml @@ -0,0 +1,17 @@ +# Hand-written fan-out for the BATTERY 2022 bitcoin-as-collateral-credit thesis (D1) — two-sided test. +# The instructive split: DEMAND derivatives expected RISING (confirming), SUPPLY derivatives expected +# FLAT (the disconfirmation / timing-early signal the two-sided scorer should surface). 
+parent_conviction_id: BATTERY2022 +nodes: + - node_id: BATTERY-demand-borrower-appetite + distance_from_edge: in_mandate + derivative_proposition: "Borrower demand for bitcoin-collateralized credit is rising (new products, origination growth, fund formations)." + - node_id: BATTERY-institutional-supply + distance_from_edge: in_mandate + derivative_proposition: "Named institutional capital PROVIDERS have actually DEPLOYED significant dollars (not merely announced or made-available) to FUND third-party bitcoin-collateralized loans at scale. (Capital-provider side only — a firm posting bitcoin as collateral to RECEIVE a loan is borrower-side demand, not supply; an announced/planned program is not deployed capital.)" + - node_id: BATTERY-incumbent-entry + distance_from_edge: in_mandate + derivative_proposition: "A major traditional financial institution has entered bitcoin-collateralized lending (D1's explicit milestone)." + - node_id: BATTERY-custody-policy-enablement + distance_from_edge: one_hop + derivative_proposition: "Regulatory and custody developments (SAB-121 repeal, bank custody clearance) are enabling institutional bitcoin credit." diff --git a/seeds/fanout.K2023.seed.yaml b/seeds/fanout.K2023.seed.yaml new file mode 100644 index 0000000..8631153 --- /dev/null +++ b/seeds/fanout.K2023.seed.yaml @@ -0,0 +1,22 @@ +# Hand-written fan-out for the §7.1 backtest (build-order step 4). These are the 2nd/3rd-order +# derivatives the frontier WOULD have generated from the 2023 Kirkwood conviction (K2023). Hand-writing +# them for v1 removes the frontier dependency from the FIRST backtest and isolates the real question: +# does the SCORING surface the derivative once it exists? (The §1.1 derivative tree, verbatim intent.) 
+parent_conviction_id: K2023 +nodes: + - node_id: K2023-grid-interconnect + derivative_proposition: "Electrical grid interconnection capacity becomes the gating constraint on new data-center and compute load; interconnect queue times and grid upgrade costs blow out." + - node_id: K2023-transformers + derivative_proposition: "Demand for large power transformers and electrical equipment outstrips supply; lead times extend dramatically and pricing rises." + - node_id: K2023-nuclear + derivative_proposition: "Nuclear power — existing plants, long-term PPAs, SMRs — is repriced upward as firm clean baseload to power AI data centers." + - node_id: K2023-gas-turbines + derivative_proposition: "Natural-gas turbines and on-site/behind-the-meter generation see surging demand to power data centers where the grid cannot deliver in time." + - node_id: K2023-cooling + derivative_proposition: "Data-center cooling and thermal management (liquid cooling) demand accelerates as compute power density rises." + - node_id: K2023-uranium + derivative_proposition: "Uranium and nuclear-fuel supply tightens as nuclear demand to power compute rises." + - node_id: K2023-utilities-repriced + derivative_proposition: "Independent power producers and utilities with firm generation are repriced upward as data-center electricity demand surges." + - node_id: K2023-picks-and-shovels + derivative_proposition: "The public picks-and-shovels of the AI power buildout — power infrastructure, equipment, contracts, and generation serving data centers — are systematically under-priced and should be sized up." diff --git a/seeds/fanout.STRIKE2022.seed.yaml b/seeds/fanout.STRIKE2022.seed.yaml new file mode 100644 index 0000000..5fb6225 --- /dev/null +++ b/seeds/fanout.STRIKE2022.seed.yaml @@ -0,0 +1,15 @@ +# Hand-written fan-out for the STRIKE 2022 payments-network thesis (adversarial negative). +# distance_from_edge: tag for TRIAGE only (DESIGN_v2.1) — NEVER a filter. These are all in_mandate +# (core bitcoin). 
The test: does the engine CLEAR any of these on bitcoin-cluster reflexive chatter +# when retail Lightning payments never materialized? Each clear = a candidate FALSE POSITIVE. +parent_conviction_id: STRIKE2022 +nodes: + - node_id: STRIKE-lightning-retail-acceptance + distance_from_edge: in_mandate + derivative_proposition: "Lightning-based retail payment acceptance is accelerating across major merchants and point-of-sale systems." + - node_id: STRIKE-card-rail-disruption + distance_from_edge: in_mandate + derivative_proposition: "Card-network payment volume and interchange economics are being disrupted by bitcoin/Lightning payment rails." + - node_id: STRIKE-merchant-lightning-integration + distance_from_edge: in_mandate + derivative_proposition: "Major merchants and processors are integrating Lightning for retail payments at scale (NCR / Shopify / Blackhawk-class), beyond pilots." diff --git a/seeds/podcast_feeds.resolved.yaml b/seeds/podcast_feeds.resolved.yaml new file mode 100644 index 0000000..e4e0910 --- /dev/null +++ b/seeds/podcast_feeds.resolved.yaml @@ -0,0 +1,33 @@ +# Resolved + VERIFIED podcast feeds (background research, 2026-06-07). Loaded via `load-feeds`. +# backtest_2022_2023 = how to reach the §7.1 window (2022-2023) for this show: +# rss_full — the RSS feed itself carries the full back-catalog into 2022-2023 +# rss_2023_only — show launched in 2023; RSS is full from launch (no 2022 by design) +# youtube_only — RSS is a truncated rolling window; 2022-2023 must come from YouTube/site archive +# launched_later — show didn't exist in 2022-2023 (no backtest contribution) +# unavailable — no owned audio feed at all +# +# KEY FINDING: most podcast RSS feeds are rolling windows (~15-25 eps). For the backtest, the clean +# RSS-back-catalog shows (Hidden Forces, Dwarkesh, What Bitcoin Did, All-In, Invest Like the Best) +# anchor the 2022-2023 podcast leg; truncated shows need yt-dlp channel pulls with --dateafter/--datebefore. 
+ +feeds: + - {id: pod-oddlots, rss_url: "https://www.omnycontent.com/d/playlist/e73c998e-6e60-432f-8610-ae210140c5b1/8a94442e-5a74-4fa2-8b8d-ae27003a8d6b/982f5071-765c-403d-969d-ae27003a8d83/podcast.rss", youtube_channel_url: "https://www.youtube.com/playlist?list=PLe4PRejZgr0MuA6M0zkZyy-99-qc87wKV", backtest_2022_2023: youtube_only, note: "Omny RSS truncated to ~15 recent; show since 2015. 2022-2023 via YouTube/Apple/Spotify or Bloomberg YT."} + - {id: pod-forwardguidance, rss_url: "https://feeds.megaphone.fm/forwardguidance", youtube_channel_url: "https://www.youtube.com/@ForwardGuidanceBW", backtest_2022_2023: youtube_only, note: "RSS truncated. Hosts the 2022-2023 Jack Farley macro content — KEY backtest source via YouTube @ForwardGuidanceBW."} + - {id: pod-macrovoices, rss_url: "https://feed.podbean.com/macrovoices/feed.xml", youtube_channel_url: "https://www.youtube.com/@macrovoices7508", backtest_2022_2023: youtube_only, note: "RSS ~25 recent; weekly since 2016 (energy-heavy). 2022-2023 via macrovoices.com archive (29 pages) or YouTube."} + - {id: pod-grantwilliams, rss_url: "https://feed.podbean.com/ttmygh/feed.xml", youtube_channel_url: "https://www.youtube.com/@GWTTMYGH", backtest_2022_2023: youtube_only, note: "RSS ~20 recent; much back-catalog paywalled (Copper/Silver). 2022-2023 via YouTube @GWTTMYGH."} + - {id: pod-monetarymatters, rss_url: "https://feeds.megaphone.fm/EWWMN1909747317", youtube_channel_url: "https://www.youtube.com/@Monetary-Matters", backtest_2022_2023: launched_later, note: "Launched Sep 2024 — NO 2022-2023. For 2022-2023 Jack Farley use Forward Guidance instead."} + - {id: pod-hiddenforces, rss_url: "https://hiddenforces.libsyn.com/rss", youtube_channel_url: "https://www.youtube.com/channel/UC8URhgYos5fjHqFSO4RSIEg", backtest_2022_2023: rss_full, note: "FULL via RSS: 507 eps to 2017 (libsyn). 67 eps in 2022, 55 in 2023. 
Anchor backtest source."} + - {id: pod-dwarkesh, rss_url: "https://apple.dwarkesh-podcast.workers.dev/feed.rss", youtube_channel_url: "https://www.youtube.com/c/DwarkeshPatel", backtest_2022_2023: rss_full, note: "FULL via RSS: 128 eps to 2020; 25 in 2022, 20 in 2023. NB do NOT use api.substack.com/feed/podcast/69345.rss (stale/truncated)."} + - {id: pod-nopriors, rss_url: "https://feeds.megaphone.fm/nopriors", youtube_channel_url: "https://www.youtube.com/@NoPriorsPodcast", backtest_2022_2023: rss_2023_only, note: "FULL from Feb 2023 launch (165 eps); no 2022 by design."} + - {id: pod-latentspace, rss_url: "https://api.substack.com/feed/podcast/1084089.rss", youtube_channel_url: "https://www.youtube.com/@LatentSpacePod", backtest_2022_2023: rss_2023_only, note: "FULL from Feb 2023 launch (207 eps, 50 in 2023); no 2022 by design."} + - {id: pod-cognitiverev, rss_url: "https://feeds.megaphone.fm/RINTP3108857801", youtube_channel_url: "https://www.youtube.com/@CognitiveRevolutionPodcast", backtest_2022_2023: rss_2023_only, note: "FULL from Feb 2023 launch (348 eps, 91 in 2023); no 2022 by design."} + - {id: pod-bg2, rss_url: "https://anchor.fm/s/f06c2370/podcast/rss", youtube_channel_url: "https://www.youtube.com/@Bg2Pod", backtest_2022_2023: launched_later, note: "Launched Jan 2024 — NO 2022-2023."} + - {id: pod-a16z, rss_url: "https://feeds.simplecast.com/JGE3yC0V", youtube_channel_url: "https://www.youtube.com/@a16z", backtest_2022_2023: youtube_only, note: "RSS truncated to ~15 recent; show since 2014 (~1000 eps). 2022-2023 via a16z.com/podcasts or YouTube."} + - {id: pod-catalyst, rss_url: "https://feeds.megaphone.fm/catalyst", youtube_channel_url: "https://www.youtube.com/channel/UC1dCBgJnwO5fgNVEn2BgQbg", backtest_2022_2023: youtube_only, note: "RSS ~15 recent; 258 eps since 2021 (energy). 
2022-2023 via Latitude Media site or YouTube Catalyst playlist."} + - {id: pod-columbiaenergy, rss_url: "https://columbiaenergyexchange.libsyn.com/rss", youtube_channel_url: "https://www.youtube.com/channel/UC5vAhRqHufSZNB9coZG5t6Q", backtest_2022_2023: youtube_only, note: "RSS ~28 recent; long-running (CGEP). 2022-2023 via energypolicy.columbia.edu archive (has transcripts!) or YouTube."} + - {id: pod-doomberg, rss_url: null, youtube_channel_url: null, backtest_2022_2023: unavailable, note: "NO owned audio feed — Doomberg is a Substack newsletter; only guest appearances on other shows. Consider ingesting the written Substack (text) or drop from the audio set."} + - {id: pod-bitcoinlayer, rss_url: "https://feeds.simplecast.com/Y2219Riv", youtube_channel_url: "https://www.youtube.com/@TheBitcoinLayer", backtest_2022_2023: youtube_only, note: "RSS ~25 recent (partial); show since 2022. 2022-2023 via YouTube @TheBitcoinLayer. cluster_capped_low."} + - {id: pod-whatbitcoindid, rss_url: "https://feeds.acast.com/public/shows/69d4f193b76468caacc5068f", youtube_channel_url: "https://www.youtube.com/@WhatBitcoinDid", backtest_2022_2023: rss_full, note: "FULL via RSS (Acast): 1060 eps to 2017; 155 in 2022, 156 in 2023. cluster_capped_low. (Peter McCormack show, ID 1317356120 — not the Danny Knowles spinoff.)"} + - {id: pod-allin, rss_url: "https://rss.libsyn.com/shows/254861/destinations/1928300.xml", youtube_channel_url: "https://www.youtube.com/@allin", backtest_2022_2023: rss_full, note: "FULL via RSS: 384 eps to 2020; 61 in 2022, 50 in 2023. 
Consensus barometer."} + - {id: pod-iltb, rss_url: "https://feeds.megaphone.fm/investlikethebest", youtube_channel_url: "https://www.youtube.com/@ILTB_Podcast", backtest_2022_2023: rss_full, note: "FULL via RSS: 582 eps to 2016; 57 in 2022, 53 in 2023."} + - {id: pod-lex, rss_url: "https://lexfridman.com/feed/podcast/", youtube_channel_url: "https://www.youtube.com/@lexfridman", backtest_2022_2023: youtube_only, note: "RSS truncated to ~19 recent; show since 2018. 2022-2023 via YouTube @lexfridman or lexlib.io index."} diff --git a/seeds/resolution.BATTERY2022.yaml b/seeds/resolution.BATTERY2022.yaml new file mode 100644 index 0000000..a00f8b6 --- /dev/null +++ b/seeds/resolution.BATTERY2022.yaml @@ -0,0 +1,26 @@ +# PRE-REGISTERED resolution — BATTERY 2022 bitcoin-as-collateral-credit thesis (D1). Two-sided. +# DESIGN_v2.1 condition 2: hostile-checker-proof — named institutions, dollar figures, dated events. +# Instrument = the two-sided net-corroboration (affirms − denies) trajectory per derivative +# (condition 3): the engine should show DEMAND rising while SUPPLY stays flat — the "half-confirmed, +# load-bearing half not moving" disconfirmation, NOT clear the supply derivatives early. +thesis: "Bitcoin-as-collateral credit goes mainstream; institutional/incumbent capital funds it at scale within 24-36 months (>=1 major institution enters)." +window: {start: "2022-01-01", end: "2024-12-31"} +metric_type: adoption_evidence +criteria: + BATTERY-demand-borrower-appetite: + expected: rising + confirm_iff: ">=3 NAMED active BTC-collateralized credit products/originators operating by 2024-12-31 (e.g. Unchained, Ledn, Strike lending, Battery, Salt, debifi) — countable, datable." 
+ checks: ["count of named BTC-collateralized lenders active 2024 with origination", "any public origination-volume figures"] + BATTERY-institutional-supply: + expected: flat + confirm_iff: "A NAMED institution/bank committed a stated DOLLAR figure of lending capital to BTC-collateralized credit AT SCALE (>$100M) by 2024-12-31." + checks: ["named institutional capital provider + committed $ to BTC-collateralized lending (yes/no + figure)", "BTC-credit fund AUM from institutional LPs — figure"] + BATTERY-incumbent-entry: + expected: not_yet + confirm_iff: "A top-50 (by assets) traditional bank/financial institution PUBLICLY entered BTC-collateralized lending by 2024-12-31 (named institution + dated announcement)." + checks: ["named top-50 bank offering/funding BTC-collateralized loans — yes/no + date"] + BATTERY-custody-policy-enablement: + expected: late_edge + confirm_iff: "SAB-121 repealed/rescinded AND bank BTC-custody cleared WITHIN window (<=2024-12-31)." + checks: ["SAB-121 status + exact date (pre-registered fact: issued 2022-03-31; rescinded by SAB-122 on 2025-01-23 — JUST PAST the window, so this resolves 'arrived at the edge, not in window')"] +prior_expectation: "EARLY on the SUPPLY axis: demand rose (multiple named lenders), institutional supply stalled through 2024 (no named top-50 incumbent at scale), the policy catalyst (SAB-121 repeal) landed Jan 2025 — just past window. The two-sided scorer should surface the supply-side DISCONFIRMATION." diff --git a/seeds/resolution.K2023.yaml b/seeds/resolution.K2023.yaml new file mode 100644 index 0000000..43dc0c5 --- /dev/null +++ b/seeds/resolution.K2023.yaml @@ -0,0 +1,20 @@ +# PRE-REGISTERED resolution criteria for the §7.1 backtest confusion matrix (DESIGN_v2 §1.1). +# Committed BEFORE pulling any price data. Uniform rule for ALL derivatives: +# - equal-weight, start-normalized basket index over 2023-01-01 .. 
2025-06-30 +# - "confirmed real" iff index first hits >= +40% vs 2023-01 baseline on date D AND still >= +25% at D+90d +# - repricing_date = D +# Baskets are best-judgment liquid proxies for each derivative's real-world exposure, drawn from the +# §7.3 source universe. They are NOT tuned to outcomes. Grant/dev may revise the baskets — but doing so +# is a logged pre-registration change (DESIGN_v2 §4), not a quiet tune. +window: {start: "2023-01-01", end: "2025-06-30"} +rule: {threshold_pct: 40, hold_pct: 25, hold_days: 90} +baskets: + K2023: [CEG, VST, TLN, NEE, GEV, VRT, PWR, CCJ] # root: broad power-infra build + K2023-picks-and-shovels: [CEG, VST, TLN, NEE, GEV, VRT, PWR, CCJ] # broad power-infra basket + K2023-uranium: [CCJ] # Cameco = the liquid uranium proxy in-corpus + K2023-nuclear: [CEG, TLN, VST] # nuclear-heavy IPPs + K2023-utilities-repriced: [CEG, VST, NEE, TLN] # IPPs / utilities with firm generation + K2023-grid-interconnect: [PWR, GEV] # grid build / interconnection (Quanta, GE Vernova) + K2023-transformers: [GEV, VRT] # electrical equipment + K2023-gas-turbines: [GEV] # GE Vernova gas turbines + K2023-cooling: [VRT] # Vertiv = data-center thermal/cooling proxy diff --git a/seeds/resolution.STRIKE2022.yaml b/seeds/resolution.STRIKE2022.yaml new file mode 100644 index 0000000..4d171c4 --- /dev/null +++ b/seeds/resolution.STRIKE2022.yaml @@ -0,0 +1,22 @@ +# PRE-REGISTERED resolution — STRIKE 2022 payments-network thesis (adversarial NEGATIVE). +# DESIGN_v2.1 condition 2: indicators must survive a HOSTILE checker — named, countable, datable +# things a disinterested third party could look up and get the SAME yes/no. No "material scale" hand-waving. +# Committed before evidence-gathering. 
The instrument for the engine output is NOT runway — it is the +# two-sided net-corroboration (affirms − denies) trajectory (DESIGN_v2.1 condition 3): a PASS for Strike +# looks like the engine STAYING QUIET (no clear) or the disconfirming side accumulating, NOT clearing early. +thesis: "Bitcoin/Lightning becomes a retail payments network that materially disrupts the card rails." +window: {start: "2022-01-01", end: "2024-12-31"} +metric_type: adoption_evidence +criteria: + STRIKE-merchant-lightning-integration: + # The 2022 memo promised specific integrations. Datable: did each ship Lightning RETAIL acceptance + # at general availability (not a pilot/press release) by 2024-12-31? Three named yes/no checks. + confirm_iff: ">=2 of {NCR/Aloha PoS, Shopify, Blackhawk Network} shipped Lightning retail payment acceptance at GENERAL AVAILABILITY with sustained merchant usage by 2024-12-31." + checks: ["NCR (now NCR Voyix/Aloha) Lightning retail acceptance GA? yes/no", "Shopify native Lightning checkout GA at scale? yes/no", "Blackhawk Network Lightning retail acceptance GA? yes/no"] + STRIKE-lightning-retail-acceptance: + confirm_iff: "A NAMED, citable figure shows Lightning-settled RETAIL payment volume > $1B/year by 2024, OR Strike publicly reports merchant-acceptance/payments as a material (>10% of revenue) line." + checks: ["Lightning Network public retail payment-volume estimate (bitcoinvisuals / River Lightning report) — figure + date", "Strike public revenue/product mix disclosure — merchant payments a named line? yes/no"] + STRIKE-card-rail-disruption: + confirm_iff: "Any Visa/Mastercard 10-K/earnings disclosure attributing SUSTAINED US retail payment-volume erosion to bitcoin/Lightning by 2024-12-31." + checks: ["Visa FY22-24 10-K: bitcoin/Lightning named as a volume-erosion factor? yes/no", "Mastercard FY22-24 10-K: same? yes/no"] +prior_expectation: "FAILED. Strike's growth came from exchange + BTC-collateralized lending, not retail payments. 
The PRECISION TEST: if the engine CLEARS any of these on bitcoin-cluster (own_network-tagged) chatter while the named checks are NO, that is the reflexive false positive the discipline must catch." diff --git a/seeds/resolution_outcomes.adversarial.yaml b/seeds/resolution_outcomes.adversarial.yaml new file mode 100644 index 0000000..925e00d --- /dev/null +++ b/seeds/resolution_outcomes.adversarial.yaml @@ -0,0 +1,37 @@ +# HOSTILE-VERIFIED outcome labels for the adversarial cases (gathered AFTER the criteria in +# resolution.{STRIKE,BATTERY}2022.yaml were frozen — correct pre-registration order). Each was +# researched then independently re-verified by a skeptic agent to the "survive a hostile checker" +# standard. `played_out` is normalized to the THESIS reality (yes = the derivative came true; no = it did not; token/edge = partial outcomes explained inline), not the +# agent's verdict word (which was polarity-inconsistent on the card-erosion check). NOTE(review): bare yes/no load as booleans under YAML 1.1 loaders (e.g. PyYAML) while token/edge load as strings — confirm the consumer handles both. +STRIKE2022: # payments thesis — verified DEAD on all three (the clean negative we needed) + STRIKE-merchant-lightning-integration: + played_out: no + evidence: "Only Shopify reached GA (Strike app, 8 lifetime reviews in 3 yrs = trivial). Blackhawk: Strike CEO's own 2022-12-31 update = 'final testing, planned early-2023 launch at 50 locations' — never confirmed live. NCR/Aloha: no ship, no timeline. <2 of 3 at scale." + cite: ["jimmymow.medium.com/strike-commerce-update", "apps.shopify.com/strike", "fortune.com/crypto/2024/04/24 (demand was trading/custody, not POS)"] + STRIKE-lightning-retail-acceptance: + played_out: no + evidence: "No named retail Lightning volume >$1B/yr in 2024. Strike's only disclosure ($6B 2024 'payments volume', Apr-2025) is unsegmented and dominated by brokerage buy/sell + remittance. River: total Lightning (all uses) only hit ~$1B/MONTH in late 2025." + cite: ["news.bitcoin.com/bitcoin-payments-firm-strike-grew-600-in-2024", "River Lightning report"] + STRIKE-card-rail-disruption: + played_out: no + evidence: "ZERO erosion attribution.
Full-text search of all 5 as-filed 10-Ks (Visa FY22-24, MC FY23-24): 'bitcoin' and 'lightning' appear 0 times; crypto only in generic forward-looking risk lists. Card networks grew." + cite: ["SEC EDGAR Visa/Mastercard 10-Ks"] +BATTERY2022: # D1 — demand REAL, supply FAILED, + an instructive milestone-vs-substance wrinkle + BATTERY-demand-borrower-appetite: + played_out: yes + evidence: ">=4 named BTC-collateralized originators active through 2024 (Unchained ~$1B cumulative by 2025; Ledn ~$392M FY24 origination; plus Strike lending, Salt, debifi, AnchorWatch)." + cite: ["thebloc / Ledn", "Unchained disclosures"] + BATTERY-institutional-supply: + played_out: no + binding_constraint: capital_provider_willingness # NOT regulation — see custody-policy note below + evidence: "No named institution DEPLOYED >$100M of BTC-collateralized lending capital at scale IN 2024. Cantor Fitzgerald's $2B program announced 2024-07-27 but first loans (FalconX, Maple) closed 2025-05-27 — zero deployed in window. CRITICAL CAUSAL NOTE: the constraint was capital-provider WILLINGNESS, not regulatory permissibility — nothing legally restricted dollar holders from funding these loans. Proof: the 2025 regulatory unblock (SAB-122) did NOT produce institutional dollars; Battery remains unraised as of 2026. Measure supply as actual committed/deployed capital, never as the regulatory enabler." + cite: ["Cantor BTC-lending announcements", "Battery raise status thru 2026 (unraised)"] + BATTERY-incumbent-entry: + played_out: token # IMPORTANT WRINKLE: milestone technically met EARLY, but token — NOT the at-scale thesis + evidence: "Goldman Sachs (5th-largest US bank) executed ONE bitcoin-collateralized loan ~late-Apr-2022 (on-record spokeswoman, CoinDesk 2022-04-28). A one-off facility, not scaled BTC-lending entry. The D1 milestone phrasing ('>=1 major institution enters') resolves YES on this; the SUBSTANCE ('institutional capital at scale') resolves NO. 
Same reality, opposite verdicts by phrasing." + cite: ["coindesk.com 2022-04-28 (Goldman BTC-collateralized loan)"] + BATTERY-custody-policy-enablement: + played_out: edge + axis: context # ENABLER/CONTEXT, not the supply resolver — weight 0 into the supply score + evidence: "SAB-121 issued 2022-03-31; rescinded by SAB-122 on 2025-01-23 — just PAST the 2024-12-31 window. But this axis is CONTEXT, not supply: the post-window unblock did NOT produce capital for Battery (still unraised in 2026), demonstrating regulation was never the binding constraint on Battery's supply leg. Retained here only to show the policy-YES / supply-NO divergence — an engine must NOT read this enabler as supply arriving (that's the false-positive S1 exists to catch)." + cite: ["sec.gov SAB-121 / SAB-122", "Battery still unraised post-SAB-122 (2026)"] diff --git a/seeds/river_docs.manifest.yaml b/seeds/river_docs.manifest.yaml new file mode 100644 index 0000000..9c23240 --- /dev/null +++ b/seeds/river_docs.manifest.yaml @@ -0,0 +1,13 @@ +# River research docs to ingest (verified fetchable + TEXT-extractable, workflow wxlh2oinb). +# Image-based PDFs (river-lightning-report.pdf 2022, river-bitcoin-adoption-report-2025/2026.pdf) are +# OMITTED — zero text layer (pypdf extracts nothing); revisit with OCR if needed. 
+# Loaded by: signal_engine ingest-doc-manifest --file seeds/river_docs.manifest.yaml +docs: + # --- LIGHTNING / PAYMENTS (most Strike-thesis-relevant, in-window) --- + - {source: src-river, method: html, date: "2023-10-10", title: "River Lightning Report 2023 — The Lightning Network Grew 1212% in 2 Years", url: "https://river.com/content/the-lightning-network-in-2023"} + - {source: src-river, method: pdf, date: "2023-06-14", title: "River Payments Report — Bitcoin vs the $156 Trillion Global Payments Industry", url: "https://river.com/learn/files/river-payments-report.pdf"} + # --- ADOPTION / THESIS research --- + - {source: src-river, method: html, date: "2024-09-04", title: "River — Why Business Bitcoin Adoption Grew by 30% in 1 Year", url: "https://river.com/content/business-bitcoin-adoption-2024"} + - {source: src-river, method: pdf, date: "2024-11-14", title: "River — Entering The Dual Money Era", url: "https://river.com/learn/files/river-dual-money-era-2024.pdf"} + - {source: src-river, method: html, date: "2023-11-21", title: "River — Is There a Best Time and Day to DCA Bitcoin?", url: "https://river.com/content/best-time-and-day-to-dca-bitcoin"} + - {source: src-river, method: html, date: "2022-12-08", title: "River — What Could Bitcoin Mining Look Like at One Zettahash?", url: "https://river.com/content/what-could-bitcoin-mining-look-like-at-one-zettahash"} diff --git a/seeds/source_edges.bitcoin.seed.yaml b/seeds/source_edges.bitcoin.seed.yaml new file mode 100644 index 0000000..895f8e1 --- /dev/null +++ b/seeds/source_edges.bitcoin.seed.yaml @@ -0,0 +1,44 @@ +# EISC connectedness edges for the bitcoin cluster (workflow wd2a9zb9e, 2026-06-08). +# These are PRIORS so the independence model discounts shared-guest/citation/community overlap BEFORE +# transcription auto-detects it. 
The transcribe_worker upserts onto the SAME PK (it stores sorted([a,b]), +# weight += 1.0 on conflict), so seeding in sorted order (a < b) means real detections accumulate — no reversed dup. NOTE(review): several rows below are NOT in sorted order (e.g. a: pod-tftc, b: pod-rabbitholerecap) — confirm the seed loader sorts [a,b] before insert, else reversed dups result. +# Math is undirected (frozenset); kappa is applied in-code {shared_guest 0.85, citation 0.45, community 0.60}. +# weight = per-edge strength multiplier (1.0 = one strong overlap; clamp 0.95). Loaded idempotently (DO NOTHING). +# RULE: an own_network host (Ten31 portfolio/partner) is QUARANTINED in live mode regardless of edges; +# edges still matter in TEST mode and for independent<->independent discounting. +edges: + # --- host-identity / co-host core (the Ten31 own_network orbit; near-total redundancy) --- + - {a: pod-tftc, b: pod-rabbitholerecap, type: shared_guest, weight: 1.0, evidence: "Marty Bent hosts both (host identity)"} + - {a: pod-citadeldispatch, b: pod-rabbitholerecap, type: shared_guest, weight: 1.0, evidence: "Matt Odell hosts both (host identity)"} + - {a: pod-tftc, b: pod-citadeldispatch, type: shared_guest, weight: 1.0, evidence: "Bent+Odell co-host RHR weekly + constant guest-swap"} + - {a: pod-tftc, b: pod-rabbitholerecap, type: citation, weight: 1.0, evidence: "RHR is the weekly recap/companion within the TFTC network; re-surfaces TFTC segments"} + # --- What Bitcoin Did <-> Ten31-orbit core (own_network=false but heavy host-level overlap) --- + - {a: pod-whatbitcoindid, b: pod-citadeldispatch, type: shared_guest, weight: 1.0, evidence: "McCormack/Odell repeat cross-guests + conference panels 2022-2023 (Odell on WBD518)"} + - {a: pod-whatbitcoindid, b: pod-tftc, type: shared_guest, weight: 1.0, evidence: "McCormack/Bent mutual guests + co-panelists 2022-2023"} + - {a: pod-whatbitcoindid, b: pod-rabbitholerecap, type: shared_guest, weight: 1.0, evidence: "McCormack alongside Odell+Bent repeatedly; densest guest-swap triangle"} + # --- Stephan Livera <-> core + WBD (connective tissue of the independent leg) --- + - {a: pod-stephanlivera, b: pod-citadeldispatch,
type: shared_guest, weight: 1.0, evidence: "Livera/Odell frequent mutual guests; shared Lightning/self-custody circuit"} + - {a: pod-stephanlivera, b: pod-tftc, type: shared_guest, weight: 1.0, evidence: "Livera/Bent repeat cross-guests + co-panelists 2022-2023"} + - {a: pod-stephanlivera, b: pod-rabbitholerecap, type: shared_guest, weight: 1.0, evidence: "Livera overlaps RHR hosts via constant mutual guesting"} + - {a: pod-stephanlivera, b: pod-whatbitcoindid, type: shared_guest, weight: 1.0, evidence: "Top-tier interview hosts sharing the same revolving guest roster"} + # --- Bitcoin Audible + Anita Posch peripheral (independent leg internal coupling) --- + - {a: pod-bitcoinaudible, b: pod-stephanlivera, type: shared_guest, weight: 1.0, evidence: "Guy Swann recurring on SLP / Lightning-essay circuit"} + - {a: pod-bitcoinaudible, b: pod-whatbitcoindid, type: community, weight: 1.0, evidence: "Same English-language BTC podcast sub-scene; overlapping audience/guest pool"} + - {a: pod-anitaposch, b: pod-stephanlivera, type: community, weight: 1.0, evidence: "Same advocacy sub-scene; peripheral (Global-South focus, separate roster)"} + - {a: pod-anitaposch, b: pod-bitcoinaudible, type: community, weight: 1.0, evidence: "Educational/advocacy BTC sub-scene overlap"} + # --- The Bitcoin Layer (macro node; edges-only, RSS dropped) --- + - {a: pod-bitcoinlayer, b: pod-whatbitcoindid, type: shared_guest, weight: 1.0, evidence: "Nik Bhatia recurring macro/rates guest on WBD 2022-2023"} + - {a: pod-bitcoinlayer, b: pod-stephanlivera, type: shared_guest, weight: 1.0, evidence: "Bhatia macro/Fed guest in SLP orbit 2022-2023"} + - {a: pod-bitcoinlayer, b: pod-stephanlivera, type: community, weight: 1.0, evidence: "Shared macro-leaning corner (rates/liquidity framing)"} + # --- NEW independent legs <-> core (the 'missing' edges: they share the same guest roster) --- + - {a: pod-citadeldispatch, b: pod-coinstories, type: shared_guest, weight: 1.0, evidence: "Odell guested Coin 
Stories; Brunell on shared interview circuit"} + - {a: pod-coinstories, b: pod-tftc, type: shared_guest, weight: 1.0, evidence: "Bent co-paneled w/ Brunell (Pomp panel etc.)"} + - {a: pod-coinstories, b: pod-whatbitcoindid, type: community, weight: 1.0, evidence: "Same high-reach BTC interview circuit; overlapping Mallers/Saylor/Marcus roster"} + - {a: pod-bitcoinstandard, b: pod-citadeldispatch, type: shared_guest, weight: 1.0, evidence: "Odell guested Bitcoin Standard ep #126 'Stacking Sats with Matt Odell' (Aug 2022)"} + - {a: pod-kevinrooke, b: pod-stephanlivera, type: community, weight: 1.0, evidence: "Shared Lightning-operator guest pool (Breez/Voltage/Amboss/River)"} + - {a: pod-kevinrooke, b: pod-tftc, type: community, weight: 1.0, evidence: "Shared Lightning-operator guest pool / scene overlap"} + - {a: pod-bitcoinmagazine, b: pod-citadeldispatch, type: community, weight: 1.0, evidence: "Odell historical BM contributor; shared contributor/guest pool"} + - {a: pod-simplybitcoin, b: pod-tftc, type: shared_guest, weight: 1.0, evidence: "Bent guested 'WE WILL WIN | Simply Bitcoin IRL'"} + # --- Quarantine fixtures <-> core (matter only in TEST mode; live-dropped as own_network) --- + - {a: pod-bitcoinreview, b: pod-citadeldispatch, type: shared_guest, weight: 1.0, evidence: "Odell recurring co-host of Bitcoin.Review (BR001-011 'ft. Odell')"} + - {a: pod-cafebitcoin, b: pod-citadeldispatch, type: shared_guest, weight: 1.0, evidence: "Odell (Swan advisor) ran 'Bitcoin Lightning Summit' on Cafe Bitcoin"} diff --git a/seeds/sources.battery.seed.yaml b/seeds/sources.battery.seed.yaml new file mode 100644 index 0000000..795d516 --- /dev/null +++ b/seeds/sources.battery.seed.yaml @@ -0,0 +1,29 @@ +# Battery (bitcoin-collateralized lending) text-corpus sources — from verified manifest (workflow w7559rp1x, 2026-06-08). +# kind=filing (text docs, no transcription). cluster: 'credit' = the crypto-credit ecosystem; 'macro' = policy regulators. 
+# own_network=1 → Ten31-affiliated (quarantined in live: Ten31 hearing its own ecosystem). lineage captured in notes. +# AXIS DISCIPLINE: policy sources are CONTEXT — their claims map to the custody-policy-enablement fan-out node, never the +# institutional-supply node. SUPPLY resolves only on committed/deployed-capital claims (Galaxy/Coinbase/Cantor-funded). +sources: + # --- SUPPLY: committed/deployed capital + market data (the resolver tier) --- + - {id: bat-galaxy, name: "Galaxy Research", kind: filing, cluster: credit, role: IND, own_network: false, notes: "independent — State of Crypto Lending dataset + quarterly loan-book ($863M Q3'24). THE in-window deployed-dollar supply measure."} + - {id: bat-9fin, name: "9fin (credit data)", kind: filing, cluster: credit, role: IND, own_network: false, notes: "independent credit-data — names BTC-backed private-credit funders."} + - {id: bat-cantor, name: "Cantor Fitzgerald (PR)", kind: filing, cluster: credit, role: IND, own_network: false, notes: "primary — $2B program ANNOUNCED Jul-2024 (future tense, no funding source named). Announced≠funded anchor."} + - {id: bat-falconx, name: "FalconX (newsroom)", kind: filing, cluster: credit, role: IND, own_network: false, notes: "primary — closed FIRST Cantor financing May-2025 (out-of-window): proves in-window Cantor was announce-only."} + - {id: bat-maple, name: "Maple Finance (insights)", kind: filing, cluster: credit, role: IND, own_network: false, notes: "primary — first Cantor tranche May-2025 (out-of-window). Funding source = Cantor balance sheet."} + - {id: bat-twoprime, name: "Two Prime", kind: filing, cluster: credit, role: IND, own_network: false, notes: "primary/independent — sees $2B DEMAND, discloses ZERO deployed (the supply absence), Nov-2023 in-window."} + - {id: bat-battery, name: "Battery Finance / Newmarket (PR)", kind: filing, cluster: credit, role: IND, own_network: true, notes: "OWN_NETWORK — Battery is a Ten31 PARTNERSHIP. 
Launch Nov-2024: ONE funded deal, NO pooled committed-capital figure, NO named funding source. The absence is the signal."} + - {id: bat-bitcoinmag, name: "Bitcoin Magazine (business desk)", kind: filing, cluster: credit, role: IND, own_network: false, notes: "independent-but-aligned (bitcoin-advocacy media) — original interview re Battery launch; mechanics, no fund size."} + # --- DEMAND: originator disclosures --- + - {id: bat-unchained, name: "Unchained (blog)", kind: filing, cluster: credit, role: IND, own_network: true, notes: "OWN_NETWORK — Unchained is a Ten31 PORTFOLIO co. Demand: >$500M cumulative origination."} + - {id: bat-ledn, name: "Ledn (Open Book)", kind: filing, cluster: credit, role: IND, own_network: false, notes: "primary — monthly Open Book origination disclosures (>$10B cumulative)."} + - {id: bat-salt, name: "SALT Lending (blog)", kind: filing, cluster: credit, role: IND, own_network: false, notes: "primary — paginated blog archive (v2: crawl /blog/page/N/)."} + - {id: bat-debifi, name: "Debifi", kind: filing, cluster: credit, role: IND, own_network: true, notes: "OWN_NETWORK — Ten31 portfolio. 
Demand-side lender (v2: Medium RSS crawl)."} + # --- DOWNSTREAM echo (near-zero independence — event detection / corroboration only) --- + - {id: bat-coindesk, name: "CoinDesk", kind: filing, cluster: credit, role: DX, own_network: false, notes: "downstream — independent trade press; Cantor-funded + Two Prime coverage."} + - {id: bat-coinspeaker, name: "Coinspeaker", kind: filing, cluster: credit, role: DX, own_network: false, notes: "downstream — Coinbase institutional-lending (>$57M committed, Reg D) Sept-2023."} + - {id: bat-asr, name: "Asset Securitization Report", kind: filing, cluster: credit, role: IND, own_network: false, notes: "independent structured-credit — Milo crypto-mortgage 2022 (soft paywall)."} + # --- POLICY CONTEXT (axis=context — weight 0 into supply; maps to custody-policy fan-out node only) --- + - {id: bat-sec, name: "SEC (policy primaries)", kind: filing, cluster: macro, role: IND, own_network: false, notes: "axis=CONTEXT — SAB-121 (2022) / SAB-122 (2025). NOT a supply input."} + - {id: bat-fed, name: "Federal Reserve (policy)", kind: filing, cluster: macro, role: IND, own_network: false, notes: "axis=CONTEXT — SR 22-6 (crypto-collateralized loans), SR 23-7 Novel Activities."} + - {id: bat-fdic, name: "FDIC (policy)", kind: filing, cluster: macro, role: IND, own_network: false, notes: "axis=CONTEXT — FIL-16-2022 crypto prior-notification."} + - {id: bat-occ, name: "OCC (policy)", kind: filing, cluster: macro, role: IND, own_network: false, notes: "axis=CONTEXT — IL 1170/1179 (custody + supervisory-non-objection gate)."} diff --git a/seeds/sources.bitcoin.seed.yaml b/seeds/sources.bitcoin.seed.yaml new file mode 100644 index 0000000..1fa6281 --- /dev/null +++ b/seeds/sources.bitcoin.seed.yaml @@ -0,0 +1,31 @@ +# Bitcoin-cluster sources for the STRIKE reflexivity backtest (DESIGN_v2.1). Resolved + verified. +# own_network=1 → the Ten31 orbit (Odell/Bent partners) = QUARANTINED (dropped in live EISC, test fixture). 
+# The independent leg (own_network unset) is the non-self-referential corroboration for the Strike test. +# All bitcoin cluster → cluster_capped_low (§4.5). rss_url verified to parse; FULL = reaches 2022-2023. +sources: + # --- OWN-NETWORK ORBIT (quarantined) --- + - {id: pod-tftc, name: "TFTC (Marty Bent)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: true, rss_url: "https://feeds.fountain.fm/ZwwaDULvAj0yZvJ5kdB9", channel_url: "https://www.youtube.com/@TFTC", backtest_2022_2023: rss_full, notes: "FULL: 112 eps 2022, 88 in 2023. Ten31 partner — own_network."} + - {id: pod-citadeldispatch, name: "Citadel Dispatch (Matt Odell)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: true, rss_url: "https://serve.podhome.fm/rss/c90e609a-df1e-596a-bd5e-57bcc8aad6cc", channel_url: "https://www.youtube.com/channel/UCoA72saVAuQ8hYCnBO0Lymw", backtest_2022_2023: rss_full, notes: "FULL: 53 eps 2022, 37 in 2023. Ten31 partner — own_network."} + - {id: pod-rabbitholerecap, name: "Rabbit Hole Recap (Odell + Bent)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: true, rss_url: "https://feeds.fountain.fm/0EAzqUaM4qqanDr1qNuK", channel_url: "https://www.youtube.com/@TFTC", backtest_2022_2023: rss_full, notes: "FULL: 230 eps 2022 (launched Apr 2022), 51 in 2023. Both Ten31 partners — own_network."} + # --- INDEPENDENT bitcoin / Lightning-payments discourse (the non-self-referential leg) --- + - {id: pod-stephanlivera, name: "Stephan Livera Podcast", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://anchor.fm/s/7d083a4/podcast/rss", channel_url: "https://www.youtube.com/channel/UCDqPIrJSzHyyJpmH6wnxVxA", backtest_2022_2023: rss_full, notes: "VERIFIED RSS FULL (2026-06-08): anchor.fm feed has 739 eps 2018-2026, 203 in 2022-2023 window — earlier 'truncated/youtube_only' note was wrong. 
Strong independent Lightning-payments discourse."} + - {id: pod-bitcoinaudible, name: "Bitcoin Audible (Guy Swann)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://feeds.castos.com/mj96z", channel_url: "https://www.youtube.com/channel/UClG-wqz-OuXfzbpqwJd3fVA", backtest_2022_2023: rss_full, notes: "FULL: 1375 eps since 2018, deep archive. Reads/narrates Lightning retail-payments essays. Independent."} + - {id: pod-anitaposch, name: "The Anita Posch Show", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://www.vodio.fr/rssmedias.php?valeur=1673", channel_url: "https://www.youtube.com/AnitaPosch", backtest_2022_2023: rss_partial, notes: "RESOLVED 2026-06-08: original bitcoinundco feed 302s to homepage (0 items); vodio re-host has 183 eps 2018-2025, 30 in 2022-2023 window (op3.dev->mp3 enclosures). Modest but real independent leg (Global South / payments focus)."} + # === EXPANSION (workflow wd2a9zb9e, 2026-06-08): feeds curl-verified + own_network cross-checked vs Ten31 portfolio/team by an adversarial pass. === + # --- INDEPENDENT, HIGH Strike relevance (Lightning/retail-payments) — primary corroboration legs --- + - {id: pod-kevinrooke, name: "The Kevin Rooke Show", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://anchor.fm/s/71a8cc78/podcast/rss", channel_url: "https://www.youtube.com/@KevinRooke", backtest_2022_2023: rss_full, notes: "VERIFIED rss_full: 145 eps, 127 in 2022-2023. MOST Lightning-laser-focused independent show (TLS series: Breez/BTCPay/OpenNode/Muun/Voltage/River). Highest signal-per-ep for payments thesis. 
No Ten31 tie."} + - {id: pod-whatbitcoindid, name: "What Bitcoin Did (Peter McCormack)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://feeds.acast.com/public/shows/69d4f193b76468caacc5068f", channel_url: "https://www.youtube.com/@WhatBitcoinDid", backtest_2022_2023: rss_full, notes: "VERIFIED rss_full: 1060 eps, 311 in 2022-2023 (now branded 'The Peter McCormack Show', WBD### back-catalog intact). own_network=FALSE — McCormack NOT a Ten31 partner/portfolio (external capital is Winklevoss/Gemini); heavy guest-overlap w/ Odell/Bent core captured as shared_guest EDGES, not quarantine. Ran Lightning series + Mallers/Strike interviews."} + - {id: pod-bitcoinmagazine, name: "Bitcoin Magazine Podcast", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://anchor.fm/s/cefa18a0/podcast/rss", channel_url: "https://www.youtube.com/@BitcoinMagazine", backtest_2022_2023: rss_full, notes: "VERIFIED rss_full: 788 eps, 304 in 2022-2023. BTC Inc (Nakamoto/Bailey orbit, NOT Ten31). Hosts 2022-23 Keroles/Lindner. Heavy Lightning/Mallers/merchant coverage. own_network=false."} + - {id: pod-coinstories, name: "Coin Stories (Natalie Brunell)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://rss.libsyn.com/shows/344543/destinations/2813255.xml", channel_url: "https://www.youtube.com/@CoinStoriesNatalieBrunell", backtest_2022_2023: rss_full, notes: "VERIFIED rss_full: 500 eps, 155 in 2022-2023. Independent journalist; 2x Jack Mallers (Strike CEO), Marcus/Saylor on Lightning, El-Salvador/merchant eps. 
own_network=false (Odell/Bent only as guests)."} + - {id: pod-bitcoinstandard, name: "The Bitcoin Standard Podcast (Saifedean Ammous)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://rss.buzzsprout.com/1849151.rss", channel_url: "https://www.youtube.com/@saifedean", backtest_2022_2023: rss_full, notes: "VERIFIED rss_full: 365 eps, 103 in 2022-2023. CORRECTION: in-window Strike relevance is MEDIUM not high (cited Mallers ep #89 is Nov-2021, out-of-window; in-window core is Austrian econ/Fiat Food). Independent. own_network=false."} + - {id: pod-simplybitcoin, name: "Simply Bitcoin", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://anchor.fm/s/717a2198/podcast/rss", channel_url: "https://www.youtube.com/@SimplyBitcoinTV", backtest_2022_2023: rss_full, notes: "VERIFIED rss_full: 1689 eps, 574 in 2022-2023. Independent daily-news (Nico+Opti, not Swan). Recurring Mallers/Strike/El-Salvador/Lightning but daily-news frame → strike_relevance MEDIUM. own_network=false (Bent only as guest)."} + # --- Ten31 PORTFOLIO-linked hosts (highest Strike density): own_network=true rows are QUARANTINE fixtures (dropped in live EISC); Cafe Bitcoin was RECLASSIFIED independent 2026-06-08 (immaterial stake) and is NOT quarantined --- + - {id: pod-cafebitcoin, name: "The Cafe Bitcoin Podcast (Swan)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://feeds.simplecast.com/H9Jmx_ko", channel_url: "https://www.youtube.com/@SwanBitcoin", backtest_2022_2023: rss_full, notes: "VERIFIED rss_full: 648 eps, 495 in 2022-2023. own_network=FALSE — RECLASSIFIED 2026-06-08 per Grant: Ten31's Swan investment is IMMATERIAL → NOT a conflict → INDEPENDENT. 
Highest Lightning-payments density → strong INDEPENDENT leg for Strike."} + - {id: pod-bitcoinreview, name: "Bitcoin.Review (NVK / Coinkite)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: true, rss_url: "https://serve.podhome.fm/rss/7cd0202b-463c-5b2e-b252-d4845cb71466", channel_url: "https://www.youtube.com/@BitcoinReview", backtest_2022_2023: rss_full, notes: "VERIFIED rss_full: 97 eps, 57 in 2022-2023 (launched Jun 2022). own_network=TRUE — host NVK is CEO of Coinkite, a Ten31 PORTFOLIO co (Ten31-led Series A); Odell recurring co-host. Dense Lightning/ecash/wallet content → QUARANTINE fixture."} + # --- INDEPENDENT, lower Strike relevance — BROAD bitcoin corpus (source rows recorded; NOT auto-queued for transcription yet) --- + - {id: pod-whatismoney, name: "The What is Money? Show (Robert Breedlove)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://feeds.simplecast.com/MLdpYXYI", channel_url: "https://www.youtube.com/@RobertBreedlove", backtest_2022_2023: rss_full, notes: "VERIFIED rss_full: 600 eps, 314 in 2022-2023. Money-philosophy/macro dominant; intermittent Lightning → strike med-low. Independent (no Ten31 tie). BROAD corpus."} + - {id: pod-unchained-shin, name: "Unchained (Laura Shin)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://feeds.megaphone.fm/LSHML4761942757", channel_url: "https://www.youtube.com/@Unchained_pod", backtest_2022_2023: rss_full, notes: "VERIFIED rss_full: 1174 eps, 303 in 2022-2023. NAME-COLLISION GUARD: Laura Shin's podcast is NOT Ten31's portfolio co 'Unchained Capital' (custody firm) — distinct entities. own_network=false. Broad multi-chain/crypto, Eth-heavy → strike medium. 
BROAD corpus."} + - {id: pod-bitcoinfundamentals, name: "Bitcoin Fundamentals (Preston Pysh)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://feeds.megaphone.fm/PPLLC8974708240", channel_url: "https://www.youtube.com/@PrestonPysh", backtest_2022_2023: rss_full, notes: "VERIFIED rss_full: combined We-Study-Billionaires feed; Bitcoin leg = BTC###-prefixed eps, 105 in 2022-2023. INGEST NOTE: needs a BTC### title filter (not yet supported by ingest-podcast) → defer. Pysh is GP at Ego Death Capital (rival BTC VC) → independent of Ten31. BROAD corpus."} + - {id: pod-pomp, name: "The Pomp Podcast (Anthony Pompliano)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://anchor.fm/s/b4841110/podcast/rss", channel_url: "https://www.youtube.com/@AnthonyPompliano", backtest_2022_2023: rss_full, notes: "VERIFIED rss_full: 1741 eps, 519 in 2022-2023. Broad business/crypto, light on Lightning retail-payments → strike low. Consensus barometer. Independent. BROAD corpus."} + # --- KEEP but DROP from RSS ingest: pod-bitcoinlayer 2022-2023 audio is YouTube-only (Soundwise/Simplecast feeds start 2024). Edge-node only unless YouTube path added. --- + - {id: pod-bitcoinlayer, name: "The Bitcoin Layer (Nik Bhatia)", kind: podcast, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, rss_url: "https://app.mysoundwise.com/rss/1665708396190s", channel_url: "https://www.youtube.com/c/thebitcoinlayer", backtest_2022_2023: youtube_only, notes: "rss_partial: all audio RSS feeds (Soundwise/Simplecast) start 2024; 2022-2023 back-catalog only on YouTube (chan UCDo6-SUypaXlTmH6AyrYBZA, launched Jun 2022). Macro/rates focus → strike low. own_network=false. 
NOT auto-queued; serves as shared_guest edge node into WBD/SLP."} diff --git a/seeds/sources.river.seed.yaml b/seeds/sources.river.seed.yaml new file mode 100644 index 0000000..340a3b0 --- /dev/null +++ b/seeds/sources.river.seed.yaml @@ -0,0 +1,8 @@ +# River Research (river.com) — independent Bitcoin/Lightning research (workflow wxlh2oinb, 2026-06-08). +# own_network=FALSE per Grant: River IS a Ten31 portfolio co, but the stake is IMMATERIAL → not a +# conflict → INDEPENDENT (same materiality rule as Swan). Real Lightning transaction/usage DATA — a +# strong independent leg for the Strike payments thesis (tests "Lightning grew" vs "retail payments at scale"). +# kind=filing (text docs). cluster=bitcoin. NOTE: River's report PDFs are largely IMAGE-BASED (no text +# layer) → ingest the HTML landing pages + the text-layer PDFs only; image-PDFs deferred (need OCR). +sources: + - {id: src-river, name: "River Research", kind: filing, cluster: bitcoin, role: IND, cluster_capped_low: true, own_network: false, channel_url: "https://river.com/research/", notes: "Independent (immaterial Ten31 stake). Bitcoin/Lightning usage research incl. annual Lightning Report (real tx data). Image-PDF reports deferred for OCR; HTML + text-PDFs ingested."} diff --git a/seeds/sources.seed.yaml b/seeds/sources.seed.yaml new file mode 100644 index 0000000..6c5ee2f --- /dev/null +++ b/seeds/sources.seed.yaml @@ -0,0 +1,74 @@ +# Source registry seed — handoff §7.3 (companies) + §7.4 (podcasts). VERIFY tickers/feeds at ingestion. +# `notes` flag backtest-era (2022-2023) coverage for §7.1, since some entities didn't exist yet. +# Podcast rss_url is left null where the real feed must still be resolved (some feeds truncate the +# back-catalog — resolution may need the show's archive feed or a YouTube back-catalog). +# Bitcoin-cluster podcasts are cluster_capped_low (§4.5 — most correlated with Ten31's own priors). 
+ +sources: + # ============================ COMPANIES (§7.3) ============================ + # --- AI compute & hyperscalers → ai_tech --- + - {id: co-nvda, name: NVIDIA, kind: filing, cluster: ai_tech, ticker: NVDA} + - {id: co-googl, name: Alphabet, kind: filing, cluster: ai_tech, ticker: GOOGL} + - {id: co-msft, name: Microsoft, kind: filing, cluster: ai_tech, ticker: MSFT} + - {id: co-amzn, name: Amazon, kind: filing, cluster: ai_tech, ticker: AMZN} + - {id: co-meta, name: Meta, kind: filing, cluster: ai_tech, ticker: META} + - {id: co-avgo, name: Broadcom, kind: filing, cluster: ai_tech, ticker: AVGO} + - {id: co-tsm, name: TSMC, kind: filing, cluster: ai_tech, ticker: TSM, notes: "Foreign filer — files 20-F/6-K, NOT 10-K/10-Q; override forms at ingest"} + - {id: co-crwv, name: CoreWeave, kind: filing, cluster: ai_tech, ticker: CRWV, notes: "IPO 2025 — NO 2022-23 filings (backtest: absent, expected)"} + - {id: co-orcl, name: Oracle, kind: filing, cluster: ai_tech, ticker: ORCL} + # --- Energy & power (binding constraint) → energy --- + - {id: co-ceg, name: Constellation Energy, kind: filing, cluster: energy, ticker: CEG, notes: "Spun from Exelon Feb 2022 — backtest-era coverage from 2022"} + - {id: co-vst, name: Vistra, kind: filing, cluster: energy, ticker: VST} + - {id: co-tln, name: Talen Energy, kind: filing, cluster: energy, ticker: TLN, notes: "Relisted 2024 post-restructuring — thin pre-2024 public filings"} + - {id: co-gev, name: GE Vernova, kind: filing, cluster: energy, ticker: GEV, notes: "Spun from GE Apr 2024 — pre-2024 power data is inside GE filings"} + - {id: co-nee, name: NextEra Energy, kind: filing, cluster: energy, ticker: NEE} + - {id: co-ccj, name: Cameco, kind: filing, cluster: energy, ticker: CCJ, notes: "Foreign filer (Canada) — 40-F/6-K"} + - {id: co-vrt, name: Vertiv, kind: filing, cluster: energy, ticker: VRT} + - {id: co-pwr, name: Quanta Services, kind: filing, cluster: energy, ticker: PWR, notes: "watch — grid/interconnect 
picks-and-shovels"} + - {id: co-oklo, name: Oklo, kind: filing, cluster: energy, ticker: OKLO, notes: "watch; public 2024"} + - {id: co-smr, name: NuScale Power, kind: filing, cluster: energy, ticker: SMR, notes: "watch"} + # --- Mining <-> AI/HPC (energy-compute seam) → energy --- + - {id: co-corz, name: Core Scientific, kind: filing, cluster: energy, ticker: CORZ, notes: "Ch.11 Dec 2022, relisted Jan 2024 — messy 2022-23 filings"} + - {id: co-iren, name: IREN (Iris Energy), kind: filing, cluster: energy, ticker: IREN, notes: "IPO Nov 2021 — backtest-era OK; foreign filer 20-F"} + - {id: co-wulf, name: TeraWulf, kind: filing, cluster: energy, ticker: WULF, notes: "Public Dec 2021 — backtest-era OK"} + - {id: co-cifr, name: Cipher Mining, kind: filing, cluster: energy, ticker: CIFR, notes: "Public Aug 2021 — backtest-era OK"} + # --- Debasement <-> bitcoin (treasury/custody) → bitcoin --- + - {id: co-mstr, name: Strategy (MicroStrategy), kind: filing, cluster: bitcoin, ticker: MSTR} + - {id: co-coin, name: Coinbase, kind: filing, cluster: bitcoin, ticker: COIN} + - {id: co-xyz, name: Block, kind: filing, cluster: bitcoin, ticker: XYZ, notes: "Ticker SQ→XYZ (2025) — verify"} + - {id: co-hood, name: Robinhood Markets, kind: filing, cluster: bitcoin, ticker: HOOD, notes: "Crypto/fintech broker — alongside COIN/XYZ. Public Jul 2021, backtest-era OK."} + - {id: co-xxi, name: Twenty One, kind: filing, cluster: bitcoin, ticker: XXI, notes: "Formed 2025 — does not exist in backtest era"} + # --- Major banks (general corpus + Battery incumbent-entry/supply axis). New 'banks' cluster = genuine cross-cluster independence vs bitcoin. 
--- + - {id: co-jpm, name: JPMorgan Chase, kind: filing, cluster: banks, ticker: JPM} + - {id: co-bac, name: Bank of America, kind: filing, cluster: banks, ticker: BAC} + - {id: co-c, name: Citigroup, kind: filing, cluster: banks, ticker: C} + - {id: co-wfc, name: Wells Fargo, kind: filing, cluster: banks, ticker: WFC} + - {id: co-gs, name: Goldman Sachs, kind: filing, cluster: banks, ticker: GS, notes: "Battery incumbent-entry: executed ONE BTC-collateralized loan Apr-2022 (token, not at-scale)."} + - {id: co-ms, name: Morgan Stanley, kind: filing, cluster: banks, ticker: MS} + + # ============================ PODCASTS (§7.4) ============================ + # Macro/monetary + - {id: pod-oddlots, name: Odd Lots, kind: podcast, cluster: macro, role: IND, notes: "highest-independence cross-domain"} + - {id: pod-forwardguidance, name: Forward Guidance, kind: podcast, cluster: macro, role: DX} + - {id: pod-macrovoices, name: Macro Voices, kind: podcast, cluster: macro, role: DX, notes: "energy-heavy"} + - {id: pod-grantwilliams, name: The Grant Williams Podcast, kind: podcast, cluster: macro, role: IND} + - {id: pod-monetarymatters, name: Monetary Matters, kind: podcast, cluster: macro, role: DX, notes: "launched ~2024 — thin backtest-era coverage"} + - {id: pod-hiddenforces, name: Hidden Forces, kind: podcast, cluster: macro, role: IND, notes: "highest-independence cross-domain"} + # AI/tech + - {id: pod-dwarkesh, name: Dwarkesh Podcast, kind: podcast, cluster: ai_tech, role: DX, notes: "launched ~2023 — partial backtest-era coverage; highest-independence"} + - {id: pod-nopriors, name: No Priors, kind: podcast, cluster: ai_tech, role: DX, notes: "VC-consensus cluster — discount internal convergence"} + - {id: pod-latentspace, name: Latent Space, kind: podcast, cluster: ai_tech, role: DX, notes: "technical; launched ~2023"} + - {id: pod-cognitiverev, name: Cognitive Revolution, kind: podcast, cluster: ai_tech, role: DX} + - {id: pod-bg2, name: BG2, kind: podcast, 
cluster: vc_consensus, role: DX, notes: "launched ~2023; mild Ten31 correlation; VC-consensus"} + - {id: pod-a16z, name: a16z Podcast, kind: podcast, cluster: vc_consensus, role: DX, notes: "crypto correlation; VC-consensus"} + # Energy + - {id: pod-catalyst, name: Catalyst w/ Shayle Kann, kind: podcast, cluster: energy, role: DX} + - {id: pod-columbiaenergy, name: Columbia Energy Exchange, kind: podcast, cluster: energy, role: DX} + - {id: pod-doomberg, name: Doomberg, kind: podcast, cluster: energy, role: IND} + # Bitcoin (limited, capped low — §4.5) + - {id: pod-bitcoinlayer, name: The Bitcoin Layer, kind: podcast, cluster: bitcoin, role: DX, cluster_capped_low: true, notes: "macro-literate"} + - {id: pod-whatbitcoindid, name: What Bitcoin Did, kind: podcast, cluster: bitcoin, role: none, cluster_capped_low: true} + # Generalist + - {id: pod-allin, name: All-In, kind: podcast, cluster: vc_consensus, role: CB, notes: "consensus barometer"} + - {id: pod-iltb, name: Invest Like the Best, kind: podcast, cluster: generalist, role: DX, notes: "cross-domain"} + - {id: pod-lex, name: Lex Fridman, kind: podcast, cluster: generalist, role: none, notes: "wide reach, variable"} diff --git a/signal_engine/__init__.py b/signal_engine/__init__.py new file mode 100644 index 0000000..1b333a5 --- /dev/null +++ b/signal_engine/__init__.py @@ -0,0 +1,11 @@ +"""Ten31 Signal Engine — pilot. + +A recurring pipeline that ingests audio + text, extracts structured propositions +locally, and surfaces signal over time. The discipline that separates signal from +plausible-sounding noise (handoff §5): statistics & graph structure NOMINATE +candidates; the frontier model only JUDGES and FANS OUT a pre-filtered shortlist. + +See README.md for the architecture and ten31-signal-engine-handoff.md for the spec. 
+""" + +__version__ = "0.1.0" diff --git a/signal_engine/__main__.py b/signal_engine/__main__.py new file mode 100644 index 0000000..bfdcd0c --- /dev/null +++ b/signal_engine/__main__.py @@ -0,0 +1,4 @@ +from .cli import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/signal_engine/backfill/__init__.py b/signal_engine/backfill/__init__.py new file mode 100644 index 0000000..b67aa80 --- /dev/null +++ b/signal_engine/backfill/__init__.py @@ -0,0 +1 @@ +"""Client-side backfill queue (§13.4). Producers enqueue; ONE worker drains sequentially.""" diff --git a/signal_engine/backfill/queue.py b/signal_engine/backfill/queue.py new file mode 100644 index 0000000..3c0e466 --- /dev/null +++ b/signal_engine/backfill/queue.py @@ -0,0 +1,123 @@ +"""Backfill job queue over the `backfill_jobs` table (§13.4). + +Model the corpus backfill as a managed GPU-hours queue, not a real-time fan-out. Producers +(ingestion) enqueue lightweight job descriptors; a SINGLE worker leases and drains them one at a +time so audio never goes parallel (→ 503). Jobs are: + - idempotent: UNIQUE(job_type, input_hash); re-enqueue of seen content is a no-op. + - crash-safe: leases expire, so a dead worker's job returns to the pool automatically. + - prioritized: lower `priority` runs first (backtest corpus + filings jump ahead). + +This is plain SQLite so the whole queue is `SELECT * FROM backfill_jobs`. +""" +from __future__ import annotations + +import sqlite3 +from typing import Any, Optional, Sequence + +LEASE_SECONDS_DEFAULT = 600 + + +def enqueue( + conn: sqlite3.Connection, + *, + job_type: str, + target_id: str, + input_hash: str, + parent_doc_id: str | None = None, + priority: int = 100, + max_attempts: int = 5, +) -> Optional[int]: + """Insert a job. 
Returns job_id, or None if this (job_type, input_hash) is already queued/done + (idempotent skip — §13.4).""" + cur = conn.execute( + """INSERT OR IGNORE INTO backfill_jobs + (job_type, target_id, parent_doc_id, priority, max_attempts, input_hash, state) + VALUES (?,?,?,?,?,?, 'pending')""", + (job_type, target_id, parent_doc_id, priority, max_attempts, input_hash), + ) + conn.commit() + return cur.lastrowid if cur.rowcount else None + + +def lease_next( + conn: sqlite3.Connection, + *, + worker_id: str, + job_types: Sequence[str] | None = None, + lease_seconds: int = LEASE_SECONDS_DEFAULT, +) -> Optional[sqlite3.Row]: + """Claim the highest-priority eligible job. Eligible = pending, OR a running/leased + job whose lease has expired (crash recovery). Increments `attempts`. NOTE(review): SELECT, then UPDATE by job_id only (no state re-check) — appears to rely on the single-worker model; confirm before adding workers.""" + params: list[Any] = [] + type_filter = "" + if job_types: + type_filter = f" AND job_type IN ({','.join('?' * len(job_types))})" + params.extend(job_types) + row = conn.execute( + f"""SELECT job_id FROM backfill_jobs + WHERE (state = 'pending' + OR (state IN ('leased','running') + AND lease_expires_at IS NOT NULL + AND lease_expires_at < datetime('now'))) + {type_filter} + ORDER BY priority ASC, job_id ASC + LIMIT 1""", + params, + ).fetchone() + if row is None: + return None + conn.execute( + """UPDATE backfill_jobs + SET state='running', lease_owner=?, lease_expires_at=datetime('now', ?), + attempts=attempts+1, updated_at=datetime('now') + WHERE job_id=?""", + (worker_id, f"+{int(lease_seconds)} seconds", row["job_id"]), + ) + conn.commit() + return conn.execute("SELECT * FROM backfill_jobs WHERE job_id=?", (row["job_id"],)).fetchone() + + +def complete(conn: sqlite3.Connection, job_id: int, *, output_ref: str | None = None, + gpu_seconds: float | None = None) -> None: + conn.execute( + """UPDATE backfill_jobs SET state='done', output_ref=?, gpu_seconds=?, error=NULL, + updated_at=datetime('now') WHERE job_id=?""", + (output_ref, gpu_seconds, job_id), + ) + conn.commit() + + 
+def fail(conn: sqlite3.Connection, job_id: int, error: Any) -> str: + """Retry (→ pending) if attempts remain, else dead-letter (→ failed). Returns the new state.""" + row = conn.execute( + "SELECT attempts, max_attempts FROM backfill_jobs WHERE job_id=?", (job_id,) + ).fetchone() + exhausted = bool(row) and row["attempts"] >= row["max_attempts"] + new_state = "failed" if exhausted else "pending" + conn.execute( + """UPDATE backfill_jobs SET state=?, error=?, lease_owner=NULL, lease_expires_at=NULL, + updated_at=datetime('now') WHERE job_id=?""", + (new_state, str(error)[:2000], job_id), + ) + conn.commit() + return new_state + + +def skip(conn: sqlite3.Connection, job_id: int, reason: str | None = None) -> None: + """Terminal non-error skip (e.g. a chunk that produced zero claims is still 'done', but an + intentionally dropped job is 'skipped').""" + conn.execute( + "UPDATE backfill_jobs SET state='skipped', error=?, updated_at=datetime('now') WHERE job_id=?", + (reason, job_id), + ) + conn.commit() + + +def stats(conn: sqlite3.Connection) -> dict[str, dict[str, int]]: + rows = conn.execute( + "SELECT job_type, state, COUNT(*) AS n FROM backfill_jobs GROUP BY job_type, state" + ).fetchall() + out: dict[str, dict[str, int]] = {} + for r in rows: + out.setdefault(r["job_type"], {})[r["state"]] = r["n"] + return out diff --git a/signal_engine/cli.py b/signal_engine/cli.py new file mode 100644 index 0000000..27eefd4 --- /dev/null +++ b/signal_engine/cli.py @@ -0,0 +1,619 @@ +"""Pilot CLI. Subcommands map to the build order in handoff §11. + +Implemented so far: init-db, seed-convictions, spark-status, db-tables, plus the seeding/ingest handlers defined below (cmd_seed_sources, cmd_seed_edges, cmd_load_feeds, cmd_ingest_edgar). +Later stages (further ingest, extract, score, judge, eval-ui) are added as they're built. 
+""" +from __future__ import annotations + +import argparse +import logging +import sys +from pathlib import Path + +from .config import load_config +from .store import db +from .store.seed import load_convictions, load_fanout +from .store.sources import load_source_edges, load_sources, update_feeds + +DEFAULT_CONVICTION_SEED = Path("seeds/conviction_log.seed.yaml") +DEFAULT_SOURCES_SEED = Path("seeds/sources.seed.yaml") +DEFAULT_FEEDS_SEED = Path("seeds/podcast_feeds.resolved.yaml") + + +def _setup_logging(level: str) -> None: + logging.basicConfig(level=getattr(logging, level.upper(), logging.INFO), + format="%(asctime)s %(levelname)s %(name)s: %(message)s") + + +def cmd_init_db(args: argparse.Namespace) -> int: + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + print(f"Initialized DB at {cfg.db_path}") + print("Tables/views:", ", ".join(db.table_names(conn))) + return 0 + + +def cmd_seed_convictions(args: argparse.Namespace) -> int: + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) # ensure schema exists + path = Path(args.file) + n = load_convictions(conn, path) + print(f"Upserted {n} convictions from {path}") + breakers = conn.execute( + "SELECT conviction_id, thematic_proposition FROM conviction_log WHERE is_thesis_breaker = 1" + ).fetchall() + if breakers: + print("Thesis-breakers loaded (engine must surface these AGAINST the thesis, §5.7):") + for b in breakers: + print(f" {b['conviction_id']}: {b['thematic_proposition'][:80]}...") + return 0 + + +def cmd_seed_sources(args: argparse.Namespace) -> int: + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + n = load_sources(conn, Path(args.file)) + by_kind = conn.execute( + "SELECT kind, COUNT(*) n FROM sources GROUP BY kind ORDER BY kind" + ).fetchall() + print(f"Upserted {n} sources from {args.file}") + for r in by_kind: + print(f" {r['kind']}: {r['n']}") + return 0 + + +def cmd_seed_edges(args: argparse.Namespace) -> int: + cfg = 
load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + n = load_source_edges(conn, Path(args.file)) + total = conn.execute("SELECT COUNT(*) FROM source_edges").fetchone()[0] + print(f"Inserted {n} new edges from {args.file} ({total} edges total)") + return 0 + + +def cmd_load_feeds(args: argparse.Namespace) -> int: + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + n = update_feeds(conn, Path(args.file)) + print(f"updated {n} podcast feeds") + rows = conn.execute( + "SELECT backtest_2022_2023, COUNT(*) c FROM sources WHERE kind='podcast' " + "GROUP BY backtest_2022_2023 ORDER BY c DESC" + ).fetchall() + print("backtest 2022-2023 reach:") + for r in rows: + print(f" {r['backtest_2022_2023'] or 'unset'}: {r['c']}") + return 0 + + +def cmd_ingest_edgar(args: argparse.Namespace) -> int: + from .ingest.edgar import EdgarClient, ingest_filings + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + client = EdgarClient(cfg.edgar_user_agent) + forms = tuple(f.strip() for f in args.forms.split(",")) if args.forms else ("10-K", "10-Q", "8-K") + + # resolve source_id from ticker (create a lightweight source row if not seeded) + row = conn.execute("SELECT source_id FROM sources WHERE upper(ticker)=upper(?)", (args.ticker,)).fetchone() + if row: + source_id = row["source_id"] + else: + source_id = f"co-{args.ticker.lower()}" + conn.execute( + "INSERT OR IGNORE INTO sources (source_id, name, kind, ticker) VALUES (?,?,?,?)", + (source_id, args.ticker, "filing", args.ticker.upper()), + ) + conn.commit() + + n_docs, n_jobs = ingest_filings(conn, client, source_id=source_id, ticker=args.ticker, + since=args.since, until=args.until, forms=forms) + print(f"{args.ticker}: +{n_docs} filing documents, +{n_jobs} extract jobs queued " + f"(forms={','.join(forms)}, since={args.since}, until={args.until})") + return 0 + + +def _resolve_source_id(conn, ticker: str, kind: str = "filing") -> str: + row = conn.execute("SELECT source_id 
FROM sources WHERE upper(ticker)=upper(?)", (ticker,)).fetchone() + if row: + return row["source_id"] + source_id = f"co-{ticker.lower()}" + conn.execute("INSERT OR IGNORE INTO sources (source_id, name, kind, ticker) VALUES (?,?,?,?)", + (source_id, ticker.upper(), kind, ticker.upper())) + conn.commit() + return source_id + + +def cmd_ingest_doc(args: argparse.Namespace) -> int: + from .ingest.docs import ingest_one + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + doc_id = ingest_one(conn, cfg, source_id=args.source, url=args.url, + title=args.title or args.url, date=args.date, method=args.method) + print(f"ingested: {doc_id}" if doc_id else "no new doc (duplicate / too short / fetch failed)") + return 0 + + +def cmd_ingest_feed_text(args: argparse.Namespace) -> int: + from .ingest.docs import ingest_feed_text + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + n = ingest_feed_text(conn, cfg, source_id=args.source, rss_url=args.url, + since=args.since, until=args.until, limit=args.limit) + print(f"ingested {n} article docs from feed for {args.source}") + return 0 + + +def cmd_ingest_doc_manifest(args: argparse.Namespace) -> int: + from .ingest.docs import ingest_manifest + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + r = ingest_manifest(conn, cfg, Path(args.file)) + print(f"manifest: ingested={r['ingested']} skipped={r['skipped']} missing_source={r['missing_source']}") + return 0 + + +def cmd_ingest_earnings(args: argparse.Namespace) -> int: + from .ingest.earnings import FMPClient, ingest_for_ticker + cfg = load_config() + if not cfg.fmp_api_key: + print("FMP_API_KEY not set", file=sys.stderr) + return 1 + conn = db.connect(cfg.db_path) + db.init_db(conn) + fmp = FMPClient(cfg.fmp_api_key) + source_id = _resolve_source_id(conn, args.ticker) + n_docs, n_jobs = ingest_for_ticker(conn, fmp, source_id=source_id, symbol=args.ticker.upper(), + data_dir=cfg.data_dir, since=args.since, 
until=args.until, limit=args.limit) + print(f"{args.ticker}: +{n_docs} earnings transcripts, +{n_jobs} extract jobs (since={args.since}, until={args.until})") + return 0 + + +def cmd_embed_claims(args: argparse.Namespace) -> int: + from .spark import from_config + from .embedstore.qdrant_store import get_client, ensure_collection, upsert_pending + from .embedstore.embedder import SparseEmbedder + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + sc = from_config(cfg) + client = get_client(args.qdrant_url) + created = ensure_collection(client) + print(f"collection {'created' if created else 'exists'}") + sparse = SparseEmbedder() if not args.no_sparse else None + n = upsert_pending(conn, sc, client, sparse) + print(f"embedded + upserted {n} propositions (sparse={'on' if sparse and sparse.available else 'off'})") + return 0 + + +def cmd_search(args: argparse.Namespace) -> int: + import json  # local import: cli.py does not import json at module level + from .spark import from_config + sc = from_config(load_config()) + res = sc.search(args.query, collection="propositions", top_k=args.top_k, rerank=not args.no_rerank) + hits = res.get("results") or res.get("hits") or res + print(json.dumps(hits, indent=2)[:2500]) + return 0 + + +def cmd_ingest_podcast(args: argparse.Namespace) -> int: + from .ingest.podcasts import ingest_rss, ingest_youtube + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + src = conn.execute("SELECT * FROM sources WHERE source_id=?", (args.source,)).fetchone() + if not src: + print(f"unknown source {args.source}", file=sys.stderr) + return 1 + via = args.via + if via == "auto": + via = "youtube" if (src["backtest_2022_2023"] == "youtube_only" and args.since) else "rss" + fn = ingest_youtube if via == "youtube" else ingest_rss + n_docs, n_jobs = fn(conn, src, since=args.since, until=args.until, limit=args.limit) + print(f"{src['name']} via {via}: +{n_docs} episodes, +{n_jobs} transcribe jobs") + return 0 + + +def cmd_run_transcribe(args: argparse.Namespace) -> int: 
+ from .spark import from_config + from .ingest.transcribe_worker import run_transcribe + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + sc = from_config(cfg) + result = run_transcribe(conn, sc, cfg, limit=args.limit, max_chunks=args.max_chunks) + print(f"transcription: {result['jobs_processed']} jobs processed") + return 0 + + +def cmd_run_transcribe_gemini(args: argparse.Namespace) -> int: + from .ingest.gemini_transcribe import run_transcribe_gemini + cfg = load_config() + conn = db.connect(cfg.db_path) + r = run_transcribe_gemini(conn, cfg, limit=args.limit, concurrency=args.concurrency) + tok_in, tok_out = r["prompt_tokens"], r["output_tokens"] + # Gemini 2.5 Flash list price: ~$0.30/1M text-in, audio-in ~$1.00/1M, $2.50/1M out. Audio dominates in. + est = tok_in / 1_000_000 * 1.00 + tok_out / 1_000_000 * 2.50 + print(f"gemini transcribe: done={r['done']} failed={r['failed']} | " + f"tokens in={tok_in:,} out={tok_out:,} | ~${est:.2f} this run (≈${est/max(r['done'],1):.3f}/ep)") + return 0 + + +def cmd_run_extract(args: argparse.Namespace) -> int: + from .spark import from_config + from .extract.worker import run_extract + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + sc = from_config(cfg) + result = run_extract(conn, sc, cfg, limit=args.limit, max_chunks_per_doc=args.max_chunks) + print(f"extraction: {result['jobs_processed']} jobs, {result['claims_written']} claims written") + return 0 + + +def cmd_queue_status(args: argparse.Namespace) -> int: + from .backfill import queue + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + s = queue.stats(conn) + if not s: + print("queue empty") + return 0 + for job_type, states in sorted(s.items()): + parts = ", ".join(f"{st}={n}" for st, n in sorted(states.items())) + print(f" {job_type}: {parts}") + return 0 + + +def cmd_feed_peek(args: argparse.Namespace) -> int: + from .ingest.feeds import fetch_feed, episode_records + parsed = 
fetch_feed(args.url) + status = getattr(parsed, "status", None) + recs = episode_records(parsed) + print(f"status={status} bozo={getattr(parsed, 'bozo', None)} episodes_with_audio={len(recs)}") + for r in recs[: args.limit]: + print(f" [{r['published']}] {str(r['title'])[:70]}") + if recs: + print(f"oldest in feed: {recs[-1]['published']} newest: {recs[0]['published']}") + return 0 + + +def cmd_serve(args: argparse.Namespace) -> int: + import uvicorn + from .ui.app import create_app + cfg = load_config() + port = args.port or cfg.ui_port + print(f"serving corpus UI on http://0.0.0.0:{port}") + uvicorn.run(create_app(), host="0.0.0.0", port=port) + return 0 + + +def cmd_seed_fanout(args: argparse.Namespace) -> int: + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + n = load_fanout(conn, Path(args.file)) + print(f"seeded {n} fan-out derivative nodes") + return 0 + + +def cmd_backtest(args: argparse.Namespace) -> int: + from .spark import from_config + from .signals.run import run_backtest + from datetime import datetime, timedelta + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + sc = from_config(cfg) + # monthly as_of march + start = datetime.strptime(args.start, "%Y-%m-%d") + end = datetime.strptime(args.end, "%Y-%m-%d") + dates, d = [], start + while d <= end: + dates.append(d.strftime("%Y-%m-%d")) + d = d + timedelta(days=args.step_days) + print(f"§7.1 backtest: conviction={args.conviction}, as_of march {args.start}→{args.end} ({len(dates)} points)") + timeline = run_backtest(conn, sc, cfg, conviction_id=args.conviction, dates=dates, window_days=args.window_days) + + # report: per-node first-clear date + score trajectory; highlight the headline derivative + print("\n=== node trajectories (score by as_of; ★=cleared evidence bar) ===") + nodes = {} + for as_of, res in timeline: + for r in res: + key = r["node"]["node_id"] or r["node"]["conviction_id"] + nodes.setdefault(key, []).append((as_of, r["result"]["score"], 
r["evidence"], r["promotion"], r["result"]["inputs"])) + for key, traj in sorted(nodes.items()): + first = next((t for t in traj if t[2]), None) + peak = max(traj, key=lambda t: t[1]) + mark = f"first-cleared {first[0]}" if first else "never cleared" + print(f" {key:28} peak={peak[1]:.2f} {mark}") + head = nodes.get(args.headline) + if head: + print(f"\n=== HEADLINE derivative: {args.headline} ===") + for as_of, score, ev, pr, inp in head: + star = "★" if ev else ("·" if score > 0 else " ") + print(f" {as_of} {star} score={score:.2f} corrob={inp.get('corroboration',0)} " + f"n_conf={inp.get('n_confirmed',0)} eisc={inp.get('eisc_corrob',0)} " + f"a={inp.get('a_corrob',0)} k_eff={inp.get('k_eff0',0)}") + firstclear = next((t for t in head if t[2]), None) + print(f"\n VERDICT: headline power-infra derivative " + f"{'SURFACED at ' + firstclear[0] if firstclear else 'did NOT surface'} " + f"(bar = under_acted ≥ {0.3})") + return 0 + + +def cmd_two_sided(args: argparse.Namespace) -> int: + """Two-sided net-corroboration trajectory (DESIGN_v2.1 H5) for the adversarial cases. + BATTERY: demand-net should rise while supply-net stays flat. STRIKE: net stays quiet in live, fires in test.""" + from .spark import from_config as spark_from_config + from .extract.backends import from_config as backend_from_config + from .signals.two_sided import trajectory + cfg = load_config() + conn = db.connect(cfg.db_path) + sc = spark_from_config(cfg) + backend = backend_from_config(cfg, sc) + nodes = conn.execute( + "SELECT node_id, derivative_proposition FROM fanout_nodes WHERE parent_conviction_id=? 
ORDER BY node_id", + (args.conviction,), + ).fetchall() + dates = [d.strip() for d in args.dates.split(",")] + filt = [s for s in args.nodes.split(",") if s] if args.nodes else [] + for r in nodes: + if filt and not any(k.lower() in r["node_id"].lower() for k in filt): + continue + for mode in [m.strip() for m in args.modes.split(",")]: + traj = trajectory(conn, sc, backend, r["derivative_proposition"], dates, + window_days=args.window_days, mode=mode) + print(f"\n### {r['node_id']} [mode={mode}, window={args.window_days}d] ###") + for pt in traj: + print(f" {pt['as_of']}: net={pt['net']:+.2f} " + f"affirm(eisc={pt['affirms_eisc']}, hard_src={pt.get('hard_affirm_src','?')}, " + f"n_claims={pt['n_affirm']}, soft_dropped={pt.get('soft_affirm_src_dropped','?')}) " + f"deny(eisc={pt['denies_eisc']}, n={pt['n_deny']}) " + f"own_net={pt['own_network_affirm_src']}") + return 0 + + +def cmd_confusion(args: argparse.Namespace) -> int: + from .signals.confusion import run_confusion + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + out = run_confusion(conn, cfg, args.spec) + classify = out["classify"] + print("=== PRE-REGISTERED confusion matrix (DESIGN_v2 §1) — precision AND recall; RUNWAY = frac of move still ahead at signal ===") + print(f"{'derivative':26} {'reprice?':8} {'peak%':>6} {'whisper':>9} {'run_wh':>6} {'cleared':>9} {'run_cl':>6} cl/wh") + for r in out["rows"]: + cl, wh = classify(r, "cleared"), classify(r, "whisper") + miss = f" (no px:{','.join(r['missing'])})" if r["missing"] else "" + print(f"{r['node']:26} {('REAL' if r['confirmed'] else 'no'):8} {str(r['peak_pct']):>6} " + f"{str(r['whisper_date'] or '-'):>9} {str(r['runway_whisper'] if r['runway_whisper'] is not None else '-'):>6} " + f"{str(r['cleared_date'] or '-'):>9} {str(r['runway_cleared'] if r['runway_cleared'] is not None else '-'):>6} " + f"{cl}/{wh}{miss}") + for level in ("cleared", "whisper"): + c, p, rec = out[level] + print(f"\n{level.upper()} level: TP={c['TP']} 
FP={c['FP']} FN={c['FN']} TN={c['TN']} | " + f"precision={p if p is None else round(p,2)} recall={rec if rec is None else round(rec,2)}") + print("\nlead_* = days the repricing came AFTER the signal (positive = engine was early).") + print("The cleared→whisper delta = what the independence floor cost in lead time / recall.") + return 0 + + +def cmd_provenance(args: argparse.Namespace) -> int: + """The processing log — what's been ingested/processed, so we never reprocess silently.""" + cfg = load_config() + conn = db.connect(cfg.db_path) + db.init_db(conn) + print("processed documents (the durable log):") + for r in conn.execute( + "SELECT kind, COUNT(*) total, SUM(CASE WHEN processed_at IS NOT NULL THEN 1 ELSE 0 END) proc " + "FROM documents GROUP BY kind ORDER BY kind" + ): + print(f" {r['kind']:14} {r['proc']}/{r['total']} processed") + print("dedup model: (1) UNIQUE(source_id, external_id) = robust pre-GPU guard; " + "(2) dedup_key = cross-mirror (title+date); content_hash = audit only.") + dups = conn.execute( + "SELECT dedup_key, COUNT(*) c FROM documents WHERE dedup_key IS NOT NULL " + "GROUP BY dedup_key HAVING c > 1" + ).fetchall() + print(f"cross-mirror dedup_key groups (same episode via >1 feed): {len(dups)}") + miss = conn.execute("SELECT COUNT(*) FROM documents WHERE dedup_key IS NULL").fetchone()[0] + if miss: + print(f" ({miss} docs missing dedup_key — run `provenance --backfill-hashes`)") + if args.backfill_hashes: + import hashlib + import os + from .util import audio_dedup_key + ndk = nch = 0 + for r in conn.execute("SELECT doc_id, kind, title, date, external_id, transcript_path, dedup_key, content_hash FROM documents"): + updates: dict = {} + if not r["dedup_key"]: + updates["dedup_key"] = (audio_dedup_key(r["title"], r["date"]) + if r["kind"] in ("podcast", "youtube") else r["external_id"]) + ndk += 1 + if not r["content_hash"] and r["transcript_path"] and os.path.exists(r["transcript_path"]): + updates["content_hash"] = 
hashlib.sha256(Path(r["transcript_path"]).read_bytes()).hexdigest()  # read_bytes() opens and closes the file itself + nch += 1 + if updates: + sets = ", ".join(f"{k}=?" for k in updates) + conn.execute(f"UPDATE documents SET {sets} WHERE doc_id=?", (*updates.values(), r["doc_id"])) + conn.commit() + print(f"backfilled {ndk} dedup_keys, {nch} content hashes (audit)") + return 0 + + +def cmd_db_tables(args: argparse.Namespace) -> int: + cfg = load_config() + conn = db.connect(cfg.db_path) + for t in db.table_names(conn): + print(t) + return 0 + + +def cmd_spark_status(args: argparse.Namespace) -> int: + from .spark import from_config + cfg = load_config() + sc = from_config(cfg) + try: + print("status:", sc.status()) + print("endpoints:", sc.endpoints()) + return 0 + except Exception as e: # noqa: BLE001 — health probe; surface, don't crash + print(f"Spark Control unreachable at {cfg.spark_control_url}: {e}", file=sys.stderr) + return 1 + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser(prog="signal_engine", description="Ten31 Signal Engine (pilot)") + sub = p.add_subparsers(dest="command", required=True) + + sub.add_parser("init-db", help="Create the SQLite schema").set_defaults(func=cmd_init_db) + + sp = sub.add_parser("seed-convictions", help="Load the conviction log (§3.1)") + sp.add_argument("--file", default=str(DEFAULT_CONVICTION_SEED)) + sp.set_defaults(func=cmd_seed_convictions) + + ss = sub.add_parser("seed-sources", help="Load the source registry (§7.3/§7.4)") + ss.add_argument("--file", default=str(DEFAULT_SOURCES_SEED)) + ss.set_defaults(func=cmd_seed_sources) + + sde = sub.add_parser("seed-edges", help="Seed EISC connectedness edges (priors) idempotently") + sde.add_argument("--file", default="seeds/source_edges.bitcoin.seed.yaml") + sde.set_defaults(func=cmd_seed_edges) + + lf = sub.add_parser("load-feeds", help="Apply resolved/verified podcast feed URLs + backtest reach") + lf.add_argument("--file", default=str(DEFAULT_FEEDS_SEED)) + 
lf.set_defaults(func=cmd_load_feeds) + + sf = sub.add_parser("seed-fanout", help="Load the hand-written fan-out tree (§7.1 backtest)") + sf.add_argument("--file", default="seeds/fanout.K2023.seed.yaml") + sf.set_defaults(func=cmd_seed_fanout) + + bt = sub.add_parser("backtest", help="Run the §7.1 under-acted-conviction backtest (as-of march)") + bt.add_argument("--conviction", default="K2023") + bt.add_argument("--start", default="2023-01-01") + bt.add_argument("--end", default="2024-06-01") + bt.add_argument("--step-days", type=int, default=30) + bt.add_argument("--window-days", type=int, default=90, help="~quarterly for filings/earnings cadence") + bt.add_argument("--headline", default="K2023-picks-and-shovels") + bt.set_defaults(func=cmd_backtest) + + ie = sub.add_parser("ingest-edgar", help="Fetch SEC filings for a ticker → documents + extract jobs") + ie.add_argument("--ticker", required=True) + ie.add_argument("--since", help="ISO date lower bound, e.g. 2022-01-01") + ie.add_argument("--until", help="ISO date upper bound, e.g. 
2023-12-31") + ie.add_argument("--forms", help="comma list, default 10-K,10-Q,8-K") + ie.set_defaults(func=cmd_ingest_edgar) + + idoc = sub.add_parser("ingest-doc", help="Fetch one text doc (HTML/PDF) → document + extract job (Battery corpus)") + idoc.add_argument("--source", required=True, help="source_id (must exist)") + idoc.add_argument("--url", required=True) + idoc.add_argument("--title") + idoc.add_argument("--date", help="ISO date of the document") + idoc.add_argument("--method", choices=["auto", "html", "pdf"], default="auto") + idoc.set_defaults(func=cmd_ingest_doc) + + idm = sub.add_parser("ingest-doc-manifest", help="Batch-ingest a YAML doc manifest (Battery corpus)") + idm.add_argument("--file", default="seeds/battery_docs.manifest.yaml") + idm.set_defaults(func=cmd_ingest_doc_manifest) + + ift = sub.add_parser("ingest-feed-text", help="Ingest article bodies behind a text RSS feed (blog/press)") + ift.add_argument("--source", required=True) + ift.add_argument("--url", required=True, help="RSS feed URL") + ift.add_argument("--since") + ift.add_argument("--until") + ift.add_argument("--limit", type=int, default=50) + ift.set_defaults(func=cmd_ingest_feed_text) + + ge = sub.add_parser("ingest-earnings", help="Fetch FMP earnings transcripts → documents + extract jobs") + ge.add_argument("--ticker", required=True) + ge.add_argument("--since", help="ISO date lower bound (uses transcript date)") + ge.add_argument("--until", help="ISO date upper bound") + ge.add_argument("--limit", type=int, default=8) + ge.set_defaults(func=cmd_ingest_earnings) + + ts = sub.add_parser("two-sided", help="Two-sided net-corroboration trajectory (Strike/Battery adversarial cases)") + ts.add_argument("--conviction", default="BATTERY2022") + ts.add_argument("--nodes", default="", help="comma substrings to filter fan-out nodes, e.g. 
demand,supply") + ts.add_argument("--dates", default="2022-12-31,2023-06-30,2023-12-31,2024-06-30,2024-12-31") + ts.add_argument("--modes", default="live", help="comma list: live,test") + ts.add_argument("--window-days", type=int, default=365) + ts.set_defaults(func=cmd_two_sided) + + ec = sub.add_parser("embed-claims", help="Embed pending propositions → Qdrant hybrid collection (§4.3)") + ec.add_argument("--qdrant-url", default="http://192.168.1.87:6333") + ec.add_argument("--no-sparse", action="store_true", help="dense-only (skip BM25)") + ec.set_defaults(func=cmd_embed_claims) + + se = sub.add_parser("search", help="Hybrid search the proposition store via the gateway") + se.add_argument("--query", required=True) + se.add_argument("--top-k", type=int, default=8) + se.add_argument("--no-rerank", action="store_true") + se.set_defaults(func=cmd_search) + + ip = sub.add_parser("ingest-podcast", help="Register podcast episodes → transcribe jobs (RSS or YouTube)") + ip.add_argument("--source", required=True, help="source_id, e.g. 
pod-dwarkesh") + ip.add_argument("--via", choices=["auto", "rss", "youtube"], default="auto") + ip.add_argument("--since") + ip.add_argument("--until") + ip.add_argument("--limit", type=int, default=20) + ip.set_defaults(func=cmd_ingest_podcast) + + rt = sub.add_parser("run-transcribe", help="Drain 'transcribe' jobs → speaker-attributed transcripts + voiceprints") + rt.add_argument("--limit", type=int, default=5) + rt.add_argument("--max-chunks", type=int, default=999) + rt.set_defaults(func=cmd_run_transcribe) + + rtg = sub.add_parser("run-transcribe-gemini", + help="One-time backfill: drain 'transcribe' jobs via Gemini (off the Spark GPU)") + rtg.add_argument("--limit", type=int, default=5) + rtg.add_argument("--concurrency", type=int, default=4) + rtg.set_defaults(func=cmd_run_transcribe_gemini) + + re = sub.add_parser("run-extract", help="Drain 'extract' jobs → claims via the local LLM (§4.2)") + re.add_argument("--limit", type=int, default=5, help="max jobs to process this run") + re.add_argument("--max-chunks", type=int, default=4, help="max chunks per document") + re.set_defaults(func=cmd_run_extract) + + sub.add_parser("queue-status", help="Backfill queue counts by type/state").set_defaults(func=cmd_queue_status) + + fp = sub.add_parser("feed-peek", help="Parse an RSS feed and show episode coverage") + fp.add_argument("--url", required=True) + fp.add_argument("--limit", type=int, default=5) + fp.set_defaults(func=cmd_feed_peek) + + sv = sub.add_parser("serve", help="Run the corpus-management web UI (FastAPI)") + sv.add_argument("--port", type=int, default=None) + sv.set_defaults(func=cmd_serve) + + cm = sub.add_parser("confusion-matrix", help="Pre-registered precision/recall on the §7.1 derivatives (resolver)") + cm.add_argument("--spec", default="seeds/resolution.K2023.yaml") + cm.set_defaults(func=cmd_confusion) + + pv = sub.add_parser("provenance", help="Processing log: what's ingested/processed (dedup-safe)") + pv.add_argument("--backfill-hashes", 
action="store_true", help="compute content_hash for older transcripts") + pv.set_defaults(func=cmd_provenance) + + sub.add_parser("db-tables", help="List tables/views").set_defaults(func=cmd_db_tables) + sub.add_parser("spark-status", help="Probe Spark Control health").set_defaults(func=cmd_spark_status) + return p + + +def main(argv: list[str] | None = None) -> int: + args = build_parser().parse_args(argv) + cfg = load_config() + _setup_logging(cfg.log_level) + return args.func(args) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/signal_engine/config.py b/signal_engine/config.py new file mode 100644 index 0000000..2409352 --- /dev/null +++ b/signal_engine/config.py @@ -0,0 +1,101 @@ +"""Environment-driven configuration (handoff §10, §13). + +All config flows through env vars so the SAME code runs as a plain process now and, later, as a +StartOS s9pk daemon (which injects these via the daemon's `exec.env` from a `store.json` FileModel). +A local `.env` (gitignored) is loaded for convenience during the pilot. 
+ +Live values confirmed against the operator's gateway 2026-06-07 (GET /api/status,/api/endpoints): + gateway = https://192.168.1.72:62419 (self-signed → SPARK_VERIFY_TLS=false) + LLM = RedHatAI/Qwen3.6-35B-A3B-NVFP4 + embed = BAAI/bge-m3 (1024-d) rerank = BAAI/bge-reranker-v2-m3 + ASR = nvidia/parakeet-tdt-0.6b-v3 diarizer = nvidia/diar_sortformer_4spk-v1 +""" +from __future__ import annotations + +import os +from dataclasses import dataclass +from pathlib import Path + + +def _load_dotenv(path: str = ".env") -> None: + """Minimal .env loader (no dependency): KEY=VALUE lines populate os.environ if not already set.""" + p = Path(path) + if not p.exists(): + return + for line in p.read_text().splitlines(): + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, _, val = line.partition("=") + os.environ.setdefault(key.strip(), val.strip().strip('"').strip("'")) + + +def _env(key: str, default: str | None = None) -> str | None: + return os.environ.get(key, default) + + +@dataclass(frozen=True) +class Config: + spark_control_url: str + spark_verify_tls: bool + spark_timeout_s: float + audio_concurrency: int # global in-flight cap across BOTH parakeet audio endpoints (sit at 2, ceiling 3) + + local_llm_model: str + embed_model: str + transcribe_model: str + + anthropic_api_key: str | None + frontier_model: str + + # Extraction backend: 'local' (Qwen via Spark Control, default) | 'gemini' (batch overflow/fallback, §scaling) + extraction_backend: str + gemini_api_key: str | None + gemini_model: str + + fmp_api_key: str | None + edgar_user_agent: str + + data_dir: Path + database_url: str + audio_cache_dir: Path + + ui_port: int + log_level: str + + @classmethod + def from_env(cls) -> "Config": + _load_dotenv() + data_dir = Path(_env("DATA_DIR", "./data") or "./data") + return cls( + spark_control_url=_env("SPARK_CONTROL_URL", "https://192.168.1.72:62419") or "", + spark_verify_tls=(_env("SPARK_VERIFY_TLS", "false") or 
"false").lower() == "true", + spark_timeout_s=float(_env("SPARK_TIMEOUT_S", "180") or "180"), + audio_concurrency=min(3, max(1, int(_env("AUDIO_CONCURRENCY", "2") or "2"))), + local_llm_model=_env("LOCAL_LLM_MODEL", "RedHatAI/Qwen3.6-35B-A3B-NVFP4") or "", + embed_model=_env("EMBED_MODEL", "BAAI/bge-m3") or "", + transcribe_model=_env("TRANSCRIBE_MODEL", "nvidia/parakeet-tdt-0.6b-v3") or "", + anthropic_api_key=_env("ANTHROPIC_API_KEY"), + frontier_model=_env("FRONTIER_MODEL", "claude-opus-4-8") or "", + extraction_backend=_env("EXTRACTION_BACKEND", "local") or "local", + gemini_api_key=_env("GEMINI_API_KEY"), + gemini_model=_env("GEMINI_MODEL", "gemini-2.5-flash") or "", + fmp_api_key=_env("FMP_API_KEY"), + edgar_user_agent=_env("EDGAR_USER_AGENT", "Ten31 Research grant@ten31.xyz") or "", + data_dir=data_dir, + database_url=_env("DATABASE_URL", "") or "", + audio_cache_dir=Path(_env("AUDIO_CACHE_DIR", str(data_dir / "audio-cache")) or "audio-cache"), + ui_port=int(_env("UI_PORT", "8000") or "8000"), + log_level=_env("LOG_LEVEL", "INFO") or "INFO", + ) + + @property + def db_path(self) -> Path: + prefix = "sqlite:///" + if self.database_url.startswith(prefix): + return Path(self.database_url[len(prefix):]) + return self.data_dir / "signal.db" + + +def load_config() -> Config: + return Config.from_env() diff --git a/signal_engine/embedstore/__init__.py b/signal_engine/embedstore/__init__.py new file mode 100644 index 0000000..7617f9f --- /dev/null +++ b/signal_engine/embedstore/__init__.py @@ -0,0 +1,6 @@ +"""Embedding + vector storage (§4.3). + +Embed DISTILLED PROPOSITIONS (not raw chunks) into a Qdrant HYBRID collection: dense bge-m3 +(via the gateway) + BM25 sparse (client-side), so entity-heavy propositions (MSTR/Strategy/ +Microstrategy) match on the lexical leg too. Retrieval goes through the gateway's /api/search. 
+""" diff --git a/signal_engine/embedstore/embedder.py b/signal_engine/embedstore/embedder.py new file mode 100644 index 0000000..1fba3ac --- /dev/null +++ b/signal_engine/embedstore/embedder.py @@ -0,0 +1,36 @@ +"""Proposition embedding: dense (bge-m3 via gateway) + optional BM25 sparse (client-side).""" +from __future__ import annotations + +import logging + +log = logging.getLogger(__name__) + + +def dense_embed(sc, texts: list[str]) -> list[list[float]]: + """Dense bge-m3 (1024-d) via the gateway /v1/embeddings (§4.3).""" + resp = sc.embed(texts) + data = sorted(resp["data"], key=lambda d: d.get("index", 0)) + return [d["embedding"] for d in data] + + +class SparseEmbedder: + """BM25 sparse vectors via FastEmbed `Qdrant/bm25` (the operator's CRM uses this exact model, + with the collection's `modifier: idf`). Degrades gracefully to dense-only if fastembed is absent.""" + + def __init__(self, model_name: str = "Qdrant/bm25") -> None: + self.available = False + self._model = None + try: + from fastembed import SparseTextEmbedding + self._model = SparseTextEmbedding(model_name=model_name) + self.available = True + except Exception as e: # noqa: BLE001 + log.warning("fastembed sparse unavailable (%s) — upserting dense-only; add sparse later", e) + + def embed(self, texts: list[str]) -> list[dict | None]: + if not self.available or self._model is None: + return [None] * len(texts) + out: list[dict | None] = [] + for emb in self._model.embed(texts): + out.append({"indices": emb.indices.tolist(), "values": emb.values.tolist()}) + return out diff --git a/signal_engine/embedstore/qdrant_store.py b/signal_engine/embedstore/qdrant_store.py new file mode 100644 index 0000000..61a818e --- /dev/null +++ b/signal_engine/embedstore/qdrant_store.py @@ -0,0 +1,79 @@ +"""Qdrant hybrid collection: create + upsert distilled propositions (§4.3). + +Collection mgmt + upserts go DIRECT to Qdrant (§13.2 "(Qdrant direct) :6333"); retrieval goes +through the gateway's /api/search. 
Named dense vector `bge_m3` (1024-d cosine) + sparse `bm25` +(modifier IDF). Point id is a deterministic UUID5 of claim_id, so re-upsert is idempotent. +""" +from __future__ import annotations + +import logging +import sqlite3 +import uuid + +from qdrant_client import QdrantClient, models + +from .embedder import SparseEmbedder, dense_embed + +log = logging.getLogger(__name__) + +COLLECTION = "propositions" +DENSE = "bge_m3" +SPARSE = "bm25" +_NS = uuid.UUID("5f9b7e10-0000-4000-8000-000000000001") + +# Filterable payload (§4.3): stance/topic/cluster/date for stance distributions, time-windowed +# consensus, corroboration lookups. NEVER infer stance from vector distance (§2.2/§5.3). +_PAYLOAD_FIELDS = ( + "claim_id", "doc_id", "source_id", "source_cluster", "topic_canonical", "date", + "claim_type", "time_horizon", "confidence", "rel_polarity", "engages_consensus", + "counters_position", "thesis_seam", "salience", "claimant", "proposition", +) + + +def get_client(qdrant_url: str) -> QdrantClient: + return QdrantClient(url=qdrant_url, prefer_grpc=False, timeout=60) + + +def ensure_collection(client: QdrantClient, *, dim: int = 1024) -> bool: + names = [c.name for c in client.get_collections().collections] + if COLLECTION in names: + return False + client.create_collection( + collection_name=COLLECTION, + vectors_config={DENSE: models.VectorParams(size=dim, distance=models.Distance.COSINE)}, + sparse_vectors_config={SPARSE: models.SparseVectorParams(modifier=models.Modifier.IDF)}, + ) + log.info("created Qdrant collection %r (dense %s %dd + sparse %s/idf)", COLLECTION, DENSE, dim, SPARSE) + return True + + +def _point_id(claim_id: str) -> str: + return str(uuid.uuid5(_NS, claim_id)) + + +def upsert_pending(conn: sqlite3.Connection, sc, client: QdrantClient, + sparse: SparseEmbedder | None = None, *, batch: int = 64) -> int: + """Embed + upsert every claim that has no qdrant_point_id yet; back-link the id into SQLite.""" + rows = conn.execute("SELECT * FROM claims 
WHERE qdrant_point_id IS NULL").fetchall() + if not rows: + return 0 + total = 0 + for i in range(0, len(rows), batch): + chunk = rows[i:i + batch] + texts = [r["proposition"] for r in chunk] + dvecs = dense_embed(sc, texts) + svecs = sparse.embed(texts) if sparse else [None] * len(texts) + points = [] + for r, dv, sv in zip(chunk, dvecs, svecs): + vectors: dict = {DENSE: dv} + if sv is not None: + vectors[SPARSE] = models.SparseVector(indices=sv["indices"], values=sv["values"]) + payload = {f: r[f] for f in _PAYLOAD_FIELDS} + points.append(models.PointStruct(id=_point_id(r["claim_id"]), vector=vectors, payload=payload)) + client.upsert(collection_name=COLLECTION, points=points) + for r in chunk: + conn.execute("UPDATE claims SET qdrant_point_id=? WHERE claim_id=?", + (_point_id(r["claim_id"]), r["claim_id"])) + conn.commit() + total += len(chunk) + return total diff --git a/signal_engine/extract/__init__.py b/signal_engine/extract/__init__.py new file mode 100644 index 0000000..c54bebf --- /dev/null +++ b/signal_engine/extract/__init__.py @@ -0,0 +1,6 @@ +"""Extraction (§4.2) — local LLM → structured claim units. The cost & quality center. + +Emits at the level of the PROPOSITION: a passage may yield 0..N claims, and MOST passages yield +zero. An extractor that dutifully emits a claim per chunk reintroduces exactly the noise the rest +of the system is designed to remove. +""" diff --git a/signal_engine/extract/backends.py b/signal_engine/extract/backends.py new file mode 100644 index 0000000..b80b073 --- /dev/null +++ b/signal_engine/extract/backends.py @@ -0,0 +1,64 @@ +"""Pluggable extraction backends (§scaling). + +The §4.2 extractor calls a backend that turns chat messages into a JSON string. Default is the +LOCAL Qwen via Spark Control (the ~95%-local design). 
log = logging.getLogger(__name__)


class LocalQwenBackend:
    """Default backend: sends the chat messages through ``sc.chat`` and returns the reply text."""
    name = "local"

    def __init__(self, sc) -> None:
        self.sc = sc

    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
        """Return the first choice's message content as a JSON object string.

        An empty or missing content is returned as ``"{}"`` — the same fallback GeminiBackend
        uses — so callers always receive a string, as the signature promises.
        """
        resp = self.sc.chat(messages, json_object=True, temperature=0,
                            enable_thinking=False, max_tokens=max_tokens)
        return resp["choices"][0]["message"]["content"] or "{}"


class GeminiBackend:
    """Gemini fallback/overflow. Implemented against the `google-genai` SDK. NOTE: untested until a
    key is provided — validate end-to-end before relying on it for a real backfill. The async BATCH
    API is the eventual scale path; this synchronous form is the drop-in fallback."""
    name = "gemini"

    def __init__(self, api_key: str, model: str = "gemini-2.5-flash") -> None:
        from google import genai  # guarded import; pip install google-genai
        self._genai = genai
        self.client = genai.Client(api_key=api_key)
        self.model = model

    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
        """Return Gemini's reply text for ``messages`` (``"{}"`` when the reply is empty).

        All ``system`` messages are joined into the system instruction; every other message is
        joined into the single user content.
        """
        from google.genai import types
        system = "\n\n".join(m["content"] for m in messages if m["role"] == "system")
        user = "\n\n".join(m["content"] for m in messages if m["role"] != "system")
        resp = self.client.models.generate_content(
            model=self.model,
            contents=user,
            config=types.GenerateContentConfig(
                system_instruction=system or None,
                temperature=0,
                max_output_tokens=max_tokens,
                response_mime_type="application/json",
            ),
        )
        return resp.text or "{}"
cfg.gemini_api_key: + log.warning("EXTRACTION_BACKEND=gemini but GEMINI_API_KEY missing — falling back to local") + else: + return GeminiBackend(cfg.gemini_api_key, cfg.gemini_model) + return LocalQwenBackend(sc) diff --git a/signal_engine/extract/claims.py b/signal_engine/extract/claims.py new file mode 100644 index 0000000..1e6fb4b --- /dev/null +++ b/signal_engine/extract/claims.py @@ -0,0 +1,117 @@ +"""Claim extraction: text → 0..N claim units → SQLite (§4.2).""" +from __future__ import annotations + +import json +import logging +import sqlite3 +from typing import Any + +from .prompt import SEED_TOPICS, build_messages + +log = logging.getLogger(__name__) + +_ENUMS = { + "claim_type": {"interpretive", "predictive", "descriptive", "reactive"}, + "time_horizon": {"near", "medium", "long", "unspecified"}, + "confidence": {"low", "med", "high"}, + "thesis_seam": {"energy_compute", "debasement_bitcoin", "ai_data_ownership", "none"}, + "salience": {"central", "secondary", "aside"}, +} + + +def register_seed_topics(conn: sqlite3.Connection) -> None: + """Pre-load the controlled half of the hybrid topic vocabulary (§4.2).""" + for t in SEED_TOPICS: + conn.execute( + "INSERT INTO topics (topic_canonical, status) VALUES (?, 'controlled') " + "ON CONFLICT(topic_canonical) DO UPDATE SET status='controlled'", + (t,), + ) + conn.commit() + + +def chunk_text(text: str, max_chars: int) -> list[str]: + """Split on paragraph boundaries into windows that fit the model context alongside the prompt.""" + text = text.strip() + if not text: + return [] + if len(text) <= max_chars: + return [text] + chunks: list[str] = [] + cur: list[str] = [] + size = 0 + for para in text.split("\n\n"): + if size + len(para) > max_chars and cur: + chunks.append("\n\n".join(cur)) + cur, size = [], 0 + cur.append(para) + size += len(para) + 2 + if cur: + chunks.append("\n\n".join(cur)) + return chunks + + +def _parse_claims(content: str) -> list[dict]: + try: + obj = json.loads(content) + except 
Exception: + i, j = content.find("{"), content.rfind("}") + if i < 0 or j < 0: + return [] + try: + obj = json.loads(content[i:j + 1]) + except Exception: + return [] + claims = obj.get("claims", []) if isinstance(obj, dict) else [] + return [c for c in claims if isinstance(c, dict) and c.get("proposition")] + + +def extract_claims_from_text(backend, text: str, *, source_name: str, source_cluster: str | None, + date: str | None, kind: str) -> list[dict]: + """`backend` is any object with .complete_json(messages, max_tokens) -> str + (see extract.backends: LocalQwenBackend | GeminiBackend).""" + messages = build_messages(text, source_name=source_name, source_cluster=source_cluster, + date=date, kind=kind) + content = backend.complete_json(messages, max_tokens=4000) + return _parse_claims(content) + + +def _enum(c: dict, field: str, default: str) -> str: + v = c.get(field) + return v if v in _ENUMS[field] else default + + +def persist_claims(conn: sqlite3.Connection, *, doc: sqlite3.Row, source: sqlite3.Row | None, + claims: list[dict], chunk_idx: int) -> int: + n = 0 + cluster = source["source_cluster"] if source else None + for i, c in enumerate(claims): + seam = _enum(c, "thesis_seam", "none") + topic = c.get("topic_canonical") or None + if topic: + # register emergent topics BEFORE the claim (claims.topic_canonical is a FK → topics) + conn.execute( + "INSERT OR IGNORE INTO topics (topic_canonical, status, seam) VALUES (?, 'emergent', ?)", + (topic, seam), + ) + claim_id = f"{doc['doc_id']}:{chunk_idx}:{i}" + conn.execute( + """INSERT OR IGNORE INTO claims + (claim_id, doc_id, source_id, proposition, topic_canonical, topic_raw, claimant, + source_cluster, date, claim_type, time_horizon, confidence, rel_polarity, + engages_consensus, counters_position, thesis_seam, salience) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", + ( + claim_id, doc["doc_id"], doc["source_id"], str(c["proposition"])[:1000], + topic, c.get("topic_raw"), + c.get("claimant") or 
_SKIP_TAGS = {"script", "style", "head"}
_SKIP_PREFIXES = ("ix:hidden",)  # inline-XBRL hidden fact dump
_BLOCK_TAGS = {"p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6", "table"}


class _Stripper(HTMLParser):
    """Collects visible text: data inside skipped tags is dropped, block tags become newlines."""

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._skip_depth = 0  # >0 while inside a skipped element
        self._parts: list[str] = []

    def handle_starttag(self, tag: str, attrs) -> None:
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            self._skip_depth += 1
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_endtag(self, tag: str) -> None:
        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
            # clamp at 0 so a stray closing tag cannot push the depth negative
            self._skip_depth = max(0, self._skip_depth - 1)
        elif tag in _BLOCK_TAGS:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        if self._skip_depth == 0 and data.strip():
            self._parts.append(data)


def html_to_text(html: str, *, max_chars: int = 300_000) -> str:
    """Strip ``html`` to plain text, collapse whitespace, and truncate to ``max_chars``.

    Runs of spaces, tabs and non-breaking spaces become one space, three or more line breaks
    become one blank line, and every line is stripped.
    """
    p = _Stripper()
    p.feed(html)
    # close() flushes text the parser is still buffering (e.g. a trailing "R&D" with no tag
    # after it, held back in case the "&" starts a character reference).
    p.close()
    text = "".join(p._parts)
    text = re.sub(r"[ \t\xa0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
    text = "\n".join(line.strip() for line in text.splitlines())
    text = text.strip()
    return text[:max_chars]
+ +A CLAIM UNIT is a single normalized proposition that someone asserts — a forward-looking prediction, \ +an interpretive or causal judgment, or a stance taken against a prevailing view. It must be specific \ +enough to later be checked against the world. + +CRITICAL DISCIPLINE — be willing to extract NOTHING: +- Most passages contain ZERO claim units. Boilerplate, legal disclaimers, ad reads, pleasantries, \ +generic descriptions, routine financial line-items, and recitations of well-known news are NOT claims. +- Do NOT invent claims. Do NOT emit one claim per paragraph to seem thorough. If the passage has no \ +substantive proposition, return {"claims": []}. A precise empty answer is the correct, valued output. +- Extract at the level of the PROPOSITION: one normalized subject-assertion-object sentence each. A \ +single rich passage may yield several; a long dull one yields none. + +For EACH claim unit, output these fields: +- "proposition": one normalized sentence (subject-assertion-object), self-contained. +- "topic_canonical": a concise snake_case topic for clustering. REUSE one of the provided seed topics \ +when it fits; otherwise propose a new concise snake_case label. Normalize synonyms (Fed/FOMC/rates → fed_policy). +- "topic_raw": the topic as actually phrased in the passage. +- "claimant": who asserts it (speaker name or the filing company). Use "unknown" if unclear. +- "claim_type": one of interpretive | predictive | descriptive | reactive. (interpretive/predictive = \ +insight; descriptive/reactive = news echo — extract those only if clearly salient.) +- "time_horizon": one of near | medium | long | unspecified (for predictive claims especially). +- "confidence": the claimant's apparent conviction — one of low | med | high. +- "engages_consensus": true ONLY if the claim explicitly argues against a stated mainstream view. +- "counters_position": the mainstream position it argues against, or null. 
+- "thesis_seam": one of energy_compute | debasement_bitcoin | ai_data_ownership | none. This is a TAG \ +for relevance only — tag off-thesis claims "none" and STILL extract them. +- "salience": central | secondary | aside (how central the claim is to the passage). + +Return ONLY a JSON object: {"claims": [ {...}, ... ]}. No prose, no markdown.""" + + +def build_messages(text: str, *, source_name: str, source_cluster: str | None, + date: str | None, kind: str) -> list[dict[str, str]]: + seed = ", ".join(SEED_TOPICS) + context = ( + f"Source: {source_name or 'unknown'} (cluster: {source_cluster or 'n/a'}, type: {kind}, " + f"date: {date or 'n/a'}).\n" + f"Seed topics to reuse when they fit: {seed}.\n\n" + f"PASSAGE:\n{text}" + ) + return [ + {"role": "system", "content": _SYSTEM}, + {"role": "user", "content": context}, + ] diff --git a/signal_engine/extract/worker.py b/signal_engine/extract/worker.py new file mode 100644 index 0000000..fa0d0a0 --- /dev/null +++ b/signal_engine/extract/worker.py @@ -0,0 +1,69 @@ +"""Extraction worker — drains 'extract' jobs from the backfill queue (§4.2, §13.4). + +Single sequential worker by design: extraction is the heavier serial load on the one LLM GPU. +For each job: load the document, get its text (fetch+strip filing HTML, or read a stored transcript), +chunk it, run the §4.2 extractor per chunk, persist 0..N claims, complete the job. +""" +from __future__ import annotations + +import logging +from pathlib import Path + +import requests + +from ..backfill import queue +from . 
import claims as claims_mod +from .html_text import html_to_text + +log = logging.getLogger(__name__) + + +def _document_text(doc, *, user_agent: str) -> str: + if doc["transcript_path"]: + return Path(doc["transcript_path"]).read_text() + if doc["kind"] == "filing" and doc["url"]: + r = requests.get(doc["url"], headers={"User-Agent": user_agent}, timeout=90) + r.raise_for_status() + return html_to_text(r.text) + raise ValueError(f"no text source for {doc['doc_id']} (kind={doc['kind']}, url={doc['url']})") + + +def run_extract(conn, sc, cfg, *, limit: int = 10, max_chunks_per_doc: int = 4, + chunk_chars: int = 18_000, lease_seconds: int = 900, + worker_id: str = "extract-1") -> dict: + from .backends import from_config as backend_from_config + backend = backend_from_config(cfg, sc) + log.info("extraction backend: %s", backend.name) + claims_mod.register_seed_topics(conn) + processed = total_claims = 0 + while processed < limit: + job = queue.lease_next(conn, worker_id=worker_id, job_types=["extract"], lease_seconds=lease_seconds) + if job is None: + break + processed += 1 + doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone() + if doc is None: + queue.skip(conn, job["job_id"], "document missing") + continue + src = conn.execute("SELECT * FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone() + try: + text = _document_text(doc, user_agent=cfg.edgar_user_agent) + chunks = claims_mod.chunk_text(text, chunk_chars)[:max_chunks_per_doc] + doc_claims = 0 + for idx, chunk in enumerate(chunks): + cl = claims_mod.extract_claims_from_text( + backend, chunk, + source_name=src["name"] if src else "", + source_cluster=src["source_cluster"] if src else None, + date=doc["date"], kind=doc["kind"], + ) + doc_claims += claims_mod.persist_claims(conn, doc=doc, source=src, claims=cl, chunk_idx=idx) + conn.execute("UPDATE documents SET processed_at=datetime('now') WHERE doc_id=?", (doc["doc_id"],)) + conn.commit() + queue.complete(conn, 
job["job_id"], output_ref=f"{doc_claims} claims / {len(chunks)} chunks") + total_claims += doc_claims + log.info("extracted %d claims from %s (%d chunks)", doc_claims, doc["doc_id"], len(chunks)) + except Exception as e: # noqa: BLE001 + state = queue.fail(conn, job["job_id"], e) + log.warning("extract failed for %s: %s (→ %s)", job["target_id"], e, state) + return {"jobs_processed": processed, "claims_written": total_claims} diff --git a/signal_engine/ingest/__init__.py b/signal_engine/ingest/__init__.py new file mode 100644 index 0000000..a517ad8 --- /dev/null +++ b/signal_engine/ingest/__init__.py @@ -0,0 +1,5 @@ +"""Ingestion layer (§4.1) — the biggest greenfield piece. + +Spark Control transcribes audio you hand it; it does NOT fetch. Everything here is fetch/schedule: +RSS + YouTube + EDGAR + FMP earnings, long-audio chunking, and cross-chunk speaker stitching. +""" diff --git a/signal_engine/ingest/chunker.py b/signal_engine/ingest/chunker.py new file mode 100644 index 0000000..f8a61b6 --- /dev/null +++ b/signal_engine/ingest/chunker.py @@ -0,0 +1,36 @@ +"""Long-audio chunking (§4.1, §13.4). + +Podcasts run 1–3 h; the diarizer caps at 4 speakers/chunk and Spark 2 is a single GPU, so we cut +long audio into ~2–3 min pieces sent SEQUENTIALLY (parallel audio → 503 FFT race). Each chunk is +diarized independently and re-stitched across chunks by voiceprint (see speaker_stitch.py). +Requires ffmpeg/ffprobe. 
CHUNK_SECONDS_DEFAULT = 150  # 2.5 min, within the ~2–3 min guidance


def duration_seconds(src: str | Path) -> float:
    """Return the container duration of ``src`` in seconds, as reported by ffprobe.

    Raises subprocess.CalledProcessError if ffprobe exits non-zero.
    """
    out = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", str(src)],
        check=True, capture_output=True, text=True,
    )
    return float(out.stdout.strip())


def chunk_audio(src: str | Path, out_dir: str | Path, *, chunk_seconds: int = CHUNK_SECONDS_DEFAULT) -> list[Path]:
    """Split into fixed-length 16 kHz mono WAV chunks using ffmpeg's segment muxer.
    Returns chunk paths in order. Order matters: the queue sends them sequentially.

    Any ``chunk_*.wav`` already present in ``out_dir`` is removed first: the result is a glob
    of that directory, so leftovers from an earlier, longer input would otherwise be returned
    as if they belonged to ``src``.

    Raises:
        ValueError: if ``chunk_seconds`` is not positive (checked before touching the disk).
        subprocess.CalledProcessError: if ffmpeg exits non-zero.
    """
    if chunk_seconds <= 0:
        raise ValueError("chunk_seconds must be positive")
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    for stale in out_dir.glob("chunk_*.wav"):
        stale.unlink()
    pattern = str(out_dir / "chunk_%04d.wav")
    subprocess.run(
        ["ffmpeg", "-y", "-i", str(src), "-f", "segment", "-segment_time", str(chunk_seconds),
         "-ar", "16000", "-ac", "1", "-reset_timestamps", "1", pattern],
        check=True, capture_output=True,
    )
    return sorted(out_dir.glob("chunk_*.wav"))
Lineage/axis live on the source's cluster/notes (set in the seed); +policy sources are axis=context and must NOT feed the supply resolver (weight 0) — enforced downstream. +""" +from __future__ import annotations + +import hashlib +import io +import logging +import sqlite3 +from pathlib import Path + +import requests + +from ..backfill import queue +from ..extract.html_text import html_to_text +from .feeds import fetch_feed + +log = logging.getLogger(__name__) + +DEFAULT_UA = "ten31-signal-engine/1.0 (research; contact ops@ten31.xyz)" + + +def _pdf_to_text(data: bytes, *, max_chars: int) -> str: + import pypdf + reader = pypdf.PdfReader(io.BytesIO(data)) + parts: list[str] = [] + total = 0 + for page in reader.pages: + t = page.extract_text() or "" + parts.append(t) + total += len(t) + if total > max_chars: + break + return "\n".join(parts)[:max_chars] + + +def fetch_clean_text(url: str, *, method: str = "auto", ua: str = DEFAULT_UA, + timeout: int = 90, max_chars: int = 300_000) -> str: + """Fetch a URL once and return clean text. Auto-detects PDF vs HTML by content-type + magic bytes.""" + r = requests.get(url, headers={"User-Agent": ua}, timeout=timeout) + r.raise_for_status() + ctype = r.headers.get("Content-Type", "").lower() + is_pdf = method == "pdf" or "application/pdf" in ctype or r.content[:5] == b"%PDF-" + if is_pdf: + return _pdf_to_text(r.content, max_chars=max_chars) + return html_to_text(r.text, max_chars=max_chars) + + +_BLOCK_MARKERS = ( + "aggressive automated scraping", "request access", "access denied", "are you a robot", + "enable javascript", "captcha", "verify you are human", "rate limit exceeded", + "403 forbidden", "unusual traffic", "checking your browser", +) + + +def _looks_blocked(text: str) -> bool: + """Anti-scraping interstitials return 200 + a short access-denied body. 
Detect so we don't ingest + a block page as if it were the document (a real policy/blog doc is long and has no such markers).""" + low = text[:2500].lower() + return any(m in low for m in _BLOCK_MARKERS) + + +def _doc_id(source_id: str, url: str) -> str: + return f"doc:{source_id}:{hashlib.sha256(url.encode()).hexdigest()[:12]}" + + +def ingest_one(conn: sqlite3.Connection, cfg, *, source_id: str, url: str, title: str, + date: str | None, method: str = "auto", prompt_version: str = "extract-v0", + min_chars: int = 400) -> str | None: + """Fetch+store one text document and enqueue extraction. Idempotent on (source_id, url). + Returns doc_id if newly ingested, else None (duplicate, too-short, or fetch error → logged).""" + doc_id = _doc_id(source_id, url) + if conn.execute("SELECT 1 FROM documents WHERE doc_id=?", (doc_id,)).fetchone(): + return None + ua = getattr(cfg, "user_agent", None) or DEFAULT_UA + try: + text = fetch_clean_text(url, method=method, ua=ua) + except Exception as e: # noqa: BLE001 + log.warning("doc fetch failed %s: %s", url, e) + return None + if not text or len(text) < min_chars: + log.warning("doc too short (%d chars), skipping %s", len(text or ""), url) + return None + if _looks_blocked(text): + log.warning("blocked/anti-scrape page detected, skipping %s", url) + return None + safe = doc_id.replace(":", "_") + tpath = Path(cfg.data_dir) / "docs" / f"{safe}.txt" + tpath.parent.mkdir(parents=True, exist_ok=True) + tpath.write_text(text) + content_hash = hashlib.sha256(text.encode()).hexdigest() + conn.execute( + """INSERT OR IGNORE INTO documents + (doc_id, source_id, kind, external_id, url, title, date, transcript_path, content_hash, processed_at) + VALUES (?,?,?,?,?,?,?,?,?,datetime('now'))""", + (doc_id, source_id, "filing", url, url, title[:300] if title else url, date, str(tpath), content_hash), + ) + conn.commit() + h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest() + queue.enqueue(conn, job_type="extract", 
target_id=doc_id, input_hash=h, + parent_doc_id=doc_id, priority=50) + conn.commit() + log.info("ingested doc %s (%d chars) for %s", doc_id, len(text), source_id) + return doc_id + + +def ingest_manifest(conn: sqlite3.Connection, cfg, path) -> dict: + """Batch-ingest the docs listed in a YAML manifest ({docs:[{source,url,title,date,method}]}). + Returns {ingested, skipped, missing_source}. Each source must already exist (FK).""" + import yaml + from pathlib import Path as _Path + data = yaml.safe_load(_Path(path).read_text()) or {} + docs = data.get("docs", []) + ingested = skipped = missing = 0 + for d in docs: + src = d.get("source") + if not conn.execute("SELECT 1 FROM sources WHERE source_id=?", (src,)).fetchone(): + log.warning("manifest doc references missing source %r — skipping %s", src, d.get("url")) + missing += 1 + continue + doc_id = ingest_one(conn, cfg, source_id=src, url=d["url"], title=d.get("title", d["url"]), + date=d.get("date"), method=d.get("method", "auto")) + if doc_id: + ingested += 1 + else: + skipped += 1 + return {"ingested": ingested, "skipped": skipped, "missing_source": missing} + + +def ingest_feed_text(conn: sqlite3.Connection, cfg, *, source_id: str, rss_url: str, + since: str | None = None, until: str | None = None, limit: int = 50) -> int: + """Ingest the ARTICLE bodies behind a text RSS feed (blog/press feed). Each item's link is fetched + and stored as a dated text document. 
Returns count of newly-ingested docs.""" + from .feeds import _published_iso + parsed = fetch_feed(rss_url, user_agent=getattr(cfg, "user_agent", None) or DEFAULT_UA) + n = 0 + for entry in parsed.entries: + if n >= limit: + break + link = entry.get("link") + if not link: + continue + date = _published_iso(entry) + if since and date and date < since: + continue + if until and date and date > until: + continue + if ingest_one(conn, cfg, source_id=source_id, url=link, + title=entry.get("title", link), date=date): + n += 1 + return n diff --git a/signal_engine/ingest/download.py b/signal_engine/ingest/download.py new file mode 100644 index 0000000..e52c2bf --- /dev/null +++ b/signal_engine/ingest/download.py @@ -0,0 +1,61 @@ +"""Audio acquisition (§4.1). Spark Control transcribes audio you fetch — this fetches it. + +- Podcast enclosures: a plain streaming download that follows the Podtrac/Megaphone redirects to the + final signed CDN object (download immediately; resolved URLs carry short-lived params). +- YouTube: yt-dlp (audio-only → 16 kHz mono WAV). NOTE: 2026 YouTube enforces PO Tokens broadly — run + the `bgutil-ytdlp-pot-provider` sidecar or pulls will 403. yt-dlp is treated as a LAST resort; prefer + the RSS enclosure where a show publishes both (ToS: downloading YT audio violates YouTube ToS). 
+""" +from __future__ import annotations + +import subprocess +from pathlib import Path + +import requests + +DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)" + + +def download_enclosure(url: str, dest: str | Path, *, user_agent: str = DEFAULT_UA, timeout: int = 120) -> Path: + dest = Path(dest) + dest.parent.mkdir(parents=True, exist_ok=True) + with requests.get(url, stream=True, allow_redirects=True, + headers={"User-Agent": user_agent}, timeout=timeout) as r: + r.raise_for_status() + with open(dest, "wb") as f: + for chunk in r.iter_content(chunk_size=1 << 16): + f.write(chunk) + return dest + + +def to_wav_16k_mono(src: str | Path, dst: str | Path) -> Path: + """Normalize any audio to 16 kHz mono PCM WAV (what the ASR endpoint wants). Requires ffmpeg.""" + dst = Path(dst) + dst.parent.mkdir(parents=True, exist_ok=True) + subprocess.run( + ["ffmpeg", "-y", "-i", str(src), "-ar", "16000", "-ac", "1", "-f", "wav", str(dst)], + check=True, capture_output=True, + ) + return dst + + +def download_youtube_audio(url: str, out_dir: str | Path, *, archive_file: str | Path | None = None) -> Path: + """Audio-only via yt-dlp → 16 kHz mono WAV. `archive_file` (yt-dlp --download-archive) is the + canonical 'only-new' dedup for channel/playlist back-catalog pulls.""" + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + cmd = [ + "yt-dlp", "-f", "bestaudio/best", "-x", "--audio-format", "wav", + "--postprocessor-args", "ffmpeg:-ar 16000 -ac 1", + "-o", str(out_dir / "%(id)s.%(ext)s"), + "--no-progress", + ] + if archive_file: + cmd += ["--download-archive", str(archive_file)] + cmd.append(url) + subprocess.run(cmd, check=True, capture_output=True) + # yt-dlp names the file by video id; return the newest wav + wavs = sorted(out_dir.glob("*.wav"), key=lambda p: p.stat().st_mtime) + if not wavs: + raise RuntimeError("yt-dlp produced no wav (PO-token/cookies issue? 
class FMPClient:
    """Small HTTP client for the Financial Modeling Prep 'stable' API.

    A single `requests.Session` is created per client and reused for every call. The API key is
    added to each request as the `apikey` query parameter.
    """

    def __init__(self, api_key: str, *, base: str = FMP_BASE, timeout: int = 30) -> None:
        """Store credentials and endpoint settings.

        Raises:
            ValueError: if `api_key` is empty.
        """
        if not api_key:
            raise ValueError("FMP_API_KEY is required for earnings-call transcripts")
        self.api_key, self.base, self.timeout = api_key, base, timeout
        self.s = requests.Session()

    def _get(self, path: str, **params: Any) -> Any:
        """GET `{base}/{path}` with `params` plus the API key and return the decoded JSON body.

        `raise_for_status()` is called first, so a 4xx/5xx response raises instead of returning.
        """
        query = {**params, "apikey": self.api_key}
        response = self.s.get(f"{self.base}/{path}", params=query, timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    # Confirmed against FMP 'stable' 2026-06-07 (v3 is legacy/403). Note singular "earning".
    def transcript_dates(self, symbol: str) -> Any:
        """Index of the transcripts available for `symbol`: [{quarter, fiscalYear, date}, ...]."""
        endpoint = "earning-call-transcript-dates"
        return self._get(endpoint, symbol=symbol)

    def transcript(self, symbol: str, *, year: int, quarter: int) -> Any:
        """Fetch a single transcript: [{symbol, period, year, date, content}].

        Treat the returned `date` as the document date: FMP's year/quarter labels are fiscal and
        can be offset from the day the call actually took place.
        """
        endpoint = "earning-call-transcript"
        return self._get(endpoint, symbol=symbol, year=year, quarter=quarter)

    def earnings_calendar(self, *, from_date: str, to_date: str) -> Any:
        """Earnings calendar, used as the ingestion trigger: [{symbol, date, epsActual, ...}, ...]."""
        # "from" is a Python keyword, so the window has to be passed as an unpacked dict.
        window = {"from": from_date, "to": to_date}
        return self._get("earnings-calendar", **window)
def ingest_for_ticker(
    conn: sqlite3.Connection,
    fmp: FMPClient,
    *,
    source_id: str,
    symbol: str,
    data_dir: Path,
    since: str | None = None,
    until: str | None = None,
    limit: int = 8,
) -> tuple[int, int]:
    """Ingest the transcripts FMP lists for `symbol` whose call date lies in [since, until].

    The dates index (`fmp.transcript_dates`) is filtered by date, the first `limit` surviving
    entries (in the order FMP returned them) are fetched, and each non-empty transcript is handed
    to `ingest_transcript`. The stored document date is the transcript's own `date`, falling back
    to the index entry's date, because FMP's fiscal year/quarter labels can be offset from the
    call date.

    Args:
        conn: DB connection, passed through to `ingest_transcript`.
        fmp: client providing `transcript_dates` and `transcript`.
        source_id: source the documents are attributed to.
        symbol: ticker to ingest.
        data_dir: data root, passed through to `ingest_transcript`.
        since: inclusive lower bound as an ISO `YYYY-MM-DD` string, or None for no bound.
        until: inclusive upper bound as an ISO `YYYY-MM-DD` string, or None for no bound.
        limit: maximum number of index entries to fetch after filtering.

    Returns:
        (new_documents, new_jobs) summed over the ingested transcripts.
    """
    dates = fmp.transcript_dates(symbol)
    picked = []
    for d in dates if isinstance(dates, list) else []:
        dt = d.get("date")
        # Compare on the calendar day only. An index date may carry a time-of-day suffix
        # ("2024-05-02 17:00:00"), which as a raw string sorts AFTER until="2024-05-02" and would
        # wrongly drop a call held on the last day of the window.
        day = str(dt)[:10] if dt else None
        if since and day and day < since:
            continue
        if until and day and day > until:
            continue
        # Entries without a date pass both bounds and are kept.
        picked.append(d)
    n_docs = n_jobs = 0
    for d in picked[:limit]:
        tr = fmp.transcript(symbol, year=d["fiscalYear"], quarter=d["quarter"])
        # The endpoint returns a one-element list; tolerate a bare dict or an empty result.
        item = (tr[0] if isinstance(tr, list) and tr else tr) or {}
        content = item.get("content") or ""
        if not content:
            continue
        nd, nj = ingest_transcript(
            conn, source_id=source_id, symbol=symbol, year=d["fiscalYear"], quarter=d["quarter"],
            content=content, date=item.get("date") or d.get("date"), data_dir=data_dir,
        )
        n_docs += int(nd)
        n_jobs += int(nj)
    return n_docs, n_jobs
class EdgarClient:
    """Throttled client for the SEC submissions JSON API and the EDGAR filing archive.

    Every request goes through `_get`, which enforces a minimum gap of `min_interval` seconds
    between consecutive calls made by this instance.
    """

    BASE_DATA = "https://data.sec.gov"
    BASE_WWW = "https://www.sec.gov"

    def __init__(self, user_agent: str, *, min_interval: float = 0.12) -> None:
        """Build the session.

        Raises:
            ValueError: if `user_agent` is empty or contains no "@" (no contact email).
        """
        has_contact = bool(user_agent) and "@" in user_agent
        if not has_contact:
            raise ValueError("EDGAR requires a descriptive User-Agent with contact email (config.edgar_user_agent)")
        session = requests.Session()
        session.headers.update({"User-Agent": user_agent, "Accept-Encoding": "gzip, deflate"})
        self.s = session
        self.min_interval = min_interval
        self._last = 0.0  # time.monotonic() stamp of the previous request
        self._tickers: dict[str, int] | None = None  # lazily filled by ticker_map()

    def _throttle(self) -> None:
        """Sleep just long enough that requests are at least `min_interval` seconds apart."""
        elapsed = time.monotonic() - self._last
        if elapsed < self.min_interval:
            time.sleep(self.min_interval - elapsed)
        self._last = time.monotonic()

    def _get(self, url: str) -> requests.Response:
        """Throttled GET with a 30 s timeout; raises on a 4xx/5xx response."""
        self._throttle()
        response = self.s.get(url, timeout=30)
        response.raise_for_status()
        return response

    # ---- ticker → CIK ----
    def ticker_map(self) -> dict[str, int]:
        """Return {UPPERCASE ticker: CIK}, downloaded on first use and cached on the instance."""
        if self._tickers is None:
            payload = self._get(f"{self.BASE_WWW}/files/company_tickers.json").json()
            mapping: dict[str, int] = {}
            for row in payload.values():
                mapping[row["ticker"].upper()] = int(row["cik_str"])
            self._tickers = mapping
        return self._tickers

    def cik_for(self, ticker: str) -> int | None:
        """Case-insensitive ticker lookup; None when the ticker is not in the map."""
        table = self.ticker_map()
        return table.get(ticker.upper())

    # ---- filings ----
    def _iter_array(self, block: dict, forms, since, until) -> Iterator[dict]:
        """Turn a column-oriented filings block into row dicts, filtered by form and date.

        `block` maps each name in `_FILING_COLS` to a parallel list. A missing column becomes an
        empty list, so `zip` then yields nothing. Falsy `forms`/`since`/`until` disable that filter.
        """
        columns = [block.get(col, []) for col in _FILING_COLS]
        for accession, form, filed, primary_doc, description in zip(*columns):
            rejected = (
                (forms and form not in forms)
                or (since and filed < since)
                or (until and filed > until)
            )
            if rejected:
                continue
            yield {
                "accession": accession,
                "form": form,
                "filing_date": filed,
                "primary_document": primary_doc,
                "description": description,
            }

    def iter_filings(
        self,
        cik: int,
        *,
        forms: tuple[str, ...] = ("10-K", "10-Q", "8-K"),
        since: str | None = None,
        until: str | None = None,
    ) -> Iterator[dict]:
        """Yield filing descriptors (with `cik` and `url` added) for one company.

        Reads the inline 'recent' block first, then every historical shard whose
        [filingFrom, filingTo] range overlaps [since, until]. The shards are what make the
        backtest era reachable for active filers.
        """
        submissions = self._get(f"{self.BASE_DATA}/submissions/CIK{cik:010d}.json").json()
        filings = submissions.get("filings", {})
        for filing in self._iter_array(filings.get("recent", {}), forms, since, until):
            yield self._with_url(cik, filing)
        for shard in filings.get("files", []):
            # Skip shards that lie entirely after `until` or entirely before `since`.
            outside_window = (until and shard.get("filingFrom", "") > until) or (
                since and shard.get("filingTo", "9999") < since
            )
            if outside_window:
                continue
            shard_block = self._get(f"{self.BASE_DATA}/submissions/{shard['name']}").json()
            for filing in self._iter_array(shard_block, forms, since, until):
                yield self._with_url(cik, filing)

    def _with_url(self, cik: int, f: dict) -> dict:
        """Add `cik` and the archive `url` to `f` in place and return the same dict."""
        folder = f["accession"].replace("-", "")  # archive folder is the accession without dashes
        f["cik"] = cik
        f["url"] = f"{self.BASE_WWW}/Archives/edgar/data/{cik}/{folder}/{f['primary_document']}"
        return f

    def fetch_html(self, filing: dict) -> str:
        """Download the filing's primary document and return the response text."""
        response = self._get(filing["url"])
        return response.text
Filings are text → no transcription; + they go straight to extraction (the extract worker fetches + strips the HTML later). Default + forms cover both domestic (10-K/10-Q) and foreign-private-issuer (20-F/40-F) filers. + Returns (new_documents, new_jobs). Idempotent on (source_id, accession).""" + from ..backfill import queue + + cik = client.cik_for(ticker) + if cik is None: + raise ValueError(f"No CIK found for ticker {ticker!r}") + n_docs = n_jobs = 0 + for f in client.iter_filings(cik, forms=forms, since=since, until=until): + doc_id = f"edgar:{f['accession']}" + cur = conn.execute( + """INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date) + VALUES (?,?,?,?,?,?,?)""", + (doc_id, source_id, "filing", f["accession"], f["url"], + f"{ticker} {f['form']} {f['filing_date']}", f["filing_date"]), + ) + conn.commit() + if not cur.rowcount: + continue + n_docs += 1 + h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest() + # priority 50: filings are high-info-density (§4.1) → ahead of podcasts (100) + if queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h, + parent_doc_id=doc_id, priority=50) is not None: + n_jobs += 1 + return n_docs, n_jobs diff --git a/signal_engine/ingest/feeds.py b/signal_engine/ingest/feeds.py new file mode 100644 index 0000000..1441d91 --- /dev/null +++ b/signal_engine/ingest/feeds.py @@ -0,0 +1,65 @@ +"""Podcast RSS ingestion (§4.1). + +feedparser + conditional GET (ETag/Last-Modified) for efficient incremental polling, with a +composite (feed_url, guid) dedup discipline. Many podcast CDNs send no validators and some feeds +truncate to recent episodes — for the §7.1 backtest, older episodes may need the show's full +archive feed (some hosts expose `?limit=` / a separate archive URL) or a YouTube back-catalog. 
def _enclosure_audio_url(entry: Any) -> str | None:
    """Return the URL of the entry's first usable audio enclosure, or None if it has none.

    `entry["enclosures"]` is searched first (URL under `href`, else `url`). If nothing usable is
    found there, `entry["links"]` items with `rel == "enclosure"` are tried (URL under `href`).
    An item counts as audio when its `type` starts with "audio", compared case-insensitively. An
    audio item that carries no URL is skipped rather than ending the search.
    """
    def _is_audio(item: Any) -> bool:
        # Lower-case before matching so "Audio/MPEG" is accepted as well.
        return str(item.get("type", "")).lower().startswith("audio")

    for enc in entry.get("enclosures", []) or []:
        if not _is_audio(enc):
            continue
        url = enc.get("href") or enc.get("url")
        if url:
            return url
    # some feeds put audio only in links rel=enclosure
    for link in entry.get("links", []) or []:
        if link.get("rel") == "enclosure" and _is_audio(link):
            url = link.get("href")
            if url:
                return url
    return None
Skips entries with no audio enclosure.""" + out: list[dict] = [] + for e in parsed.entries: + audio = _enclosure_audio_url(e) + if not audio: + continue + out.append({ + "guid": _guid(e), + "title": e.get("title"), + "audio_url": audio, + "link": e.get("link"), + "published": _published_iso(e), + }) + return out diff --git a/signal_engine/ingest/gemini_transcribe.py b/signal_engine/ingest/gemini_transcribe.py new file mode 100644 index 0000000..db3e52e --- /dev/null +++ b/signal_engine/ingest/gemini_transcribe.py @@ -0,0 +1,195 @@ +"""One-time backfill path: transcribe podcast episodes via the Gemini multimodal API instead of the +local Spark Parakeet+diarizer pipeline. Used to take a bulk backfill OFF the shared Spark GPU (which +contends with production) — it is NOT the steady-state transcriber (local Parakeet remains the default). + +Scope/guardrail: podcast audio is PUBLIC data, so sending it to the frontier does NOT trip the +exposure/positioning-data rule (that guardrail is about Ten31's conviction/exposure data, never public +audio). Output is written in the SAME 'Speaker: text' transcript format the extractor consumes, so the +downstream extract→embed stages are agnostic to which transcriber produced the file. + +Tradeoff vs local: Gemini yields speaker-LABELED text, not voiceprint fingerprints — so no voiceprint +auto-edges. We rely on the hand-seeded EISC edges + name-based attribution instead (acceptable for a +bounded backfill). +""" +from __future__ import annotations + +import hashlib +import logging +import re +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +from ..backfill import queue +from .download import download_enclosure + +log = logging.getLogger(__name__) + +_PROMPT = ( + "You are a precise podcast transcriptionist. 
def _host_person(source_name: str) -> str:
    """Best-effort extraction of the host's personal name from a source/show name.

    Text inside the first pair of parentheses wins. Otherwise a leading "The " and a trailing
    " Podcast" / " Show" (any case) are stripped from the name. A name matching none of these
    patterns is returned unchanged apart from trimming, and a missing name yields "".

    'What Bitcoin Did (Peter McCormack)' -> 'Peter McCormack'; 'Stephan Livera Podcast' ->
    'Stephan Livera'; 'The Kevin Rooke Show' -> 'Kevin Rooke'; 'The Anita Posch Show' ->
    'Anita Posch'.
    """
    raw = source_name or ""
    parenthesized = re.search(r"\(([^)]+)\)", raw)
    if parenthesized is not None:
        return parenthesized.group(1).strip()
    without_article = re.sub(r"^The\s+", "", raw).strip()
    return re.sub(r"\s+(Podcast|Show)$", "", without_article, flags=re.I).strip()
def _upload_and_wait(client, audio_path: Path, *, poll_s: float = 2.0, timeout_s: float = 300.0):
    """Upload `audio_path` to the Files API and block until the file is ACTIVE.

    The MIME type is sniffed from the file header and passed explicitly on upload. The file state
    is then polled every `poll_s` seconds for as long as it reports PROCESSING, up to `timeout_s`
    seconds of wall-clock time (time spent inside the polling calls counts too).

    Returns:
        The file handle as last returned by the Files API, in state ACTIVE.

    Raises:
        RuntimeError: if the file ends in any state other than ACTIVE, including still
            PROCESSING at the deadline. The uploaded file is deleted (best effort) first,
            because the caller never receives a handle to clean up.
    """
    from google.genai import types
    mime = _sniff_audio_mime(audio_path)
    f = client.files.upload(file=str(audio_path), config=types.UploadFileConfig(mime_type=mime))

    def _state_name(remote) -> str:
        # `state` is normally enum-like with `.name`; fall back to its string form otherwise.
        return getattr(remote.state, "name", str(remote.state))

    deadline = time.monotonic() + timeout_s
    while _state_name(f) == "PROCESSING" and time.monotonic() < deadline:
        time.sleep(poll_s)
        f = client.files.get(name=f.name)
    state = _state_name(f)
    if state != "ACTIVE":
        try:
            client.files.delete(name=f.name)
        except Exception as e:  # noqa: BLE001 — best-effort cleanup
            log.debug("file cleanup failed for %s: %s", f.name, e)
        raise RuntimeError(f"Gemini file not ACTIVE (state={state}) for {audio_path.name}")
    return f
def run_transcribe_gemini(conn, cfg, *, limit: int = 5, concurrency: int = 4,
                          lease_seconds: int = 7200, worker_id: str = "gemini-transcribe",
                          prompt_version: str = "extract-v0") -> dict:
    """Lease pending 'transcribe' jobs and transcribe them via Gemini in parallel.

    Up to `limit` jobs are leased on the calling thread, then `_fetch_and_transcribe` runs for
    each one in a pool of `concurrency` threads. All DB writes (document update, follow-up
    'extract' job, job completion or failure) stay on the calling thread and happen as each
    future finishes. Token usage is summed for cost accounting.

    Args:
        conn: DB connection. Rows are indexed by column name, so presumably a `sqlite3.Row`
            row factory is installed — confirm against the connection setup.
        cfg: config object. `gemini_api_key` and `gemini_model` are read here, and `cfg` is
            passed on to `_fetch_and_transcribe`.
        limit: maximum number of jobs to lease in this run.
        concurrency: worker-thread count; must be >= 1.
        lease_seconds: lease duration handed to `queue.lease_next`.
        worker_id: identity recorded on the leases.
        prompt_version: mixed into the follow-up extract job's input hash. The default
            reproduces the previously hard-coded "extract-v0".

    Returns:
        {"done": int, "failed": int, "prompt_tokens": int, "output_tokens": int}

    Raises:
        RuntimeError: if `cfg.gemini_api_key` is not set.
        ValueError: if `concurrency` is below 1.
    """
    from google import genai
    if not cfg.gemini_api_key:
        raise RuntimeError("GEMINI_API_KEY not configured")
    if concurrency < 1:
        # ThreadPoolExecutor raises ValueError for this too, but only AFTER the jobs below have
        # been leased, which would strand them until the lease expires. Fail before leasing.
        raise ValueError("concurrency must be >= 1")
    client = genai.Client(api_key=cfg.gemini_api_key)
    model = cfg.gemini_model or "gemini-2.5-flash"

    # Lease the batch up front (main thread); resolve docs + host names.
    leased: list[tuple] = []
    while len(leased) < limit:
        job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
        if job is None:
            break
        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
        if doc is None:
            queue.skip(conn, job["job_id"], "document missing")
            continue
        source_row = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
        leased.append((job, doc, _host_person(source_row["name"]) if source_row else ""))

    done = failed = prompt_tok = out_tok = 0
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        futs = {pool.submit(_fetch_and_transcribe, client, model, cfg, doc, host): (job, doc)
                for (job, doc, host) in leased}
        for fut in as_completed(futs):
            job, doc = futs[fut]
            try:
                r = fut.result()  # re-raises whatever the worker thread raised
                conn.execute(
                    "UPDATE documents SET transcript_path=?, content_hash=?, processed_at=datetime('now') "
                    "WHERE doc_id=?", (r["transcript_path"], r["content_hash"], doc["doc_id"]),
                )
                h = hashlib.sha256(f"{doc['doc_id']}|{prompt_version}".encode()).hexdigest()
                queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
                              parent_doc_id=doc["doc_id"], priority=100)
                queue.complete(conn, job["job_id"], output_ref=f"gemini {r['n_lines']} lines")
                conn.commit()
                done += 1
                prompt_tok += r["usage"]["prompt_tokens"]
                out_tok += r["usage"]["output_tokens"]
                fr = r["usage"]["finish_reason"]
                # A MAX_TOKENS finish reason means the output hit the token cap: flag it in the log.
                log.info("gemini transcribed %s (%d lines, %d in/%d out tok%s)", doc["doc_id"],
                         r["n_lines"], r["usage"]["prompt_tokens"], r["usage"]["output_tokens"],
                         ", TRUNCATED" if "MAX_TOKENS" in fr else "")
            except Exception as e:  # noqa: BLE001
                state = queue.fail(conn, job["job_id"], e)
                conn.commit()
                failed += 1
                log.warning("gemini transcribe failed for %s: %s (→ %s)", doc["doc_id"], e, state)
    return {"done": done, "failed": failed, "prompt_tokens": prompt_tok, "output_tokens": out_tok}
a/signal_engine/ingest/identify.py b/signal_engine/ingest/identify.py new file mode 100644 index 0000000..15164f6 --- /dev/null +++ b/signal_engine/ingest/identify.py @@ -0,0 +1,45 @@ +"""Speaker-name identification (§4.5 enhancement). + +In a 1-on-1 interview the host introduces the guest by name at the top. Reading the transcript head +with the LLM, we attach a real NAME to each diarized speaker → voiceprints.person_label. This gives +the independence graph a SECOND, orthogonal overlap signal: the same NAMED guest across two shows is +a shared_guest edge even when the voiceprints don't cluster (different mic/codec/room). It complements +voiceprint cosine matching and is robust to fingerprint drift — exactly the case the operator flagged. +""" +from __future__ import annotations + +import json +import logging + +log = logging.getLogger(__name__) + +_SYS = ( + 'You identify the speakers in a podcast/interview transcript. Each line is "LABEL: text". ' + "Using the introduction and context, determine each LABEL's real full name and role. In an " + "interview the host normally introduces themselves and the guest within the first minute. Only " + "assert a name you can actually support from the text — if you cannot tell, use null. " + 'Return ONLY JSON: {"speakers": {"