Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
@@ -0,0 +1,4 @@
+"""Persistence layer: SQLite (metadata, ledger, conviction log, graph, queue).
+
+Qdrant (vectors) is reached via the Spark Control gateway; see signal_engine.spark.
+"""
@@ -0,0 +1,81 @@
+"""SQLite connection + schema initialization. Boring and inspectable (§5)."""
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+
+# schema.sql is resolved as a sibling of this module; init_db() executes it.
+SCHEMA_FILE = Path(__file__).with_name("schema.sql")
+
+
+def connect(db_path: Path) -> sqlite3.Connection:
+    """Open the SQLite database at ``db_path``, creating missing parent directories first.
+
+    The returned connection yields ``sqlite3.Row`` rows, has foreign-key enforcement
+    switched on, and waits up to 30 s on a locked database instead of failing at once.
+    """
+    target = Path(db_path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    connection = sqlite3.connect(str(target), timeout=30)
+    connection.row_factory = sqlite3.Row
+    for pragma in (
+        "PRAGMA foreign_keys = ON",
+        "PRAGMA busy_timeout = 30000",  # wait, don't fail, under concurrent backfill writers
+    ):
+        connection.execute(pragma)
+    return connection
+
+
+# Additive migrations: columns that CREATE ... IF NOT EXISTS cannot add to a table that already
+# exists. Maps table -> {column: declared type}; _migrate() adds whichever columns are missing.
+_MIGRATIONS = {
+    "documents": {
+        "content_hash": "TEXT",
+        "processed_at": "TEXT",
+        "dedup_key": "TEXT",
+    },
+    # DESIGN_v2.1 condition 1: own_network = the Ten31 orbit (Odell/Bent partners etc.) — listening to
+    # ourselves. Quarantined: a TEST FIXTURE for the reflexivity case, DROPPED in live EISC scoring.
+    "sources": {
+        "backtest_2022_2023": "TEXT",
+        "own_network": "INTEGER",
+    },
+    # DESIGN_v2.1: tag derivatives by distance-from-edge for TRIAGE — surfaced, NEVER used as a filter
+    # (an engine that pre-filters to in-mandate reproduces the AI/compute mandate-expansion miss).
+    "fanout_nodes": {
+        "distance_from_edge": "TEXT",
+    },
+}
+
+
+def _widen_cluster_check(conn: sqlite3.Connection) -> None:
+    """Add 'banks'/'credit'/'fintech' to sources.source_cluster's CHECK on databases that predate them.
+
+    SQLite can't ALTER a CHECK, so the (tiny) table is rebuilt via the standard table-swap: create
+    ``sources_new`` from the stored DDL with the widened list, copy every row, drop ``sources``, rename.
+    foreign_keys is switched OFF around the swap (DROP would otherwise fail on inbound FKs) and back ON
+    afterwards; rows are copied by value so referential integrity holds. busy_timeout (set in connect)
+    lets it wait out concurrent backfill writers.
+
+    No-op when the table is missing, already lists 'banks', or its stored DDL has no source_cluster
+    CHECK list to rewrite. The swap runs in ONE transaction: any failure rolls back and re-raises,
+    leaving the original table in place, and a ``sources_new`` left behind by an earlier aborted
+    run is discarded before the rebuild.
+    """
+    import re
+    row = conn.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name='sources'").fetchone()
+    if not row or "'banks'" in row[0]:
+        return
+    new_list = ("('macro','ai_tech','energy','bitcoin','vc_consensus','generalist',"
+                "'banks','credit','fintech')")
+    new_ddl, n_check = re.subn(
+        r"source_cluster IN\s*\([^)]*\)", f"source_cluster IN {new_list}", row[0], count=1)
+    # SQLite may store the table name double-quoted (e.g. after a rename), so accept both spellings.
+    new_ddl, n_name = re.subn(
+        r'CREATE TABLE\s+"?sources"?(?!\w)', "CREATE TABLE sources_new", new_ddl, count=1)
+    if not (n_check and n_name):
+        return                                 # nothing to widen: a swap would be pointless or fail
+    conn.commit()                              # close any implicit txn before toggling FK pragma
+    conn.execute("PRAGMA foreign_keys=OFF")
+    try:
+        # Python's sqlite3 does not open an implicit transaction for DDL, so begin one explicitly:
+        # CREATE/INSERT/DROP/RENAME then commit or roll back together.
+        if not conn.in_transaction:
+            conn.execute("BEGIN IMMEDIATE")
+        conn.execute("DROP TABLE IF EXISTS sources_new")
+        conn.execute(new_ddl)
+        conn.execute("INSERT INTO sources_new SELECT * FROM sources")
+        conn.execute("DROP TABLE sources")
+        conn.execute("ALTER TABLE sources_new RENAME TO sources")
+        conn.commit()
+    except Exception:
+        conn.rollback()
+        raise
+    finally:
+        conn.execute("PRAGMA foreign_keys=ON")
+
+
+def _migrate(conn: sqlite3.Connection) -> None:
+    """Bring an existing database up to the current schema.
+
+    Adds every column listed in _MIGRATIONS that its table lacks, creates the indexes on the
+    migrated ``documents`` columns, commits, then widens the sources.source_cluster CHECK.
+    """
+    for table, wanted in _MIGRATIONS.items():
+        present = {info[1] for info in conn.execute(f"PRAGMA table_info({table})")}
+        missing = [(name, decl) for name, decl in wanted.items() if name not in present]
+        for name, decl in missing:
+            conn.execute(f"ALTER TABLE {table} ADD COLUMN {name} {decl}")
+    # indexes on migrated columns (created here so they work on DBs predating the column)
+    for index_sql in (
+        "CREATE INDEX IF NOT EXISTS idx_documents_content_hash ON documents(content_hash)",
+        "CREATE INDEX IF NOT EXISTS idx_documents_dedup_key ON documents(dedup_key)",
+    ):
+        conn.execute(index_sql)
+    conn.commit()
+    _widen_cluster_check(conn)
+
+
+def init_db(conn: sqlite3.Connection) -> None:
+    """Idempotent: CREATE ... IF NOT EXISTS + additive column migrations.
+
+    Executes schema.sql on ``conn``, commits, then runs _migrate() so databases created by an
+    older schema gain the newer columns and indexes.
+    """
+    # schema.sql contains non-ASCII characters (§, —, →): decode it as UTF-8 explicitly rather
+    # than with the platform's locale encoding, which may fail or mis-decode.
+    conn.executescript(SCHEMA_FILE.read_text(encoding="utf-8"))
+    conn.commit()
+    _migrate(conn)
+
+
+def table_names(conn: sqlite3.Connection) -> list[str]:
+    """Return the names of every table and view in the database, ordered by name."""
+    query = "SELECT name FROM sqlite_master WHERE type IN ('table','view') ORDER BY name"
+    return [name for (name,) in conn.execute(query).fetchall()]
@@ -0,0 +1,280 @@
+-- Ten31 Signal Engine — SQLite schema (pilot)
+-- Source of truth: ten31-signal-engine-handoff.md  §4 (pipeline layers), §6.7 (ledger),
+--   §3.1 (conviction log), §13.4 (backfill queue).
+-- Design principle (§5, §10): boring, inspectable tables. The whole system state is a SELECT away.
+
+-- Connection/database setup; runs every time db.init_db() executes this script.
+PRAGMA journal_mode = WAL;
+PRAGMA foreign_keys = ON;  -- db.connect() issues the same pragma on every connection it opens
+
+-- ============================================================================
+-- CANONICAL TOPIC VOCABULARY (§4.2) — HYBRID (operator decision):
+--   seeded controlled list + emergent topics merged in on a schedule.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS topics (
+  topic_canonical TEXT PRIMARY KEY,
+  -- rows inserted without a status start out as 'emergent'
+  status          TEXT CHECK (status IN ('controlled','emergent','merged')) DEFAULT 'emergent',
+  -- self-reference to another topics row; presumably set when status = 'merged' (confirm with the merge job)
+  merged_into     TEXT REFERENCES topics(topic_canonical),
+  seam            TEXT,
+  created_at      TEXT DEFAULT (datetime('now'))
+);
+
+-- ============================================================================
+-- SOURCES & DOCUMENTS (§4.1)
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS sources (
+  source_id          TEXT PRIMARY KEY,
+  name               TEXT NOT NULL,
+  kind               TEXT NOT NULL CHECK (kind IN ('podcast','youtube','filing','earnings_call')),
+  -- db._widen_cluster_check rewrites this cluster list in place on older databases;
+  --   keep the two lists in sync when adding a cluster.
+  source_cluster     TEXT CHECK (source_cluster IN
+                       ('macro','ai_tech','energy','bitcoin','vc_consensus','generalist','banks','credit','fintech')),
+  role               TEXT CHECK (role IN ('CB','IND','DX','none')) DEFAULT 'none',  -- §7.4
+  rss_url            TEXT,
+  channel_url        TEXT,
+  ticker             TEXT,
+  -- §8 credibility: neutral prior that DECAYS in favor of earned track record from the ledger.
+  bootstrap_prior    REAL DEFAULT 1.0,
+  earned_credibility REAL,
+  cluster_capped_low INTEGER DEFAULT 0,   -- §4.5 bitcoin cluster deliberately under-weighted
+  backtest_2022_2023 TEXT,                -- §7.1 reach: rss_full | rss_2023_only | youtube_only | launched_later | unavailable
+  notes              TEXT,
+  created_at         TEXT DEFAULT (datetime('now')),
+  -- DESIGN_v2.1 condition 1: 1 = the Ten31 orbit (listening to ourselves); the source loader writes 0/1.
+  --   Declared here so a database built from this file alone accepts the source loader's INSERT;
+  --   kept LAST to match the column order db._migrate produces (ALTER TABLE ADD COLUMN appends).
+  own_network        INTEGER
+);
+
+-- One row per ingested item (episode / video / filing / call transcript) belonging to a source.
+CREATE TABLE IF NOT EXISTS documents (
+  doc_id          TEXT PRIMARY KEY,
+  source_id       TEXT NOT NULL REFERENCES sources(source_id),
+  kind            TEXT NOT NULL,        -- podcast|youtube|filing|earnings_call (no CHECK here, unlike sources.kind)
+  external_id     TEXT,                 -- rss guid / yt video id / EDGAR accession / transcript id
+  url             TEXT,
+  title           TEXT,
+  date            TEXT,                 -- ISO publication/filing date
+  duration_sec    REAL,
+  raw_path        TEXT,                 -- downloaded audio / raw filing
+  transcript_path TEXT,
+  -- DEDUP MODEL (layered):
+  --   (1) UNIQUE(source_id, external_id) below = the ROBUST guard. external_id is the stable item id
+  --       (RSS GUID / YouTube video id / EDGAR accession). Checked at ingest, BEFORE any GPU work.
+  --   (2) dedup_key = normalized title+date → catches the SAME episode arriving via a different
+  --       feed/mirror (different external_id). Computed pre-transcription. NOT from the transcript.
+  --   content_hash is ONLY an audit fingerprint of the transcript (did a re-run change?) — it is NOT
+  --       a dedup key (ASR is non-deterministic, so one differing word flips the hash).
+  dedup_key       TEXT,
+  content_hash    TEXT,
+  processed_at    TEXT,                 -- set when transcription/extraction completes
+  ingested_at     TEXT DEFAULT (datetime('now')),
+  UNIQUE (source_id, external_id)       -- idempotent ingest (§13.4 dedup)
+);
+-- indexes for dedup_key / content_hash are created in db._migrate (after columns exist on older DBs).
+
+-- ============================================================================
+-- CLAIMS / PROPOSITIONS (§4.2) — the atomic unit of the whole system.
+-- One passage emits 0..N claims; MOST of a podcast hour is 0 (§4.2). The
+-- extractor must be willing to find nothing.
+-- NOTE: thesis_seam is a TAG, never a hard filter (§5.7) — off-thesis &
+--   anti-thesis claims MUST survive.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS claims (
+  claim_id            TEXT PRIMARY KEY,
+  doc_id              TEXT NOT NULL REFERENCES documents(doc_id),
+  source_id           TEXT NOT NULL REFERENCES sources(source_id),  -- stored on the claim as well as on its document
+  proposition         TEXT NOT NULL,    -- normalized subject-assertion-object
+  topic_canonical     TEXT REFERENCES topics(topic_canonical),
+  topic_raw           TEXT,
+  claimant            TEXT,
+  source_cluster      TEXT,             -- free text here: no CHECK, unlike sources.source_cluster
+  date                TEXT,
+  claim_type          TEXT CHECK (claim_type IN ('interpretive','predictive','descriptive','reactive')),
+  time_horizon        TEXT CHECK (time_horizon IN ('near','medium','long','unspecified')),
+  confidence          TEXT CHECK (confidence IN ('low','med','high')),
+  -- §4.2 relation: stance is EXTRACTED, never inferred from vector distance (§2.2/§5.3).
+  rel_target_claim_id TEXT REFERENCES claims(claim_id),   -- self-reference: the claim this one relates to
+  rel_polarity        TEXT CHECK (rel_polarity IN ('affirms','denies','qualifies','none')) DEFAULT 'none',
+  engages_consensus   INTEGER DEFAULT 0,
+  counters_position   TEXT,
+  thesis_seam         TEXT CHECK (thesis_seam IN
+                        ('energy_compute','debasement_bitcoin','ai_data_ownership','none')) DEFAULT 'none',
+  salience            TEXT CHECK (salience IN ('central','secondary','aside')) DEFAULT 'secondary',
+  qdrant_point_id     TEXT,             -- link to the embedded proposition vector (§4.3)
+  extracted_at        TEXT DEFAULT (datetime('now'))
+);
+-- Lookup indexes on the columns claims are sliced by: topic, date, seam tag, claim type.
+CREATE INDEX IF NOT EXISTS idx_claims_topic ON claims(topic_canonical);
+CREATE INDEX IF NOT EXISTS idx_claims_date  ON claims(date);
+CREATE INDEX IF NOT EXISTS idx_claims_seam  ON claims(thesis_seam);
+CREATE INDEX IF NOT EXISTS idx_claims_type  ON claims(claim_type);
+
+-- ============================================================================
+-- SOURCE-INDEPENDENCE GRAPH (§4.5) — discount convergence by connectedness.
+-- Cross-cluster convergence = gold; within-cluster = near-noise.
+-- ============================================================================
+-- One row per (source pair, edge type). The seed loader stores the pair in sorted order
+-- (src_a <= src_b) so the same pair always lands on the same primary key.
+CREATE TABLE IF NOT EXISTS source_edges (
+  src_a      TEXT NOT NULL REFERENCES sources(source_id),
+  src_b      TEXT NOT NULL REFERENCES sources(source_id),
+  edge_type  TEXT NOT NULL CHECK (edge_type IN ('shared_guest','citation','community')),
+  weight     REAL DEFAULT 1.0,
+  evidence   TEXT,        -- voiceprint_id / show-note ref / url
+  updated_at TEXT DEFAULT (datetime('now')),
+  PRIMARY KEY (src_a, src_b, edge_type)
+);
+
+-- ============================================================================
+-- VOICEPRINT LIBRARY (§4.5, §4.1) — same-guest-across-shows BY VOICE.
+-- 192-dim TitaNet voiceprints; cosine ~0.7 distance threshold for same speaker.
+-- This is the highest-leverage automated input to the independence graph.
+-- ============================================================================
+-- One row per distinct voiceprint; the individual sightings live in voiceprint_observations.
+CREATE TABLE IF NOT EXISTS voiceprints (
+  voiceprint_id TEXT PRIMARY KEY,
+  vector        BLOB NOT NULL,         -- 192 x float32
+  person_label  TEXT,                  -- resolved name if known
+  first_doc_id  TEXT REFERENCES documents(doc_id),   -- presumably the document where the voice was first seen (confirm)
+  first_seen    TEXT DEFAULT (datetime('now'))
+);
+-- One row per sighting of a voiceprint inside a document.
+CREATE TABLE IF NOT EXISTS voiceprint_observations (
+  obs_id        INTEGER PRIMARY KEY AUTOINCREMENT,
+  voiceprint_id TEXT NOT NULL REFERENCES voiceprints(voiceprint_id),
+  doc_id        TEXT NOT NULL REFERENCES documents(doc_id),
+  chunk_idx     INTEGER,
+  segment_start REAL,   -- presumably seconds into the audio; unit not stated here (confirm)
+  segment_end   REAL
+);
+
+-- ============================================================================
+-- CONVICTION LOG (§3.1) — human-owned seed nodes for Job B.
+-- Structural rule (§3.1): separate the TRACKABLE thematic proposition (corpus
+--   can corroborate) from TEAM conviction (context only). The engine must NEVER
+--   present theme corroboration as validation of the team bet beneath it.
+-- Exposure scored as coarse NAV bands (operator decision): none | lt2 | 2to10 | gt10 | unset.
+-- ============================================================================
+-- Rows are upserted from the YAML seed by load_convictions(), which refreshes updated_at each time.
+CREATE TABLE IF NOT EXISTS conviction_log (
+  conviction_id        TEXT PRIMARY KEY,            -- R1, E1, A1, B1 ...
+  seam                 TEXT,                        -- root|energy_compute|debasement_bitcoin|ai_data_ownership
+  thematic_proposition TEXT NOT NULL,               -- the TRACKABLE half
+  team_conviction_note TEXT,                         -- context ONLY, never scored as theme validation
+  conviction_level     TEXT CHECK (conviction_level IN ('low','med','med-high','high')),
+  current_exposure     TEXT CHECK (current_exposure IN ('none','lt2','2to10','gt10','unset')) DEFAULT 'unset',
+  exposure_note        TEXT,                         -- original §3.1 prose ("pervasive", "MED-HIGH") pending NAV-band finalization
+  disconfirming_signal TEXT,
+  is_thesis_breaker    INTEGER DEFAULT 0,            -- §3.1 B1-B3: engine must surface these AGAINST the thesis (§5.7)
+  updated_at           TEXT DEFAULT (datetime('now'))
+);
+
+-- Conviction fan-out tree (§4.6). A derivative is a HYPOTHESIS until independent
+-- corpus corroboration AND the exposure gap both clear the bar — then 'signal'.
+CREATE TABLE IF NOT EXISTS fanout_nodes (
+  node_id                TEXT PRIMARY KEY,
+  parent_conviction_id   TEXT REFERENCES conviction_log(conviction_id),
+  parent_node_id         TEXT REFERENCES fanout_nodes(node_id),   -- self-reference to another node
+  derivative_proposition TEXT NOT NULL,
+  depth                  INTEGER DEFAULT 1,
+  status                 TEXT CHECK (status IN ('hypothesis','corroborated','signal')) DEFAULT 'hypothesis',
+  created_at             TEXT DEFAULT (datetime('now')),
+  -- DESIGN_v2.1: distance-from-edge tag for TRIAGE — surfaced, NEVER used as a filter.
+  --   Declared here so a database built from this file alone accepts load_fanout()'s INSERT;
+  --   kept LAST to match the column order db._migrate produces (ALTER TABLE ADD COLUMN appends).
+  distance_from_edge     TEXT
+);
+
+-- ============================================================================
+-- DUAL-EVALUATION LEDGER (§4.7, §6) — START DAY ONE; the clock can't be backfilled.
+-- Log EVERY candidate that clears the quantitative bar (§6.6 — you need a denominator).
+-- ============================================================================
+-- One row per logged candidate signal; human ratings live in human_evaluations, keyed by signal_id.
+CREATE TABLE IF NOT EXISTS ledger (
+  signal_id            TEXT PRIMARY KEY,
+  type                 TEXT NOT NULL CHECK (type IN ('theme','event','under_acted_conviction')),
+  proposition          TEXT NOT NULL,
+  date_logged          TEXT NOT NULL DEFAULT (datetime('now')),
+  discourse_metric     TEXT,           -- JSON: acceleration, cross-cluster source set, independence-discounted count
+  external_check       TEXT,           -- JSON: resolution spec / nested clean events the model proposed (§6.5)
+  resolution_date      TEXT,
+  -- both outcome columns are nullable: NULL until an outcome has been recorded
+  discourse_outcome    TEXT CHECK (discourse_outcome IN
+                         ('up_cross_cluster','up_single_cluster','flat','down')),
+  external_outcome     TEXT CHECK (external_outcome IN
+                         ('correct','partial','wrong','unresolved_expired','too_early')),
+  lead_time_days       INTEGER,        -- §6.3 THE alpha measurement (to the DERIVATIVE node for Job B)
+  model_confidence     REAL,           -- §6.7 logged ONLY to measure its uselessness — NEVER fed into scoring
+  origin_conviction_id TEXT REFERENCES conviction_log(conviction_id),  -- Job B traceability
+  origin_node_id       TEXT REFERENCES fanout_nodes(node_id)
+);
+CREATE INDEX IF NOT EXISTS idx_ledger_type   ON ledger(type);
+CREATE INDEX IF NOT EXISTS idx_ledger_logged ON ledger(date_logged);
+
+-- Human eval on a SEPARATE write path (§6.7): "keep them in separate columns and do not let the
+-- model see Grant's rating before it logs its prediction." The model-facing code reads `ledger`;
+-- ONLY the eval UI writes here. A separate table makes that separation structural, not a convention.
+CREATE TABLE IF NOT EXISTS human_evaluations (
+  signal_id    TEXT PRIMARY KEY REFERENCES ledger(signal_id),  -- PK: at most one evaluation per ledger row
+  grant_rating INTEGER,               -- "non-obvious and relevant to me?" (e.g. 1-5)
+  non_obvious  INTEGER,               -- 0/1
+  notes        TEXT,
+  rated_at     TEXT DEFAULT (datetime('now'))
+);
+
+-- Reporting view — the valuable cell is DISAGREEMENT (§6.7). Used for analysis, NOT by the model path.
+-- LEFT JOIN: every ledger row appears; the four human columns are NULL until a rating exists.
+CREATE VIEW IF NOT EXISTS v_ledger_eval AS
+  SELECT l.*, h.grant_rating, h.non_obvious, h.notes AS grant_notes, h.rated_at
+  FROM ledger l LEFT JOIN human_evaluations h ON h.signal_id = l.signal_id;
+
+-- ============================================================================
+-- BACKFILL QUEUE (§13.4) — client-side, measured in GPU-HOURS.
+-- Extraction (one LLM pass per chunk over the whole corpus) is the HEAVIER serial load.
+-- Audio is SEQUENTIAL (parallel → 503). Leases give crash-safe resumability.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS backfill_jobs (
+  job_id           INTEGER PRIMARY KEY AUTOINCREMENT,
+  job_type         TEXT NOT NULL CHECK (job_type IN ('transcribe','diarize','extract','embed')),
+  target_id        TEXT NOT NULL,        -- doc_id or chunk id
+  parent_doc_id    TEXT,                 -- plain TEXT: no foreign key to documents is declared
+  state            TEXT NOT NULL CHECK (state IN
+                     ('pending','leased','running','done','failed','skipped')) DEFAULT 'pending',
+  priority         INTEGER DEFAULT 100,  -- lower = sooner (backtest corpus jumps the queue, §7.1)
+  attempts         INTEGER DEFAULT 0,
+  max_attempts     INTEGER DEFAULT 5,
+  lease_owner      TEXT,                 -- presumably the worker holding the lease (confirm against the worker code)
+  lease_expires_at TEXT,
+  input_hash       TEXT NOT NULL,        -- hash(content + model/prompt version) — idempotency
+  output_ref       TEXT,
+  gpu_seconds      REAL,                 -- measured per job → self-calibrating GPU-hours estimate
+  error            TEXT,
+  created_at       TEXT DEFAULT (datetime('now')),
+  updated_at       TEXT DEFAULT (datetime('now')),
+  UNIQUE (job_type, input_hash)          -- the same input can be queued only once per job type
+);
+-- Index whose columns are (state, priority, job_id), in that order.
+CREATE INDEX IF NOT EXISTS idx_jobs_state_priority ON backfill_jobs(state, priority, job_id);
+
+-- ============================================================================
+-- SCORING BRAIN state (the "brain", build blueprint). Candidate state lands here +
+-- ledger + fanout_nodes.status; existing tables unchanged.
+-- ============================================================================
+
+-- Temporal layer: one row per (topic, as_of, window). 28d non-overlapping windows.
+CREATE TABLE IF NOT EXISTS topic_window_stats (
+  topic_canonical TEXT NOT NULL,             -- plain TEXT: no foreign key to topics is declared
+  as_of           TEXT NOT NULL,
+  window_idx      INTEGER NOT NULL,          -- 0 = window ending at as_of, 1 = prior, 2 = baseline
+  window_start    TEXT NOT NULL,
+  window_end      TEXT NOT NULL,
+  -- counts below: presumably interpretive+predictive claims, descriptive+reactive claims,
+  -- distinct sources and distinct clusters in the window (names suggest it; confirm with the scorer)
+  n_interp_pred   INTEGER NOT NULL DEFAULT 0,
+  n_descr_react   INTEGER NOT NULL DEFAULT 0,
+  n_distinct_src  INTEGER NOT NULL DEFAULT 0,
+  n_distinct_clu  INTEGER NOT NULL DEFAULT 0,
+  PRIMARY KEY (topic_canonical, as_of, window_idx)
+);
+
+-- Audit trail: one row per (scorer, key, as_of). Deterministic score_id → re-run reproduces.
+CREATE TABLE IF NOT EXISTS candidate_scores (
+  score_id        TEXT PRIMARY KEY,
+  scorer          TEXT NOT NULL,             -- emergence|contrarian|intersection|convergence|under_acted
+  as_of           TEXT NOT NULL,
+  -- what was scored: all three keys are nullable plain TEXT (no foreign keys declared)
+  topic_canonical TEXT,
+  node_id         TEXT,
+  conviction_id   TEXT,
+  score           REAL NOT NULL,
+  cleared_evidence_bar  INTEGER NOT NULL DEFAULT 0,   -- tier 1: logged to ledger (the denominator)
+  cleared_promotion_bar INTEGER NOT NULL DEFAULT 0,   -- tier 2: sent to frontier judge
+  inputs_json     TEXT NOT NULL,             -- every term that produced the score (full audit)
+  computed_at     TEXT DEFAULT (datetime('now'))
+);
+-- Index whose columns are (scorer, as_of, cleared_promotion_bar), in that order.
+CREATE INDEX IF NOT EXISTS idx_cs_asof ON candidate_scores(scorer, as_of, cleared_promotion_bar);
+
+-- Tunable bar config so the backtest can sweep thresholds without code edits.
+CREATE TABLE IF NOT EXISTS score_thresholds (
+  scorer     TEXT PRIMARY KEY,   -- one row per scorer
+  min_score  REAL,
+  gates_json TEXT,               -- presumably a JSON document of extra gate settings (confirm with the scorer)
+  version    TEXT
+);
@@ -0,0 +1,74 @@
+"""Load human-owned seed data (conviction log, §3.1) into SQLite.
+
+The conviction log is the highest-leverage Job B input (§3.1) and is HUMAN-OWNED:
+Grant edits the YAML seed files; this loader upserts them. Re-running is idempotent.
+"""
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+# conviction_log columns written by load_convictions(), in INSERT order; each name doubles as the
+# named placeholder bound from _row()'s dict.
+_CONVICTION_COLS = (
+    "conviction_id", "seam", "thematic_proposition", "team_conviction_note", "conviction_level",
+    "current_exposure", "exposure_note", "disconfirming_signal", "is_thesis_breaker",
+)
+
+
+def _row(c: dict[str, Any]) -> dict[str, Any]:
+    """Translate one YAML conviction entry into a conviction_log parameter dict.
+
+    ``id`` and ``thematic_proposition`` are required (KeyError if absent). Other keys default to
+    None, except ``current_exposure`` (defaults to "unset") and ``is_thesis_breaker`` (coerced to 0/1).
+    """
+    optional = ("seam", "team_conviction_note", "conviction_level", "exposure_note", "disconfirming_signal")
+    row: dict[str, Any] = {
+        "conviction_id": c["id"],
+        "thematic_proposition": c["thematic_proposition"],
+    }
+    row.update((key, c.get(key)) for key in optional)
+    row["current_exposure"] = c.get("current_exposure", "unset")
+    row["is_thesis_breaker"] = int(bool(c.get("is_thesis_breaker")))
+    return row
+
+
+def load_fanout(conn: sqlite3.Connection, path: Path) -> int:
+    """Load a hand-written fan-out tree (§7.1 backtest). Idempotent on node_id.
+
+    ``path`` is a YAML file with a top-level ``parent_conviction_id`` and a ``nodes`` list; each
+    node needs ``node_id`` and ``derivative_proposition`` and may carry ``depth`` (default 1) and
+    ``distance_from_edge``. New nodes are inserted as 'hypothesis'. For an existing node_id the
+    YAML-owned fields (proposition, parent, depth, distance_from_edge) are refreshed while
+    ``status`` is left untouched.
+
+    Returns the number of nodes read. Raises KeyError when ``parent_conviction_id`` or a node's
+    required key is missing.
+    """
+    # Decode explicitly as UTF-8 rather than with the platform's locale encoding.
+    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
+    parent = data["parent_conviction_id"]
+    nodes = data.get("nodes") or []            # tolerate an empty "nodes:" key
+    for n in nodes:
+        conn.execute(
+            """INSERT INTO fanout_nodes
+                 (node_id, parent_conviction_id, derivative_proposition, depth, status, distance_from_edge)
+               VALUES (?,?,?,?, 'hypothesis', ?)
+               ON CONFLICT(node_id) DO UPDATE SET derivative_proposition=excluded.derivative_proposition,
+                 parent_conviction_id=excluded.parent_conviction_id,
+                 depth=excluded.depth,
+                 distance_from_edge=excluded.distance_from_edge""",
+            (n["node_id"], parent, n["derivative_proposition"], n.get("depth", 1), n.get("distance_from_edge")),
+        )
+    conn.commit()
+    return len(nodes)
+
+
+def load_convictions(conn: sqlite3.Connection, path: Path) -> int:
+    """Upsert the conviction log from a YAML seed file; return how many entries were read.
+
+    ``path`` is a YAML file with a top-level ``convictions`` list. Each entry is mapped by _row()
+    and inserted, or, when its conviction_id already exists, every other seeded column is
+    overwritten and ``updated_at`` refreshed. Re-running with the same file is idempotent.
+
+    Raises KeyError if an entry lacks ``id`` or ``thematic_proposition``; all entries are mapped
+    before the first write, so a malformed entry leaves the table untouched.
+    """
+    # Decode explicitly as UTF-8 rather than with the platform's locale encoding.
+    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
+    rows = data.get("convictions") or []       # tolerate an empty "convictions:" key
+    cols = ", ".join(_CONVICTION_COLS)
+    placeholders = ", ".join(f":{c}" for c in _CONVICTION_COLS)
+    updates = ", ".join(f"{c}=excluded.{c}" for c in _CONVICTION_COLS if c != "conviction_id")
+    sql = (
+        f"INSERT INTO conviction_log ({cols}, updated_at) "
+        f"VALUES ({placeholders}, datetime('now')) "
+        f"ON CONFLICT(conviction_id) DO UPDATE SET {updates}, updated_at=datetime('now')"
+    )
+    conn.executemany(sql, [_row(c) for c in rows])
+    conn.commit()
+    return len(rows)
@@ -0,0 +1,90 @@
+"""Load the source registry (companies + podcasts, §7.3/§7.4) into SQLite. Idempotent upsert."""
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+# sources columns written by load_sources(), in INSERT order; each name doubles as the named
+# placeholder bound from _row()'s dict.
+_COLS = (
+    "source_id",
+    "name",
+    "kind",
+    "source_cluster",
+    "role",
+    "rss_url",
+    "channel_url",
+    "ticker",
+    "cluster_capped_low",
+    "own_network",
+    "backtest_2022_2023",
+    "notes",
+)
+
+
+def _row(s: dict[str, Any]) -> dict[str, Any]:
+    """Translate one YAML source entry into a ``sources`` parameter dict.
+
+    ``id``, ``name`` and ``kind`` are required (KeyError if absent). The YAML key ``cluster``
+    feeds ``source_cluster``, ``role`` defaults to "none", the two flags are coerced to 0/1, and
+    every other optional key defaults to None.
+    """
+    row: dict[str, Any] = {"source_id": s["id"], "name": s["name"], "kind": s["kind"]}
+    row["source_cluster"] = s.get("cluster")
+    row["role"] = s.get("role", "none")
+    for key in ("rss_url", "channel_url", "ticker", "backtest_2022_2023", "notes"):
+        row[key] = s.get(key)
+    for flag in ("cluster_capped_low", "own_network"):
+        row[flag] = int(bool(s.get(flag)))
+    return row
+
+
+def update_feeds(conn: sqlite3.Connection, path: Path) -> int:
+    """Apply resolved/verified podcast feed URLs + backtest-reach to existing source rows.
+
+    ``path`` is a YAML file with a top-level ``feeds`` list; each entry needs ``id`` and may carry
+    ``rss_url``, ``youtube_channel_url``, ``backtest_2022_2023`` and ``note``. For the ``sources``
+    row whose source_id equals ``id``, rss_url / channel_url / backtest_2022_2023 are overwritten
+    (keys absent from the entry become NULL) and ``notes`` is replaced only when the entry has a
+    ``note``. Entries matching no row change nothing.
+
+    Returns the number of feed entries read (not the number of rows changed). Raises KeyError if
+    an entry lacks ``id``.
+    """
+    # Probe the column list instead of issuing a blind ALTER and swallowing OperationalError,
+    # which would also hide real failures such as a lock timeout. An empty probe means the
+    # table does not exist; that case is left for the UPDATE below to report.
+    existing = {info[1] for info in conn.execute("PRAGMA table_info(sources)")}
+    if existing and "backtest_2022_2023" not in existing:
+        conn.execute("ALTER TABLE sources ADD COLUMN backtest_2022_2023 TEXT")
+        conn.commit()
+    # Decode explicitly as UTF-8 rather than with the platform's locale encoding.
+    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
+    rows = data.get("feeds") or []             # tolerate an empty "feeds:" key
+    for f in rows:
+        conn.execute(
+            """UPDATE sources
+                 SET rss_url=:rss_url, channel_url=:youtube_channel_url,
+                     backtest_2022_2023=:backtest_2022_2023, notes=COALESCE(:note, notes)
+               WHERE source_id=:id""",
+            {
+                "id": f["id"], "rss_url": f.get("rss_url"),
+                "youtube_channel_url": f.get("youtube_channel_url"),
+                "backtest_2022_2023": f.get("backtest_2022_2023"), "note": f.get("note"),
+            },
+        )
+    conn.commit()
+    return len(rows)
+
+
+def load_source_edges(conn: sqlite3.Connection, path: Path) -> int:
+    """Seed EISC connectedness edges (priors) idempotently. Stores src_a,src_b in sorted order to
+    match the transcribe_worker's convention (sorted([a,b]) + ON CONFLICT weight+=1) so real detections
+    accumulate on the same PK instead of creating a reversed duplicate. DO NOTHING on conflict → a
+    re-run won't inflate, and won't clobber a stronger auto-detected weight.
+
+    ``path`` is a YAML file with a top-level ``edges`` list; each entry needs ``a``, ``b`` and
+    ``type`` and may carry ``weight`` (default 1.0) and ``evidence``.
+
+    Returns the number of edges actually inserted (existing edges are not counted). Raises
+    KeyError if an entry lacks ``a``, ``b`` or ``type``.
+    """
+    # Decode explicitly as UTF-8 rather than with the platform's locale encoding.
+    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
+    rows = data.get("edges") or []             # tolerate an empty "edges:" key
+    applied = 0
+    for e in rows:
+        a, b = sorted([e["a"], e["b"]])
+        cur = conn.execute(
+            """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
+               VALUES (?,?,?,?,?)
+               ON CONFLICT(src_a, src_b, edge_type) DO NOTHING""",
+            (a, b, e["type"], float(e.get("weight", 1.0)), e.get("evidence")),
+        )
+        applied += cur.rowcount                # 0 when the edge already existed
+    conn.commit()
+    return applied
+
+
+def load_sources(conn: sqlite3.Connection, path: Path) -> int:
+    """Upsert the source registry from a YAML seed file; return how many entries were read.
+
+    ``path`` is a YAML file with a top-level ``sources`` list. Each entry is mapped by _row() and
+    inserted, or, when its source_id already exists, every other column in _COLS is overwritten
+    with the seed's value (``created_at`` is kept).
+
+    Raises KeyError if an entry lacks ``id``, ``name`` or ``kind``; all entries are mapped before
+    the first write, so a malformed entry leaves the table untouched.
+    """
+    # Decode explicitly as UTF-8 rather than with the platform's locale encoding.
+    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
+    rows = data.get("sources") or []           # tolerate an empty "sources:" key
+    cols = ", ".join(_COLS)
+    placeholders = ", ".join(f":{c}" for c in _COLS)
+    updates = ", ".join(f"{c}=excluded.{c}" for c in _COLS if c != "source_id")
+    sql = (
+        f"INSERT INTO sources ({cols}, created_at) VALUES ({placeholders}, datetime('now')) "
+        f"ON CONFLICT(source_id) DO UPDATE SET {updates}"
+    )
+    conn.executemany(sql, [_row(s) for s in rows])
+    conn.commit()
+    return len(rows)