Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)

2026-06-15 09:24:29 -05:00
commit a6aec77506
77 changed files with 6263 additions and 0 deletions
@@ -0,0 +1,11 @@
+"""Ten31 Signal Engine — pilot.
+
+A recurring pipeline that ingests audio + text, extracts structured propositions
+locally, and surfaces signal over time. The discipline that separates signal from
+plausible-sounding noise (handoff §5): statistics & graph structure NOMINATE
+candidates; the frontier model only JUDGES and FANS OUT a pre-filtered shortlist.
+
+See README.md for the architecture and ten31-signal-engine-handoff.md for the spec.
+"""
+
+__version__ = "0.1.0"
@@ -0,0 +1,4 @@
+from .cli import main
+
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1 @@
+"""Client-side backfill queue (§13.4). Producers enqueue; ONE worker drains sequentially."""
@@ -0,0 +1,123 @@
+"""Backfill job queue over the `backfill_jobs` table (§13.4).
+
+Model the corpus backfill as a managed GPU-hours queue, not a real-time fan-out. Producers
+(ingestion) enqueue lightweight job descriptors; a SINGLE worker leases and drains them one at a
+time so audio never goes parallel (→ 503). Jobs are:
+  - idempotent: UNIQUE(job_type, input_hash); re-enqueue of seen content is a no-op.
+  - crash-safe: leases expire, so a dead worker's job returns to the pool automatically.
+  - prioritized: lower `priority` runs first (backtest corpus + filings jump ahead).
+
+This is plain SQLite so the whole queue is `SELECT * FROM backfill_jobs`.
+"""
+from __future__ import annotations
+
+import sqlite3
+from typing import Any, Optional, Sequence
+
+LEASE_SECONDS_DEFAULT = 600
+
+
+def enqueue(
+    conn: sqlite3.Connection,
+    *,
+    job_type: str,
+    target_id: str,
+    input_hash: str,
+    parent_doc_id: str | None = None,
+    priority: int = 100,
+    max_attempts: int = 5,
+) -> Optional[int]:
+    """Queue one job in state 'pending' and commit.
+
+    The statement is INSERT OR IGNORE, so a row the table rejects as a duplicate is skipped
+    silently instead of raising (idempotent skip — §13.4).
+
+    Returns:
+        The id of the inserted row, or None when nothing was inserted.
+    """
+    values = (job_type, target_id, parent_doc_id, priority, max_attempts, input_hash)
+    cursor = conn.execute(
+        """INSERT OR IGNORE INTO backfill_jobs
+             (job_type, target_id, parent_doc_id, priority, max_attempts, input_hash, state)
+           VALUES (?,?,?,?,?,?, 'pending')""",
+        values,
+    )
+    conn.commit()
+    if not cursor.rowcount:
+        # rowcount == 0 means the insert was ignored.
+        return None
+    return cursor.lastrowid
+
+
+def lease_next(
+    conn: sqlite3.Connection,
+    *,
+    worker_id: str,
+    job_types: Sequence[str] | None = None,
+    lease_seconds: int = LEASE_SECONDS_DEFAULT,
+) -> Optional[sqlite3.Row]:
+    """Claim the highest-priority eligible job for `worker_id`.
+
+    Eligible = pending, OR a running/leased job whose lease has expired (crash recovery).
+    Candidates are ordered by `priority` ascending, then `job_id` ascending. A successful claim
+    sets state='running', records the lease owner and expiry, and increments `attempts`.
+
+    The claim is race-safe: the UPDATE repeats the eligibility predicate in its WHERE clause, so
+    if another connection leased the same row between our SELECT and UPDATE, zero rows change
+    and we simply pick the next candidate instead of double-leasing.
+
+    Args:
+        conn: connection to the database holding `backfill_jobs`.
+        worker_id: stored in `lease_owner` on the claimed row.
+        job_types: optional allow-list of job types; None/empty means any type.
+        lease_seconds: lease duration, applied as a SQLite datetime modifier.
+
+    Returns:
+        The claimed job row (re-read after the update), or None when nothing is eligible.
+
+    NOTE(review): `max_attempts` is not consulted here — an expired lease is re-claimed however
+    many attempts have been used; only `fail()` dead-letters a job. Confirm that is intended.
+    """
+    eligible = (
+        "(state = 'pending'"
+        " OR (state IN ('leased','running')"
+        " AND lease_expires_at IS NOT NULL"
+        " AND lease_expires_at < datetime('now')))"
+    )
+    type_filter = ""
+    type_params: list[Any] = []
+    if job_types:
+        type_filter = f" AND job_type IN ({','.join('?' * len(job_types))})"
+        type_params.extend(job_types)
+    lease_modifier = f"+{int(lease_seconds)} seconds"
+
+    while True:
+        candidate = conn.execute(
+            f"SELECT job_id FROM backfill_jobs WHERE {eligible}{type_filter}"
+            " ORDER BY priority ASC, job_id ASC LIMIT 1",
+            type_params,
+        ).fetchone()
+        if candidate is None:
+            return None
+        job_id = candidate[0]
+        claimed = conn.execute(
+            f"""UPDATE backfill_jobs
+                  SET state='running', lease_owner=?, lease_expires_at=datetime('now', ?),
+                      attempts=attempts+1, updated_at=datetime('now')
+                WHERE job_id=? AND {eligible}""",
+            (worker_id, lease_modifier, job_id),
+        )
+        conn.commit()
+        if claimed.rowcount:
+            return conn.execute(
+                "SELECT * FROM backfill_jobs WHERE job_id=?", (job_id,)
+            ).fetchone()
+        # Lost the race for this row: it is no longer eligible, so the next SELECT moves on.
+
+
+def complete(conn: sqlite3.Connection, job_id: int, *, output_ref: str | None = None,
+             gpu_seconds: float | None = None) -> None:
+    """Mark `job_id` as 'done' and commit.
+
+    Stores `output_ref` and `gpu_seconds` on the row, clears any previous `error`, and
+    refreshes `updated_at`.
+    """
+    bindings = (output_ref, gpu_seconds, job_id)
+    conn.execute(
+        """UPDATE backfill_jobs SET state='done', output_ref=?, gpu_seconds=?, error=NULL,
+                  updated_at=datetime('now') WHERE job_id=?""",
+        bindings,
+    )
+    conn.commit()
+
+
+def fail(conn: sqlite3.Connection, job_id: int, error: Any) -> str:
+    """Record a failure for `job_id` and release its lease.
+
+    The job goes back to 'pending' while attempts remain, and to 'failed' (dead-letter) once
+    `attempts` has reached `max_attempts`. The error text is stored truncated to 2000 chars.
+
+    Returns:
+        The state that was written ('pending' or 'failed'). For an unknown `job_id` no row
+        changes and 'pending' is returned.
+    """
+    record = conn.execute(
+        "SELECT attempts, max_attempts FROM backfill_jobs WHERE job_id=?", (job_id,)
+    ).fetchone()
+    if record and record["attempts"] >= record["max_attempts"]:
+        outcome = "failed"
+    else:
+        outcome = "pending"
+    message = str(error)[:2000]
+    conn.execute(
+        """UPDATE backfill_jobs SET state=?, error=?, lease_owner=NULL, lease_expires_at=NULL,
+                  updated_at=datetime('now') WHERE job_id=?""",
+        (outcome, message, job_id),
+    )
+    conn.commit()
+    return outcome
+
+
+def skip(conn: sqlite3.Connection, job_id: int, reason: str | None = None) -> None:
+    """Move `job_id` to the terminal, non-error 'skipped' state and commit.
+
+    Use this for an intentionally dropped job; a chunk that merely produced zero claims is
+    still 'done'. `reason` is stored in the `error` column.
+    """
+    statement = (
+        "UPDATE backfill_jobs SET state='skipped', error=?, updated_at=datetime('now') WHERE job_id=?"
+    )
+    conn.execute(statement, (reason, job_id))
+    conn.commit()
+
+
+def stats(conn: sqlite3.Connection) -> dict[str, dict[str, int]]:
+    """Count queued jobs per (job_type, state).
+
+    Returns:
+        A nested mapping `{job_type: {state: count}}`; empty when the table has no rows.
+    """
+    counts: dict[str, dict[str, int]] = {}
+    grouped = conn.execute(
+        "SELECT job_type, state, COUNT(*) AS n FROM backfill_jobs GROUP BY job_type, state"
+    ).fetchall()
+    for record in grouped:
+        per_state = counts.setdefault(record["job_type"], {})
+        per_state[record["state"]] = record["n"]
+    return counts
@@ -0,0 +1,619 @@
+"""Pilot CLI. Subcommands map to the build order in handoff §11.
+
+Implemented subcommands are the ones registered in `build_parser()`: the foundation (init-db,
+seed-*, spark-status, db-tables) plus the ingest, transcribe, extract, embed/search, backtest and
+serve commands. Stages not registered there are added as they're built.
+"""
+from __future__ import annotations
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+
+from .config import load_config
+from .store import db
+from .store.seed import load_convictions, load_fanout
+from .store.sources import load_source_edges, load_sources, update_feeds
+
+DEFAULT_CONVICTION_SEED = Path("seeds/conviction_log.seed.yaml")
+DEFAULT_SOURCES_SEED = Path("seeds/sources.seed.yaml")
+DEFAULT_FEEDS_SEED = Path("seeds/podcast_feeds.resolved.yaml")
+
+
+def _setup_logging(level: str) -> None:
+    """Configure root logging; a level name the `logging` module does not define falls back to INFO."""
+    resolved_level = getattr(logging, level.upper(), logging.INFO)
+    logging.basicConfig(
+        level=resolved_level,
+        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+    )
+
+
+def cmd_init_db(args: argparse.Namespace) -> int:
+    """Create the schema in the configured database and list the resulting tables/views."""
+    db_path = load_config().db_path
+    connection = db.connect(db_path)
+    db.init_db(connection)
+    print(f"Initialized DB at {db_path}")
+    names = db.table_names(connection)
+    print("Tables/views:", ", ".join(names))
+    return 0
+
+
+def cmd_seed_convictions(args: argparse.Namespace) -> int:
+    """Load the conviction seed file (`--file`) and report any thesis-breakers it contains."""
+    config = load_config()
+    connection = db.connect(config.db_path)
+    db.init_db(connection)  # the schema must exist before the loader writes to it
+    seed_path = Path(args.file)
+    count = load_convictions(connection, seed_path)
+    print(f"Upserted {count} convictions from {seed_path}")
+
+    thesis_breakers = connection.execute(
+        "SELECT conviction_id, thematic_proposition FROM conviction_log WHERE is_thesis_breaker = 1"
+    ).fetchall()
+    if not thesis_breakers:
+        return 0
+    print("Thesis-breakers loaded (engine must surface these AGAINST the thesis, §5.7):")
+    for breaker in thesis_breakers:
+        conviction_id = breaker["conviction_id"]
+        summary = breaker["thematic_proposition"][:80]  # keep each line short
+        print(f"  {conviction_id}: {summary}...")
+    return 0
+
+
+def cmd_seed_sources(args: argparse.Namespace) -> int:
+    """Load the source registry seed file (`--file`), then print the source count per kind."""
+    config = load_config()
+    connection = db.connect(config.db_path)
+    db.init_db(connection)
+    upserted = load_sources(connection, Path(args.file))
+    kind_counts = connection.execute(
+        "SELECT kind, COUNT(*) n FROM sources GROUP BY kind ORDER BY kind"
+    ).fetchall()
+    print(f"Upserted {upserted} sources from {args.file}")
+    for entry in kind_counts:
+        print(f"  {entry['kind']}: {entry['n']}")
+    return 0
+
+
+def cmd_seed_edges(args: argparse.Namespace) -> int:
+    """Load source edges from `--file` and print how many were new plus the table total."""
+    config = load_config()
+    connection = db.connect(config.db_path)
+    db.init_db(connection)
+    seed_file = Path(args.file)
+    inserted = load_source_edges(connection, seed_file)
+    count_row = connection.execute("SELECT COUNT(*) FROM source_edges").fetchone()
+    edge_total = count_row[0]
+    print(f"Inserted {inserted} new edges from {args.file} ({edge_total} edges total)")
+    return 0
+
+
+def cmd_load_feeds(args: argparse.Namespace) -> int:
+    """Apply the feed file (`--file`) to podcast sources and print the backtest-reach breakdown."""
+    config = load_config()
+    connection = db.connect(config.db_path)
+    db.init_db(connection)
+    applied = update_feeds(connection, Path(args.file))
+    print(f"updated {applied} podcast feeds")
+
+    reach_sql = (
+        "SELECT backtest_2022_2023, COUNT(*) c FROM sources WHERE kind='podcast' "
+        "GROUP BY backtest_2022_2023 ORDER BY c DESC"
+    )
+    reach_rows = connection.execute(reach_sql).fetchall()
+    print("backtest 2022-2023 reach:")
+    for reach in reach_rows:
+        label = reach["backtest_2022_2023"] or "unset"  # NULL/empty reach is shown as 'unset'
+        print(f"  {label}: {reach['c']}")
+    return 0
+
+
+def cmd_ingest_edgar(args: argparse.Namespace) -> int:
+    """Fetch SEC filings for `--ticker` and queue extract jobs for them.
+
+    `--forms` is a comma list of form types; blank entries (e.g. from a trailing comma) are
+    dropped, and an empty/absent list falls back to 10-K, 10-Q and 8-K. The source row is
+    resolved through `_resolve_source_id`, the same helper `ingest-earnings` uses, so both
+    commands create identical placeholder rows (name and ticker upper-cased) for a ticker that
+    has not been seeded.
+
+    Returns 0 after printing the number of new documents and queued jobs.
+    """
+    from .ingest.edgar import EdgarClient, ingest_filings
+    cfg = load_config()
+    conn = db.connect(cfg.db_path)
+    db.init_db(conn)
+    client = EdgarClient(cfg.edgar_user_agent)
+
+    default_forms = ("10-K", "10-Q", "8-K")
+    requested = (args.forms or "").split(",")
+    # Skip blanks so "10-K," or "10-K,,8-K" never passes an empty form type downstream.
+    forms = tuple(f.strip() for f in requested if f.strip()) or default_forms
+
+    # resolve source_id from ticker (creates a lightweight source row if not seeded)
+    source_id = _resolve_source_id(conn, args.ticker)
+
+    n_docs, n_jobs = ingest_filings(conn, client, source_id=source_id, ticker=args.ticker,
+                                    since=args.since, until=args.until, forms=forms)
+    print(f"{args.ticker}: +{n_docs} filing documents, +{n_jobs} extract jobs queued "
+          f"(forms={','.join(forms)}, since={args.since}, until={args.until})")
+    return 0
+
+
+def _resolve_source_id(conn, ticker: str, kind: str = "filing") -> str:
+    """Return the source_id for `ticker`, creating a placeholder source row when none exists.
+
+    The lookup is case-insensitive on the ticker. A new row gets the id `co-<ticker lowercased>`,
+    the upper-cased ticker as both name and ticker, and the given `kind`.
+    """
+    existing = conn.execute(
+        "SELECT source_id FROM sources WHERE upper(ticker)=upper(?)", (ticker,)
+    ).fetchone()
+    if existing:
+        return existing["source_id"]
+
+    symbol = ticker.upper()
+    new_id = f"co-{ticker.lower()}"
+    conn.execute(
+        "INSERT OR IGNORE INTO sources (source_id, name, kind, ticker) VALUES (?,?,?,?)",
+        (new_id, symbol, kind, symbol),
+    )
+    conn.commit()
+    return new_id
+
+
+def cmd_ingest_doc(args: argparse.Namespace) -> int:
+    """Ingest a single document from `--url` for `--source`; the title defaults to the URL."""
+    from .ingest.docs import ingest_one
+    config = load_config()
+    connection = db.connect(config.db_path)
+    db.init_db(connection)
+    title = args.title or args.url
+    doc_id = ingest_one(
+        connection,
+        config,
+        source_id=args.source,
+        url=args.url,
+        title=title,
+        date=args.date,
+        method=args.method,
+    )
+    if doc_id:
+        print(f"ingested: {doc_id}")
+    else:
+        print("no new doc (duplicate / too short / fetch failed)")
+    return 0
+
+
+def cmd_ingest_feed_text(args: argparse.Namespace) -> int:
+    """Ingest article documents from the RSS feed at `--url` for `--source`."""
+    from .ingest.docs import ingest_feed_text
+    config = load_config()
+    connection = db.connect(config.db_path)
+    db.init_db(connection)
+    ingested = ingest_feed_text(
+        connection,
+        config,
+        source_id=args.source,
+        rss_url=args.url,
+        since=args.since,
+        until=args.until,
+        limit=args.limit,
+    )
+    print(f"ingested {ingested} article docs from feed for {args.source}")
+    return 0
+
+
+def cmd_ingest_doc_manifest(args: argparse.Namespace) -> int:
+    """Batch-ingest the manifest at `--file` and print the ingested/skipped/missing_source counts."""
+    from .ingest.docs import ingest_manifest
+    config = load_config()
+    connection = db.connect(config.db_path)
+    db.init_db(connection)
+    manifest_path = Path(args.file)
+    report = ingest_manifest(connection, config, manifest_path)
+    print(f"manifest: ingested={report['ingested']} skipped={report['skipped']} missing_source={report['missing_source']}")
+    return 0
+
+
+def cmd_ingest_earnings(args: argparse.Namespace) -> int:
+    """Ingest earnings transcripts for `--ticker` via FMP; returns 1 when no API key is configured."""
+    from .ingest.earnings import FMPClient, ingest_for_ticker
+    config = load_config()
+    api_key = config.fmp_api_key
+    if not api_key:
+        print("FMP_API_KEY not set", file=sys.stderr)
+        return 1
+
+    connection = db.connect(config.db_path)
+    db.init_db(connection)
+    fmp_client = FMPClient(api_key)
+    source_id = _resolve_source_id(connection, args.ticker)
+    symbol = args.ticker.upper()
+    n_docs, n_jobs = ingest_for_ticker(
+        connection,
+        fmp_client,
+        source_id=source_id,
+        symbol=symbol,
+        data_dir=config.data_dir,
+        since=args.since,
+        until=args.until,
+        limit=args.limit,
+    )
+    print(f"{args.ticker}: +{n_docs} earnings transcripts, +{n_jobs} extract jobs (since={args.since}, until={args.until})")
+    return 0
+
+
+def cmd_embed_claims(args: argparse.Namespace) -> int:
+    """Embed pending propositions and upsert them into the Qdrant collection at `--qdrant-url`.
+
+    With `--no-sparse` no sparse embedder is constructed, so only the dense path is used.
+    """
+    from .spark import from_config
+    from .embedstore.qdrant_store import get_client, ensure_collection, upsert_pending
+    from .embedstore.embedder import SparseEmbedder
+    config = load_config()
+    connection = db.connect(config.db_path)
+    db.init_db(connection)
+    spark = from_config(config)
+    qdrant = get_client(args.qdrant_url)
+    was_created = ensure_collection(qdrant)
+    print(f"collection {'created' if was_created else 'exists'}")
+    sparse = None if args.no_sparse else SparseEmbedder()
+    upserted = upsert_pending(connection, spark, qdrant, sparse)
+    sparse_state = "on" if sparse and sparse.available else "off"
+    print(f"embedded + upserted {upserted} propositions (sparse={sparse_state})")
+    return 0
+
+
+def cmd_search(args: argparse.Namespace) -> int:
+    """Search the 'propositions' collection through the gateway and print the hits as JSON.
+
+    Uses `--query`, `--top-k` and `--no-rerank`. The printed JSON is capped at 2500 characters.
+    """
+    # Imported here because this module's top-level imports do not include json; without it
+    # the json.dumps call below raised NameError.
+    import json
+    from .spark import from_config
+    cfg = load_config()
+    sc = from_config(cfg)
+    res = sc.search(args.query, collection="propositions", top_k=args.top_k, rerank=not args.no_rerank)
+    # Use the first non-empty of 'results' / 'hits'; otherwise dump the whole response.
+    hits = res.get("results") or res.get("hits") or res
+    print(json.dumps(hits, indent=2)[:2500])
+    return 0
+
+
+def cmd_ingest_podcast(args: argparse.Namespace) -> int:
+    """Register episodes for `--source` via RSS or YouTube and queue transcribe jobs.
+
+    `--via auto` picks YouTube only when the source is marked 'youtube_only' for the 2022-2023
+    backtest AND a `--since` bound was given; otherwise RSS. Returns 1 for an unknown source.
+    """
+    from .ingest.podcasts import ingest_rss, ingest_youtube
+    config = load_config()
+    connection = db.connect(config.db_path)
+    db.init_db(connection)
+    source = connection.execute(
+        "SELECT * FROM sources WHERE source_id=?", (args.source,)
+    ).fetchone()
+    if not source:
+        print(f"unknown source {args.source}", file=sys.stderr)
+        return 1
+
+    via = args.via
+    if via == "auto":
+        needs_youtube = source["backtest_2022_2023"] == "youtube_only" and args.since
+        via = "youtube" if needs_youtube else "rss"
+    if via == "youtube":
+        ingest = ingest_youtube
+    else:
+        ingest = ingest_rss
+    n_docs, n_jobs = ingest(connection, source, since=args.since, until=args.until, limit=args.limit)
+    print(f"{source['name']} via {via}: +{n_docs} episodes, +{n_jobs} transcribe jobs")
+    return 0
+
+
+def cmd_run_transcribe(args: argparse.Namespace) -> int:
+    """Run the transcribe worker with `--limit` / `--max-chunks` and print the processed-job count."""
+    from .spark import from_config
+    from .ingest.transcribe_worker import run_transcribe
+    config = load_config()
+    connection = db.connect(config.db_path)
+    db.init_db(connection)
+    spark = from_config(config)
+    summary = run_transcribe(
+        connection, spark, config, limit=args.limit, max_chunks=args.max_chunks
+    )
+    print(f"transcription: {summary['jobs_processed']} jobs processed")
+    return 0
+
+
+def cmd_run_transcribe_gemini(args: argparse.Namespace) -> int:
+    """Run the Gemini transcribe backfill and print job counts, token usage and a cost estimate."""
+    from .ingest.gemini_transcribe import run_transcribe_gemini
+    config = load_config()
+    connection = db.connect(config.db_path)
+    report = run_transcribe_gemini(
+        connection, config, limit=args.limit, concurrency=args.concurrency
+    )
+    tokens_in = report["prompt_tokens"]
+    tokens_out = report["output_tokens"]
+    # Gemini 2.5 Flash list price: ~$0.30/1M text-in, audio-in ~$1.00/1M, $2.50/1M out. Audio dominates in.
+    cost = tokens_in / 1_000_000 * 1.00 + tokens_out / 1_000_000 * 2.50
+    # Guard the per-episode figure against a run that completed nothing.
+    per_episode = cost / max(report["done"], 1)
+    print(f"gemini transcribe: done={report['done']} failed={report['failed']} | "
+          f"tokens in={tokens_in:,} out={tokens_out:,} | ~${cost:.2f} this run (≈${per_episode:.3f}/ep)")
+    return 0
+
+
+def cmd_run_extract(args: argparse.Namespace) -> int:
+    """Run the extract worker with `--limit` / `--max-chunks` and print jobs and claims written."""
+    from .spark import from_config
+    from .extract.worker import run_extract
+    config = load_config()
+    connection = db.connect(config.db_path)
+    db.init_db(connection)
+    spark = from_config(config)
+    summary = run_extract(
+        connection, spark, config, limit=args.limit, max_chunks_per_doc=args.max_chunks
+    )
+    print(f"extraction: {summary['jobs_processed']} jobs, {summary['claims_written']} claims written")
+    return 0
+
+
+def cmd_queue_status(args: argparse.Namespace) -> int:
+    """Print backfill queue counts, one line per job type with its states in sorted order."""
+    from .backfill import queue
+    config = load_config()
+    connection = db.connect(config.db_path)
+    db.init_db(connection)
+    counts = queue.stats(connection)
+    if not counts:
+        print("queue empty")
+        return 0
+    for job_type in sorted(counts):
+        by_state = counts[job_type]
+        parts = ", ".join(f"{state}={total}" for state, total in sorted(by_state.items()))
+        print(f"  {job_type}: {parts}")
+    return 0
+
+
+def cmd_feed_peek(args: argparse.Namespace) -> int:
+    """Fetch the feed at `--url` and print its status, episode count and the first `--limit` episodes."""
+    from .ingest.feeds import fetch_feed, episode_records
+    feed = fetch_feed(args.url)
+    status = getattr(feed, "status", None)
+    episodes = episode_records(feed)
+    bozo = getattr(feed, "bozo", None)
+    print(f"status={status} bozo={bozo} episodes_with_audio={len(episodes)}")
+    for episode in episodes[: args.limit]:
+        title = str(episode["title"])[:70]
+        print(f"  [{episode['published']}] {title}")
+    if episodes:
+        # Treats the last record as oldest and the first as newest.
+        print(f"oldest in feed: {episodes[-1]['published']}  newest: {episodes[0]['published']}")
+    return 0
+
+
+def cmd_serve(args: argparse.Namespace) -> int:
+    """Serve the corpus UI with uvicorn on all interfaces; `--port` overrides the configured port."""
+    import uvicorn
+    from .ui.app import create_app
+    config = load_config()
+    port = args.port if args.port else config.ui_port
+    print(f"serving corpus UI on http://0.0.0.0:{port}")
+    app = create_app()
+    uvicorn.run(app, host="0.0.0.0", port=port)
+    return 0
+
+
+def cmd_seed_fanout(args: argparse.Namespace) -> int:
+    """Load the fan-out seed file (`--file`) and print how many derivative nodes were seeded."""
+    config = load_config()
+    connection = db.connect(config.db_path)
+    db.init_db(connection)
+    seed_path = Path(args.file)
+    seeded = load_fanout(connection, seed_path)
+    print(f"seeded {seeded} fan-out derivative nodes")
+    return 0
+
+
+def cmd_backtest(args: argparse.Namespace) -> int:
+    """Run the §7.1 backtest over a march of as_of dates and print the node trajectories.
+
+    Dates run from `--start` to `--end` (inclusive) in steps of `--step-days`. For every node
+    the report shows its peak score and the first as_of date whose evidence flag was set; the
+    `--headline` node additionally gets a per-date breakdown and a one-line verdict.
+
+    Returns:
+        0 on success, 2 when `--step-days` is not a positive number of days.
+    """
+    from .spark import from_config
+    from .signals.run import run_backtest
+    from datetime import datetime, timedelta
+    if args.step_days <= 0:
+        # A zero or negative step never moves past --end, so the date loop below would
+        # spin forever while `dates` keeps growing.
+        print("--step-days must be a positive integer", file=sys.stderr)
+        return 2
+    cfg = load_config()
+    conn = db.connect(cfg.db_path)
+    db.init_db(conn)
+    sc = from_config(cfg)
+    # as_of march: fixed --step-days increments (the 30-day default is roughly monthly)
+    start = datetime.strptime(args.start, "%Y-%m-%d")
+    end = datetime.strptime(args.end, "%Y-%m-%d")
+    dates, d = [], start
+    while d <= end:
+        dates.append(d.strftime("%Y-%m-%d"))
+        d = d + timedelta(days=args.step_days)
+    print(f"§7.1 backtest: conviction={args.conviction}, as_of march {args.start}→{args.end} ({len(dates)} points)")
+    timeline = run_backtest(conn, sc, cfg, conviction_id=args.conviction, dates=dates, window_days=args.window_days)
+
+    # report: per-node first-clear date + score trajectory; highlight the headline derivative
+    print("\n=== node trajectories (score by as_of; ★=cleared evidence bar) ===")
+    nodes = {}
+    for as_of, res in timeline:
+        for r in res:
+            # Key by node_id, falling back to conviction_id when node_id is empty.
+            key = r["node"]["node_id"] or r["node"]["conviction_id"]
+            nodes.setdefault(key, []).append((as_of, r["result"]["score"], r["evidence"], r["promotion"], r["result"]["inputs"]))
+    for key, traj in sorted(nodes.items()):
+        first = next((t for t in traj if t[2]), None)  # first point whose evidence flag is truthy
+        peak = max(traj, key=lambda t: t[1])
+        mark = f"first-cleared {first[0]}" if first else "never cleared"
+        print(f"  {key:28} peak={peak[1]:.2f}  {mark}")
+    head = nodes.get(args.headline)
+    if head:
+        print(f"\n=== HEADLINE derivative: {args.headline} ===")
+        for as_of, score, ev, pr, inp in head:
+            star = "★" if ev else ("·" if score > 0 else " ")
+            print(f"  {as_of} {star} score={score:.2f} corrob={inp.get('corroboration',0)} "
+                  f"n_conf={inp.get('n_confirmed',0)} eisc={inp.get('eisc_corrob',0)} "
+                  f"a={inp.get('a_corrob',0)} k_eff={inp.get('k_eff0',0)}")
+        firstclear = next((t for t in head if t[2]), None)
+        print(f"\n  VERDICT: headline power-infra derivative "
+              f"{'SURFACED at ' + firstclear[0] if firstclear else 'did NOT surface'} "
+              f"(bar = under_acted ≥ {0.3})")
+    return 0
+
+
+def cmd_two_sided(args: argparse.Namespace) -> int:
+    """Print the two-sided net-corroboration trajectory (DESIGN_v2.1 H5) for the adversarial cases.
+
+    BATTERY: demand-net should rise while supply-net stays flat. STRIKE: net stays quiet in
+    live, fires in test.
+
+    For every fan-out node of `--conviction` (optionally narrowed by `--nodes`, a comma list of
+    case-insensitive substrings matched against node_id) and every mode in `--modes`, one
+    trajectory is computed over the `--dates` list and printed point by point.
+
+    NOTE(review): unlike most commands this one does not call `db.init_db`; it presumably
+    expects an already-initialized database — confirm.
+    """
+    from .spark import from_config as spark_from_config
+    from .extract.backends import from_config as backend_from_config
+    from .signals.two_sided import trajectory
+    cfg = load_config()
+    conn = db.connect(cfg.db_path)
+    sc = spark_from_config(cfg)
+    backend = backend_from_config(cfg, sc)
+    nodes = conn.execute(
+        "SELECT node_id, derivative_proposition FROM fanout_nodes WHERE parent_conviction_id=? ORDER BY node_id",
+        (args.conviction,),
+    ).fetchall()
+    dates = [d.strip() for d in args.dates.split(",")]
+    # Empty --nodes means "no filter"; blank entries in the comma list are ignored.
+    filt = [s for s in args.nodes.split(",") if s] if args.nodes else []
+    for r in nodes:
+        # Keep the node when any filter substring occurs in its id (case-insensitive).
+        if filt and not any(k.lower() in r["node_id"].lower() for k in filt):
+            continue
+        for mode in [m.strip() for m in args.modes.split(",")]:
+            traj = trajectory(conn, sc, backend, r["derivative_proposition"], dates,
+                              window_days=args.window_days, mode=mode)
+            print(f"\n### {r['node_id']}  [mode={mode}, window={args.window_days}d] ###")
+            for pt in traj:
+                # Keys read with .get() are optional and print as '?' when absent.
+                print(f"  {pt['as_of']}: net={pt['net']:+.2f}  "
+                      f"affirm(eisc={pt['affirms_eisc']}, hard_src={pt.get('hard_affirm_src','?')}, "
+                      f"n_claims={pt['n_affirm']}, soft_dropped={pt.get('soft_affirm_src_dropped','?')})  "
+                      f"deny(eisc={pt['denies_eisc']}, n={pt['n_deny']})  "
+                      f"own_net={pt['own_network_affirm_src']}")
+    return 0
+
+
+def cmd_confusion(args: argparse.Namespace) -> int:
+    """Run the pre-registered confusion matrix for the spec at `--spec` and print the report.
+
+    Prints one row per derivative (repricing confirmed?, peak %, whisper/cleared signal dates
+    with their runway, and the classification at each level), followed by TP/FP/FN/TN with
+    precision and recall for the 'cleared' and 'whisper' levels.
+    """
+    from .signals.confusion import run_confusion
+    cfg = load_config()
+    conn = db.connect(cfg.db_path)
+    db.init_db(conn)
+    out = run_confusion(conn, cfg, args.spec)
+    # `classify` is a callable supplied by the resolver: (row, level) -> label.
+    classify = out["classify"]
+    print("=== PRE-REGISTERED confusion matrix (DESIGN_v2 §1) — precision AND recall; RUNWAY = frac of move still ahead at signal ===")
+    print(f"{'derivative':26} {'reprice?':8} {'peak%':>6} {'whisper':>9} {'run_wh':>6} {'cleared':>9} {'run_cl':>6} cl/wh")
+    for r in out["rows"]:
+        cl, wh = classify(r, "cleared"), classify(r, "whisper")
+        # Flag rows whose `missing` list is non-empty (shown as "no px:<items>").
+        miss = f" (no px:{','.join(r['missing'])})" if r["missing"] else ""
+        # Missing dates/runways are shown as '-'; runways use an explicit None check so 0 still prints.
+        print(f"{r['node']:26} {('REAL' if r['confirmed'] else 'no'):8} {str(r['peak_pct']):>6} "
+              f"{str(r['whisper_date'] or '-'):>9} {str(r['runway_whisper'] if r['runway_whisper'] is not None else '-'):>6} "
+              f"{str(r['cleared_date'] or '-'):>9} {str(r['runway_cleared'] if r['runway_cleared'] is not None else '-'):>6} "
+              f"{cl}/{wh}{miss}")
+    for level in ("cleared", "whisper"):
+        # Each level entry unpacks to (counts dict, precision, recall); precision/recall may be None.
+        c, p, rec = out[level]
+        print(f"\n{level.upper()} level: TP={c['TP']} FP={c['FP']} FN={c['FN']} TN={c['TN']} | "
+              f"precision={p if p is None else round(p,2)} recall={rec if rec is None else round(rec,2)}")
+    print("\nlead_* = days the repricing came AFTER the signal (positive = engine was early).")
+    print("The cleared→whisper delta = what the independence floor cost in lead time / recall.")
+    return 0
+
+
+def cmd_provenance(args: argparse.Namespace) -> int:
+    """The processing log — what's been ingested/processed, so we never reprocess silently.
+
+    Prints processed/total document counts per kind, the number of dedup_key groups shared by
+    more than one document, and how many documents lack a dedup_key. With `--backfill-hashes`
+    it also fills in missing `dedup_key` values (title+date key for podcast/youtube documents,
+    `external_id` otherwise) and missing `content_hash` values (SHA-256 of the transcript file,
+    when that file exists), then commits.
+    """
+    cfg = load_config()
+    conn = db.connect(cfg.db_path)
+    db.init_db(conn)
+    print("processed documents (the durable log):")
+    for r in conn.execute(
+        "SELECT kind, COUNT(*) total, SUM(CASE WHEN processed_at IS NOT NULL THEN 1 ELSE 0 END) proc "
+        "FROM documents GROUP BY kind ORDER BY kind"
+    ):
+        print(f"  {r['kind']:14} {r['proc']}/{r['total']} processed")
+    print("dedup model: (1) UNIQUE(source_id, external_id) = robust pre-GPU guard; "
+          "(2) dedup_key = cross-mirror (title+date); content_hash = audit only.")
+    dups = conn.execute(
+        "SELECT dedup_key, COUNT(*) c FROM documents WHERE dedup_key IS NOT NULL "
+        "GROUP BY dedup_key HAVING c > 1"
+    ).fetchall()
+    print(f"cross-mirror dedup_key groups (same episode via >1 feed): {len(dups)}")
+    miss = conn.execute("SELECT COUNT(*) FROM documents WHERE dedup_key IS NULL").fetchone()[0]
+    if miss:
+        print(f"  ({miss} docs missing dedup_key — run `provenance --backfill-hashes`)")
+    if args.backfill_hashes:
+        import hashlib
+        from .util import audio_dedup_key
+        ndk = nch = 0
+        # Read every row up front: updating `documents` while a SELECT cursor on the same
+        # table is still being stepped leaves it undefined which rows the cursor yields.
+        docs = conn.execute(
+            "SELECT doc_id, kind, title, date, external_id, transcript_path, dedup_key, content_hash FROM documents"
+        ).fetchall()
+        for r in docs:
+            updates: dict = {}
+            if not r["dedup_key"]:
+                key = (audio_dedup_key(r["title"], r["date"])
+                       if r["kind"] in ("podcast", "youtube") else r["external_id"])
+                if key:
+                    # Count only keys that were actually derived, so the summary is accurate.
+                    updates["dedup_key"] = key
+                    ndk += 1
+            transcript = r["transcript_path"]
+            if not r["content_hash"] and transcript and Path(transcript).is_file():
+                # read_bytes() closes the file; the previous bare open() leaked the handle.
+                updates["content_hash"] = hashlib.sha256(Path(transcript).read_bytes()).hexdigest()
+                nch += 1
+            if updates:
+                # Column names come from the fixed keys above, never from user input.
+                sets = ", ".join(f"{k}=?" for k in updates)
+                conn.execute(f"UPDATE documents SET {sets} WHERE doc_id=?", (*updates.values(), r["doc_id"]))
+        conn.commit()
+        print(f"backfilled {ndk} dedup_keys, {nch} content hashes (audit)")
+    return 0
+
+
+def cmd_db_tables(args: argparse.Namespace) -> int:
+    """Print every name returned by `db.table_names`, one per line."""
+    connection = db.connect(load_config().db_path)
+    names = db.table_names(connection)
+    for name in names:
+        print(name)
+    return 0
+
+
+def cmd_spark_status(args: argparse.Namespace) -> int:
+    """Probe Spark Control: print its status and endpoints, or return 1 if the probe raises."""
+    from .spark import from_config
+    config = load_config()
+    spark = from_config(config)
+    try:
+        print("status:", spark.status())
+        print("endpoints:", spark.endpoints())
+    except Exception as exc:  # noqa: BLE001 — health probe; surface, don't crash
+        print(f"Spark Control unreachable at {config.spark_control_url}: {exc}", file=sys.stderr)
+        return 1
+    return 0
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Build the top-level argument parser with one sub-parser per CLI subcommand.
+
+    A subcommand is required. Each sub-parser stores its handler under `func` via
+    `set_defaults`, which is what `main` dispatches on.
+    """
+    p = argparse.ArgumentParser(prog="signal_engine", description="Ten31 Signal Engine (pilot)")
+    sub = p.add_subparsers(dest="command", required=True)
+
+    # --- schema + seed data ---
+    sub.add_parser("init-db", help="Create the SQLite schema").set_defaults(func=cmd_init_db)
+
+    sp = sub.add_parser("seed-convictions", help="Load the conviction log (§3.1)")
+    sp.add_argument("--file", default=str(DEFAULT_CONVICTION_SEED))
+    sp.set_defaults(func=cmd_seed_convictions)
+
+    ss = sub.add_parser("seed-sources", help="Load the source registry (§7.3/§7.4)")
+    ss.add_argument("--file", default=str(DEFAULT_SOURCES_SEED))
+    ss.set_defaults(func=cmd_seed_sources)
+
+    sde = sub.add_parser("seed-edges", help="Seed EISC connectedness edges (priors) idempotently")
+    sde.add_argument("--file", default="seeds/source_edges.bitcoin.seed.yaml")
+    sde.set_defaults(func=cmd_seed_edges)
+
+    lf = sub.add_parser("load-feeds", help="Apply resolved/verified podcast feed URLs + backtest reach")
+    lf.add_argument("--file", default=str(DEFAULT_FEEDS_SEED))
+    lf.set_defaults(func=cmd_load_feeds)
+
+    sf = sub.add_parser("seed-fanout", help="Load the hand-written fan-out tree (§7.1 backtest)")
+    sf.add_argument("--file", default="seeds/fanout.K2023.seed.yaml")
+    sf.set_defaults(func=cmd_seed_fanout)
+
+    # --- backtest ---
+    bt = sub.add_parser("backtest", help="Run the §7.1 under-acted-conviction backtest (as-of march)")
+    bt.add_argument("--conviction", default="K2023")
+    bt.add_argument("--start", default="2023-01-01")
+    bt.add_argument("--end", default="2024-06-01")
+    bt.add_argument("--step-days", type=int, default=30)
+    bt.add_argument("--window-days", type=int, default=90, help="~quarterly for filings/earnings cadence")
+    bt.add_argument("--headline", default="K2023-picks-and-shovels")
+    bt.set_defaults(func=cmd_backtest)
+
+    # --- text ingestion ---
+    ie = sub.add_parser("ingest-edgar", help="Fetch SEC filings for a ticker → documents + extract jobs")
+    ie.add_argument("--ticker", required=True)
+    ie.add_argument("--since", help="ISO date lower bound, e.g. 2022-01-01")
+    ie.add_argument("--until", help="ISO date upper bound, e.g. 2023-12-31")
+    ie.add_argument("--forms", help="comma list, default 10-K,10-Q,8-K")
+    ie.set_defaults(func=cmd_ingest_edgar)
+
+    idoc = sub.add_parser("ingest-doc", help="Fetch one text doc (HTML/PDF) → document + extract job (Battery corpus)")
+    idoc.add_argument("--source", required=True, help="source_id (must exist)")
+    idoc.add_argument("--url", required=True)
+    idoc.add_argument("--title")
+    idoc.add_argument("--date", help="ISO date of the document")
+    idoc.add_argument("--method", choices=["auto", "html", "pdf"], default="auto")
+    idoc.set_defaults(func=cmd_ingest_doc)
+
+    idm = sub.add_parser("ingest-doc-manifest", help="Batch-ingest a YAML doc manifest (Battery corpus)")
+    idm.add_argument("--file", default="seeds/battery_docs.manifest.yaml")
+    idm.set_defaults(func=cmd_ingest_doc_manifest)
+
+    ift = sub.add_parser("ingest-feed-text", help="Ingest article bodies behind a text RSS feed (blog/press)")
+    ift.add_argument("--source", required=True)
+    ift.add_argument("--url", required=True, help="RSS feed URL")
+    ift.add_argument("--since")
+    ift.add_argument("--until")
+    ift.add_argument("--limit", type=int, default=50)
+    ift.set_defaults(func=cmd_ingest_feed_text)
+
+    ge = sub.add_parser("ingest-earnings", help="Fetch FMP earnings transcripts → documents + extract jobs")
+    ge.add_argument("--ticker", required=True)
+    ge.add_argument("--since", help="ISO date lower bound (uses transcript date)")
+    ge.add_argument("--until", help="ISO date upper bound")
+    ge.add_argument("--limit", type=int, default=8)
+    ge.set_defaults(func=cmd_ingest_earnings)
+
+    ts = sub.add_parser("two-sided", help="Two-sided net-corroboration trajectory (Strike/Battery adversarial cases)")
+    ts.add_argument("--conviction", default="BATTERY2022")
+    ts.add_argument("--nodes", default="", help="comma substrings to filter fan-out nodes, e.g. demand,supply")
+    ts.add_argument("--dates", default="2022-12-31,2023-06-30,2023-12-31,2024-06-30,2024-12-31")
+    ts.add_argument("--modes", default="live", help="comma list: live,test")
+    ts.add_argument("--window-days", type=int, default=365)
+    ts.set_defaults(func=cmd_two_sided)
+
+    # --- embedding + search ---
+    ec = sub.add_parser("embed-claims", help="Embed pending propositions → Qdrant hybrid collection (§4.3)")
+    # NOTE(review): the default is a hard-coded private-network address — confirm it should
+    # not come from configuration instead.
+    ec.add_argument("--qdrant-url", default="http://192.168.1.87:6333")
+    ec.add_argument("--no-sparse", action="store_true", help="dense-only (skip BM25)")
+    ec.set_defaults(func=cmd_embed_claims)
+
+    se = sub.add_parser("search", help="Hybrid search the proposition store via the gateway")
+    se.add_argument("--query", required=True)
+    se.add_argument("--top-k", type=int, default=8)
+    se.add_argument("--no-rerank", action="store_true")
+    se.set_defaults(func=cmd_search)
+
+    # --- audio ingestion + queue workers ---
+    ip = sub.add_parser("ingest-podcast", help="Register podcast episodes → transcribe jobs (RSS or YouTube)")
+    ip.add_argument("--source", required=True, help="source_id, e.g. pod-dwarkesh")
+    ip.add_argument("--via", choices=["auto", "rss", "youtube"], default="auto")
+    ip.add_argument("--since")
+    ip.add_argument("--until")
+    ip.add_argument("--limit", type=int, default=20)
+    ip.set_defaults(func=cmd_ingest_podcast)
+
+    rt = sub.add_parser("run-transcribe", help="Drain 'transcribe' jobs → speaker-attributed transcripts + voiceprints")
+    rt.add_argument("--limit", type=int, default=5)
+    rt.add_argument("--max-chunks", type=int, default=999)
+    rt.set_defaults(func=cmd_run_transcribe)
+
+    rtg = sub.add_parser("run-transcribe-gemini",
+                         help="One-time backfill: drain 'transcribe' jobs via Gemini (off the Spark GPU)")
+    rtg.add_argument("--limit", type=int, default=5)
+    rtg.add_argument("--concurrency", type=int, default=4)
+    rtg.set_defaults(func=cmd_run_transcribe_gemini)
+
+    # NOTE(review): the local name `re` would shadow the stdlib module if this file ever imports it.
+    re = sub.add_parser("run-extract", help="Drain 'extract' jobs → claims via the local LLM (§4.2)")
+    re.add_argument("--limit", type=int, default=5, help="max jobs to process this run")
+    re.add_argument("--max-chunks", type=int, default=4, help="max chunks per document")
+    re.set_defaults(func=cmd_run_extract)
+
+    sub.add_parser("queue-status", help="Backfill queue counts by type/state").set_defaults(func=cmd_queue_status)
+
+    fp = sub.add_parser("feed-peek", help="Parse an RSS feed and show episode coverage")
+    fp.add_argument("--url", required=True)
+    fp.add_argument("--limit", type=int, default=5)
+    fp.set_defaults(func=cmd_feed_peek)
+
+    # --- UI, evaluation, diagnostics ---
+    sv = sub.add_parser("serve", help="Run the corpus-management web UI (FastAPI)")
+    # None lets cmd_serve fall back to the configured UI port.
+    sv.add_argument("--port", type=int, default=None)
+    sv.set_defaults(func=cmd_serve)
+
+    cm = sub.add_parser("confusion-matrix", help="Pre-registered precision/recall on the §7.1 derivatives (resolver)")
+    cm.add_argument("--spec", default="seeds/resolution.K2023.yaml")
+    cm.set_defaults(func=cmd_confusion)
+
+    pv = sub.add_parser("provenance", help="Processing log: what's ingested/processed (dedup-safe)")
+    pv.add_argument("--backfill-hashes", action="store_true", help="compute content_hash for older transcripts")
+    pv.set_defaults(func=cmd_provenance)
+
+    sub.add_parser("db-tables", help="List tables/views").set_defaults(func=cmd_db_tables)
+    sub.add_parser("spark-status", help="Probe Spark Control health").set_defaults(func=cmd_spark_status)
+    return p
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: parse `argv`, configure logging from the config, run the subcommand.
+
+    Returns the selected handler's exit code.
+    """
+    parser = build_parser()
+    namespace = parser.parse_args(argv)
+    log_level = load_config().log_level
+    _setup_logging(log_level)
+    handler = namespace.func
+    return handler(namespace)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,101 @@
+"""Environment-driven configuration (handoff §10, §13).
+
+All config flows through env vars so the SAME code runs as a plain process now and, later, as a
+StartOS s9pk daemon (which injects these via the daemon's `exec.env` from a `store.json` FileModel).
+A local `.env` (gitignored) is loaded for convenience during the pilot.
+
+Live values confirmed against the operator's gateway 2026-06-07 (GET /api/status,/api/endpoints):
+  gateway  = https://192.168.1.72:62419  (self-signed → SPARK_VERIFY_TLS=false)
+  LLM      = RedHatAI/Qwen3.6-35B-A3B-NVFP4
+  embed    = BAAI/bge-m3 (1024-d)         rerank = BAAI/bge-reranker-v2-m3
+  ASR      = nvidia/parakeet-tdt-0.6b-v3  diarizer = nvidia/diar_sortformer_4spk-v1
+"""
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+
+def _load_dotenv(path: str = ".env") -> None:
+    """Minimal .env loader (no dependency): KEY=VALUE lines populate os.environ if not already set."""
+    p = Path(path)
+    if not p.exists():
+        return
+    for line in p.read_text().splitlines():
+        line = line.strip()
+        if not line or line.startswith("#") or "=" not in line:
+            continue
+        key, _, val = line.partition("=")
+        os.environ.setdefault(key.strip(), val.strip().strip('"').strip("'"))
+
+
+def _env(key: str, default: str | None = None) -> str | None:
+    return os.environ.get(key, default)
+
+
+@dataclass(frozen=True)
+class Config:
+    spark_control_url: str
+    spark_verify_tls: bool
+    spark_timeout_s: float
+    audio_concurrency: int   # global in-flight cap across BOTH parakeet audio endpoints (sit at 2, ceiling 3)
+
+    local_llm_model: str
+    embed_model: str
+    transcribe_model: str
+
+    anthropic_api_key: str | None
+    frontier_model: str
+
+    # Extraction backend: 'local' (Qwen via Spark Control, default) | 'gemini' (batch overflow/fallback, §scaling)
+    extraction_backend: str
+    gemini_api_key: str | None
+    gemini_model: str
+
+    fmp_api_key: str | None
+    edgar_user_agent: str
+
+    data_dir: Path
+    database_url: str
+    audio_cache_dir: Path
+
+    ui_port: int
+    log_level: str
+
+    @classmethod
+    def from_env(cls) -> "Config":
+        _load_dotenv()
+        data_dir = Path(_env("DATA_DIR", "./data") or "./data")
+        return cls(
+            spark_control_url=_env("SPARK_CONTROL_URL", "https://192.168.1.72:62419") or "",
+            spark_verify_tls=(_env("SPARK_VERIFY_TLS", "false") or "false").lower() == "true",
+            spark_timeout_s=float(_env("SPARK_TIMEOUT_S", "180") or "180"),
+            audio_concurrency=min(3, max(1, int(_env("AUDIO_CONCURRENCY", "2") or "2"))),
+            local_llm_model=_env("LOCAL_LLM_MODEL", "RedHatAI/Qwen3.6-35B-A3B-NVFP4") or "",
+            embed_model=_env("EMBED_MODEL", "BAAI/bge-m3") or "",
+            transcribe_model=_env("TRANSCRIBE_MODEL", "nvidia/parakeet-tdt-0.6b-v3") or "",
+            anthropic_api_key=_env("ANTHROPIC_API_KEY"),
+            frontier_model=_env("FRONTIER_MODEL", "claude-opus-4-8") or "",
+            extraction_backend=_env("EXTRACTION_BACKEND", "local") or "local",
+            gemini_api_key=_env("GEMINI_API_KEY"),
+            gemini_model=_env("GEMINI_MODEL", "gemini-2.5-flash") or "",
+            fmp_api_key=_env("FMP_API_KEY"),
+            edgar_user_agent=_env("EDGAR_USER_AGENT", "Ten31 Research grant@ten31.xyz") or "",
+            data_dir=data_dir,
+            database_url=_env("DATABASE_URL", "") or "",
+            audio_cache_dir=Path(_env("AUDIO_CACHE_DIR", str(data_dir / "audio-cache")) or "audio-cache"),
+            ui_port=int(_env("UI_PORT", "8000") or "8000"),
+            log_level=_env("LOG_LEVEL", "INFO") or "INFO",
+        )
+
+    @property
+    def db_path(self) -> Path:
+        prefix = "sqlite:///"
+        if self.database_url.startswith(prefix):
+            return Path(self.database_url[len(prefix):])
+        return self.data_dir / "signal.db"
+
+
+def load_config() -> Config:
+    return Config.from_env()
@@ -0,0 +1,6 @@
+"""Embedding + vector storage (§4.3).
+
+Embed DISTILLED PROPOSITIONS (not raw chunks) into a Qdrant HYBRID collection: dense bge-m3
+(via the gateway) + BM25 sparse (client-side), so entity-heavy propositions (MSTR/Strategy/
+Microstrategy) match on the lexical leg too. Retrieval goes through the gateway's /api/search.
+"""
@@ -0,0 +1,36 @@
+"""Proposition embedding: dense (bge-m3 via gateway) + optional BM25 sparse (client-side)."""
+from __future__ import annotations
+
+import logging
+
+log = logging.getLogger(__name__)
+
+
+def dense_embed(sc, texts: list[str]) -> list[list[float]]:
+    """Dense bge-m3 (1024-d) via the gateway /v1/embeddings (§4.3)."""
+    resp = sc.embed(texts)
+    data = sorted(resp["data"], key=lambda d: d.get("index", 0))
+    return [d["embedding"] for d in data]
+
+
+class SparseEmbedder:
+    """BM25 sparse vectors via FastEmbed `Qdrant/bm25` (the operator's CRM uses this exact model,
+    with the collection's `modifier: idf`). Degrades gracefully to dense-only if fastembed is absent."""
+
+    def __init__(self, model_name: str = "Qdrant/bm25") -> None:
+        self.available = False
+        self._model = None
+        try:
+            from fastembed import SparseTextEmbedding
+            self._model = SparseTextEmbedding(model_name=model_name)
+            self.available = True
+        except Exception as e:  # noqa: BLE001
+            log.warning("fastembed sparse unavailable (%s) — upserting dense-only; add sparse later", e)
+
+    def embed(self, texts: list[str]) -> list[dict | None]:
+        if not self.available or self._model is None:
+            return [None] * len(texts)
+        out: list[dict | None] = []
+        for emb in self._model.embed(texts):
+            out.append({"indices": emb.indices.tolist(), "values": emb.values.tolist()})
+        return out
@@ -0,0 +1,79 @@
+"""Qdrant hybrid collection: create + upsert distilled propositions (§4.3).
+
+Collection mgmt + upserts go DIRECT to Qdrant (§13.2 "(Qdrant direct) :6333"); retrieval goes
+through the gateway's /api/search. Named dense vector `bge_m3` (1024-d cosine) + sparse `bm25`
+(modifier IDF). Point id is a deterministic UUID5 of claim_id, so re-upsert is idempotent.
+"""
+from __future__ import annotations
+
+import logging
+import sqlite3
+import uuid
+
+from qdrant_client import QdrantClient, models
+
+from .embedder import SparseEmbedder, dense_embed
+
+log = logging.getLogger(__name__)
+
+# Qdrant collection name, and the names of its dense / sparse named vectors.
+COLLECTION = "propositions"
+DENSE = "bge_m3"
+SPARSE = "bm25"
+# Fixed UUID5 namespace: point ids are derived from claim_id under it (see _point_id).
+_NS = uuid.UUID("5f9b7e10-0000-4000-8000-000000000001")
+
+# Filterable payload (§4.3): stance/topic/cluster/date for stance distributions, time-windowed
+# consensus, corroboration lookups. NEVER infer stance from vector distance (§2.2/§5.3).
+# Each name is read as a column of the `claims` row when building a point's payload.
+_PAYLOAD_FIELDS = (
+    "claim_id", "doc_id", "source_id", "source_cluster", "topic_canonical", "date",
+    "claim_type", "time_horizon", "confidence", "rel_polarity", "engages_consensus",
+    "counters_position", "thesis_seam", "salience", "claimant", "proposition",
+)
+
+
+def get_client(qdrant_url: str) -> QdrantClient:
+    return QdrantClient(url=qdrant_url, prefer_grpc=False, timeout=60)
+
+
+def ensure_collection(client: QdrantClient, *, dim: int = 1024) -> bool:
+    names = [c.name for c in client.get_collections().collections]
+    if COLLECTION in names:
+        return False
+    client.create_collection(
+        collection_name=COLLECTION,
+        vectors_config={DENSE: models.VectorParams(size=dim, distance=models.Distance.COSINE)},
+        sparse_vectors_config={SPARSE: models.SparseVectorParams(modifier=models.Modifier.IDF)},
+    )
+    log.info("created Qdrant collection %r (dense %s %dd + sparse %s/idf)", COLLECTION, DENSE, dim, SPARSE)
+    return True
+
+
+def _point_id(claim_id: str) -> str:
+    return str(uuid.uuid5(_NS, claim_id))
+
+
+def upsert_pending(conn: sqlite3.Connection, sc, client: QdrantClient,
+                   sparse: SparseEmbedder | None = None, *, batch: int = 64) -> int:
+    """Embed + upsert every claim that has no qdrant_point_id yet; back-link the id into SQLite."""
+    rows = conn.execute("SELECT * FROM claims WHERE qdrant_point_id IS NULL").fetchall()
+    if not rows:
+        return 0
+    total = 0
+    for i in range(0, len(rows), batch):
+        chunk = rows[i:i + batch]
+        texts = [r["proposition"] for r in chunk]
+        dvecs = dense_embed(sc, texts)
+        svecs = sparse.embed(texts) if sparse else [None] * len(texts)
+        points = []
+        for r, dv, sv in zip(chunk, dvecs, svecs):
+            vectors: dict = {DENSE: dv}
+            if sv is not None:
+                vectors[SPARSE] = models.SparseVector(indices=sv["indices"], values=sv["values"])
+            payload = {f: r[f] for f in _PAYLOAD_FIELDS}
+            points.append(models.PointStruct(id=_point_id(r["claim_id"]), vector=vectors, payload=payload))
+        client.upsert(collection_name=COLLECTION, points=points)
+        for r in chunk:
+            conn.execute("UPDATE claims SET qdrant_point_id=? WHERE claim_id=?",
+                         (_point_id(r["claim_id"]), r["claim_id"]))
+        conn.commit()
+        total += len(chunk)
+    return total
@@ -0,0 +1,6 @@
+"""Extraction (§4.2) — local LLM → structured claim units. The cost & quality center.
+
+Emits at the level of the PROPOSITION: a passage may yield 0..N claims, and MOST passages yield
+zero. An extractor that dutifully emits a claim per chunk reintroduces exactly the noise the rest
+of the system is designed to remove.
+"""
@@ -0,0 +1,64 @@
+"""Pluggable extraction backends (§scaling).
+
+The §4.2 extractor calls a backend that turns chat messages into a JSON string. Default is the
+LOCAL Qwen via Spark Control (the ~95%-local design). The Gemini backend is the documented
+overflow/fallback for bulk back-cataloging at scale, or if the Sparks are unavailable — used for
+the PUBLIC corpus only, never conviction/exposure data (sovereignty boundary, §4.6).
+
+A backend exposes: complete_json(messages, max_tokens) -> str  (a JSON object string).
+"""
+from __future__ import annotations
+
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class LocalQwenBackend:
+    name = "local"
+
+    def __init__(self, sc) -> None:
+        self.sc = sc
+
+    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
+        resp = self.sc.chat(messages, json_object=True, temperature=0,
+                            enable_thinking=False, max_tokens=max_tokens)
+        return resp["choices"][0]["message"]["content"]
+
+
+class GeminiBackend:
+    """Gemini fallback/overflow. Implemented against the `google-genai` SDK. NOTE: untested until a
+    key is provided — validate end-to-end before relying on it for a real backfill. The async BATCH
+    API is the eventual scale path; this synchronous form is the drop-in fallback."""
+    name = "gemini"
+
+    def __init__(self, api_key: str, model: str = "gemini-2.5-flash") -> None:
+        from google import genai  # guarded import; pip install google-genai
+        self._genai = genai
+        self.client = genai.Client(api_key=api_key)
+        self.model = model
+
+    def complete_json(self, messages: list[dict], *, max_tokens: int = 4000) -> str:
+        from google.genai import types
+        system = "\n\n".join(m["content"] for m in messages if m["role"] == "system")
+        user = "\n\n".join(m["content"] for m in messages if m["role"] != "system")
+        resp = self.client.models.generate_content(
+            model=self.model,
+            contents=user,
+            config=types.GenerateContentConfig(
+                system_instruction=system or None,
+                temperature=0,
+                max_output_tokens=max_tokens,
+                response_mime_type="application/json",
+            ),
+        )
+        return resp.text or "{}"
+
+
+def from_config(cfg, sc) -> "LocalQwenBackend | GeminiBackend":
+    if cfg.extraction_backend == "gemini":
+        if not cfg.gemini_api_key:
+            log.warning("EXTRACTION_BACKEND=gemini but GEMINI_API_KEY missing — falling back to local")
+        else:
+            return GeminiBackend(cfg.gemini_api_key, cfg.gemini_model)
+    return LocalQwenBackend(sc)
@@ -0,0 +1,117 @@
+"""Claim extraction: text → 0..N claim units → SQLite (§4.2)."""
+from __future__ import annotations
+
+import json
+import logging
+import sqlite3
+from typing import Any
+
+from .prompt import SEED_TOPICS, build_messages
+
+log = logging.getLogger(__name__)
+
+# Allowed values per enumerated claim field. _enum() checks model output against these sets
+# and substitutes the caller's default for anything else.
+_ENUMS = {
+    "claim_type": {"interpretive", "predictive", "descriptive", "reactive"},
+    "time_horizon": {"near", "medium", "long", "unspecified"},
+    "confidence": {"low", "med", "high"},
+    "thesis_seam": {"energy_compute", "debasement_bitcoin", "ai_data_ownership", "none"},
+    "salience": {"central", "secondary", "aside"},
+}
+
+
+def register_seed_topics(conn: sqlite3.Connection) -> None:
+    """Pre-load the controlled half of the hybrid topic vocabulary (§4.2)."""
+    for t in SEED_TOPICS:
+        conn.execute(
+            "INSERT INTO topics (topic_canonical, status) VALUES (?, 'controlled') "
+            "ON CONFLICT(topic_canonical) DO UPDATE SET status='controlled'",
+            (t,),
+        )
+    conn.commit()
+
+
+def chunk_text(text: str, max_chars: int) -> list[str]:
+    """Split on paragraph boundaries into windows that fit the model context alongside the prompt."""
+    text = text.strip()
+    if not text:
+        return []
+    if len(text) <= max_chars:
+        return [text]
+    chunks: list[str] = []
+    cur: list[str] = []
+    size = 0
+    for para in text.split("\n\n"):
+        if size + len(para) > max_chars and cur:
+            chunks.append("\n\n".join(cur))
+            cur, size = [], 0
+        cur.append(para)
+        size += len(para) + 2
+    if cur:
+        chunks.append("\n\n".join(cur))
+    return chunks
+
+
+def _parse_claims(content: str) -> list[dict]:
+    try:
+        obj = json.loads(content)
+    except Exception:
+        i, j = content.find("{"), content.rfind("}")
+        if i < 0 or j < 0:
+            return []
+        try:
+            obj = json.loads(content[i:j + 1])
+        except Exception:
+            return []
+    claims = obj.get("claims", []) if isinstance(obj, dict) else []
+    return [c for c in claims if isinstance(c, dict) and c.get("proposition")]
+
+
+def extract_claims_from_text(backend, text: str, *, source_name: str, source_cluster: str | None,
+                             date: str | None, kind: str) -> list[dict]:
+    """`backend` is any object with .complete_json(messages, max_tokens) -> str
+    (see extract.backends: LocalQwenBackend | GeminiBackend)."""
+    messages = build_messages(text, source_name=source_name, source_cluster=source_cluster,
+                              date=date, kind=kind)
+    content = backend.complete_json(messages, max_tokens=4000)
+    return _parse_claims(content)
+
+
+def _enum(c: dict, field: str, default: str) -> str:
+    v = c.get(field)
+    return v if v in _ENUMS[field] else default
+
+
+def persist_claims(conn: sqlite3.Connection, *, doc: sqlite3.Row, source: sqlite3.Row | None,
+                   claims: list[dict], chunk_idx: int) -> int:
+    n = 0
+    cluster = source["source_cluster"] if source else None
+    for i, c in enumerate(claims):
+        seam = _enum(c, "thesis_seam", "none")
+        topic = c.get("topic_canonical") or None
+        if topic:
+            # register emergent topics BEFORE the claim (claims.topic_canonical is a FK → topics)
+            conn.execute(
+                "INSERT OR IGNORE INTO topics (topic_canonical, status, seam) VALUES (?, 'emergent', ?)",
+                (topic, seam),
+            )
+        claim_id = f"{doc['doc_id']}:{chunk_idx}:{i}"
+        conn.execute(
+            """INSERT OR IGNORE INTO claims
+                 (claim_id, doc_id, source_id, proposition, topic_canonical, topic_raw, claimant,
+                  source_cluster, date, claim_type, time_horizon, confidence, rel_polarity,
+                  engages_consensus, counters_position, thesis_seam, salience)
+               VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
+            (
+                claim_id, doc["doc_id"], doc["source_id"], str(c["proposition"])[:1000],
+                topic, c.get("topic_raw"),
+                c.get("claimant") or (source["name"] if source else None),
+                cluster, doc["date"],
+                _enum(c, "claim_type", "descriptive"), _enum(c, "time_horizon", "unspecified"),
+                _enum(c, "confidence", "med"), "none",
+                1 if c.get("engages_consensus") else 0, c.get("counters_position"),
+                seam, _enum(c, "salience", "secondary"),
+            ),
+        )
+        n += 1
+    conn.commit()
+    return n
@@ -0,0 +1,47 @@
+"""SEC filing HTML → plain text. Stdlib only (boring, inspectable).
+
+Drops script/style/head and inline-XBRL hidden blocks (10-Ks embed a huge <ix:hidden> section of
+numeric facts that would otherwise swamp the extractor), and collapses whitespace.
+"""
+from __future__ import annotations
+
+import re
+from html.parser import HTMLParser
+
+_SKIP_TAGS = {"script", "style", "head"}
+_SKIP_PREFIXES = ("ix:hidden",)          # inline-XBRL hidden fact dump
+_BLOCK_TAGS = {"p", "div", "br", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6", "table"}
+
+
+class _Stripper(HTMLParser):
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+        self._skip_depth = 0
+        self._parts: list[str] = []
+
+    def handle_starttag(self, tag: str, attrs) -> None:
+        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
+            self._skip_depth += 1
+        elif tag in _BLOCK_TAGS:
+            self._parts.append("\n")
+
+    def handle_endtag(self, tag: str) -> None:
+        if tag in _SKIP_TAGS or tag.startswith(_SKIP_PREFIXES):
+            self._skip_depth = max(0, self._skip_depth - 1)
+        elif tag in _BLOCK_TAGS:
+            self._parts.append("\n")
+
+    def handle_data(self, data: str) -> None:
+        if self._skip_depth == 0 and data.strip():
+            self._parts.append(data)
+
+
+def html_to_text(html: str, *, max_chars: int = 300_000) -> str:
+    p = _Stripper()
+    p.feed(html)
+    text = "".join(p._parts)
+    text = re.sub(r"[ \t ]+", " ", text)
+    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
+    text = "\n".join(line.strip() for line in text.splitlines())
+    text = text.strip()
+    return text[:max_chars]
@@ -0,0 +1,72 @@
+"""The §4.2 claim-extraction prompt. Prompt engineering is ours (§13.3); the schema is finalized.
+
+Discipline encoded here (the whole point of the system, §2/§4.2):
+  - Extract at the level of the PROPOSITION; emit ZERO when there is no substantive claim.
+  - Separate topic from stance: capture stance-vs-consensus explicitly, never as a bull/bear label.
+  - thesis_seam is a TAG, not a filter — off-thesis and anti-thesis claims are still extracted.
+"""
+from __future__ import annotations
+
+# Hybrid topic vocabulary (§4.2): a small SEEDED controlled list. The model reuses one when it
+# fits and proposes a concise snake_case topic otherwise; emergent topics are merged on a schedule.
+# These names are interpolated into every extraction prompt (see build_messages), grouped
+# below by thesis seam.
+SEED_TOPICS = [
+    # energy <-> compute
+    "ai_compute_demand", "ai_power_constraint", "datacenter_buildout", "grid_interconnect",
+    "transformers_equipment", "nuclear_power", "natural_gas_power", "uranium_supply",
+    "cooling_infrastructure", "miner_flexible_load", "mining_ai_pivot",
+    # debasement <-> bitcoin
+    "bitcoin_reserve_asset", "bitcoin_collateral_credit", "bitcoin_treasury_strategy",
+    "btc_custody_regulation", "sovereign_bitcoin_adoption",
+    # ai <-> data ownership
+    "ai_data_ownership", "confidential_inference", "ai_commoditization",
+    # macro
+    "fed_policy", "fiscal_debasement", "stablecoins_cbdc",
+]
+
+# System prompt sent verbatim with every extraction request. The trailing backslashes are
+# line continuations inside the string literal: they join wrapped source lines into one
+# prompt line. The field list and enum values mirror what claims.py validates and persists.
+_SYSTEM = """You are the claim-extraction component of an investment signal engine. You read a passage \
+(an SEC filing excerpt or a podcast/earnings-call transcript) and extract structured CLAIM UNITS.
+
+A CLAIM UNIT is a single normalized proposition that someone asserts — a forward-looking prediction, \
+an interpretive or causal judgment, or a stance taken against a prevailing view. It must be specific \
+enough to later be checked against the world.
+
+CRITICAL DISCIPLINE — be willing to extract NOTHING:
+- Most passages contain ZERO claim units. Boilerplate, legal disclaimers, ad reads, pleasantries, \
+generic descriptions, routine financial line-items, and recitations of well-known news are NOT claims.
+- Do NOT invent claims. Do NOT emit one claim per paragraph to seem thorough. If the passage has no \
+substantive proposition, return {"claims": []}. A precise empty answer is the correct, valued output.
+- Extract at the level of the PROPOSITION: one normalized subject-assertion-object sentence each. A \
+single rich passage may yield several; a long dull one yields none.
+
+For EACH claim unit, output these fields:
+- "proposition": one normalized sentence (subject-assertion-object), self-contained.
+- "topic_canonical": a concise snake_case topic for clustering. REUSE one of the provided seed topics \
+when it fits; otherwise propose a new concise snake_case label. Normalize synonyms (Fed/FOMC/rates → fed_policy).
+- "topic_raw": the topic as actually phrased in the passage.
+- "claimant": who asserts it (speaker name or the filing company). Use "unknown" if unclear.
+- "claim_type": one of interpretive | predictive | descriptive | reactive. (interpretive/predictive = \
+insight; descriptive/reactive = news echo — extract those only if clearly salient.)
+- "time_horizon": one of near | medium | long | unspecified (for predictive claims especially).
+- "confidence": the claimant's apparent conviction — one of low | med | high.
+- "engages_consensus": true ONLY if the claim explicitly argues against a stated mainstream view.
+- "counters_position": the mainstream position it argues against, or null.
+- "thesis_seam": one of energy_compute | debasement_bitcoin | ai_data_ownership | none. This is a TAG \
+for relevance only — tag off-thesis claims "none" and STILL extract them.
+- "salience": central | secondary | aside (how central the claim is to the passage).
+
+Return ONLY a JSON object: {"claims": [ {...}, ... ]}. No prose, no markdown."""
+
+
+def build_messages(text: str, *, source_name: str, source_cluster: str | None,
+                   date: str | None, kind: str) -> list[dict[str, str]]:
+    seed = ", ".join(SEED_TOPICS)
+    context = (
+        f"Source: {source_name or 'unknown'} (cluster: {source_cluster or 'n/a'}, type: {kind}, "
+        f"date: {date or 'n/a'}).\n"
+        f"Seed topics to reuse when they fit: {seed}.\n\n"
+        f"PASSAGE:\n{text}"
+    )
+    return [
+        {"role": "system", "content": _SYSTEM},
+        {"role": "user", "content": context},
+    ]
@@ -0,0 +1,69 @@
+"""Extraction worker — drains 'extract' jobs from the backfill queue (§4.2, §13.4).
+
+Single sequential worker by design: extraction is the heavier serial load on the one LLM GPU.
+For each job: load the document, get its text (fetch+strip filing HTML, or read a stored transcript),
+chunk it, run the §4.2 extractor per chunk, persist 0..N claims, complete the job.
+"""
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+import requests
+
+from ..backfill import queue
+from . import claims as claims_mod
+from .html_text import html_to_text
+
+log = logging.getLogger(__name__)
+
+
+def _document_text(doc, *, user_agent: str) -> str:
+    if doc["transcript_path"]:
+        return Path(doc["transcript_path"]).read_text()
+    if doc["kind"] == "filing" and doc["url"]:
+        r = requests.get(doc["url"], headers={"User-Agent": user_agent}, timeout=90)
+        r.raise_for_status()
+        return html_to_text(r.text)
+    raise ValueError(f"no text source for {doc['doc_id']} (kind={doc['kind']}, url={doc['url']})")
+
+
+def run_extract(conn, sc, cfg, *, limit: int = 10, max_chunks_per_doc: int = 4,
+                chunk_chars: int = 18_000, lease_seconds: int = 900,
+                worker_id: str = "extract-1") -> dict:
+    """Lease and process up to ``limit`` 'extract' jobs, one at a time.
+
+    Per job: load the document and its source row, obtain the text, split it into chunks of at
+    most ``chunk_chars`` characters, run the extractor on the first ``max_chunks_per_doc``
+    chunks (later chunks are not processed), persist the claims, stamp
+    ``documents.processed_at`` and complete the job. A job whose document row is missing is
+    skipped; any exception during processing is passed to ``queue.fail`` and logged.
+
+    Returns:
+        ``{"jobs_processed": <jobs leased, including skipped/failed>, "claims_written": <total>}``.
+    """
+    from .backends import from_config as backend_from_config
+    backend = backend_from_config(cfg, sc)
+    log.info("extraction backend: %s", backend.name)
+    claims_mod.register_seed_topics(conn)
+    processed = total_claims = 0
+    while processed < limit:
+        job = queue.lease_next(conn, worker_id=worker_id, job_types=["extract"], lease_seconds=lease_seconds)
+        if job is None:
+            # nothing leasable right now — stop early
+            break
+        # counted as processed as soon as it is leased, whatever the outcome
+        processed += 1
+        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
+        if doc is None:
+            queue.skip(conn, job["job_id"], "document missing")
+            continue
+        src = conn.execute("SELECT * FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
+        try:
+            text = _document_text(doc, user_agent=cfg.edgar_user_agent)
+            chunks = claims_mod.chunk_text(text, chunk_chars)[:max_chunks_per_doc]
+            doc_claims = 0
+            for idx, chunk in enumerate(chunks):
+                cl = claims_mod.extract_claims_from_text(
+                    backend, chunk,
+                    source_name=src["name"] if src else "",
+                    source_cluster=src["source_cluster"] if src else None,
+                    date=doc["date"], kind=doc["kind"],
+                )
+                # persist_claims commits per chunk, so claims from earlier chunks survive a
+                # failure on a later one
+                doc_claims += claims_mod.persist_claims(conn, doc=doc, source=src, claims=cl, chunk_idx=idx)
+            conn.execute("UPDATE documents SET processed_at=datetime('now') WHERE doc_id=?", (doc["doc_id"],))
+            conn.commit()
+            queue.complete(conn, job["job_id"], output_ref=f"{doc_claims} claims / {len(chunks)} chunks")
+            total_claims += doc_claims
+            log.info("extracted %d claims from %s (%d chunks)", doc_claims, doc["doc_id"], len(chunks))
+        except Exception as e:  # noqa: BLE001
+            # deliberate catch-all: one bad document must not stop the drain
+            state = queue.fail(conn, job["job_id"], e)
+            log.warning("extract failed for %s: %s (→ %s)", job["target_id"], e, state)
+    return {"jobs_processed": processed, "claims_written": total_claims}
@@ -0,0 +1,5 @@
+"""Ingestion layer (§4.1) — the biggest greenfield piece.
+
+Spark Control transcribes audio you hand it; it does NOT fetch. Everything here is fetch/schedule:
+RSS + YouTube + EDGAR + FMP earnings, long-audio chunking, and cross-chunk speaker stitching.
+"""
@@ -0,0 +1,36 @@
+"""Long-audio chunking (§4.1, §13.4).
+
+Podcasts run 1–3 h; the diarizer caps at 4 speakers/chunk and Spark 2 is a single GPU, so we cut
+long audio into ~2–3 min pieces sent SEQUENTIALLY (parallel audio → 503 FFT race). Each chunk is
+diarized independently and re-stitched across chunks by voiceprint (see speaker_stitch.py).
+Requires ffmpeg/ffprobe.
+"""
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+
+CHUNK_SECONDS_DEFAULT = 150  # 2.5 min, within the ~2–3 min guidance
+
+
+def duration_seconds(src: str | Path) -> float:
+    out = subprocess.run(
+        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
+         "-of", "default=noprint_wrappers=1:nokey=1", str(src)],
+        check=True, capture_output=True, text=True,
+    )
+    return float(out.stdout.strip())
+
+
+def chunk_audio(src: str | Path, out_dir: str | Path, *, chunk_seconds: int = CHUNK_SECONDS_DEFAULT) -> list[Path]:
+    """Split into fixed-length WAV chunks using ffmpeg's segment muxer (no re-encode of timing).
+    Returns chunk paths in order. Order matters: the queue sends them sequentially."""
+    out_dir = Path(out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    pattern = str(out_dir / "chunk_%04d.wav")
+    subprocess.run(
+        ["ffmpeg", "-y", "-i", str(src), "-f", "segment", "-segment_time", str(chunk_seconds),
+         "-ar", "16000", "-ac", "1", "-reset_timestamps", "1", pattern],
+        check=True, capture_output=True,
+    )
+    return sorted(out_dir.glob("chunk_*.wav"))
@@ -0,0 +1,159 @@
+"""Text-document fetcher for the Battery (bitcoin-collateralized lending) corpus and any non-filing,
+non-audio source: policy primaries (SEC SABs, OCC/FDIC/Fed), lender/issuer blogs, credit-market data.
+
+Unlike EDGAR (CIK-driven) and the podcast path (audio→transcribe), these are dated HTML pages, PDFs, or
+article RSS feeds. We fetch ONCE, extract clean text (HTML via html_to_text, PDF via pypdf), save it, and
+point documents.transcript_path at the saved text so the extract worker reads it directly (it already
+supports transcript_path) — this also lets PDFs work, which the worker's on-demand html_to_text fetch can't.
+
+A source row must exist first (FK). Lineage/axis live on the source's cluster/notes (set in the seed);
+policy sources are axis=context and must NOT feed the supply resolver (weight 0) — enforced downstream.
+"""
+from __future__ import annotations
+
+import hashlib
+import io
+import logging
+import sqlite3
+from pathlib import Path
+
+import requests
+
+from ..backfill import queue
+from ..extract.html_text import html_to_text
+from .feeds import fetch_feed
+
+log = logging.getLogger(__name__)
+
+DEFAULT_UA = "ten31-signal-engine/1.0 (research; contact ops@ten31.xyz)"
+
+
+def _pdf_to_text(data: bytes, *, max_chars: int) -> str:
+    import pypdf
+    reader = pypdf.PdfReader(io.BytesIO(data))
+    parts: list[str] = []
+    total = 0
+    for page in reader.pages:
+        t = page.extract_text() or ""
+        parts.append(t)
+        total += len(t)
+        if total > max_chars:
+            break
+    return "\n".join(parts)[:max_chars]
+
+
+def fetch_clean_text(url: str, *, method: str = "auto", ua: str = DEFAULT_UA,
+                     timeout: int = 90, max_chars: int = 300_000) -> str:
+    """Fetch a URL once and return clean text. Auto-detects PDF vs HTML by content-type + magic bytes."""
+    r = requests.get(url, headers={"User-Agent": ua}, timeout=timeout)
+    r.raise_for_status()
+    ctype = r.headers.get("Content-Type", "").lower()
+    is_pdf = method == "pdf" or "application/pdf" in ctype or r.content[:5] == b"%PDF-"
+    if is_pdf:
+        return _pdf_to_text(r.content, max_chars=max_chars)
+    return html_to_text(r.text, max_chars=max_chars)
+
+
+_BLOCK_MARKERS = (
+    "aggressive automated scraping", "request access", "access denied", "are you a robot",
+    "enable javascript", "captcha", "verify you are human", "rate limit exceeded",
+    "403 forbidden", "unusual traffic", "checking your browser",
+)
+
+
+def _looks_blocked(text: str) -> bool:
+    """Anti-scraping interstitials return 200 + a short access-denied body. Detect so we don't ingest
+    a block page as if it were the document (a real policy/blog doc is long and has no such markers)."""
+    low = text[:2500].lower()
+    return any(m in low for m in _BLOCK_MARKERS)
+
+
+def _doc_id(source_id: str, url: str) -> str:
+    return f"doc:{source_id}:{hashlib.sha256(url.encode()).hexdigest()[:12]}"
+
+
+def ingest_one(conn: sqlite3.Connection, cfg, *, source_id: str, url: str, title: str,
+               date: str | None, method: str = "auto", prompt_version: str = "extract-v0",
+               min_chars: int = 400) -> str | None:
+    """Fetch+store one text document and enqueue extraction. Idempotent on (source_id, url).
+    Returns doc_id if newly ingested, else None (duplicate, too-short, or fetch error → logged).
+
+    Steps: derive the doc id from (source_id, url) and return early if that document exists;
+    fetch and clean the text; reject bodies shorter than ``min_chars`` or that look like an
+    anti-scraping page; save the text under ``<cfg.data_dir>/docs/``; insert the ``documents``
+    row pointing ``transcript_path`` at the saved file; enqueue an 'extract' job (priority 50)
+    keyed on the doc id and ``prompt_version``.
+    """
+    doc_id = _doc_id(source_id, url)
+    if conn.execute("SELECT 1 FROM documents WHERE doc_id=?", (doc_id,)).fetchone():
+        return None
+    # NOTE(review): the Config dataclass in config.py defines `edgar_user_agent` but no
+    # `user_agent`, so with that config this appears to always fall back to DEFAULT_UA — confirm.
+    ua = getattr(cfg, "user_agent", None) or DEFAULT_UA
+    try:
+        text = fetch_clean_text(url, method=method, ua=ua)
+    except Exception as e:  # noqa: BLE001
+        # best-effort: a failed fetch is logged and reported as "not ingested"
+        log.warning("doc fetch failed %s: %s", url, e)
+        return None
+    if not text or len(text) < min_chars:
+        log.warning("doc too short (%d chars), skipping %s", len(text or ""), url)
+        return None
+    if _looks_blocked(text):
+        log.warning("blocked/anti-scrape page detected, skipping %s", url)
+        return None
+    # ':' in the doc id is replaced so the id can be used as a file name
+    safe = doc_id.replace(":", "_")
+    tpath = Path(cfg.data_dir) / "docs" / f"{safe}.txt"
+    tpath.parent.mkdir(parents=True, exist_ok=True)
+    tpath.write_text(text)
+    content_hash = hashlib.sha256(text.encode()).hexdigest()
+    # Stored with kind='filing' and the URL doubling as external_id; title is capped at 300
+    # characters and falls back to the URL.
+    # NOTE(review): processed_at is set at ingest time here, before extraction has run — confirm
+    # that is the intended meaning of the column.
+    conn.execute(
+        """INSERT OR IGNORE INTO documents
+             (doc_id, source_id, kind, external_id, url, title, date, transcript_path, content_hash, processed_at)
+           VALUES (?,?,?,?,?,?,?,?,?,datetime('now'))""",
+        (doc_id, source_id, "filing", url, url, title[:300] if title else url, date, str(tpath), content_hash),
+    )
+    conn.commit()
+    # The job hash includes prompt_version, so a new prompt version yields a distinct input_hash.
+    h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
+    queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
+                  parent_doc_id=doc_id, priority=50)
+    conn.commit()
+    log.info("ingested doc %s (%d chars) for %s", doc_id, len(text), source_id)
+    return doc_id
+
+
+def ingest_manifest(conn: sqlite3.Connection, cfg, path) -> dict:
+    """Batch-ingest the docs listed in a YAML manifest ({docs:[{source,url,title,date,method}]}).
+    Returns {ingested, skipped, missing_source}. Each source must already exist (FK)."""
+    import yaml
+    from pathlib import Path as _Path
+    data = yaml.safe_load(_Path(path).read_text()) or {}
+    docs = data.get("docs", [])
+    ingested = skipped = missing = 0
+    for d in docs:
+        src = d.get("source")
+        if not conn.execute("SELECT 1 FROM sources WHERE source_id=?", (src,)).fetchone():
+            log.warning("manifest doc references missing source %r — skipping %s", src, d.get("url"))
+            missing += 1
+            continue
+        doc_id = ingest_one(conn, cfg, source_id=src, url=d["url"], title=d.get("title", d["url"]),
+                            date=d.get("date"), method=d.get("method", "auto"))
+        if doc_id:
+            ingested += 1
+        else:
+            skipped += 1
+    return {"ingested": ingested, "skipped": skipped, "missing_source": missing}
+
+
+def ingest_feed_text(conn: sqlite3.Connection, cfg, *, source_id: str, rss_url: str,
+                     since: str | None = None, until: str | None = None, limit: int = 50) -> int:
+    """Ingest the ARTICLE bodies behind a text RSS feed (blog/press feed). Each item's link is fetched
+    and stored as a dated text document. Returns count of newly-ingested docs."""
+    from .feeds import _published_iso
+    parsed = fetch_feed(rss_url, user_agent=getattr(cfg, "user_agent", None) or DEFAULT_UA)
+    n = 0
+    for entry in parsed.entries:
+        if n >= limit:
+            break
+        link = entry.get("link")
+        if not link:
+            continue
+        date = _published_iso(entry)
+        if since and date and date < since:
+            continue
+        if until and date and date > until:
+            continue
+        if ingest_one(conn, cfg, source_id=source_id, url=link,
+                      title=entry.get("title", link), date=date):
+            n += 1
+    return n
@@ -0,0 +1,61 @@
+"""Audio acquisition (§4.1). Spark Control transcribes audio you fetch — this fetches it.
+
+- Podcast enclosures: a plain streaming download that follows the Podtrac/Megaphone redirects to the
+  final signed CDN object (download immediately; resolved URLs carry short-lived params).
+- YouTube: yt-dlp (audio-only → 16 kHz mono WAV). NOTE: 2026 YouTube enforces PO Tokens broadly — run
+  the `bgutil-ytdlp-pot-provider` sidecar or pulls will 403. yt-dlp is treated as a LAST resort; prefer
+  the RSS enclosure where a show publishes both (ToS: downloading YT audio violates YouTube ToS).
+"""
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+
+import requests
+
+DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
+
+
+def download_enclosure(url: str, dest: str | Path, *, user_agent: str = DEFAULT_UA, timeout: int = 120) -> Path:
+    dest = Path(dest)
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    with requests.get(url, stream=True, allow_redirects=True,
+                      headers={"User-Agent": user_agent}, timeout=timeout) as r:
+        r.raise_for_status()
+        with open(dest, "wb") as f:
+            for chunk in r.iter_content(chunk_size=1 << 16):
+                f.write(chunk)
+    return dest
+
+
+def to_wav_16k_mono(src: str | Path, dst: str | Path) -> Path:
+    """Normalize any audio to 16 kHz mono PCM WAV (what the ASR endpoint wants). Requires ffmpeg."""
+    dst = Path(dst)
+    dst.parent.mkdir(parents=True, exist_ok=True)
+    subprocess.run(
+        ["ffmpeg", "-y", "-i", str(src), "-ar", "16000", "-ac", "1", "-f", "wav", str(dst)],
+        check=True, capture_output=True,
+    )
+    return dst
+
+
+def download_youtube_audio(url: str, out_dir: str | Path, *, archive_file: str | Path | None = None) -> Path:
+    """Audio-only via yt-dlp → 16 kHz mono WAV. `archive_file` (yt-dlp --download-archive) is the
+    canonical 'only-new' dedup for channel/playlist back-catalog pulls."""
+    out_dir = Path(out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    cmd = [
+        "yt-dlp", "-f", "bestaudio/best", "-x", "--audio-format", "wav",
+        "--postprocessor-args", "ffmpeg:-ar 16000 -ac 1",
+        "-o", str(out_dir / "%(id)s.%(ext)s"),
+        "--no-progress",
+    ]
+    if archive_file:
+        cmd += ["--download-archive", str(archive_file)]
+    cmd.append(url)
+    subprocess.run(cmd, check=True, capture_output=True)
+    # yt-dlp names the file by video id; return the newest wav
+    wavs = sorted(out_dir.glob("*.wav"), key=lambda p: p.stat().st_mtime)
+    if not wavs:
+        raise RuntimeError("yt-dlp produced no wav (PO-token/cookies issue? see module docstring)")
+    return wavs[-1]
@@ -0,0 +1,127 @@
+"""Earnings-call transcripts via Financial Modeling Prep (§4.1, §12 — decision: FMP).
+
+Audio isn't reliably fetchable for large-caps (no uniform feed; ~30–90d replay expiry breaks
+backfill), so FMP's transcript API is the backbone and EDGAR filings remain the durable core. FMP
+also exposes an earnings *calendar* to trigger ingestion on the day a call drops.
+
+Endpoint paths/params were confirmed against the FMP 'stable' docs on 2026-06-07 (see the note in
+FMPClient); re-confirm them for the account tier at integration. Needs config.fmp_api_key.
+"""
+from __future__ import annotations
+
+import hashlib
+import sqlite3
+from pathlib import Path
+from typing import Any
+
+import requests
+
+FMP_BASE = "https://financialmodelingprep.com/stable"
+
+
+class FMPClient:
+    def __init__(self, api_key: str, *, base: str = FMP_BASE, timeout: int = 30) -> None:
+        if not api_key:
+            raise ValueError("FMP_API_KEY is required for earnings-call transcripts")
+        self.api_key = api_key
+        self.base = base
+        self.timeout = timeout
+        self.s = requests.Session()
+
+    def _get(self, path: str, **params: Any) -> Any:
+        params["apikey"] = self.api_key
+        r = self.s.get(f"{self.base}/{path}", params=params, timeout=self.timeout)
+        r.raise_for_status()
+        return r.json()
+
+    # Confirmed against FMP 'stable' 2026-06-07 (v3 is legacy/403). Note singular "earning".
+    def transcript_dates(self, symbol: str) -> Any:
+        """List available transcripts: [{quarter, fiscalYear, date}, ...]."""
+        return self._get("earning-call-transcript-dates", symbol=symbol)
+
+    def transcript(self, symbol: str, *, year: int, quarter: int) -> Any:
+        """One transcript: [{symbol, period, year, date, content}]. Use the `date` field as the
+        document date — FMP's year/quarter labels are fiscal and can be offset from the call date."""
+        return self._get("earning-call-transcript", symbol=symbol, year=year, quarter=quarter)
+
+    def earnings_calendar(self, *, from_date: str, to_date: str) -> Any:
+        """Earnings calendar (ingestion trigger): [{symbol, date, epsActual, ...}, ...]."""
+        return self._get("earnings-calendar", **{"from": from_date, "to": to_date})
+
+
+def ingest_transcript(
+    conn: sqlite3.Connection,
+    *,
+    source_id: str,
+    symbol: str,
+    year: int,
+    quarter: int,
+    content: str,
+    date: str | None,
+    data_dir: Path,
+    prompt_version: str = "extract-v0",
+) -> tuple[bool, bool]:
+    """Store one transcript (content written to disk → transcript_path) and enqueue an 'extract'
+    job. Idempotent. Returns (new_document, new_job)."""
+    from ..backfill import queue
+
+    external_id = f"{symbol}-{year}Q{quarter}"
+    doc_id = f"earnings:{external_id}"
+    tdir = Path(data_dir) / "transcripts"
+    tdir.mkdir(parents=True, exist_ok=True)
+    tpath = tdir / f"{external_id}.txt"
+    tpath.write_text(content)
+    content_hash = hashlib.sha256(content.encode()).hexdigest()
+    cur = conn.execute(
+        """INSERT OR IGNORE INTO documents
+             (doc_id, source_id, kind, external_id, title, date, transcript_path, content_hash, processed_at)
+           VALUES (?,?,?,?,?,?,?,?, datetime('now'))""",
+        (doc_id, source_id, "earnings_call", external_id, f"{symbol} {year} Q{quarter} call",
+         date, str(tpath), content_hash),
+    )
+    conn.commit()
+    if not cur.rowcount:
+        return (False, False)
+    # earnings-call Q&A is the highest-yield text source (§4.1) → priority 40, ahead of filings (50).
+    h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
+    new_job = queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
+                            parent_doc_id=doc_id, priority=40) is not None
+    return (True, new_job)
+
+
+def ingest_for_ticker(
+    conn: sqlite3.Connection,
+    fmp: FMPClient,
+    *,
+    source_id: str,
+    symbol: str,
+    data_dir: Path,
+    since: str | None = None,
+    until: str | None = None,
+    limit: int = 8,
+) -> tuple[int, int]:
+    """Enumerate available transcripts via the dates index, fetch those in [since, until], and
+    ingest. Uses each transcript's own `date` (FMP fiscal labels are offset). Returns (docs, jobs)."""
+    dates = fmp.transcript_dates(symbol)
+    picked = []
+    for d in dates if isinstance(dates, list) else []:
+        dt = d.get("date")
+        if since and dt and dt < since:
+            continue
+        if until and dt and dt > until:
+            continue
+        picked.append(d)
+    n_docs = n_jobs = 0
+    for d in picked[:limit]:
+        tr = fmp.transcript(symbol, year=d["fiscalYear"], quarter=d["quarter"])
+        item = (tr[0] if isinstance(tr, list) and tr else tr) or {}
+        content = item.get("content") or ""
+        if not content:
+            continue
+        nd, nj = ingest_transcript(
+            conn, source_id=source_id, symbol=symbol, year=d["fiscalYear"], quarter=d["quarter"],
+            content=content, date=item.get("date") or d.get("date"), data_dir=data_dir,
+        )
+        n_docs += int(nd)
+        n_jobs += int(nj)
+    return n_docs, n_jobs
@@ -0,0 +1,148 @@
+"""SEC EDGAR ingestion (§4.1).
+
+Hits the official data.sec.gov / www.sec.gov APIs directly (free, keyless, full history).
+Two hard requirements:
+  - a descriptive User-Agent (SEC 403s requests without one) — from config.edgar_user_agent.
+  - ≤10 requests/sec aggregate — enforced by a min-interval throttle here.
+
+Supports an explicit date range AND historical shards (filings.files[]), so the §7.1 backtest can
+reach 2022–2023 filings, not just the most-recent ~1000.
+"""
+from __future__ import annotations
+
+import hashlib
+import sqlite3
+import time
+from typing import Iterator
+
+import requests
+
+# Column names of the parallel arrays read from an EDGAR submissions filings block.
+# EdgarClient._iter_array zips them row-wise and unpacks in exactly this order — keep both in sync.
+_FILING_COLS = ("accessionNumber", "form", "filingDate", "primaryDocument", "primaryDocDescription")
+
+
+class EdgarClient:
+    BASE_DATA = "https://data.sec.gov"
+    BASE_WWW = "https://www.sec.gov"
+
+    def __init__(self, user_agent: str, *, min_interval: float = 0.12) -> None:
+        if not user_agent or "@" not in user_agent:
+            raise ValueError("EDGAR requires a descriptive User-Agent with contact email (config.edgar_user_agent)")
+        self.s = requests.Session()
+        self.s.headers.update({"User-Agent": user_agent, "Accept-Encoding": "gzip, deflate"})
+        self.min_interval = min_interval
+        self._last = 0.0
+        self._tickers: dict[str, int] | None = None
+
+    def _throttle(self) -> None:
+        dt = time.monotonic() - self._last
+        if dt < self.min_interval:
+            time.sleep(self.min_interval - dt)
+        self._last = time.monotonic()
+
+    def _get(self, url: str) -> requests.Response:
+        self._throttle()
+        r = self.s.get(url, timeout=30)
+        r.raise_for_status()
+        return r
+
+    # ---- ticker → CIK ----
+    def ticker_map(self) -> dict[str, int]:
+        if self._tickers is None:
+            data = self._get(f"{self.BASE_WWW}/files/company_tickers.json").json()
+            self._tickers = {row["ticker"].upper(): int(row["cik_str"]) for row in data.values()}
+        return self._tickers
+
+    def cik_for(self, ticker: str) -> int | None:
+        return self.ticker_map().get(ticker.upper())
+
+    # ---- filings ----
+    def _iter_array(self, block: dict, forms, since, until) -> Iterator[dict]:
+        arrays = [block.get(c, []) for c in _FILING_COLS]
+        for acc, form, fdate, pdoc, pdesc in zip(*arrays):
+            if forms and form not in forms:
+                continue
+            if since and fdate < since:
+                continue
+            if until and fdate > until:
+                continue
+            yield {"accession": acc, "form": form, "filing_date": fdate,
+                   "primary_document": pdoc, "description": pdesc}
+
+    def iter_filings(
+        self,
+        cik: int,
+        *,
+        forms: tuple[str, ...] = ("10-K", "10-Q", "8-K"),
+        since: str | None = None,
+        until: str | None = None,
+    ) -> Iterator[dict]:
+        """Yield filing descriptors. Pulls the inline 'recent' block AND any historical shards whose
+        date window overlaps [since, until] — required to reach the backtest era for active filers."""
+        sub = self._get(f"{self.BASE_DATA}/submissions/CIK{cik:010d}.json").json()
+        recent = sub.get("filings", {}).get("recent", {})
+        for f in self._iter_array(recent, forms, since, until):
+            yield self._with_url(cik, f)
+        for shard in sub.get("filings", {}).get("files", []):
+            # shard has filingFrom / filingTo; skip shards entirely outside the window.
+            if until and shard.get("filingFrom", "") > until:
+                continue
+            if since and shard.get("filingTo", "9999") < since:
+                continue
+            block = self._get(f"{self.BASE_DATA}/submissions/{shard['name']}").json()
+            for f in self._iter_array(block, forms, since, until):
+                yield self._with_url(cik, f)
+
+    def _with_url(self, cik: int, f: dict) -> dict:
+        acc_nodash = f["accession"].replace("-", "")
+        f["cik"] = cik
+        f["url"] = f"{self.BASE_WWW}/Archives/edgar/data/{cik}/{acc_nodash}/{f['primary_document']}"
+        return f
+
+    def fetch_html(self, filing: dict) -> str:
+        return self._get(filing["url"]).text
+
+
+# Domestic annual/quarterly + foreign-private-issuer equivalents. 20-F (foreign annual, e.g. TSM/IREN),
+# 40-F (Canadian annual, e.g. CCJ). 8-K/6-K (current reports) excluded by default — low claim yield.
+HIGH_YIELD_FORMS = ("10-K", "10-Q", "20-F", "40-F")
+
+
+def ingest_filings(
+    conn: sqlite3.Connection,
+    client: EdgarClient,
+    *,
+    source_id: str,
+    ticker: str,
+    since: str | None = None,
+    until: str | None = None,
+    forms: tuple[str, ...] = HIGH_YIELD_FORMS,
+    prompt_version: str = "extract-v0",
+) -> tuple[int, int]:
+    """Insert filing documents and enqueue 'extract' jobs. Filings are text → no transcription;
+    they go straight to extraction (the extract worker fetches + strips the HTML later). Default
+    forms cover both domestic (10-K/10-Q) and foreign-private-issuer (20-F/40-F) filers.
+    Returns (new_documents, new_jobs). Idempotent on (source_id, accession)."""
+    from ..backfill import queue
+
+    cik = client.cik_for(ticker)
+    if cik is None:
+        raise ValueError(f"No CIK found for ticker {ticker!r}")
+    n_docs = n_jobs = 0
+    for f in client.iter_filings(cik, forms=forms, since=since, until=until):
+        doc_id = f"edgar:{f['accession']}"
+        cur = conn.execute(
+            """INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date)
+               VALUES (?,?,?,?,?,?,?)""",
+            (doc_id, source_id, "filing", f["accession"], f["url"],
+             f"{ticker} {f['form']} {f['filing_date']}", f["filing_date"]),
+        )
+        conn.commit()
+        if not cur.rowcount:
+            continue
+        n_docs += 1
+        h = hashlib.sha256(f"{doc_id}|{prompt_version}".encode()).hexdigest()
+        # priority 50: filings are high-info-density (§4.1) → ahead of podcasts (100)
+        if queue.enqueue(conn, job_type="extract", target_id=doc_id, input_hash=h,
+                         parent_doc_id=doc_id, priority=50) is not None:
+            n_jobs += 1
+    return n_docs, n_jobs
@@ -0,0 +1,65 @@
+"""Podcast RSS ingestion (§4.1).
+
+feedparser + conditional GET (ETag/Last-Modified) for efficient incremental polling, with a
+composite (feed_url, guid) dedup discipline. Many podcast CDNs send no validators and some feeds
+truncate to recent episodes — for the §7.1 backtest, older episodes may need the show's full
+archive feed (some hosts expose `?limit=` / a separate archive URL) or a YouTube back-catalog.
+"""
+from __future__ import annotations
+
+import hashlib
+import time
+from typing import Any
+
+import feedparser
+
+DEFAULT_UA = "Ten31SignalEngine/0.1 (+https://ten31.xyz)"
+
+
+def fetch_feed(url: str, *, etag: str | None = None, modified: str | None = None,
+               user_agent: str = DEFAULT_UA) -> feedparser.FeedParserDict:
+    """Conditional GET. On HTTP 304 the result has .status == 304 and .entries == [] → skip."""
+    return feedparser.parse(url, etag=etag, modified=modified, agent=user_agent)
+
+
+def _published_iso(entry: Any) -> str | None:
+    t = entry.get("published_parsed") or entry.get("updated_parsed")
+    if not t:
+        return None
+    return time.strftime("%Y-%m-%d", t)
+
+
+def _enclosure_audio_url(entry: Any) -> str | None:
+    for enc in entry.get("enclosures", []) or []:
+        if str(enc.get("type", "")).startswith("audio"):
+            return enc.get("href") or enc.get("url")
+    # some feeds put audio only in links rel=enclosure
+    for link in entry.get("links", []) or []:
+        if link.get("rel") == "enclosure" and str(link.get("type", "")).startswith("audio"):
+            return link.get("href")
+    return None
+
+
+def _guid(entry: Any) -> str:
+    g = entry.get("id") or entry.get("link")
+    if g:
+        return str(g)
+    basis = f"{entry.get('title','')}|{entry.get('published','')}"
+    return "sha1:" + hashlib.sha1(basis.encode()).hexdigest()
+
+
+def episode_records(parsed: feedparser.FeedParserDict) -> list[dict]:
+    """Normalize feed entries to episode records. Skips entries with no audio enclosure."""
+    out: list[dict] = []
+    for e in parsed.entries:
+        audio = _enclosure_audio_url(e)
+        if not audio:
+            continue
+        out.append({
+            "guid": _guid(e),
+            "title": e.get("title"),
+            "audio_url": audio,
+            "link": e.get("link"),
+            "published": _published_iso(e),
+        })
+    return out
@@ -0,0 +1,195 @@
+"""One-time backfill path: transcribe podcast episodes via the Gemini multimodal API instead of the
+local Spark Parakeet+diarizer pipeline. Used to take a bulk backfill OFF the shared Spark GPU (which
+contends with production) — it is NOT the steady-state transcriber (local Parakeet remains the default).
+
+Scope/guardrail: podcast audio is PUBLIC data, so sending it to the frontier does NOT trip the
+exposure/positioning-data rule (that guardrail is about Ten31's conviction/exposure data, never public
+audio). Output is written in the SAME 'Speaker: text' transcript format the extractor consumes, so the
+downstream extract→embed stages are agnostic to which transcriber produced the file.
+
+Tradeoff vs local: Gemini yields speaker-LABELED text, not voiceprint fingerprints — so no voiceprint
+auto-edges. We rely on the hand-seeded EISC edges + name-based attribution instead (acceptable for a
+bounded backfill).
+"""
+from __future__ import annotations
+
+import hashlib
+import logging
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+from ..backfill import queue
+from .download import download_enclosure
+
+log = logging.getLogger(__name__)
+
+_PROMPT = (
+    "You are a precise podcast transcriptionist. Transcribe this audio VERBATIM as a speaker-diarized "
+    "transcript.\n"
+    "RULES:\n"
+    "- One line per speaker turn, formatted exactly as `Name: spoken text` (a colon and one space).\n"
+    "- The host of this show is {host} — label every host turn with exactly `{host}` (the person's "
+    "name, never the show's name).\n"
+    "- When the host introduces a guest by name (e.g. 'welcome X to the show', 'I'm joined by X'), use "
+    "that real first name (or full name) as the guest's label for the WHOLE transcript. Only fall back "
+    "to `Guest` (or `Guest 2`, `Guest 3`) if a name is never stated. Do not invent names.\n"
+    "- Do NOT include timestamps, ad-reads markers, summaries, headings, markdown, or any commentary. "
+    "Only the transcript lines.\n"
+    "- Transcribe the entire episode from start to finish. Do not stop early or summarize.\n"
+)
+
+
+def _host_person(source_name: str) -> str:
+    """Derive the host's PERSON name from a source/show name so claimant attribution isn't the show.
+    'What Bitcoin Did (Peter McCormack)' -> 'Peter McCormack'; 'Stephan Livera Podcast' -> 'Stephan
+    Livera'; 'The Kevin Rooke Show' -> 'Kevin Rooke'; 'The Anita Posch Show' -> 'Anita Posch'."""
+    m = re.search(r"\(([^)]+)\)", source_name or "")
+    if m:
+        return m.group(1).strip()
+    s = re.sub(r"^The\s+", "", source_name or "").strip()
+    s = re.sub(r"\s+(Podcast|Show)$", "", s, flags=re.I).strip()
+    return s
+
+
+def _sniff_audio_mime(path: Path) -> str:
+    """Determine audio MIME from the file header — the downloaded enclosure has a generic `.src`
+    extension, so the Files API can't infer it and rejects the upload without an explicit mime_type."""
+    with open(path, "rb") as fh:
+        head = fh.read(16)
+    if head[:3] == b"ID3" or (len(head) > 1 and head[0] == 0xFF and (head[1] & 0xE0) == 0xE0):
+        return "audio/mpeg"
+    if head[4:8] == b"ftyp":
+        return "audio/mp4"          # m4a/aac
+    if head[:4] == b"OggS":
+        return "audio/ogg"
+    if head[:4] == b"RIFF":
+        return "audio/wav"
+    if head[:4] == b"fLaC":
+        return "audio/flac"
+    return "audio/mpeg"             # podcast default
+
+
+def _upload_and_wait(client, audio_path: Path, *, poll_s: float = 2.0, timeout_s: float = 300.0):
+    """Upload to the Files API and wait until the file is ACTIVE (audio is processed server-side)."""
+    from google.genai import types
+    mime = _sniff_audio_mime(audio_path)
+    f = client.files.upload(file=str(audio_path), config=types.UploadFileConfig(mime_type=mime))
+    waited = 0.0
+    while getattr(f.state, "name", str(f.state)) == "PROCESSING" and waited < timeout_s:
+        time.sleep(poll_s)
+        waited += poll_s
+        f = client.files.get(name=f.name)
+    state = getattr(f.state, "name", str(f.state))
+    if state != "ACTIVE":
+        raise RuntimeError(f"Gemini file not ACTIVE (state={state}) for {audio_path.name}")
+    return f
+
+
+def transcribe_one(client, model: str, audio_path: Path, host_name: str, *,
+                   max_output_tokens: int = 65536) -> tuple[str, dict]:
+    """Transcribe a single audio file → (transcript_text, usage_dict). Network/CPU only; no DB."""
+    from google.genai import types
+    f = _upload_and_wait(client, audio_path)
+    try:
+        resp = client.models.generate_content(
+            model=model,
+            contents=[f, _PROMPT.format(host=host_name or "the host")],
+            config=types.GenerateContentConfig(temperature=0, max_output_tokens=max_output_tokens),
+        )
+        text = (resp.text or "").strip()
+        um = getattr(resp, "usage_metadata", None)
+        usage = {
+            "prompt_tokens": getattr(um, "prompt_token_count", 0) or 0,
+            "output_tokens": getattr(um, "candidates_token_count", 0) or 0,
+            "finish_reason": str(getattr(resp.candidates[0], "finish_reason", "")) if resp.candidates else "",
+        }
+        return text, usage
+    finally:
+        try:
+            client.files.delete(name=f.name)
+        except Exception as e:  # noqa: BLE001 — best-effort cleanup
+            log.debug("file cleanup failed for %s: %s", f.name, e)
+
+
+def _fetch_and_transcribe(client, model: str, cfg, doc, host_name: str) -> dict:
+    """Worker-thread unit: download enclosure → Gemini transcribe → write transcript file. No DB writes."""
+    cache = Path(cfg.audio_cache_dir)
+    cache.mkdir(parents=True, exist_ok=True)
+    safe = doc["doc_id"].replace(":", "_")
+    src = cache / f"{safe}.src"
+    audio = download_enclosure(doc["url"], src)
+    try:
+        text, usage = transcribe_one(client, model, audio, host_name)
+        if not text or len(text) < 40:
+            raise RuntimeError(f"empty/short transcript ({len(text)} chars)")
+        tpath = Path(cfg.data_dir) / "transcripts" / f"{safe}.txt"
+        tpath.parent.mkdir(parents=True, exist_ok=True)
+        tpath.write_text(text)
+        return {
+            "doc_id": doc["doc_id"], "ok": True, "transcript_path": str(tpath),
+            "n_lines": text.count("\n") + 1, "content_hash": hashlib.sha256(text.encode()).hexdigest(),
+            "usage": usage,
+        }
+    finally:
+        try:
+            if audio.exists():
+                audio.unlink()
+        except Exception:  # noqa: BLE001
+            pass
+
+
+def run_transcribe_gemini(conn, cfg, *, limit: int = 5, concurrency: int = 4,
+                          lease_seconds: int = 7200, worker_id: str = "gemini-transcribe") -> dict:
+    """Lease pending transcribe jobs and transcribe them via Gemini in parallel. DB writes stay on the
+    main thread; only download+API run in the pool. Reports token usage for cost accounting."""
+    from google import genai
+    if not cfg.gemini_api_key:
+        raise RuntimeError("GEMINI_API_KEY not configured")
+    client = genai.Client(api_key=cfg.gemini_api_key)
+    model = cfg.gemini_model or "gemini-2.5-flash"
+
+    # Lease the batch up front (main thread); resolve docs + host names.
+    leased: list[tuple] = []
+    while len(leased) < limit:
+        job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
+        if job is None:
+            break
+        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
+        if doc is None:
+            queue.skip(conn, job["job_id"], "document missing")
+            continue
+        host = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
+        leased.append((job, doc, _host_person(host["name"]) if host else ""))
+
+    done = failed = prompt_tok = out_tok = 0
+    with ThreadPoolExecutor(max_workers=concurrency) as pool:
+        futs = {pool.submit(_fetch_and_transcribe, client, model, cfg, doc, host): (job, doc)
+                for (job, doc, host) in leased}
+        for fut in as_completed(futs):
+            job, doc = futs[fut]
+            try:
+                r = fut.result()
+                conn.execute(
+                    "UPDATE documents SET transcript_path=?, content_hash=?, processed_at=datetime('now') "
+                    "WHERE doc_id=?", (r["transcript_path"], r["content_hash"], doc["doc_id"]),
+                )
+                h = hashlib.sha256(f"{doc['doc_id']}|extract-v0".encode()).hexdigest()
+                queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
+                              parent_doc_id=doc["doc_id"], priority=100)
+                queue.complete(conn, job["job_id"], output_ref=f"gemini {r['n_lines']} lines")
+                conn.commit()
+                done += 1
+                prompt_tok += r["usage"]["prompt_tokens"]
+                out_tok += r["usage"]["output_tokens"]
+                fr = r["usage"]["finish_reason"]
+                log.info("gemini transcribed %s (%d lines, %d in/%d out tok%s)", doc["doc_id"],
+                         r["n_lines"], r["usage"]["prompt_tokens"], r["usage"]["output_tokens"],
+                         ", TRUNCATED" if "MAX_TOKENS" in fr else "")
+            except Exception as e:  # noqa: BLE001
+                state = queue.fail(conn, job["job_id"], e)
+                conn.commit()
+                failed += 1
+                log.warning("gemini transcribe failed for %s: %s (→ %s)", doc["doc_id"], e, state)
+    return {"done": done, "failed": failed, "prompt_tokens": prompt_tok, "output_tokens": out_tok}
@@ -0,0 +1,45 @@
+"""Speaker-name identification (§4.5 enhancement).
+
+In a 1-on-1 interview the host introduces the guest by name at the top. Reading the transcript head
+with the LLM, we attach a real NAME to each diarized speaker → voiceprints.person_label. This gives
+the independence graph a SECOND, orthogonal overlap signal: the same NAMED guest across two shows is
+a shared_guest edge even when the voiceprints don't cluster (different mic/codec/room). It complements
+voiceprint cosine matching and is robust to fingerprint drift — exactly the case the operator flagged.
+"""
+from __future__ import annotations
+
+import json
+import logging
+
+log = logging.getLogger(__name__)
+
+_SYS = (
+    'You identify the speakers in a podcast/interview transcript. Each line is "LABEL: text". '
+    "Using the introduction and context, determine each LABEL's real full name and role. In an "
+    "interview the host normally introduces themselves and the guest within the first minute. Only "
+    "assert a name you can actually support from the text — if you cannot tell, use null. "
+    'Return ONLY JSON: {"speakers": {"<LABEL>": {"name": "Full Name" or null, '
+    '"role": "host"|"guest"|"panelist"|"unknown", "confidence": "low"|"med"|"high"}}}.'
+)
+
+
+def identify_speakers(backend, transcript_head: str, *, source_name: str, host_hint: str | None = None) -> dict:
+    """Returns {label: {name, role, confidence}}. `backend` is any extract.backends backend."""
+    ctx = f"Show: {source_name}."
+    if host_hint:
+        ctx += f" The show's usual host is {host_hint}."
+    ctx += "\n\nTRANSCRIPT (beginning):\n" + transcript_head
+    messages = [{"role": "system", "content": _SYS}, {"role": "user", "content": ctx}]
+    raw = backend.complete_json(messages, max_tokens=600)
+    try:
+        obj = json.loads(raw)
+    except Exception:
+        i, j = raw.find("{"), raw.rfind("}")
+        if i < 0 or j < 0:
+            return {}
+        try:
+            obj = json.loads(raw[i:j + 1])
+        except Exception:
+            return {}
+    spk = obj.get("speakers", {}) if isinstance(obj, dict) else {}
+    return spk if isinstance(spk, dict) else {}
@@ -0,0 +1,111 @@
+"""Podcast ingestion → documents + 'transcribe' jobs (§4.1).
+
+RSS path: parse the feed, take episodes in [since, until], register documents pointing at the audio
+enclosure. YouTube path: enumerate a channel's videos in the date window via yt-dlp (the back-catalog
+route for the ~9 shows whose RSS is a truncated rolling window — see seeds/podcast_feeds.resolved.yaml).
+The transcribe worker downloads + processes either kind identically.
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+import logging
+import sqlite3
+import subprocess
+
+from ..backfill import queue
+from ..util import audio_dedup_key
+from .feeds import episode_records, fetch_feed
+
+log = logging.getLogger(__name__)
+
+
+def _enqueue_doc(conn, *, source_id, kind, external_id, url, title, date) -> tuple[int, int]:
+    doc_id = f"pod:{source_id}:{hashlib.sha1(external_id.encode()).hexdigest()[:12]}"
+    dkey = audio_dedup_key(title, date)
+    # Cross-mirror dedup (pre-GPU): if this same episode was already processed (any source/feed),
+    # record the sighting for provenance but DON'T re-transcribe. (external_id UNIQUE already covers
+    # same-feed re-ingest; this covers the same episode via a different feed/YouTube mirror.)
+    dup = conn.execute(
+        "SELECT doc_id FROM documents WHERE dedup_key=? AND processed_at IS NOT NULL LIMIT 1", (dkey,)
+    ).fetchone()
+    cur = conn.execute(
+        """INSERT OR IGNORE INTO documents (doc_id, source_id, kind, external_id, url, title, date, dedup_key)
+           VALUES (?,?,?,?,?,?,?,?)""",
+        (doc_id, source_id, kind, external_id, url, title, date, dkey),
+    )
+    conn.commit()
+    if not cur.rowcount:
+        return (0, 0)  # same (source_id, external_id) already known
+    if dup:
+        conn.execute(
+            "UPDATE documents SET processed_at=datetime('now'), raw_path=? WHERE doc_id=?",
+            (f"dup_of:{dup['doc_id']}", doc_id),
+        )
+        conn.commit()
+        log.info("skip transcribe for %s — duplicate content of %s", doc_id, dup["doc_id"])
+        return (1, 0)
+    h = hashlib.sha256(f"{doc_id}|audio-v0".encode()).hexdigest()
+    job = queue.enqueue(conn, job_type="transcribe", target_id=doc_id, input_hash=h,
+                        parent_doc_id=doc_id, priority=100)
+    return (1, 1 if job is not None else 0)
+
+
+def ingest_rss(conn: sqlite3.Connection, source: sqlite3.Row, *, since=None, until=None, limit=20):
+    if not source["rss_url"]:
+        raise ValueError(f"{source['source_id']} has no rss_url")
+    recs = episode_records(fetch_feed(source["rss_url"]))
+    n_docs = n_jobs = count = 0
+    for r in recs:
+        d = r["published"]
+        if since and d and d < since:
+            continue
+        if until and d and d > until:
+            continue
+        if count >= limit:
+            break
+        count += 1
+        nd, nj = _enqueue_doc(conn, source_id=source["source_id"], kind="podcast",
+                              external_id=r["guid"], url=r["audio_url"], title=r["title"], date=d)
+        n_docs += nd
+        n_jobs += nj
+    return n_docs, n_jobs
+
+
+def ingest_youtube(conn: sqlite3.Connection, source: sqlite3.Row, *, since=None, until=None,
+                   limit=20, max_scan=800):
+    """Enumerate channel videos in the date window via yt-dlp (NON-flat, so upload_date is populated —
+    flat mode returns NA). Videos come newest-first, so we use --dateafter/--datebefore to select the
+    window and --break-match-filters to STOP scanning once we drop below `since` (avoids walking the
+    entire channel history). The transcribe worker downloads audio on demand."""
+    if not source["channel_url"]:
+        raise ValueError(f"{source['source_id']} has no channel_url")
+    url = source["channel_url"].rstrip("/")
+    if "/playlist" not in url and not url.endswith("/videos"):
+        url = url + "/videos"
+    cmd = ["yt-dlp", "--no-warnings", "--ignore-errors", "--skip-download",
+           "--print", "%(id)s\t%(upload_date)s\t%(title)s", "--playlist-end", str(max_scan)]
+    if since:
+        s = since.replace("-", "")
+        cmd += ["--dateafter", s, "--break-match-filters", f"upload_date>={s}"]
+    if until:
+        cmd += ["--datebefore", until.replace("-", "")]
+    cmd.append(url)
+    out = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
+    n_docs = n_jobs = count = 0
+    for line in out.stdout.splitlines():
+        parts = line.split("\t")
+        if len(parts) < 2 or not parts[0] or parts[1] in ("NA", ""):
+            continue
+        vid, upd = parts[0], parts[1]
+        title = parts[2] if len(parts) > 2 else vid
+        date = f"{upd[:4]}-{upd[4:6]}-{upd[6:8]}" if len(upd) == 8 else None
+        if count >= limit:
+            break
+        count += 1
+        nd, nj = _enqueue_doc(conn, source_id=source["source_id"], kind="youtube",
+                              external_id=vid, url=f"https://www.youtube.com/watch?v={vid}",
+                              title=title, date=date)
+        n_docs += nd
+        n_jobs += nj
+    return n_docs, n_jobs
@@ -0,0 +1,60 @@
+"""Cross-chunk speaker stitching + the voiceprint library (§4.1, §4.5).
+
+diarize-chunk returns a 192-d TitaNet voiceprint per speaker per chunk. Because each chunk is
+diarized independently, "Speaker 1" in chunk 3 is not the same label as "Speaker 1" in chunk 7 —
+we re-cluster by cosine distance (threshold ~0.7) so one person gets one identity across
+the whole episode. The SAME library then matches a guest ACROSS shows by voice (the independence
+graph's hardest edge, §4.5).
+"""
+from __future__ import annotations
+
+import numpy as np
+
+DISTANCE_THRESHOLD = 0.7  # cosine DISTANCE (1 - cosine similarity); §4.1
+
+
+def _unit(v: np.ndarray) -> np.ndarray:
+    n = np.linalg.norm(v)
+    return v / n if n else v
+
+
+def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
+    return float(1.0 - np.dot(_unit(np.asarray(a, dtype=float)), _unit(np.asarray(b, dtype=float))))
+
+
+def stitch_chunks(chunk_voiceprints: list[np.ndarray], *, threshold: float = DISTANCE_THRESHOLD) -> list[int]:
+    """Greedy online clustering of per-(chunk,speaker) voiceprints into stable speaker ids.
+
+    Input: a flat list of voiceprint vectors (one per chunk-speaker, in encounter order).
+    Output: a parallel list of cluster ids. A vector joins the nearest existing cluster if its
+    distance to that cluster's centroid < threshold, else it starts a new cluster.
+    """
+    centroids: list[np.ndarray] = []
+    counts: list[int] = []
+    labels: list[int] = []
+    for vp in chunk_voiceprints:
+        vp = np.asarray(vp, dtype=float)
+        if centroids:
+            dists = [cosine_distance(vp, c) for c in centroids]
+            j = int(np.argmin(dists))
+            if dists[j] < threshold:
+                centroids[j] = (centroids[j] * counts[j] + vp) / (counts[j] + 1)
+                counts[j] += 1
+                labels.append(j)
+                continue
+        centroids.append(vp.copy())
+        counts.append(1)
+        labels.append(len(centroids) - 1)
+    return labels
+
+
+def match_library(vp: np.ndarray, library: list[tuple[str, np.ndarray]], *,
+                  threshold: float = DISTANCE_THRESHOLD) -> str | None:
+    """Return the voiceprint_id of the closest library entry within threshold, else None
+    (a new speaker → caller mints a new library id)."""
+    best_id, best_d = None, threshold
+    for vid, lib_vec in library:
+        d = cosine_distance(vp, lib_vec)
+        if d < best_d:
+            best_id, best_d = vid, d
+    return best_id
@@ -0,0 +1,308 @@
+"""Audio → speaker-attributed transcript + voiceprint library (§4.1, §4.5).
+
+Per chunk (sequential — audio lock): diarize-chunk (192-d TitaNet fingerprints + timed speaker
+segments) + transcribe (word timestamps). Align words to speakers by time, stitch speakers ACROSS
+chunks by fingerprint cosine, then match the persisted voiceprint library so the SAME guest is
+recognized ACROSS shows by voice — the highest-leverage input to the source-independence graph.
+"""
+from __future__ import annotations
+
+import logging
+import time
+import uuid
+from pathlib import Path
+
+import numpy as np
+
+from ..backfill import queue
+from .chunker import chunk_audio
+from .download import download_enclosure, download_youtube_audio, to_wav_16k_mono
+from .speaker_stitch import DISTANCE_THRESHOLD, match_library, stitch_chunks
+
+log = logging.getLogger(__name__)
+
+
+# ---------- alignment ----------
+def _speaker_at(segments: list[dict], t: float) -> str:
+    for s in segments:
+        if s["start_s"] <= t <= s["end_s"]:
+            return s["speaker"]
+    if not segments:
+        return "Speaker_0"
+    return min(segments, key=lambda s: min(abs(s["start_s"] - t), abs(s["end_s"] - t)))["speaker"]
+
+
+def align_words(words: list[dict], segments: list[dict]) -> list[dict]:
+    """Group word-level transcription into speaker turns using the diarization segments."""
+    turns: list[dict] = []
+    cur: dict | None = None
+    for w in words:
+        mid = (w["start"] + w["end"]) / 2
+        spk = _speaker_at(segments, mid)
+        if cur and cur["speaker"] == spk:
+            cur["text"] += " " + w["text"]
+            cur["end"] = w["end"]
+        else:
+            if cur:
+                turns.append(cur)
+            cur = {"speaker": spk, "start": w["start"], "end": w["end"], "text": w["text"]}
+    if cur:
+        turns.append(cur)
+    return turns
+
+
+# ---------- per-document audio processing ----------
+def diarize_transcribe_chunks(sc, chunks: list[Path], *, concurrency: int = 2):
+    """Returns (chunk_turns, chunk_speakers): turns per chunk + (chunk_idx, local_spk, fingerprint).
+
+    Drives up to `concurrency` chunks in flight — the client's global audio SEMAPHORE is the hard cap
+    across both parakeet endpoints (sit at 2: keeps the single serial GPU continuously fed = full
+    throughput, no idle gap). A single chunk's failure is non-fatal (skip; the client already busy-
+    retries transient blips), but if a MAJORITY of chunks fail the whole job raises so it retries later
+    (rather than emitting a half-empty transcript). Results are reassembled in chunk order."""
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+
+    def _one(idx: int, ch: Path):
+        dia = sc.diarize_chunk(str(ch))
+        tr = sc.transcribe(str(ch))
+        turns = align_words(tr.get("words", []), dia.get("segments", []))
+        spks = [(idx, spk, np.asarray(vec, dtype=np.float32))
+                for spk, vec in (dia.get("fingerprints") or {}).items()]
+        return idx, turns, spks
+
+    results: dict[int, tuple] = {}
+    failed = 0
+    with ThreadPoolExecutor(max_workers=max(1, concurrency)) as pool:
+        futs = {pool.submit(_one, i, ch): i for i, ch in enumerate(chunks)}
+        for fut in as_completed(futs):
+            try:
+                idx, turns, spks = fut.result()
+                results[idx] = (turns, spks)
+            except Exception as e:  # noqa: BLE001 — one contended chunk shouldn't kill the episode
+                failed += 1
+                log.warning("chunk %d/%d failed (%s) — skipping", futs[fut], len(chunks), str(e)[:90])
+    if chunks and failed >= max(3, len(chunks) // 2):
+        raise RuntimeError(f"{failed}/{len(chunks)} chunks failed — backend contended; will retry later")
+    chunk_turns = [(idx, results[idx][0]) for idx in sorted(results)]
+    chunk_speakers = [s for idx in sorted(results) for s in results[idx][1]]
+    return chunk_turns, chunk_speakers
+
+
+def stitch_and_centroids(chunk_speakers, *, threshold: float = DISTANCE_THRESHOLD):
+    """Cluster all (chunk,speaker) fingerprints into within-episode global speakers."""
+    if not chunk_speakers:
+        return {}, {}
+    vecs = [v for (_, _, v) in chunk_speakers]
+    labels = stitch_chunks(vecs, threshold=threshold)
+    keymap: dict[tuple[int, str], int] = {}
+    groups: dict[int, list[np.ndarray]] = {}
+    for (idx, spk, vec), lab in zip(chunk_speakers, labels):
+        keymap[(idx, spk)] = lab
+        groups.setdefault(lab, []).append(vec)
+    centroids = {lab: np.mean(v, axis=0) for lab, v in groups.items()}
+    return keymap, centroids
+
+
+def _load_library(conn) -> list[tuple[str, np.ndarray]]:
+    rows = conn.execute("SELECT voiceprint_id, vector, person_label FROM voiceprints").fetchall()
+    return [(r["voiceprint_id"], np.frombuffer(r["vector"], dtype=np.float32)) for r in rows]
+
+
+def _label_for(conn, vpid: str) -> str:
+    r = conn.execute("SELECT person_label FROM voiceprints WHERE voiceprint_id=?", (vpid,)).fetchone()
+    return (r["person_label"] if r and r["person_label"] else f"SPK:{vpid[:8]}")
+
+
+def resolve_voiceprints(conn, doc, centroids: dict[int, np.ndarray], *, threshold: float = DISTANCE_THRESHOLD):
+    """Match each within-episode speaker to the persisted library (cross-show identity) or mint a new
+    one; record observations; add shared_guest edges when the voice also appears in ANOTHER source."""
+    library = _load_library(conn)
+    cluster_to_vpid: dict[int, str] = {}
+    for lab, cen in centroids.items():
+        vpid = match_library(cen, library, threshold=threshold)
+        if vpid is None:
+            vpid = "vp_" + uuid.uuid4().hex[:16]
+            conn.execute(
+                "INSERT INTO voiceprints (voiceprint_id, vector, first_doc_id) VALUES (?,?,?)",
+                (vpid, cen.astype(np.float32).tobytes(), doc["doc_id"]),
+            )
+            library.append((vpid, cen))
+        conn.execute(
+            "INSERT INTO voiceprint_observations (voiceprint_id, doc_id, chunk_idx) VALUES (?,?,?)",
+            (vpid, doc["doc_id"], None),
+        )
+        cluster_to_vpid[lab] = vpid
+    conn.commit()
+    # independence graph (§4.5): if this voice appears in a DIFFERENT source, that's a shared guest.
+    for vpid in set(cluster_to_vpid.values()):
+        others = conn.execute(
+            """SELECT DISTINCT d.source_id FROM voiceprint_observations o
+                 JOIN documents d ON d.doc_id = o.doc_id
+                WHERE o.voiceprint_id=? AND d.source_id != ?""",
+            (vpid, doc["source_id"]),
+        ).fetchall()
+        for o in others:
+            a, b = sorted([doc["source_id"], o["source_id"]])
+            conn.execute(
+                """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
+                   VALUES (?,?,'shared_guest',1.0,?)
+                   ON CONFLICT(src_a, src_b, edge_type)
+                   DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence""",
+                (a, b, vpid),
+            )
+    conn.commit()
+    return cluster_to_vpid
+
+
+def _labeled(chunk_turns, keymap, label_by_cluster: dict) -> str:
+    lines: list[str] = []
+    for idx, turns in chunk_turns:
+        for t in turns:
+            lab = keymap.get((idx, t["speaker"]))
+            label = label_by_cluster.get(lab, t["speaker"])
+            lines.append(f"{label}: {t['text']}")
+    return "\n".join(lines)
+
+
+def build_transcript(conn, chunk_turns, keymap, cluster_to_vpid) -> str:
+    labels = {lab: _label_for(conn, vpid) for lab, vpid in cluster_to_vpid.items()}
+    return _labeled(chunk_turns, keymap, labels)
+
+
+def apply_names(conn, cluster_to_vpid: dict, idmap: dict) -> dict:
+    """Attach confident names to the voiceprint library (person_label). Returns {cluster: name}."""
+    named: dict[int, str] = {}
+    for lab, vpid in cluster_to_vpid.items():
+        info = idmap.get(f"Speaker {lab + 1}") or idmap.get(str(lab + 1)) or {}
+        name = (info.get("name") or "").strip() if isinstance(info, dict) else ""
+        if name and info.get("confidence") in ("med", "high"):
+            conn.execute("UPDATE voiceprints SET person_label=? WHERE voiceprint_id=?", (name, vpid))
+            named[lab] = name
+    conn.commit()
+    return named
+
+
+def add_name_edges(conn, doc, cluster_to_vpid: dict) -> int:
+    """Name-based shared_guest edges: same person_label seen in a DIFFERENT source → independence edge,
+    even if the voiceprints didn't cluster (drift-robust complement to voiceprint matching, §4.5)."""
+    n = 0
+    for vpid in set(cluster_to_vpid.values()):
+        r = conn.execute("SELECT person_label FROM voiceprints WHERE voiceprint_id=?", (vpid,)).fetchone()
+        name = r["person_label"] if r else None
+        if not name:
+            continue
+        others = conn.execute(
+            """SELECT DISTINCT d.source_id FROM voiceprints v
+                 JOIN voiceprint_observations o ON o.voiceprint_id = v.voiceprint_id
+                 JOIN documents d ON d.doc_id = o.doc_id
+                WHERE v.person_label = ? AND d.source_id != ?""",
+            (name, doc["source_id"]),
+        ).fetchall()
+        for o in others:
+            a, b = sorted([doc["source_id"], o["source_id"]])
+            conn.execute(
+                """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
+                   VALUES (?,?,'shared_guest',1.0,?)
+                   ON CONFLICT(src_a, src_b, edge_type)
+                   DO UPDATE SET weight = weight + 1.0, evidence = excluded.evidence""",
+                (a, b, f"name:{name}"),
+            )
+            n += 1
+    conn.commit()
+    return n
+
+
+def _download_audio(doc, cfg) -> Path:
+    cache = Path(cfg.audio_cache_dir)
+    cache.mkdir(parents=True, exist_ok=True)
+    wav = cache / f"{doc['doc_id'].replace(':', '_')}.wav"
+    if wav.exists():
+        return wav
+    url = doc["url"]
+    if doc["kind"] == "youtube" or (url and ("youtube.com" in url or "youtu.be" in url)):
+        return download_youtube_audio(url, cache, archive_file=cache / "yt-archive.txt")
+    raw = download_enclosure(url, cache / f"{doc['doc_id'].replace(':', '_')}.src")
+    return to_wav_16k_mono(raw, wav)
+
+
+def process_document(conn, sc, cfg, doc, *, max_chunks: int, chunk_seconds: int = 150,
+                     keep_audio: bool = False) -> int:
+    audio = _download_audio(doc, cfg)
+    chunkdir = Path(cfg.audio_cache_dir) / f"chunks_{doc['doc_id'].replace(':', '_')}"
+    chunks = chunk_audio(audio, chunkdir, chunk_seconds=chunk_seconds)[:max_chunks]
+    chunk_turns, chunk_speakers = diarize_transcribe_chunks(
+        sc, chunks, concurrency=getattr(cfg, "audio_concurrency", 2))
+    keymap, centroids = stitch_and_centroids(chunk_speakers)
+    cluster_to_vpid = resolve_voiceprints(conn, doc, centroids)
+
+    # Name the speakers (§4.5): host introduces guest in 1-on-1 → attach person_label, then a
+    # name-based shared_guest edge that survives voiceprint drift across shows.
+    src = conn.execute("SELECT name FROM sources WHERE source_id=?", (doc["source_id"],)).fetchone()
+    try:
+        from ..extract.backends import from_config as backend_from_config
+        from .identify import identify_speakers
+        backend = backend_from_config(cfg, sc)
+        draft = _labeled(chunk_turns, keymap, {lab: f"Speaker {lab + 1}" for lab in cluster_to_vpid})
+        idmap = identify_speakers(backend, draft[:6000], source_name=src["name"] if src else "")
+        named = apply_names(conn, cluster_to_vpid, idmap)
+        if named:
+            log.info("named speakers in %s: %s", doc["doc_id"], ", ".join(named.values()))
+    except Exception as e:  # noqa: BLE001 — naming is best-effort enrichment
+        log.warning("speaker identification failed for %s: %s", doc["doc_id"], e)
+    add_name_edges(conn, doc, cluster_to_vpid)
+
+    transcript = build_transcript(conn, chunk_turns, keymap, cluster_to_vpid)
+    tpath = Path(cfg.data_dir) / "transcripts" / f"{doc['doc_id'].replace(':', '_')}.txt"
+    tpath.parent.mkdir(parents=True, exist_ok=True)
+    tpath.write_text(transcript)
+    import hashlib
+    content_hash = hashlib.sha256(transcript.encode()).hexdigest()
+    conn.execute(
+        "UPDATE documents SET transcript_path=?, duration_sec=?, content_hash=?, processed_at=datetime('now') WHERE doc_id=?",
+        (str(tpath), len(chunks) * chunk_seconds, content_hash, doc["doc_id"]),
+    )
+    conn.commit()
+    h = hashlib.sha256(f"{doc['doc_id']}|extract-v0".encode()).hexdigest()
+    queue.enqueue(conn, job_type="extract", target_id=doc["doc_id"], input_hash=h,
+                  parent_doc_id=doc["doc_id"], priority=100)
+    if not keep_audio:
+        _cleanup_audio(audio, chunkdir)
+    return len(chunk_turns)
+
+
+def _cleanup_audio(audio: Path, chunkdir: Path) -> None:
+    """Audio files are large and disposable once transcribed — reclaim the disk (the transcript +
+    voiceprints are what we keep). Backfilling hundreds of 1-3 hr episodes would otherwise be tens of GB."""
+    import shutil
+    try:
+        if audio.exists():
+            audio.unlink()
+        src = audio.with_suffix(".src")
+        if src.exists():
+            src.unlink()
+        if chunkdir.exists():
+            shutil.rmtree(chunkdir, ignore_errors=True)
+    except Exception as e:  # noqa: BLE001
+        log.warning("audio cleanup failed for %s: %s", audio, e)
+
+
+def run_transcribe(conn, sc, cfg, *, limit: int = 5, max_chunks: int = 999,
+                   lease_seconds: int = 3600, worker_id: str = "transcribe-1") -> dict:
+    processed = 0
+    while processed < limit:
+        job = queue.lease_next(conn, worker_id=worker_id, job_types=["transcribe"], lease_seconds=lease_seconds)
+        if job is None:
+            break
+        processed += 1
+        doc = conn.execute("SELECT * FROM documents WHERE doc_id=?", (job["target_id"],)).fetchone()
+        if doc is None:
+            queue.skip(conn, job["job_id"], "document missing")
+            continue
+        try:
+            n = process_document(conn, sc, cfg, doc, max_chunks=max_chunks)
+            queue.complete(conn, job["job_id"], output_ref=f"{n} chunks")
+            log.info("transcribed %s (%d chunks)", doc["doc_id"], n)
+        except Exception as e:  # noqa: BLE001
+            state = queue.fail(conn, job["job_id"], e)
+            log.warning("transcribe failed for %s: %s (→ %s)", job["target_id"], e, state)
+    return {"jobs_processed": processed}
@@ -0,0 +1,6 @@
+"""The scoring brain (build blueprint).
+
+Stats/geometry NOMINATE candidates; the frontier model only judges/expands a pre-filtered shortlist
+(§5.1). Every count that feeds a score routes through the independence primitive (EISC), never a raw
+source count (§4.5). Every scorer reads `visible_claims` (as-of filtered), never `claims` directly.
+"""
@@ -0,0 +1,43 @@
+"""As-of harness (§6.6 look-ahead guard).
+
+Every scorer reads the `visible_claims` TEMP VIEW, never `claims` directly: at nomination time only
+claims dated <= as_of are visible, so the backtest can't reward noticing what already happened. The
+view also resolves merged canonical topics (topics.status='merged') to a stable `topic_id`.
+"""
+from __future__ import annotations
+
+import sqlite3
+
+
+class Scorer:
+    """Context manager that binds a run to an as_of date and exposes `visible_claims`.
+
+    mode='backtest' enforces strict as-of discipline; 'forward' is the live pilot. as_of is a
+    controlled ISO date (YYYY-MM-DD) — safe to inline into the view DDL (views can't take params)."""
+
+    def __init__(self, conn: sqlite3.Connection, as_of: str, *, mode: str = "backtest") -> None:
+        self.conn = conn
+        self.as_of = as_of
+        self.mode = mode
+
+    def __enter__(self) -> "Scorer":
+        self.conn.executescript(
+            f"""
+            DROP VIEW IF EXISTS visible_claims;
+            CREATE TEMP VIEW visible_claims AS
+            SELECT c.*,
+              COALESCE((SELECT t.merged_into FROM topics t
+                        WHERE t.topic_canonical = c.topic_canonical AND t.status='merged'),
+                       c.topic_canonical) AS topic_id
+            FROM claims c
+            JOIN documents d ON d.doc_id = c.doc_id
+            WHERE c.date IS NOT NULL AND c.date <= '{self.as_of}';
+            """
+        )
+        return self
+
+    def __exit__(self, *exc) -> None:
+        self.conn.execute("DROP VIEW IF EXISTS visible_claims")
+
+    def count_visible(self) -> int:
+        return self.conn.execute("SELECT COUNT(*) FROM visible_claims").fetchone()[0]
@@ -0,0 +1,49 @@
+"""The quantitative bar (§5.1, §6.6) — the single gate between nomination and the frontier judge.
+
+Two tiers:
+  - evidence bar  → clears hard gates → WRITE A LEDGER ROW (the denominator, §6.6), even if never judged.
+  - promotion bar → also clears the score threshold → goes to the frontier judge.
+
+THE GLOBAL META-RULE (applied to every scorer): no candidate clears on a single source or single
+cluster — EISC_adj >= 2.0 AND K_eff >= 2. This is the §2.1 anti-lonely-outlier law, enforced once.
+"""
+from __future__ import annotations
+
+EISC_FLOOR = 2.0
+KEFF_FLOOR = 2
+
+# Defaults; overridable via the score_thresholds table (so the backtest can sweep without code edits).
+DEFAULT_MIN_SCORE = {"under_acted": 0.3, "emergence": 2.0, "contrarian": 1.5,
+                     "convergence": 2.5, "intersection": 2.0}
+
+
+def _min_score(conn, scorer: str) -> float:
+    if conn is not None:
+        row = conn.execute("SELECT min_score FROM score_thresholds WHERE scorer=?", (scorer,)).fetchone()
+        if row and row[0] is not None:
+            return float(row[0])
+    return DEFAULT_MIN_SCORE.get(scorer, 0.0)
+
+
+def evaluate(scorer: str, result: dict, *, conn=None) -> tuple[bool, bool]:
+    """Returns (cleared_evidence_bar, cleared_promotion_bar)."""
+    if scorer == "under_acted":
+        return _under_acted(result, _min_score(conn, scorer))
+    return (False, False)  # Job A scorers wired with the forward pilot
+
+
+def _under_acted(result: dict, min_score: float) -> tuple[bool, bool]:
+    i = result["inputs"]
+    breaker = bool(i.get("is_breaker"))
+    # §4.4 Job B = "rising INDEPENDENT corroboration". EISC>=2.0 enforces independence (shared-guest +
+    # same-cluster discounting), so this is NOT an isolated point or one-guest echo (§2.1). Cross-cluster
+    # (k_eff>=2) is the §4.5 GOLD for Job A DISCOVERY — NOT a hard gate for Job B corroboration: N
+    # independent energy companies confirming a power thesis is real corroboration. Cross-cluster still
+    # BOOSTS the score (eisc_corrob = eisc_adj includes the xcluster_mult) so cross-cluster ranks first.
+    corroborated = (i.get("n_confirmed", 0) >= 4 and i.get("n_src", 0) >= 2
+                    and i.get("eisc_corrob", 0.0) >= EISC_FLOOR and i.get("a_corrob", 0.0) > 0)
+    conv_ok = breaker or i.get("conviction_weight", 0.0) >= 0.7      # med-high / high
+    expo_ok = breaker or i.get("exposure") in ("none", "lt2")        # genuine exposure gap
+    evidence = corroborated and conv_ok and expo_ok
+    promotion = evidence and result["score"] >= min_score
+    return evidence, promotion
@@ -0,0 +1,86 @@
+"""Pre-registered confusion matrix on the §7.1 derivatives (DESIGN_v2 §1.3).
+
+Measures PRECISION and RECALL, not recall alone. Uses the engine's already-stored candidate_scores
+(cleared_date + whisper_date) × the pre-registered external repricing (resolution.K2023.yaml). Reports
+the matrix at BOTH the cleared level (what the engine fired) and the whisper level (what it saw before
+the independence floor) — the delta is the empirical answer to the gate debate.
+"""
+from __future__ import annotations
+
+import json
+from datetime import datetime
+
+import yaml
+
+from .external import basket_index, fetch_eod, resolve_reprice, runway_at_signal
+
+
+def _engine_dates(conn) -> dict[str, dict]:
+    """For each under_acted node: earliest cleared as_of and earliest whisper as_of (n_conf>=4, a>0)."""
+    rows = conn.execute(
+        "SELECT node_id, conviction_id, as_of, cleared_evidence_bar ev, inputs_json "
+        "FROM candidate_scores WHERE scorer='under_acted'"
+    ).fetchall()
+    out: dict[str, dict] = {}
+    for r in rows:
+        k = r["node_id"] or r["conviction_id"]
+        i = json.loads(r["inputs_json"])
+        d = out.setdefault(k, {"cleared": None, "whisper": None})
+        if r["ev"] and (d["cleared"] is None or r["as_of"] < d["cleared"]):
+            d["cleared"] = r["as_of"]
+        if i.get("n_confirmed", 0) >= 4 and i.get("a_corrob", 0) > 0:
+            if d["whisper"] is None or r["as_of"] < d["whisper"]:
+                d["whisper"] = r["as_of"]
+    return out
+
+
+def _lead_days(repricing_date: str, signal_date: str | None) -> int | None:
+    if not signal_date or not repricing_date:
+        return None
+    return (datetime.strptime(repricing_date, "%Y-%m-%d") - datetime.strptime(signal_date, "%Y-%m-%d")).days
+
+
+def run_confusion(conn, cfg, spec_path: str) -> dict:
+    spec = yaml.safe_load(open(spec_path))
+    w, rule = spec["window"], spec["rule"]
+    engine = _engine_dates(conn)
+    price_cache: dict[str, list] = {}
+
+    rows = []
+    for node, basket in spec["baskets"].items():
+        prices = {}
+        for sym in basket:
+            if sym not in price_cache:
+                price_cache[sym] = fetch_eod(cfg.fmp_api_key, sym, w["start"], w["end"])
+            prices[sym] = price_cache[sym]
+        missing = [s for s in basket if not prices[s]]
+        idx = basket_index(prices)
+        res = resolve_reprice(idx, threshold_pct=rule["threshold_pct"], hold_pct=rule["hold_pct"],
+                              hold_days=rule["hold_days"])
+        ed = engine.get(node, {"cleared": None, "whisper": None})
+        rows.append({
+            "node": node, "basket": basket, "missing": missing,
+            "confirmed": res["confirmed"], "repricing_date": res["repricing_date"], "peak_pct": res["peak_pct"],
+            "cleared_date": ed["cleared"], "whisper_date": ed["whisper"],
+            "lead_cleared": _lead_days(res["repricing_date"], ed["cleared"]) if res["confirmed"] else None,
+            "lead_whisper": _lead_days(res["repricing_date"], ed["whisper"]) if res["confirmed"] else None,
+            # DESIGN_v2.1 Correction A: runway = fraction of the durable move still ahead at signal
+            "runway_cleared": runway_at_signal(idx, ed["cleared"]) if res["confirmed"] else None,
+            "runway_whisper": runway_at_signal(idx, ed["whisper"]) if res["confirmed"] else None,
+        })
+
+    def classify(r, level):
+        fired = bool(r[f"{level}_date"])
+        real = r["confirmed"]
+        return "TP" if (fired and real) else "FP" if (fired and not real) else "FN" if real else "TN"
+
+    def matrix(level):
+        c = {"TP": 0, "FP": 0, "FN": 0, "TN": 0}
+        for r in rows:
+            c[classify(r, level)] += 1
+        p = c["TP"] / (c["TP"] + c["FP"]) if (c["TP"] + c["FP"]) else None
+        rec = c["TP"] / (c["TP"] + c["FN"]) if (c["TP"] + c["FN"]) else None
+        return c, p, rec
+
+    return {"rows": rows, "cleared": matrix("cleared"), "whisper": matrix("whisper"),
+            "classify": classify}
@@ -0,0 +1,96 @@
+"""External-confirmation data for the resolver (DESIGN_v2 §1). Price series via FMP (already paid for).
+
+This is the *resolving* leg (§6.2): real-world repricing, not discourse. Kept deliberately simple and
+transparent — the resolution rule is pre-registered, so the code here only fetches + applies it.
+"""
+from __future__ import annotations
+
+import requests
+
+_FMP = "https://financialmodelingprep.com"
+
+
+def fetch_eod(api_key: str, symbol: str, start: str, end: str) -> list[tuple[str, float]]:
+    """Daily (date, close) for a symbol. Tries the FMP 'stable' then legacy 'v3' price endpoints."""
+    s = requests.Session()
+    attempts = [
+        (f"{_FMP}/stable/historical-price-eod/full", {"symbol": symbol, "from": start, "to": end}),
+        (f"{_FMP}/api/v3/historical-price-full/{symbol}", {"from": start, "to": end}),
+    ]
+    for url, params in attempts:
+        try:
+            r = s.get(url, params={**params, "apikey": api_key}, timeout=40)
+            if r.status_code != 200:
+                continue
+            j = r.json()
+        except Exception:  # noqa: BLE001
+            continue
+        rows = j.get("historical") if isinstance(j, dict) else j
+        if not rows:
+            continue
+        out = [(x["date"][:10], x.get("close") or x.get("adjClose")) for x in rows
+               if x.get("date") and (x.get("close") or x.get("adjClose"))]
+        if out:
+            return sorted(out)
+    return []
+
+
+def basket_index(prices_by_symbol: dict[str, list[tuple[str, float]]]) -> list[tuple[str, float]]:
+    """Equal-weight, each-symbol-normalized-to-its-own-first-close index, averaged over dates where
+    data exists. (Symbols that IPO'd mid-window enter at 1.0 when they start — flagged by the caller.)"""
+    norm = {}
+    for sym, series in prices_by_symbol.items():
+        if series:
+            base = series[0][1]
+            norm[sym] = {d: c / base for d, c in series if base}
+    dates = sorted({d for n in norm.values() for d in n})
+    idx = []
+    for d in dates:
+        vals = [n[d] for n in norm.values() if d in n]
+        if vals:
+            idx.append((d, sum(vals) / len(vals)))
+    return idx
+
+
+def index_value_at(index: list[tuple[str, float]], date: str | None) -> float | None:
+    """Latest index value on or before `date` (baseline if the signal predates the data)."""
+    if not index or not date:
+        return None
+    vals = [v for d, v in index if d <= date]
+    return vals[-1] if vals else index[0][1]
+
+
+def runway_at_signal(index: list[tuple[str, float]], signal_date: str | None) -> float | None:
+    """Fraction of the durable move STILL AHEAD at the signal date (DESIGN_v2.1 Correction A).
+    1.0 = whole move ahead (signal before it); 0.0 = signal at the peak. The right metric for a
+    long-duration holder — a modestly-late signal with most of the move ahead is still actionable."""
+    if not index or not signal_date:
+        return None
+    base = index[0][1]
+    peak = max(v for _, v in index)
+    val = index_value_at(index, signal_date)
+    if peak <= base or val is None:
+        return None
+    return round(max(0.0, (peak - val) / (peak - base)), 2)
+
+
+def resolve_reprice(index: list[tuple[str, float]], *, threshold_pct: float, hold_pct: float,
+                    hold_days: int) -> dict:
+    """Apply the pre-registered rule: first date the index is ≥ +threshold% vs baseline AND still
+    ≥ +hold% `hold_days` later. Returns {confirmed, repricing_date, peak_pct}."""
+    from datetime import datetime, timedelta
+    if not index:
+        return {"confirmed": False, "repricing_date": None, "peak_pct": None}
+    base = index[0][1]
+    thr = 1.0 + threshold_pct / 100.0
+    hold = 1.0 + hold_pct / 100.0
+    by_date = dict(index)
+    dates = [d for d, _ in index]
+    peak = max(v for _, v in index)
+    for d, v in index:
+        if v / base >= thr:
+            target = (datetime.strptime(d, "%Y-%m-%d") + timedelta(days=hold_days)).strftime("%Y-%m-%d")
+            later = [vv for dd, vv in index if dd >= target]
+            if later and (later[0] / base) >= hold:
+                return {"confirmed": True, "repricing_date": d, "peak_pct": round((peak / base - 1) * 100, 1)}
+    return {"confirmed": False, "repricing_date": None, "peak_pct": round((peak / base - 1) * 100, 1)}
@@ -0,0 +1,113 @@
+"""Effective Independent Source Count (EISC) — the system's differentiator (§4.5).
+
+Discount convergence by source connectedness. Five shows that "independently converge" but share one
+guest must count as ~one voice; three shows across macro/energy/ai with no shared guests are gold.
+
+Method (resolved in the design panel): noisy-OR connectedness matrix + inverse-row-sum EISC.
+  - symmetric & order-independent (unlike a sequential pairwise-penalty walk)
+  - each source's contribution is individually explainable ("counts 0.31 because connected to 3 others")
+  - collapses correctly: 5 clones -> ~1.0 ; 5 cross-cluster independents -> ~5.0 (raw)
+  - no eigensolve (unstable at n=2..4, our common case)
+"""
+from __future__ import annotations
+
+from collections import defaultdict
+
+import numpy as np
+
+# Coupling per edge type: a voiceprint-confirmed shared guest is near-total redundancy on a topic.
+# Edge types missing from this table get a coupling of 0.0 in effective_independent_N.
+KAPPA = {"shared_guest": 0.85, "citation": 0.45, "community": 0.60}
+# Same-cluster baseline correlation (sources in the same world are partly redundant even w/o an edge).
+CLUSTER_COUPLING = {"bitcoin": 0.55, "vc_consensus": 0.35}
+# Baseline coupling for two sources sharing a cluster that has no CLUSTER_COUPLING entry.
+SAME_CLUSTER_DEFAULT = 0.25
+EDGE_CLAMP = 0.95          # cap kappa*weight so a heavily-weighted edge can't exceed near-total
+CAP_VALUE = 0.25          # §4.5: bitcoin / capped sources contribute at most 0.25 of a voice
+CLUSTER_MIN_CONTRIB = 0.5  # a cluster must add >= half an independent voice to count toward K_eff
+
+
+def effective_independent_N(srcs: list[tuple], edges: list[tuple], *, mode: str = "live") -> dict:
+    """srcs: [(source_id, source_cluster, cluster_capped_low[, own_network])]; edges: [(a,b,type,weight)].
+    mode='live' (default) DROPS own_network sources (Ten31's own orbit — listening to ourselves, §v2.1);
+    mode='test' keeps them (the reflexivity test fixture). Returns {eisc_adj, eisc_raw, k_eff, ...}."""
+    if mode == "live":
+        srcs = [s for s in srcs if not (len(s) > 3 and s[3])]
+    ids = [s[0] for s in srcs]
+    n = len(ids)
+    if n == 0:
+        return {"eisc_adj": 0.0, "eisc_raw": 0.0, "k_eff": 0, "xcluster_mult": 1.0, "per_source_contrib": {}}
+    idx = {sid: i for i, sid in enumerate(ids)}
+    cluster = {s[0]: s[1] for s in srcs}
+    capped = {s[0]: (bool(s[2]) or s[1] == "bitcoin") for s in srcs}
+
+    # edge channel: combine all edges between a pair by noisy-OR product of (1 - kappa*weight)
+    pair_factor: dict = defaultdict(lambda: 1.0)
+    for a, b, etype, w in edges:
+        if a in idx and b in idx and a != b:
+            term = min(EDGE_CLAMP, KAPPA.get(etype, 0.0) * (w if w is not None else 1.0))
+            pair_factor[frozenset((a, b))] *= (1.0 - term)
+
+    C = np.eye(n)
+    for i in range(n):
+        for j in range(i + 1, n):
+            a, b = ids[i], ids[j]
+            e = 1.0 - pair_factor[frozenset((a, b))]            # 0 if no edge
+            ci, cj = cluster[a], cluster[b]
+            clust = (CLUSTER_COUPLING.get(ci, SAME_CLUSTER_DEFAULT)
+                     if (ci is not None and ci == cj) else 0.0)
+            c = 1.0 - (1.0 - e) * (1.0 - clust)
+            C[i, j] = C[j, i] = c
+
+    rowsum = C.sum(axis=1)                                      # includes the diagonal 1.0
+    contrib, eisc_raw = {}, 0.0
+    cluster_mass: dict = defaultdict(float)
+    for i, sid in enumerate(ids):
+        cap = CAP_VALUE if capped[sid] else 1.0
+        contrib[sid] = cap * (1.0 / rowsum[i])
+        eisc_raw += contrib[sid]
+        if not capped[sid] and cluster[sid]:
+            cluster_mass[cluster[sid]] += contrib[sid]
+
+    # cross-cluster bonus: count NON-capped clusters that genuinely contribute an independent voice
+    # (summed contribution >= half a voice). This stops "one guest across many clusters" from earning
+    # the gold multiplier — the raw EISC already collapses that guest to ~1, and k_eff must agree.
+    k_eff = sum(1 for m in cluster_mass.values() if m >= CLUSTER_MIN_CONTRIB)
+    xmult = max(1.0, 1.0 + 0.5 * (k_eff - 1))                   # 1clu->1.0, 2->1.5, 3->2.0 (gold)
+    return {
+        "eisc_adj": xmult * eisc_raw,
+        "eisc_raw": eisc_raw,
+        "k_eff": k_eff,
+        "xcluster_mult": xmult,
+        "per_source_contrib": {k: round(v, 4) for k, v in contrib.items()},
+    }
+
+
+# --- DB helpers (the brain only READS the graph; edges are produced upstream by the voiceprint lib) ---
+def load_source_meta(conn, ids: list[str]) -> list[tuple]:
+    ids = list(dict.fromkeys(ids))
+    if not ids:
+        return []
+    ph = ",".join("?" * len(ids))
+    rows = conn.execute(
+        f"SELECT source_id, source_cluster, cluster_capped_low, COALESCE(own_network,0) "
+        f"FROM sources WHERE source_id IN ({ph})", ids
+    ).fetchall()
+    return [(r[0], r[1], r[2], r[3]) for r in rows]
+
+
+def load_edges(conn, ids: list[str]) -> list[tuple]:
+    ids = list(dict.fromkeys(ids))
+    if not ids:
+        return []
+    ph = ",".join("?" * len(ids))
+    rows = conn.execute(
+        f"SELECT src_a, src_b, edge_type, weight FROM source_edges WHERE src_a IN ({ph}) AND src_b IN ({ph})",
+        ids + ids,
+    ).fetchall()
+    return [(r[0], r[1], r[2], r[3]) for r in rows]
+
+
+def eisc_for(conn, source_ids: list[str], *, mode: str = "live") -> dict:
+    """Convenience: EISC for a set of source_ids, loading cluster/cap/own_network + edges from SQLite.
+    mode='live' drops own_network sources; mode='test' keeps them (§v2.1 condition 1)."""
+    ids = list(dict.fromkeys(source_ids))
+    return effective_independent_N(load_source_meta(conn, ids), load_edges(conn, ids), mode=mode)
@@ -0,0 +1,49 @@
+"""Ledger + candidate_scores writers. Log EVERY bar-clearer from day one (§6.6 denominator).
+
+date_logged = as_of (backtest rows carry historical dates so lead-time math is correct). The
+discourse_metric JSON is FROZEN here at log time — the resolver (separate forward pass) never edits it.
+Grant's rating lives in human_evaluations; the model never reads it pre-log (§6.7).
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+
+
+def _sig_id(scorer: str, key: str, as_of: str) -> str:
+    return "sig_" + hashlib.sha1(f"{scorer}|{key}|{as_of}".encode()).hexdigest()[:16]
+
+
+def _score_id(scorer: str, key: str, as_of: str) -> str:
+    return hashlib.sha1(f"cs|{scorer}|{key}|{as_of}".encode()).hexdigest()
+
+
+def record_candidate_score(conn, result: dict, as_of: str, evidence: bool, promotion: bool) -> None:
+    key = result.get("node_id") or result.get("conviction_id") or result.get("topic_canonical") or ""
+    conn.execute(
+        """INSERT OR REPLACE INTO candidate_scores
+             (score_id, scorer, as_of, topic_canonical, node_id, conviction_id, score,
+              cleared_evidence_bar, cleared_promotion_bar, inputs_json)
+           VALUES (?,?,?,?,?,?,?,?,?,?)""",
+        (_score_id(result["scorer"], key, as_of), result["scorer"], as_of,
+         result.get("topic_canonical"), result.get("node_id"), result.get("conviction_id"),
+         result["score"], int(evidence), int(promotion), json.dumps(result["inputs"])[:8000]),
+    )
+    conn.commit()
+
+
+def log_candidate(conn, *, scorer: str, as_of: str, ledger_type: str, proposition: str,
+                  discourse_metric: dict, origin_conviction_id=None, origin_node_id=None) -> str:
+    key = origin_node_id or origin_conviction_id or proposition
+    signal_id = _sig_id(scorer, key, as_of)
+    dm = {**discourse_metric, "scorer": scorer}
+    conn.execute(
+        """INSERT OR IGNORE INTO ledger
+             (signal_id, type, proposition, date_logged, discourse_metric, model_confidence,
+              origin_conviction_id, origin_node_id)
+           VALUES (?,?,?,?,?,?,?,?)""",
+        (signal_id, ledger_type, proposition[:1000], as_of, json.dumps(dm)[:8000], None,
+         origin_conviction_id, origin_node_id),
+    )
+    conn.commit()
+    return signal_id
@@ -0,0 +1,80 @@
+"""Local-LLM scoring helpers (§4.4). Bounded labeling passes over PRE-FILTERED candidates only —
+never nomination from the raw corpus (§5.1). JSON mode, temp 0, no thinking → deterministic.
+
+Helper #2 (derivative-relevance) is built first — it's the one the §7.1 backtest needs. Helper #1
+(stance-folding for Job A contrarian) comes with the forward pilot.
+"""
+from __future__ import annotations
+
+import json
+import logging
+
+log = logging.getLogger(__name__)
+
+# System prompt for the derivative-relevance pass (sent as the "system" message by
+# derivative_relevance). It fixes the three direction labels and the JSON reply shape that _parse
+# reads back: {"results": [{"claim_id", "corroborates", "direction"}]}.
+_REL_SYS = (
+    "You assess whether claims corroborate a specific investment hypothesis (a 2nd/3rd-order "
+    "derivative of a thesis). For EACH claim decide: does it provide real-world evidence that the "
+    "hypothesis is PLAYING OUT (corroborates), and the direction. 'affirms' = supports the hypothesis; "
+    "'contradicts' = is evidence against it; 'tangential' = same topic words but not actually about the "
+    "hypothesis (e.g. 'transformers' the ML architecture vs the electrical-grid kind). Be strict: a "
+    "passing mention is tangential, not corroboration. "
+    "TWO HARD RULES (these are the difference between catching a real signal and being fooled):\n"
+    "1) REALIZED-ONLY. The hypothesis must be PLAYING OUT in fact. Announcements, plans, intentions, "
+    "forecasts, targets, and 'may/will/expects/poised-to/aims-to/up-to' language are NOT corroboration — "
+    "they are 'tangential' unless the claim states the thing has ACTUALLY HAPPENED / been DEPLOYED / "
+    "closed. A $2B program 'announced' or capital 'made available' is NOT capital deployed. A company "
+    "that 'may consider' or 'expects' something has not done it.\n"
+    "2) ROLE-MATCH. The actor in the claim must occupy the role the hypothesis is about. If the "
+    "hypothesis is that capital PROVIDERS are funding/supplying something, then a BORROWER or USER on the "
+    "demand side (e.g. a firm posting an asset AS collateral to RECEIVE a loan) is the wrong side of the "
+    "transaction → 'tangential' to that hypothesis, not 'affirms'. "
+    'Return ONLY JSON: {"results":[{"claim_id":"...","corroborates":true|false,'
+    '"direction":"affirms"|"contradicts"|"tangential"}]}.'
+)
+
+
+def _parse(raw: str) -> list[dict]:
+    try:
+        obj = json.loads(raw)
+    except Exception:
+        i, j = raw.find("{"), raw.rfind("}")
+        if i < 0 or j < 0:
+            return []
+        try:
+            obj = json.loads(raw[i:j + 1])
+        except Exception:
+            return []
+    res = obj.get("results", []) if isinstance(obj, dict) else []
+    return [r for r in res if isinstance(r, dict) and r.get("claim_id")]
+
+
+def derivative_relevance(backend, derivative: str, claims: list[dict]) -> dict[str, dict]:
+    """claims: [{claim_id, proposition}]. Returns {claim_id: {corroborates, direction}}.
+    Filters retrieval near-misses; it cannot ADD claims search didn't return (not a nominator)."""
+    if not claims:
+        return {}
+    listing = "\n".join(f"- [{c['claim_id']}] {c['proposition']}" for c in claims)
+    user = (f"HYPOTHESIS (derivative): {derivative}\n\nCLAIMS:\n{listing}\n\n"
+            f"Judge each claim id.")
+    messages = [{"role": "system", "content": _REL_SYS}, {"role": "user", "content": user}]
+    # Output is ~one JSON record per claim (claim_id + corroborates + direction ≈ 70-100 tokens). At
+    # top_k=60 that's ~5k tokens — a fixed 3000 budget truncated mid-array → empty parse → a node
+    # silently zeroed (the source of the unstable 5-affirm/0-affirm flip). Size the budget to the batch.
+    budget = max(3000, 120 * len(claims) + 500)
+    parsed = []
+    for attempt in range(2):  # one retry — a gateway-under-load truncation shouldn't zero out a node
+        raw = backend.complete_json(messages, max_tokens=budget)
+        parsed = _parse(raw)
+        if parsed:
+            break
+        log.warning("derivative_relevance empty parse (attempt %d) for %r; raw[:160]=%r",
+                    attempt + 1, derivative[:50], raw[:160])
+    # The listing presents ids as `- [{claim_id}] ...`; the model INCONSISTENTLY echoes the id back with
+    # the surrounding brackets ("[edgar:...]") — which then misses the bracket-less lookup key and the
+    # whole node reads as 0/(missing). Normalize the brackets+whitespace so matching is robust either way.
+    out = {}
+    for r in parsed:
+        cid = str(r["claim_id"]).strip().strip("[]").strip()
+        out[cid] = {"corroborates": bool(r.get("corroborates")),
+                    "direction": r.get("direction", "tangential")}
+    return out
@@ -0,0 +1,27 @@
+"""Resolver — the SEPARATE forward pass that closes the loop (§6.2, §6.3).
+
+ARCHITECTURALLY ISOLATED from the scorers: it has no shared write path with them. Scorers write
+candidate_scores + ledger rows with outcome columns NULL and a FROZEN discourse_metric. The resolver
+runs later (larger as_of), reads ledger rows whose date_logged < as_of_now, and writes ONLY
+resolution_date / discourse_outcome / external_outcome / lead_time_days. It is FORBIDDEN from touching
+discourse_metric — that is the structural reason the ledger can't reward noticing what already happened.
+
+Implementation note: real resolutions need forward time (the clock can't be backfilled). For the
+backtest, the discourse leg can be resolved by re-running the discourse metric forward from date_logged;
+the external leg (price/filings/human check, §6.5) is filled as that evidence arrives. Stubbed now to
+lock the architecture; filled out for the forward pilot.
+"""
+from __future__ import annotations
+
+
+def resolve_discourse_leg(conn, sc, cfg, *, as_of_now: str) -> int:
+    """For each ledger row logged before as_of_now without a resolution, re-measure discourse forward
+    and set discourse_outcome + lead_time. (Forward-only; never reads/edits discourse_metric.)
+    Returns count resolved. STUB — implemented for the forward pilot."""
+    rows = conn.execute(
+        "SELECT signal_id, date_logged FROM ledger WHERE resolution_date IS NULL AND date_logged < ?",
+        (as_of_now,),
+    ).fetchall()
+    # TODO(forward-pilot): re-run windowed independence from date_logged→as_of_now for each row's
+    # origin derivative; set discourse_outcome in {up_cross_cluster,up_single_cluster,flat,down}.
+    return 0
@@ -0,0 +1,81 @@
+"""Scoring orchestrator. For Job B / the §7.1 backtest: march as_of dates, score every conviction +
+fan-out derivative, gate, log the denominator, promote nodes.
+"""
+from __future__ import annotations
+
+import logging
+
+from ..extract.backends import from_config as backend_from_config
+from . import bar, under_acted
+from .asof import Scorer
+from .ledger_writer import log_candidate, record_candidate_score
+
+log = logging.getLogger(__name__)
+
+
+def _nodes_for(conn, as_of, mode, conviction_ids):
+    nodes = []
+    where, params = "", []
+    if conviction_ids:
+        ph = ",".join("?" * len(conviction_ids))
+        where = f" WHERE conviction_id IN ({ph})"
+        params = list(conviction_ids)
+    for c in conn.execute(
+        f"SELECT conviction_id, thematic_proposition, conviction_level, current_exposure, is_thesis_breaker "
+        f"FROM conviction_log{where}", params,
+    ):
+        nodes.append({"conviction_id": c[0], "node_id": None, "derivative": c[1],
+                      "level": c[2], "exposure": c[3], "breaker": bool(c[4])})
+    fq = ("SELECT f.node_id, f.parent_conviction_id, f.derivative_proposition, c.conviction_level, "
+          "c.current_exposure, c.is_thesis_breaker FROM fanout_nodes f "
+          "JOIN conviction_log c ON c.conviction_id = f.parent_conviction_id")
+    conds, fparams = [], []
+    if conviction_ids:
+        conds.append(f"f.parent_conviction_id IN ({','.join('?' * len(conviction_ids))})")
+        fparams += list(conviction_ids)
+    if mode == "forward":   # backtest uses the seeded tree as the as-of-2023 hypothesis (no created_at leak)
+        conds.append("f.created_at <= ?")
+        fparams.append(as_of)
+    if conds:
+        fq += " WHERE " + " AND ".join(conds)
+    for f in conn.execute(fq, fparams):
+        nodes.append({"conviction_id": f[1], "node_id": f[0], "derivative": f[2],
+                      "level": f[3], "exposure": f[4], "breaker": bool(f[5])})
+    return nodes
+
+
+def run_under_acted(conn, sc, cfg, *, as_of, mode="backtest", conviction_ids=None, window_days=28) -> list[dict]:
+    backend = backend_from_config(cfg, sc)
+    out = []
+    with Scorer(conn, as_of, mode=mode):
+        for nd in _nodes_for(conn, as_of, mode, conviction_ids):
+            r = under_acted.score_node(
+                conn, sc, backend, as_of=as_of, derivative=nd["derivative"],
+                conviction_id=nd["conviction_id"], node_id=nd["node_id"],
+                conviction_level=nd["level"], exposure=nd["exposure"], is_breaker=nd["breaker"],
+                window_days=window_days,
+            )
+            ev, pr = bar.evaluate("under_acted", r, conn=conn)
+            record_candidate_score(conn, r, as_of, ev, pr)
+            if ev:
+                log_candidate(conn, scorer="under_acted", as_of=as_of,
+                              ledger_type="under_acted_conviction", proposition=nd["derivative"],
+                              discourse_metric=r["inputs"], origin_conviction_id=nd["conviction_id"],
+                              origin_node_id=nd["node_id"])
+                if nd["node_id"]:
+                    conn.execute("UPDATE fanout_nodes SET status=? WHERE node_id=?",
+                                 ("signal" if pr else "corroborated", nd["node_id"]))
+                    conn.commit()
+            out.append({"node": nd, "result": r, "evidence": ev, "promotion": pr})
+    return out
+
+
+def run_backtest(conn, sc, cfg, *, conviction_id, dates, window_days=90) -> list[tuple]:
+    timeline = []
+    for as_of in dates:
+        res = run_under_acted(conn, sc, cfg, as_of=as_of, mode="backtest",
+                              conviction_ids=[conviction_id], window_days=window_days)
+        timeline.append((as_of, res))
+        fired = [r for r in res if r["evidence"]]
+        log.info("as_of %s: %d/%d nodes cleared evidence bar", as_of, len(fired), len(res))
+    return timeline
@@ -0,0 +1,105 @@
+"""Two-sided net-corroboration (DESIGN_v2.1 H5 + condition 3) — the instrument for the adversarial cases.
+
+For a derivative, track the INDEPENDENCE-WEIGHTED affirms MINUS denies over time. This is the right
+output for Strike/Battery (where the question is "did the engine distinguish real adoption from
+narrative, and catch the contradiction?"), not runway:
+  - STRIKE (reflexivity): a PASS = net stays low/quiet in LIVE mode (own_network dropped) while it
+    would have fired in TEST mode (own_network kept) → the engine refuses the intra-cluster echo.
+  - BATTERY (timing): the DEMAND derivative's net rises while the SUPPLY derivative's net stays flat →
+    "half-confirmed, the load-bearing half isn't moving" = the eroding-conviction signal.
+Reuses the §4.6 relevance helper, which already returns direction affirms|contradicts|tangential.
+"""
+from __future__ import annotations
+
+from .independence import eisc_for
+from .llm_helpers import derivative_relevance
+from .windows import window_bounds
+
+
+def classify_corpus(sc, backend, derivative: str, as_of: str, *, top_k: int = 60) -> list[dict]:
+    """Retrieve (as-of filtered) + LLM-classify each claim's direction toward the derivative.
+    Returns affirms/contradicts claims with source_id + date (tangential dropped)."""
+    res = sc.search(derivative, collection="propositions", top_k=top_k, rerank=True)
+    hits = res.get("data", []) if isinstance(res, dict) else []
+    cand = []
+    for h in hits:
+        pl = (h.get("payload") or {})
+        d = pl.get("date")
+        if not pl.get("claim_id") or not d or d[:10] > as_of:
+            continue
+        cand.append({"claim_id": pl["claim_id"], "proposition": pl.get("proposition", ""),
+                     "date": d[:10], "source_id": pl.get("source_id")})
+    if not cand:
+        return []
+    rel = derivative_relevance(backend, derivative,
+                               [{"claim_id": c["claim_id"], "proposition": c["proposition"]} for c in cand])
+    out = []
+    for c in cand:
+        direction = rel.get(c["claim_id"], {}).get("direction", "tangential")
+        if direction in ("affirms", "contradicts"):
+            out.append({**c, "direction": direction})
+    return out
+
+
+# DESIGN_v2 ADOPT #1 (claim-type weighting): a node "resolves" on REALIZED, descriptive disclosure —
+# not on forecasts/intent. A source counts toward the net only if it carries a HARD (realized-fact)
+# claim on this side; predictive/interpretive claims (forecasts, opinion, 'may consider', 'expects')
+# are the exact material that fooled the supply axis on Battery, so they don't qualify a source alone.
+# Used by _hard_sources as the claim_type IN (...) filter.
+_HARD_CLAIM_TYPES = ("descriptive", "reactive")
+
+
+def _hard_sources(conn, claim_ids: list[str]) -> set:
+    """Sources that contributed at least one realized-fact (descriptive/reactive) claim among claim_ids."""
+    if not claim_ids:
+        return set()
+    ph = ",".join("?" * len(claim_ids))
+    qph = ",".join("?" * len(_HARD_CLAIM_TYPES))
+    rows = conn.execute(
+        f"SELECT DISTINCT source_id FROM claims WHERE claim_id IN ({ph}) AND claim_type IN ({qph})",
+        list(claim_ids) + list(_HARD_CLAIM_TYPES),
+    ).fetchall()
+    return {r[0] for r in rows}
+
+
+def net_at(conn, classified: list[dict], as_of: str, *, window_days: int = 90, mode: str = "live",
+           require_hard_evidence: bool = True) -> dict:
+    """Net independence-weighted corroboration in the trailing window ending at as_of. With
+    require_hard_evidence (default), a source only counts on a side if it carries a realized-fact claim
+    there — forecasts/intent alone don't qualify it (the announced-vs-deployed / opinion-vs-fact guard)."""
+    _, start, end = window_bounds(as_of, n=1, days=window_days)[0]
+    win = [c for c in classified if start < c["date"] <= end]
+    aff = [c for c in win if c["direction"] == "affirms"]
+    den = [c for c in win if c["direction"] == "contradicts"]
+    aff_src_all = {c["source_id"] for c in aff}
+    den_src_all = {c["source_id"] for c in den}
+    if require_hard_evidence:
+        hard_aff = _hard_sources(conn, [c["claim_id"] for c in aff])
+        hard_den = _hard_sources(conn, [c["claim_id"] for c in den])
+        aff_src = list(aff_src_all & hard_aff)
+        den_src = list(den_src_all & hard_den)
+    else:
+        aff_src, den_src = list(aff_src_all), list(den_src_all)
+    aff_e = eisc_for(conn, aff_src, mode=mode)["eisc_adj"] if aff_src else 0.0
+    den_e = eisc_for(conn, den_src, mode=mode)["eisc_adj"] if den_src else 0.0
+    own = 0
+    if aff_src:
+        ph = ",".join("?" * len(aff_src))
+        own = conn.execute(
+            f"SELECT COUNT(*) FROM sources WHERE source_id IN ({ph}) AND COALESCE(own_network,0)=1", aff_src
+        ).fetchone()[0]
+    return {"as_of": as_of, "affirms_eisc": round(aff_e, 2), "denies_eisc": round(den_e, 2),
+            "net": round(aff_e - den_e, 2),
+            "n_affirm": len(aff), "n_deny": len(den),
+            "hard_affirm_src": len(aff_src), "soft_affirm_src_dropped": len(aff_src_all) - len(aff_src),
+            "own_network_affirm_src": own}
+
+
+def trajectory(conn, sc, backend, derivative: str, as_of_dates: list[str], *,
+               window_days: int = 90, mode: str = "live", top_k: int = 60) -> list[dict]:
+    """The net-corroboration curve over as_of_dates. Run twice (mode='live' vs 'test') to see what the
+    own_network quarantine removes — the reflexivity measurement."""
+    out = []
+    for as_of in as_of_dates:
+        classified = classify_corpus(sc, backend, derivative, as_of, top_k=top_k)
+        out.append(net_at(conn, classified, as_of, window_days=window_days, mode=mode))
+    return out
@@ -0,0 +1,75 @@
+"""Under-acted-conviction scorer — Job B, the §7.1 backtest target.
+
+score = conviction_weight x exposure_gap x rising_independent_corroboration
+
+Fires when Ten31 believes something (high conviction), has little/no position (exposure gap), and the
+world is beginning to corroborate it or a derivative of it — independently and with acceleration. This
+is the signal that should have flagged "size up power-infra picks-and-shovels" in 2023.
+
+Exposure is joined LOCALLY (never crosses the frontier boundary, §4.6). Corroboration is RETRIEVED
+(stats nominate), then an LLM helper only FILTERS retrieval near-misses (§5.1) — it cannot add claims.
+"""
+from __future__ import annotations
+
+from .llm_helpers import derivative_relevance
+from .windows import windowed_independence
+
+# Multiplier per conviction level; score_node falls back to 0.4 for a level not listed here.
+CONVICTION_WEIGHT = {"low": 0.15, "med": 0.4, "med-high": 0.7, "high": 1.0}
+# Multiplier per exposure bucket (less exposure → larger gap); score_node falls back to 0.6 for
+# a bucket not listed here.
+EXPOSURE_GAP = {"none": 1.0, "lt2": 0.8, "2to10": 0.4, "gt10": 0.1, "unset": 0.6}
+
+
+def score_node(conn, sc, backend, *, as_of: str, derivative: str, conviction_id: str,
+               node_id: str | None, conviction_level: str, exposure: str,
+               is_breaker: bool = False, top_k: int = 40, window_days: int = 28) -> dict:
+    cw = CONVICTION_WEIGHT.get(conviction_level, 0.4)
+    eg = EXPOSURE_GAP.get(exposure, 0.6)
+
+    # 1. RETRIEVE (stats nominate): hybrid search over embedded propositions; as-of post-filter.
+    try:
+        res = sc.search(derivative, collection="propositions", top_k=top_k, rerank=True)
+    except Exception as e:  # noqa: BLE001
+        return _result(conviction_id, node_id, 0.0, {"reason": f"search_failed:{str(e)[:60]}"},
+                       cw, eg, exposure, is_breaker)
+    hits = res.get("data", []) if isinstance(res, dict) else []
+    cand = []
+    for h in hits:
+        pl = (h.get("payload") or {}) if isinstance(h, dict) else {}
+        d = pl.get("date")
+        if not pl.get("claim_id") or not d or d[:10] > as_of:   # Qdrant can't date-filter; do it here
+            continue
+        cand.append({"claim_id": pl["claim_id"], "proposition": pl.get("proposition", ""),
+                     "date": d, "source_id": pl.get("source_id")})
+    if not cand:
+        return _result(conviction_id, node_id, 0.0, {"reason": "no_retrieval", "n_retrieved": 0},
+                       cw, eg, exposure, is_breaker)
+
+    # 2. FILTER near-misses with the LLM (affirms-only). Not a nominator — can't add claims.
+    rel = derivative_relevance(backend, derivative,
+                               [{"claim_id": c["claim_id"], "proposition": c["proposition"]} for c in cand])
+    confirmed = [c for c in cand
+                 if rel.get(c["claim_id"], {}).get("corroborates")
+                 and rel[c["claim_id"]].get("direction") == "affirms"]
+    n_src = len({c["source_id"] for c in confirmed})
+
+    # 3. CORROBORATION = independence-weighted acceleration over the confirmed set (treat as a topic).
+    #    window_days matches corpus cadence: ~90d for quarterly filings/earnings, ~28d for weekly podcasts.
+    wi = windowed_independence(conn, [(c["date"], c["source_id"]) for c in confirmed], as_of, days=window_days)
+    a_corrob = wi["acceleration"]
+    eisc_corrob = wi["eisc0"]
+    corroboration = max(0.0, a_corrob) * eisc_corrob
+
+    score = corroboration if is_breaker else cw * eg * corroboration
+    inputs = {
+        "as_of": as_of, "derivative": derivative, "n_retrieved": len(cand), "n_confirmed": len(confirmed),
+        "n_src": n_src, "a_corrob": a_corrob, "eisc_corrob": eisc_corrob, "k_eff0": wi["k_eff0"],
+        "window_counts": wi["counts"], "window_eisc": wi["eisc"], "corroboration": round(corroboration, 3),
+        "confirmed_claim_ids": [c["claim_id"] for c in confirmed][:50],
+    }
+    return _result(conviction_id, node_id, score, inputs, cw, eg, exposure, is_breaker)
+
+
+def _result(conviction_id, node_id, score, inputs, cw, eg, exposure, is_breaker) -> dict:
+    inputs = {**inputs, "conviction_weight": cw, "exposure_gap": eg, "exposure": exposure,
+              "is_breaker": is_breaker}
+    return {"scorer": "under_acted", "conviction_id": conviction_id, "node_id": node_id,
+            "score": round(float(score), 4), "inputs": inputs}
@@ -0,0 +1,53 @@
+"""Temporal windows + windowed independence (the single temporal layer, §4.4).
+
+28-day non-overlapping windows anchored at as_of (W0 ends at as_of, then back). Non-overlapping
+avoids autocorrelation faking significance. The signal is the discrete 2nd derivative of the
+INDEPENDENCE-WEIGHTED flow (EISC per window), never the raw count — so a topic that "accelerates"
+only because one show booked the same guest three times has flat N(W).
+"""
+from __future__ import annotations
+
+from datetime import datetime, timedelta
+
+from .independence import eisc_for
+
+WINDOW_DAYS = 28  # default window length in days (window_bounds / windowed_independence)
+N_WINDOWS = 3  # default number of trailing windows; windowed_independence needs >= 3 for acceleration
+
+
+def _d(s: str) -> datetime:
+    return datetime.strptime(s[:10], "%Y-%m-%d")
+
+
+def window_bounds(as_of: str, *, n: int = N_WINDOWS, days: int = WINDOW_DAYS) -> list[tuple[int, str, str]]:
+    """Returns [(idx, start_iso, end_iso)] with W0 ending at as_of, extending backward only."""
+    end = _d(as_of)
+    out = []
+    for idx in range(n):
+        w_end = end - timedelta(days=idx * days)
+        w_start = end - timedelta(days=(idx + 1) * days)
+        out.append((idx, w_start.strftime("%Y-%m-%d"), w_end.strftime("%Y-%m-%d")))
+    return out
+
+
+def windowed_independence(conn, rows: list[tuple], as_of: str, *, n: int = N_WINDOWS,
+                          days: int = WINDOW_DAYS) -> dict:
+    """rows: [(date_iso, source_id)]. For each window compute raw count + EISC_adj of its sources.
+    Returns {counts:[c0..], eisc:[N0..], k_eff:[...], acceleration, eisc0, sources0}.
+    acceleration = N0 - 2*N1 + N2 (independence-weighted 2nd derivative)."""
+    bounds = window_bounds(as_of, n=n, days=days)
+    counts, eiscs, keffs, src_sets = [], [], [], []
+    for _idx, start, end in bounds:
+        win = [r for r in rows if r[0] and start < r[0][:10] <= end]
+        srcs = list({r[1] for r in win})
+        e = eisc_for(conn, srcs) if srcs else {"eisc_adj": 0.0, "k_eff": 0}
+        counts.append(len(win))
+        eiscs.append(e["eisc_adj"])
+        keffs.append(e["k_eff"])
+        src_sets.append(srcs)
+    accel = eiscs[0] - 2 * eiscs[1] + eiscs[2] if n >= 3 else 0.0
+    return {
+        "counts": counts, "eisc": [round(x, 3) for x in eiscs], "k_eff": keffs,
+        "acceleration": round(accel, 3), "eisc0": round(eiscs[0], 3), "k_eff0": keffs[0],
+        "sources0": src_sets[0], "n_total": sum(counts),
+    }
@@ -0,0 +1,9 @@
+"""Spark Control gateway client — the SINGLE chokepoint for all gateway HTTP (§13).
+
+No other module in the engine knows the gateway URL. Everything local-compute
+(transcription, diarization, the local LLM, embeddings, rerank, hybrid search, and the
+scrub/rehydrate sovereignty boundary) goes through here.
+"""
+from .client import SparkControl, SparkControlError, from_config
+
+__all__ = ["SparkControl", "SparkControlError", "from_config"]
@@ -0,0 +1,242 @@
+"""Spark Control HTTP client (handoff §13.2 endpoint table).
+
+Enforces the two operational invariants from §4.1 / §13.4 (revised per infra guidance 2026-06-09):
+  1. AUDIO concurrency is CAPPED at 2 in-flight (hard ceiling 3), GLOBAL across both parakeet
+     endpoints (/v1/audio/transcriptions + /api/audio/diarize*) — they share ONE serial GPU. A
+     process-wide BoundedSemaphore enforces it. Going wider buys ZERO throughput (requests queue and
+     hold the GPU); 2 just keeps the GPU continuously fed with no idle gap = full throughput.
+  2. Transient unresponsiveness is NORMAL, not failure: when the GPU stays continuously busy the
+     /health and in-flight requests can briefly (1-4s) stop responding. Timeouts / 503s /
+     connection-resets are "busy, retry" — handled by short exponential backoff, never treated as work loss.
+
+NOTE: request/response *shapes* for the non-OpenAI endpoints (/api/audio/*, /scrub,
+/rehydrate, /api/search) are provisional and marked TODO(contract) — confirm against the
+live gateway's /api/endpoints. The OpenAI-compatible routes (/v1/*) follow the standard.
+"""
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from pathlib import Path
+from typing import Any
+
+import requests
+
+log = logging.getLogger(__name__)
+
+# Process-wide AUDIO in-flight cap, GLOBAL across both parakeet endpoints. Single serial GPU shared
+# with the operator's production app → concurrency only deepens the queue + lengthens transient
+# busy-blips; sit at 2 (full throughput, ~2-3s busy windows), hard ceiling 3.
+_AUDIO_MAX = 3
+_AUDIO_SEM = threading.BoundedSemaphore(2)
+
+
+def _set_audio_concurrency(n: int) -> None:
+    """Resize the global audio semaphore (clamped to [1, _AUDIO_MAX]). Called at client init from config;
+    set before any worker threads start, so the rebind is not racing in-flight acquirers."""
+    global _AUDIO_SEM
+    _AUDIO_SEM = threading.BoundedSemaphore(min(_AUDIO_MAX, max(1, int(n))))
+
+
+class SparkControlError(RuntimeError):
+    pass
+
+
+class SparkControl:
+    def __init__(
+        self,
+        base_url: str,
+        *,
+        verify_tls: bool = False,
+        timeout: float = 120.0,
+        llm_model: str = "",
+        embed_model: str = "",
+        transcribe_model: str = "",
+        audio_concurrency: int = 2,
+    ) -> None:
+        self.base = base_url.rstrip("/")
+        self.verify = verify_tls
+        self.timeout = timeout
+        self.llm_model = llm_model
+        self.embed_model = embed_model
+        self.transcribe_model = transcribe_model
+        _set_audio_concurrency(audio_concurrency)
+        self._session = requests.Session()
+        if not verify_tls:
+            # same-LAN self-signed cert (§13): suppress the per-request InsecureRequestWarning noise.
+            import urllib3
+            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+    # ---------- low-level ----------
+    def _post(
+        self,
+        path: str,
+        *,
+        json: Any = None,
+        files: Any = None,
+        data: Any = None,
+        retries: int = 4,
+        backoff: float = 5.0,
+    ) -> Any:
+        url = f"{self.base}{path}"
+        for attempt in range(retries + 1):
+            try:
+                r = self._session.post(
+                    url, json=json, files=files, data=data,
+                    timeout=self.timeout, verify=self.verify,
+                )
+                if r.status_code == 503:
+                    raise SparkControlError("503 from Spark Control (GPU busy / cold start)")
+                r.raise_for_status()
+                return r.json()
+            except (requests.RequestException, SparkControlError) as e:
+                if attempt < retries:
+                    sleep = backoff * (2 ** attempt)
+                    log.warning("Spark Control POST %s failed (%s); retry %d/%d in %.0fs",
+                                path, e, attempt + 1, retries, sleep)
+                    time.sleep(sleep)
+                else:
+                    raise SparkControlError(f"POST {path} failed after {retries} retries: {e}") from e
+
+    def _get(self, path: str) -> Any:
+        r = self._session.get(f"{self.base}{path}", timeout=self.timeout, verify=self.verify)
+        r.raise_for_status()
+        return r.json()
+
+    # ---------- health / discovery (§13.2) ----------
+    def status(self) -> Any:
+        return self._get("/api/status")
+
+    def endpoints(self) -> Any:
+        return self._get("/api/endpoints")
+
+    # ---------- local LLM: extraction + scoring helpers (§4.2) ----------
+    def chat(
+        self,
+        messages: list[dict[str, str]],
+        *,
+        json_object: bool = True,
+        temperature: float = 0.0,
+        enable_thinking: bool = False,
+        max_tokens: int | None = None,
+    ) -> Any:
+        """Deterministic, no-chain-of-thought extraction per §4.2 (temp 0, thinking off,
+        JSON mode for guaranteed-valid JSON)."""
+        body: dict[str, Any] = {
+            "model": self.llm_model,
+            "messages": messages,
+            "temperature": temperature,
+            "chat_template_kwargs": {"enable_thinking": enable_thinking},
+        }
+        if json_object:
+            body["response_format"] = {"type": "json_object"}
+        if max_tokens:
+            body["max_tokens"] = max_tokens
+        return self._post("/v1/chat/completions", json=body)
+
+    # ---------- embeddings / rerank / hybrid search (§4.3) ----------
+    def embed(self, inputs: list[str]) -> Any:
+        """Embed DISTILLED PROPOSITIONS, not raw chunks (§4.3)."""
+        return self._post("/v1/embeddings", json={"model": self.embed_model, "input": inputs})
+
+    def rerank(self, query: str, documents: list[str], *, top_n: int | None = None) -> Any:
+        body: dict[str, Any] = {"query": query, "documents": documents}
+        if top_n:
+            body["top_n"] = top_n
+        return self._post("/v1/rerank", json=body)
+
+    def search(
+        self,
+        query: str,
+        *,
+        collection: str,
+        top_k: int = 10,
+        retrieve_n: int | None = None,
+        rerank: bool = True,
+        filter: dict[str, Any] | None = None,
+        with_payload: bool = True,
+        min_score: float | None = None,
+        dense_vector_name: str = "bge_m3",
+        sparse_vector_name: str = "bm25",
+        text_field: str = "proposition",
+    ) -> Any:
+        """Hybrid dense+sparse retrieval (RRF) + optional rerank over a Qdrant collection (§4.3).
+        The gateway defaults vector names to 'dense'/'sparse'; our `propositions` collection uses
+        named vectors bge_m3/bm25, so they must be passed explicitly (confirmed live)."""
+        body: dict[str, Any] = {
+            "query": query, "collection": collection, "top_k": top_k,
+            "rerank": rerank, "with_payload": with_payload,
+            "dense_vector_name": dense_vector_name,
+            "sparse_vector_name": sparse_vector_name,
+            "text_field": text_field,
+        }
+        if retrieve_n is not None:
+            body["retrieve_n"] = retrieve_n
+        if filter is not None:
+            body["filter"] = filter
+        if min_score is not None:
+            body["min_score"] = min_score
+        return self._post("/api/search", json=body)
+
+    # ---------- audio: capped at 2 in-flight GLOBAL (semaphore), short busy-retry ----------
+    # backoff=1.5 → ~1.5/3/6/12/24s: tuned to ride out the 1-4s busy-blips, not the old 5-40s.
+    def transcribe(self, audio_path: str | Path, *, response_format: str = "verbose_json") -> Any:
+        with _AUDIO_SEM, open(audio_path, "rb") as f:
+            return self._post(
+                "/v1/audio/transcriptions",
+                files={"file": f},
+                data={"model": self.transcribe_model, "response_format": response_format},
+                retries=5, backoff=1.5,
+            )
+
+    def diarize_chunk(self, audio_path: str | Path) -> Any:
+        # TODO(contract): confirm /api/audio/diarize-chunk response shape (segments + 192-d voiceprint).
+        with _AUDIO_SEM, open(audio_path, "rb") as f:
+            return self._post("/api/audio/diarize-chunk", files={"file": f}, retries=5, backoff=1.5)
+
+    def transcribe_with_speakers(self, audio_path: str | Path) -> Any:
+        with _AUDIO_SEM, open(audio_path, "rb") as f:
+            return self._post("/api/audio/transcribe-with-speakers", files={"file": f}, retries=5, backoff=1.5)
+
+    # ---------- frontier sovereignty boundary (§4.6) ----------
+    # Confirmed contract (gateway /openapi.json):
+    #   /scrub:     task_id*, items*, known_entities, actor, tier1_action, bucket, ner, map_handle
+    #   /rehydrate: task_id*, map_handle*, items*, actor, strict
+    # De-identifies IDENTITIES into stable placeholders; the de-anon map stays on the box and is
+    # referenced by `map_handle`. Exposure/position data must NEVER be sent here at all (§4.6).
+    def scrub(
+        self,
+        items: list[Any],
+        *,
+        task_id: str,
+        known_entities: dict[str, str] | None = None,
+        actor: str | None = None,
+        ner: bool = True,
+    ) -> Any:
+        """Returns the scrubbed items + a `map_handle` to pass to rehydrate. `known_entities` is the
+        caller-supplied dictionary (Strike→[FUND_1]); `ner` toggles the local-Qwen NER backstop."""
+        body: dict[str, Any] = {"task_id": task_id, "items": items, "ner": ner}
+        if known_entities is not None:
+            body["known_entities"] = known_entities
+        if actor is not None:
+            body["actor"] = actor
+        return self._post("/scrub", json=body)
+
+    def rehydrate(self, items: list[Any], *, task_id: str, map_handle: str, strict: bool = False) -> Any:
+        """Restore real identities in the frontier's output locally, using the scrub `map_handle`."""
+        return self._post("/rehydrate", json={
+            "task_id": task_id, "map_handle": map_handle, "items": items, "strict": strict,
+        })
+
+
+def from_config(cfg: Any) -> SparkControl:
+    return SparkControl(
+        cfg.spark_control_url,
+        verify_tls=cfg.spark_verify_tls,
+        timeout=cfg.spark_timeout_s,
+        llm_model=cfg.local_llm_model,
+        embed_model=cfg.embed_model,
+        transcribe_model=cfg.transcribe_model,
+        audio_concurrency=getattr(cfg, "audio_concurrency", 2),
+    )
@@ -0,0 +1,4 @@
+"""Persistence layer: SQLite (metadata, ledger, conviction log, graph, queue).
+
+Qdrant (vectors) is reached via the Spark Control gateway; see signal_engine.spark.
+"""
@@ -0,0 +1,81 @@
+"""SQLite connection + schema initialization. Boring and inspectable (§5)."""
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+
+SCHEMA_FILE = Path(__file__).with_name("schema.sql")
+
+
+def connect(db_path: Path) -> sqlite3.Connection:
+    db_path = Path(db_path)
+    db_path.parent.mkdir(parents=True, exist_ok=True)
+    conn = sqlite3.connect(str(db_path), timeout=30)
+    conn.row_factory = sqlite3.Row
+    conn.execute("PRAGMA foreign_keys = ON")
+    conn.execute("PRAGMA busy_timeout = 30000")  # wait, don't fail, under concurrent backfill writers
+    return conn
+
+
+# Additive migrations for DBs created before a column existed (CREATE IF NOT EXISTS won't add columns).
+_MIGRATIONS = {
+    "documents": {"content_hash": "TEXT", "processed_at": "TEXT", "dedup_key": "TEXT"},
+    # DESIGN_v2.1 condition 1: own_network = the Ten31 orbit (Odell/Bent partners etc.) — listening to
+    # ourselves. Quarantined: a TEST FIXTURE for the reflexivity case, DROPPED in live EISC scoring.
+    "sources": {"backtest_2022_2023": "TEXT", "own_network": "INTEGER"},
+    # DESIGN_v2.1: tag derivatives by distance-from-edge for TRIAGE — surfaced, NEVER used as a filter
+    # (an engine that pre-filters to in-mandate reproduces the AI/compute mandate-expansion miss).
+    "fanout_nodes": {"distance_from_edge": "TEXT"},
+}
+
+
+def _widen_cluster_check(conn: sqlite3.Connection) -> None:
+    """Add 'banks'/'credit'/'fintech' to sources.source_cluster's CHECK. SQLite can't ALTER a CHECK, so
+    rebuild the (tiny) table via the standard table-swap. Idempotent: no-op once already widened. Toggles
+    foreign_keys OFF around the swap (DROP would otherwise fail on inbound FKs); data copied by value so
+    referential integrity holds. busy_timeout (set in connect) lets it wait out concurrent backfill writers."""
+    import re
+    row = conn.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name='sources'").fetchone()
+    if not row or "'banks'" in row[0]:
+        return
+    new_list = ("('macro','ai_tech','energy','bitcoin','vc_consensus','generalist',"
+                "'banks','credit','fintech')")
+    new_ddl = re.sub(r"source_cluster IN\s*\([^)]*\)", f"source_cluster IN {new_list}", row[0], count=1)
+    new_ddl = new_ddl.replace("CREATE TABLE sources", "CREATE TABLE sources_new", 1)
+    conn.commit()                              # close any implicit txn before toggling FK pragma
+    conn.execute("PRAGMA foreign_keys=OFF")
+    try:
+        conn.execute(new_ddl)
+        conn.execute("INSERT INTO sources_new SELECT * FROM sources")
+        conn.execute("DROP TABLE sources")
+        conn.execute("ALTER TABLE sources_new RENAME TO sources")
+        conn.commit()
+    finally:
+        conn.execute("PRAGMA foreign_keys=ON")
+
+
+def _migrate(conn: sqlite3.Connection) -> None:
+    for table, cols in _MIGRATIONS.items():
+        existing = {r[1] for r in conn.execute(f"PRAGMA table_info({table})")}
+        for col, typ in cols.items():
+            if col not in existing:
+                conn.execute(f"ALTER TABLE {table} ADD COLUMN {col} {typ}")
+    # indexes on migrated columns (created here so they work on DBs predating the column)
+    conn.execute("CREATE INDEX IF NOT EXISTS idx_documents_content_hash ON documents(content_hash)")
+    conn.execute("CREATE INDEX IF NOT EXISTS idx_documents_dedup_key ON documents(dedup_key)")
+    conn.commit()
+    _widen_cluster_check(conn)
+
+
+def init_db(conn: sqlite3.Connection) -> None:
+    """Idempotent: CREATE ... IF NOT EXISTS + additive column migrations."""
+    conn.executescript(SCHEMA_FILE.read_text())
+    conn.commit()
+    _migrate(conn)
+
+
+def table_names(conn: sqlite3.Connection) -> list[str]:
+    rows = conn.execute(
+        "SELECT name FROM sqlite_master WHERE type IN ('table','view') ORDER BY name"
+    ).fetchall()
+    return [r[0] for r in rows]
@@ -0,0 +1,280 @@
+-- Ten31 Signal Engine — SQLite schema (pilot)
+-- Source of truth: ten31-signal-engine-handoff.md  §4 (pipeline layers), §6.7 (ledger),
+--   §3.1 (conviction log), §13.4 (backfill queue).
+-- Design principle (§5, §10): boring, inspectable tables. The whole system state is a SELECT away.
+
+PRAGMA journal_mode = WAL;
+PRAGMA foreign_keys = ON;
+
+-- ============================================================================
+-- CANONICAL TOPIC VOCABULARY (§4.2) — HYBRID (operator decision):
+--   seeded controlled list + emergent topics merged in on a schedule.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS topics (
+  topic_canonical TEXT PRIMARY KEY,
+  status          TEXT CHECK (status IN ('controlled','emergent','merged')) DEFAULT 'emergent',
+  merged_into     TEXT REFERENCES topics(topic_canonical),
+  seam            TEXT,
+  created_at      TEXT DEFAULT (datetime('now'))
+);
+
+-- ============================================================================
+-- SOURCES & DOCUMENTS (§4.1)
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS sources (
+  source_id          TEXT PRIMARY KEY,
+  name               TEXT NOT NULL,
+  kind               TEXT NOT NULL CHECK (kind IN ('podcast','youtube','filing','earnings_call')),
+  source_cluster     TEXT CHECK (source_cluster IN
+                       ('macro','ai_tech','energy','bitcoin','vc_consensus','generalist','banks','credit','fintech')),
+  role               TEXT CHECK (role IN ('CB','IND','DX','none')) DEFAULT 'none',  -- §7.4
+  rss_url            TEXT,
+  channel_url        TEXT,
+  ticker             TEXT,
+  -- §8 credibility: neutral prior that DECAYS in favor of earned track record from the ledger.
+  bootstrap_prior    REAL DEFAULT 1.0,
+  earned_credibility REAL,
+  cluster_capped_low INTEGER DEFAULT 0,   -- §4.5 bitcoin cluster deliberately under-weighted
+  backtest_2022_2023 TEXT,                -- §7.1 reach: rss_full | rss_2023_only | youtube_only | launched_later | unavailable
+  notes              TEXT,
+  created_at         TEXT DEFAULT (datetime('now')),
+  -- Written by the source-registry loader (1/0). Declared LAST so a fresh DB has the same column
+  -- order as an older DB where db._migrate appended it via ALTER TABLE ... ADD COLUMN.
+  own_network        INTEGER              -- 1 = Ten31-orbit source (reflexivity test fixture)
+);
+
+CREATE TABLE IF NOT EXISTS documents (
+  doc_id          TEXT PRIMARY KEY,
+  source_id       TEXT NOT NULL REFERENCES sources(source_id),
+  kind            TEXT NOT NULL,        -- podcast|youtube|filing|earnings_call
+  external_id     TEXT,                 -- rss guid / yt video id / EDGAR accession / transcript id
+  url             TEXT,
+  title           TEXT,
+  date            TEXT,                 -- ISO publication/filing date
+  duration_sec    REAL,
+  raw_path        TEXT,                 -- downloaded audio / raw filing
+  transcript_path TEXT,
+  -- DEDUP MODEL (layered):
+  --   (1) UNIQUE(source_id, external_id) below = the ROBUST guard. external_id is the stable item id
+  --       (RSS GUID / YouTube video id / EDGAR accession). Checked at ingest, BEFORE any GPU work.
+  --   (2) dedup_key = normalized title+date → catches the SAME episode arriving via a different
+  --       feed/mirror (different external_id). Computed pre-transcription. NOT from the transcript.
+  --   content_hash is ONLY an audit fingerprint of the transcript (did a re-run change?) — it is NOT
+  --       a dedup key (ASR is non-deterministic, so one differing word flips the hash).
+  dedup_key       TEXT,
+  content_hash    TEXT,
+  processed_at    TEXT,                 -- set when transcription/extraction completes
+  ingested_at     TEXT DEFAULT (datetime('now')),
+  UNIQUE (source_id, external_id)       -- idempotent ingest (§13.4 dedup)
+);
+-- indexes for dedup_key / content_hash are created in db._migrate (after columns exist on older DBs).
+
+-- ============================================================================
+-- CLAIMS / PROPOSITIONS (§4.2) — the atomic unit of the whole system.
+-- One passage emits 0..N claims; MOST of a podcast hour is 0 (§4.2). The
+-- extractor must be willing to find nothing.
+-- NOTE: thesis_seam is a TAG, never a hard filter (§5.7) — off-thesis &
+--   anti-thesis claims MUST survive.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS claims (
+  claim_id            TEXT PRIMARY KEY,
+  doc_id              TEXT NOT NULL REFERENCES documents(doc_id),
+  source_id           TEXT NOT NULL REFERENCES sources(source_id),
+  proposition         TEXT NOT NULL,    -- normalized subject-assertion-object
+  topic_canonical     TEXT REFERENCES topics(topic_canonical),
+  topic_raw           TEXT,
+  claimant            TEXT,
+  source_cluster      TEXT,
+  date                TEXT,
+  claim_type          TEXT CHECK (claim_type IN ('interpretive','predictive','descriptive','reactive')),
+  time_horizon        TEXT CHECK (time_horizon IN ('near','medium','long','unspecified')),
+  confidence          TEXT CHECK (confidence IN ('low','med','high')),
+  -- §4.2 relation: stance is EXTRACTED, never inferred from vector distance (§2.2/§5.3).
+  rel_target_claim_id TEXT REFERENCES claims(claim_id),
+  rel_polarity        TEXT CHECK (rel_polarity IN ('affirms','denies','qualifies','none')) DEFAULT 'none',
+  engages_consensus   INTEGER DEFAULT 0,
+  counters_position   TEXT,
+  thesis_seam         TEXT CHECK (thesis_seam IN
+                        ('energy_compute','debasement_bitcoin','ai_data_ownership','none')) DEFAULT 'none',
+  salience            TEXT CHECK (salience IN ('central','secondary','aside')) DEFAULT 'secondary',
+  qdrant_point_id     TEXT,             -- link to the embedded proposition vector (§4.3)
+  extracted_at        TEXT DEFAULT (datetime('now'))
+);
+CREATE INDEX IF NOT EXISTS idx_claims_topic ON claims(topic_canonical);
+CREATE INDEX IF NOT EXISTS idx_claims_date  ON claims(date);
+CREATE INDEX IF NOT EXISTS idx_claims_seam  ON claims(thesis_seam);
+CREATE INDEX IF NOT EXISTS idx_claims_type  ON claims(claim_type);
+
+-- ============================================================================
+-- SOURCE-INDEPENDENCE GRAPH (§4.5) — discount convergence by connectedness.
+-- Cross-cluster convergence = gold; within-cluster = near-noise.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS source_edges (
+  src_a      TEXT NOT NULL REFERENCES sources(source_id),
+  src_b      TEXT NOT NULL REFERENCES sources(source_id),
+  edge_type  TEXT NOT NULL CHECK (edge_type IN ('shared_guest','citation','community')),
+  weight     REAL DEFAULT 1.0,
+  evidence   TEXT,        -- voiceprint_id / show-note ref / url
+  updated_at TEXT DEFAULT (datetime('now')),
+  PRIMARY KEY (src_a, src_b, edge_type)
+);
+
+-- ============================================================================
+-- VOICEPRINT LIBRARY (§4.5, §4.1) — same-guest-across-shows BY VOICE.
+-- 192-dim TitaNet voiceprints; cosine ~0.7 distance threshold for same speaker.
+-- This is the highest-leverage automated input to the independence graph.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS voiceprints (
+  voiceprint_id TEXT PRIMARY KEY,
+  vector        BLOB NOT NULL,         -- 192 x float32
+  person_label  TEXT,                  -- resolved name if known
+  first_doc_id  TEXT REFERENCES documents(doc_id),
+  first_seen    TEXT DEFAULT (datetime('now'))
+);
+CREATE TABLE IF NOT EXISTS voiceprint_observations (
+  obs_id        INTEGER PRIMARY KEY AUTOINCREMENT,
+  voiceprint_id TEXT NOT NULL REFERENCES voiceprints(voiceprint_id),
+  doc_id        TEXT NOT NULL REFERENCES documents(doc_id),
+  chunk_idx     INTEGER,
+  segment_start REAL,
+  segment_end   REAL
+);
+
+-- ============================================================================
+-- CONVICTION LOG (§3.1) — human-owned seed nodes for Job B.
+-- Structural rule (§3.1): separate the TRACKABLE thematic proposition (corpus
+--   can corroborate) from TEAM conviction (context only). The engine must NEVER
+--   present theme corroboration as validation of the team bet beneath it.
+-- Exposure scored as coarse NAV bands (operator decision): none | lt2 | 2to10 | gt10 | unset.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS conviction_log (
+  conviction_id        TEXT PRIMARY KEY,            -- R1, E1, A1, B1 ...
+  seam                 TEXT,                        -- root|energy_compute|debasement_bitcoin|ai_data_ownership
+  thematic_proposition TEXT NOT NULL,               -- the TRACKABLE half
+  team_conviction_note TEXT,                         -- context ONLY, never scored as theme validation
+  conviction_level     TEXT CHECK (conviction_level IN ('low','med','med-high','high')),
+  current_exposure     TEXT CHECK (current_exposure IN ('none','lt2','2to10','gt10','unset')) DEFAULT 'unset',
+  exposure_note        TEXT,                         -- original §3.1 prose ("pervasive", "MED-HIGH") pending NAV-band finalization
+  disconfirming_signal TEXT,
+  is_thesis_breaker    INTEGER DEFAULT 0,            -- §3.1 B1-B3: engine must surface these AGAINST the thesis (§5.7)
+  updated_at           TEXT DEFAULT (datetime('now'))
+);
+
+-- Conviction fan-out tree (§4.6). A derivative is a HYPOTHESIS until independent
+-- corpus corroboration AND the exposure gap both clear the bar — then 'signal'.
+CREATE TABLE IF NOT EXISTS fanout_nodes (
+  node_id                TEXT PRIMARY KEY,
+  parent_conviction_id   TEXT REFERENCES conviction_log(conviction_id),
+  parent_node_id         TEXT REFERENCES fanout_nodes(node_id),
+  derivative_proposition TEXT NOT NULL,
+  depth                  INTEGER DEFAULT 1,
+  status                 TEXT CHECK (status IN ('hypothesis','corroborated','signal')) DEFAULT 'hypothesis',
+  created_at             TEXT DEFAULT (datetime('now')),
+  -- Triage tag written by the fan-out seed loader: surfaced, NEVER used as a filter. Declared
+  -- LAST so a fresh DB matches the column order of DBs where db._migrate appended it.
+  distance_from_edge     TEXT
+);
+
+-- ============================================================================
+-- DUAL-EVALUATION LEDGER (§4.7, §6) — START DAY ONE; the clock can't be backfilled.
+-- Log EVERY candidate that clears the quantitative bar (§6.6 — you need a denominator).
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS ledger (
+  signal_id            TEXT PRIMARY KEY,
+  type                 TEXT NOT NULL CHECK (type IN ('theme','event','under_acted_conviction')),
+  proposition          TEXT NOT NULL,
+  date_logged          TEXT NOT NULL DEFAULT (datetime('now')),
+  discourse_metric     TEXT,           -- JSON: acceleration, cross-cluster source set, independence-discounted count
+  external_check       TEXT,           -- JSON: resolution spec / nested clean events the model proposed (§6.5)
+  resolution_date      TEXT,
+  discourse_outcome    TEXT CHECK (discourse_outcome IN
+                         ('up_cross_cluster','up_single_cluster','flat','down')),
+  external_outcome     TEXT CHECK (external_outcome IN
+                         ('correct','partial','wrong','unresolved_expired','too_early')),
+  lead_time_days       INTEGER,        -- §6.3 THE alpha measurement (to the DERIVATIVE node for Job B)
+  model_confidence     REAL,           -- §6.7 logged ONLY to measure its uselessness — NEVER fed into scoring
+  origin_conviction_id TEXT REFERENCES conviction_log(conviction_id),  -- Job B traceability
+  origin_node_id       TEXT REFERENCES fanout_nodes(node_id)
+);
+CREATE INDEX IF NOT EXISTS idx_ledger_type   ON ledger(type);
+CREATE INDEX IF NOT EXISTS idx_ledger_logged ON ledger(date_logged);
+
+-- Human eval on a SEPARATE write path (§6.7): "keep them in separate columns and do not let the
+-- model see Grant's rating before it logs its prediction." The model-facing code reads `ledger`;
+-- ONLY the eval UI writes here. A separate table makes that separation structural, not a convention.
+CREATE TABLE IF NOT EXISTS human_evaluations (
+  signal_id    TEXT PRIMARY KEY REFERENCES ledger(signal_id),
+  grant_rating INTEGER,               -- "non-obvious and relevant to me?" (e.g. 1-5)
+  non_obvious  INTEGER,               -- 0/1
+  notes        TEXT,
+  rated_at     TEXT DEFAULT (datetime('now'))
+);
+
+-- Reporting view — the valuable cell is DISAGREEMENT (§6.7). Used for analysis, NOT by the model path.
+CREATE VIEW IF NOT EXISTS v_ledger_eval AS
+  SELECT l.*, h.grant_rating, h.non_obvious, h.notes AS grant_notes, h.rated_at
+  FROM ledger l LEFT JOIN human_evaluations h ON h.signal_id = l.signal_id;
+
+-- ============================================================================
+-- BACKFILL QUEUE (§13.4) — client-side, measured in GPU-HOURS.
+-- Extraction (one LLM pass per chunk over the whole corpus) is the HEAVIER serial load.
+-- Audio is SEQUENTIAL (parallel → 503). Leases give crash-safe resumability.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS backfill_jobs (
+  job_id           INTEGER PRIMARY KEY AUTOINCREMENT,
+  job_type         TEXT NOT NULL CHECK (job_type IN ('transcribe','diarize','extract','embed')),
+  target_id        TEXT NOT NULL,        -- doc_id or chunk id
+  parent_doc_id    TEXT,
+  state            TEXT NOT NULL CHECK (state IN
+                     ('pending','leased','running','done','failed','skipped')) DEFAULT 'pending',
+  priority         INTEGER DEFAULT 100,  -- lower = sooner (backtest corpus jumps the queue, §7.1)
+  attempts         INTEGER DEFAULT 0,
+  max_attempts     INTEGER DEFAULT 5,
+  lease_owner      TEXT,
+  lease_expires_at TEXT,
+  input_hash       TEXT NOT NULL,        -- hash(content + model/prompt version) — idempotency
+  output_ref       TEXT,
+  gpu_seconds      REAL,                 -- measured per job → self-calibrating GPU-hours estimate
+  error            TEXT,
+  created_at       TEXT DEFAULT (datetime('now')),
+  updated_at       TEXT DEFAULT (datetime('now')),
+  UNIQUE (job_type, input_hash)
+);
+CREATE INDEX IF NOT EXISTS idx_jobs_state_priority ON backfill_jobs(state, priority, job_id);
+
+-- ============================================================================
+-- SCORING BRAIN state (the "brain", build blueprint). Candidate state lands here +
+-- ledger + fanout_nodes.status; existing tables unchanged.
+-- ============================================================================
+
+-- Temporal layer: one row per (topic, as_of, window). 28d non-overlapping windows.
+CREATE TABLE IF NOT EXISTS topic_window_stats (
+  topic_canonical TEXT NOT NULL,
+  as_of           TEXT NOT NULL,
+  window_idx      INTEGER NOT NULL,          -- 0 = window ending at as_of, 1 = prior, 2 = baseline
+  window_start    TEXT NOT NULL,
+  window_end      TEXT NOT NULL,
+  n_interp_pred   INTEGER NOT NULL DEFAULT 0,
+  n_descr_react   INTEGER NOT NULL DEFAULT 0,
+  n_distinct_src  INTEGER NOT NULL DEFAULT 0,
+  n_distinct_clu  INTEGER NOT NULL DEFAULT 0,
+  PRIMARY KEY (topic_canonical, as_of, window_idx)
+);
+
+-- Audit trail: one row per (scorer, key, as_of). Deterministic score_id → re-run reproduces.
+CREATE TABLE IF NOT EXISTS candidate_scores (
+  score_id        TEXT PRIMARY KEY,
+  scorer          TEXT NOT NULL,             -- emergence|contrarian|intersection|convergence|under_acted
+  as_of           TEXT NOT NULL,
+  topic_canonical TEXT,
+  node_id         TEXT,
+  conviction_id   TEXT,
+  score           REAL NOT NULL,
+  cleared_evidence_bar  INTEGER NOT NULL DEFAULT 0,   -- tier 1: logged to ledger (the denominator)
+  cleared_promotion_bar INTEGER NOT NULL DEFAULT 0,   -- tier 2: sent to frontier judge
+  inputs_json     TEXT NOT NULL,             -- every term that produced the score (full audit)
+  computed_at     TEXT DEFAULT (datetime('now'))
+);
+CREATE INDEX IF NOT EXISTS idx_cs_asof ON candidate_scores(scorer, as_of, cleared_promotion_bar);
+
+-- Tunable bar config so the backtest can sweep thresholds without code edits.
+CREATE TABLE IF NOT EXISTS score_thresholds (
+  scorer     TEXT PRIMARY KEY,
+  min_score  REAL,
+  gates_json TEXT,
+  version    TEXT
+);
@@ -0,0 +1,74 @@
+"""Load human-owned seed data (conviction log, §3.1) into SQLite.
+
+The conviction log is the highest-leverage Job B input (§3.1) and is HUMAN-OWNED:
+Grant edits the YAML seed files; this loader upserts them. Re-running is idempotent.
+"""
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+_CONVICTION_COLS = (
+    "conviction_id",
+    "seam",
+    "thematic_proposition",
+    "team_conviction_note",
+    "conviction_level",
+    "current_exposure",
+    "exposure_note",
+    "disconfirming_signal",
+    "is_thesis_breaker",
+)
+
+
+def _row(c: dict[str, Any]) -> dict[str, Any]:
+    return {
+        "conviction_id": c["id"],
+        "seam": c.get("seam"),
+        "thematic_proposition": c["thematic_proposition"],
+        "team_conviction_note": c.get("team_conviction_note"),
+        "conviction_level": c.get("conviction_level"),
+        "current_exposure": c.get("current_exposure", "unset"),
+        "exposure_note": c.get("exposure_note"),
+        "disconfirming_signal": c.get("disconfirming_signal"),
+        "is_thesis_breaker": 1 if c.get("is_thesis_breaker") else 0,
+    }
+
+
+def load_fanout(conn: sqlite3.Connection, path: Path) -> int:
+    """Load a hand-written fan-out tree (§7.1 backtest). Idempotent on node_id."""
+    data = yaml.safe_load(Path(path).read_text()) or {}
+    parent = data["parent_conviction_id"]
+    nodes = data.get("nodes", [])
+    for n in nodes:
+        conn.execute(
+            """INSERT INTO fanout_nodes
+                 (node_id, parent_conviction_id, derivative_proposition, depth, status, distance_from_edge)
+               VALUES (?,?,?,?, 'hypothesis', ?)
+               ON CONFLICT(node_id) DO UPDATE SET derivative_proposition=excluded.derivative_proposition,
+                 parent_conviction_id=excluded.parent_conviction_id,
+                 distance_from_edge=excluded.distance_from_edge""",
+            (n["node_id"], parent, n["derivative_proposition"], n.get("depth", 1), n.get("distance_from_edge")),
+        )
+    conn.commit()
+    return len(nodes)
+
+
+def load_convictions(conn: sqlite3.Connection, path: Path) -> int:
+    data = yaml.safe_load(Path(path).read_text()) or {}
+    rows = data.get("convictions", [])
+    cols = ", ".join(_CONVICTION_COLS)
+    placeholders = ", ".join(f":{c}" for c in _CONVICTION_COLS)
+    updates = ", ".join(f"{c}=excluded.{c}" for c in _CONVICTION_COLS if c != "conviction_id")
+    sql = (
+        f"INSERT INTO conviction_log ({cols}, updated_at) "
+        f"VALUES ({placeholders}, datetime('now')) "
+        f"ON CONFLICT(conviction_id) DO UPDATE SET {updates}, updated_at=datetime('now')"
+    )
+    for c in rows:
+        conn.execute(sql, _row(c))
+    conn.commit()
+    return len(rows)
@@ -0,0 +1,90 @@
+"""Load the source registry (companies + podcasts, §7.3/§7.4) into SQLite. Idempotent upsert."""
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+# Column list for the `sources` upsert in load_sources(). Each name is used both as the SQL
+# column and as the named parameter (:name), so it must match a key of the dict built by _row().
+_COLS = ("source_id", "name", "kind", "source_cluster", "role", "rss_url",
+         "channel_url", "ticker", "cluster_capped_low", "own_network", "backtest_2022_2023", "notes")
+
+
+def _row(s: dict[str, Any]) -> dict[str, Any]:
+    """Map one registry entry from the YAML onto the named parameters of the `sources` upsert.
+
+    `id`, `name` and `kind` are required (KeyError if absent); every other field is optional.
+    `role` defaults to 'none' and the two flag fields are coerced to 0/1.
+    """
+    row: dict[str, Any] = {"source_id": s["id"], "name": s["name"], "kind": s["kind"]}
+    row["source_cluster"] = s.get("cluster")  # the YAML key is `cluster`, the column is `source_cluster`
+    row["role"] = s.get("role", "none")
+    for key in ("rss_url", "channel_url", "ticker"):
+        row[key] = s.get(key)
+    for flag in ("cluster_capped_low", "own_network"):
+        row[flag] = int(bool(s.get(flag)))
+    for key in ("backtest_2022_2023", "notes"):
+        row[key] = s.get(key)
+    return row
+
+
+def update_feeds(conn: sqlite3.Connection, path: Path) -> int:
+    """Apply resolved/verified podcast feed URLs + backtest-reach to existing source rows.
+
+    Adds the ``backtest_2022_2023`` column to ``sources`` if it is missing, then runs one
+    UPDATE per entry under ``feeds`` in the YAML file, matched on ``id``. Entries whose id has
+    no matching row update nothing.
+
+    Returns the number of feed entries in the file, not the number of rows actually changed.
+    Raises sqlite3.OperationalError for any schema failure other than the column already existing.
+    """
+    try:
+        conn.execute("ALTER TABLE sources ADD COLUMN backtest_2022_2023 TEXT")
+        conn.commit()
+    except sqlite3.OperationalError as exc:
+        # "duplicate column name" is the only expected failure (the migration already ran).
+        # Anything else — missing table, locked database — is a real error and must surface.
+        if "duplicate column name" not in str(exc).lower():
+            raise
+    # Decode explicitly as UTF-8 rather than with the locale's default encoding.
+    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
+    # A bare `feeds:` key parses to None, so the .get() default alone is not enough.
+    rows = data.get("feeds") or []
+    for f in rows:
+        # NOTE(review): rss_url, channel_url and backtest_2022_2023 are overwritten even when the
+        # feed entry omits them (they become NULL); only notes is preserved via COALESCE.
+        # Confirm that clearing is intended.
+        conn.execute(
+            """UPDATE sources
+                 SET rss_url=:rss_url, channel_url=:youtube_channel_url,
+                     backtest_2022_2023=:backtest_2022_2023, notes=COALESCE(:note, notes)
+               WHERE source_id=:id""",
+            {
+                "id": f["id"], "rss_url": f.get("rss_url"),
+                "youtube_channel_url": f.get("youtube_channel_url"),
+                "backtest_2022_2023": f.get("backtest_2022_2023"), "note": f.get("note"),
+            },
+        )
+    conn.commit()
+    return len(rows)
+
+
+def load_source_edges(conn: sqlite3.Connection, path: Path) -> int:
+    """Seed EISC connectedness edges (priors) idempotently. Stores src_a,src_b in sorted order to
+    match the transcribe_worker's convention (sorted([a,b]) + ON CONFLICT weight+=1) so real detections
+    accumulate on the same PK instead of creating a reversed duplicate. DO NOTHING on conflict → a
+    re-run won't inflate, and won't clobber a stronger auto-detected weight.
+
+    Returns the number of edges newly inserted (edges that already existed are not counted);
+    0 for an empty file or an empty ``edges:`` key.
+    """
+    # Decode explicitly as UTF-8 rather than with the locale's default encoding.
+    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
+    # A bare `edges:` key parses to None, so the .get() default alone is not enough.
+    rows = data.get("edges") or []
+    applied = 0
+    for e in rows:
+        a, b = sorted([e["a"], e["b"]])
+        cur = conn.execute(
+            """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
+               VALUES (?,?,?,?,?)
+               ON CONFLICT(src_a, src_b, edge_type) DO NOTHING""",
+            (a, b, e["type"], float(e.get("weight", 1.0)), e.get("evidence")),
+        )
+        # rowcount is 1 for a fresh insert and 0 when the conflict clause skipped the row.
+        applied += cur.rowcount
+    conn.commit()
+    return applied
+
+
+def load_sources(conn: sqlite3.Connection, path: Path) -> int:
+    """Upsert the source registry from a YAML file. Idempotent on source_id.
+
+    Each entry under ``sources`` is mapped by _row() onto the _COLS named parameters. On
+    conflict every column except source_id is overwritten; created_at is set on insert only.
+
+    Returns the number of entries in the file (0 for an empty file or an empty key).
+    """
+    # Decode explicitly as UTF-8 rather than with the locale's default encoding.
+    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
+    # A bare `sources:` key parses to None, so the .get() default alone is not enough.
+    rows = data.get("sources") or []
+    cols = ", ".join(_COLS)
+    placeholders = ", ".join(f":{c}" for c in _COLS)
+    updates = ", ".join(f"{c}=excluded.{c}" for c in _COLS if c != "source_id")
+    sql = (
+        f"INSERT INTO sources ({cols}, created_at) VALUES ({placeholders}, datetime('now')) "
+        f"ON CONFLICT(source_id) DO UPDATE SET {updates}"
+    )
+    for s in rows:
+        conn.execute(sql, _row(s))
+    conn.commit()
+    return len(rows)
@@ -0,0 +1,5 @@
+"""Web UI (FastAPI) — corpus management + (later) the human-eval rating interface (§4.7/§6.7).
+
+This is the app the StartOS s9pk exposes on its `ui` interface. Server-rendered HTML, no template
+engine / JS framework — boring and inspectable, like the rest of the system.
+"""
@@ -0,0 +1,179 @@
+"""Corpus-management web UI (FastAPI).
+
+Pages:
+  /              dashboard — corpus + pipeline counts at a glance
+  /corpus        full source selection (companies + podcasts) + "add source" form
+  /corpus/add    POST handler (manual urlencoded parse → no python-multipart dependency)
+  /source/{id}   per-source detail: documents + extracted claims (inspect the signal)
+"""
+from __future__ import annotations
+
+import html
+import re
+import sqlite3
+from urllib.parse import parse_qs
+
+from fastapi import FastAPI, Request
+from fastapi.responses import HTMLResponse, RedirectResponse
+
+from ..config import load_config
+from ..store import db
+
+# Stylesheet inlined into the <style> element of every page by _page(); no external assets.
+_CSS = """
+body{font:14px/1.5 -apple-system,Segoe UI,Roboto,sans-serif;margin:0;background:#0f1115;color:#e6e6e6}
+header{background:#161a22;padding:12px 20px;border-bottom:1px solid #2a2f3a}
+header a{color:#7aa2f7;text-decoration:none;margin-right:18px;font-weight:600}
+main{padding:20px;max-width:1100px;margin:0 auto}
+h1{font-size:20px}h2{font-size:16px;margin-top:28px;color:#9aa5b1}
+table{border-collapse:collapse;width:100%;margin:10px 0}
+th,td{text-align:left;padding:6px 10px;border-bottom:1px solid #232833;font-size:13px}
+th{color:#9aa5b1;font-weight:600}
+tr:hover td{background:#161a22}
+.tag{display:inline-block;padding:1px 7px;border-radius:10px;background:#232833;font-size:11px;color:#aab}
+.cards{display:flex;gap:14px;flex-wrap:wrap}
+.card{background:#161a22;border:1px solid #2a2f3a;border-radius:8px;padding:14px 18px;min-width:130px}
+.card .n{font-size:24px;font-weight:700;color:#7aa2f7}.card .l{color:#9aa5b1;font-size:12px}
+form{background:#161a22;border:1px solid #2a2f3a;border-radius:8px;padding:16px;margin:14px 0}
+label{display:block;margin:8px 0 2px;color:#9aa5b1;font-size:12px}
+input,select{background:#0f1115;border:1px solid #2a2f3a;color:#e6e6e6;border-radius:5px;padding:6px 8px;width:240px}
+button{background:#7aa2f7;color:#0f1115;border:0;border-radius:6px;padding:8px 16px;font-weight:700;cursor:pointer;margin-top:12px}
+a{color:#7aa2f7}.muted{color:#6b7280;font-size:12px}
+"""
+
+# Options offered by the <select> controls of the "Add to corpus" form on /corpus.
+# The first entry of each list is what the browser pre-selects.
+_CLUSTERS = ["macro", "ai_tech", "energy", "bitcoin", "vc_consensus", "generalist"]
+_KINDS = ["podcast", "youtube", "filing", "earnings_call"]
+_ROLES = ["none", "CB", "IND", "DX"]
+
+
+def _page(title: str, body: str) -> HTMLResponse:
+    """Wrap *body* in the shared HTML shell (inline stylesheet + navigation header).
+
+    *title* is HTML-escaped here. *body* is inserted verbatim, so callers must escape any
+    untrusted text they interpolate into it.
+    """
+    header = ('<header><a href="/">Dashboard</a><a href="/corpus">Corpus</a>'
+              '<span class="muted">Ten31 Signal Engine</span></header>')
+    pieces = [
+        "<!doctype html><html><head><meta charset=utf-8>",
+        f"<title>{html.escape(title)}</title>",
+        f"<style>{_CSS}</style></head>",
+        f"<body>{header}<main>{body}</main></body></html>",
+    ]
+    return HTMLResponse("".join(pieces))
+
+
+def _slug(s: str) -> str:
+    """Turn a display name into a lowercase id fragment of at most 40 characters.
+
+    Runs of characters outside a-z0-9 collapse to one hyphen and edge hyphens are stripped
+    before truncation. Falls back to "src" when nothing usable remains.
+    """
+    hyphenated = re.sub(r"[^a-z0-9]+", "-", s.lower())
+    fragment = hyphenated.strip("-")[:40]
+    if fragment:
+        return fragment
+    return "src"
+
+
+def create_app() -> FastAPI:
+    """Build the corpus-management FastAPI app (dashboard, corpus list/add, source detail).
+
+    Config is loaded once here. Every request opens its own SQLite connection and closes it
+    before the response is built, even when a query raises. All database-derived text is
+    HTML-escaped before it is interpolated into a page, because those values can originate
+    from the unauthenticated /corpus/add form or from extracted claim text.
+    """
+    cfg = load_config()
+    app = FastAPI(title="Ten31 Signal Engine")
+
+    def conn() -> sqlite3.Connection:
+        c = db.connect(cfg.db_path)
+        try:
+            db.init_db(c)
+        except Exception:
+            c.close()  # don't leak the handle if schema init fails
+            raise
+        return c
+
+    def esc(value: object) -> str:
+        # str() first so non-string column values (ints, None) render as before, just escaped.
+        return html.escape(str(value))
+
+    @app.get("/", response_class=HTMLResponse)
+    def dashboard() -> HTMLResponse:
+        c = conn()
+        try:
+            def scalar(q, *a):
+                r = c.execute(q, a).fetchone()
+                return r[0] if r else 0
+
+            def rows(q):
+                # First column is a label from the database (escaped); second is a COUNT(*).
+                return "".join(f"<tr><td>{esc(a)}</td><td>{b}</td></tr>" for a, b in c.execute(q))
+
+            cards = {
+                "Sources": scalar("SELECT COUNT(*) FROM sources"),
+                "Documents": scalar("SELECT COUNT(*) FROM documents"),
+                "Claims": scalar("SELECT COUNT(*) FROM claims"),
+                "Embedded": scalar("SELECT COUNT(*) FROM claims WHERE qdrant_point_id IS NOT NULL"),
+                "Convictions": scalar("SELECT COUNT(*) FROM conviction_log"),
+                "Ledger": scalar("SELECT COUNT(*) FROM ledger"),
+            }
+            # breakdowns
+            claims_by_type = rows("SELECT claim_type, COUNT(*) FROM claims GROUP BY claim_type ORDER BY 2 DESC")
+            claims_by_seam = rows("SELECT thesis_seam, COUNT(*) FROM claims GROUP BY thesis_seam ORDER BY 2 DESC")
+            queue = rows("SELECT job_type||' / '||state, COUNT(*) FROM backfill_jobs GROUP BY 1 ORDER BY 1")
+        finally:
+            c.close()
+        cards_html = "".join(f'<div class="card"><div class="n">{v}</div><div class="l">{k}</div></div>'
+                             for k, v in cards.items())
+        body = f"""<h1>Dashboard</h1><div class="cards">{cards_html}</div>
+        <h2>Claims by type</h2><table><tr><th>type</th><th>n</th></tr>{claims_by_type or '<tr><td class=muted colspan=2>none yet</td></tr>'}</table>
+        <h2>Claims by thesis seam</h2><table><tr><th>seam</th><th>n</th></tr>{claims_by_seam or '<tr><td class=muted colspan=2>none yet</td></tr>'}</table>
+        <h2>Backfill queue</h2><table><tr><th>type / state</th><th>n</th></tr>{queue or '<tr><td class=muted colspan=2>empty</td></tr>'}</table>"""
+        return _page("Dashboard", body)
+
+    @app.get("/corpus", response_class=HTMLResponse)
+    def corpus() -> HTMLResponse:
+        c = conn()
+        try:
+            srcs = c.execute("""
+                SELECT s.*,
+                  (SELECT COUNT(*) FROM documents d WHERE d.source_id=s.source_id) docs,
+                  (SELECT COUNT(*) FROM claims cl WHERE cl.source_id=s.source_id) claims
+                FROM sources s ORDER BY s.kind, s.source_id""").fetchall()
+        finally:
+            c.close()
+
+        def row(s):
+            extra = s["ticker"] or s["backtest_2022_2023"] or ""
+            # kind / cluster / role are stored as posted by /corpus/add, so escape them too.
+            return (f"<tr><td><a href='/source/{esc(s['source_id'])}'>{esc(s['name'])}</a></td>"
+                    f"<td><span class=tag>{esc(s['kind'])}</span></td><td>{esc(s['source_cluster'] or '')}</td>"
+                    f"<td>{esc(s['role'] or '')}</td><td>{esc(extra)}</td>"
+                    f"<td>{s['docs']}</td><td>{s['claims']}</td></tr>")
+
+        def options(xs):
+            return "".join(f"<option>{x}</option>" for x in xs)
+
+        table = "".join(row(s) for s in srcs)
+        form = f"""<form method=post action="/corpus/add">
+          <strong>Add to corpus</strong>
+          <label>Name</label><input name=name required placeholder="NVIDIA / Odd Lots">
+          <label>Kind</label><select name=kind>{options(_KINDS)}</select>
+          <label>Cluster</label><select name=cluster>{options(_CLUSTERS)}</select>
+          <label>Role</label><select name=role>{options(_ROLES)}</select>
+          <label>Ticker (companies)</label><input name=ticker placeholder="NVDA">
+          <label>RSS URL (podcasts)</label><input name=rss_url placeholder="https://...">
+          <label>YouTube channel</label><input name=channel_url placeholder="https://youtube.com/@...">
+          <button type=submit>Add source</button>
+        </form>"""
+        body = f"""<h1>Corpus ({len(srcs)} sources)</h1>{form}
+        <table><tr><th>name</th><th>kind</th><th>cluster</th><th>role</th><th>ticker / backtest</th><th>docs</th><th>claims</th></tr>{table}</table>"""
+        return _page("Corpus", body)
+
+    @app.post("/corpus/add")
+    async def corpus_add(request: Request):
+        # Manual urlencoded parse (no python-multipart dependency); blank fields are dropped.
+        raw = (await request.body()).decode()
+        f = {k: v[0].strip() for k, v in parse_qs(raw).items() if v and v[0].strip()}
+        name = f.get("name")
+        if not name:
+            return RedirectResponse("/corpus", status_code=303)
+        kind = f.get("kind", "podcast")
+        ticker = f.get("ticker")
+        # Companies are keyed by ticker; everything else by a kind prefix + slugged name.
+        sid = f"co-{ticker.lower()}" if ticker else f"{'pod' if kind in ('podcast','youtube') else kind}-{_slug(name)}"
+        c = conn()
+        try:
+            # INSERT OR IGNORE: re-adding an existing source_id leaves the stored row untouched.
+            c.execute("""INSERT OR IGNORE INTO sources
+                (source_id, name, kind, source_cluster, role, ticker, rss_url, channel_url)
+                VALUES (?,?,?,?,?,?,?,?)""",
+                (sid, name, kind, f.get("cluster"), f.get("role", "none"),
+                 ticker.upper() if ticker else None, f.get("rss_url"), f.get("channel_url")))
+            c.commit()
+        finally:
+            c.close()
+        return RedirectResponse("/corpus", status_code=303)
+
+    @app.get("/source/{source_id}", response_class=HTMLResponse)
+    def source_detail(source_id: str) -> HTMLResponse:
+        c = conn()
+        try:
+            s = c.execute("SELECT * FROM sources WHERE source_id=?", (source_id,)).fetchone()
+            claims = []
+            if s:
+                claims = c.execute("""SELECT proposition, claim_type, time_horizon, thesis_seam, topic_canonical,
+                                             engages_consensus, date FROM claims WHERE source_id=?
+                                      ORDER BY date DESC LIMIT 200""", (source_id,)).fetchall()
+        finally:
+            c.close()
+        if not s:
+            missing = _page("Not found", "<h1>Source not found</h1>")
+            missing.status_code = 404  # an unknown id is a client error, not a 200 page
+            return missing
+
+        def crow(cl):
+            star = " ⚔" if cl["engages_consensus"] else ""
+            return (f"<tr><td>{esc(cl['date'] or '')}</td><td><span class=tag>{esc(cl['claim_type'])}</span></td>"
+                    f"<td>{esc(cl['thesis_seam'])}</td><td>{esc(cl['topic_canonical'] or '')}</td>"
+                    f"<td>{esc(cl['proposition'])}{star}</td></tr>")
+
+        rows = "".join(crow(cl) for cl in claims) or '<tr><td class=muted colspan=5>no claims extracted yet</td></tr>'
+        meta = (f"<span class=tag>{esc(s['kind'])}</span> cluster={esc(s['source_cluster'] or '-')} "
+                f"role={esc(s['role'] or '-')}")
+        if s["ticker"]:
+            meta += f" ticker={esc(s['ticker'])}"
+        if s["backtest_2022_2023"]:
+            meta += f" · backtest={esc(s['backtest_2022_2023'])}"
+        body = f"""<h1>{html.escape(s['name'])}</h1><p>{meta}</p>
+        <p class=muted>{html.escape(s['notes'] or '')}</p>
+        <h2>Claims ({len(claims)}) <span class=muted>⚔ = engages consensus</span></h2>
+        <table><tr><th>date</th><th>type</th><th>seam</th><th>topic</th><th>proposition</th></tr>{rows}</table>"""
+        return _page(s["name"], body)
+
+    return app
@@ -0,0 +1,28 @@
+"""Small shared utilities (normalization, dedup keys)."""
+from __future__ import annotations
+
+import re
+
+# Trailing show/episode decoration: from a separator (| - – —) to end of string, when the text
+# after that separator mentions "podcast", "show" or "ep"/"episode" + digits (case-insensitive).
+_SHOW_SUFFIX = re.compile(r"\s*[|\-–—]\s*[^|\-–—]*(podcast|show|ep(isode)?\s*\d+).*$", re.I)
+# Leading episode number: "Ep 42:", "Episode 42 -", "#42:" or a bare "42 -" at the start of the title.
+_EP_PREFIX = re.compile(r"^\s*(ep(isode)?\.?\s*\d+\s*[:\-–]|#\s*\d+\s*[:\-–]|\d+\s*[:\-–])\s*", re.I)
+# Any run of characters outside a-z / 0-9; meant to be applied to already-lowercased text.
+_NONALNUM = re.compile(r"[^a-z0-9]+")
+
+
+def slugify(s: str, *, maxlen: int = 60) -> str:
+    """Lowercase *s*, collapse every non-alphanumeric run to a hyphen, strip edge hyphens and
+    cut to *maxlen* characters. A falsy *s*, or one with nothing usable in it, yields "x"."""
+    lowered = (s or "").lower()
+    slug = _NONALNUM.sub("-", lowered).strip("-")[:maxlen]
+    if not slug:
+        return "x"
+    return slug
+
+
+def normalize_title(title: str) -> str:
+    """Reduce an episode title to a comparison form so the SAME episode lines up across
+    feeds/mirrors despite cosmetic differences ('Ep 42: Foo' vs 'Foo | The Show').
+
+    The trailing show decoration is removed first, then the leading episode number; what is
+    left is lowercased and every non-alphanumeric run becomes a single space.
+    Best-effort — a safety net, not the primary key."""
+    text = title or ""
+    # Order matters: the suffix pattern runs before the prefix pattern.
+    for pattern in (_SHOW_SUFFIX, _EP_PREFIX):
+        text = pattern.sub("", text)
+    lowered = text.lower()
+    return _NONALNUM.sub(" ", lowered).strip()
+
+
+def audio_dedup_key(title: str | None, date: str | None) -> str:
+    """Build the cross-mirror dedup key for audio: "<normalized title>|<date>".
+
+    Computed BEFORE transcription so a duplicate episode (same content via a different
+    feed/mirror) is skipped without spending GPU. Deliberately NOT derived from the transcript:
+    ASR is non-deterministic, so a transcript hash would be brittle. Missing title or date
+    contributes an empty string on its side of the separator."""
+    title_part = normalize_title(title or "")
+    date_part = date or ""
+    return "{}|{}".format(title_part, date_part)
				`@@ -0,0 +1 @@`
				`"""Client-side backfill queue (§13.4). Producers enqueue; ONE worker drains sequentially."""`