Initial commit: Ten31 Signal Engine (ingest, scoring brain, corpus seeds)
This commit is contained in:
@@ -0,0 +1,4 @@
|
||||
"""Persistence layer: SQLite (metadata, ledger, conviction log, graph, queue).
|
||||
|
||||
Qdrant (vectors) is reached via the Spark Control gateway; see signal_engine.spark.
|
||||
"""
|
||||
@@ -0,0 +1,81 @@
|
||||
"""SQLite connection + schema initialization. Boring and inspectable (§5)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
# Path of schema.sql, resolved as a sibling of this module; read by init_db().
SCHEMA_FILE = Path(__file__).with_name("schema.sql")
|
||||
|
||||
|
||||
def connect(db_path: Path) -> sqlite3.Connection:
    """Open the SQLite database at ``db_path`` and return a configured connection.

    Missing parent directories are created first. The returned connection
    yields ``sqlite3.Row`` rows, has foreign-key enforcement switched on, and
    waits up to 30 seconds on a locked database (driver ``timeout`` plus the
    ``busy_timeout`` PRAGMA).
    """
    target = Path(db_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(str(target), timeout=30)
    connection.row_factory = sqlite3.Row
    per_connection_pragmas = (
        "PRAGMA foreign_keys = ON",
        "PRAGMA busy_timeout = 30000",  # wait, don't fail, under concurrent backfill writers
    )
    for pragma in per_connection_pragmas:
        connection.execute(pragma)
    return connection
|
||||
|
||||
|
||||
# Additive migrations for DBs created before a column existed (CREATE IF NOT EXISTS won't add columns).
# Shape: {table_name: {column_name: column_type}}. _migrate() issues ALTER TABLE ... ADD COLUMN for
# every listed column that PRAGMA table_info does not already report.
_MIGRATIONS = {
    "documents": {"content_hash": "TEXT", "processed_at": "TEXT", "dedup_key": "TEXT"},
    # DESIGN_v2.1 condition 1: own_network = the Ten31 orbit (Odell/Bent partners etc.) — listening to
    # ourselves. Quarantined: a TEST FIXTURE for the reflexivity case, DROPPED in live EISC scoring.
    "sources": {"backtest_2022_2023": "TEXT", "own_network": "INTEGER"},
    # DESIGN_v2.1: tag derivatives by distance-from-edge for TRIAGE — surfaced, NEVER used as a filter
    # (an engine that pre-filters to in-mandate reproduces the AI/compute mandate-expansion miss).
    "fanout_nodes": {"distance_from_edge": "TEXT"},
}
|
||||
|
||||
|
||||
def _widen_cluster_check(conn: sqlite3.Connection) -> None:
|
||||
"""Add 'banks'/'credit'/'fintech' to sources.source_cluster's CHECK. SQLite can't ALTER a CHECK, so
|
||||
rebuild the (tiny) table via the standard table-swap. Idempotent: no-op once already widened. Toggles
|
||||
foreign_keys OFF around the swap (DROP would otherwise fail on inbound FKs); data copied by value so
|
||||
referential integrity holds. busy_timeout (set in connect) lets it wait out concurrent backfill writers."""
|
||||
import re
|
||||
row = conn.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name='sources'").fetchone()
|
||||
if not row or "'banks'" in row[0]:
|
||||
return
|
||||
new_list = ("('macro','ai_tech','energy','bitcoin','vc_consensus','generalist',"
|
||||
"'banks','credit','fintech')")
|
||||
new_ddl = re.sub(r"source_cluster IN\s*\([^)]*\)", f"source_cluster IN {new_list}", row[0], count=1)
|
||||
new_ddl = new_ddl.replace("CREATE TABLE sources", "CREATE TABLE sources_new", 1)
|
||||
conn.commit() # close any implicit txn before toggling FK pragma
|
||||
conn.execute("PRAGMA foreign_keys=OFF")
|
||||
try:
|
||||
conn.execute(new_ddl)
|
||||
conn.execute("INSERT INTO sources_new SELECT * FROM sources")
|
||||
conn.execute("DROP TABLE sources")
|
||||
conn.execute("ALTER TABLE sources_new RENAME TO sources")
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.execute("PRAGMA foreign_keys=ON")
|
||||
|
||||
|
||||
def _migrate(conn: sqlite3.Connection) -> None:
    """Apply the additive column migrations, then the dependent indexes and CHECK widening.

    For each table in _MIGRATIONS, any listed column missing from
    ``PRAGMA table_info`` is added with ALTER TABLE. The two documents indexes
    are created afterwards, the work is committed, and finally the
    sources.source_cluster CHECK is widened.
    """
    for table, wanted in _MIGRATIONS.items():
        present = {info[1] for info in conn.execute(f"PRAGMA table_info({table})")}
        missing = [(name, sql_type) for name, sql_type in wanted.items() if name not in present]
        for name, sql_type in missing:
            conn.execute(f"ALTER TABLE {table} ADD COLUMN {name} {sql_type}")

    # indexes on migrated columns (created here so they work on DBs predating the column)
    index_statements = (
        "CREATE INDEX IF NOT EXISTS idx_documents_content_hash ON documents(content_hash)",
        "CREATE INDEX IF NOT EXISTS idx_documents_dedup_key ON documents(dedup_key)",
    )
    for statement in index_statements:
        conn.execute(statement)
    conn.commit()

    _widen_cluster_check(conn)
|
||||
|
||||
|
||||
def init_db(conn: sqlite3.Connection) -> None:
    """Create the schema and bring an existing database up to date.

    Idempotent: runs the schema script (CREATE ... IF NOT EXISTS), commits,
    then applies the additive column migrations via _migrate().
    """
    # schema.sql contains non-ASCII characters (§, —, →); decode it explicitly as
    # UTF-8 so the result does not depend on the platform's locale encoding.
    conn.executescript(SCHEMA_FILE.read_text(encoding="utf-8"))
    conn.commit()
    _migrate(conn)
|
||||
|
||||
|
||||
def table_names(conn: sqlite3.Connection) -> list[str]:
    """Return the names of every table and view in the database, sorted by name."""
    cursor = conn.execute(
        "SELECT name FROM sqlite_master WHERE type IN ('table','view') ORDER BY name"
    )
    names = []
    for record in cursor.fetchall():
        names.append(record[0])
    return names
|
||||
@@ -0,0 +1,280 @@
|
||||
-- Ten31 Signal Engine — SQLite schema (pilot)
-- Source of truth: ten31-signal-engine-handoff.md §4 (pipeline layers), §6.7 (ledger),
-- §3.1 (conviction log), §13.4 (backfill queue).
-- Design principle (§5, §10): boring, inspectable tables. The whole system state is a SELECT away.
-- This script is executed as a whole by the Python init_db() via executescript(), so every
-- statement below must be safe to re-run (hence IF NOT EXISTS throughout).

PRAGMA journal_mode = WAL;
-- The Python connect() helper also issues this PRAGMA on each connection it opens.
PRAGMA foreign_keys = ON;
|
||||
|
||||
-- ============================================================================
|
||||
-- CANONICAL TOPIC VOCABULARY (§4.2) — HYBRID (operator decision):
|
||||
-- seeded controlled list + emergent topics merged in on a schedule.
|
||||
-- ============================================================================
|
||||
-- One row per canonical topic name; rows default to status 'emergent'.
CREATE TABLE IF NOT EXISTS topics (
    topic_canonical TEXT PRIMARY KEY,
    status TEXT CHECK (status IN ('controlled','emergent','merged')) DEFAULT 'emergent',
    -- Self-referencing FK. Presumably the surviving topic when status = 'merged'; the schema does
    -- not tie the two columns together — confirm against the merge job.
    merged_into TEXT REFERENCES topics(topic_canonical),
    seam TEXT, -- free text here (no CHECK), unlike claims.thesis_seam
    created_at TEXT DEFAULT (datetime('now'))
);
|
||||
|
||||
-- ============================================================================
|
||||
-- SOURCES & DOCUMENTS (§4.1)
|
||||
-- ============================================================================
|
||||
-- Source registry. NOTE: the own_network column is not declared here; it is added afterwards by the
-- additive migrations in the Python db module (which also lists backtest_2022_2023 for older DBs).
CREATE TABLE IF NOT EXISTS sources (
    source_id TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    kind TEXT NOT NULL CHECK (kind IN ('podcast','youtube','filing','earnings_call')),
    -- Older databases carry a narrower list; the Python migration rebuilds the table to widen it.
    source_cluster TEXT CHECK (source_cluster IN
        ('macro','ai_tech','energy','bitcoin','vc_consensus','generalist','banks','credit','fintech')),
    role TEXT CHECK (role IN ('CB','IND','DX','none')) DEFAULT 'none', -- §7.4
    rss_url TEXT,
    channel_url TEXT,
    ticker TEXT,
    -- §8 credibility: neutral prior that DECAYS in favor of earned track record from the ledger.
    bootstrap_prior REAL DEFAULT 1.0,
    earned_credibility REAL,
    cluster_capped_low INTEGER DEFAULT 0, -- §4.5 bitcoin cluster deliberately under-weighted
    backtest_2022_2023 TEXT, -- §7.1 reach: rss_full | rss_2023_only | youtube_only | launched_later | unavailable
    notes TEXT,
    created_at TEXT DEFAULT (datetime('now'))
);
|
||||
|
||||
-- One row per ingested item (episode / video / filing / call), owned by a row in sources.
CREATE TABLE IF NOT EXISTS documents (
    doc_id TEXT PRIMARY KEY,
    source_id TEXT NOT NULL REFERENCES sources(source_id),
    kind TEXT NOT NULL, -- podcast|youtube|filing|earnings_call (not CHECK-constrained here, unlike sources.kind)
    external_id TEXT, -- rss guid / yt video id / EDGAR accession / transcript id
    url TEXT,
    title TEXT,
    date TEXT, -- ISO publication/filing date
    duration_sec REAL,
    raw_path TEXT, -- downloaded audio / raw filing
    transcript_path TEXT,
    -- DEDUP MODEL (layered):
    -- (1) UNIQUE(source_id, external_id) below = the ROBUST guard. external_id is the stable item id
    -- (RSS GUID / YouTube video id / EDGAR accession). Checked at ingest, BEFORE any GPU work.
    -- (2) dedup_key = normalized title+date → catches the SAME episode arriving via a different
    -- feed/mirror (different external_id). Computed pre-transcription. NOT from the transcript.
    -- content_hash is ONLY an audit fingerprint of the transcript (did a re-run change?) — it is NOT
    -- a dedup key (ASR is non-deterministic, so one differing word flips the hash).
    dedup_key TEXT,
    content_hash TEXT,
    processed_at TEXT, -- set when transcription/extraction completes
    ingested_at TEXT DEFAULT (datetime('now')),
    -- NOTE(review): external_id is nullable and SQLite UNIQUE treats NULLs as distinct, so rows
    -- without an external_id are not covered by this guard.
    UNIQUE (source_id, external_id) -- idempotent ingest (§13.4 dedup)
);
-- indexes for dedup_key / content_hash are created in db._migrate (after columns exist on older DBs).
|
||||
|
||||
-- ============================================================================
|
||||
-- CLAIMS / PROPOSITIONS (§4.2) — the atomic unit of the whole system.
|
||||
-- One passage emits 0..N claims; MOST of a podcast hour is 0 (§4.2). The
|
||||
-- extractor must be willing to find nothing.
|
||||
-- NOTE: thesis_seam is a TAG, never a hard filter (§5.7) — off-thesis &
|
||||
-- anti-thesis claims MUST survive.
|
||||
-- ============================================================================
|
||||
-- One row per extracted claim. Each claim points at its document and source, and optionally at a
-- canonical topic and at another claim it relates to.
CREATE TABLE IF NOT EXISTS claims (
    claim_id TEXT PRIMARY KEY,
    doc_id TEXT NOT NULL REFERENCES documents(doc_id),
    source_id TEXT NOT NULL REFERENCES sources(source_id),
    proposition TEXT NOT NULL, -- normalized subject-assertion-object
    topic_canonical TEXT REFERENCES topics(topic_canonical),
    topic_raw TEXT,
    claimant TEXT,
    source_cluster TEXT, -- no FK/CHECK here; presumably a copy of sources.source_cluster — confirm with the extractor
    date TEXT,
    claim_type TEXT CHECK (claim_type IN ('interpretive','predictive','descriptive','reactive')),
    time_horizon TEXT CHECK (time_horizon IN ('near','medium','long','unspecified')),
    confidence TEXT CHECK (confidence IN ('low','med','high')),
    -- §4.2 relation: stance is EXTRACTED, never inferred from vector distance (§2.2/§5.3).
    rel_target_claim_id TEXT REFERENCES claims(claim_id),
    rel_polarity TEXT CHECK (rel_polarity IN ('affirms','denies','qualifies','none')) DEFAULT 'none',
    engages_consensus INTEGER DEFAULT 0,
    counters_position TEXT,
    thesis_seam TEXT CHECK (thesis_seam IN
        ('energy_compute','debasement_bitcoin','ai_data_ownership','none')) DEFAULT 'none',
    salience TEXT CHECK (salience IN ('central','secondary','aside')) DEFAULT 'secondary',
    qdrant_point_id TEXT, -- link to the embedded proposition vector (§4.3)
    extracted_at TEXT DEFAULT (datetime('now'))
);
-- Single-column lookup indexes on the columns claims are filtered by.
CREATE INDEX IF NOT EXISTS idx_claims_topic ON claims(topic_canonical);
CREATE INDEX IF NOT EXISTS idx_claims_date ON claims(date);
CREATE INDEX IF NOT EXISTS idx_claims_seam ON claims(thesis_seam);
CREATE INDEX IF NOT EXISTS idx_claims_type ON claims(claim_type);
|
||||
|
||||
-- ============================================================================
|
||||
-- SOURCE-INDEPENDENCE GRAPH (§4.5) — discount convergence by connectedness.
|
||||
-- Cross-cluster convergence = gold; within-cluster = near-noise.
|
||||
-- ============================================================================
|
||||
-- One row per (source pair, edge type). The seed loader stores each pair with src_a/src_b in sorted
-- order so that (a,b) and (b,a) land on the same primary key; the schema itself does not enforce
-- that ordering.
CREATE TABLE IF NOT EXISTS source_edges (
    src_a TEXT NOT NULL REFERENCES sources(source_id),
    src_b TEXT NOT NULL REFERENCES sources(source_id),
    edge_type TEXT NOT NULL CHECK (edge_type IN ('shared_guest','citation','community')),
    weight REAL DEFAULT 1.0,
    evidence TEXT, -- voiceprint_id / show-note ref / url
    updated_at TEXT DEFAULT (datetime('now')),
    PRIMARY KEY (src_a, src_b, edge_type)
);
|
||||
|
||||
-- ============================================================================
|
||||
-- VOICEPRINT LIBRARY (§4.5, §4.1) — same-guest-across-shows BY VOICE.
|
||||
-- 192-dim TitaNet voiceprints; cosine ~0.7 distance threshold for same speaker.
|
||||
-- This is the highest-leverage automated input to the independence graph.
|
||||
-- ============================================================================
|
||||
-- One row per distinct voiceprint; the embedding itself is stored inline as a BLOB.
CREATE TABLE IF NOT EXISTS voiceprints (
    voiceprint_id TEXT PRIMARY KEY,
    vector BLOB NOT NULL, -- 192 x float32
    person_label TEXT, -- resolved name if known
    first_doc_id TEXT REFERENCES documents(doc_id), -- nullable FK
    first_seen TEXT DEFAULT (datetime('now'))
);
|
||||
-- One row per sighting of a voiceprint inside a document; a voiceprint can have many observations.
CREATE TABLE IF NOT EXISTS voiceprint_observations (
    obs_id INTEGER PRIMARY KEY AUTOINCREMENT,
    voiceprint_id TEXT NOT NULL REFERENCES voiceprints(voiceprint_id),
    doc_id TEXT NOT NULL REFERENCES documents(doc_id),
    chunk_idx INTEGER,
    segment_start REAL, -- presumably seconds into the audio — TODO confirm unit with the diarizer
    segment_end REAL
);
|
||||
|
||||
-- ============================================================================
|
||||
-- CONVICTION LOG (§3.1) — human-owned seed nodes for Job B.
|
||||
-- Structural rule (§3.1): separate the TRACKABLE thematic proposition (corpus
|
||||
-- can corroborate) from TEAM conviction (context only). The engine must NEVER
|
||||
-- present theme corroboration as validation of the team bet beneath it.
|
||||
-- Exposure scored as coarse NAV bands (operator decision): none | lt2 | 2to10 | gt10 | unset.
|
||||
-- ============================================================================
|
||||
-- Populated by the YAML seed loader's upsert on conviction_id, which rewrites every column below
-- (except conviction_id) and sets updated_at to the current time on each run.
CREATE TABLE IF NOT EXISTS conviction_log (
    conviction_id TEXT PRIMARY KEY, -- R1, E1, A1, B1 ...
    seam TEXT, -- root|energy_compute|debasement_bitcoin|ai_data_ownership
    thematic_proposition TEXT NOT NULL, -- the TRACKABLE half
    team_conviction_note TEXT, -- context ONLY, never scored as theme validation
    conviction_level TEXT CHECK (conviction_level IN ('low','med','med-high','high')),
    current_exposure TEXT CHECK (current_exposure IN ('none','lt2','2to10','gt10','unset')) DEFAULT 'unset',
    exposure_note TEXT, -- original §3.1 prose ("pervasive", "MED-HIGH") pending NAV-band finalization
    disconfirming_signal TEXT,
    is_thesis_breaker INTEGER DEFAULT 0, -- §3.1 B1-B3: engine must surface these AGAINST the thesis (§5.7)
    updated_at TEXT DEFAULT (datetime('now'))
);
|
||||
|
||||
-- Conviction fan-out tree (§4.6). A derivative is a HYPOTHESIS until independent
|
||||
-- corpus corroboration AND the exposure gap both clear the bar — then 'signal'.
|
||||
-- NOTE: the distance_from_edge column used by the fan-out seed loader is not declared here; it is
-- added afterwards by the additive migrations in the Python db module.
CREATE TABLE IF NOT EXISTS fanout_nodes (
    node_id TEXT PRIMARY KEY,
    parent_conviction_id TEXT REFERENCES conviction_log(conviction_id),
    parent_node_id TEXT REFERENCES fanout_nodes(node_id), -- self-reference; the YAML seed loader never sets it
    derivative_proposition TEXT NOT NULL,
    depth INTEGER DEFAULT 1,
    status TEXT CHECK (status IN ('hypothesis','corroborated','signal')) DEFAULT 'hypothesis',
    created_at TEXT DEFAULT (datetime('now'))
);
|
||||
|
||||
-- ============================================================================
|
||||
-- DUAL-EVALUATION LEDGER (§4.7, §6) — START DAY ONE; the clock can't be backfilled.
|
||||
-- Log EVERY candidate that clears the quantitative bar (§6.6 — you need a denominator).
|
||||
-- ============================================================================
|
||||
-- One row per logged signal. The outcome columns are nullable, so a row can be logged first and
-- resolved later; the origin_* columns optionally link a signal to a conviction / fan-out node.
CREATE TABLE IF NOT EXISTS ledger (
    signal_id TEXT PRIMARY KEY,
    type TEXT NOT NULL CHECK (type IN ('theme','event','under_acted_conviction')),
    proposition TEXT NOT NULL,
    date_logged TEXT NOT NULL DEFAULT (datetime('now')),
    discourse_metric TEXT, -- JSON: acceleration, cross-cluster source set, independence-discounted count
    external_check TEXT, -- JSON: resolution spec / nested clean events the model proposed (§6.5)
    resolution_date TEXT,
    discourse_outcome TEXT CHECK (discourse_outcome IN
        ('up_cross_cluster','up_single_cluster','flat','down')),
    external_outcome TEXT CHECK (external_outcome IN
        ('correct','partial','wrong','unresolved_expired','too_early')),
    lead_time_days INTEGER, -- §6.3 THE alpha measurement (to the DERIVATIVE node for Job B)
    model_confidence REAL, -- §6.7 logged ONLY to measure its uselessness — NEVER fed into scoring
    origin_conviction_id TEXT REFERENCES conviction_log(conviction_id), -- Job B traceability
    origin_node_id TEXT REFERENCES fanout_nodes(node_id)
);
CREATE INDEX IF NOT EXISTS idx_ledger_type ON ledger(type);
CREATE INDEX IF NOT EXISTS idx_ledger_logged ON ledger(date_logged);
|
||||
|
||||
-- Human eval on a SEPARATE write path (§6.7): "keep them in separate columns and do not let the
|
||||
-- model see Grant's rating before it logs its prediction." The model-facing code reads `ledger`;
|
||||
-- ONLY the eval UI writes here. A separate table makes that separation structural, not a convention.
|
||||
-- signal_id is both the primary key and a foreign key to ledger, so each ledger row can carry at
-- most one human evaluation.
CREATE TABLE IF NOT EXISTS human_evaluations (
    signal_id TEXT PRIMARY KEY REFERENCES ledger(signal_id),
    grant_rating INTEGER, -- "non-obvious and relevant to me?" (e.g. 1-5)
    non_obvious INTEGER, -- 0/1
    notes TEXT,
    rated_at TEXT DEFAULT (datetime('now'))
);
|
||||
|
||||
-- Reporting view — the valuable cell is DISAGREEMENT (§6.7). Used for analysis, NOT by the model path.
|
||||
-- Every ledger row plus its human evaluation, if any: the LEFT JOIN keeps unrated signals, with the
-- human columns NULL. human_evaluations.notes is exposed as grant_notes.
CREATE VIEW IF NOT EXISTS v_ledger_eval AS
SELECT l.*, h.grant_rating, h.non_obvious, h.notes AS grant_notes, h.rated_at
FROM ledger l LEFT JOIN human_evaluations h ON h.signal_id = l.signal_id;
|
||||
|
||||
-- ============================================================================
|
||||
-- BACKFILL QUEUE (§13.4) — client-side, measured in GPU-HOURS.
|
||||
-- Extraction (one LLM pass per chunk over the whole corpus) is the HEAVIER serial load.
|
||||
-- Audio is SEQUENTIAL (parallel → 503). Leases give crash-safe resumability.
|
||||
-- ============================================================================
|
||||
-- One row per unit of backfill work. UNIQUE (job_type, input_hash) means the same input can be
-- queued at most once per job type. target_id / parent_doc_id carry no FK constraints.
CREATE TABLE IF NOT EXISTS backfill_jobs (
    job_id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_type TEXT NOT NULL CHECK (job_type IN ('transcribe','diarize','extract','embed')),
    target_id TEXT NOT NULL, -- doc_id or chunk id
    parent_doc_id TEXT,
    state TEXT NOT NULL CHECK (state IN
        ('pending','leased','running','done','failed','skipped')) DEFAULT 'pending',
    priority INTEGER DEFAULT 100, -- lower = sooner (backtest corpus jumps the queue, §7.1)
    attempts INTEGER DEFAULT 0,
    max_attempts INTEGER DEFAULT 5,
    lease_owner TEXT,
    lease_expires_at TEXT,
    input_hash TEXT NOT NULL, -- hash(content + model/prompt version) — idempotency
    output_ref TEXT,
    gpu_seconds REAL, -- measured per job → self-calibrating GPU-hours estimate
    error TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    updated_at TEXT DEFAULT (datetime('now')), -- default only; no trigger here refreshes it on UPDATE
    UNIQUE (job_type, input_hash)
);
-- Presumably matches the worker's dequeue query (filter by state, order by priority then job_id)
-- — confirm against the worker.
CREATE INDEX IF NOT EXISTS idx_jobs_state_priority ON backfill_jobs(state, priority, job_id);
|
||||
|
||||
-- ============================================================================
|
||||
-- SCORING BRAIN state (the "brain", build blueprint). Candidate state lands here +
|
||||
-- ledger + fanout_nodes.status; existing tables unchanged.
|
||||
-- ============================================================================
|
||||
|
||||
-- Temporal layer: one row per (topic, as_of, window). 28d non-overlapping windows.
|
||||
-- Keyed by (topic_canonical, as_of, window_idx). topic_canonical carries no FK to topics here.
CREATE TABLE IF NOT EXISTS topic_window_stats (
    topic_canonical TEXT NOT NULL,
    as_of TEXT NOT NULL,
    window_idx INTEGER NOT NULL, -- 0 = window ending at as_of, 1 = prior, 2 = baseline
    window_start TEXT NOT NULL,
    window_end TEXT NOT NULL,
    -- Counters below presumably follow claims.claim_type (interpretive+predictive vs
    -- descriptive+reactive) and distinct sources/clusters — confirm against the temporal scorer.
    n_interp_pred INTEGER NOT NULL DEFAULT 0,
    n_descr_react INTEGER NOT NULL DEFAULT 0,
    n_distinct_src INTEGER NOT NULL DEFAULT 0,
    n_distinct_clu INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (topic_canonical, as_of, window_idx)
);
|
||||
|
||||
-- Audit trail: one row per (scorer, key, as_of). Deterministic score_id → re-run reproduces.
|
||||
-- topic_canonical / node_id / conviction_id are all nullable and carry no FKs; presumably which one
-- is populated depends on the scorer — confirm against the scoring code.
CREATE TABLE IF NOT EXISTS candidate_scores (
    score_id TEXT PRIMARY KEY,
    scorer TEXT NOT NULL, -- emergence|contrarian|intersection|convergence|under_acted
    as_of TEXT NOT NULL,
    topic_canonical TEXT,
    node_id TEXT,
    conviction_id TEXT,
    score REAL NOT NULL,
    cleared_evidence_bar INTEGER NOT NULL DEFAULT 0, -- tier 1: logged to ledger (the denominator)
    cleared_promotion_bar INTEGER NOT NULL DEFAULT 0, -- tier 2: sent to frontier judge
    inputs_json TEXT NOT NULL, -- every term that produced the score (full audit)
    computed_at TEXT DEFAULT (datetime('now'))
);
CREATE INDEX IF NOT EXISTS idx_cs_asof ON candidate_scores(scorer, as_of, cleared_promotion_bar);
|
||||
|
||||
-- Tunable bar config so the backtest can sweep thresholds without code edits.
|
||||
-- At most one threshold row per scorer (scorer is the primary key).
CREATE TABLE IF NOT EXISTS score_thresholds (
    scorer TEXT PRIMARY KEY,
    min_score REAL,
    gates_json TEXT, -- presumably a JSON object of additional gate parameters — confirm with the scorers
    version TEXT
);
|
||||
@@ -0,0 +1,74 @@
|
||||
"""Load human-owned seed data (conviction log, §3.1) into SQLite.
|
||||
|
||||
The conviction log is the highest-leverage Job B input (§3.1) and is HUMAN-OWNED:
|
||||
Grant edits the YAML seed files; this loader upserts them. Re-running is idempotent.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
# conviction_log columns written by load_convictions, in INSERT order. Each name doubles as the
# named SQL placeholder (":name") and as a key of the dict built by _row().
_CONVICTION_COLS = (
    "conviction_id",
    "seam",
    "thematic_proposition",
    "team_conviction_note",
    "conviction_level",
    "current_exposure",
    "exposure_note",
    "disconfirming_signal",
    "is_thesis_breaker",
)
|
||||
|
||||
|
||||
def _row(c: dict[str, Any]) -> dict[str, Any]:
|
||||
return {
|
||||
"conviction_id": c["id"],
|
||||
"seam": c.get("seam"),
|
||||
"thematic_proposition": c["thematic_proposition"],
|
||||
"team_conviction_note": c.get("team_conviction_note"),
|
||||
"conviction_level": c.get("conviction_level"),
|
||||
"current_exposure": c.get("current_exposure", "unset"),
|
||||
"exposure_note": c.get("exposure_note"),
|
||||
"disconfirming_signal": c.get("disconfirming_signal"),
|
||||
"is_thesis_breaker": 1 if c.get("is_thesis_breaker") else 0,
|
||||
}
|
||||
|
||||
|
||||
def load_fanout(conn: sqlite3.Connection, path: Path) -> int:
    """Load a hand-written fan-out tree (§7.1 backtest). Idempotent on node_id.

    The YAML file provides ``parent_conviction_id`` and a ``nodes`` list; each
    node needs ``node_id`` and ``derivative_proposition`` and may give
    ``depth`` (default 1) and ``distance_from_edge``. New nodes are inserted
    with status 'hypothesis'. On re-run, the YAML-owned fields (proposition,
    parent, depth, distance_from_edge) are refreshed while ``status`` is left
    as it is in the database.

    An empty file or an empty/missing ``nodes`` list loads nothing and returns
    0. All parameters are built before the first write and the writes run in
    one transaction, so a malformed node leaves the table unchanged.

    Returns the number of nodes in the file.
    Raises KeyError if nodes are present but a required key is missing.
    """
    # Seed files are hand-edited text; decode as UTF-8 regardless of locale.
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    nodes = data.get("nodes") or []
    # The parent id is only required when there is something to attach to it.
    parent = data["parent_conviction_id"] if nodes else None
    params = [
        (n["node_id"], parent, n["derivative_proposition"], n.get("depth", 1), n.get("distance_from_edge"))
        for n in nodes
    ]
    sql = """INSERT INTO fanout_nodes
        (node_id, parent_conviction_id, derivative_proposition, depth, status, distance_from_edge)
        VALUES (?,?,?,?, 'hypothesis', ?)
        ON CONFLICT(node_id) DO UPDATE SET derivative_proposition=excluded.derivative_proposition,
        parent_conviction_id=excluded.parent_conviction_id,
        depth=excluded.depth,
        distance_from_edge=excluded.distance_from_edge"""
    with conn:  # commit on success, roll back on any error
        conn.executemany(sql, params)
    return len(nodes)
|
||||
|
||||
|
||||
def load_convictions(conn: sqlite3.Connection, path: Path) -> int:
    """Upsert the conviction log from a YAML seed file. Re-running is idempotent.

    Reads the ``convictions`` list, maps each entry through _row(), and
    inserts it into conviction_log; on a conviction_id conflict every other
    column in _CONVICTION_COLS is overwritten and ``updated_at`` is refreshed.

    An empty file or an empty/missing ``convictions`` list loads nothing. All
    rows are mapped before the first write and the writes run in one
    transaction, so a malformed entry leaves the table unchanged.

    Returns the number of entries in the file.
    """
    # Seed files are hand-edited text; decode as UTF-8 regardless of locale.
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    rows = data.get("convictions") or []
    params = [_row(c) for c in rows]

    cols = ", ".join(_CONVICTION_COLS)
    placeholders = ", ".join(f":{c}" for c in _CONVICTION_COLS)
    updates = ", ".join(f"{c}=excluded.{c}" for c in _CONVICTION_COLS if c != "conviction_id")
    sql = (
        f"INSERT INTO conviction_log ({cols}, updated_at) "
        f"VALUES ({placeholders}, datetime('now')) "
        f"ON CONFLICT(conviction_id) DO UPDATE SET {updates}, updated_at=datetime('now')"
    )
    with conn:  # commit on success, roll back on any error
        conn.executemany(sql, params)
    return len(rows)
|
||||
@@ -0,0 +1,90 @@
|
||||
"""Load the source registry (companies + podcasts, §7.3/§7.4) into SQLite. Idempotent upsert."""
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
# sources columns written by load_sources, in INSERT order. Each name doubles as the named SQL
# placeholder (":name") and as a key of the dict built by _row().
_COLS = ("source_id", "name", "kind", "source_cluster", "role", "rss_url",
         "channel_url", "ticker", "cluster_capped_low", "own_network", "backtest_2022_2023", "notes")
|
||||
|
||||
|
||||
def _row(s: dict[str, Any]) -> dict[str, Any]:
|
||||
return {
|
||||
"source_id": s["id"],
|
||||
"name": s["name"],
|
||||
"kind": s["kind"],
|
||||
"source_cluster": s.get("cluster"),
|
||||
"role": s.get("role", "none"),
|
||||
"rss_url": s.get("rss_url"),
|
||||
"channel_url": s.get("channel_url"),
|
||||
"ticker": s.get("ticker"),
|
||||
"cluster_capped_low": 1 if s.get("cluster_capped_low") else 0,
|
||||
"own_network": 1 if s.get("own_network") else 0,
|
||||
"backtest_2022_2023": s.get("backtest_2022_2023"),
|
||||
"notes": s.get("notes"),
|
||||
}
|
||||
|
||||
|
||||
def update_feeds(conn: sqlite3.Connection, path: Path) -> int:
    """Apply resolved/verified podcast feed URLs + backtest-reach to existing source rows.

    Reads the ``feeds`` list from the YAML file and, for each entry, updates
    the sources row whose source_id equals the entry's ``id``. ``rss_url``,
    ``channel_url`` (from ``youtube_channel_url``) and ``backtest_2022_2023``
    are overwritten unconditionally — an omitted key stores NULL — while
    ``notes`` is only replaced when the entry supplies a ``note``. Entries
    whose id matches no row change nothing.

    The backtest_2022_2023 column is added first if the table exists without
    it. Presence is checked via PRAGMA table_info, so unrelated operational
    errors (for example a locked database) surface instead of being mistaken
    for "column already exists".

    Returns the number of feed entries in the file (not the number of rows
    actually updated).
    """
    existing = {info[1] for info in conn.execute("PRAGMA table_info(sources)")}
    # An empty set means the table itself is missing; leave that for the UPDATE to report.
    if existing and "backtest_2022_2023" not in existing:
        conn.execute("ALTER TABLE sources ADD COLUMN backtest_2022_2023 TEXT")
        conn.commit()

    # Seed files are hand-edited text; decode as UTF-8 regardless of locale.
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    rows = data.get("feeds") or []
    # Build every parameter set before writing so a malformed entry changes nothing.
    params = [
        {
            "id": f["id"], "rss_url": f.get("rss_url"),
            "youtube_channel_url": f.get("youtube_channel_url"),
            "backtest_2022_2023": f.get("backtest_2022_2023"), "note": f.get("note"),
        }
        for f in rows
    ]
    sql = """UPDATE sources
        SET rss_url=:rss_url, channel_url=:youtube_channel_url,
        backtest_2022_2023=:backtest_2022_2023, notes=COALESCE(:note, notes)
        WHERE source_id=:id"""
    with conn:  # commit on success, roll back on any error
        conn.executemany(sql, params)
    return len(rows)
|
||||
|
||||
|
||||
def load_source_edges(conn: sqlite3.Connection, path: Path) -> int:
    """Seed EISC connectedness edges (priors) idempotently.

    Stores src_a,src_b in sorted order to match the transcribe_worker's
    convention (sorted([a,b]) + ON CONFLICT weight+=1) so real detections
    accumulate on the same PK instead of creating a reversed duplicate.
    DO NOTHING on conflict → a re-run won't inflate, and won't clobber a
    stronger auto-detected weight.

    Each entry of the YAML ``edges`` list needs ``a``, ``b`` and ``type``;
    ``weight`` defaults to 1.0 and ``evidence`` to NULL. An empty file or an
    empty/missing list loads nothing. All rows are prepared before the first
    write and the writes run in one transaction, so a malformed entry leaves
    the table unchanged.

    Returns the number of edges actually inserted (conflicts count as 0).
    """
    # Seed files are hand-edited text; decode as UTF-8 regardless of locale.
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    rows = data.get("edges") or []
    params = []
    for e in rows:
        a, b = sorted([e["a"], e["b"]])
        params.append((a, b, e["type"], float(e.get("weight", 1.0)), e.get("evidence")))

    sql = """INSERT INTO source_edges (src_a, src_b, edge_type, weight, evidence)
        VALUES (?,?,?,?,?)
        ON CONFLICT(src_a, src_b, edge_type) DO NOTHING"""
    applied = 0
    with conn:  # commit on success, roll back on any error
        for edge in params:
            cur = conn.execute(sql, edge)
            applied += cur.rowcount
    return applied
|
||||
|
||||
|
||||
def load_sources(conn: sqlite3.Connection, path: Path) -> int:
    """Upsert the source registry from a YAML seed file. Re-running is idempotent.

    Reads the ``sources`` list, maps each entry through _row(), and inserts it
    into sources; on a source_id conflict every other column in _COLS is
    overwritten with the YAML value (``created_at`` is left alone).

    An empty file or an empty/missing ``sources`` list loads nothing. All rows
    are mapped before the first write and the writes run in one transaction,
    so a malformed entry leaves the table unchanged.

    Returns the number of entries in the file.
    """
    # Seed files are hand-edited text; decode as UTF-8 regardless of locale.
    data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {}
    rows = data.get("sources") or []
    params = [_row(s) for s in rows]

    cols = ", ".join(_COLS)
    placeholders = ", ".join(f":{c}" for c in _COLS)
    # NOTE(review): rss_url, channel_url, backtest_2022_2023 and notes are overwritten too, so
    # running this after update_feeds() replaces its values with whatever the registry YAML
    # holds (NULL when omitted). Confirm the intended run order.
    updates = ", ".join(f"{c}=excluded.{c}" for c in _COLS if c != "source_id")
    sql = (
        f"INSERT INTO sources ({cols}, created_at) VALUES ({placeholders}, datetime('now')) "
        f"ON CONFLICT(source_id) DO UPDATE SET {updates}"
    )
    with conn:  # commit on success, roll back on any error
        conn.executemany(sql, params)
    return len(rows)
|
||||
Reference in New Issue
Block a user