-- Ten31 Signal Engine — SQLite schema (pilot)
-- Source of truth: ten31-signal-engine-handoff.md §4 (pipeline layers), §6.7 (ledger),
--                  §3.1 (conviction log), §13.4 (backfill queue).
-- Design principle (§5, §10): boring, inspectable tables. The whole system state is a SELECT away.

-- Connection setup. NOTE: SQLite applies foreign_keys per connection, so it only
-- protects writes made through a connection that has issued this PRAGMA.
PRAGMA journal_mode = WAL;
PRAGMA foreign_keys = ON;

-- ============================================================================
-- CANONICAL TOPIC VOCABULARY (§4.2) — HYBRID (operator decision):
-- seeded controlled list + emergent topics merged in on a schedule.
-- ============================================================================
CREATE TABLE IF NOT EXISTS topics (
    topic_canonical TEXT PRIMARY KEY,
    status          TEXT
                    CHECK (status IN ('controlled', 'emergent', 'merged'))
                    DEFAULT 'emergent',
    -- Self-reference to another row of this table.
    merged_into     TEXT REFERENCES topics (topic_canonical),
    seam            TEXT,
    created_at      TEXT DEFAULT (datetime('now'))
);

-- ============================================================================
-- SOURCES & DOCUMENTS (§4.1)
-- ============================================================================
CREATE TABLE IF NOT EXISTS sources (
    source_id           TEXT PRIMARY KEY,
    name                TEXT NOT NULL,
    kind                TEXT NOT NULL
                        CHECK (kind IN ('podcast', 'youtube', 'filing', 'earnings_call')),
    source_cluster      TEXT
                        CHECK (source_cluster IN (
                            'macro', 'ai_tech', 'energy', 'bitcoin', 'vc_consensus',
                            'generalist', 'banks', 'credit', 'fintech'
                        )),
    -- §7.4
    role                TEXT
                        CHECK (role IN ('CB', 'IND', 'DX', 'none'))
                        DEFAULT 'none',
    rss_url             TEXT,
    channel_url         TEXT,
    ticker              TEXT,
    -- §8 credibility: neutral prior that DECAYS in favor of earned track record from the ledger.
    bootstrap_prior     REAL DEFAULT 1.0,
    earned_credibility  REAL,
    -- §4.5 bitcoin cluster deliberately under-weighted
    cluster_capped_low  INTEGER DEFAULT 0,
    -- §7.1 reach: rss_full | rss_2023_only | youtube_only | launched_later | unavailable
    backtest_2022_2023  TEXT,
    notes               TEXT,
    created_at          TEXT DEFAULT (datetime('now'))
);

CREATE TABLE IF NOT EXISTS documents (
    doc_id          TEXT PRIMARY KEY,
    source_id       TEXT NOT NULL REFERENCES sources (source_id),
    -- podcast|youtube|filing|earnings_call (documented here only; no CHECK on this column)
    kind            TEXT NOT NULL,
    -- rss guid / yt video id / EDGAR accession / transcript id
    external_id     TEXT,
    url             TEXT,
    title           TEXT,
    -- ISO publication/filing date
    date            TEXT,
    duration_sec    REAL,
    -- downloaded audio / raw filing
    raw_path        TEXT,
    transcript_path TEXT,
    -- DEDUP MODEL (layered):
    --   (1) UNIQUE(source_id, external_id) below = the ROBUST guard. external_id is the stable item id
    --       (RSS GUID / YouTube video id / EDGAR accession). Checked at ingest, BEFORE any GPU work.
    --   (2) dedup_key = normalized title+date → catches the SAME episode arriving via a different
    --       feed/mirror (different external_id). Computed pre-transcription. NOT from the transcript.
    --   content_hash is ONLY an audit fingerprint of the transcript (did a re-run change?) — it is NOT
    --   a dedup key (ASR is non-deterministic, so one differing word flips the hash).
    dedup_key       TEXT,
    content_hash    TEXT,
    -- set when transcription/extraction completes
    processed_at    TEXT,
    ingested_at     TEXT DEFAULT (datetime('now')),
    -- idempotent ingest (§13.4 dedup)
    -- NOTE(review): external_id is nullable and SQLite treats NULLs as distinct in a
    -- UNIQUE constraint, so rows with a NULL external_id are not caught by this guard.
    UNIQUE (source_id, external_id)
);
-- indexes for dedup_key / content_hash are created in db._migrate (after columns exist on older DBs).

-- ============================================================================
-- CLAIMS / PROPOSITIONS (§4.2) — the atomic unit of the whole system.
-- One passage emits 0..N claims; MOST of a podcast hour is 0 (§4.2). The
-- extractor must be willing to find nothing.
-- NOTE: thesis_seam is a TAG, never a hard filter (§5.7) — off-thesis &
-- anti-thesis claims MUST survive.
-- ============================================================================
CREATE TABLE IF NOT EXISTS claims (
    claim_id            TEXT PRIMARY KEY,
    doc_id              TEXT NOT NULL REFERENCES documents (doc_id),
    source_id           TEXT NOT NULL REFERENCES sources (source_id),
    -- normalized subject-assertion-object
    proposition         TEXT NOT NULL,
    topic_canonical     TEXT REFERENCES topics (topic_canonical),
    topic_raw           TEXT,
    claimant            TEXT,
    source_cluster      TEXT,
    date                TEXT,
    claim_type          TEXT
                        CHECK (claim_type IN ('interpretive', 'predictive', 'descriptive', 'reactive')),
    time_horizon        TEXT
                        CHECK (time_horizon IN ('near', 'medium', 'long', 'unspecified')),
    confidence          TEXT
                        CHECK (confidence IN ('low', 'med', 'high')),
    -- §4.2 relation: stance is EXTRACTED, never inferred from vector distance (§2.2/§5.3).
    -- rel_target_claim_id points at another row of this same table.
    rel_target_claim_id TEXT REFERENCES claims (claim_id),
    rel_polarity        TEXT
                        CHECK (rel_polarity IN ('affirms', 'denies', 'qualifies', 'none'))
                        DEFAULT 'none',
    engages_consensus   INTEGER DEFAULT 0,
    counters_position   TEXT,
    thesis_seam         TEXT
                        CHECK (thesis_seam IN (
                            'energy_compute', 'debasement_bitcoin', 'ai_data_ownership', 'none'
                        ))
                        DEFAULT 'none',
    salience            TEXT
                        CHECK (salience IN ('central', 'secondary', 'aside'))
                        DEFAULT 'secondary',
    -- link to the embedded proposition vector (§4.3)
    qdrant_point_id     TEXT,
    extracted_at        TEXT DEFAULT (datetime('now'))
);

-- Single-column lookup indexes on claims.
CREATE INDEX IF NOT EXISTS idx_claims_topic ON claims (topic_canonical);
CREATE INDEX IF NOT EXISTS idx_claims_date  ON claims (date);
CREATE INDEX IF NOT EXISTS idx_claims_seam  ON claims (thesis_seam);
CREATE INDEX IF NOT EXISTS idx_claims_type  ON claims (claim_type);

-- ============================================================================
-- SOURCE-INDEPENDENCE GRAPH (§4.5) — discount convergence by connectedness.
-- Cross-cluster convergence = gold; within-cluster = near-noise.
-- ============================================================================
CREATE TABLE IF NOT EXISTS source_edges (
    src_a       TEXT NOT NULL REFERENCES sources (source_id),
    src_b       TEXT NOT NULL REFERENCES sources (source_id),
    edge_type   TEXT NOT NULL
                CHECK (edge_type IN ('shared_guest', 'citation', 'community')),
    weight      REAL DEFAULT 1.0,
    -- voiceprint_id / show-note ref / url
    evidence    TEXT,
    updated_at  TEXT DEFAULT (datetime('now')),
    -- One row per ordered (src_a, src_b) pair and edge type; the key itself does
    -- not treat (a, b) and (b, a) as the same edge.
    PRIMARY KEY (src_a, src_b, edge_type)
);

-- ============================================================================
-- VOICEPRINT LIBRARY (§4.5, §4.1) — same-guest-across-shows BY VOICE.
-- 192-dim TitaNet voiceprints; cosine ~0.7 distance threshold for same speaker.
-- This is the highest-leverage automated input to the independence graph.
-- ============================================================================
CREATE TABLE IF NOT EXISTS voiceprints (
    voiceprint_id   TEXT PRIMARY KEY,
    -- 192 x float32
    vector          BLOB NOT NULL,
    -- resolved name if known
    person_label    TEXT,
    first_doc_id    TEXT REFERENCES documents (doc_id),
    first_seen      TEXT DEFAULT (datetime('now'))
);

CREATE TABLE IF NOT EXISTS voiceprint_observations (
    obs_id          INTEGER PRIMARY KEY AUTOINCREMENT,
    voiceprint_id   TEXT NOT NULL REFERENCES voiceprints (voiceprint_id),
    doc_id          TEXT NOT NULL REFERENCES documents (doc_id),
    chunk_idx       INTEGER,
    -- NOTE(review): presumably offsets in seconds within the document — confirm with the writer.
    segment_start   REAL,
    segment_end     REAL
);

-- ============================================================================
-- CONVICTION LOG (§3.1) — human-owned seed nodes for Job B.
-- Structural rule (§3.1): separate the TRACKABLE thematic proposition (corpus
-- can corroborate) from TEAM conviction (context only). The engine must NEVER
-- present theme corroboration as validation of the team bet beneath it.
-- Exposure scored as coarse NAV bands (operator decision): none | lt2 | 2to10 | gt10 | unset.
-- ============================================================================
CREATE TABLE IF NOT EXISTS conviction_log (
    -- R1, E1, A1, B1 ...
    conviction_id           TEXT PRIMARY KEY,
    -- root|energy_compute|debasement_bitcoin|ai_data_ownership
    seam                    TEXT,
    -- the TRACKABLE half
    thematic_proposition    TEXT NOT NULL,
    -- context ONLY, never scored as theme validation
    team_conviction_note    TEXT,
    conviction_level        TEXT
                            CHECK (conviction_level IN ('low', 'med', 'med-high', 'high')),
    current_exposure        TEXT
                            CHECK (current_exposure IN ('none', 'lt2', '2to10', 'gt10', 'unset'))
                            DEFAULT 'unset',
    -- original §3.1 prose ("pervasive", "MED-HIGH") pending NAV-band finalization
    exposure_note           TEXT,
    disconfirming_signal    TEXT,
    -- §3.1 B1-B3: engine must surface these AGAINST the thesis (§5.7)
    is_thesis_breaker       INTEGER DEFAULT 0,
    updated_at              TEXT DEFAULT (datetime('now'))
);

-- Conviction fan-out tree (§4.6). A derivative is a HYPOTHESIS until independent
-- corpus corroboration AND the exposure gap both clear the bar — then 'signal'.
CREATE TABLE IF NOT EXISTS fanout_nodes (
    node_id                 TEXT PRIMARY KEY,
    -- Both parent links are nullable: one to a conviction_log row, one to another node here.
    parent_conviction_id    TEXT REFERENCES conviction_log (conviction_id),
    parent_node_id          TEXT REFERENCES fanout_nodes (node_id),
    derivative_proposition  TEXT NOT NULL,
    depth                   INTEGER DEFAULT 1,
    status                  TEXT
                            CHECK (status IN ('hypothesis', 'corroborated', 'signal'))
                            DEFAULT 'hypothesis',
    created_at              TEXT DEFAULT (datetime('now'))
);

-- ============================================================================
-- DUAL-EVALUATION LEDGER (§4.7, §6) — START DAY ONE; the clock can't be backfilled.
-- Log EVERY candidate that clears the quantitative bar (§6.6 — you need a denominator).
-- ============================================================================
CREATE TABLE IF NOT EXISTS ledger (
    signal_id               TEXT PRIMARY KEY,
    type                    TEXT NOT NULL
                            CHECK (type IN ('theme', 'event', 'under_acted_conviction')),
    proposition             TEXT NOT NULL,
    date_logged             TEXT NOT NULL DEFAULT (datetime('now')),
    -- JSON: acceleration, cross-cluster source set, independence-discounted count
    discourse_metric        TEXT,
    -- JSON: resolution spec / nested clean events the model proposed (§6.5)
    external_check          TEXT,
    resolution_date         TEXT,
    discourse_outcome       TEXT
                            CHECK (discourse_outcome IN (
                                'up_cross_cluster', 'up_single_cluster', 'flat', 'down'
                            )),
    external_outcome        TEXT
                            CHECK (external_outcome IN (
                                'correct', 'partial', 'wrong', 'unresolved_expired', 'too_early'
                            )),
    -- §6.3 THE alpha measurement (to the DERIVATIVE node for Job B)
    lead_time_days          INTEGER,
    -- §6.7 logged ONLY to measure its uselessness — NEVER fed into scoring
    model_confidence        REAL,
    -- Job B traceability
    origin_conviction_id    TEXT REFERENCES conviction_log (conviction_id),
    origin_node_id          TEXT REFERENCES fanout_nodes (node_id)
);

CREATE INDEX IF NOT EXISTS idx_ledger_type   ON ledger (type);
CREATE INDEX IF NOT EXISTS idx_ledger_logged ON ledger (date_logged);

-- Human eval on a SEPARATE write path (§6.7): "keep them in separate columns and do not let the
-- model see Grant's rating before it logs its prediction." The model-facing code reads `ledger`;
-- ONLY the eval UI writes here. A separate table makes that separation structural, not a convention.
CREATE TABLE IF NOT EXISTS human_evaluations (
    -- Primary key and foreign key at once: at most one evaluation row per ledger signal.
    signal_id       TEXT PRIMARY KEY REFERENCES ledger (signal_id),
    -- "non-obvious and relevant to me?" (e.g. 1-5)
    grant_rating    INTEGER,
    -- 0/1
    non_obvious     INTEGER,
    notes           TEXT,
    rated_at        TEXT DEFAULT (datetime('now'))
);

-- Reporting view — the valuable cell is DISAGREEMENT (§6.7). Used for analysis, NOT by the model path.
-- Every ledger row appears exactly as stored; the LEFT JOIN leaves the rating
-- columns NULL for signals that have no human_evaluations row.
CREATE VIEW IF NOT EXISTS v_ledger_eval AS
SELECT
    l.*,
    h.grant_rating,
    h.non_obvious,
    h.notes AS grant_notes,
    h.rated_at
FROM ledger AS l
LEFT JOIN human_evaluations AS h
    ON h.signal_id = l.signal_id;

-- ============================================================================
-- BACKFILL QUEUE (§13.4) — client-side, measured in GPU-HOURS.
-- Extraction (one LLM pass per chunk over the whole corpus) is the HEAVIER serial load.
-- Audio is SEQUENTIAL (parallel → 503). Leases give crash-safe resumability.
-- ============================================================================
CREATE TABLE IF NOT EXISTS backfill_jobs (
    job_id              INTEGER PRIMARY KEY AUTOINCREMENT,
    job_type            TEXT NOT NULL
                        CHECK (job_type IN ('transcribe', 'diarize', 'extract', 'embed')),
    -- doc_id or chunk id
    target_id           TEXT NOT NULL,
    parent_doc_id       TEXT,
    state               TEXT NOT NULL
                        CHECK (state IN (
                            'pending', 'leased', 'running', 'done', 'failed', 'skipped'
                        ))
                        DEFAULT 'pending',
    -- lower = sooner (backtest corpus jumps the queue, §7.1)
    priority            INTEGER DEFAULT 100,
    attempts            INTEGER DEFAULT 0,
    max_attempts        INTEGER DEFAULT 5,
    lease_owner         TEXT,
    lease_expires_at    TEXT,
    -- hash(content + model/prompt version) — idempotency
    input_hash          TEXT NOT NULL,
    output_ref          TEXT,
    -- measured per job → self-calibrating GPU-hours estimate
    gpu_seconds         REAL,
    error               TEXT,
    created_at          TEXT DEFAULT (datetime('now')),
    -- Only a default at insert time; nothing in this schema refreshes it on UPDATE.
    updated_at          TEXT DEFAULT (datetime('now')),
    -- At most one job per (job_type, input_hash).
    UNIQUE (job_type, input_hash)
);

CREATE INDEX IF NOT EXISTS idx_jobs_state_priority
    ON backfill_jobs (state, priority, job_id);

-- ============================================================================
-- SCORING BRAIN state (the "brain", build blueprint). Candidate state lands here +
-- ledger + fanout_nodes.status; existing tables unchanged.
-- ============================================================================
-- Temporal layer: one row per (topic, as_of, window). 28d non-overlapping windows.
-- Per-topic window counters, keyed by (topic_canonical, as_of, window_idx).
-- All four counters are NOT NULL and start at 0.
CREATE TABLE IF NOT EXISTS topic_window_stats (
    topic_canonical TEXT    NOT NULL,
    as_of           TEXT    NOT NULL,
    -- 0 = window ending at as_of, 1 = prior, 2 = baseline
    window_idx      INTEGER NOT NULL,
    window_start    TEXT    NOT NULL,
    window_end      TEXT    NOT NULL,
    n_interp_pred   INTEGER NOT NULL DEFAULT 0,
    n_descr_react   INTEGER NOT NULL DEFAULT 0,
    n_distinct_src  INTEGER NOT NULL DEFAULT 0,
    n_distinct_clu  INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (topic_canonical, as_of, window_idx)
);

-- Audit trail: one row per (scorer, key, as_of). Deterministic score_id → re-run reproduces.
CREATE TABLE IF NOT EXISTS candidate_scores (
    score_id                TEXT PRIMARY KEY,
    -- emergence|contrarian|intersection|convergence|under_acted
    scorer                  TEXT NOT NULL,
    as_of                   TEXT NOT NULL,
    -- The three key columns below are nullable and carry no foreign-key constraint.
    topic_canonical         TEXT,
    node_id                 TEXT,
    conviction_id           TEXT,
    score                   REAL NOT NULL,
    -- tier 1: logged to ledger (the denominator)
    cleared_evidence_bar    INTEGER NOT NULL DEFAULT 0,
    -- tier 2: sent to frontier judge
    cleared_promotion_bar   INTEGER NOT NULL DEFAULT 0,
    -- every term that produced the score (full audit)
    inputs_json             TEXT NOT NULL,
    computed_at             TEXT DEFAULT (datetime('now'))
);

CREATE INDEX IF NOT EXISTS idx_cs_asof
    ON candidate_scores (scorer, as_of, cleared_promotion_bar);

-- Tunable bar config so the backtest can sweep thresholds without code edits.
CREATE TABLE IF NOT EXISTS score_thresholds (
    -- One row per scorer name.
    scorer      TEXT PRIMARY KEY,
    min_score   REAL,
    gates_json  TEXT,
    version     TEXT
);