ten31-signal-engine/signal_engine/store/schema.sql

-- Ten31 Signal Engine — SQLite schema (pilot)
-- Source of truth: ten31-signal-engine-handoff.md  §4 (pipeline layers), §6.7 (ledger),
--   §3.1 (conviction log), §13.4 (backfill queue).
-- Design principle (§5, §10): boring, inspectable tables. The whole system state is a SELECT away.

PRAGMA journal_mode = WAL;
PRAGMA foreign_keys = ON;

-- ============================================================================
-- CANONICAL TOPIC VOCABULARY (§4.2) — HYBRID (operator decision):
--   seeded controlled list + emergent topics merged in on a schedule.
-- ============================================================================
CREATE TABLE IF NOT EXISTS topics (
  topic_canonical TEXT PRIMARY KEY,
  status          TEXT CHECK (status IN ('controlled','emergent','merged')) DEFAULT 'emergent',
  merged_into     TEXT REFERENCES topics(topic_canonical),
  seam            TEXT,
  created_at      TEXT DEFAULT (datetime('now'))
);

-- ============================================================================
-- SOURCES & DOCUMENTS (§4.1)
-- ============================================================================
CREATE TABLE IF NOT EXISTS sources (
  source_id          TEXT PRIMARY KEY,
  name               TEXT NOT NULL,
  kind               TEXT NOT NULL CHECK (kind IN ('podcast','youtube','filing','earnings_call')),
  source_cluster     TEXT CHECK (source_cluster IN
                       ('macro','ai_tech','energy','bitcoin','vc_consensus','generalist','banks','credit','fintech')),
  role               TEXT CHECK (role IN ('CB','IND','DX','none')) DEFAULT 'none',  -- §7.4
  rss_url            TEXT,
  channel_url        TEXT,
  ticker             TEXT,
  -- §8 credibility: neutral prior that DECAYS in favor of earned track record from the ledger.
  bootstrap_prior    REAL DEFAULT 1.0,
  earned_credibility REAL,
  cluster_capped_low INTEGER DEFAULT 0,   -- §4.5 bitcoin cluster deliberately under-weighted
  backtest_2022_2023 TEXT,                -- §7.1 reach: rss_full | rss_2023_only | youtube_only | launched_later | unavailable
  notes              TEXT,
  created_at         TEXT DEFAULT (datetime('now'))
);

CREATE TABLE IF NOT EXISTS documents (
  doc_id          TEXT PRIMARY KEY,
  source_id       TEXT NOT NULL REFERENCES sources(source_id),
  kind            TEXT NOT NULL,        -- podcast|youtube|filing|earnings_call
  external_id     TEXT,                 -- rss guid / yt video id / EDGAR accession / transcript id
  url             TEXT,
  title           TEXT,
  date            TEXT,                 -- ISO publication/filing date
  duration_sec    REAL,
  raw_path        TEXT,                 -- downloaded audio / raw filing
  transcript_path TEXT,
  -- DEDUP MODEL (layered):
  --   (1) UNIQUE(source_id, external_id) below = the ROBUST guard. external_id is the stable item id
  --       (RSS GUID / YouTube video id / EDGAR accession). Checked at ingest, BEFORE any GPU work.
  --   (2) dedup_key = normalized title+date → catches the SAME episode arriving via a different
  --       feed/mirror (different external_id). Computed pre-transcription. NOT from the transcript.
  --   content_hash is ONLY an audit fingerprint of the transcript (did a re-run change?) — it is NOT
  --       a dedup key (ASR is non-deterministic, so one differing word flips the hash).
  dedup_key       TEXT,
  content_hash    TEXT,
  processed_at    TEXT,                 -- set when transcription/extraction completes
  ingested_at     TEXT DEFAULT (datetime('now')),
  UNIQUE (source_id, external_id)       -- idempotent ingest (§13.4 dedup)
);
-- indexes for dedup_key / content_hash are created in db._migrate (after columns exist on older DBs).

-- ============================================================================
-- CLAIMS / PROPOSITIONS (§4.2) — the atomic unit of the whole system.
-- One passage emits 0..N claims; MOST of a podcast hour is 0 (§4.2). The
-- extractor must be willing to find nothing.
-- NOTE: thesis_seam is a TAG, never a hard filter (§5.7) — off-thesis &
--   anti-thesis claims MUST survive.
-- ============================================================================
CREATE TABLE IF NOT EXISTS claims (
  claim_id            TEXT PRIMARY KEY,
  doc_id              TEXT NOT NULL REFERENCES documents(doc_id),
  source_id           TEXT NOT NULL REFERENCES sources(source_id),
  proposition         TEXT NOT NULL,    -- normalized subject-assertion-object
  topic_canonical     TEXT REFERENCES topics(topic_canonical),
  topic_raw           TEXT,
  claimant            TEXT,
  source_cluster      TEXT,
  date                TEXT,
  claim_type          TEXT CHECK (claim_type IN ('interpretive','predictive','descriptive','reactive')),
  time_horizon        TEXT CHECK (time_horizon IN ('near','medium','long','unspecified')),
  confidence          TEXT CHECK (confidence IN ('low','med','high')),
  -- §4.2 relation: stance is EXTRACTED, never inferred from vector distance (§2.2/§5.3).
  rel_target_claim_id TEXT REFERENCES claims(claim_id),
  rel_polarity        TEXT CHECK (rel_polarity IN ('affirms','denies','qualifies','none')) DEFAULT 'none',
  engages_consensus   INTEGER DEFAULT 0,
  counters_position   TEXT,
  thesis_seam         TEXT CHECK (thesis_seam IN
                        ('energy_compute','debasement_bitcoin','ai_data_ownership','none')) DEFAULT 'none',
  salience            TEXT CHECK (salience IN ('central','secondary','aside')) DEFAULT 'secondary',
  qdrant_point_id     TEXT,             -- link to the embedded proposition vector (§4.3)
  extracted_at        TEXT DEFAULT (datetime('now'))
);
CREATE INDEX IF NOT EXISTS idx_claims_topic ON claims(topic_canonical);
CREATE INDEX IF NOT EXISTS idx_claims_date  ON claims(date);
CREATE INDEX IF NOT EXISTS idx_claims_seam  ON claims(thesis_seam);
CREATE INDEX IF NOT EXISTS idx_claims_type  ON claims(claim_type);

-- ============================================================================
-- SOURCE-INDEPENDENCE GRAPH (§4.5) — discount convergence by connectedness.
-- Cross-cluster convergence = gold; within-cluster = near-noise.
-- ============================================================================
CREATE TABLE IF NOT EXISTS source_edges (
  src_a      TEXT NOT NULL REFERENCES sources(source_id),
  src_b      TEXT NOT NULL REFERENCES sources(source_id),
  edge_type  TEXT NOT NULL CHECK (edge_type IN ('shared_guest','citation','community')),
  weight     REAL DEFAULT 1.0,
  evidence   TEXT,        -- voiceprint_id / show-note ref / url
  updated_at TEXT DEFAULT (datetime('now')),
  PRIMARY KEY (src_a, src_b, edge_type)
);

-- ============================================================================
-- VOICEPRINT LIBRARY (§4.5, §4.1) — same-guest-across-shows BY VOICE.
-- 192-dim TitaNet voiceprints; cosine ~0.7 distance threshold for same speaker.
-- This is the highest-leverage automated input to the independence graph.
-- ============================================================================
CREATE TABLE IF NOT EXISTS voiceprints (
  voiceprint_id TEXT PRIMARY KEY,
  vector        BLOB NOT NULL,         -- 192 x float32
  person_label  TEXT,                  -- resolved name if known
  first_doc_id  TEXT REFERENCES documents(doc_id),
  first_seen    TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS voiceprint_observations (
  obs_id        INTEGER PRIMARY KEY AUTOINCREMENT,
  voiceprint_id TEXT NOT NULL REFERENCES voiceprints(voiceprint_id),
  doc_id        TEXT NOT NULL REFERENCES documents(doc_id),
  chunk_idx     INTEGER,
  segment_start REAL,
  segment_end   REAL
);

-- ============================================================================
-- CONVICTION LOG (§3.1) — human-owned seed nodes for Job B.
-- Structural rule (§3.1): separate the TRACKABLE thematic proposition (corpus
--   can corroborate) from TEAM conviction (context only). The engine must NEVER
--   present theme corroboration as validation of the team bet beneath it.
-- Exposure scored as coarse NAV bands (operator decision): none | lt2 | 2to10 | gt10 | unset.
-- ============================================================================
CREATE TABLE IF NOT EXISTS conviction_log (
  conviction_id        TEXT PRIMARY KEY,            -- R1, E1, A1, B1 ...
  seam                 TEXT,                        -- root|energy_compute|debasement_bitcoin|ai_data_ownership
  thematic_proposition TEXT NOT NULL,               -- the TRACKABLE half
  team_conviction_note TEXT,                         -- context ONLY, never scored as theme validation
  conviction_level     TEXT CHECK (conviction_level IN ('low','med','med-high','high')),
  current_exposure     TEXT CHECK (current_exposure IN ('none','lt2','2to10','gt10','unset')) DEFAULT 'unset',
  exposure_note        TEXT,                         -- original §3.1 prose ("pervasive", "MED-HIGH") pending NAV-band finalization
  disconfirming_signal TEXT,
  is_thesis_breaker    INTEGER DEFAULT 0,            -- §3.1 B1-B3: engine must surface these AGAINST the thesis (§5.7)
  updated_at           TEXT DEFAULT (datetime('now'))
);

-- Conviction fan-out tree (§4.6). A derivative is a HYPOTHESIS until independent
-- corpus corroboration AND the exposure gap both clear the bar — then 'signal'.
CREATE TABLE IF NOT EXISTS fanout_nodes (
  node_id                TEXT PRIMARY KEY,
  parent_conviction_id   TEXT REFERENCES conviction_log(conviction_id),
  parent_node_id         TEXT REFERENCES fanout_nodes(node_id),
  derivative_proposition TEXT NOT NULL,
  depth                  INTEGER DEFAULT 1,
  status                 TEXT CHECK (status IN ('hypothesis','corroborated','signal')) DEFAULT 'hypothesis',
  created_at             TEXT DEFAULT (datetime('now'))
);

-- ============================================================================
-- DUAL-EVALUATION LEDGER (§4.7, §6) — START DAY ONE; the clock can't be backfilled.
-- Log EVERY candidate that clears the quantitative bar (§6.6 — you need a denominator).
-- ============================================================================
CREATE TABLE IF NOT EXISTS ledger (
  signal_id            TEXT PRIMARY KEY,
  type                 TEXT NOT NULL CHECK (type IN ('theme','event','under_acted_conviction')),
  proposition          TEXT NOT NULL,
  date_logged          TEXT NOT NULL DEFAULT (datetime('now')),
  discourse_metric     TEXT,           -- JSON: acceleration, cross-cluster source set, independence-discounted count
  external_check       TEXT,           -- JSON: resolution spec / nested clean events the model proposed (§6.5)
  resolution_date      TEXT,
  discourse_outcome    TEXT CHECK (discourse_outcome IN
                         ('up_cross_cluster','up_single_cluster','flat','down')),
  external_outcome     TEXT CHECK (external_outcome IN
                         ('correct','partial','wrong','unresolved_expired','too_early')),
  lead_time_days       INTEGER,        -- §6.3 THE alpha measurement (to the DERIVATIVE node for Job B)
  model_confidence     REAL,           -- §6.7 logged ONLY to measure its uselessness — NEVER fed into scoring
  origin_conviction_id TEXT REFERENCES conviction_log(conviction_id),  -- Job B traceability
  origin_node_id       TEXT REFERENCES fanout_nodes(node_id)
);
CREATE INDEX IF NOT EXISTS idx_ledger_type   ON ledger(type);
CREATE INDEX IF NOT EXISTS idx_ledger_logged ON ledger(date_logged);

-- Human eval on a SEPARATE write path (§6.7): "keep them in separate columns and do not let the
-- model see Grant's rating before it logs its prediction." The model-facing code reads `ledger`;
-- ONLY the eval UI writes here. A separate table makes that separation structural, not a convention.
CREATE TABLE IF NOT EXISTS human_evaluations (
  signal_id    TEXT PRIMARY KEY REFERENCES ledger(signal_id),
  grant_rating INTEGER,               -- "non-obvious and relevant to me?" (e.g. 1-5)
  non_obvious  INTEGER,               -- 0/1
  notes        TEXT,
  rated_at     TEXT DEFAULT (datetime('now'))
);

-- Reporting view — the valuable cell is DISAGREEMENT (§6.7). Used for analysis, NOT by the model path.
CREATE VIEW IF NOT EXISTS v_ledger_eval AS
  SELECT l.*, h.grant_rating, h.non_obvious, h.notes AS grant_notes, h.rated_at
  FROM ledger l LEFT JOIN human_evaluations h ON h.signal_id = l.signal_id;

-- ============================================================================
-- BACKFILL QUEUE (§13.4) — client-side, measured in GPU-HOURS.
-- Extraction (one LLM pass per chunk over the whole corpus) is the HEAVIER serial load.
-- Audio is SEQUENTIAL (parallel → 503). Leases give crash-safe resumability.
-- ============================================================================
CREATE TABLE IF NOT EXISTS backfill_jobs (
  job_id           INTEGER PRIMARY KEY AUTOINCREMENT,
  job_type         TEXT NOT NULL CHECK (job_type IN ('transcribe','diarize','extract','embed')),
  target_id        TEXT NOT NULL,        -- doc_id or chunk id
  parent_doc_id    TEXT,
  state            TEXT NOT NULL CHECK (state IN
                     ('pending','leased','running','done','failed','skipped')) DEFAULT 'pending',
  priority         INTEGER DEFAULT 100,  -- lower = sooner (backtest corpus jumps the queue, §7.1)
  attempts         INTEGER DEFAULT 0,
  max_attempts     INTEGER DEFAULT 5,
  lease_owner      TEXT,
  lease_expires_at TEXT,
  input_hash       TEXT NOT NULL,        -- hash(content + model/prompt version) — idempotency
  output_ref       TEXT,
  gpu_seconds      REAL,                 -- measured per job → self-calibrating GPU-hours estimate
  error            TEXT,
  created_at       TEXT DEFAULT (datetime('now')),
  updated_at       TEXT DEFAULT (datetime('now')),
  UNIQUE (job_type, input_hash)
);
CREATE INDEX IF NOT EXISTS idx_jobs_state_priority ON backfill_jobs(state, priority, job_id);

-- ============================================================================
-- SCORING BRAIN state (the "brain", build blueprint). Candidate state lands here +
-- ledger + fanout_nodes.status; existing tables unchanged.
-- ============================================================================

-- Temporal layer: one row per (topic, as_of, window). 28d non-overlapping windows.
CREATE TABLE IF NOT EXISTS topic_window_stats (
  topic_canonical TEXT NOT NULL,
  as_of           TEXT NOT NULL,
  window_idx      INTEGER NOT NULL,          -- 0 = window ending at as_of, 1 = prior, 2 = baseline
  window_start    TEXT NOT NULL,
  window_end      TEXT NOT NULL,
  n_interp_pred   INTEGER NOT NULL DEFAULT 0,
  n_descr_react   INTEGER NOT NULL DEFAULT 0,
  n_distinct_src  INTEGER NOT NULL DEFAULT 0,
  n_distinct_clu  INTEGER NOT NULL DEFAULT 0,
  PRIMARY KEY (topic_canonical, as_of, window_idx)
);

-- Audit trail: one row per (scorer, key, as_of). Deterministic score_id → re-run reproduces.
CREATE TABLE IF NOT EXISTS candidate_scores (
  score_id        TEXT PRIMARY KEY,
  scorer          TEXT NOT NULL,             -- emergence|contrarian|intersection|convergence|under_acted
  as_of           TEXT NOT NULL,
  topic_canonical TEXT,
  node_id         TEXT,
  conviction_id   TEXT,
  score           REAL NOT NULL,
  cleared_evidence_bar  INTEGER NOT NULL DEFAULT 0,   -- tier 1: logged to ledger (the denominator)
  cleared_promotion_bar INTEGER NOT NULL DEFAULT 0,   -- tier 2: sent to frontier judge
  inputs_json     TEXT NOT NULL,             -- every term that produced the score (full audit)
  computed_at     TEXT DEFAULT (datetime('now'))
);
CREATE INDEX IF NOT EXISTS idx_cs_asof ON candidate_scores(scorer, as_of, cleared_promotion_bar);

-- Tunable bar config so the backtest can sweep thresholds without code edits.
CREATE TABLE IF NOT EXISTS score_thresholds (
  scorer     TEXT PRIMARY KEY,
  min_score  REAL,
  gates_json TEXT,
  version    TEXT
);