Files
ten31-database/backend/migrations/0001_phase0_foundation.sql
T
Keysat c7ce44d963 Phase 0 foundation: canonical schema, ingest pipeline, CRM MCP server
Workstream A–C substrate for the Ten31 agentic system:
- A1: docs/crm-overview.md; CLAUDE.md conventions + guardrail #9
- A2: additive/reversible core migration (canonical_entities, entity_links,
  interaction_log, relationship_edges, soft-delete) + ledgered runner
- B1/B3: chunking + deterministic entity resolution (backend/ingest)
- B2: dense (bge-m3) + BM25 sparse ingest to Qdrant crm_chunks
- C: CRM MCP server (reads, retrieval modes, logged writes) — no outbound tools
- docs: redaction/re-hydration, Gmail enablement runbook
- synthetic test data; .env.example; housekeeping (.gitignore, untrack crm.db,
  drop legacy files + start9/0.3.5)

Verified end-to-end on synthetic data + live Sparks (hybrid > dense on entity
queries). Real backfill runs on Ten31 infra; index holds synthetic data only.
Branch snapshot also captures pre-existing working-tree changes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 08:13:35 -05:00

117 lines
6.9 KiB
SQL

-- Phase 0 — Workstream A2: foundation schema for the agentic system.
--
-- ADDITIVE AND REVERSIBLE ONLY (CLAUDE.md guardrail #3): this migration adds
-- new tables and new nullable columns alongside the existing CRM. It never
-- drops, renames, or rewrites existing data. Its reversal is 0001_phase0_foundation.down.sql.
--
-- Applied once at startup by backend/core_migrations.py, tracked in the
-- schema_migrations ledger. Safe to leave in place; the canonical layer it
-- creates starts EMPTY and is populated later by entity resolution (A4/B3).
-- ============================================================================
-- 1. canonical_entities — the single, model-agnostic identity for an LP /
-- organization / person. Both the classic contacts/lp_profiles model and the
-- fundraising_* grid map INTO this; neither existing model is demoted.
-- IDs are full-length (e.g. 'lp_' + uuid4 hex), NOT the 8-char truncated
-- UUIDs used elsewhere in the CRM, so they are safe as the index/payload key.
-- ============================================================================
-- canonical_entities: one row per resolved LP, organization, or person.
-- Column order is part of the table's contract; do not reorder.
CREATE TABLE IF NOT EXISTS canonical_entities (
    -- Identity ---------------------------------------------------------------
    id                   TEXT PRIMARY KEY,
    -- 'lp' | 'organization' | 'person'
    entity_kind          TEXT NOT NULL,
    display_name         TEXT NOT NULL,
    primary_email        TEXT,

    -- Phase-0 LP/prospect fields (model-agnostic home) -----------------------
    thesis_fit           TEXT,
    segment              TEXT,
    -- free-text until counsel defines the vocabulary (guardrail #6)
    accreditation_status TEXT,
    qp_status            TEXT,
    warmth_score         REAL,
    source               TEXT,
    -- points at users.id (see the FOREIGN KEY clause at the bottom)
    owner_id             TEXT,
    last_touch_at        TEXT,
    notes                TEXT,

    -- Bookkeeping ------------------------------------------------------------
    -- Both timestamps default to the insert time. Nothing in this block
    -- refreshes updated_at afterwards; presumably writers do -- TODO confirm.
    created_at           TEXT DEFAULT (datetime('now')),
    updated_at           TEXT DEFAULT (datetime('now')),
    -- soft-delete marker (never hard-delete; guardrail #3)
    deleted_at           TEXT,

    FOREIGN KEY (owner_id) REFERENCES users(id)
);

-- Lookup indexes: by kind, by primary email, by owner.
CREATE INDEX IF NOT EXISTS idx_canonical_kind
    ON canonical_entities (entity_kind);
CREATE INDEX IF NOT EXISTS idx_canonical_email
    ON canonical_entities (primary_email);
CREATE INDEX IF NOT EXISTS idx_canonical_owner
    ON canonical_entities (owner_id);
-- ============================================================================
-- 2. entity_links — resolution map. Every source row (a contacts row, a
-- fundraising_investors row, etc.) and every email/name variant points at the
-- canonical entity it resolves to. This is how name variants collapse to one id.
-- ============================================================================
-- entity_links: one row per source row / email / name variant, pointing at the
-- canonical entity it resolves to.
CREATE TABLE IF NOT EXISTS entity_links (
    id           TEXT PRIMARY KEY,
    canonical_id TEXT NOT NULL REFERENCES canonical_entities(id) ON DELETE CASCADE,
    source_model TEXT NOT NULL,     -- contacts|organizations|lp_profiles|fundraising_investors|fundraising_contacts|email_address|alias
    source_id    TEXT,              -- the local PK in that model (NULL for a bare email/name alias)
    match_value  TEXT,              -- normalized email or name variant
    match_kind   TEXT NOT NULL,     -- exact_email|name_variant|domain|manual
    confidence   REAL DEFAULT 1.0,
    created_at   TEXT DEFAULT (datetime('now')),
    -- Kept so existing ON CONFLICT (source_model, source_id, match_value)
    -- targets keep resolving. On its own it does NOT deduplicate rows whose
    -- source_id or match_value is NULL: SQLite treats NULLs as distinct in
    -- UNIQUE constraints.
    UNIQUE(source_model, source_id, match_value)
);

-- NULL-safe uniqueness. Bare aliases carry source_id = NULL, so the table-level
-- UNIQUE above would let the same alias be inserted repeatedly. Folding NULL to
-- '' makes (source_model, source_id, match_value) unique even when a component
-- is missing. Requires SQLite >= 3.9 (indexes on expressions).
-- NOTE(review): writers that re-insert an existing NULL-keyed link must use
-- INSERT OR IGNORE (or catch the constraint error) -- confirm against
-- backend/ingest. The index is dropped together with the table, so a reversal
-- that drops entity_links needs no extra statement.
CREATE UNIQUE INDEX IF NOT EXISTS idx_entity_links_identity
    ON entity_links (source_model, COALESCE(source_id, ''), COALESCE(match_value, ''));

-- Lookup indexes: by canonical entity, by matched value, by source row.
CREATE INDEX IF NOT EXISTS idx_entity_links_canonical ON entity_links(canonical_id);
CREATE INDEX IF NOT EXISTS idx_entity_links_match ON entity_links(match_value);
CREATE INDEX IF NOT EXISTS idx_entity_links_source ON entity_links(source_model, source_id);
-- ============================================================================
-- 3. interaction_log — APPEND-ONLY record of every agent action and every human
-- touch (guardrail #5). Distinct from audit_log (which is mutation-diff-only
-- and has no actor/agent dimension). Append-only is a convention, not something
-- this migration enforces: writers must never UPDATE or DELETE rows here.
-- ============================================================================
-- interaction_log: one row per agent action or human touch.
CREATE TABLE IF NOT EXISTS interaction_log (
    id          TEXT PRIMARY KEY,
    -- event time; defaults to the insert time when the writer omits it
    ts          TEXT NOT NULL DEFAULT (datetime('now')),

    -- Who acted --------------------------------------------------------------
    -- human | agent | system
    actor_type  TEXT NOT NULL,
    -- users.id, or an agent name (Scout/Analyst/...)
    actor_id    TEXT,

    -- What happened, and to what ---------------------------------------------
    -- e.g. note.created | email.matched | enrichment.written | search.run
    action      TEXT NOT NULL,
    -- canonical_entity | contact | communication | opportunity | ...
    target_type TEXT,
    -- canonical_entities.id where possible (no foreign key is declared)
    target_id   TEXT,
    -- JSON blob with the action detail
    payload     TEXT,
    -- crm_ui | mcp | ingest | scout | ...
    source      TEXT,

    created_at  TEXT DEFAULT (datetime('now'))
);

-- Lookup indexes: by target, by event time, by actor.
CREATE INDEX IF NOT EXISTS idx_interaction_target
    ON interaction_log (target_type, target_id);
CREATE INDEX IF NOT EXISTS idx_interaction_ts
    ON interaction_log (ts);
CREATE INDEX IF NOT EXISTS idx_interaction_actor
    ON interaction_log (actor_type, actor_id);
-- ============================================================================
-- 4. relationship_edges — derived graph of who-knows-whom between canonical
-- entities. Starts EMPTY; seeded later from email_investor_links + calendar +
-- X follower overlap (Analyst, Phase 2).
-- ============================================================================
-- relationship_edges: who-knows-whom links between canonical entities, at most
-- one row per (src_id, dst_id, edge_type, source).
CREATE TABLE IF NOT EXISTS relationship_edges (
    id            TEXT PRIMARY KEY,

    -- Endpoints; both reference canonical_entities.id (FOREIGN KEY clauses below).
    src_id        TEXT NOT NULL,
    dst_id        TEXT NOT NULL,

    -- email_corr | calendar | x_follow | intro | colleague
    edge_type     TEXT NOT NULL,
    -- provenance of this edge
    source        TEXT NOT NULL,

    strength      REAL DEFAULT 0,
    -- presumably a 0/1 flag where 1 means src -> dst only -- TODO confirm
    directed      INTEGER DEFAULT 0,
    -- JSON supporting detail
    evidence      TEXT,

    first_seen_at TEXT,
    last_seen_at  TEXT,
    created_at    TEXT DEFAULT (datetime('now')),
    updated_at    TEXT DEFAULT (datetime('now')),

    -- Removing a canonical entity removes the edges touching it.
    FOREIGN KEY (src_id) REFERENCES canonical_entities(id) ON DELETE CASCADE,
    FOREIGN KEY (dst_id) REFERENCES canonical_entities(id) ON DELETE CASCADE,
    UNIQUE (src_id, dst_id, edge_type, source)
);

-- Lookup indexes for each endpoint.
CREATE INDEX IF NOT EXISTS idx_rel_src
    ON relationship_edges (src_id);
CREATE INDEX IF NOT EXISTS idx_rel_dst
    ON relationship_edges (dst_id);
-- ============================================================================
-- 5. Soft-delete columns on existing tables. Additive nullable columns; the CRM
-- currently HARD-deletes everywhere (guardrail #3 gap). Adding the column is
-- safe now; switching the DELETE handlers to set it instead of hard-deleting
-- is a separate, reviewed code change.
-- ============================================================================
-- Add a nullable deleted_at marker to each existing CRM table.
-- NOTE(review): SQLite has no "ADD COLUMN IF NOT EXISTS", so these statements
-- fail with "duplicate column name" if run a second time; this relies on the
-- migration being applied exactly once -- confirm the runner guarantees that.
ALTER TABLE contacts
    ADD COLUMN deleted_at TEXT;

ALTER TABLE organizations
    ADD COLUMN deleted_at TEXT;

ALTER TABLE opportunities
    ADD COLUMN deleted_at TEXT;

ALTER TABLE communications
    ADD COLUMN deleted_at TEXT;

ALTER TABLE lp_profiles
    ADD COLUMN deleted_at TEXT;