Phase 0 foundation: canonical schema, ingest pipeline, CRM MCP server
Workstream A–C substrate for the Ten31 agentic system:

- A1: docs/crm-overview.md; CLAUDE.md conventions + guardrail #9
- A2: additive/reversible core migration (canonical_entities, entity_links, interaction_log, relationship_edges, soft-delete) + ledgered runner
- B1/B3: chunking + deterministic entity resolution (backend/ingest)
- B2: dense (bge-m3) + BM25 sparse ingest to Qdrant crm_chunks
- C: CRM MCP server (reads, retrieval modes, logged writes) — no outbound tools
- docs: redaction/re-hydration, Gmail enablement runbook
- synthetic test data; .env.example; housekeeping (.gitignore, untrack crm.db, drop legacy files + start9/0.3.5)

Verified end-to-end on synthetic data + live Sparks (hybrid > dense on entity queries). Real backfill runs on Ten31 infra; index holds synthetic data only. Branch snapshot also captures pre-existing working-tree changes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,279 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Seed a SYNTHETIC dev database for Ten31 CRM ingest/retrieval testing.
|
||||
|
||||
ALL DATA IS FAKE. No real LP/prospect information appears here (CLAUDE.md
|
||||
guardrail #9: Claude works only on synthetic/redacted data). This produces a
|
||||
realistic-shaped corpus so the Phase-0 ingest, chunking, and entity-resolution
|
||||
work can be developed and tested without ever touching the live CRM.
|
||||
|
||||
What it builds (into a SEPARATE dev DB, never crm.db):
|
||||
* The full real schema, via server.init_db() — which also runs the new
|
||||
core migration (backend/migrations/), so the canonical/interaction/graph
|
||||
tables exist.
|
||||
* A classic-model dataset: organizations, contacts (investors + prospects),
|
||||
opportunities across pipeline stages, communications with entity-rich prose
|
||||
notes, and lp_profiles.
|
||||
* A fundraising grid (fundraising_state.grid_json) populated via the real
|
||||
sync_fundraising_relational() code path, so the normalized mirror + the
|
||||
grid->classic bridge behave exactly as in production.
|
||||
* DELIBERATE entity-resolution test cases: several investors appear in BOTH
|
||||
models with NAME VARIANTS (e.g. "Jonathan Reyes" vs grid contact "Jon
|
||||
Reyes"), some with matching email (easy merge) and some without (hard case).
|
||||
|
||||
Usage:
|
||||
python3 backend/scripts/seed_synthetic.py # -> data/crm_dev.db
|
||||
python3 backend/scripts/seed_synthetic.py --db /tmp/x.db
|
||||
"""
|
||||
import argparse
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
|
||||
# Directory layout, resolved from this script's own location:
# HERE is the folder holding this file, BACKEND_DIR its parent, PROJECT_DIR the
# grandparent. os.path.split(p)[0] is the directory part of p.
HERE = os.path.split(os.path.abspath(__file__))[0]
BACKEND_DIR = os.path.split(HERE)[0]
PROJECT_DIR = os.path.split(BACKEND_DIR)[0]

# Put the backend directory first on the import path so main() can `import server`.
sys.path.insert(0, BACKEND_DIR)
|
||||
|
||||
# Synthetic organizations as (name, organization type) pairs. Order matters:
# main() pairs ORGS[i] with FIRST[i] / LAST[i] to create each org's primary contact.
ORGS = [
    ("Cedar Point Capital", "family_office"),
    ("Harbor & Vine Family Office", "family_office"),
    ("Northwind Endowment", "endowment"),
    ("Granite Peak Partners", "institutional"),
    ("Solano Ventures", "wealth_management"),
    ("Tidewater Holdings", "family_office"),
    ("Brightwater Capital", "institutional"),
    ("Meridian Trust", "foundation"),
]

# Name pools for fake people (20 first names, 20 last names), indexed by position.
FIRST = (
    "Jonathan Katherine Michael William Robert Elena Priya David Sarah James "
    "Maria Thomas Laura Daniel Rachel Steven Nicole Andrew Jessica Brian"
).split()
LAST = (
    "Reyes Calder Okonkwo Brandt Sutter Vance Mehta Ellison Cho Whitlock "
    "Santos Aldridge Kerr Nilsson Pope Devlin Frye Osei Lindqvist Marsh"
).split()

# Formal first name -> nickname; used to build the grid-side NAME VARIANT of a contact.
NICK = dict(
    Jonathan="Jon",
    Katherine="Kate",
    Michael="Mike",
    William="Bill",
    Robert="Bob",
    James="Jim",
    Thomas="Tom",
    Daniel="Dan",
    Steven="Steve",
    Jessica="Jess",
)

# Keys under which a fundraising-grid row may carry an amount.
FUND_COLS = (
    "fund_i fund_ii fund_iii tactical_fund pawn_to_e4 "
    "ten31_terahash sats_and_stats pawn_to_f4 join_the_fold"
).split()

# Display names used as fund_name on opportunities and lp_profiles.
FUND_LABELS = dict(
    fund_i="Fund I",
    fund_ii="Fund II",
    fund_iii="Fund III",
    tactical_fund="Tactical Fund",
)

# Candidate amounts for commitments and grid cells.
AMOUNTS = [250_000, 500_000, 1_000_000, 2_500_000, 5_000_000]

# Candidate values for a grid row's "lead" field.
LEADS = "JK Grant MB Parker".split()
|
||||
|
||||
# Communication templates as (type, subject, body) triples.
# Subjects may use {fund}; bodies may use {person}, {org}, {fund} and {dur}.
# Bodies are written one sentence per source line and joined by implicit
# string concatenation, so the trailing spaces inside the literals matter.
COMM_TEMPLATES = [
    (
        "call",
        "Intro call recap",
        "Spoke with {person} ({org}) for {dur} min about {fund}. "
        "Strong interest in the bitcoin-energy and AI-infrastructure thesis; "
        "wants the latest deck and DPI figures. "
        "Flagged accreditation paperwork still outstanding. "
        "Next: send one-pager and schedule a partner call.",
    ),
    (
        "email",
        "Follow-up: {fund} allocation",
        "Sent {person} the {fund} summary and the scarcity/critical-infrastructure memo. "
        "They asked how Ten31 Terahash relates to the energy thesis. "
        "Following up next week on commitment size.",
    ),
    (
        "meeting",
        "Partner meeting notes",
        "Met {person} at {org}. "
        "Discussed pacing into {fund} and co-invest appetite. "
        "Concern about lockup; reassured on secondary options. "
        "Warm — wants to meet the GP again before committing.",
    ),
    (
        "note",
        "Diligence status",
        "{person} is mid-diligence on {fund}. "
        "Legal reviewing subscription docs; wire expected within 30 days. "
        "Keep warm; send the Q update.",
    ),
    (
        "text",
        "Quick ping",
        "Texted {person} re: the {fund} close timeline. "
        "Said they're 'in for at least a unit' pending IC approval.",
    ),
]
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("--db", default=os.path.join(PROJECT_DIR, "data", "crm_dev.db"))
|
||||
args = ap.parse_args()
|
||||
db = os.path.abspath(args.db)
|
||||
if os.path.basename(db) == "crm.db":
|
||||
sys.exit("Refusing to seed the real crm.db. Use a dev path like data/crm_dev.db.")
|
||||
|
||||
for ext in ("", "-wal", "-shm"):
|
||||
if os.path.exists(db + ext):
|
||||
os.remove(db + ext)
|
||||
|
||||
os.environ["CRM_DB_PATH"] = db
|
||||
import server # noqa: E402 (must follow CRM_DB_PATH assignment)
|
||||
|
||||
server.init_db()
|
||||
gen, now = server.generate_id, server.now
|
||||
conn = server.get_db()
|
||||
random.seed(31)
|
||||
|
||||
def past(days_ago):
|
||||
return (datetime.datetime.utcnow() - datetime.timedelta(days=days_ago)).isoformat() + "Z"
|
||||
|
||||
# ── dev user (FK target for created_by/owner_id) ──
|
||||
uid = gen()
|
||||
conn.execute(
|
||||
"INSERT INTO users (id, username, email, password_hash, full_name, role) VALUES (?,?,?,?,?,?)",
|
||||
(uid, "dev_admin", "dev@example.invalid", server.hash_password("devpassword"),
|
||||
"Dev Admin", "admin"))
|
||||
|
||||
# ── organizations ──
|
||||
org_ids = {}
|
||||
for name, otype in ORGS:
|
||||
oid = gen()
|
||||
org_ids[name] = oid
|
||||
conn.execute(
|
||||
"INSERT INTO organizations (id, name, type, industry, country, description, created_by, updated_at) "
|
||||
"VALUES (?,?,?,?,?,?,?,?)",
|
||||
(oid, name, otype, "Investment Management", "USA",
|
||||
f"{name} — synthetic {otype.replace('_', ' ')} used for ingest testing.", uid, now()))
|
||||
|
||||
# ── classic contacts (investors + prospects) ──
|
||||
contacts = [] # (cid, first, last, org_name, contact_type)
|
||||
overlap_specs = [] # investors we will also place in the grid, with variants
|
||||
used = set()
|
||||
for i, (org_name, _) in enumerate(ORGS):
|
||||
# one "primary" investor contact per org
|
||||
first, last = FIRST[i], LAST[i]
|
||||
used.add((first, last))
|
||||
cid = gen()
|
||||
email = f"{first.lower()}.{last.lower()}@{org_name.split()[0].lower()}.invalid"
|
||||
conn.execute(
|
||||
"INSERT INTO contacts (id, first_name, last_name, email, title, organization_id, contact_type, "
|
||||
"status, source, notes, created_by, updated_at) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
|
||||
(cid, first, last, email, "Managing Partner", org_ids[org_name], "investor", "active",
|
||||
"referral", f"Primary relationship at {org_name}. Met via conference intro.", uid, now()))
|
||||
contacts.append((cid, first, last, org_name, "investor"))
|
||||
# mark 5 of 8 for grid overlap with a NAME VARIANT
|
||||
if i < 5:
|
||||
variant = NICK.get(first, first[0] + ".") # nickname or initial
|
||||
match_email = email if i % 2 == 0 else "" # half share email (easy), half don't (hard)
|
||||
overlap_specs.append((org_name, f"{variant} {last}", match_email))
|
||||
|
||||
# extra prospect contacts (no org sometimes)
|
||||
for j in range(12):
|
||||
first = FIRST[(j + 8) % len(FIRST)]
|
||||
last = LAST[(j + 8) % len(LAST)]
|
||||
if (first, last) in used:
|
||||
last = LAST[(j + 11) % len(LAST)]
|
||||
used.add((first, last))
|
||||
org_name = ORGS[j % len(ORGS)][0] if j % 3 else None
|
||||
cid = gen()
|
||||
email = f"{first.lower()}{last.lower()}@example.invalid"
|
||||
conn.execute(
|
||||
"INSERT INTO contacts (id, first_name, last_name, email, title, organization_id, contact_type, "
|
||||
"status, source, notes, created_by, updated_at) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
|
||||
(cid, first, last, email, "Principal", org_ids.get(org_name) if org_name else None,
|
||||
"prospect", "active", random.choice(["inbound", "referral", "conference", "x"]),
|
||||
f"Prospect sourced via {random.choice(['X DM', 'warm intro', 'podcast'])}.", uid, now()))
|
||||
contacts.append((cid, first, last, org_name, "prospect"))
|
||||
|
||||
# ── opportunities + lp_profiles + communications ──
|
||||
stages = server.PIPELINE_STAGES
|
||||
for idx, (cid, first, last, org_name, ctype) in enumerate(contacts):
|
||||
person = f"{first} {last}"
|
||||
# opportunity for most contacts
|
||||
if idx % 5 != 4:
|
||||
stage = stages[idx % len(stages)]
|
||||
fund_label = random.choice(list(FUND_LABELS.values()))
|
||||
amt = random.choice(AMOUNTS)
|
||||
conn.execute(
|
||||
"INSERT INTO opportunities (id, name, contact_id, organization_id, stage, commitment_amount, "
|
||||
"expected_amount, probability, fund_name, description, next_step, owner_id, priority, updated_at) "
|
||||
"VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
|
||||
(gen(), f"{org_name or person} — {fund_label}", cid, org_ids.get(org_name) if org_name else None,
|
||||
stage, amt if stage in ("committed", "funded") else 0, amt,
|
||||
{"lead": 10, "outreach": 25, "meeting": 40, "due_diligence": 60, "committed": 90, "funded": 100}[stage],
|
||||
fund_label, f"Potential {fund_label} allocation for {person}.",
|
||||
random.choice(["Send deck", "Schedule call", "Await IC", "Send subdocs"]),
|
||||
uid, random.choice(["low", "medium", "high"]), now()))
|
||||
|
||||
# lp_profile for ~closed investors
|
||||
if ctype == "investor" and idx % 2 == 0:
|
||||
amt = random.choice(AMOUNTS)
|
||||
conn.execute(
|
||||
"INSERT INTO lp_profiles (id, contact_id, commitment_amount, funded_amount, commitment_date, "
|
||||
"fund_name, investor_type, accredited, legal_docs_signed, wire_received, k1_sent, notes, updated_at) "
|
||||
"VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
|
||||
(gen(), cid, amt, amt if idx % 4 == 0 else 0, past(120),
|
||||
random.choice(list(FUND_LABELS.values())),
|
||||
random.choice(["family_office", "institutional", "endowment", "individual"]),
|
||||
1, 1 if idx % 3 else 0, 1 if idx % 4 == 0 else 0, 0,
|
||||
f"Closed LP. Accreditation on file. Primary contact {person}.", now()))
|
||||
|
||||
# 2-4 communications each, entity-rich prose
|
||||
for k in range(random.randint(2, 4)):
|
||||
ctype_comm, subj, body = random.choice(COMM_TEMPLATES)
|
||||
fund = random.choice(["Fund III", "Tactical Fund", "Ten31 Terahash", "Fund II"])
|
||||
conn.execute(
|
||||
"INSERT INTO communications (id, contact_id, type, subject, body, communication_date, "
|
||||
"duration_minutes, outcome, next_action, created_by, updated_at) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
|
||||
(gen(), cid, ctype_comm,
|
||||
subj.format(fund=fund),
|
||||
body.format(person=person, org=org_name or "their firm", fund=fund, dur=random.choice([20, 30, 45])),
|
||||
past(random.randint(1, 200)), random.choice([20, 30, 45, None]),
|
||||
random.choice(["positive", "neutral", "needs follow-up"]),
|
||||
random.choice(["Send deck", "Schedule call", "Send subdocs", None]),
|
||||
uid, now()))
|
||||
|
||||
# ── fundraising grid (authoritative grid_json -> real sync path) ──
|
||||
columns = server.DEFAULT_FUNDRAISING_COLUMNS
|
||||
views = server.DEFAULT_GRID_VIEWS
|
||||
rows = []
|
||||
|
||||
# (a) overlap investors — same org as a classic investor, but a NAME-VARIANT contact
|
||||
for org_name, variant_name, match_email in overlap_specs:
|
||||
row = {"id": "row-" + gen(), "investor_name": org_name, "lead": random.choice(LEADS),
|
||||
"lead_source": random.choice(["Conference", "Warm intro", "X"]),
|
||||
"notes": f"[call] {variant_name}: discussed Fund III pacing and co-invest. Warm.\n"
|
||||
f"[email] {variant_name}: sent the energy-thesis memo.",
|
||||
"priority": random.random() < 0.4, "follow_up": random.random() < 0.5, "graveyard": False,
|
||||
"contacts": [{"name": variant_name,
|
||||
"email": match_email or f"{variant_name.split()[0].lower()}@{org_name.split()[0].lower()}.invalid",
|
||||
"title": "Managing Partner"}]}
|
||||
for fc in random.sample(FUND_COLS, k=random.randint(1, 3)):
|
||||
row[fc] = random.choice(AMOUNTS)
|
||||
rows.append(row)
|
||||
|
||||
# (b) grid-only investors (no classic counterpart) — exercise the create path
|
||||
for n in range(7):
|
||||
nm = f"{random.choice(['Slate', 'Copper', 'Ridgeline', 'Anchor', 'Falcon', 'Quarry', 'Beacon'])} " \
|
||||
f"{random.choice(['Capital', 'Partners', 'Holdings', 'Group'])}"
|
||||
row = {"id": "row-" + gen(), "investor_name": nm, "lead": random.choice(LEADS),
|
||||
"lead_source": random.choice(["Inbound", "Referral", "Podcast"]),
|
||||
"notes": f"[note] First touch with {nm}. Sourced via X. Gauging thesis fit.",
|
||||
"priority": False, "follow_up": random.random() < 0.6,
|
||||
"graveyard": n >= 5, # a couple in the graveyard list
|
||||
"contacts": [{"name": f"{random.choice(FIRST)} {random.choice(LAST)}",
|
||||
"email": f"contact{n}@{nm.split()[0].lower()}.invalid", "title": "Partner"}]}
|
||||
for fc in random.sample(FUND_COLS, k=random.randint(0, 2)):
|
||||
row[fc] = random.choice(AMOUNTS)
|
||||
rows.append(row)
|
||||
|
||||
grid = {"columns": columns, "rows": rows}
|
||||
conn.execute(
|
||||
"INSERT INTO fundraising_state (id, grid_json, views_json, version, updated_by, created_at, updated_at) "
|
||||
"VALUES ('main', ?, ?, 1, ?, ?, ?)",
|
||||
(json.dumps(grid), json.dumps(views), uid, now(), now()))
|
||||
server.sync_fundraising_relational(conn, grid, views, actor_user_id=uid)
|
||||
conn.commit()
|
||||
|
||||
# ── summary ──
|
||||
def count(t):
|
||||
return conn.execute(f"SELECT COUNT(*) FROM {t}").fetchone()[0]
|
||||
|
||||
print(f"\nSynthetic dev DB written to: {db}")
|
||||
print(" Classic model:")
|
||||
for t in ("organizations", "contacts", "opportunities", "communications", "lp_profiles"):
|
||||
print(f" {t:<24} {count(t)}")
|
||||
print(" Fundraising grid (after real sync):")
|
||||
for t in ("fundraising_investors", "fundraising_contacts", "fundraising_funds",
|
||||
"fundraising_commitments", "fundraising_list_memberships"):
|
||||
print(f" {t:<24} {count(t)}")
|
||||
print(" Phase-0 foundation tables (from migration, empty until entity resolution):")
|
||||
for t in ("canonical_entities", "entity_links", "interaction_log", "relationship_edges"):
|
||||
print(f" {t:<24} {count(t)}")
|
||||
inv = count("contacts") # note grid bridge may have created extra investor contacts (the variants)
|
||||
print(f"\n Entity-resolution test bed: {len(overlap_specs)} investors intentionally appear in BOTH models "
|
||||
f"with name variants; total contacts now {inv} (grid bridge added the variant rows).")
|
||||
conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user