Files
ten31-database/backend/scripts/seed_synthetic.py
T
Keysat dd2c34d7bc Phase 1: investor↔contacts (member_of), system status, thesis seed v1
- entity_resolution: emit member_of relationship edges (contact -> investor),
  so one investor entity owns many contacts (institution) and a HNWI is the N=1
  case; crm_tools.get_investor_contacts + get_entity contacts/member_of; MCP tool.
- seed_synthetic: multi-contact institutions to exercise it (Harbor & Vine = 5).
- server.py: GET /api/system/status (index/entity/thesis/activity health) for an
  in-app status view (no shell needed to verify the index).
- docs/thesis-seed-v1.md: grounded v1 thesis (throughline, 6 pillars, objections,
  per-segment angles, voice) drawn from Ten31's newsletter/site/essays.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 10:47:26 -05:00

295 lines
16 KiB
Python

#!/usr/bin/env python3
"""Seed a SYNTHETIC dev database for Ten31 CRM ingest/retrieval testing.
ALL DATA IS FAKE. No real LP/prospect information appears here (CLAUDE.md
guardrail #9: Claude works only on synthetic/redacted data). This produces a
realistic-shaped corpus so the Phase-0 ingest, chunking, and entity-resolution
work can be developed and tested without ever touching the live CRM.
What it builds (into a SEPARATE dev DB, never crm.db):
* The full real schema, via server.init_db() — which also runs the new
core migration (backend/migrations/), so the canonical/interaction/graph
tables exist.
* A classic-model dataset: organizations, contacts (investors + prospects),
opportunities across pipeline stages, communications with entity-rich prose
notes, and lp_profiles.
* A fundraising grid (fundraising_state.grid_json) populated via the real
sync_fundraising_relational() code path, so the normalized mirror + the
grid->classic bridge behave exactly as in production.
* DELIBERATE entity-resolution test cases: several investors appear in BOTH
models with NAME VARIANTS (e.g. "Jonathan Reyes" vs grid contact "Jon
Reyes"), some with matching email (easy merge) and some without (hard case).
Usage:
python3 backend/scripts/seed_synthetic.py # -> data/crm_dev.db
python3 backend/scripts/seed_synthetic.py --db /tmp/x.db
"""
import argparse
import datetime
import json
import os
import random
import sys
# Resolve the repo layout relative to this file (backend/scripts/ -> backend/ -> project root)
# so the script works regardless of the current working directory.
HERE = os.path.dirname(os.path.abspath(__file__))
BACKEND_DIR = os.path.dirname(HERE)
PROJECT_DIR = os.path.dirname(BACKEND_DIR)
# Put backend/ first on sys.path so main() can `import server`.
sys.path.insert(0, BACKEND_DIR)
# (organization name, organization type) pairs inserted into `organizations`.
# Order matters: main() pairs ORGS[i] with FIRST[i]/LAST[i] for each org's primary contact,
# and the first five orgs are the ones mirrored into the fundraising grid.
ORGS = [
    ("Cedar Point Capital", "family_office"),
    ("Harbor & Vine Family Office", "family_office"),
    ("Northwind Endowment", "endowment"),
    ("Granite Peak Partners", "institutional"),
    ("Solano Ventures", "wealth_management"),
    ("Tidewater Holdings", "family_office"),
    ("Brightwater Capital", "institutional"),
    ("Meridian Trust", "foundation"),
]
# First-/last-name pools. They are indexed positionally (not only sampled), so reordering
# either list changes which synthetic people are generated.
FIRST = ["Jonathan", "Katherine", "Michael", "William", "Robert", "Elena", "Priya",
         "David", "Sarah", "James", "Maria", "Thomas", "Laura", "Daniel", "Rachel",
         "Steven", "Nicole", "Andrew", "Jessica", "Brian"]
LAST = ["Reyes", "Calder", "Okonkwo", "Brandt", "Sutter", "Vance", "Mehta", "Ellison",
        "Cho", "Whitlock", "Santos", "Aldridge", "Kerr", "Nilsson", "Pope", "Devlin",
        "Frye", "Osei", "Lindqvist", "Marsh"]
# Formal first name -> nickname, used to build the NAME-VARIANT grid contacts
# (e.g. "Jonathan Reyes" vs "Jon Reyes"). Names without an entry fall back to an initial.
NICK = {"Jonathan": "Jon", "Katherine": "Kate", "Michael": "Mike", "William": "Bill",
        "Robert": "Bob", "James": "Jim", "Thomas": "Tom", "Daniel": "Dan",
        "Steven": "Steve", "Jessica": "Jess"}
# Grid row keys that may receive a commitment amount on a fundraising-grid row.
FUND_COLS = ["fund_i", "fund_ii", "fund_iii", "tactical_fund", "pawn_to_e4",
             "ten31_terahash", "sats_and_stats", "pawn_to_f4", "join_the_fold"]
# Display labels used as `fund_name` on opportunities and lp_profiles.
# Only four of the FUND_COLS keys have a label here.
FUND_LABELS = {"fund_i": "Fund I", "fund_ii": "Fund II", "fund_iii": "Fund III",
               "tactical_fund": "Tactical Fund"}
# Candidate commitment / expected amounts (plain numbers; currency is not stated in this file).
AMOUNTS = [250_000, 500_000, 1_000_000, 2_500_000, 5_000_000]
# Values sampled for the grid row "lead" field.
LEADS = ["JK", "Grant", "MB", "Parker"]
# (communication type, subject template, body template) triples for synthetic communications.
# Subjects are formatted with {fund} only; bodies may use {person}, {org}, {fund} and {dur}.
# The prose is intentionally entity-rich (people, orgs, funds) to exercise ingest/chunking.
COMM_TEMPLATES = [
    ("call", "Intro call recap",
     "Spoke with {person} ({org}) for {dur} min about {fund}. Strong interest in the "
     "bitcoin-energy and AI-infrastructure thesis; wants the latest deck and DPI figures. "
     "Flagged accreditation paperwork still outstanding. Next: send one-pager and schedule a partner call."),
    ("email", "Follow-up: {fund} allocation",
     "Sent {person} the {fund} summary and the scarcity/critical-infrastructure memo. "
     "They asked how Ten31 Terahash relates to the energy thesis. Following up next week on commitment size."),
    ("meeting", "Partner meeting notes",
     "Met {person} at {org}. Discussed pacing into {fund} and co-invest appetite. "
     "Concern about lockup; reassured on secondary options. Warm — wants to meet the GP again before committing."),
    ("note", "Diligence status",
     "{person} is mid-diligence on {fund}. Legal reviewing subscription docs; wire expected within 30 days. "
     "Keep warm; send the Q update."),
    ("text", "Quick ping",
     "Texted {person} re: the {fund} close timeline. Said they're 'in for at least a unit' pending IC approval."),
]
# Single source of truth for the classic-contact INSERT (used for primary, extra and prospect rows).
_CONTACT_INSERT = (
    "INSERT INTO contacts (id, first_name, last_name, email, title, organization_id, contact_type, "
    "status, source, notes, created_by, updated_at) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)"
)

# Probability (percent) stored on an opportunity for each pipeline stage.
# Stages not listed here fall back to 0 instead of raising KeyError.
_STAGE_PROBABILITY = {"lead": 10, "outreach": 25, "meeting": 40, "due_diligence": 60,
                      "committed": 90, "funded": 100}


def _past(days_ago):
    """Return an ISO-8601 UTC timestamp (naive form + trailing "Z") ``days_ago`` days before now."""
    # datetime.utcnow() is deprecated since Python 3.12; take an aware "now" and drop tzinfo
    # so the string keeps the exact same shape as before (no "+00:00" offset).
    utc_now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    return (utc_now - datetime.timedelta(days=days_ago)).isoformat() + "Z"


def _seed_contacts(conn, gen, now, uid, org_ids):
    """Insert classic contacts and return ``(contacts, overlap_specs)``.

    ``contacts`` is a list of ``(cid, first, last, org_name, contact_type)`` tuples
    (``org_name`` is None for org-less prospects). ``overlap_specs`` is a list of
    ``(org_name, variant_contact_name, match_email)`` for the investors that are later
    placed in the fundraising grid under a name variant; ``match_email`` is "" for the
    hard (no shared email) cases.
    """
    contacts = []
    overlap_specs = []
    used = set()

    # One "primary" investor contact per organization.
    for i, (org_name, _) in enumerate(ORGS):
        first, last = FIRST[i], LAST[i]
        used.add((first, last))
        cid = gen()
        email = f"{first.lower()}.{last.lower()}@{org_name.split()[0].lower()}.invalid"
        conn.execute(
            _CONTACT_INSERT,
            (cid, first, last, email, "Managing Partner", org_ids[org_name], "investor", "active",
             "referral", f"Primary relationship at {org_name}. Met via conference intro.", uid, now()))
        contacts.append((cid, first, last, org_name, "investor"))
        # Mark 5 of 8 for grid overlap with a NAME VARIANT.
        if i < 5:
            variant = NICK.get(first, first[0] + ".")  # nickname, else first initial
            match_email = email if i % 2 == 0 else ""  # half share email (easy), half don't (hard)
            overlap_specs.append((org_name, f"{variant} {last}", match_email))

    # Multi-contact institutions: the first two orgs get extra contacts so ONE investor
    # entity owns several people, to exercise the member_of relationship.
    # NOTE(review): these names are not added to `used`, and the same two names are reused
    # for both orgs, so "Daniel Nilsson" / "Rachel Pope" also reappear as prospects below.
    # Left unchanged in case the collision is an intended resolution test — confirm.
    for org_name in (ORGS[0][0], ORGS[1][0]):
        for k in range(2):
            fn, ln = FIRST[(k + 13) % len(FIRST)], LAST[(k + 13) % len(LAST)]
            cid = gen()
            conn.execute(
                _CONTACT_INSERT,
                (cid, fn, ln, f"{fn.lower()}.{ln.lower()}@{org_name.split()[0].lower()}.invalid",
                 random.choice(["Analyst", "Principal", "Associate"]), org_ids[org_name], "investor", "active",
                 "referral", f"Additional contact at {org_name}.", uid, now()))
            contacts.append((cid, fn, ln, org_name, "investor"))

    # Extra prospect contacts; every third one (j % 3 == 0) has no organization.
    for j in range(12):
        first = FIRST[(j + 8) % len(FIRST)]
        last = LAST[(j + 8) % len(LAST)]
        if (first, last) in used:
            last = LAST[(j + 11) % len(LAST)]
        used.add((first, last))
        org_name = ORGS[j % len(ORGS)][0] if j % 3 else None
        cid = gen()
        email = f"{first.lower()}{last.lower()}@example.invalid"
        conn.execute(
            _CONTACT_INSERT,
            (cid, first, last, email, "Principal", org_ids.get(org_name) if org_name else None,
             "prospect", "active", random.choice(["inbound", "referral", "conference", "x"]),
             f"Prospect sourced via {random.choice(['X DM', 'warm intro', 'podcast'])}.", uid, now()))
        contacts.append((cid, first, last, org_name, "prospect"))

    return contacts, overlap_specs


def _seed_activity(conn, stages, gen, now, uid, org_ids, contacts):
    """Insert opportunities, lp_profiles and communications for every seeded contact.

    ``stages`` is the pipeline-stage sequence (``server.PIPELINE_STAGES``); contacts are
    assigned stages round-robin by position. Expressions are kept inline in the parameter
    tuples so the order of ``gen()`` / ``random`` calls stays exactly as before.
    """
    for idx, (cid, first, last, org_name, ctype) in enumerate(contacts):
        person = f"{first} {last}"

        # Opportunity for most contacts (every fifth contact gets none).
        if idx % 5 != 4:
            stage = stages[idx % len(stages)]
            fund_label = random.choice(list(FUND_LABELS.values()))
            amt = random.choice(AMOUNTS)
            conn.execute(
                "INSERT INTO opportunities (id, name, contact_id, organization_id, stage, commitment_amount, "
                "expected_amount, probability, fund_name, description, next_step, owner_id, priority, updated_at) "
                "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                # Name previously ran the two parts together ("Cedar Point CapitalFund I").
                (gen(), f"{org_name or person} — {fund_label}", cid,
                 org_ids.get(org_name) if org_name else None,
                 stage, amt if stage in ("committed", "funded") else 0, amt,
                 _STAGE_PROBABILITY.get(stage, 0),
                 fund_label, f"Potential {fund_label} allocation for {person}.",
                 random.choice(["Send deck", "Schedule call", "Await IC", "Send subdocs"]),
                 uid, random.choice(["low", "medium", "high"]), now()))

        # lp_profile for ~closed investors (even-indexed investor contacts).
        if ctype == "investor" and idx % 2 == 0:
            amt = random.choice(AMOUNTS)
            conn.execute(
                "INSERT INTO lp_profiles (id, contact_id, commitment_amount, funded_amount, commitment_date, "
                "fund_name, investor_type, accredited, legal_docs_signed, wire_received, k1_sent, notes, updated_at) "
                "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                (gen(), cid, amt, amt if idx % 4 == 0 else 0, _past(120),
                 random.choice(list(FUND_LABELS.values())),
                 random.choice(["family_office", "institutional", "endowment", "individual"]),
                 1, 1 if idx % 3 else 0, 1 if idx % 4 == 0 else 0, 0,
                 f"Closed LP. Accreditation on file. Primary contact {person}.", now()))

        # 2-4 communications each, entity-rich prose.
        for _ in range(random.randint(2, 4)):
            ctype_comm, subj, body = random.choice(COMM_TEMPLATES)
            fund = random.choice(["Fund III", "Tactical Fund", "Ten31 Terahash", "Fund II"])
            conn.execute(
                "INSERT INTO communications (id, contact_id, type, subject, body, communication_date, "
                "duration_minutes, outcome, next_action, created_by, updated_at) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
                (gen(), cid, ctype_comm,
                 subj.format(fund=fund),
                 body.format(person=person, org=org_name or "their firm", fund=fund,
                             dur=random.choice([20, 30, 45])),
                 _past(random.randint(1, 200)), random.choice([20, 30, 45, None]),
                 random.choice(["positive", "neutral", "needs follow-up"]),
                 random.choice(["Send deck", "Schedule call", "Send subdocs", None]),
                 uid, now()))


def _build_grid_rows(gen, overlap_specs):
    """Build the fundraising-grid rows: overlap investors first, then grid-only investors."""
    rows = []

    # (a) Overlap investors — same org as a classic investor, but a NAME-VARIANT contact.
    for org_name, variant_name, match_email in overlap_specs:
        row = {"id": "row-" + gen(), "investor_name": org_name, "lead": random.choice(LEADS),
               "lead_source": random.choice(["Conference", "Warm intro", "X"]),
               "notes": f"[call] {variant_name}: discussed Fund III pacing and co-invest. Warm.\n"
                        f"[email] {variant_name}: sent the energy-thesis memo.",
               "priority": random.random() < 0.4, "follow_up": random.random() < 0.5, "graveyard": False,
               "contacts": [{"name": variant_name,
                             # Hard case: no shared email, so synthesize a different address.
                             "email": match_email or f"{variant_name.split()[0].lower()}@{org_name.split()[0].lower()}.invalid",
                             "title": "Managing Partner"}]}
        for fc in random.sample(FUND_COLS, k=random.randint(1, 3)):
            row[fc] = random.choice(AMOUNTS)
        rows.append(row)

    # (b) Grid-only investors (no classic counterpart) — exercise the create path.
    for n in range(7):
        nm = f"{random.choice(['Slate', 'Copper', 'Ridgeline', 'Anchor', 'Falcon', 'Quarry', 'Beacon'])} " \
             f"{random.choice(['Capital', 'Partners', 'Holdings', 'Group'])}"
        row = {"id": "row-" + gen(), "investor_name": nm, "lead": random.choice(LEADS),
               "lead_source": random.choice(["Inbound", "Referral", "Podcast"]),
               "notes": f"[note] First touch with {nm}. Sourced via X. Gauging thesis fit.",
               "priority": False, "follow_up": random.random() < 0.6,
               "graveyard": n >= 5,  # a couple in the graveyard list
               "contacts": [{"name": f"{random.choice(FIRST)} {random.choice(LAST)}",
                             "email": f"contact{n}@{nm.split()[0].lower()}.invalid", "title": "Partner"}]}
        for fc in random.sample(FUND_COLS, k=random.randint(0, 2)):
            row[fc] = random.choice(AMOUNTS)
        rows.append(row)

    return rows


def _print_summary(conn, db, n_overlap):
    """Print per-table row counts for the freshly seeded database at path ``db``."""
    def count(t):
        # Table names are fixed literals from this function, never user input.
        return conn.execute(f"SELECT COUNT(*) FROM {t}").fetchone()[0]

    print(f"\nSynthetic dev DB written to: {db}")
    print(" Classic model:")
    for t in ("organizations", "contacts", "opportunities", "communications", "lp_profiles"):
        print(f" {t:<24} {count(t)}")
    print(" Fundraising grid (after real sync):")
    for t in ("fundraising_investors", "fundraising_contacts", "fundraising_funds",
              "fundraising_commitments", "fundraising_list_memberships"):
        print(f" {t:<24} {count(t)}")
    print(" Phase-0 foundation tables (from migration, empty until entity resolution):")
    for t in ("canonical_entities", "entity_links", "interaction_log", "relationship_edges"):
        print(f" {t:<24} {count(t)}")
    inv = count("contacts")  # note grid bridge may have created extra investor contacts (the variants)
    print(f"\n Entity-resolution test bed: {n_overlap} investors intentionally appear in BOTH models "
          f"with name variants; total contacts now {inv} (grid bridge added the variant rows).")


def main():
    """Rebuild the synthetic dev database from scratch and print a row-count summary.

    Command line: ``--db PATH`` (default ``<project>/data/crm_dev.db``).

    Any existing file at the target path (plus its ``-wal`` / ``-shm`` sidecars) is deleted
    first. Exits via ``sys.exit`` with a message if the target file is named ``crm.db``.
    The random generator is seeded with a fixed value so repeated runs draw the same
    sequence of random choices.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=os.path.join(PROJECT_DIR, "data", "crm_dev.db"))
    args = ap.parse_args()
    db = os.path.abspath(args.db)
    if os.path.basename(db) == "crm.db":
        sys.exit("Refusing to seed the real crm.db. Use a dev path like data/crm_dev.db.")

    # Start from a clean slate, including SQLite's WAL/shared-memory sidecar files.
    for ext in ("", "-wal", "-shm"):
        if os.path.exists(db + ext):
            os.remove(db + ext)
    # Make sure the target directory exists (e.g. data/ on a fresh checkout).
    os.makedirs(os.path.dirname(db), exist_ok=True)

    os.environ["CRM_DB_PATH"] = db
    import server  # noqa: E402 (must follow CRM_DB_PATH assignment)
    server.init_db()
    gen, now = server.generate_id, server.now
    conn = server.get_db()
    try:
        random.seed(31)

        # ── dev user (FK target for created_by/owner_id) ──
        uid = gen()
        conn.execute(
            "INSERT INTO users (id, username, email, password_hash, full_name, role) VALUES (?,?,?,?,?,?)",
            (uid, "dev_admin", "dev@example.invalid", server.hash_password("devpassword"),
             "Dev Admin", "admin"))

        # ── organizations ──
        org_ids = {}
        for name, otype in ORGS:
            oid = gen()
            org_ids[name] = oid
            conn.execute(
                "INSERT INTO organizations (id, name, type, industry, country, description, created_by, updated_at) "
                "VALUES (?,?,?,?,?,?,?,?)",
                (oid, name, otype, "Investment Management", "USA",
                 f"{name} — synthetic {otype.replace('_', ' ')} used for ingest testing.", uid, now()))

        # ── classic contacts (investors + prospects) ──
        contacts, overlap_specs = _seed_contacts(conn, gen, now, uid, org_ids)

        # ── opportunities + lp_profiles + communications ──
        _seed_activity(conn, server.PIPELINE_STAGES, gen, now, uid, org_ids, contacts)

        # ── fundraising grid (authoritative grid_json -> real sync path) ──
        columns = server.DEFAULT_FUNDRAISING_COLUMNS
        views = server.DEFAULT_GRID_VIEWS
        rows = _build_grid_rows(gen, overlap_specs)
        grid = {"columns": columns, "rows": rows}
        conn.execute(
            "INSERT INTO fundraising_state (id, grid_json, views_json, version, updated_by, created_at, updated_at) "
            "VALUES ('main', ?, ?, 1, ?, ?, ?)",
            (json.dumps(grid), json.dumps(views), uid, now(), now()))
        server.sync_fundraising_relational(conn, grid, views, actor_user_id=uid)
        conn.commit()

        # ── summary ──
        _print_summary(conn, db, len(overlap_specs))
    finally:
        # Always release the connection, even if seeding or the sync raised part-way.
        conn.close()
# Run the seeder only when executed as a script, not when the module is imported.
if __name__ == "__main__":
    main()