#!/usr/bin/env python3
"""Seed a SYNTHETIC dev database for Ten31 CRM ingest/retrieval testing.

ALL DATA IS FAKE. No real LP/prospect information appears here (CLAUDE.md
guardrail #9: Claude works only on synthetic/redacted data). This produces a
realistic-shaped corpus so the Phase-0 ingest, chunking, and entity-resolution
work can be developed and tested without ever touching the live CRM.

What it builds (into a SEPARATE dev DB, never crm.db):
  * The full real schema, via server.init_db() — which also runs the new core
    migration (backend/migrations/), so the canonical/interaction/graph tables
    exist.
  * A classic-model dataset: organizations, contacts (investors + prospects),
    opportunities across pipeline stages, communications with entity-rich
    prose notes, and lp_profiles.
  * A fundraising grid (fundraising_state.grid_json) populated via the real
    sync_fundraising_relational() code path, so the normalized mirror + the
    grid->classic bridge behave exactly as in production.
  * DELIBERATE entity-resolution test cases: several investors appear in BOTH
    models with NAME VARIANTS (e.g. "Jonathan Reyes" vs grid contact
    "Jon Reyes"), some with matching email (easy merge) and some without
    (hard case).

Classic-model contact names are unique by construction, so the only name
ambiguity in the corpus is the deliberate classic-vs-grid variant set above.

Usage:
    python3 backend/scripts/seed_synthetic.py            # -> data/crm_dev.db
    python3 backend/scripts/seed_synthetic.py --db /tmp/x.db
"""
import argparse
import datetime
import json
import os
import random
import sys

HERE = os.path.dirname(os.path.abspath(__file__))
BACKEND_DIR = os.path.dirname(HERE)
PROJECT_DIR = os.path.dirname(BACKEND_DIR)
# `server` lives in backend/, one level above this scripts/ directory.
sys.path.insert(0, BACKEND_DIR)

ORGS = [
    ("Cedar Point Capital", "family_office"),
    ("Harbor & Vine Family Office", "family_office"),
    ("Northwind Endowment", "endowment"),
    ("Granite Peak Partners", "institutional"),
    ("Solano Ventures", "wealth_management"),
    ("Tidewater Holdings", "family_office"),
    ("Brightwater Capital", "institutional"),
    ("Meridian Trust", "foundation"),
]
FIRST = ["Jonathan", "Katherine", "Michael", "William", "Robert", "Elena", "Priya", "David",
         "Sarah", "James", "Maria", "Thomas", "Laura", "Daniel", "Rachel", "Steven",
         "Nicole", "Andrew", "Jessica", "Brian"]
LAST = ["Reyes", "Calder", "Okonkwo", "Brandt", "Sutter", "Vance", "Mehta", "Ellison",
        "Cho", "Whitlock", "Santos", "Aldridge", "Kerr", "Nilsson", "Pope", "Devlin",
        "Frye", "Osei", "Lindqvist", "Marsh"]
NICK = {"Jonathan": "Jon", "Katherine": "Kate", "Michael": "Mike", "William": "Bill",
        "Robert": "Bob", "James": "Jim", "Thomas": "Tom", "Daniel": "Dan",
        "Steven": "Steve", "Jessica": "Jess"}
FUND_COLS = ["fund_i", "fund_ii", "fund_iii", "tactical_fund", "pawn_to_e4",
             "ten31_terahash", "sats_and_stats", "pawn_to_f4", "join_the_fold"]
FUND_LABELS = {"fund_i": "Fund I", "fund_ii": "Fund II", "fund_iii": "Fund III",
               "tactical_fund": "Tactical Fund"}
AMOUNTS = [250_000, 500_000, 1_000_000, 2_500_000, 5_000_000]
LEADS = ["JK", "Grant", "MB", "Parker"]

# Win probability (percent) written onto an opportunity for each pipeline stage.
# Lookup is deliberately strict: a stage missing here raises KeyError rather
# than silently seeding a made-up probability.
STAGE_PROBABILITY = {"lead": 10, "outreach": 25, "meeting": 40, "due_diligence": 60,
                     "committed": 90, "funded": 100}

# Shared by the three classic-contact sections (primary / extra / prospect).
CONTACT_INSERT_SQL = (
    "INSERT INTO contacts (id, first_name, last_name, email, title, organization_id, contact_type, "
    "status, source, notes, created_by, updated_at) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)")

# (communication type, subject template, body template)
COMM_TEMPLATES = [
    ("call", "Intro call recap",
     "Spoke with {person} ({org}) for {dur} min about {fund}. Strong interest in the "
     "bitcoin-energy and AI-infrastructure thesis; wants the latest deck and DPI figures. "
     "Flagged accreditation paperwork still outstanding. Next: send one-pager and schedule a partner call."),
    ("email", "Follow-up: {fund} allocation",
     "Sent {person} the {fund} summary and the scarcity/critical-infrastructure memo. "
     "They asked how Ten31 Terahash relates to the energy thesis. Following up next week on commitment size."),
    ("meeting", "Partner meeting notes",
     "Met {person} at {org}. Discussed pacing into {fund} and co-invest appetite. "
     "Concern about lockup; reassured on secondary options. Warm — wants to meet the GP again before committing."),
    ("note", "Diligence status",
     "{person} is mid-diligence on {fund}. Legal reviewing subscription docs; wire expected within 30 days. "
     "Keep warm; send the Q update."),
    ("text", "Quick ping",
     "Texted {person} re: the {fund} close timeline. Said they're 'in for at least a unit' pending IC approval."),
]


def is_protected_db(path):
    """Return True when *path* names the live CRM database file (``crm.db``).

    The comparison ignores case: on a case-insensitive filesystem ``CRM.db``
    and ``crm.db`` are the same file, and this script deletes its target
    before seeding, so the guard errs on the side of refusing.
    """
    return os.path.basename(path).lower() == "crm.db"


def iso_days_ago(days_ago, reference=None):
    """Return the UTC instant *days_ago* days before *reference* as ``...Z`` ISO-8601 text.

    Args:
        days_ago: Number of days to subtract.
        reference: Naive datetime interpreted as UTC. Defaults to the current
            UTC time.

    Returns:
        ``datetime.isoformat()`` of the resulting naive UTC value with a
        literal ``"Z"`` appended.
    """
    if reference is None:
        # datetime.utcnow() is deprecated (3.12+). Take an aware "now" and drop
        # tzinfo so isoformat() does not emit "+00:00" ahead of the "Z" suffix.
        reference = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    return (reference - datetime.timedelta(days=days_ago)).isoformat() + "Z"


def pick_unique_name(index, used):
    """Pick a (first, last) pair not yet in *used*, record it there, and return it.

    The first name is ``FIRST[index]``. The surname starts at ``LAST[index]``
    and, on a collision, advances three slots at a time (wrapping) until the
    pair is unused.

    Args:
        index: Position into FIRST/LAST (taken modulo each list's length).
        used: Set of already-assigned (first, last) pairs; mutated in place.

    Raises:
        ValueError: if every surname probed for this first name is taken.
    """
    first = FIRST[index % len(FIRST)]
    for step in range(len(LAST)):
        last = LAST[(index + 3 * step) % len(LAST)]
        if (first, last) not in used:
            used.add((first, last))
            return first, last
    raise ValueError(f"no unused surname left for first name {first!r}")


def populate(server, conn):
    """Insert the whole synthetic corpus through *conn* and commit it.

    Args:
        server: The imported backend ``server`` module (supplies id/timestamp
            helpers, password hashing, pipeline/grid defaults and the
            fundraising sync routine).
        conn: Open database connection obtained from ``server.get_db()``.

    Returns:
        Number of investors deliberately placed in BOTH the classic model and
        the fundraising grid under a name variant.
    """
    gen, now = server.generate_id, server.now
    # Fixed seed: every run draws the same sequence from `random`.
    random.seed(31)
    fund_labels = list(FUND_LABELS.values())

    # ── dev user (FK target for created_by/owner_id) ──
    uid = gen()
    conn.execute(
        "INSERT INTO users (id, username, email, password_hash, full_name, role) VALUES (?,?,?,?,?,?)",
        (uid, "dev_admin", "dev@example.invalid", server.hash_password("devpassword"),
         "Dev Admin", "admin"))

    def insert_contact(cid, first, last, email, title, org_id, contact_type, source, notes):
        """Insert one classic contact row (status is always 'active')."""
        conn.execute(
            CONTACT_INSERT_SQL,
            (cid, first, last, email, title, org_id, contact_type, "active", source, notes,
             uid, now()))

    # ── organizations ──
    org_ids = {}
    for name, otype in ORGS:
        oid = gen()
        org_ids[name] = oid
        conn.execute(
            "INSERT INTO organizations (id, name, type, industry, country, description, created_by, updated_at) "
            "VALUES (?,?,?,?,?,?,?,?)",
            (oid, name, otype, "Investment Management", "USA",
             f"{name} — synthetic {otype.replace('_', ' ')} used for ingest testing.", uid, now()))

    # ── classic contacts (investors + prospects) ──
    contacts = []       # (cid, first, last, org_name, contact_type)
    overlap_specs = []  # investors we will also place in the grid, with variants
    used = set()        # every (first, last) assigned so far, across all three sections

    for i, (org_name, _) in enumerate(ORGS):
        # one "primary" investor contact per org
        first, last = pick_unique_name(i, used)
        cid = gen()
        email = f"{first.lower()}.{last.lower()}@{org_name.split()[0].lower()}.invalid"
        insert_contact(cid, first, last, email, "Managing Partner", org_ids[org_name], "investor",
                       "referral", f"Primary relationship at {org_name}. Met via conference intro.")
        contacts.append((cid, first, last, org_name, "investor"))
        # mark 5 of 8 for grid overlap with a NAME VARIANT
        if i < 5:
            variant = NICK.get(first, first[0] + ".")   # nickname or initial
            match_email = email if i % 2 == 0 else ""   # half share email (easy), half don't (hard)
            overlap_specs.append((org_name, f"{variant} {last}", match_email))

    # Multi-contact institutions: the first two orgs get extra contacts so ONE
    # investor entity owns several people (a family office / institution), to
    # exercise the member_of relationship. (A HNWI stays a 1-contact investor.)
    # Each org draws from its own pair of name slots so the two institutions
    # do not end up employing identically named people.
    for org_pos, org_name in enumerate((ORGS[0][0], ORGS[1][0])):
        for k in range(2):
            fn, ln = pick_unique_name(13 + 2 * org_pos + k, used)
            cid = gen()
            email = f"{fn.lower()}.{ln.lower()}@{org_name.split()[0].lower()}.invalid"
            title = random.choice(["Analyst", "Principal", "Associate"])
            insert_contact(cid, fn, ln, email, title, org_ids[org_name], "investor", "referral",
                           f"Additional contact at {org_name}.")
            contacts.append((cid, fn, ln, org_name, "investor"))

    # extra prospect contacts (no org sometimes)
    for j in range(12):
        # The extras above already occupy some of these name slots; the helper
        # moves a colliding prospect onto a different surname.
        first, last = pick_unique_name(j + 8, used)
        org_name = ORGS[j % len(ORGS)][0] if j % 3 else None
        cid = gen()
        email = f"{first.lower()}{last.lower()}@example.invalid"
        source = random.choice(["inbound", "referral", "conference", "x"])
        channel = random.choice(['X DM', 'warm intro', 'podcast'])
        insert_contact(cid, first, last, email, "Principal",
                       org_ids.get(org_name) if org_name else None, "prospect", source,
                       f"Prospect sourced via {channel}.")
        contacts.append((cid, first, last, org_name, "prospect"))

    # ── opportunities + lp_profiles + communications ──
    stages = server.PIPELINE_STAGES
    for idx, (cid, first, last, org_name, ctype) in enumerate(contacts):
        person = f"{first} {last}"

        # opportunity for most contacts (every fifth one is skipped)
        if idx % 5 != 4:
            stage = stages[idx % len(stages)]
            fund_label = random.choice(fund_labels)
            amt = random.choice(AMOUNTS)
            opp_id = gen()
            next_step = random.choice(["Send deck", "Schedule call", "Await IC", "Send subdocs"])
            priority = random.choice(["low", "medium", "high"])
            conn.execute(
                "INSERT INTO opportunities (id, name, contact_id, organization_id, stage, commitment_amount, "
                "expected_amount, probability, fund_name, description, next_step, owner_id, priority, updated_at) "
                "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                (opp_id, f"{org_name or person} — {fund_label}", cid,
                 org_ids.get(org_name) if org_name else None, stage,
                 amt if stage in ("committed", "funded") else 0, amt,
                 STAGE_PROBABILITY[stage], fund_label,
                 f"Potential {fund_label} allocation for {person}.",
                 next_step, uid, priority, now()))

        # lp_profile for ~closed investors
        if ctype == "investor" and idx % 2 == 0:
            amt = random.choice(AMOUNTS)
            lp_id = gen()
            lp_fund = random.choice(fund_labels)
            investor_type = random.choice(["family_office", "institutional", "endowment", "individual"])
            conn.execute(
                "INSERT INTO lp_profiles (id, contact_id, commitment_amount, funded_amount, commitment_date, "
                "fund_name, investor_type, accredited, legal_docs_signed, wire_received, k1_sent, notes, updated_at) "
                "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                (lp_id, cid, amt, amt if idx % 4 == 0 else 0, iso_days_ago(120),
                 lp_fund, investor_type, 1, 1 if idx % 3 else 0, 1 if idx % 4 == 0 else 0, 0,
                 f"Closed LP. Accreditation on file. Primary contact {person}.", now()))

        # 2-4 communications each, entity-rich prose
        for _ in range(random.randint(2, 4)):
            ctype_comm, subj, body = random.choice(COMM_TEMPLATES)
            fund = random.choice(["Fund III", "Tactical Fund", "Ten31 Terahash", "Fund II"])
            comm_id = gen()
            # NOTE: the duration quoted in the prose and the duration_minutes
            # column are drawn independently, so they may disagree.
            text = body.format(person=person, org=org_name or "their firm", fund=fund,
                               dur=random.choice([20, 30, 45]))
            when = iso_days_ago(random.randint(1, 200))
            duration = random.choice([20, 30, 45, None])
            outcome = random.choice(["positive", "neutral", "needs follow-up"])
            next_action = random.choice(["Send deck", "Schedule call", "Send subdocs", None])
            conn.execute(
                "INSERT INTO communications (id, contact_id, type, subject, body, communication_date, "
                "duration_minutes, outcome, next_action, created_by, updated_at) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
                (comm_id, cid, ctype_comm, subj.format(fund=fund), text, when, duration, outcome,
                 next_action, uid, now()))

    # ── fundraising grid (authoritative grid_json -> real sync path) ──
    columns = server.DEFAULT_FUNDRAISING_COLUMNS
    views = server.DEFAULT_GRID_VIEWS
    rows = []

    # (a) overlap investors — same org as a classic investor, but a NAME-VARIANT contact
    for org_name, variant_name, match_email in overlap_specs:
        row = {"id": "row-" + gen(), "investor_name": org_name,
               "lead": random.choice(LEADS),
               "lead_source": random.choice(["Conference", "Warm intro", "X"]),
               "notes": f"[call] {variant_name}: discussed Fund III pacing and co-invest. Warm.\n"
                        f"[email] {variant_name}: sent the energy-thesis memo.",
               "priority": random.random() < 0.4,
               "follow_up": random.random() < 0.5,
               "graveyard": False,
               "contacts": [{"name": variant_name,
                             # hard case: no shared email, so invent a nickname-based one
                             "email": match_email
                             or f"{variant_name.split()[0].lower()}@{org_name.split()[0].lower()}.invalid",
                             "title": "Managing Partner"}]}
        for fc in random.sample(FUND_COLS, k=random.randint(1, 3)):
            row[fc] = random.choice(AMOUNTS)
        rows.append(row)

    # (b) grid-only investors (no classic counterpart) — exercise the create path
    for n in range(7):
        nm = f"{random.choice(['Slate', 'Copper', 'Ridgeline', 'Anchor', 'Falcon', 'Quarry', 'Beacon'])} " \
             f"{random.choice(['Capital', 'Partners', 'Holdings', 'Group'])}"
        row = {"id": "row-" + gen(), "investor_name": nm,
               "lead": random.choice(LEADS),
               "lead_source": random.choice(["Inbound", "Referral", "Podcast"]),
               "notes": f"[note] First touch with {nm}. Sourced via X. Gauging thesis fit.",
               "priority": False,
               "follow_up": random.random() < 0.6,
               "graveyard": n >= 5,  # a couple in the graveyard list
               "contacts": [{"name": f"{random.choice(FIRST)} {random.choice(LAST)}",
                             "email": f"contact{n}@{nm.split()[0].lower()}.invalid",
                             "title": "Partner"}]}
        for fc in random.sample(FUND_COLS, k=random.randint(0, 2)):
            row[fc] = random.choice(AMOUNTS)
        rows.append(row)

    grid = {"columns": columns, "rows": rows}
    conn.execute(
        "INSERT INTO fundraising_state (id, grid_json, views_json, version, updated_by, created_at, updated_at) "
        "VALUES ('main', ?, ?, 1, ?, ?, ?)",
        (json.dumps(grid), json.dumps(views), uid, now(), now()))
    server.sync_fundraising_relational(conn, grid, views, actor_user_id=uid)
    conn.commit()
    return len(overlap_specs)


def print_summary(conn, db, overlap_count):
    """Print per-table row counts for the freshly seeded database at path *db*."""

    def count(t):
        # Table names are the hard-coded literals below, never user input.
        return conn.execute(f"SELECT COUNT(*) FROM {t}").fetchone()[0]

    print(f"\nSynthetic dev DB written to: {db}")
    print(" Classic model:")
    for t in ("organizations", "contacts", "opportunities", "communications", "lp_profiles"):
        print(f" {t:<24} {count(t)}")
    print(" Fundraising grid (after real sync):")
    for t in ("fundraising_investors", "fundraising_contacts", "fundraising_funds",
              "fundraising_commitments", "fundraising_list_memberships"):
        print(f" {t:<24} {count(t)}")
    print(" Phase-0 foundation tables (from migration, empty until entity resolution):")
    for t in ("canonical_entities", "entity_links", "interaction_log", "relationship_edges"):
        print(f" {t:<24} {count(t)}")
    # note grid bridge may have created extra investor contacts (the variants)
    total_contacts = count("contacts")
    print(f"\n Entity-resolution test bed: {overlap_count} investors intentionally appear in BOTH models "
          f"with name variants; total contacts now {total_contacts} (grid bridge added the variant rows).")


def main():
    """CLI entry point: rebuild the synthetic dev database from scratch.

    Parses ``--db`` (default ``<project>/data/crm_dev.db``), refuses to run
    against a file named ``crm.db``, deletes any existing database file and its
    ``-wal``/``-shm`` siblings at that path, then creates the schema, seeds the
    data and prints a summary.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=os.path.join(PROJECT_DIR, "data", "crm_dev.db"))
    args = ap.parse_args()

    db = os.path.abspath(args.db)
    if is_protected_db(db):
        sys.exit("Refusing to seed the real crm.db. Use a dev path like data/crm_dev.db.")
    for ext in ("", "-wal", "-shm"):
        if os.path.exists(db + ext):
            os.remove(db + ext)

    os.environ["CRM_DB_PATH"] = db
    import server  # noqa: E402 (must follow CRM_DB_PATH assignment)
    server.init_db()

    conn = server.get_db()
    try:
        overlap_count = populate(server, conn)
        print_summary(conn, db, overlap_count)
    finally:
        # Release the handle even when seeding fails part-way through.
        conn.close()


if __name__ == "__main__":
    main()