ten31-database/backend/scripts/seed_synthetic.py

#!/usr/bin/env python3
"""Seed a SYNTHETIC dev database for Ten31 CRM ingest/retrieval testing.

ALL DATA IS FAKE. No real LP/prospect information appears here (CLAUDE.md
guardrail #9: Claude works only on synthetic/redacted data). This produces a
realistic-shaped corpus so the Phase-0 ingest, chunking, and entity-resolution
work can be developed and tested without ever touching the live CRM.

What it builds (into a SEPARATE dev DB, never crm.db):
  * The full real schema, via server.init_db() — which also runs the new
    core migration (backend/migrations/), so the canonical/interaction/graph
    tables exist.
  * A classic-model dataset: organizations, contacts (investors + prospects),
    opportunities across pipeline stages, communications with entity-rich prose
    notes, and lp_profiles.
  * A fundraising grid (fundraising_state.grid_json) populated via the real
    sync_fundraising_relational() code path, so the normalized mirror + the
    grid->classic bridge behave exactly as in production.
  * DELIBERATE entity-resolution test cases: several investors appear in BOTH
    models with NAME VARIANTS (e.g. "Jonathan Reyes" vs grid contact "Jon
    Reyes"), some with matching email (easy merge) and some without (hard case).

Usage:
    python3 backend/scripts/seed_synthetic.py                 # -> data/crm_dev.db
    python3 backend/scripts/seed_synthetic.py --db /tmp/x.db
"""
import argparse
import datetime
import json
import os
import random
import sys

# Locate this script's directory, its parent (backend/), and the project root,
# then put backend/ first on sys.path so main() can `import server`.
HERE = os.path.dirname(os.path.abspath(__file__))
BACKEND_DIR = os.path.dirname(HERE)
PROJECT_DIR = os.path.dirname(BACKEND_DIR)
sys.path.insert(0, BACKEND_DIR)

# (organization name, organization type) pairs. main() inserts one organization
# row and one "primary" investor contact per entry, in this order.
ORGS = [
    ("Cedar Point Capital", "family_office"),
    ("Harbor & Vine Family Office", "family_office"),
    ("Northwind Endowment", "endowment"),
    ("Granite Peak Partners", "institutional"),
    ("Solano Ventures", "wealth_management"),
    ("Tidewater Holdings", "family_office"),
    ("Brightwater Capital", "institutional"),
    ("Meridian Trust", "foundation"),
]

# Fake first/last name pools. main() indexes these positionally (not only at
# random), so reordering either list changes which contacts are generated.
FIRST = ["Jonathan", "Katherine", "Michael", "William", "Robert", "Elena", "Priya",
         "David", "Sarah", "James", "Maria", "Thomas", "Laura", "Daniel", "Rachel",
         "Steven", "Nicole", "Andrew", "Jessica", "Brian"]
LAST = ["Reyes", "Calder", "Okonkwo", "Brandt", "Sutter", "Vance", "Mehta", "Ellison",
        "Cho", "Whitlock", "Santos", "Aldridge", "Kerr", "Nilsson", "Pope", "Devlin",
        "Frye", "Osei", "Lindqvist", "Marsh"]
# First name -> nickname, used to build the NAME-VARIANT grid contacts; names
# missing from this map fall back to a first initial in main().
NICK = {"Jonathan": "Jon", "Katherine": "Kate", "Michael": "Mike", "William": "Bill",
        "Robert": "Bob", "James": "Jim", "Thomas": "Tom", "Daniel": "Dan",
        "Steven": "Steve", "Jessica": "Jess"}

# Grid row keys that main() fills with a random amount for a random subset per row.
FUND_COLS = ["fund_i", "fund_ii", "fund_iii", "tactical_fund", "pawn_to_e4",
             "ten31_terahash", "sats_and_stats", "pawn_to_f4", "join_the_fold"]
# Display labels used as fund_name on opportunities and lp_profiles.
# Only four of the FUND_COLS keys have a label here.
FUND_LABELS = {"fund_i": "Fund I", "fund_ii": "Fund II", "fund_iii": "Fund III",
               "tactical_fund": "Tactical Fund"}
# Pool of amounts drawn for opportunities, lp_profiles, and grid fund columns.
AMOUNTS = [250_000, 500_000, 1_000_000, 2_500_000, 5_000_000]
# Values drawn for each grid row's "lead" field.
LEADS = ["JK", "Grant", "MB", "Parker"]

# (communication type, subject template, body template). main() fills the
# {person}, {org}, {fund} and {dur} placeholders with str.format().
COMM_TEMPLATES = [
    ("call", "Intro call recap",
     "Spoke with {person} ({org}) for {dur} min about {fund}. Strong interest in the "
     "bitcoin-energy and AI-infrastructure thesis; wants the latest deck and DPI figures. "
     "Flagged accreditation paperwork still outstanding. Next: send one-pager and schedule a partner call."),
    ("email", "Follow-up: {fund} allocation",
     "Sent {person} the {fund} summary and the scarcity/critical-infrastructure memo. "
     "They asked how Ten31 Terahash relates to the energy thesis. Following up next week on commitment size."),
    ("meeting", "Partner meeting notes",
     "Met {person} at {org}. Discussed pacing into {fund} and co-invest appetite. "
     "Concern about lockup; reassured on secondary options. Warm — wants to meet the GP again before committing."),
    ("note", "Diligence status",
     "{person} is mid-diligence on {fund}. Legal reviewing subscription docs; wire expected within 30 days. "
     "Keep warm; send the Q update."),
    ("text", "Quick ping",
     "Texted {person} re: the {fund} close timeline. Said they're 'in for at least a unit' pending IC approval."),
]


# Insert statement shared by every classic-model contact (primary, additional, prospect).
_CONTACT_INSERT_SQL = (
    "INSERT INTO contacts (id, first_name, last_name, email, title, organization_id, contact_type, "
    "status, source, notes, created_by, updated_at) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)"
)

# Probability (percent) seeded per pipeline stage. Stages not listed here fall
# back to 0 rather than raising, in case server.PIPELINE_STAGES contains others.
_STAGE_PROBABILITY = {"lead": 10, "outreach": 25, "meeting": 40, "due_diligence": 60,
                      "committed": 90, "funded": 100}

# The database file plus the SQLite sidecar files that must be cleared with it.
_DB_FILE_SUFFIXES = ("", "-wal", "-shm", "-journal")

_LIVE_DB_NAME = "crm.db"


def _is_live_db_path(path):
    """Return True if *path* names the live crm.db, directly or via symlinks.

    The comparison is case-insensitive because on case-insensitive filesystems
    "CRM.db" and "crm.db" are the same file, and this script deletes its target.
    """
    candidates = (os.path.abspath(path), os.path.realpath(path))
    return any(os.path.basename(p).lower() == _LIVE_DB_NAME for p in candidates)


def _iso_days_ago(days_ago):
    """Return the UTC timestamp *days_ago* days before now as naive ISO-8601 + "Z"."""
    moment = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=days_ago)
    # Drop tzinfo so isoformat() emits no "+00:00" and the suffix stays a bare "Z".
    return moment.replace(tzinfo=None).isoformat() + "Z"


def _seed_dev_user(conn, server):
    """Insert the dev admin user (FK target for created_by/owner_id); return its id."""
    uid = server.generate_id()
    conn.execute(
        "INSERT INTO users (id, username, email, password_hash, full_name, role) VALUES (?,?,?,?,?,?)",
        (uid, "dev_admin", "dev@example.invalid", server.hash_password("devpassword"),
         "Dev Admin", "admin"))
    return uid


def _seed_organizations(conn, server, uid):
    """Insert one organization per ORGS entry; return {org name: org id}."""
    gen, now = server.generate_id, server.now
    org_ids = {}
    for name, otype in ORGS:
        oid = gen()
        org_ids[name] = oid
        conn.execute(
            "INSERT INTO organizations (id, name, type, industry, country, description, created_by, updated_at) "
            "VALUES (?,?,?,?,?,?,?,?)",
            (oid, name, otype, "Investment Management", "USA",
             f"{name} — synthetic {otype.replace('_', ' ')} used for ingest testing.", uid, now()))
    return org_ids


def _seed_contacts(conn, server, uid, org_ids):
    """Insert classic-model contacts (investors + prospects).

    Returns:
        (contacts, overlap_specs) where contacts is a list of
        (contact id, first, last, org name or None, contact type) and
        overlap_specs is a list of (org name, variant contact name, email or "")
        describing the investors that must also appear in the grid.
    """
    gen, now = server.generate_id, server.now
    contacts = []
    overlap_specs = []
    used = set()

    # One "primary" investor contact per org.
    for i, (org_name, _) in enumerate(ORGS):
        first, last = FIRST[i], LAST[i]
        used.add((first, last))
        cid = gen()
        email = f"{first.lower()}.{last.lower()}@{org_name.split()[0].lower()}.invalid"
        conn.execute(
            _CONTACT_INSERT_SQL,
            (cid, first, last, email, "Managing Partner", org_ids[org_name], "investor", "active",
             "referral", f"Primary relationship at {org_name}. Met via conference intro.", uid, now()))
        contacts.append((cid, first, last, org_name, "investor"))
        # The first 5 orgs also go into the grid under a NAME VARIANT.
        if i < 5:
            variant = NICK.get(first, first[0] + ".")  # nickname or initial
            # Even indices reuse the classic email (easy merge); odd ones do not (hard case).
            match_email = email if i % 2 == 0 else ""
            overlap_specs.append((org_name, f"{variant} {last}", match_email))

    # Multi-contact institutions: the first two orgs get extra contacts so ONE
    # investor entity owns several people, to exercise the member_of relationship.
    # NOTE(review): both orgs receive the same two names (FIRST/LAST index 13 and
    # 14), and they are not added to `used`, so the prospect loop below emits
    # those names again. Confirm these duplicates are intended test cases.
    for org_name in (ORGS[0][0], ORGS[1][0]):
        for k in range(2):
            fn, ln = FIRST[(k + 13) % len(FIRST)], LAST[(k + 13) % len(LAST)]
            cid = gen()
            conn.execute(
                _CONTACT_INSERT_SQL,
                (cid, fn, ln, f"{fn.lower()}.{ln.lower()}@{org_name.split()[0].lower()}.invalid",
                 random.choice(["Analyst", "Principal", "Associate"]), org_ids[org_name], "investor", "active",
                 "referral", f"Additional contact at {org_name}.", uid, now()))
            contacts.append((cid, fn, ln, org_name, "investor"))

    # Extra prospect contacts; every third one has no organization.
    for j in range(12):
        first = FIRST[(j + 8) % len(FIRST)]
        last = LAST[(j + 8) % len(LAST)]
        if (first, last) in used:
            last = LAST[(j + 11) % len(LAST)]
        used.add((first, last))
        org_name = ORGS[j % len(ORGS)][0] if j % 3 else None
        cid = gen()
        email = f"{first.lower()}{last.lower()}@example.invalid"
        conn.execute(
            _CONTACT_INSERT_SQL,
            (cid, first, last, email, "Principal", org_ids.get(org_name) if org_name else None,
             "prospect", "active", random.choice(["inbound", "referral", "conference", "x"]),
             f"Prospect sourced via {random.choice(['X DM', 'warm intro', 'podcast'])}.", uid, now()))
        contacts.append((cid, first, last, org_name, "prospect"))

    return contacts, overlap_specs


def _seed_activity(conn, server, uid, org_ids, contacts):
    """Insert opportunities, lp_profiles and communications for each contact."""
    gen, now = server.generate_id, server.now
    stages = server.PIPELINE_STAGES
    fund_labels = list(FUND_LABELS.values())

    for idx, (cid, first, last, org_name, ctype) in enumerate(contacts):
        person = f"{first} {last}"
        org_id = org_ids.get(org_name) if org_name else None

        # Opportunity for four out of every five contacts.
        if idx % 5 != 4:
            stage = stages[idx % len(stages)]
            fund_label = random.choice(fund_labels)
            amt = random.choice(AMOUNTS)
            conn.execute(
                "INSERT INTO opportunities (id, name, contact_id, organization_id, stage, commitment_amount, "
                "expected_amount, probability, fund_name, description, next_step, owner_id, priority, updated_at) "
                "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                (gen(), f"{org_name or person} — {fund_label}", cid, org_id,
                 stage, amt if stage in ("committed", "funded") else 0, amt,
                 _STAGE_PROBABILITY.get(stage, 0),
                 fund_label, f"Potential {fund_label} allocation for {person}.",
                 random.choice(["Send deck", "Schedule call", "Await IC", "Send subdocs"]),
                 uid, random.choice(["low", "medium", "high"]), now()))

        # lp_profile for every other investor (~closed LPs).
        if ctype == "investor" and idx % 2 == 0:
            amt = random.choice(AMOUNTS)
            conn.execute(
                "INSERT INTO lp_profiles (id, contact_id, commitment_amount, funded_amount, commitment_date, "
                "fund_name, investor_type, accredited, legal_docs_signed, wire_received, k1_sent, notes, updated_at) "
                "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                (gen(), cid, amt, amt if idx % 4 == 0 else 0, _iso_days_ago(120),
                 random.choice(fund_labels),
                 random.choice(["family_office", "institutional", "endowment", "individual"]),
                 1, 1 if idx % 3 else 0, 1 if idx % 4 == 0 else 0, 0,
                 f"Closed LP. Accreditation on file. Primary contact {person}.", now()))

        # 2-4 communications each, entity-rich prose.
        # NOTE(review): the {dur} written into the body and duration_minutes are
        # drawn independently and can disagree; kept as-is so the seeded random
        # stream (and therefore the generated dataset) is unchanged.
        for _ in range(random.randint(2, 4)):
            comm_type, subj, body = random.choice(COMM_TEMPLATES)
            fund = random.choice(["Fund III", "Tactical Fund", "Ten31 Terahash", "Fund II"])
            conn.execute(
                "INSERT INTO communications (id, contact_id, type, subject, body, communication_date, "
                "duration_minutes, outcome, next_action, created_by, updated_at) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
                (gen(), cid, comm_type,
                 subj.format(fund=fund),
                 body.format(person=person, org=org_name or "their firm", fund=fund,
                             dur=random.choice([20, 30, 45])),
                 _iso_days_ago(random.randint(1, 200)), random.choice([20, 30, 45, None]),
                 random.choice(["positive", "neutral", "needs follow-up"]),
                 random.choice(["Send deck", "Schedule call", "Send subdocs", None]),
                 uid, now()))


def _build_grid_rows(gen, overlap_specs):
    """Build the fundraising grid rows: overlap investors first, then grid-only ones."""
    rows = []

    # (a) overlap investors — same org as a classic investor, but a NAME-VARIANT contact
    for org_name, variant_name, match_email in overlap_specs:
        row = {"id": "row-" + gen(), "investor_name": org_name, "lead": random.choice(LEADS),
               "lead_source": random.choice(["Conference", "Warm intro", "X"]),
               "notes": f"[call] {variant_name}: discussed Fund III pacing and co-invest. Warm.\n"
                        f"[email] {variant_name}: sent the energy-thesis memo.",
               "priority": random.random() < 0.4, "follow_up": random.random() < 0.5, "graveyard": False,
               "contacts": [{"name": variant_name,
                             "email": match_email or f"{variant_name.split()[0].lower()}@{org_name.split()[0].lower()}.invalid",
                             "title": "Managing Partner"}]}
        for fc in random.sample(FUND_COLS, k=random.randint(1, 3)):
            row[fc] = random.choice(AMOUNTS)
        rows.append(row)

    # (b) grid-only investors (no classic counterpart) — exercise the create path
    for n in range(7):
        nm = f"{random.choice(['Slate', 'Copper', 'Ridgeline', 'Anchor', 'Falcon', 'Quarry', 'Beacon'])} " \
             f"{random.choice(['Capital', 'Partners', 'Holdings', 'Group'])}"
        row = {"id": "row-" + gen(), "investor_name": nm, "lead": random.choice(LEADS),
               "lead_source": random.choice(["Inbound", "Referral", "Podcast"]),
               "notes": f"[note] First touch with {nm}. Sourced via X. Gauging thesis fit.",
               "priority": False, "follow_up": random.random() < 0.6,
               "graveyard": n >= 5,  # a couple in the graveyard list
               "contacts": [{"name": f"{random.choice(FIRST)} {random.choice(LAST)}",
                             "email": f"contact{n}@{nm.split()[0].lower()}.invalid", "title": "Partner"}]}
        for fc in random.sample(FUND_COLS, k=random.randint(0, 2)):
            row[fc] = random.choice(AMOUNTS)
        rows.append(row)

    return rows


def _print_summary(conn, db, overlap_count):
    """Print per-table row counts for the freshly seeded database."""
    def count(t):
        # Table names are the fixed literals below, never user input.
        return conn.execute(f"SELECT COUNT(*) FROM {t}").fetchone()[0]

    print(f"\nSynthetic dev DB written to: {db}")
    print("  Classic model:")
    for t in ("organizations", "contacts", "opportunities", "communications", "lp_profiles"):
        print(f"    {t:<24} {count(t)}")
    print("  Fundraising grid (after real sync):")
    for t in ("fundraising_investors", "fundraising_contacts", "fundraising_funds",
              "fundraising_commitments", "fundraising_list_memberships"):
        print(f"    {t:<24} {count(t)}")
    print("  Phase-0 foundation tables (from migration, empty until entity resolution):")
    for t in ("canonical_entities", "entity_links", "interaction_log", "relationship_edges"):
        print(f"    {t:<24} {count(t)}")
    inv = count("contacts")  # note grid bridge may have created extra investor contacts (the variants)
    print(f"\n  Entity-resolution test bed: {overlap_count} investors intentionally appear in BOTH models "
          f"with name variants; total contacts now {inv} (grid bridge added the variant rows).")


def main():
    """Rebuild the synthetic dev database from scratch and print a summary.

    Parses ``--db`` (default ``<project>/data/crm_dev.db``), refuses to run
    against the live ``crm.db``, deletes any existing database at that path,
    then seeds the classic model and the fundraising grid using a fixed random
    seed so repeated runs draw the same random choices.

    Raises:
        SystemExit: if the target path is (or resolves to) the live crm.db.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=os.path.join(PROJECT_DIR, "data", "crm_dev.db"))
    args = ap.parse_args()
    db = os.path.abspath(args.db)
    if _is_live_db_path(db):
        sys.exit("Refusing to seed the real crm.db. Use a dev path like data/crm_dev.db.")

    # Make sure the target directory exists, then start from a clean slate.
    os.makedirs(os.path.dirname(db), exist_ok=True)
    for ext in _DB_FILE_SUFFIXES:
        if os.path.exists(db + ext):
            os.remove(db + ext)

    os.environ["CRM_DB_PATH"] = db
    import server  # noqa: E402  (must follow CRM_DB_PATH assignment)

    server.init_db()
    conn = server.get_db()
    try:
        random.seed(31)

        uid = _seed_dev_user(conn, server)
        org_ids = _seed_organizations(conn, server, uid)
        contacts, overlap_specs = _seed_contacts(conn, server, uid, org_ids)
        _seed_activity(conn, server, uid, org_ids, contacts)

        # ── fundraising grid (authoritative grid_json -> real sync path) ──
        columns = server.DEFAULT_FUNDRAISING_COLUMNS
        views = server.DEFAULT_GRID_VIEWS
        rows = _build_grid_rows(server.generate_id, overlap_specs)
        grid = {"columns": columns, "rows": rows}
        conn.execute(
            "INSERT INTO fundraising_state (id, grid_json, views_json, version, updated_by, created_at, updated_at) "
            "VALUES ('main', ?, ?, 1, ?, ?, ?)",
            (json.dumps(grid), json.dumps(views), uid, server.now(), server.now()))
        server.sync_fundraising_relational(conn, grid, views, actor_user_id=uid)
        conn.commit()

        _print_summary(conn, db, len(overlap_specs))
    finally:
        # Close the connection even when seeding fails part-way.
        conn.close()


# Seed the database only when run as a script, not when imported.
if __name__ == "__main__":
    main()