Phase 0 foundation: canonical schema, ingest pipeline, CRM MCP server

Workstream A–C substrate for the Ten31 agentic system:

- A1: docs/crm-overview.md; CLAUDE.md conventions + guardrail #9
- A2: additive/reversible core migration (canonical_entities, entity_links, interaction_log, relationship_edges, soft-delete) + ledgered runner
- B1/B3: chunking + deterministic entity resolution (backend/ingest)
- B2: dense (bge-m3) + BM25 sparse ingest to Qdrant crm_chunks
- C: CRM MCP server (reads, retrieval modes, logged writes) — no outbound tools
- docs: redaction/re-hydration, Gmail enablement runbook
- synthetic test data; .env.example; housekeeping (.gitignore, untrack crm.db, drop legacy files + start9/0.3.5)

Verified end-to-end on synthetic data + live Sparks (hybrid > dense on entity queries).
Real backfill runs on Ten31 infra; index holds synthetic data only.
Branch snapshot also captures pre-existing working-tree changes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 08:11:28 -05:00
parent 7027efd777
commit c7ce44d963
99 changed files with 10676 additions and 7817 deletions
@@ -0,0 +1,184 @@
+"""Phase-0 Workstream B1 — chunk the CRM for retrieval.
+
+Maps each CRM record type to one or more chunks per docs/EMBEDDINGS.md:
+  * one chunk per communications row          (doc_type = the comm type)
+  * one chunk per MATCHED email               (doc_type = email; body only when matched)
+  * one chunk per fundraising_investors notes LINE (the outreach log; split per line)
+  * one chunk each for free-text fields: contacts.notes, lp_profiles.notes,
+    opportunities (name + description + next_step), organizations.description
+
+Each chunk carries a canonical `lp_id` (resolved via entity_links) and a `date_ts`
+(epoch of the EVENT time, not created_at) so Qdrant can pre-filter and recency-rank.
+Entities/names/dates/types are payload (filterable); only prose is embedded.
+
+A chunk's stable `chunk_key` -> deterministic point id (uuid5), so re-ingest
+upserts in place (idempotent).
+"""
+import sqlite3
+import uuid
+from datetime import datetime, timezone
+
+# Namespace fed to uuid5 when deriving point ids from chunk keys. The value is
+# the RFC 4122 URL namespace (same UUID as uuid.NAMESPACE_URL). Every point id
+# is computed from it, so changing it changes the id derived for every chunk_key.
+_NS = uuid.UUID("6ba7b811-9dad-11d1-80b4-00c04fd430c8")  # uuid5 namespace for chunk ids
+
+
+def to_epoch(ts: str):
+    """Convert a CRM timestamp string to integer epoch seconds.
+
+    The stripped text is parsed with ``datetime.fromisoformat`` after every
+    "Z" is rewritten to "+00:00". If that fails, the first ten characters of
+    the stripped text are tried as a plain ``YYYY-MM-DD`` date. Values without
+    an offset are treated as UTC.
+
+    Args:
+        ts: timestamp text; may be None or empty.
+
+    Returns:
+        Epoch seconds as an int, or None when ``ts`` is empty or cannot be
+        parsed by either route.
+    """
+    if not ts:
+        return None
+    s = ts.strip().replace("Z", "+00:00")
+    try:
+        dt = datetime.fromisoformat(s)
+    except ValueError:
+        # Date-only fallback. Slice the stripped text (not the raw input) so
+        # leading whitespace cannot push the date out of the 10-char window.
+        try:
+            dt = datetime.strptime(s[:10], "%Y-%m-%d")
+        except ValueError:
+            return None
+    if dt.tzinfo is None:
+        dt = dt.replace(tzinfo=timezone.utc)
+    return int(dt.timestamp())
+
+
+def _point_id(chunk_key: str) -> str:
+    """Return the uuid5 of `chunk_key` under `_NS`, rendered as a string."""
+    derived = uuid.uuid5(_NS, chunk_key)
+    return str(derived)
+
+
+def _mk(chunk_key, lp_id, lp_name, person_id, doc_type, date_ts, text, source_model, source_id):
+    """Build one chunk dict, or return None when it should not be indexed.
+
+    A candidate is rejected when its text is empty after stripping or when no
+    `lp_id` was resolved. Otherwise the dict carries the stripped text, the
+    point id derived from `chunk_key`, and the remaining arguments unchanged.
+    """
+    body = (text or "").strip()
+    if not (body and lp_id):
+        return None
+    chunk = dict(
+        chunk_key=chunk_key,
+        point_id=_point_id(chunk_key),
+        lp_id=lp_id,
+        lp_name=lp_name,
+        person_id=person_id,
+        doc_type=doc_type,
+        date_ts=date_ts,
+        text=body,
+        source_model=source_model,
+        source_id=source_id,
+    )
+    return chunk
+
+
+def _canon_maps(conn):
+    """Load the lookup dicts used to resolve source rows to canonical entities.
+
+    Returns a 5-tuple:
+      * contacts source_id -> canonical_id
+      * organizations source_id -> canonical_id
+      * fundraising_investors source_id -> canonical_id
+      * canonical_entities id -> display_name
+      * contacts id -> organization_id
+    entity_links rows for any other source_model are ignored.
+    """
+    by_model = {"contacts": {}, "organizations": {}, "fundraising_investors": {}}
+    links = conn.execute("SELECT source_model, source_id, canonical_id FROM entity_links")
+    for link in links:
+        bucket = by_model.get(link["source_model"])
+        if bucket is not None:
+            bucket[link["source_id"]] = link["canonical_id"]
+
+    name = {}
+    for entity in conn.execute("SELECT id, display_name FROM canonical_entities"):
+        name[entity["id"]] = entity["display_name"]
+
+    contact_org = {}
+    for contact in conn.execute("SELECT id, organization_id FROM contacts"):
+        contact_org[contact["id"]] = contact["organization_id"]
+
+    return (by_model["contacts"], by_model["organizations"],
+            by_model["fundraising_investors"], name, contact_org)
+
+
+def _contact_lp(cid, person_canon, org_canon, name, contact_org):
+    """Pick the lp for a contact-anchored chunk.
+
+    Prefers the canonical id of the contact's organization and falls back to
+    the contact's own canonical id. Returns (lp_id, lp display name, person id).
+    """
+    person = person_canon.get(cid)
+    org_id = contact_org.get(cid)
+    firm = org_canon.get(org_id)
+    lp = firm if firm else person
+    return lp, name.get(lp), person
+
+
+def build_chunks(conn):
+    """Assemble every retrieval chunk from the CRM database.
+
+    Source tables are walked in a fixed order: communications, contacts.notes,
+    lp_profiles.notes, opportunities, organizations.description,
+    fundraising_investors.notes (one candidate per non-blank line), then matched
+    emails when both the emails and email_investor_links tables exist. Each
+    record is handed to `_mk`; candidates it rejects (None) are left out.
+
+    Rows are read by column name, so `conn` needs a name-indexable row factory.
+    """
+    person_canon, org_canon, inv_canon, name, contact_org = _canon_maps(conn)
+    built = []
+
+    def anchor(contact_id):
+        # (lp_id, lp_name, person_id) for a record that hangs off a contact
+        return _contact_lp(contact_id, person_canon, org_canon, name, contact_org)
+
+    def prose(*fields):
+        # newline-join only the fields that hold non-blank text
+        return "\n".join([f for f in fields if (f or "").strip()])
+
+    # communications — doc_type is the comm type, "note" when the type is empty
+    for rec in conn.execute("""SELECT id, contact_id, type, subject, body, outcome, next_action, communication_date
+                             FROM communications"""):
+        owner, owner_name, person = anchor(rec["contact_id"])
+        text = prose(rec["subject"], rec["body"], rec["outcome"], rec["next_action"])
+        built.append(_mk(
+            f"communications:{rec['id']}", owner, owner_name, person,
+            rec["type"] or "note", to_epoch(rec["communication_date"]),
+            text, "communications", rec["id"],
+        ))
+
+    # contacts.notes
+    for rec in conn.execute("SELECT id, notes, updated_at FROM contacts WHERE notes IS NOT NULL AND notes <> ''"):
+        owner, owner_name, person = anchor(rec["id"])
+        built.append(_mk(
+            f"contacts.notes:{rec['id']}", owner, owner_name, person,
+            "contact_note", to_epoch(rec["updated_at"]), rec["notes"], "contacts", rec["id"],
+        ))
+
+    # lp_profiles.notes
+    for rec in conn.execute("""SELECT lp.id, lp.contact_id, lp.notes, lp.updated_at
+                             FROM lp_profiles lp WHERE lp.notes IS NOT NULL AND lp.notes <> ''"""):
+        owner, owner_name, person = anchor(rec["contact_id"])
+        built.append(_mk(
+            f"lp_profiles.notes:{rec['id']}", owner, owner_name, person,
+            "lp_note", to_epoch(rec["updated_at"]), rec["notes"], "lp_profiles", rec["id"],
+        ))
+
+    # opportunities — name, description and next_step are embedded together
+    for rec in conn.execute("""SELECT id, contact_id, name, description, next_step, updated_at
+                             FROM opportunities"""):
+        owner, owner_name, person = anchor(rec["contact_id"])
+        text = prose(rec["name"], rec["description"], rec["next_step"])
+        built.append(_mk(
+            f"opportunities:{rec['id']}", owner, owner_name, person,
+            "opportunity", to_epoch(rec["updated_at"]), text, "opportunities", rec["id"],
+        ))
+
+    # organizations.description — keyed straight to the organization's canonical id
+    for rec in conn.execute("""SELECT id, description, updated_at FROM organizations
+                             WHERE description IS NOT NULL AND description <> ''"""):
+        owner = org_canon.get(rec["id"])
+        built.append(_mk(
+            f"organizations.description:{rec['id']}", owner, name.get(owner), None,
+            "org_note", to_epoch(rec["updated_at"]), rec["description"], "organizations", rec["id"],
+        ))
+
+    # fundraising_investors.notes — running outreach log, one chunk per non-blank line;
+    # the chunk key uses the line's index within the full notes text (blank lines counted)
+    for rec in conn.execute("""SELECT id, notes, updated_at FROM fundraising_investors
+                             WHERE notes IS NOT NULL AND notes <> ''"""):
+        owner = inv_canon.get(rec["id"])
+        for idx, entry in enumerate(str(rec["notes"]).splitlines()):
+            if not entry.strip():
+                continue
+            built.append(_mk(
+                f"fundraising_investors.notes:{rec['id']}:{idx}", owner, name.get(owner), None,
+                "outreach_note", to_epoch(rec["updated_at"]), entry, "fundraising_investors", rec["id"],
+            ))
+
+    # MATCHED emails — lp comes from email_investor_links; body_text wins over snippet
+    if _has_table(conn, "emails") and _has_table(conn, "email_investor_links"):
+        for rec in conn.execute("""SELECT id, subject, body_text, snippet, sent_at FROM emails WHERE is_matched=1"""):
+            owner, owner_name = _email_lp(conn, rec["id"], inv_canon, org_canon, person_canon, name)
+            text = prose(rec["subject"], rec["body_text"] or rec["snippet"])
+            built.append(_mk(
+                f"emails:{rec['id']}", owner, owner_name, None, "email",
+                to_epoch(rec["sent_at"]), text, "emails", rec["id"],
+            ))
+
+    # drop the candidates _mk rejected
+    return list(filter(None, built))
+
+
+def _has_table(conn, name):
+    """Report whether sqlite_master lists a table called `name`."""
+    hit = conn.execute(
+        "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?", (name,)
+    ).fetchone()
+    return hit is not None
+
+
+def _email_lp(conn, email_id, inv_canon, org_canon, person_canon, name):
+    """Resolve the lp for a matched email from its highest-confidence link row.
+
+    Only the single email_investor_links row with the greatest match_confidence
+    is consulted. Its ids are tried in the order fundraising_investor ->
+    contact -> organization, and the first one that resolves wins. Returns
+    (lp_id, display name), or (None, None) when the email has no link row.
+    """
+    link = conn.execute("""SELECT fundraising_investor_id, contact_id, organization_id
+                          FROM email_investor_links WHERE email_id=? ORDER BY match_confidence DESC LIMIT 1""",
+                       (email_id,)).fetchone()
+    if not link:
+        return None, None
+    precedence = (
+        (inv_canon, "fundraising_investor_id"),
+        (person_canon, "contact_id"),
+        (org_canon, "organization_id"),
+    )
+    lp = None
+    for lookup, column in precedence:
+        lp = lookup.get(link[column])
+        if lp:
+            break
+    return lp, name.get(lp)
+
+
+if __name__ == "__main__":
+    # CLI smoke check: build chunks from a CRM SQLite file and print a summary
+    # (total, per-doc_type counts, and one truncated sample chunk).
+    import argparse
+    from collections import Counter
+    from contextlib import closing
+    from config import DEFAULT_DB
+
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--db", default=DEFAULT_DB)
+    args = ap.parse_args()
+    # closing() releases the database handle even when chunk building raises.
+    with closing(sqlite3.connect(args.db)) as conn:
+        conn.row_factory = sqlite3.Row  # build_chunks reads columns by name
+        chunks = build_chunks(conn)
+    print(f"{len(chunks)} chunks from {args.db}")
+    for dt, n in Counter(c["doc_type"] for c in chunks).most_common():
+        print(f"  {dt:<16} {n}")
+    # _mk already drops candidates without an lp_id, so this only re-checks
+    # that invariant and is expected to print True.
+    unresolved = sum(1 for c in chunks if not c["lp_id"])
+    print(f"  (all chunks have an lp_id: {unresolved == 0})")
+    # An empty or fully unresolved database yields no chunks; skip the sample
+    # instead of failing on chunks[0].
+    if chunks:
+        print("\nSample chunk:")
+        s = chunks[0]
+        print({k: (v[:80] + '…' if k == 'text' and v and len(v) > 80 else v) for k, v in s.items()})