Phase 0 foundation: canonical schema, ingest pipeline, CRM MCP server
Workstream A–C substrate for the Ten31 agentic system: - A1: docs/crm-overview.md; CLAUDE.md conventions + guardrail #9 - A2: additive/reversible core migration (canonical_entities, entity_links, interaction_log, relationship_edges, soft-delete) + ledgered runner - B1/B3: chunking + deterministic entity resolution (backend/ingest) - B2: dense (bge-m3) + BM25 sparse ingest to Qdrant crm_chunks - C: CRM MCP server (reads, retrieval modes, logged writes) — no outbound tools - docs: redaction/re-hydration, Gmail enablement runbook - synthetic test data; .env.example; housekeeping (.gitignore, untrack crm.db, drop legacy files + start9/0.3.5) Verified end-to-end on synthetic data + live Sparks (hybrid > dense on entity queries). Real backfill runs on Ten31 infra; index holds synthetic data only. Branch snapshot also captures pre-existing working-tree changes. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,258 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Phase-0 Workstream B3 / A4 — entity resolution (deterministic tier).
|
||||
|
||||
Collapses the CRM's two parallel investor models into the canonical identity
|
||||
layer created by migration 0001:
|
||||
|
||||
organizations ─┐
|
||||
fundraising_investors ─┴─► canonical_entities (entity_kind = lp | organization)
|
||||
contacts ─┐
|
||||
fundraising_contacts ─┴─► canonical_entities (entity_kind = person)
|
||||
lp_profiles ───► linked to its contact's person entity
|
||||
|
||||
Every source row is recorded in `entity_links` so any name variant resolves to
|
||||
one canonical id. This is the DETERMINISTIC tier — it merges only what we can
|
||||
prove (exact email; exact normalized name within the same canonical org). The
|
||||
HARD cases (nicknames like "Jon" vs "Jonathan", typos) are NOT guessed; they are
|
||||
emitted as *fuzzy candidates* for the local-Qwen tier (Spark Control
|
||||
/v1/chat/completions) to adjudicate later. Honest separation: we never silently
|
||||
merge on a guess.
|
||||
|
||||
Properties:
|
||||
* Local-only, read-mostly: reads CRM source tables, writes only the derived
|
||||
canonical_entities / entity_links and an interaction_log audit row. Never
|
||||
mutates a CRM source record (guardrail #2/#3).
|
||||
* Idempotent: canonical ids are deterministic (sha1 of the resolution key), so
|
||||
re-running upserts in place and keeps ids stable across runs — which keeps
|
||||
downstream Qdrant point ids valid (no churn on re-embed).
|
||||
* Logged: writes one interaction_log row per run (guardrail #5).
|
||||
|
||||
Usage:
|
||||
python3 backend/ingest/entity_resolution.py --db data/crm_dev.db
|
||||
python3 backend/ingest/entity_resolution.py --db data/crm_dev.db --show-candidates
|
||||
"""
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
# ── normalization ─────────────────────────────────────────────────────────────
|
||||
|
||||
def norm_text(s: str) -> str:
    """Return a comparison-friendly form of *s*.

    The value is trimmed and lower-cased, every character that is neither a
    word character nor whitespace is replaced by a space, and runs of
    whitespace are collapsed to a single space. ``None`` or an empty string
    yields ``""``.
    """
    lowered = (s or "").strip().lower()
    # Punctuation becomes a space (not removed) so "O'Brien" -> "o brien".
    without_punct = re.sub(r"[^\w\s]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", without_punct)
    return collapsed.strip()
|
||||
|
||||
|
||||
def norm_email(s: str) -> str:
    """Return *s* trimmed and lower-cased; ``None`` or empty input gives ``""``."""
    if not s:
        return ""
    trimmed = s.strip()
    return trimmed.lower()
|
||||
|
||||
|
||||
def _eid(prefix: str, key: str) -> str:
|
||||
"""Deterministic canonical id: stable across runs for the same resolution key."""
|
||||
return f"{prefix}_{hashlib.sha1(key.encode('utf-8')).hexdigest()[:12]}"
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _split_name(full: str):
    """Split a full name into ``(first, last)`` using normalized tokens.

    The name is normalized with ``norm_text`` and split on whitespace. The
    first token is the first name and the final token is the last name.
    A single-token name has an empty last name; an empty name gives
    ``("", "")``. Middle tokens are ignored.
    """
    tokens = norm_text(full).split()
    if len(tokens) == 0:
        return "", ""
    first = tokens[0]
    last = tokens[-1] if len(tokens) > 1 else ""
    return first, last
|
||||
|
||||
|
||||
# ── upsert helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
def _upsert_entity(conn, eid, kind, display_name, primary_email):
    """Insert or refresh the ``canonical_entities`` row whose id is *eid*.

    On first insert the row is created with source ``'entity_resolution'``.
    When a row with the same id already exists, ``display_name``,
    ``entity_kind`` and ``updated_at`` are overwritten; ``primary_email`` is
    replaced only when the new value is non-NULL; ``created_at`` is left
    untouched.

    Args:
        conn: open DB connection used to execute the statement (not committed here).
        eid: canonical entity id.
        kind: value stored in ``entity_kind``.
        display_name: value stored in ``display_name``.
        primary_email: email to store; any falsy value is stored as NULL.
    """
    statement = """
        INSERT INTO canonical_entities (id, entity_kind, display_name, primary_email, source, created_at, updated_at)
        VALUES (?, ?, ?, ?, 'entity_resolution', ?, ?)
        ON CONFLICT(id) DO UPDATE SET
            display_name = excluded.display_name,
            primary_email = COALESCE(excluded.primary_email, canonical_entities.primary_email),
            entity_kind = excluded.entity_kind,
            updated_at = excluded.updated_at
    """
    # Empty string -> NULL so COALESCE keeps a previously stored email.
    email_or_null = primary_email if primary_email else None
    created_at = _now()
    updated_at = _now()
    conn.execute(statement, (eid, kind, display_name, email_or_null, created_at, updated_at))
|
||||
|
||||
|
||||
def _link(conn, canonical_id, source_model, source_id, match_value, match_kind, confidence):
    """Record that a source row resolves to *canonical_id* in ``entity_links``.

    A fresh random UUID is generated as the link id for new rows. When a row
    with the same ``(source_model, source_id, match_value)`` already exists,
    its ``canonical_id``, ``match_kind`` and ``confidence`` are overwritten;
    its id and ``created_at`` are kept.

    Args:
        conn: open DB connection used to execute the statement (not committed here).
        canonical_id: id of the canonical entity the source row maps to.
        source_model: name of the source table the row came from.
        source_id: id of the row within *source_model*.
        match_value: the value the match was made on (e.g. an email or normalized name).
        match_kind: label describing how the match was made.
        confidence: numeric confidence stored with the link.
    """
    statement = """
        INSERT INTO entity_links (id, canonical_id, source_model, source_id, match_value, match_kind, confidence, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(source_model, source_id, match_value) DO UPDATE SET
            canonical_id = excluded.canonical_id,
            match_kind = excluded.match_kind,
            confidence = excluded.confidence
    """
    link_id = str(uuid.uuid4())
    params = (
        link_id,
        canonical_id,
        source_model,
        source_id,
        match_value,
        match_kind,
        confidence,
        _now(),
    )
    conn.execute(statement, params)
|
||||
|
||||
|
||||
# ── resolution passes ─────────────────────────────────────────────────────────
|
||||
|
||||
def resolve_organizations(conn):
    """Merge ``organizations`` and ``fundraising_investors`` rows by normalized name.

    Rows whose normalized name is empty are skipped. Every remaining group
    of rows sharing a normalized name becomes one canonical entity, and each
    contributing source row is linked to it with match kind ``exact_name``
    and confidence 1.0.

    Args:
        conn: open DB connection whose rows support access by column name.

    Returns:
        ``(org_canon_by_orgid, org_canon_by_fundinv)``: two dicts mapping an
        ``organizations.id`` / ``fundraising_investors.id`` to the canonical
        id of its group, so the people pass can attach each person to their
        firm's canonical id.
    """
    buckets = {}

    def bucket_for(name_key):
        # One accumulator per normalized name, created on first use.
        return buckets.setdefault(
            name_key, {"orgs": [], "investors": [], "name": "", "email": ""}
        )

    for row in conn.execute("SELECT id, name, email FROM organizations"):
        name_key = norm_text(row["name"])
        if not name_key:
            continue
        bucket = bucket_for(name_key)
        bucket["orgs"].append(row["id"])
        # Display name: the longest raw spelling seen among organizations rows.
        if len(row["name"] or "") > len(bucket["name"]):
            bucket["name"] = row["name"]
        # Email: the first non-blank one wins.
        email = (row["email"] or "").strip()
        if email and not bucket["email"]:
            bucket["email"] = email

    for row in conn.execute("SELECT id, investor_name FROM fundraising_investors"):
        name_key = norm_text(row["investor_name"])
        if not name_key:
            continue
        bucket = bucket_for(name_key)
        bucket["investors"].append(row["id"])
        # The investor spelling is only a fallback when no name is set yet.
        if not bucket["name"]:
            bucket["name"] = row["investor_name"]

    org_canon_by_orgid = {}
    org_canon_by_fundinv = {}
    for name_key, bucket in buckets.items():
        # An org we are actively raising from (has a fundraising row) is an
        # 'lp'; otherwise a plain 'organization'.
        # NOTE(review): the id prefix follows the kind, so a group that gains
        # its first fundraising row on a later run gets a different canonical
        # id ("org_..." -> "lp_...") — confirm this is acceptable downstream.
        if bucket["investors"]:
            kind, prefix = "lp", "lp"
        else:
            kind, prefix = "organization", "org"
        cid = _eid(prefix, name_key)
        _upsert_entity(conn, cid, kind, bucket["name"], bucket["email"])

        for org_id in bucket["orgs"]:
            _link(conn, cid, "organizations", org_id, name_key, "exact_name", 1.0)
            org_canon_by_orgid[org_id] = cid
        for investor_id in bucket["investors"]:
            _link(conn, cid, "fundraising_investors", investor_id, name_key, "exact_name", 1.0)
            org_canon_by_fundinv[investor_id] = cid

    return org_canon_by_orgid, org_canon_by_fundinv
|
||||
|
||||
|
||||
def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv):
    """Merge ``contacts`` + ``fundraising_contacts`` into canonical person entities.

    Each source row is resolved with the first key that applies:

    1. normalized email            -> match kind ``exact_email``, confidence 1.0
    2. normalized name + canonical org id -> match kind ``name_org``, confidence 0.8

    Rows with neither an email nor a normalizable name are skipped. Every
    resolved row is upserted into ``canonical_entities`` and linked in
    ``entity_links``. Afterwards each ``lp_profiles`` row is linked to the
    person entity of its contact (match kind ``contact_fk``).

    Args:
        conn: open DB connection whose rows support access by column name.
        org_canon_by_orgid: ``organizations.id`` -> canonical org id.
        org_canon_by_fundinv: ``fundraising_investors.id`` -> canonical org id.

    Returns:
        dict mapping each person canonical id to
        ``{"org": canonical org id or None, "last": normalized surname,
        "name": display name, "email": normalized email}``; this is the input
        expected by ``find_fuzzy_candidates``.
    """
    # Gather (model, source_id, full_name, email, org_canon) before writing
    # anything, so no read cursor is open on the source tables during upserts.
    people = []
    for r in conn.execute("SELECT id, first_name, last_name, email, organization_id FROM contacts"):
        full = f"{r['first_name'] or ''} {r['last_name'] or ''}".strip()
        people.append(("contacts", r["id"], full, norm_email(r["email"]),
                       org_canon_by_orgid.get(r["organization_id"])))
    for r in conn.execute("SELECT id, full_name, email, investor_id FROM fundraising_contacts"):
        people.append(("fundraising_contacts", r["id"], r["full_name"] or "", norm_email(r["email"]),
                       org_canon_by_fundinv.get(r["investor_id"])))

    contact_to_person = {}  # contacts.id -> person canonical id (for lp_profiles)
    person_meta = {}        # canonical_id -> {"org", "last", "name", "email"}
    latest_name = {}        # canonical_id -> most recent real (non-email) display name

    for model, sid, full, email, org_canon in people:
        name_norm = norm_text(full)
        if email:
            key = f"e|{email}"
            match_kind, conf, match_value = "exact_email", 1.0, email
        elif name_norm:
            # NOTE(review): when org_canon is None the key is just the name,
            # so all email-less, org-less rows with the same name collapse
            # into one entity — confirm that is intended.
            key = f"n|{name_norm}|{org_canon or ''}"
            match_kind, conf, match_value = "name_org", 0.8, name_norm
        else:
            continue

        cid = _eid("per", key)
        full_clean = full.strip()
        if full_clean:
            latest_name[cid] = full_clean
        # A nameless duplicate must not overwrite a real display name with
        # the bare email: fall back to the last real name seen for this
        # entity, and only then to the email.
        display = full_clean or latest_name.get(cid) or email
        _upsert_entity(conn, cid, "person", display, email)
        _link(conn, cid, model, sid, match_value, match_kind, conf)
        if model == "contacts":
            contact_to_person[sid] = cid

        surname = _split_name(full)[1]
        meta = person_meta.get(cid)
        if meta is None:
            person_meta[cid] = {"org": org_canon, "last": surname,
                                "name": display, "email": email}
            continue
        # Backfill whatever the first-seen row was missing, so an entity
        # first seen without org/name still reaches the fuzzy-candidate pass.
        if org_canon and not meta["org"]:
            meta["org"] = org_canon
        if surname and not meta["last"]:
            meta["last"] = surname
        if full_clean and meta["name"] == meta["email"]:
            meta["name"] = full_clean

    # lp_profiles -> the person entity of its contact
    for r in conn.execute("SELECT id, contact_id FROM lp_profiles"):
        cid = contact_to_person.get(r["contact_id"])
        if cid:
            _link(conn, cid, "lp_profiles", r["id"], r["contact_id"], "contact_fk", 1.0)

    return person_meta
|
||||
|
||||
|
||||
def find_fuzzy_candidates(person_meta):
    """Group distinct person entities that share a canonical org AND surname.

    Such entities are possibly the same individual under a name variant
    (e.g. Jon/Jonathan). They are only reported for the local-Qwen tier to
    adjudicate; nothing is merged here. Entities lacking an org or a surname
    are ignored.

    Args:
        person_meta: dict of canonical id -> mapping with the keys
            ``"org"``, ``"last"``, ``"name"`` and ``"email"``.

    Returns:
        A list of ``{"org", "surname", "members"}`` dicts, one per
        (org, surname) pair that has more than one member; ``members`` is a
        list of ``(canonical_id, name, email)`` tuples.
    """
    grouped = {}
    for cid, meta in person_meta.items():
        if not meta["org"] or not meta["last"]:
            continue
        pair = (meta["org"], meta["last"])
        grouped.setdefault(pair, []).append((cid, meta["name"], meta["email"]))

    candidates = []
    for (org, surname), members in grouped.items():
        # A lone member has nobody to be confused with.
        if len(members) > 1:
            candidates.append({"org": org, "surname": surname, "members": members})
    return candidates
|
||||
|
||||
|
||||
def run(db_path: str):
    """Run the full deterministic resolution against the SQLite DB at *db_path*.

    Steps: resolve organizations (commit), resolve people (commit), compute
    fuzzy candidates, gather row counts, then write one ``interaction_log``
    audit row carrying those counts as JSON (commit).

    The connection is always closed, including when a pass raises; work that
    was not yet committed at that point is discarded with the connection.

    Args:
        db_path: path to the CRM SQLite database file.

    Returns:
        ``(counts, candidates)``: ``counts`` is a dict of summary numbers
        (entity totals per kind, link total, number of fuzzy candidates) and
        ``candidates`` is the list produced by ``find_fuzzy_candidates``.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys=ON")

        org_by_oid, org_by_inv = resolve_organizations(conn)
        conn.commit()
        person_meta = resolve_people(conn, org_by_oid, org_by_inv)
        conn.commit()
        candidates = find_fuzzy_candidates(person_meta)

        def scalar(sql):
            # First column of the first row of a COUNT(*) query.
            return conn.execute(sql).fetchone()[0]

        counts = {
            "canonical_total": scalar("SELECT COUNT(*) FROM canonical_entities"),
            "lp": scalar("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp'"),
            "organization": scalar("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization'"),
            "person": scalar("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person'"),
            "links": scalar("SELECT COUNT(*) FROM entity_links"),
            "fuzzy_candidates": len(candidates),
        }

        # One audit row per run.
        conn.execute(
            """
            INSERT INTO interaction_log (id, ts, actor_type, actor_id, action, target_type, payload, source, created_at)
            VALUES (?, ?, 'system', 'entity_resolver', 'entity_resolution.run', 'canonical_entities', ?, 'ingest', ?)
            """,
            (str(uuid.uuid4()), _now(), json.dumps(counts), _now()),
        )
        conn.commit()
    finally:
        # Release the file handle even if a pass fails part-way.
        conn.close()
    return counts, candidates
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: run resolution and print a summary.

    Accepts ``--db`` (path to the CRM SQLite DB, default ``data/crm_dev.db``)
    and ``--show-candidates`` (also print the fuzzy merge candidates).
    """
    parser = argparse.ArgumentParser(description="Deterministic entity resolution into the canonical layer.")
    parser.add_argument("--db", default="data/crm_dev.db", help="path to the CRM SQLite DB")
    parser.add_argument("--show-candidates", action="store_true", help="print fuzzy merge candidates")
    opts = parser.parse_args()

    counts, candidates = run(opts.db)
    print(f"Entity resolution on {opts.db}:")
    for label, value in counts.items():
        print(f" {label:<18} {value}")

    # Candidates are printed only on request, and only when there are any.
    if not (opts.show_candidates and candidates):
        return

    print("\nFuzzy candidates (same org + surname, different person — for the local-Qwen tier):")
    for cand in candidates:
        rendered = []
        for _, name, email in cand["members"]:
            suffix = (" <" + email + ">") if email else ""
            rendered.append(f"{name!r}{suffix}")
        names = ", ".join(rendered)
        print(f" [{cand['surname']}] {names}")
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user