Grid/contacts unification step 1: real contact_id link + grid as front door (v0.1.0:52)

Structural fix for the duplicate-people class of bug: instead of matching a grid
contact "pill" to a contacts row heuristically by name/email (which drifted and
caused the 1406 double-count), link them by id.

Backend:
- Migration 0004: fundraising_contacts.contact_id (additive, nullable, logical FK
  to contacts(id)) + index. Paired with a manual .down migration.
- sync_fundraising_relational now stores the id that _upsert_contact_from_fundraising
  already returns, so every grid contact carries its contacts-table id.
- _backfill_grid_contact_ids: one-time, idempotent backfill on startup (re-runs the
  grid sync once if any row lacks contact_id), so existing data links immediately.
- entity_resolution: grid pass prefers the explicit contact_id link (match_kind
  'grid_link') over heuristic email / name+investor, guarded by a PRAGMA check so
  older DBs without the column still work.

Frontend:
- Fundraising grid "+ Row" -> "+ Investor" (clear, single investor entry point).
- Contacts page: the "+ Add Contact" trigger is replaced by a pointer to the grid;
  the page is now a read/search/edit view (ContactDetailPanel still edits all
  fields). New people are added from the grid. No contact data is removed.

Tests: backend/ingest/test_entity_resolution.py extended (explicit-link case, 11/11)
and a new backend/test_grid_contact_link.py integration test (init_db applies 0004,
sync populates contact_id to the right contact, re-sync is idempotent). py_compile +
frontend html.parser clean.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Keysat
2026-06-05 15:10:26 -05:00
parent d16264f401
commit 2afed210cb
10 changed files with 203 additions and 41 deletions
+19 -17
View File
@@ -220,27 +220,29 @@ def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv, merge_map=Non
if cid:
contact_to_person[r["id"]] = cid
# 2. Grid contacts are associations, not new people: match to a contact-person
# (by email, else name within the same investor) and just add membership.
# Only create a person when there is genuinely no matching contact.
for r in conn.execute("SELECT id, full_name, email, investor_id FROM fundraising_contacts"):
# 2. Grid contacts are associations, not new people: link each to its
# contacts-table person and record membership. We prefer the EXPLICIT
# contact_id link (migration 0004 — the grid pill stores the id of the
# contact it was created from), and fall back to provable email / exact name
# within the same investor for rows not yet backfilled. On a miss we
# deliberately do NOT mint a person: the old else-branch mint is exactly what
# produced the people double-count, and guessing by name across firms risks
# binding two different same-named people — honest separation, never on a guess.
fc_cols = {row[1] for row in conn.execute("PRAGMA table_info(fundraising_contacts)")}
has_contact_id = "contact_id" in fc_cols
sel = ("SELECT id, full_name, email, investor_id" +
(", contact_id" if has_contact_id else "") + " FROM fundraising_contacts")
for r in conn.execute(sel):
email = norm_email(r["email"])
name_norm = norm_text(r["full_name"] or "")
inv_canon = org_canon_by_fundinv.get(r["investor_id"])
# Match the grid contact to its contacts-table person by PROVABLE keys only:
# exact email, else exact name within the SAME canonical investor. The app
# keeps the grid and the contacts table in sync (_upsert_contact_from_
# fundraising), so a grid contact IS an existing contact-person, never a new
# one. On a confident match, record the membership. On a miss we deliberately
# do NOT mint a person: the old else-branch mint is exactly what produced the
# people double-count (a grid row whose (name, investor) key didn't line up
# with its contact minted a duplicate), and guessing by name across firms
# risks binding two different same-named people. Unresolved grid rows are
# left for the explicit contact_id link planned in the grid/contacts
# unification — honest separation: never merge or mint on a guess.
cid = (by_email.get(email) if email else None) or by_name_inv.get((name_norm, inv_canon or ""))
link_cid = r["contact_id"] if has_contact_id else None
cid = (contact_to_person.get(link_cid) if link_cid else None) \
or (by_email.get(email) if email else None) \
or by_name_inv.get((name_norm, inv_canon or ""))
if cid:
_link(conn, cid, "fundraising_contacts", r["id"], email or name_norm, "grid_assoc", 0.9)
mk = "grid_link" if (link_cid and contact_to_person.get(link_cid)) else "grid_assoc"
_link(conn, cid, "fundraising_contacts", r["id"], email or name_norm, mk, 0.95 if mk == "grid_link" else 0.9)
_member_of(conn, cid, inv_canon)
# lp_profiles -> the person entity of its contact
+14 -10
View File
@@ -57,7 +57,7 @@ CREATE TABLE contacts (
);
CREATE TABLE organizations (id TEXT PRIMARY KEY, name TEXT, email TEXT);
CREATE TABLE fundraising_investors (id TEXT PRIMARY KEY, investor_name TEXT);
CREATE TABLE fundraising_contacts (id TEXT PRIMARY KEY, full_name TEXT, email TEXT, investor_id TEXT);
CREATE TABLE fundraising_contacts (id TEXT PRIMARY KEY, full_name TEXT, email TEXT, investor_id TEXT, contact_id TEXT);
CREATE TABLE lp_profiles (id TEXT PRIMARY KEY, contact_id TEXT, deleted_at TEXT);
"""
@@ -72,15 +72,17 @@ def seed(db):
("c1", "Alice", "Anderson", "alice@x.com", None), # email, no org
("c2", "Bob", "Brown", None, None), # no email, no org
("c3", "Dave", "Davis", None, "o1"), # no email, org = Acme
("c4", "Frank", "Foster", "frank@x.com", None), # target of an explicit id link
])
c.executemany("INSERT INTO fundraising_investors (id, investor_name) VALUES (?,?)", [
("i_acme", "Acme Capital"), ("i_beta", "Beta Family Office"),
])
c.executemany("INSERT INTO fundraising_contacts (id, full_name, email, investor_id) VALUES (?,?,?,?)", [
("g_alice", "Alice Anderson", "alice@x.com", "i_beta"), # -> email match to c1
("g_dave", "Dave Davis", None, "i_acme"), # -> name+investor match to c3
("g_bob", "Bob Brown", None, "i_beta"), # -> MISS (c2 has no org) -> mint NOTHING
("g_carol", "Carol Clark", None, "i_beta"), # -> MISS (no contact) -> mint NOTHING
c.executemany("INSERT INTO fundraising_contacts (id, full_name, email, investor_id, contact_id) VALUES (?,?,?,?,?)", [
("g_alice", "Alice Anderson", "alice@x.com", "i_beta", None), # -> email match to c1
("g_dave", "Dave Davis", None, "i_acme", None), # -> name+investor match to c3
("g_bob", "Bob Brown", None, "i_beta", None), # -> MISS (c2 has no org) -> mint NOTHING
("g_carol", "Carol Clark", None, "i_beta", None), # -> MISS (no contact) -> mint NOTHING
("g_link", "Totally Mismatched", None, "i_beta", "c4"), # -> explicit contact_id link wins over name/inv
])
# Stale grid-only "twin" (person, only a fundraising_contacts link, no enrichment) -> prune
c.execute("INSERT INTO canonical_entities (id, entity_kind, display_name, source) VALUES "
@@ -150,13 +152,15 @@ def main():
counts1, _ = er.run(db)
print(f"Run 1 counts: {counts1}")
# 3 contacts; grid rows either link back (g_alice, g_dave) or are skipped
# (g_bob, g_carol). NO grid row mints a person -> count stays 3, not 5-7.
check(resolved_persons(db) == 3, f"resolved persons == 3 (got {resolved_persons(db)}); old double-count would be 5-7")
# 4 contacts; grid rows either link back (g_alice email, g_dave name+inv,
# g_link explicit id) or are skipped (g_bob, g_carol). NO grid row mints a
# person -> count stays 4, not 7-9.
check(resolved_persons(db) == 4, f"resolved persons == 4 (got {resolved_persons(db)}); old double-count would be 7-9")
check(minted_from_grid(db) == 0, f"zero persons minted from grid rows (got {minted_from_grid(db)})")
mk = grid_match_kinds(db)
check(mk.get("grid_assoc", 0) == 2, f"two grid contacts matched back via grid_assoc (got {mk.get('grid_assoc',0)})")
check(mk.get("grid_link", 0) == 1, f"one grid contact linked via explicit contact_id (grid_link==1, got {mk.get('grid_link',0)})")
# Targeted cleanup: stale grid-only twin + superseded 'lp' row tombstoned...
check(deleted_at(db, "per_TWIN") is not None, "stale grid-only twin 'per_TWIN' tombstoned")
@@ -174,7 +178,7 @@ def main():
counts2, _ = er.run(db)
print(f"Run 2 counts: {counts2}")
check(deleted_at(db, alice) is None, "re-emitted contact-person is UN-tombstoned (no permanent burial)")
check(resolved_persons(db) == 3, f"resolved persons stable at 3 on re-run (got {resolved_persons(db)})")
check(resolved_persons(db) == 4, f"resolved persons stable at 4 on re-run (got {resolved_persons(db)})")
check(counts2.get("pruned_stale", 0) == 0, f"nothing re-pruned on idempotent re-run (got {counts2.get('pruned_stale')})")
print()
@@ -0,0 +1,7 @@
-- Reversal of 0004_grid_contact_link.sql (manual; .down files are never auto-applied).
--
-- SQLite < 3.35 cannot DROP COLUMN. The added column is nullable and ignored by
-- any code path predating it, so leaving it in place is harmless. The index can be
-- dropped freely. On SQLite >= 3.35 the column itself may also be dropped.
--
-- Statement order matters if the column is dropped as well: SQLite rejects
-- DROP COLUMN while the column is still referenced by an index, so the index
-- has to be removed first.
DROP INDEX IF EXISTS idx_fundraising_contacts_contact;
-- ALTER TABLE fundraising_contacts DROP COLUMN contact_id; -- SQLite >= 3.35 only
@@ -0,0 +1,17 @@
-- Grid/contacts unification — explicit link from a fundraising-grid contact
-- "pill" to its row in the contacts table.
--
-- ADDITIVE + REVERSIBLE (CLAUDE.md guardrail #3): adds one nullable column.
-- Until now a grid contact was tied to its contact only by name/email matching,
-- which drifted and produced the people double-count. fundraising_contacts.contact_id
-- records the real link (populated by sync_fundraising_relational, which already
-- upserts the contact and now stores its id). entity_resolution prefers this link
-- over heuristic matching.
--
-- contact_id is a LOGICAL foreign key to contacts(id). It is intentionally NOT a
-- declared SQLite FOREIGN KEY: contacts are soft-deleted (never hard-deleted), so
-- there is nothing to cascade, and SQLite's ALTER TABLE ADD COLUMN cannot add an
-- enforced FK cleanly. Nullable so existing rows are valid until backfilled.
--
-- SQLite has no "ADD COLUMN IF NOT EXISTS": re-running the ALTER below fails with
-- a duplicate-column error, whereas the CREATE INDEX is safe to repeat.
-- NOTE(review): assumes the migration runner applies each file exactly once — confirm.
ALTER TABLE fundraising_contacts ADD COLUMN contact_id TEXT;
CREATE INDEX IF NOT EXISTS idx_fundraising_contacts_contact ON fundraising_contacts(contact_id);
+41 -8
View File
@@ -462,6 +462,13 @@ def init_db():
except Exception as _e:
print(f"[migrations] core migration warning: {_e}")
# One-time: populate the new fundraising_contacts.contact_id (migration 0004)
# by re-running the grid→relational sync. No-op once every row is linked.
try:
_backfill_grid_contact_ids(conn)
except Exception as _e:
print(f"[backfill] grid contact_id backfill warning: {_e}")
conn.close()
print(f"Database initialized at {DB_PATH}")
@@ -977,6 +984,32 @@ def _sync_contact_to_fundraising_state(conn, contact_row, actor_user_id=None, re
""", (json.dumps(grid), json.dumps(next_views), next_version, actor_user_id, now()))
sync_fundraising_relational(conn, grid, next_views, actor_user_id=actor_user_id)
def _backfill_grid_contact_ids(conn):
    """Backfill fundraising_contacts.contact_id (migration 0004) with one grid sync.

    The stored grid (fundraising_state row 'main') is pushed through
    sync_fundraising_relational again, but only when the contact_id column
    exists AND at least one fundraising_contacts row still has a NULL
    contact_id. When every row is already linked this is a no-op.

    Args:
        conn: open sqlite3 connection. It is committed here after a
            successful sync; nothing is committed on the early-return paths.

    Returns:
        None.

    NOTE(review): the "runs once" property depends on the sync giving every
    grid contact a non-NULL contact_id. If any row is left NULL after the
    sync, the guard below stays true and the sync repeats on each call —
    confirm _upsert_contact_from_fundraising always returns an id.
    """
    try:
        pending = conn.execute(
            "SELECT 1 FROM fundraising_contacts WHERE contact_id IS NULL LIMIT 1"
        ).fetchone()
    except sqlite3.OperationalError:
        # The query cannot run, e.g. the contact_id column is not present
        # because migration 0004 has not been applied.
        return
    if not pending:
        return

    state = conn.execute(
        "SELECT grid_json, views_json FROM fundraising_state WHERE id = 'main'"
    ).fetchone()
    if not state or not state[0]:
        return  # no stored grid to sync from

    try:
        grid = json.loads(state[0])
        views = json.loads(state[1]) if state[1] else []
    except (TypeError, ValueError) as exc:
        # json.JSONDecodeError is a ValueError. Still best-effort (no raise),
        # but say why the backfill was skipped instead of failing silently.
        print(f"[backfill] skipped: stored grid/views JSON is unreadable: {exc}")
        return

    sync_fundraising_relational(conn, sanitize_fundraising_grid(grid), views)
    conn.commit()
    print("[backfill] populated fundraising_contacts.contact_id from grid sync")
def sync_fundraising_relational(conn, grid, views, actor_user_id=None):
columns = grid.get('columns', []) if isinstance(grid, dict) else []
rows = grid.get('rows', []) if isinstance(grid, dict) else []
@@ -1084,23 +1117,23 @@ def sync_fundraising_relational(conn, grid, views, actor_user_id=None):
contact_payload = dict(c)
if lead_source and not str(contact_payload.get('source') or '').strip():
contact_payload['source'] = lead_source
_upsert_contact_from_fundraising(conn, investor_name, contact_payload, actor_user_id=actor_user_id)
linked_contact_id = _upsert_contact_from_fundraising(conn, investor_name, contact_payload, actor_user_id=actor_user_id)
conn.execute("""
INSERT INTO fundraising_contacts (
id, investor_id, full_name, email, title, city, state, country, location_query, sort_order, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
id, investor_id, full_name, email, title, city, state, country, location_query, sort_order, contact_id, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
generate_id(), investor_id, full_name, email, str(c.get('title') or ''),
str(c.get('city') or ''), str(c.get('state') or ''), str(c.get('country') or ''),
str(c.get('location_query') or ''), i, now()
str(c.get('location_query') or ''), i, linked_contact_id, now()
))
elif isinstance(contacts, str) and contacts.strip():
_upsert_contact_from_fundraising(conn, investor_name, {"name": contacts.strip(), "email": "", "title": "", "source": lead_source}, actor_user_id=actor_user_id)
linked_contact_id = _upsert_contact_from_fundraising(conn, investor_name, {"name": contacts.strip(), "email": "", "title": "", "source": lead_source}, actor_user_id=actor_user_id)
conn.execute("""
INSERT INTO fundraising_contacts (
id, investor_id, full_name, email, title, city, state, country, location_query, sort_order, updated_at
) VALUES (?, ?, ?, '', '', '', '', '', '', 0, ?)
""", (generate_id(), investor_id, contacts.strip(), now()))
id, investor_id, full_name, email, title, city, state, country, location_query, sort_order, contact_id, updated_at
) VALUES (?, ?, ?, '', '', '', '', '', '', 0, ?, ?)
""", (generate_id(), investor_id, contacts.strip(), linked_contact_id, now()))
conn.execute("DELETE FROM fundraising_commitments WHERE investor_id = ?", (investor_id,))
for _, col in fund_columns:
+76
View File
@@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""Integration test for the grid→contact id-link wiring (migration 0004 + sync).
Imports the real server module against a throwaway DB, runs init_db (which applies
the 0004 migration adding fundraising_contacts.contact_id), then drives a grid
sync and asserts the grid contact is linked to the contacts-table row the app
created for it. Verifies the end-to-end backend wiring the entity resolver relies on.
Run: cd backend && python3 test_grid_contact_link.py
"""
import os
import sqlite3
import sys
import tempfile
# Point the app at a throwaway data dir / DB file before it is imported.
# NOTE(review): presumably `server` reads CRM_DATA_DIR / CRM_DB_PATH at import
# time, which is why these are set first — confirm against server.py.
# NOTE(review): nothing in this file removes the mkdtemp directory afterwards.
_tmp = tempfile.mkdtemp()
os.environ["CRM_DATA_DIR"] = _tmp
os.environ["CRM_DB_PATH"] = os.path.join(_tmp, "crm.db")
# Make this file's own directory importable so `import server` resolves when
# the script is launched from elsewhere.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import server # noqa: E402
# Messages of every failed check, in the order they failed.
FAILS = []


def check(cond, msg):
    """Print a PASS/FAIL line for msg; record msg in FAILS when cond is falsy."""
    if cond:
        print(" PASS " + msg)
        return
    print(" FAIL " + msg)
    FAILS.append(msg)
def main():
    """Run the grid->contact id-link integration checks and report the result.

    Steps: run server.init_db() on the throwaway DB, confirm that
    fundraising_contacts has a contact_id column, sync a one-row grid, confirm
    the grid contact's contact_id points at the contacts row with the same
    email, then sync again and confirm the contact was not duplicated.

    Exits with status 1 when any check failed; otherwise prints the
    all-pass line.
    """
    server.init_db()
    conn = sqlite3.connect(server.DB_PATH)
    conn.row_factory = sqlite3.Row
    # try/finally so the connection is released even if a sync or query raises.
    try:
        cols = {r[1] for r in conn.execute("PRAGMA table_info(fundraising_contacts)")}
        check("contact_id" in cols, "migration 0004 added fundraising_contacts.contact_id")

        grid = {
            "columns": [
                {"id": "investor_name", "label": "Investor Name", "type": "text"},
                {"id": "contacts", "label": "Contacts", "type": "contacts"},
            ],
            "rows": [
                {"id": "row-test-1", "investor_name": "Testco Capital",
                 "contacts": [{"name": "Jane Doe", "email": "jane@testco.com", "title": "Partner"}]},
            ],
        }
        server.sync_fundraising_relational(conn, server.sanitize_fundraising_grid(grid), [])
        conn.commit()

        fc = conn.execute("SELECT full_name, contact_id FROM fundraising_contacts WHERE full_name='Jane Doe'").fetchone()
        check(bool(fc and fc["contact_id"]), f"grid contact_id populated by sync (got {dict(fc) if fc else None})")
        if fc and fc["contact_id"]:
            ct = conn.execute("SELECT id, email FROM contacts WHERE id=?", (fc["contact_id"],)).fetchone()
            check(bool(ct and ct["email"] == "jane@testco.com"),
                  f"link points to the correct contacts row (got {dict(ct) if ct else None})")

        # Re-sync is idempotent: still exactly one linked contact for Jane.
        server.sync_fundraising_relational(conn, server.sanitize_fundraising_grid(grid), [])
        conn.commit()
        n = conn.execute("SELECT COUNT(*) FROM contacts WHERE lower(email)='jane@testco.com'").fetchone()[0]
        check(n == 1, f"re-sync does not duplicate the contact (got {n})")
    finally:
        conn.close()

    print()
    if FAILS:
        print(f"FAILED ({len(FAILS)})")
        sys.exit(1)
    print("ALL PASS (grid contact_id link wiring)")


if __name__ == "__main__":
    main()