Fix people double-count + duplicate-queue explosion (v0.1.0:51)

Root cause: grid contacts (fundraising_contacts) are the SAME people as the
contacts table (the app syncs them by name/email), but resolution matched grid
rows by (name + investor-canon) where the two sides derive the investor key from
different tables that rarely line up — so nearly every grid contact minted a
duplicate person (715 + ~692 ≈ 1406), and the duplicate finder then flagged each
twin against its real self (~676 candidates).

Fix (entity_resolution.py):
- Grid pass matches a grid contact to its existing contacts-table person by
  PROVABLE keys only (exact email, else exact name within the same investor) and
  records membership; on a miss it MINTS NOTHING (the old else-branch mint was the
  double-count source, and guessing by name across firms risks binding two
  different same-named people).
- Targeted, audited cleanup soft-deletes leftover grid-only "twins" (person rows
  with no 'contacts' link) and superseded pre-:48 'lp'/'organization' rows, guarded
  so any row carrying enrichment/human data is never dropped (guardrail #3); the
  tombstoned ids are logged to interaction_log (guardrail #5).
- _upsert_entity clears deleted_at on conflict so a re-emitted id is un-tombstoned
  (no permanent burial); fuzzy-merge losers stay buried via _redirect.

entity_merge.py / server.py: the duplicate queue + pending count now filter to
candidates whose two sides are both still live, so self-healed twins drop out.

Verified: offline reproduction test (backend/ingest/test_entity_resolution.py,
10/10) reproduces the 1406-style doubling and proves it collapses; no regression
on the synthetic dev set; two adversarial review passes. Known pre-existing
identity-key weaknesses (same name+firm+no email collision; shared role inbox
over-link) are unchanged by this fix and will be resolved structurally by the
contact_id link in the grid/contacts unification.

Run "Build search index" after upgrading to recompute the canonical layer.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Keysat
2026-06-05 14:49:39 -05:00
parent 3d9caac178
commit d16264f401
7 changed files with 293 additions and 11 deletions
+59 -5
View File
@@ -90,6 +90,12 @@ def _upsert_entity(conn, eid, kind, display_name, primary_email):
display_name = excluded.display_name,
primary_email = COALESCE(excluded.primary_email, canonical_entities.primary_email),
entity_kind = excluded.entity_kind,
-- Re-emitting a deterministic id means the entity is live again, so
-- clear any prior tombstone (e.g. a stale-cleanup soft-delete from a
-- run when the source row was briefly absent). Fuzzy-merge losers are
-- redirected away by _redirect and never re-upserted, so this never
-- resurrects a merged-away entity.
deleted_at = NULL,
updated_at = excluded.updated_at
""",
(eid, kind, display_name, primary_email or None, _now(), _now()),
@@ -221,12 +227,21 @@ def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv, merge_map=Non
email = norm_email(r["email"])
name_norm = norm_text(r["full_name"] or "")
inv_canon = org_canon_by_fundinv.get(r["investor_id"])
# Match the grid contact to its contacts-table person by PROVABLE keys only:
# exact email, else exact name within the SAME canonical investor. The app
# keeps the grid and the contacts table in sync (_upsert_contact_from_
# fundraising), so a grid contact IS an existing contact-person, never a new
# one. On a confident match, record the membership. On a miss we deliberately
# do NOT mint a person: the old else-branch mint is exactly what produced the
# people double-count (a grid row whose (name, investor) key didn't line up
# with its contact minted a duplicate), and guessing by name across firms
# risks binding two different same-named people. Unresolved grid rows are
# left for the explicit contact_id link planned in the grid/contacts
# unification — honest separation: never merge or mint on a guess.
cid = (by_email.get(email) if email else None) or by_name_inv.get((name_norm, inv_canon or ""))
if cid:
_link(conn, cid, "fundraising_contacts", r["id"], email or name_norm, "grid_assoc", 0.9)
_member_of(conn, cid, inv_canon)
else:
_person(r["full_name"] or "", email, inv_canon, "fundraising_contacts", r["id"])
# lp_profiles -> the person entity of its contact
for r in conn.execute("SELECT id, contact_id FROM lp_profiles WHERE deleted_at IS NULL"):
@@ -270,17 +285,56 @@ def run(db_path: str):
conn.commit()
person_meta = resolve_people(conn, org_by_oid, org_by_inv, merge_map)
conn.commit()
live = "deleted_at IS NULL"
# ── Clean up stale derived rows (soft-delete only; guardrail #3) ──
# Two UNAMBIGUOUS classes of obsolete entity_resolution-owned rows, tombstoned
# ONLY when they carry no human/enrichment data so nothing a partner entered is
# ever dropped:
# (a) PERSON rows with no 'contacts' source link. Real people come from the
# contacts table (pass 1); a person linked only from the grid is a leftover
# "twin" minted by the pre-fix else-branch — the source of the 1406
# double-count. (Narrow + safe: a contact whose canonical id merely
# *changed* still keeps a 'contacts' link, so it is never caught here.)
# (b) Rows under the superseded pre-:48 kinds 'lp'/'organization' (the model
# is now investor | person), left live by old upsert-only runs.
# We list the ids first and log them (guardrail #5: the soft-delete is
# reviewable/undoable), then tombstone + audit in ONE transaction.
nodata = ("thesis_fit IS NULL AND segment IS NULL AND accreditation_status IS NULL "
"AND qp_status IS NULL AND warmth_score IS NULL AND owner_id IS NULL "
"AND last_touch_at IS NULL AND notes IS NULL")
stale = [r["id"] for r in conn.execute(f"""
SELECT id FROM canonical_entities c
WHERE {live} AND source='entity_resolution' AND {nodata}
AND ( (entity_kind='person' AND NOT EXISTS (
SELECT 1 FROM entity_links l
WHERE l.canonical_id = c.id AND l.source_model = 'contacts'))
OR entity_kind IN ('lp', 'organization') )
""")]
for sid in stale:
conn.execute("UPDATE canonical_entities SET deleted_at=?, updated_at=? WHERE id=?", (_now(), _now(), sid))
if stale:
conn.execute(
"""INSERT INTO interaction_log
(id, ts, actor_type, actor_id, action, target_type, target_id, payload, source, created_at)
VALUES (?, ?, 'system', 'entity_resolver', 'entity.stale_tombstoned', 'canonical_entities', NULL, ?, 'ingest', ?)""",
(str(uuid.uuid4()), _now(), json.dumps({"count": len(stale), "ids": stale}), _now()),
)
pruned = len(stale)
conn.commit()
candidates = find_fuzzy_candidates(person_meta)
# Counts report LIVE entities (deleted_at IS NULL); fuzzy-merged losers are
# soft-deleted tombstones (guardrail #3) and excluded.
live = "deleted_at IS NULL"
# Counts report LIVE entities (deleted_at IS NULL); fuzzy-merged losers and
# tombstoned stale rows are soft-deleted (guardrail #3) and excluded.
counts = {
"canonical_total": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE {live}").fetchone()[0],
"investor": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='investor' AND {live}").fetchone()[0],
"person": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0],
"links": conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0],
"fuzzy_candidates": len(candidates),
"pruned_stale": pruned,
}
conn.execute(
+190
View File
@@ -0,0 +1,190 @@
#!/usr/bin/env python3
"""Offline regression test for the grid-contact double-count fix (safe version).
Reproduces the real-data condition that made PEOPLE (RESOLVED) ≈ contacts +
grid-contacts (the ~1406 bug): grid contacts (fundraising_contacts) with no email
and contacts with no organization, so the (name, investor) keys never coincide and
every grid row used to mint its own duplicate person.
Asserts the SAFE fix:
1. a grid contact with a matching email links back to its contact (no new person),
2. a grid contact whose contact shares the same investor links back by name,
3. a grid contact that can't be PROVABLY matched mints NOTHING (no duplicate
person, no cross-firm name guess) — the count stays correct,
4. targeted cleanup soft-deletes a stale grid-only "twin" (person with no
contacts link) and a superseded 'lp'/'organization' row, with no enrichment,
5. cleanup PRESERVES a grid-only person that carries enrichment (guardrail #3),
6. a re-emitted id is UN-tombstoned (no permanent burial),
7. re-running is idempotent.
Pure stdlib + SQLite; never calls Spark/Qwen (er.run is deterministic-only).
Run: cd backend/ingest && python3 test_entity_resolution.py
"""
import os
import sqlite3
import sys
import tempfile
import entity_resolution as er
# DDL applied by seed() (via executescript) to build the throwaway test
# database: the canonical-layer tables (canonical_entities, entity_links,
# relationship_edges, interaction_log) plus the source tables the resolver
# reads (contacts, organizations, fundraising_investors, fundraising_contacts,
# lp_profiles).
# NOTE(review): presumably a trimmed-down copy of the production schema that
# keeps only the columns entity_resolution touches — confirm against the app's
# migrations when columns change.
SCHEMA = """
CREATE TABLE canonical_entities (
id TEXT PRIMARY KEY, entity_kind TEXT NOT NULL, display_name TEXT NOT NULL,
primary_email TEXT, thesis_fit TEXT, segment TEXT, accreditation_status TEXT,
qp_status TEXT, warmth_score REAL, source TEXT, owner_id TEXT,
last_touch_at TEXT, notes TEXT,
created_at TEXT DEFAULT (datetime('now')), updated_at TEXT DEFAULT (datetime('now')),
deleted_at TEXT
);
CREATE TABLE entity_links (
id TEXT PRIMARY KEY, canonical_id TEXT, source_model TEXT, source_id TEXT,
match_value TEXT, match_kind TEXT, confidence REAL, created_at TEXT,
UNIQUE(source_model, source_id, match_value)
);
CREATE TABLE relationship_edges (
id TEXT PRIMARY KEY, src_id TEXT, dst_id TEXT, edge_type TEXT, source TEXT,
strength REAL, directed INTEGER, first_seen_at TEXT, last_seen_at TEXT,
created_at TEXT, updated_at TEXT,
UNIQUE(src_id, dst_id, edge_type, source)
);
CREATE TABLE interaction_log (
id TEXT PRIMARY KEY, ts TEXT, actor_type TEXT, actor_id TEXT, action TEXT,
target_type TEXT, target_id TEXT, payload TEXT, source TEXT, created_at TEXT
);
CREATE TABLE contacts (
id TEXT PRIMARY KEY, first_name TEXT, last_name TEXT, email TEXT,
organization_id TEXT, deleted_at TEXT
);
CREATE TABLE organizations (id TEXT PRIMARY KEY, name TEXT, email TEXT);
CREATE TABLE fundraising_investors (id TEXT PRIMARY KEY, investor_name TEXT);
CREATE TABLE fundraising_contacts (id TEXT PRIMARY KEY, full_name TEXT, email TEXT, investor_id TEXT);
CREATE TABLE lp_profiles (id TEXT PRIMARY KEY, contact_id TEXT, deleted_at TEXT);
"""
# Ids of the canonical_entities rows that seed() inserts directly (the stale
# grid-only twin, the enriched grid-only person, and the superseded 'lp' row).
# resolved_persons() and minted_from_grid() exclude these ids so their counts
# are not affected by the hand-planted fixture rows.
SEEDED = ("per_TWIN", "per_ENR", "lp_OLD")
def seed(db):
    """Create the test schema in the SQLite file *db* and load the fixture rows.

    Inserts one organization, three contacts, two grid investors, four grid
    contacts, and three hand-planted canonical rows (the ids listed in SEEDED)
    together with the entity_links rows for the two planted persons.
    """
    conn = sqlite3.connect(db)
    conn.executescript(SCHEMA)

    # --- source tables -----------------------------------------------------
    conn.execute("INSERT INTO organizations (id, name, email) VALUES ('o1','Acme Capital',NULL)")

    contact_rows = [
        ("c1", "Alice", "Anderson", "alice@x.com", None),  # has an email, no organization
        ("c2", "Bob", "Brown", None, None),                # neither email nor organization
        ("c3", "Dave", "Davis", None, "o1"),               # no email, organization = Acme
    ]
    conn.executemany(
        "INSERT INTO contacts (id, first_name, last_name, email, organization_id) VALUES (?,?,?,?,?)",
        contact_rows,
    )

    investor_rows = [
        ("i_acme", "Acme Capital"),
        ("i_beta", "Beta Family Office"),
    ]
    conn.executemany(
        "INSERT INTO fundraising_investors (id, investor_name) VALUES (?,?)",
        investor_rows,
    )

    grid_rows = [
        ("g_alice", "Alice Anderson", "alice@x.com", "i_beta"),  # expected: email match to c1
        ("g_dave", "Dave Davis", None, "i_acme"),                # expected: name+investor match to c3
        ("g_bob", "Bob Brown", None, "i_beta"),                  # expected: miss (c2 has no org), nothing minted
        ("g_carol", "Carol Clark", None, "i_beta"),              # expected: miss (no such contact), nothing minted
    ]
    conn.executemany(
        "INSERT INTO fundraising_contacts (id, full_name, email, investor_id) VALUES (?,?,?,?)",
        grid_rows,
    )

    # --- hand-planted canonical rows ---------------------------------------
    entity_insert = "INSERT INTO canonical_entities (id, entity_kind, display_name, source) VALUES "
    link_insert = (
        "INSERT INTO entity_links (id, canonical_id, source_model, source_id, "
        "match_value, match_kind, confidence, created_at) "
    )

    # Stale grid-only "twin": a person whose only link is from
    # fundraising_contacts and which carries no enrichment -> should be pruned.
    conn.execute(entity_insert + "('per_TWIN','person','Ghost Twin','entity_resolution')")
    conn.execute(
        link_insert + "VALUES ('l_twin','per_TWIN','fundraising_contacts','gx','ghost','name_org',0.8,'t')"
    )

    # Grid-only person that DOES carry enrichment (segment) -> must be kept
    # (guardrail #3).
    conn.execute(
        "INSERT INTO canonical_entities (id, entity_kind, display_name, source, segment) VALUES "
        + "('per_ENR','person','Enriched Orphan','entity_resolution','warm')"
    )
    conn.execute(
        link_insert + "VALUES ('l_enr','per_ENR','fundraising_contacts','gy','enr','name_org',0.8,'t')"
    )

    # Row under the superseded pre-:48 kind 'lp' -> should be pruned.
    conn.execute(entity_insert + "('lp_OLD','lp','Old LP Row','entity_resolution')")

    conn.commit()
    conn.close()
def resolved_persons(db, exclude=None):
    """Count live person entities in *db*, leaving out the excluded ids.

    Args:
        db: Path to the SQLite database file.
        exclude: Iterable of canonical ids to leave out of the count. When
            omitted, the module-level SEEDED ids are excluded, which is the
            original behaviour.

    Returns:
        The number of canonical_entities rows with entity_kind='person' and
        deleted_at IS NULL whose id is not in the exclusion list.
    """
    excluded = tuple(SEEDED if exclude is None else exclude)
    query = "SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND deleted_at IS NULL"
    if excluded:
        # One placeholder per excluded id, so the SQL can never drift out of
        # step with the length of the exclusion list.
        marks = ",".join("?" * len(excluded))
        query += f" AND id NOT IN ({marks})"
    conn = sqlite3.connect(db)
    try:
        return conn.execute(query, excluded).fetchone()[0]
    finally:
        # Close even when the query raises so the file handle is not leaked.
        conn.close()
def deleted_at(db, eid):
    """Look up the deleted_at value of canonical entity *eid* in *db*.

    Returns the stored value (None for a row whose deleted_at is NULL), or the
    string "MISSING" when no row with that id exists.
    """
    conn = sqlite3.connect(db)
    cursor = conn.execute("SELECT deleted_at FROM canonical_entities WHERE id=?", (eid,))
    row = cursor.fetchone()
    conn.close()
    if not row:
        return "MISSING"
    (value,) = row
    return value
def grid_match_kinds(db):
    """Tally fundraising_contacts links in *db* by match_kind.

    Links whose match_kind is 'name_org' are left out. Returns a dict mapping
    each remaining match_kind to its number of entity_links rows.
    """
    conn = sqlite3.connect(db)
    cursor = conn.execute(
        "SELECT match_kind, COUNT(*) FROM entity_links "
        "WHERE source_model='fundraising_contacts' AND match_kind!='name_org' GROUP BY match_kind"
    )
    tally = {kind: total for kind, total in cursor}
    conn.close()
    return tally
def minted_from_grid(db, exclude=None):
    """Persons minted directly from a grid row (the bug). Should be 0 after the fix.

    Counts the distinct live canonical ids that have a fundraising_contacts
    link with match_kind 'name_org' or 'exact_email'.

    Args:
        db: Path to the SQLite database file.
        exclude: Iterable of canonical ids to leave out of the count. When
            omitted, the module-level SEEDED ids are excluded, which is the
            original behaviour.

    Returns:
        The number of distinct matching canonical ids.
    """
    excluded = tuple(SEEDED if exclude is None else exclude)
    query = """SELECT COUNT(DISTINCT l.canonical_id) FROM entity_links l
        JOIN canonical_entities ce ON ce.id=l.canonical_id AND ce.deleted_at IS NULL
        WHERE l.source_model='fundraising_contacts' AND l.match_kind IN ('name_org','exact_email')"""
    if excluded:
        # One placeholder per excluded id, so the SQL can never drift out of
        # step with the length of the exclusion list.
        marks = ",".join("?" * len(excluded))
        query += f" AND l.canonical_id NOT IN ({marks})"
    conn = sqlite3.connect(db)
    try:
        return conn.execute(query, excluded).fetchone()[0]
    finally:
        # Close even when the query raises so the file handle is not leaked.
        conn.close()
# Messages of every failed check, in the order they were reported.
FAILS = []


def check(cond, msg):
    """Print a PASS/FAIL line for *msg* and remember the message if it failed.

    *cond* is evaluated for truthiness; a falsy value appends *msg* to FAILS.
    """
    verdict = " FAIL "
    if cond:
        verdict = " PASS "
    print(verdict + msg)
    if cond:
        return
    FAILS.append(msg)
def main():
    """Run the offline reproduction end to end and report the results.

    Seeds a scratch SQLite database, runs er.run() on it twice (the second
    time after soft-deleting one contact-person by hand), and records each
    expectation through check(). Exits with status 1 if any check failed;
    otherwise prints "ALL PASS". The scratch directory is removed on the way
    out, whether the run passed, failed, or raised.
    """
    import shutil  # local import: only needed to clean up the scratch directory

    tmp = tempfile.mkdtemp()
    try:
        db = os.path.join(tmp, "repro.db")
        seed(db)

        def tombstoned(eid):
            # deleted_at() returns the "MISSING" sentinel when the row is gone.
            # A hard-deleted row is NOT a tombstone (cleanup must soft-delete
            # only), so the sentinel must not satisfy a "tombstoned" check.
            return deleted_at(db, eid) not in (None, "MISSING")

        counts1, _ = er.run(db)
        print(f"Run 1 counts: {counts1}")
        # 3 contacts; grid rows either link back (g_alice, g_dave) or are skipped
        # (g_bob, g_carol). NO grid row mints a person -> count stays 3, not 5-7.
        check(resolved_persons(db) == 3, f"resolved persons == 3 (got {resolved_persons(db)}); old double-count would be 5-7")
        check(minted_from_grid(db) == 0, f"zero persons minted from grid rows (got {minted_from_grid(db)})")
        mk = grid_match_kinds(db)
        check(mk.get("grid_assoc", 0) == 2, f"two grid contacts matched back via grid_assoc (got {mk.get('grid_assoc',0)})")
        # Targeted cleanup: stale grid-only twin + superseded 'lp' row tombstoned...
        check(tombstoned("per_TWIN"), "stale grid-only twin 'per_TWIN' tombstoned")
        check(tombstoned("lp_OLD"), "superseded 'lp' row 'lp_OLD' tombstoned")
        # ...enriched grid-only person preserved.
        check(deleted_at(db, "per_ENR") is None, "enriched grid-only person 'per_ENR' PRESERVED (has segment)")
        check(counts1.get("pruned_stale", 0) == 2, f"exactly 2 stale rows pruned (got {counts1.get('pruned_stale')})")

        # Un-tombstone: soft-delete a real contact-person, then re-run -> it comes back.
        alice = er._eid("per", "e|alice@x.com")
        cc = sqlite3.connect(db)
        try:
            cc.execute("UPDATE canonical_entities SET deleted_at='2026-01-01' WHERE id=?", (alice,))
            cc.commit()
        finally:
            cc.close()

        counts2, _ = er.run(db)
        print(f"Run 2 counts: {counts2}")
        check(deleted_at(db, alice) is None, "re-emitted contact-person is UN-tombstoned (no permanent burial)")
        check(resolved_persons(db) == 3, f"resolved persons stable at 3 on re-run (got {resolved_persons(db)})")
        check(counts2.get("pruned_stale", 0) == 0, f"nothing re-pruned on idempotent re-run (got {counts2.get('pruned_stale')})")

        print()
        if FAILS:
            print(f"FAILED ({len(FAILS)}):")
            for failed in FAILS:
                print(f" - {failed}")
            sys.exit(1)
        print("ALL PASS")
    finally:
        # Runs on normal return, on sys.exit(), and on an unexpected exception.
        # ignore_errors keeps a cleanup problem from masking the test outcome.
        shutil.rmtree(tmp, ignore_errors=True)


if __name__ == "__main__":
    main()