Fix people double-count + duplicate-queue explosion (v0.1.0:51)
Root cause: grid contacts (fundraising_contacts) are the SAME people as the contacts table (the app syncs them by name/email), but resolution matched grid rows by (name + investor-canon), where the two sides derive the investor key from different tables that rarely line up — so nearly every grid contact minted a duplicate person (715 + ~692 ≈ 1406), and the duplicate finder then flagged each twin against its real self (~676 candidates).

Fix (entity_resolution.py):
- Grid pass matches a grid contact to its existing contacts-table person by PROVABLE keys only (exact email, else exact name within the same investor) and records membership; on a miss it MINTS NOTHING (the old else-branch mint was the double-count source, and guessing by name across firms risks binding two different same-named people).
- Targeted, audited cleanup soft-deletes leftover grid-only "twins" (person rows with no 'contacts' link) and superseded pre-:48 'lp'/'organization' rows, guarded so any row carrying enrichment/human data is never dropped (guardrail #3); the tombstoned ids are logged to interaction_log (guardrail #5).
- _upsert_entity clears deleted_at on conflict so a re-emitted id is un-tombstoned (no permanent burial); fuzzy-merge losers stay buried via _redirect.

entity_merge.py / server.py: the duplicate queue + pending count now filter to candidates where both sides are still live, so self-healed twins drop out.

Verified: offline reproduction test (backend/ingest/test_entity_resolution.py, 10/10) reproduces the 1406-style doubling and proves it collapses; no regression on the synthetic dev set; two adversarial review passes.

Known pre-existing identity-key weaknesses (same name+firm+no email collision; shared role inbox over-link) are unchanged by this fix and will be resolved structurally by the contact_id link in the grid/contacts unification.

Run "Build search index" after upgrading to recompute the canonical layer.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -90,6 +90,12 @@ def _upsert_entity(conn, eid, kind, display_name, primary_email):
|
||||
display_name = excluded.display_name,
|
||||
primary_email = COALESCE(excluded.primary_email, canonical_entities.primary_email),
|
||||
entity_kind = excluded.entity_kind,
|
||||
-- Re-emitting a deterministic id means the entity is live again, so
|
||||
-- clear any prior tombstone (e.g. a stale-cleanup soft-delete from a
|
||||
-- run when the source row was briefly absent). Fuzzy-merge losers are
|
||||
-- redirected away by _redirect and never re-upserted, so this never
|
||||
-- resurrects a merged-away entity.
|
||||
deleted_at = NULL,
|
||||
updated_at = excluded.updated_at
|
||||
""",
|
||||
(eid, kind, display_name, primary_email or None, _now(), _now()),
|
||||
@@ -221,12 +227,21 @@ def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv, merge_map=Non
|
||||
email = norm_email(r["email"])
|
||||
name_norm = norm_text(r["full_name"] or "")
|
||||
inv_canon = org_canon_by_fundinv.get(r["investor_id"])
|
||||
# Match the grid contact to its contacts-table person by PROVABLE keys only:
|
||||
# exact email, else exact name within the SAME canonical investor. The app
|
||||
# keeps the grid and the contacts table in sync (_upsert_contact_from_
|
||||
# fundraising), so a grid contact IS an existing contact-person, never a new
|
||||
# one. On a confident match, record the membership. On a miss we deliberately
|
||||
# do NOT mint a person: the old else-branch mint is exactly what produced the
|
||||
# people double-count (a grid row whose (name, investor) key didn't line up
|
||||
# with its contact minted a duplicate), and guessing by name across firms
|
||||
# risks binding two different same-named people. Unresolved grid rows are
|
||||
# left for the explicit contact_id link planned in the grid/contacts
|
||||
# unification — honest separation: never merge or mint on a guess.
|
||||
cid = (by_email.get(email) if email else None) or by_name_inv.get((name_norm, inv_canon or ""))
|
||||
if cid:
|
||||
_link(conn, cid, "fundraising_contacts", r["id"], email or name_norm, "grid_assoc", 0.9)
|
||||
_member_of(conn, cid, inv_canon)
|
||||
else:
|
||||
_person(r["full_name"] or "", email, inv_canon, "fundraising_contacts", r["id"])
|
||||
|
||||
# lp_profiles -> the person entity of its contact
|
||||
for r in conn.execute("SELECT id, contact_id FROM lp_profiles WHERE deleted_at IS NULL"):
|
||||
@@ -270,17 +285,56 @@ def run(db_path: str):
|
||||
conn.commit()
|
||||
person_meta = resolve_people(conn, org_by_oid, org_by_inv, merge_map)
|
||||
conn.commit()
|
||||
|
||||
live = "deleted_at IS NULL"
|
||||
|
||||
# ── Clean up stale derived rows (soft-delete only; guardrail #3) ──
|
||||
# Two UNAMBIGUOUS classes of obsolete entity_resolution-owned rows, tombstoned
|
||||
# ONLY when they carry no human/enrichment data so nothing a partner entered is
|
||||
# ever dropped:
|
||||
# (a) PERSON rows with no 'contacts' source link. Real people come from the
|
||||
# contacts table (pass 1); a person linked only from the grid is a leftover
|
||||
# "twin" minted by the pre-fix else-branch — the source of the 1406
|
||||
# double-count. (Narrow + safe: a contact whose canonical id merely
|
||||
# *changed* still keeps a 'contacts' link, so it is never caught here.)
|
||||
# (b) Rows under the superseded pre-:48 kinds 'lp'/'organization' (the model
|
||||
# is now investor | person), left live by old upsert-only runs.
|
||||
# We list the ids first and log them (guardrail #5: the soft-delete is
|
||||
# reviewable/undoable), then tombstone + audit in ONE transaction.
|
||||
nodata = ("thesis_fit IS NULL AND segment IS NULL AND accreditation_status IS NULL "
|
||||
"AND qp_status IS NULL AND warmth_score IS NULL AND owner_id IS NULL "
|
||||
"AND last_touch_at IS NULL AND notes IS NULL")
|
||||
stale = [r["id"] for r in conn.execute(f"""
|
||||
SELECT id FROM canonical_entities c
|
||||
WHERE {live} AND source='entity_resolution' AND {nodata}
|
||||
AND ( (entity_kind='person' AND NOT EXISTS (
|
||||
SELECT 1 FROM entity_links l
|
||||
WHERE l.canonical_id = c.id AND l.source_model = 'contacts'))
|
||||
OR entity_kind IN ('lp', 'organization') )
|
||||
""")]
|
||||
for sid in stale:
|
||||
conn.execute("UPDATE canonical_entities SET deleted_at=?, updated_at=? WHERE id=?", (_now(), _now(), sid))
|
||||
if stale:
|
||||
conn.execute(
|
||||
"""INSERT INTO interaction_log
|
||||
(id, ts, actor_type, actor_id, action, target_type, target_id, payload, source, created_at)
|
||||
VALUES (?, ?, 'system', 'entity_resolver', 'entity.stale_tombstoned', 'canonical_entities', NULL, ?, 'ingest', ?)""",
|
||||
(str(uuid.uuid4()), _now(), json.dumps({"count": len(stale), "ids": stale}), _now()),
|
||||
)
|
||||
pruned = len(stale)
|
||||
conn.commit()
|
||||
|
||||
candidates = find_fuzzy_candidates(person_meta)
|
||||
|
||||
# Counts report LIVE entities (deleted_at IS NULL); fuzzy-merged losers are
|
||||
# soft-deleted tombstones (guardrail #3) and excluded.
|
||||
live = "deleted_at IS NULL"
|
||||
# Counts report LIVE entities (deleted_at IS NULL); fuzzy-merged losers and
|
||||
# tombstoned stale rows are soft-deleted (guardrail #3) and excluded.
|
||||
counts = {
|
||||
"canonical_total": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE {live}").fetchone()[0],
|
||||
"investor": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='investor' AND {live}").fetchone()[0],
|
||||
"person": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0],
|
||||
"links": conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0],
|
||||
"fuzzy_candidates": len(candidates),
|
||||
"pruned_stale": pruned,
|
||||
}
|
||||
|
||||
conn.execute(
|
||||
|
||||
Reference in New Issue
Block a user