From d16264f4010c535842563acafee050a7875fd777 Mon Sep 17 00:00:00 2001 From: Keysat Date: Fri, 5 Jun 2026 14:49:39 -0500 Subject: [PATCH] Fix people double-count + duplicate-queue explosion (v0.1.0:51) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: grid contacts (fundraising_contacts) are the SAME people as the contacts table (the app syncs them by name/email), but resolution matched grid rows by (name + investor-canon) where the two sides derive the investor key from different tables that rarely line up — so nearly every grid contact minted a duplicate person (715 + ~692 ≈ 1406), and the duplicate finder then flagged each twin against its real self (~676 candidates). Fix (entity_resolution.py): - Grid pass matches a grid contact to its existing contacts-table person by PROVABLE keys only (exact email, else exact name within the same investor) and records membership; on a miss it MINTS NOTHING (the old else-branch mint was the double-count source, and guessing by name across firms risks binding two different same-named people). - Targeted, audited cleanup soft-deletes leftover grid-only "twins" (person rows with no 'contacts' link) and superseded pre-:48 'lp'/'organization' rows, guarded so any row carrying enrichment/human data is never dropped (guardrail #3); the tombstoned ids are logged to interaction_log (guardrail #5). - _upsert_entity clears deleted_at on conflict so a re-emitted id is un-tombstoned (no permanent burial); fuzzy-merge losers stay buried via _redirect. entity_merge.py / server.py: the duplicate queue + pending count now filter to candidates whose both sides are still live, so self-healed twins drop out. Verified: offline reproduction test (backend/ingest/test_entity_resolution.py, 10/10) reproduces the 1406-style doubling and proves it collapses; no regression on the synthetic dev set; two adversarial review passes. Known pre-existing identity-key weaknesses (same name+firm+no email collision; shared role inbox over-link) are unchanged by this fix and will be resolved structurally by the contact_id link in the grid/contacts unification. Run "Build search index" after upgrading to recompute the canonical layer. Co-Authored-By: Claude Opus 4.8 --- backend/entity_merge.py | 10 +- backend/ingest/entity_resolution.py | 64 +++++++- backend/ingest/test_entity_resolution.py | 190 +++++++++++++++++++++++ backend/server.py | 8 +- start9/0.4/startos/utils.ts | 5 +- start9/0.4/startos/versions/index.ts | 5 +- start9/0.4/startos/versions/v0.1.0.51.ts | 22 +++ 7 files changed, 293 insertions(+), 11 deletions(-) create mode 100644 backend/ingest/test_entity_resolution.py create mode 100644 start9/0.4/startos/versions/v0.1.0.51.ts diff --git a/backend/entity_merge.py b/backend/entity_merge.py index 41c9672..66d9525 100644 --- a/backend/entity_merge.py +++ b/backend/entity_merge.py @@ -31,8 +31,16 @@ def _log(c, actor_id, action, target_id, payload): def list_candidates(db, status="pending"): c = _conn(db) + # Only surface candidates whose BOTH sides are still live canonical entities. + # When entity resolution self-heals (a grid "twin" matched back, so its + # duplicate entity is soft-deleted), the candidate that paired them becomes + # moot — hide it instead of asking a human to adjudicate a tombstone. rows = [dict(r) for r in c.execute( - "SELECT * FROM entity_merge_candidates WHERE status=? ORDER BY confidence DESC, created_at DESC", (status,))] + """SELECT mc.* FROM entity_merge_candidates mc + JOIN canonical_entities a ON a.id = mc.entity_a AND a.deleted_at IS NULL + JOIN canonical_entities b ON b.id = mc.entity_b AND b.deleted_at IS NULL + WHERE mc.status=? + ORDER BY mc.confidence DESC, mc.created_at DESC""", (status,))] c.close() return {"candidates": rows, "count": len(rows)} diff --git a/backend/ingest/entity_resolution.py b/backend/ingest/entity_resolution.py index 920e154..158008f 100644 --- a/backend/ingest/entity_resolution.py +++ b/backend/ingest/entity_resolution.py @@ -90,6 +90,12 @@ def _upsert_entity(conn, eid, kind, display_name, primary_email): display_name = excluded.display_name, primary_email = COALESCE(excluded.primary_email, canonical_entities.primary_email), entity_kind = excluded.entity_kind, + -- Re-emitting a deterministic id means the entity is live again, so + -- clear any prior tombstone (e.g. a stale-cleanup soft-delete from a + -- run when the source row was briefly absent). Fuzzy-merge losers are + -- redirected away by _redirect and never re-upserted, so this never + -- resurrects a merged-away entity. + deleted_at = NULL, updated_at = excluded.updated_at """, (eid, kind, display_name, primary_email or None, _now(), _now()), @@ -221,12 +227,21 @@ def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv, merge_map=Non email = norm_email(r["email"]) name_norm = norm_text(r["full_name"] or "") inv_canon = org_canon_by_fundinv.get(r["investor_id"]) + # Match the grid contact to its contacts-table person by PROVABLE keys only: + # exact email, else exact name within the SAME canonical investor. The app + # keeps the grid and the contacts table in sync (_upsert_contact_from_ + # fundraising), so a grid contact IS an existing contact-person, never a new + # one. On a confident match, record the membership. On a miss we deliberately + # do NOT mint a person: the old else-branch mint is exactly what produced the + # people double-count (a grid row whose (name, investor) key didn't line up + # with its contact minted a duplicate), and guessing by name across firms + # risks binding two different same-named people. Unresolved grid rows are + # left for the explicit contact_id link planned in the grid/contacts + # unification — honest separation: never merge or mint on a guess. cid = (by_email.get(email) if email else None) or by_name_inv.get((name_norm, inv_canon or "")) if cid: _link(conn, cid, "fundraising_contacts", r["id"], email or name_norm, "grid_assoc", 0.9) _member_of(conn, cid, inv_canon) - else: - _person(r["full_name"] or "", email, inv_canon, "fundraising_contacts", r["id"]) # lp_profiles -> the person entity of its contact for r in conn.execute("SELECT id, contact_id FROM lp_profiles WHERE deleted_at IS NULL"): @@ -270,17 +285,56 @@ def run(db_path: str): conn.commit() person_meta = resolve_people(conn, org_by_oid, org_by_inv, merge_map) conn.commit() + + live = "deleted_at IS NULL" + + # ── Clean up stale derived rows (soft-delete only; guardrail #3) ── + # Two UNAMBIGUOUS classes of obsolete entity_resolution-owned rows, tombstoned + # ONLY when they carry no human/enrichment data so nothing a partner entered is + # ever dropped: + # (a) PERSON rows with no 'contacts' source link. Real people come from the + # contacts table (pass 1); a person linked only from the grid is a leftover + # "twin" minted by the pre-fix else-branch — the source of the 1406 + # double-count. (Narrow + safe: a contact whose canonical id merely + # *changed* still keeps a 'contacts' link, so it is never caught here.) + # (b) Rows under the superseded pre-:48 kinds 'lp'/'organization' (the model + # is now investor | person), left live by old upsert-only runs. + # We list the ids first and log them (guardrail #5: the soft-delete is + # reviewable/undoable), then tombstone + audit in ONE transaction. + nodata = ("thesis_fit IS NULL AND segment IS NULL AND accreditation_status IS NULL " + "AND qp_status IS NULL AND warmth_score IS NULL AND owner_id IS NULL " + "AND last_touch_at IS NULL AND notes IS NULL") + stale = [r["id"] for r in conn.execute(f""" + SELECT id FROM canonical_entities c + WHERE {live} AND source='entity_resolution' AND {nodata} + AND ( (entity_kind='person' AND NOT EXISTS ( + SELECT 1 FROM entity_links l + WHERE l.canonical_id = c.id AND l.source_model = 'contacts')) + OR entity_kind IN ('lp', 'organization') ) + """)] + for sid in stale: + conn.execute("UPDATE canonical_entities SET deleted_at=?, updated_at=? WHERE id=?", (_now(), _now(), sid)) + if stale: + conn.execute( + """INSERT INTO interaction_log + (id, ts, actor_type, actor_id, action, target_type, target_id, payload, source, created_at) + VALUES (?, ?, 'system', 'entity_resolver', 'entity.stale_tombstoned', 'canonical_entities', NULL, ?, 'ingest', ?)""", + (str(uuid.uuid4()), _now(), json.dumps({"count": len(stale), "ids": stale}), _now()), + ) + pruned = len(stale) + conn.commit() + candidates = find_fuzzy_candidates(person_meta) - # Counts report LIVE entities (deleted_at IS NULL); fuzzy-merged losers are - # soft-deleted tombstones (guardrail #3) and excluded. - live = "deleted_at IS NULL" + # Counts report LIVE entities (deleted_at IS NULL); fuzzy-merged losers and + # tombstoned stale rows are soft-deleted (guardrail #3) and excluded. counts = { "canonical_total": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE {live}").fetchone()[0], "investor": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='investor' AND {live}").fetchone()[0], "person": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0], "links": conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0], "fuzzy_candidates": len(candidates), + "pruned_stale": pruned, } conn.execute( diff --git a/backend/ingest/test_entity_resolution.py b/backend/ingest/test_entity_resolution.py new file mode 100644 index 0000000..d50d59e --- /dev/null +++ b/backend/ingest/test_entity_resolution.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +"""Offline regression test for the grid-contact double-count fix (safe version). + +Reproduces the real-data condition that made PEOPLE (RESOLVED) ≈ contacts + +grid-contacts (the ~1406 bug): grid contacts (fundraising_contacts) with no email +and contacts with no organization, so the (name, investor) keys never coincide and +every grid row used to mint its own duplicate person. + +Asserts the SAFE fix: + 1. a grid contact with a matching email links back to its contact (no new person), + 2. a grid contact whose contact shares the same investor links back by name, + 3. a grid contact that can't be PROVABLY matched mints NOTHING (no duplicate + person, no cross-firm name guess) — the count stays correct, + 4. targeted cleanup soft-deletes a stale grid-only "twin" (person with no + contacts link) and a superseded 'lp'/'organization' row, with no enrichment, + 5. cleanup PRESERVES a grid-only person that carries enrichment (guardrail #3), + 6. a re-emitted id is UN-tombstoned (no permanent burial), + 7. re-running is idempotent. + +Pure stdlib + SQLite; never calls Spark/Qwen (er.run is deterministic-only). +Run: cd backend/ingest && python3 test_entity_resolution.py +""" +import os +import sqlite3 +import sys +import tempfile + +import entity_resolution as er + +SCHEMA = """ +CREATE TABLE canonical_entities ( + id TEXT PRIMARY KEY, entity_kind TEXT NOT NULL, display_name TEXT NOT NULL, + primary_email TEXT, thesis_fit TEXT, segment TEXT, accreditation_status TEXT, + qp_status TEXT, warmth_score REAL, source TEXT, owner_id TEXT, + last_touch_at TEXT, notes TEXT, + created_at TEXT DEFAULT (datetime('now')), updated_at TEXT DEFAULT (datetime('now')), + deleted_at TEXT +); +CREATE TABLE entity_links ( + id TEXT PRIMARY KEY, canonical_id TEXT, source_model TEXT, source_id TEXT, + match_value TEXT, match_kind TEXT, confidence REAL, created_at TEXT, + UNIQUE(source_model, source_id, match_value) +); +CREATE TABLE relationship_edges ( + id TEXT PRIMARY KEY, src_id TEXT, dst_id TEXT, edge_type TEXT, source TEXT, + strength REAL, directed INTEGER, first_seen_at TEXT, last_seen_at TEXT, + created_at TEXT, updated_at TEXT, + UNIQUE(src_id, dst_id, edge_type, source) +); +CREATE TABLE interaction_log ( + id TEXT PRIMARY KEY, ts TEXT, actor_type TEXT, actor_id TEXT, action TEXT, + target_type TEXT, target_id TEXT, payload TEXT, source TEXT, created_at TEXT +); +CREATE TABLE contacts ( + id TEXT PRIMARY KEY, first_name TEXT, last_name TEXT, email TEXT, + organization_id TEXT, deleted_at TEXT +); +CREATE TABLE organizations (id TEXT PRIMARY KEY, name TEXT, email TEXT); +CREATE TABLE fundraising_investors (id TEXT PRIMARY KEY, investor_name TEXT); +CREATE TABLE fundraising_contacts (id TEXT PRIMARY KEY, full_name TEXT, email TEXT, investor_id TEXT); +CREATE TABLE lp_profiles (id TEXT PRIMARY KEY, contact_id TEXT, deleted_at TEXT); +""" + +SEEDED = ("per_TWIN", "per_ENR", "lp_OLD") + + +def seed(db): + c = sqlite3.connect(db) + c.executescript(SCHEMA) + c.execute("INSERT INTO organizations (id, name, email) VALUES ('o1','Acme Capital',NULL)") + c.executemany("INSERT INTO contacts (id, first_name, last_name, email, organization_id) VALUES (?,?,?,?,?)", [ + ("c1", "Alice", "Anderson", "alice@x.com", None), # email, no org + ("c2", "Bob", "Brown", None, None), # no email, no org + ("c3", "Dave", "Davis", None, "o1"), # no email, org = Acme + ]) + c.executemany("INSERT INTO fundraising_investors (id, investor_name) VALUES (?,?)", [ + ("i_acme", "Acme Capital"), ("i_beta", "Beta Family Office"), + ]) + c.executemany("INSERT INTO fundraising_contacts (id, full_name, email, investor_id) VALUES (?,?,?,?)", [ + ("g_alice", "Alice Anderson", "alice@x.com", "i_beta"), # -> email match to c1 + ("g_dave", "Dave Davis", None, "i_acme"), # -> name+investor match to c3 + ("g_bob", "Bob Brown", None, "i_beta"), # -> MISS (c2 has no org) -> mint NOTHING + ("g_carol", "Carol Clark", None, "i_beta"), # -> MISS (no contact) -> mint NOTHING + ]) + # Stale grid-only "twin" (person, only a fundraising_contacts link, no enrichment) -> prune + c.execute("INSERT INTO canonical_entities (id, entity_kind, display_name, source) VALUES " + "('per_TWIN','person','Ghost Twin','entity_resolution')") + c.execute("INSERT INTO entity_links (id, canonical_id, source_model, source_id, match_value, match_kind, confidence, created_at) " + "VALUES ('l_twin','per_TWIN','fundraising_contacts','gx','ghost','name_org',0.8,'t')") + # Grid-only person WITH enrichment -> preserved (guardrail #3) + c.execute("INSERT INTO canonical_entities (id, entity_kind, display_name, source, segment) VALUES " + "('per_ENR','person','Enriched Orphan','entity_resolution','warm')") + c.execute("INSERT INTO entity_links (id, canonical_id, source_model, source_id, match_value, match_kind, confidence, created_at) " + "VALUES ('l_enr','per_ENR','fundraising_contacts','gy','enr','name_org',0.8,'t')") + # Superseded pre-:48 kind -> prune + c.execute("INSERT INTO canonical_entities (id, entity_kind, display_name, source) VALUES " + "('lp_OLD','lp','Old LP Row','entity_resolution')") + c.commit() + c.close() + + +def resolved_persons(db): + c = sqlite3.connect(db) + q = "SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND deleted_at IS NULL AND id NOT IN (?,?,?)" + n = c.execute(q, SEEDED).fetchone()[0] + c.close() + return n + + +def deleted_at(db, eid): + c = sqlite3.connect(db) + r = c.execute("SELECT deleted_at FROM canonical_entities WHERE id=?", (eid,)).fetchone() + c.close() + return r[0] if r else "MISSING" + + +def grid_match_kinds(db): + c = sqlite3.connect(db) + rows = dict(c.execute("SELECT match_kind, COUNT(*) FROM entity_links " + "WHERE source_model='fundraising_contacts' AND match_kind!='name_org' GROUP BY match_kind").fetchall()) + c.close() + return rows + + +def minted_from_grid(db): + """Persons minted directly from a grid row (the bug). Should be 0 after the fix.""" + c = sqlite3.connect(db) + n = c.execute("""SELECT COUNT(DISTINCT l.canonical_id) FROM entity_links l + JOIN canonical_entities ce ON ce.id=l.canonical_id AND ce.deleted_at IS NULL + WHERE l.source_model='fundraising_contacts' AND l.match_kind IN ('name_org','exact_email') + AND l.canonical_id NOT IN (?,?,?)""", SEEDED).fetchone()[0] + c.close() + return n + + +FAILS = [] + + +def check(cond, msg): + print((" PASS " if cond else " FAIL ") + msg) + if not cond: + FAILS.append(msg) + + +def main(): + tmp = tempfile.mkdtemp() + db = os.path.join(tmp, "repro.db") + seed(db) + + counts1, _ = er.run(db) + print(f"Run 1 counts: {counts1}") + + # 3 contacts; grid rows either link back (g_alice, g_dave) or are skipped + # (g_bob, g_carol). NO grid row mints a person -> count stays 3, not 5-7. + check(resolved_persons(db) == 3, f"resolved persons == 3 (got {resolved_persons(db)}); old double-count would be 5-7") + check(minted_from_grid(db) == 0, f"zero persons minted from grid rows (got {minted_from_grid(db)})") + + mk = grid_match_kinds(db) + check(mk.get("grid_assoc", 0) == 2, f"two grid contacts matched back via grid_assoc (got {mk.get('grid_assoc',0)})") + + # Targeted cleanup: stale grid-only twin + superseded 'lp' row tombstoned... + check(deleted_at(db, "per_TWIN") is not None, "stale grid-only twin 'per_TWIN' tombstoned") + check(deleted_at(db, "lp_OLD") is not None, "superseded 'lp' row 'lp_OLD' tombstoned") + # ...enriched grid-only person preserved. + check(deleted_at(db, "per_ENR") is None, "enriched grid-only person 'per_ENR' PRESERVED (has segment)") + check(counts1.get("pruned_stale", 0) == 2, f"exactly 2 stale rows pruned (got {counts1.get('pruned_stale')})") + + # Un-tombstone: soft-delete a real contact-person, then re-run -> it comes back. + alice = er._eid("per", "e|alice@x.com") + cc = sqlite3.connect(db) + cc.execute("UPDATE canonical_entities SET deleted_at='2026-01-01' WHERE id=?", (alice,)) + cc.commit() + cc.close() + counts2, _ = er.run(db) + print(f"Run 2 counts: {counts2}") + check(deleted_at(db, alice) is None, "re-emitted contact-person is UN-tombstoned (no permanent burial)") + check(resolved_persons(db) == 3, f"resolved persons stable at 3 on re-run (got {resolved_persons(db)})") + check(counts2.get("pruned_stale", 0) == 0, f"nothing re-pruned on idempotent re-run (got {counts2.get('pruned_stale')})") + + print() + if FAILS: + print(f"FAILED ({len(FAILS)}):") + for f in FAILS: + print(f" - {f}") + sys.exit(1) + print("ALL PASS") + + +if __name__ == "__main__": + main() diff --git a/backend/server.py b/backend/server.py index f4566bd..404b6e6 100644 --- a/backend/server.py +++ b/backend/server.py @@ -3517,8 +3517,14 @@ class CRMHandler(BaseHTTPRequestHandler): except Exception: out['recent_activity'] = [] try: + # Count only candidates whose both sides are still live (mirror the + # review queue in entity_merge.list_candidates) — self-healed twins + # whose duplicate was soft-deleted no longer count as pending work. out['pending_merge_candidates'] = conn.execute( - "SELECT COUNT(*) FROM entity_merge_candidates WHERE status='pending'").fetchone()[0] + """SELECT COUNT(*) FROM entity_merge_candidates mc + JOIN canonical_entities a ON a.id=mc.entity_a AND a.deleted_at IS NULL + JOIN canonical_entities b ON b.id=mc.entity_b AND b.deleted_at IS NULL + WHERE mc.status='pending'""").fetchone()[0] except Exception: out['pending_merge_candidates'] = None out['index_job'] = entity_jobs.get_status() if entity_jobs else None diff --git a/start9/0.4/startos/utils.ts b/start9/0.4/startos/utils.ts index 5f7ac8e..ba48654 100644 --- a/start9/0.4/startos/utils.ts +++ b/start9/0.4/startos/utils.ts @@ -15,8 +15,9 @@ export const PACKAGE_TITLE = 'Ten31 Database' // * 0.1.0:47 (soft-delete instead of hard-delete; source-count diagnostics) // * 0.1.0:48 (entity model: investors vs people; fixes double-count) // * 0.1.0:49 (Architect: Claude thesis generation + Thesis Workshop screen) -// * Current: 0.1.0:50 (Set Anthropic API Key UI action — no terminal needed) -export const PACKAGE_VERSION = '0.1.0:50' +// * 0.1.0:50 (Set Anthropic API Key UI action — no terminal needed) +// * Current: 0.1.0:51 (entity-resolution fix: people double-count + duplicate queue) +export const PACKAGE_VERSION = '0.1.0:51' export const DATA_MOUNT_PATH = '/data' export const WEB_PORT = 8080 diff --git a/start9/0.4/startos/versions/index.ts b/start9/0.4/startos/versions/index.ts index 38d9b46..d50a147 100644 --- a/start9/0.4/startos/versions/index.ts +++ b/start9/0.4/startos/versions/index.ts @@ -11,8 +11,9 @@ import { v_0_1_0_47 } from './v0.1.0.47' import { v_0_1_0_48 } from './v0.1.0.48' import { v_0_1_0_49 } from './v0.1.0.49' import { v_0_1_0_50 } from './v0.1.0.50' +import { v_0_1_0_51 } from './v0.1.0.51' export const versionGraph = VersionGraph.of({ - current: v_0_1_0_50, - other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42, v_0_1_0_43, v_0_1_0_44, v_0_1_0_45, v_0_1_0_46, v_0_1_0_47, v_0_1_0_48, v_0_1_0_49], + current: v_0_1_0_51, + other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42, v_0_1_0_43, v_0_1_0_44, v_0_1_0_45, v_0_1_0_46, v_0_1_0_47, v_0_1_0_48, v_0_1_0_49, v_0_1_0_50], }) diff --git a/start9/0.4/startos/versions/v0.1.0.51.ts b/start9/0.4/startos/versions/v0.1.0.51.ts new file mode 100644 index 0000000..e426212 --- /dev/null +++ b/start9/0.4/startos/versions/v0.1.0.51.ts @@ -0,0 +1,22 @@ +import { VersionInfo } from '@start9labs/start-sdk' + +// Entity-resolution fix for the people double-count (1406) and the runaway +// duplicate-review queue (676). Grid contacts now link back to their existing +// contacts-table person by provable keys only (exact email, or exact name within +// the same investor) and never mint a duplicate person on a miss; leftover grid +// "twins" and superseded pre-:48 rows are soft-deleted (enrichment-protected, +// audited); re-emitted ids are un-tombstoned. Run "Build search index" after +// upgrading to recompute the canonical layer. No data migration. +export const v_0_1_0_51 = VersionInfo.of({ + version: '0.1.0:51', + releaseNotes: { + en_US: [ + 'Fixes the inflated People count and the oversized duplicate-review queue:', + 'grid contacts now resolve to their existing contact record instead of being', + 'duplicated, and leftover duplicate "twins" are cleaned up safely (enriched', + 'records are never dropped). Run "Build search index" after upgrading to', + 'refresh the numbers.', + ].join(' '), + }, + migrations: { up: async () => {}, down: async () => {} }, +})