Entity model: investors (grid) vs people (contacts); fix double-count (0.1.0:48)

Per Grant's clarification of the real data model:
- Investor entities come from the fundraising grid, one per row, all labeled
  "investor" (drops the confusing lp/organization split). Grid is source of truth.
- People come ONLY from the contacts table. The grid's contacts
  (fundraising_contacts) are matched to a contact-person and recorded as
  member_of links to their investor, instead of creating duplicate person
  entities. This fixes the ~doubled people count (people now ≈ contacts, not
  contacts + grid contacts).
- System Status cards: Investors / People (resolved) / Contacts in CRM / Grid
  contacts, so resolved-vs-source is visible at a glance.

Verified on synthetic data: people count == contacts count (no double-count);
multi-contact investors are preserved via member_of.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Keysat
2026-06-05 13:05:58 -05:00
parent 3c31b1e8a5
commit 91361042e7
6 changed files with 102 additions and 62 deletions
+70 -50
View File
@@ -143,11 +143,12 @@ def resolve_organizations(conn, merge_map=None):
org_canon_by_orgid, org_canon_by_fundinv = {}, {} org_canon_by_orgid, org_canon_by_fundinv = {}, {}
for key, g in groups.items(): for key, g in groups.items():
# An org we are actively raising from (has a fundraising row) is an 'lp'; # Every firm group is one INVESTOR entity. The fundraising grid is the
# otherwise a plain 'organization'. # source of truth for investor entities (each row = one investor, whether
kind = "lp" if g["investors"] else "organization" # an institution/family-office or an individual); the organizations table
cid = _redirect(merge_map, _eid("lp" if kind == "lp" else "org", key)) # mirrors those names. So we no longer split into lp/organization.
_upsert_entity(conn, cid, kind, g["name"], g["email"]) cid = _redirect(merge_map, _eid("inv", key))
_upsert_entity(conn, cid, "investor", g["name"], g["email"])
for oid in g["orgs"]: for oid in g["orgs"]:
_link(conn, cid, "organizations", oid, key, "exact_name", 1.0) _link(conn, cid, "organizations", oid, key, "exact_name", 1.0)
org_canon_by_orgid[oid] = cid org_canon_by_orgid[oid] = cid
@@ -158,57 +159,77 @@ def resolve_organizations(conn, merge_map=None):
return org_canon_by_orgid, org_canon_by_fundinv return org_canon_by_orgid, org_canon_by_fundinv
def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv, merge_map=None): def _member_of(conn, person_id, investor_id):
"""Merge contacts + fundraising_contacts by exact email, else exact name within """Record that a person (contact) belongs to an investor entity."""
the same canonical org. Returns contact_id -> person canonical id (for lp_profiles).""" if not investor_id or person_id == investor_id:
merge_map = merge_map or {} return
# gather (model, source_id, full_name, email, org_canon)
people = []
for r in conn.execute("SELECT id, first_name, last_name, email, organization_id FROM contacts"):
full = f"{r['first_name'] or ''} {r['last_name'] or ''}".strip()
people.append(("contacts", r["id"], full, norm_email(r["email"]),
org_canon_by_orgid.get(r["organization_id"])))
for r in conn.execute("SELECT id, full_name, email, investor_id FROM fundraising_contacts"):
people.append(("fundraising_contacts", r["id"], r["full_name"] or "", norm_email(r["email"]),
org_canon_by_fundinv.get(r["investor_id"])))
contact_to_person = {}
person_meta = {} # canonical_id -> {"org": org_canon, "last": last_norm, "name": display, "email": email}
for model, sid, full, email, org_canon in people:
name_norm = norm_text(full)
if email:
key = f"e|{email}"
match_kind, conf, match_value = "exact_email", 1.0, email
elif name_norm:
key = f"n|{name_norm}|{org_canon or ''}"
match_kind, conf, match_value = "name_org", 0.8, name_norm
else:
continue
cid = _redirect(merge_map, _eid("per", key))
display = full.strip() or email
_upsert_entity(conn, cid, "person", display, email)
_link(conn, cid, model, sid, match_value, match_kind, conf)
# Record that this contact (person) belongs to its investor/org entity, so
# one investor can own many contacts (e.g. a family office with several
# people) — and a 1-contact HNWI is just the N=1 case.
if org_canon and cid != org_canon:
conn.execute(""" conn.execute("""
INSERT INTO relationship_edges (id, src_id, dst_id, edge_type, source, strength, directed, INSERT INTO relationship_edges (id, src_id, dst_id, edge_type, source, strength, directed,
first_seen_at, last_seen_at, created_at, updated_at) first_seen_at, last_seen_at, created_at, updated_at)
VALUES (?,?,?, 'member_of', 'entity_resolution', 1.0, 1, ?, ?, ?, ?) VALUES (?,?,?, 'member_of', 'entity_resolution', 1.0, 1, ?, ?, ?, ?)
ON CONFLICT(src_id, dst_id, edge_type, source) ON CONFLICT(src_id, dst_id, edge_type, source)
DO UPDATE SET last_seen_at=excluded.last_seen_at, updated_at=excluded.updated_at DO UPDATE SET last_seen_at=excluded.last_seen_at, updated_at=excluded.updated_at
""", (str(uuid.uuid4()), cid, org_canon, _now(), _now(), _now(), _now())) """, (str(uuid.uuid4()), person_id, investor_id, _now(), _now(), _now(), _now()))
if model == "contacts":
contact_to_person[sid] = cid
meta = person_meta.setdefault(cid, {"org": org_canon, "last": _split_name(full)[1], def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv, merge_map=None):
"name": display, "email": email}) """People come from the CONTACTS table (one person per contact, where the
if org_canon and not meta["org"]: emails/LinkedIn live). The fundraising grid's contacts are NOT a second set of
meta["org"] = org_canon people — each is matched to a contact-person and recorded only as a member_of
edge to its investor entity (the grid's 'Contacts' column says who belongs to
which investor). This is what stops the double-count.
Returns contact_id -> person canonical id (for lp_profiles)."""
merge_map = merge_map or {}
contact_to_person = {}
person_meta = {}
by_email = {} # norm_email -> person cid
by_name_inv = {} # (name_norm, investor_canon) -> person cid
def _person(full, email, inv_canon, model, sid):
name_norm = norm_text(full)
if email:
key, mk, conf, mv = f"e|{email}", "exact_email", 1.0, email
elif name_norm:
key, mk, conf, mv = f"n|{name_norm}|{inv_canon or ''}", "name_org", 0.8, name_norm
else:
return None
cid = _redirect(merge_map, _eid("per", key))
_upsert_entity(conn, cid, "person", full.strip() or email, email)
_link(conn, cid, model, sid, mv, mk, conf)
if email:
by_email[email] = cid
if name_norm:
by_name_inv[(name_norm, inv_canon or "")] = cid
_member_of(conn, cid, inv_canon)
m = person_meta.setdefault(cid, {"org": inv_canon, "last": _split_name(full)[1],
"name": full.strip() or email, "email": email})
if inv_canon and not m["org"]:
m["org"] = inv_canon
return cid
# 1. People = the contacts table.
for r in conn.execute("SELECT id, first_name, last_name, email, organization_id FROM contacts WHERE deleted_at IS NULL"):
full = f"{r['first_name'] or ''} {r['last_name'] or ''}".strip()
cid = _person(full, norm_email(r["email"]), org_canon_by_orgid.get(r["organization_id"]), "contacts", r["id"])
if cid:
contact_to_person[r["id"]] = cid
# 2. Grid contacts are associations, not new people: match to a contact-person
# (by email, else name within the same investor) and just add membership.
# Only create a person when there is genuinely no matching contact.
for r in conn.execute("SELECT id, full_name, email, investor_id FROM fundraising_contacts"):
email = norm_email(r["email"])
name_norm = norm_text(r["full_name"] or "")
inv_canon = org_canon_by_fundinv.get(r["investor_id"])
cid = (by_email.get(email) if email else None) or by_name_inv.get((name_norm, inv_canon or ""))
if cid:
_link(conn, cid, "fundraising_contacts", r["id"], email or name_norm, "grid_assoc", 0.9)
_member_of(conn, cid, inv_canon)
else:
_person(r["full_name"] or "", email, inv_canon, "fundraising_contacts", r["id"])
# lp_profiles -> the person entity of its contact # lp_profiles -> the person entity of its contact
for r in conn.execute("SELECT id, contact_id FROM lp_profiles"): for r in conn.execute("SELECT id, contact_id FROM lp_profiles WHERE deleted_at IS NULL"):
cid = contact_to_person.get(r["contact_id"]) cid = contact_to_person.get(r["contact_id"])
if cid: if cid:
_link(conn, cid, "lp_profiles", r["id"], r["contact_id"], "contact_fk", 1.0) _link(conn, cid, "lp_profiles", r["id"], r["contact_id"], "contact_fk", 1.0)
@@ -256,8 +277,7 @@ def run(db_path: str):
live = "deleted_at IS NULL" live = "deleted_at IS NULL"
counts = { counts = {
"canonical_total": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE {live}").fetchone()[0], "canonical_total": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE {live}").fetchone()[0],
"lp": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp' AND {live}").fetchone()[0], "investor": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='investor' AND {live}").fetchone()[0],
"organization": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization' AND {live}").fetchone()[0],
"person": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0], "person": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0],
"links": conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0], "links": conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0],
"fuzzy_candidates": len(candidates), "fuzzy_candidates": len(candidates),
+1 -2
View File
@@ -3467,8 +3467,7 @@ class CRMHandler(BaseHTTPRequestHandler):
try: try:
live = "deleted_at IS NULL" live = "deleted_at IS NULL"
out['canonical_entities'] = { out['canonical_entities'] = {
'lp': conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp' AND {live}").fetchone()[0], 'investor': conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='investor' AND {live}").fetchone()[0],
'organization': conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization' AND {live}").fetchone()[0],
'person': conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0], 'person': conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0],
} }
out['entity_links'] = conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0] out['entity_links'] = conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0]
+9 -9
View File
@@ -9382,20 +9382,20 @@
<div className="kpi-grid"> <div className="kpi-grid">
<div className="kpi-card"> <div className="kpi-card">
<div className="kpi-label">LPs</div> <div className="kpi-label">Investors</div>
<div className="kpi-value">{entities.lp ?? 0}</div> <div className="kpi-value">{entities.investor ?? 0}</div>
</div> </div>
<div className="kpi-card"> <div className="kpi-card">
<div className="kpi-label">Organizations</div> <div className="kpi-label">People (resolved)</div>
<div className="kpi-value">{entities.organization ?? 0}</div>
</div>
<div className="kpi-card">
<div className="kpi-label">People</div>
<div className="kpi-value">{entities.person ?? 0}</div> <div className="kpi-value">{entities.person ?? 0}</div>
</div> </div>
<div className="kpi-card"> <div className="kpi-card">
<div className="kpi-label">Entity Links</div> <div className="kpi-label">Contacts in CRM</div>
<div className="kpi-value">{data.entity_links ?? 0}</div> <div className="kpi-value">{(data.source_counts || {}).contacts ?? '—'}</div>
</div>
<div className="kpi-card">
<div className="kpi-label">Grid contacts</div>
<div className="kpi-value">{(data.source_counts || {}).fundraising_contacts ?? '—'}</div>
</div> </div>
</div> </div>
+3 -2
View File
@@ -12,8 +12,9 @@ export const PACKAGE_TITLE = 'Ten31 Database'
// * 0.1.0:44 (Phase-0 ingest + MCP server in image; build-index action) // * 0.1.0:44 (Phase-0 ingest + MCP server in image; build-index action)
// * 0.1.0:45 (Phase-1 thesis system; dual approval; merge review; in-app index) // * 0.1.0:45 (Phase-1 thesis system; dual approval; merge review; in-app index)
// * 0.1.0:46 (packaging fix: ship full backend so migrations run + endpoints work) // * 0.1.0:46 (packaging fix: ship full backend so migrations run + endpoints work)
// * Current: 0.1.0:47 (soft-delete instead of hard-delete; source-count diagnostics) // * 0.1.0:47 (soft-delete instead of hard-delete; source-count diagnostics)
export const PACKAGE_VERSION = '0.1.0:47' // * Current: 0.1.0:48 (entity model: investors vs people; fixes double-count)
export const PACKAGE_VERSION = '0.1.0:48'
export const DATA_MOUNT_PATH = '/data' export const DATA_MOUNT_PATH = '/data'
export const WEB_PORT = 8080 export const WEB_PORT = 8080
+3 -2
View File
@@ -8,8 +8,9 @@ import { v_0_1_0_44 } from './v0.1.0.44'
import { v_0_1_0_45 } from './v0.1.0.45' import { v_0_1_0_45 } from './v0.1.0.45'
import { v_0_1_0_46 } from './v0.1.0.46' import { v_0_1_0_46 } from './v0.1.0.46'
import { v_0_1_0_47 } from './v0.1.0.47' import { v_0_1_0_47 } from './v0.1.0.47'
import { v_0_1_0_48 } from './v0.1.0.48'
export const versionGraph = VersionGraph.of({ export const versionGraph = VersionGraph.of({
current: v_0_1_0_47, current: v_0_1_0_48,
other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42, v_0_1_0_43, v_0_1_0_44, v_0_1_0_45, v_0_1_0_46], other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42, v_0_1_0_43, v_0_1_0_44, v_0_1_0_45, v_0_1_0_46, v_0_1_0_47],
}) })
+19
View File
@@ -0,0 +1,19 @@
import { VersionInfo } from '@start9labs/start-sdk'
// Version 0.1.0:48 — entity-model correction.
//   * Investors: one entity per fundraising-grid row, every one labeled
//     "investor" (the confusing organization/lp split is removed).
//   * People: sourced only from the contacts table. The grid's contacts are
//     recorded as membership links to their investor rather than as duplicate
//     people, which fixes the roughly-doubled people count.
// The search index should be rebuilt after upgrading.
export const v_0_1_0_48 = VersionInfo.of({
  version: '0.1.0:48',
  releaseNotes: {
    // Fragments are joined with single spaces into one release-note sentence block.
    en_US: [
      'Fixes the entity counts: investors now come from the fundraising grid and',
      "people come from the contacts table, with the grid's contacts treated as",
      'memberships instead of duplicate people. Run Rebuild search index after',
      'upgrading to refresh the numbers.',
    ].join(' '),
  },
  // Both migration hooks are empty: no data migration runs in either direction.
  migrations: {
    up: async () => {},
    down: async () => {},
  },
})