Entity model: investors (grid) vs people (contacts); fix double-count (0.1.0:48)
Per Grant's clarification of the real data model:
- Investor entities come from the fundraising grid, one per row, all labeled "investor" (drops the confusing lp/organization split). Grid is source of truth.
- People come ONLY from the contacts table. The grid's contacts (fundraising_contacts) are matched to a contact-person and recorded as member_of links to their investor, instead of creating duplicate person entities. This fixes the ~doubled people count (people now ≈ contacts, not contacts + grid contacts).
- System Status cards: Investors / People (resolved) / Contacts in CRM / Grid contacts, so resolved-vs-source is visible at a glance.
Verified on synthetic data: people == contacts count (no double-count); multi-contact investors preserved via member_of.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -143,11 +143,12 @@ def resolve_organizations(conn, merge_map=None):
|
|||||||
|
|
||||||
org_canon_by_orgid, org_canon_by_fundinv = {}, {}
|
org_canon_by_orgid, org_canon_by_fundinv = {}, {}
|
||||||
for key, g in groups.items():
|
for key, g in groups.items():
|
||||||
# An org we are actively raising from (has a fundraising row) is an 'lp';
|
# Every firm group is one INVESTOR entity. The fundraising grid is the
|
||||||
# otherwise a plain 'organization'.
|
# source of truth for investor entities (each row = one investor, whether
|
||||||
kind = "lp" if g["investors"] else "organization"
|
# an institution/family-office or an individual); the organizations table
|
||||||
cid = _redirect(merge_map, _eid("lp" if kind == "lp" else "org", key))
|
# mirrors those names. So we no longer split into lp/organization.
|
||||||
_upsert_entity(conn, cid, kind, g["name"], g["email"])
|
cid = _redirect(merge_map, _eid("inv", key))
|
||||||
|
_upsert_entity(conn, cid, "investor", g["name"], g["email"])
|
||||||
for oid in g["orgs"]:
|
for oid in g["orgs"]:
|
||||||
_link(conn, cid, "organizations", oid, key, "exact_name", 1.0)
|
_link(conn, cid, "organizations", oid, key, "exact_name", 1.0)
|
||||||
org_canon_by_orgid[oid] = cid
|
org_canon_by_orgid[oid] = cid
|
||||||
@@ -158,57 +159,77 @@ def resolve_organizations(conn, merge_map=None):
|
|||||||
return org_canon_by_orgid, org_canon_by_fundinv
|
return org_canon_by_orgid, org_canon_by_fundinv
|
||||||
|
|
||||||
|
|
||||||
def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv, merge_map=None):
|
def _member_of(conn, person_id, investor_id):
|
||||||
"""Merge contacts + fundraising_contacts by exact email, else exact name within
|
"""Record that a person (contact) belongs to an investor entity."""
|
||||||
the same canonical org. Returns contact_id -> person canonical id (for lp_profiles)."""
|
if not investor_id or person_id == investor_id:
|
||||||
merge_map = merge_map or {}
|
return
|
||||||
# gather (model, source_id, full_name, email, org_canon)
|
|
||||||
people = []
|
|
||||||
for r in conn.execute("SELECT id, first_name, last_name, email, organization_id FROM contacts"):
|
|
||||||
full = f"{r['first_name'] or ''} {r['last_name'] or ''}".strip()
|
|
||||||
people.append(("contacts", r["id"], full, norm_email(r["email"]),
|
|
||||||
org_canon_by_orgid.get(r["organization_id"])))
|
|
||||||
for r in conn.execute("SELECT id, full_name, email, investor_id FROM fundraising_contacts"):
|
|
||||||
people.append(("fundraising_contacts", r["id"], r["full_name"] or "", norm_email(r["email"]),
|
|
||||||
org_canon_by_fundinv.get(r["investor_id"])))
|
|
||||||
|
|
||||||
contact_to_person = {}
|
|
||||||
person_meta = {} # canonical_id -> {"org": org_canon, "last": last_norm, "name": display, "email": email}
|
|
||||||
|
|
||||||
for model, sid, full, email, org_canon in people:
|
|
||||||
name_norm = norm_text(full)
|
|
||||||
if email:
|
|
||||||
key = f"e|{email}"
|
|
||||||
match_kind, conf, match_value = "exact_email", 1.0, email
|
|
||||||
elif name_norm:
|
|
||||||
key = f"n|{name_norm}|{org_canon or ''}"
|
|
||||||
match_kind, conf, match_value = "name_org", 0.8, name_norm
|
|
||||||
else:
|
|
||||||
continue
|
|
||||||
cid = _redirect(merge_map, _eid("per", key))
|
|
||||||
display = full.strip() or email
|
|
||||||
_upsert_entity(conn, cid, "person", display, email)
|
|
||||||
_link(conn, cid, model, sid, match_value, match_kind, conf)
|
|
||||||
# Record that this contact (person) belongs to its investor/org entity, so
|
|
||||||
# one investor can own many contacts (e.g. a family office with several
|
|
||||||
# people) — and a 1-contact HNWI is just the N=1 case.
|
|
||||||
if org_canon and cid != org_canon:
|
|
||||||
conn.execute("""
|
conn.execute("""
|
||||||
INSERT INTO relationship_edges (id, src_id, dst_id, edge_type, source, strength, directed,
|
INSERT INTO relationship_edges (id, src_id, dst_id, edge_type, source, strength, directed,
|
||||||
first_seen_at, last_seen_at, created_at, updated_at)
|
first_seen_at, last_seen_at, created_at, updated_at)
|
||||||
VALUES (?,?,?, 'member_of', 'entity_resolution', 1.0, 1, ?, ?, ?, ?)
|
VALUES (?,?,?, 'member_of', 'entity_resolution', 1.0, 1, ?, ?, ?, ?)
|
||||||
ON CONFLICT(src_id, dst_id, edge_type, source)
|
ON CONFLICT(src_id, dst_id, edge_type, source)
|
||||||
DO UPDATE SET last_seen_at=excluded.last_seen_at, updated_at=excluded.updated_at
|
DO UPDATE SET last_seen_at=excluded.last_seen_at, updated_at=excluded.updated_at
|
||||||
""", (str(uuid.uuid4()), cid, org_canon, _now(), _now(), _now(), _now()))
|
""", (str(uuid.uuid4()), person_id, investor_id, _now(), _now(), _now(), _now()))
|
||||||
if model == "contacts":
|
|
||||||
contact_to_person[sid] = cid
|
|
||||||
meta = person_meta.setdefault(cid, {"org": org_canon, "last": _split_name(full)[1],
|
def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv, merge_map=None):
|
||||||
"name": display, "email": email})
|
"""People come from the CONTACTS table (one person per contact, where the
|
||||||
if org_canon and not meta["org"]:
|
emails/LinkedIn live). The fundraising grid's contacts are NOT a second set of
|
||||||
meta["org"] = org_canon
|
people — each is matched to a contact-person and recorded only as a member_of
|
||||||
|
edge to its investor entity (the grid's 'Contacts' column says who belongs to
|
||||||
|
which investor). This is what stops the double-count.
|
||||||
|
Returns contact_id -> person canonical id (for lp_profiles)."""
|
||||||
|
merge_map = merge_map or {}
|
||||||
|
contact_to_person = {}
|
||||||
|
person_meta = {}
|
||||||
|
by_email = {} # norm_email -> person cid
|
||||||
|
by_name_inv = {} # (name_norm, investor_canon) -> person cid
|
||||||
|
|
||||||
|
def _person(full, email, inv_canon, model, sid):
|
||||||
|
name_norm = norm_text(full)
|
||||||
|
if email:
|
||||||
|
key, mk, conf, mv = f"e|{email}", "exact_email", 1.0, email
|
||||||
|
elif name_norm:
|
||||||
|
key, mk, conf, mv = f"n|{name_norm}|{inv_canon or ''}", "name_org", 0.8, name_norm
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
cid = _redirect(merge_map, _eid("per", key))
|
||||||
|
_upsert_entity(conn, cid, "person", full.strip() or email, email)
|
||||||
|
_link(conn, cid, model, sid, mv, mk, conf)
|
||||||
|
if email:
|
||||||
|
by_email[email] = cid
|
||||||
|
if name_norm:
|
||||||
|
by_name_inv[(name_norm, inv_canon or "")] = cid
|
||||||
|
_member_of(conn, cid, inv_canon)
|
||||||
|
m = person_meta.setdefault(cid, {"org": inv_canon, "last": _split_name(full)[1],
|
||||||
|
"name": full.strip() or email, "email": email})
|
||||||
|
if inv_canon and not m["org"]:
|
||||||
|
m["org"] = inv_canon
|
||||||
|
return cid
|
||||||
|
|
||||||
|
# 1. People = the contacts table.
|
||||||
|
for r in conn.execute("SELECT id, first_name, last_name, email, organization_id FROM contacts WHERE deleted_at IS NULL"):
|
||||||
|
full = f"{r['first_name'] or ''} {r['last_name'] or ''}".strip()
|
||||||
|
cid = _person(full, norm_email(r["email"]), org_canon_by_orgid.get(r["organization_id"]), "contacts", r["id"])
|
||||||
|
if cid:
|
||||||
|
contact_to_person[r["id"]] = cid
|
||||||
|
|
||||||
|
# 2. Grid contacts are associations, not new people: match to a contact-person
|
||||||
|
# (by email, else name within the same investor) and just add membership.
|
||||||
|
# Only create a person when there is genuinely no matching contact.
|
||||||
|
for r in conn.execute("SELECT id, full_name, email, investor_id FROM fundraising_contacts"):
|
||||||
|
email = norm_email(r["email"])
|
||||||
|
name_norm = norm_text(r["full_name"] or "")
|
||||||
|
inv_canon = org_canon_by_fundinv.get(r["investor_id"])
|
||||||
|
cid = (by_email.get(email) if email else None) or by_name_inv.get((name_norm, inv_canon or ""))
|
||||||
|
if cid:
|
||||||
|
_link(conn, cid, "fundraising_contacts", r["id"], email or name_norm, "grid_assoc", 0.9)
|
||||||
|
_member_of(conn, cid, inv_canon)
|
||||||
|
else:
|
||||||
|
_person(r["full_name"] or "", email, inv_canon, "fundraising_contacts", r["id"])
|
||||||
|
|
||||||
# lp_profiles -> the person entity of its contact
|
# lp_profiles -> the person entity of its contact
|
||||||
for r in conn.execute("SELECT id, contact_id FROM lp_profiles"):
|
for r in conn.execute("SELECT id, contact_id FROM lp_profiles WHERE deleted_at IS NULL"):
|
||||||
cid = contact_to_person.get(r["contact_id"])
|
cid = contact_to_person.get(r["contact_id"])
|
||||||
if cid:
|
if cid:
|
||||||
_link(conn, cid, "lp_profiles", r["id"], r["contact_id"], "contact_fk", 1.0)
|
_link(conn, cid, "lp_profiles", r["id"], r["contact_id"], "contact_fk", 1.0)
|
||||||
@@ -256,8 +277,7 @@ def run(db_path: str):
|
|||||||
live = "deleted_at IS NULL"
|
live = "deleted_at IS NULL"
|
||||||
counts = {
|
counts = {
|
||||||
"canonical_total": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE {live}").fetchone()[0],
|
"canonical_total": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE {live}").fetchone()[0],
|
||||||
"lp": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp' AND {live}").fetchone()[0],
|
"investor": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='investor' AND {live}").fetchone()[0],
|
||||||
"organization": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization' AND {live}").fetchone()[0],
|
|
||||||
"person": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0],
|
"person": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0],
|
||||||
"links": conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0],
|
"links": conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0],
|
||||||
"fuzzy_candidates": len(candidates),
|
"fuzzy_candidates": len(candidates),
|
||||||
|
|||||||
+1
-2
@@ -3467,8 +3467,7 @@ class CRMHandler(BaseHTTPRequestHandler):
|
|||||||
try:
|
try:
|
||||||
live = "deleted_at IS NULL"
|
live = "deleted_at IS NULL"
|
||||||
out['canonical_entities'] = {
|
out['canonical_entities'] = {
|
||||||
'lp': conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp' AND {live}").fetchone()[0],
|
'investor': conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='investor' AND {live}").fetchone()[0],
|
||||||
'organization': conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization' AND {live}").fetchone()[0],
|
|
||||||
'person': conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0],
|
'person': conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0],
|
||||||
}
|
}
|
||||||
out['entity_links'] = conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0]
|
out['entity_links'] = conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0]
|
||||||
|
|||||||
+9
-9
@@ -9382,20 +9382,20 @@
|
|||||||
|
|
||||||
<div className="kpi-grid">
|
<div className="kpi-grid">
|
||||||
<div className="kpi-card">
|
<div className="kpi-card">
|
||||||
<div className="kpi-label">LPs</div>
|
<div className="kpi-label">Investors</div>
|
||||||
<div className="kpi-value">{entities.lp ?? 0}</div>
|
<div className="kpi-value">{entities.investor ?? 0}</div>
|
||||||
</div>
|
</div>
|
||||||
<div className="kpi-card">
|
<div className="kpi-card">
|
||||||
<div className="kpi-label">Organizations</div>
|
<div className="kpi-label">People (resolved)</div>
|
||||||
<div className="kpi-value">{entities.organization ?? 0}</div>
|
|
||||||
</div>
|
|
||||||
<div className="kpi-card">
|
|
||||||
<div className="kpi-label">People</div>
|
|
||||||
<div className="kpi-value">{entities.person ?? 0}</div>
|
<div className="kpi-value">{entities.person ?? 0}</div>
|
||||||
</div>
|
</div>
|
||||||
<div className="kpi-card">
|
<div className="kpi-card">
|
||||||
<div className="kpi-label">Entity Links</div>
|
<div className="kpi-label">Contacts in CRM</div>
|
||||||
<div className="kpi-value">{data.entity_links ?? 0}</div>
|
<div className="kpi-value">{(data.source_counts || {}).contacts ?? '—'}</div>
|
||||||
|
</div>
|
||||||
|
<div className="kpi-card">
|
||||||
|
<div className="kpi-label">Grid contacts</div>
|
||||||
|
<div className="kpi-value">{(data.source_counts || {}).fundraising_contacts ?? '—'}</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|||||||
@@ -12,8 +12,9 @@ export const PACKAGE_TITLE = 'Ten31 Database'
|
|||||||
// * 0.1.0:44 (Phase-0 ingest + MCP server in image; build-index action)
|
// * 0.1.0:44 (Phase-0 ingest + MCP server in image; build-index action)
|
||||||
// * 0.1.0:45 (Phase-1 thesis system; dual approval; merge review; in-app index)
|
// * 0.1.0:45 (Phase-1 thesis system; dual approval; merge review; in-app index)
|
||||||
// * 0.1.0:46 (packaging fix: ship full backend so migrations run + endpoints work)
|
// * 0.1.0:46 (packaging fix: ship full backend so migrations run + endpoints work)
|
||||||
// * Current: 0.1.0:47 (soft-delete instead of hard-delete; source-count diagnostics)
|
// * 0.1.0:47 (soft-delete instead of hard-delete; source-count diagnostics)
|
||||||
export const PACKAGE_VERSION = '0.1.0:47'
|
// * Current: 0.1.0:48 (entity model: investors vs people; fixes double-count)
|
||||||
|
export const PACKAGE_VERSION = '0.1.0:48'
|
||||||
|
|
||||||
export const DATA_MOUNT_PATH = '/data'
|
export const DATA_MOUNT_PATH = '/data'
|
||||||
export const WEB_PORT = 8080
|
export const WEB_PORT = 8080
|
||||||
|
|||||||
@@ -8,8 +8,9 @@ import { v_0_1_0_44 } from './v0.1.0.44'
|
|||||||
import { v_0_1_0_45 } from './v0.1.0.45'
|
import { v_0_1_0_45 } from './v0.1.0.45'
|
||||||
import { v_0_1_0_46 } from './v0.1.0.46'
|
import { v_0_1_0_46 } from './v0.1.0.46'
|
||||||
import { v_0_1_0_47 } from './v0.1.0.47'
|
import { v_0_1_0_47 } from './v0.1.0.47'
|
||||||
|
import { v_0_1_0_48 } from './v0.1.0.48'
|
||||||
|
|
||||||
export const versionGraph = VersionGraph.of({
|
export const versionGraph = VersionGraph.of({
|
||||||
current: v_0_1_0_47,
|
current: v_0_1_0_48,
|
||||||
other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42, v_0_1_0_43, v_0_1_0_44, v_0_1_0_45, v_0_1_0_46],
|
other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42, v_0_1_0_43, v_0_1_0_44, v_0_1_0_45, v_0_1_0_46, v_0_1_0_47],
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -0,0 +1,19 @@
|
|||||||
|
import { VersionInfo } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
|
// Entity-model fix. Investor entities now come from the fundraising grid (one per
|
||||||
|
// row, all labeled "investor"; the confusing organization/lp split is gone), and
|
||||||
|
// PEOPLE come only from the contacts table. The grid's contacts are recorded as
|
||||||
|
// membership links to their investor rather than as duplicate people, which fixes
|
||||||
|
// the roughly-doubled people count. Rebuild the search index after upgrading.
|
||||||
|
export const v_0_1_0_48 = VersionInfo.of({
|
||||||
|
version: '0.1.0:48',
|
||||||
|
releaseNotes: {
|
||||||
|
en_US: [
|
||||||
|
'Fixes the entity counts: investors now come from the fundraising grid and',
|
||||||
|
'people come from the contacts table, with the grid\'s contacts treated as',
|
||||||
|
'memberships instead of duplicate people. Run Rebuild search index after',
|
||||||
|
'upgrading to refresh the numbers.',
|
||||||
|
].join(' '),
|
||||||
|
},
|
||||||
|
migrations: { up: async () => {}, down: async () => {} },
|
||||||
|
})
|
||||||
Reference in New Issue
Block a user