Phase 0 complete: fuzzy entity tier, incremental sync, Start9 packaging
- Fuzzy tier (backend/ingest/fuzzy_resolve.py + llm.py): local Qwen adjudicates the deterministic resolver's flagged name-variant candidates; merges are durable via entity_merges (deterministic re-runs respect them), losers soft-deleted, logged. Idempotent.
- Incremental sync (backend/ingest/sync.py): re-embeds only rows changed since a watermark (ingest_sync_state); first run / --recreate = full. Tested full→0→1.
- Start9 packaging (start9/0.4): Dockerfile bundles ingest+mcp + fastembed/mcp; "Build search index" action runs the init in a subcontainer; MCP shipped as a manual stdio server (not a daemon); version 0.1.0:44. INGEST_PACKAGING.md.
- backfill.py: factored embed_and_upsert() shared with sync.

Verified end-to-end on synthetic data + live Sparks/Qwen/Qdrant.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -69,6 +69,16 @@ def _split_name(full: str):
|
||||
return parts[0], parts[-1] if len(parts) > 1 else ""
|
||||
|
||||
|
||||
def _redirect(merge_map, eid):
|
||||
"""Follow durable fuzzy-merge redirects (entity_merges) so deterministic
|
||||
re-runs respect prior merges instead of recreating the merged-away entity."""
|
||||
seen = set()
|
||||
while eid in merge_map and eid not in seen:
|
||||
seen.add(eid)
|
||||
eid = merge_map[eid]
|
||||
return eid
|
||||
|
||||
|
||||
# ── upsert helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
def _upsert_entity(conn, eid, kind, display_name, primary_email):
|
||||
@@ -102,12 +112,13 @@ def _link(conn, canonical_id, source_model, source_id, match_value, match_kind,
|
||||
|
||||
# ── resolution passes ─────────────────────────────────────────────────────────
|
||||
|
||||
def resolve_organizations(conn):
|
||||
def resolve_organizations(conn, merge_map=None):
|
||||
"""Merge organizations + fundraising_investors by normalized name.
|
||||
|
||||
Returns (org_canon_by_orgid, org_canon_by_fundinv) so the people pass can
|
||||
attach each person to their firm's canonical id.
|
||||
"""
|
||||
merge_map = merge_map or {}
|
||||
groups = defaultdict(lambda: {"orgs": [], "investors": [], "name": "", "email": ""})
|
||||
|
||||
for r in conn.execute("SELECT id, name, email FROM organizations"):
|
||||
@@ -135,7 +146,7 @@ def resolve_organizations(conn):
|
||||
# An org we are actively raising from (has a fundraising row) is an 'lp';
|
||||
# otherwise a plain 'organization'.
|
||||
kind = "lp" if g["investors"] else "organization"
|
||||
cid = _eid("lp" if kind == "lp" else "org", key)
|
||||
cid = _redirect(merge_map, _eid("lp" if kind == "lp" else "org", key))
|
||||
_upsert_entity(conn, cid, kind, g["name"], g["email"])
|
||||
for oid in g["orgs"]:
|
||||
_link(conn, cid, "organizations", oid, key, "exact_name", 1.0)
|
||||
@@ -147,9 +158,10 @@ def resolve_organizations(conn):
|
||||
return org_canon_by_orgid, org_canon_by_fundinv
|
||||
|
||||
|
||||
def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv):
|
||||
def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv, merge_map=None):
|
||||
"""Merge contacts + fundraising_contacts by exact email, else exact name within
|
||||
the same canonical org. Returns contact_id -> person canonical id (for lp_profiles)."""
|
||||
merge_map = merge_map or {}
|
||||
# gather (model, source_id, full_name, email, org_canon)
|
||||
people = []
|
||||
for r in conn.execute("SELECT id, first_name, last_name, email, organization_id FROM contacts"):
|
||||
@@ -173,7 +185,7 @@ def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv):
|
||||
match_kind, conf, match_value = "name_org", 0.8, name_norm
|
||||
else:
|
||||
continue
|
||||
cid = _eid("per", key)
|
||||
cid = _redirect(merge_map, _eid("per", key))
|
||||
display = full.strip() or email
|
||||
_upsert_entity(conn, cid, "person", display, email)
|
||||
_link(conn, cid, model, sid, match_value, match_kind, conf)
|
||||
@@ -210,17 +222,32 @@ def run(db_path: str):
|
||||
conn.row_factory = sqlite3.Row
|
||||
conn.execute("PRAGMA foreign_keys=ON")
|
||||
|
||||
org_by_oid, org_by_inv = resolve_organizations(conn)
|
||||
# Durable fuzzy-merge map (entity_merges) so deterministic re-runs respect
|
||||
# prior local-Qwen merges instead of recreating merged-away entities.
|
||||
conn.execute("""CREATE TABLE IF NOT EXISTS entity_merges (
|
||||
merged_id TEXT PRIMARY KEY,
|
||||
survivor_id TEXT NOT NULL,
|
||||
confidence REAL,
|
||||
reason TEXT,
|
||||
created_at TEXT DEFAULT (datetime('now'))
|
||||
)""")
|
||||
merge_map = {r["merged_id"]: r["survivor_id"]
|
||||
for r in conn.execute("SELECT merged_id, survivor_id FROM entity_merges")}
|
||||
|
||||
org_by_oid, org_by_inv = resolve_organizations(conn, merge_map)
|
||||
conn.commit()
|
||||
person_meta = resolve_people(conn, org_by_oid, org_by_inv)
|
||||
person_meta = resolve_people(conn, org_by_oid, org_by_inv, merge_map)
|
||||
conn.commit()
|
||||
candidates = find_fuzzy_candidates(person_meta)
|
||||
|
||||
# Counts report LIVE entities (deleted_at IS NULL); fuzzy-merged losers are
|
||||
# soft-deleted tombstones (guardrail #3) and excluded.
|
||||
live = "deleted_at IS NULL"
|
||||
counts = {
|
||||
"canonical_total": conn.execute("SELECT COUNT(*) FROM canonical_entities").fetchone()[0],
|
||||
"lp": conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp'").fetchone()[0],
|
||||
"organization": conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization'").fetchone()[0],
|
||||
"person": conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person'").fetchone()[0],
|
||||
"canonical_total": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE {live}").fetchone()[0],
|
||||
"lp": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp' AND {live}").fetchone()[0],
|
||||
"organization": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization' AND {live}").fetchone()[0],
|
||||
"person": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0],
|
||||
"links": conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0],
|
||||
"fuzzy_candidates": len(candidates),
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user