Phase 0 foundation: canonical schema, ingest pipeline, CRM MCP server
Workstream A–C substrate for the Ten31 agentic system:
- A1: docs/crm-overview.md; CLAUDE.md conventions + guardrail #9
- A2: additive/reversible core migration (canonical_entities, entity_links, interaction_log, relationship_edges, soft-delete) + ledgered runner
- B1/B3: chunking + deterministic entity resolution (backend/ingest)
- B2: dense (bge-m3) + BM25 sparse ingest to Qdrant crm_chunks
- C: CRM MCP server (reads, retrieval modes, logged writes) — no outbound tools
- docs: redaction/re-hydration, Gmail enablement runbook
- synthetic test data; .env.example; housekeeping (.gitignore, untrack crm.db, drop legacy files + start9/0.3.5)

Verified end-to-end on synthetic data + live Sparks (hybrid > dense on entity queries). Real backfill runs on Ten31 infra; index holds synthetic data only. Branch snapshot also captures pre-existing working-tree changes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
"""Ten31 Phase-0 ingest pipeline (entity resolution, chunking, embed, Qdrant upsert).
|
||||
|
||||
All modules are local-only and read the CRM by SQLite file path (CRM is canonical;
|
||||
the canonical/vector layers are derived). No real data is sent to Claude here.
|
||||
"""
|
||||
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Phase-0 Workstream B — backfill the CRM into Qdrant.
|
||||
|
||||
Chunk -> dense (bge-m3 via Spark Control) + sparse (BM25 client-side) -> upsert
|
||||
to Qdrant `crm_chunks` with payload. Idempotent: deterministic point ids mean
|
||||
re-running upserts in place. Reads the CRM by file path; never sends data to Claude.
|
||||
|
||||
python3 backend/ingest/backfill.py --db data/crm_dev.db --recreate
|
||||
"""
|
||||
import argparse
|
||||
import sqlite3
|
||||
|
||||
import chunking
|
||||
import config
|
||||
import embed
|
||||
import qdrant_io
|
||||
import sparse
|
||||
|
||||
|
||||
def run(db, recreate=False, batch=32):
    """Chunk the CRM at ``db``, embed the chunks and upsert them into Qdrant.

    Args:
        db: Path of the CRM SQLite file to read.
        recreate: Forwarded to ``qdrant_io.create_collection``.
        batch: Number of chunks embedded and upserted per round trip.

    Raises:
        ValueError: If ``batch`` is smaller than 1.
        RuntimeError: If the embedder returns a different number of vectors
            than the number of texts sent for a batch.
    """
    if batch < 1:
        # A negative step would make range() empty and silently upsert nothing.
        raise ValueError(f"batch must be >= 1, got {batch}")

    conn = sqlite3.connect(db)
    try:
        conn.row_factory = sqlite3.Row
        chunks = chunking.build_chunks(conn)
    finally:
        # Release the SQLite handle even when chunking fails.
        conn.close()
    print(f"Built {len(chunks)} chunks from {db}")

    state = qdrant_io.create_collection(recreate=recreate)
    qdrant_io.ensure_indexes()
    print(f"Collection '{config.COLLECTION}': {state}")

    # Chunk fields copied verbatim into the Qdrant payload.
    payload_fields = (
        "lp_id", "lp_name", "person_id",
        "doc_type", "date_ts", "text",
        "source_model", "source_id", "chunk_key",
    )

    total = 0
    for start in range(0, len(chunks), batch):
        group = chunks[start:start + batch]
        dense = embed.dense_embed([c["text"] for c in group])
        if len(dense) != len(group):
            # zip() would otherwise truncate and drop chunks without notice.
            raise RuntimeError(
                f"embedder returned {len(dense)} vectors for {len(group)} chunks")
        points = []
        for c, dv in zip(group, dense):
            sv = sparse.encode(c["text"])
            points.append({
                "id": c["point_id"],
                "vector": {"dense": dv, "sparse": {"indices": sv["indices"], "values": sv["values"]}},
                "payload": {field: c[field] for field in payload_fields},
            })
        qdrant_io.upsert(points)
        total += len(points)
        print(f" upserted {total}/{len(chunks)}")

    print(f"Done. Qdrant '{config.COLLECTION}' now holds {qdrant_io.count()} points.")
|
||||
|
||||
|
||||
def main():
    """Parse the command line and run the backfill."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--db", default=config.DEFAULT_DB)
    parser.add_argument("--recreate", action="store_true", help="drop & recreate the collection first")
    parser.add_argument("--batch", type=int, default=32)
    opts = parser.parse_args()
    run(opts.db, recreate=opts.recreate, batch=opts.batch)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,184 @@
|
||||
"""Phase-0 Workstream B1 — chunk the CRM for retrieval.
|
||||
|
||||
Maps each CRM record type to one or more chunks per docs/EMBEDDINGS.md:
|
||||
* one chunk per communications row (doc_type = the comm type)
|
||||
* one chunk per MATCHED email (doc_type = email; body only when matched)
|
||||
* one chunk per fundraising_investors notes LINE (the outreach log; split per line)
|
||||
* one chunk each for free-text fields: contacts.notes, lp_profiles.notes,
|
||||
opportunities (description + next_step), organizations.description
|
||||
|
||||
Each chunk carries a canonical `lp_id` (resolved via entity_links) and a `date_ts`
|
||||
(epoch of the EVENT time, not created_at) so Qdrant can pre-filter and recency-rank.
|
||||
Entities/names/dates/types are payload (filterable); only prose is embedded.
|
||||
|
||||
A chunk's stable `chunk_key` -> deterministic point id (uuid5), so re-ingest
|
||||
upserts in place (idempotent).
|
||||
"""
|
||||
import sqlite3
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# Fixed namespace for uuid5 point ids; this value is the same UUID as the
# standard library's uuid.NAMESPACE_URL. Changing it would change every point id.
_NS = uuid.UUID("6ba7b811-9dad-11d1-80b4-00c04fd430c8")  # uuid5 namespace for chunk ids
|
||||
|
||||
|
||||
def to_epoch(ts: str):
    """Convert a CRM timestamp string to integer epoch seconds.

    ISO-8601 date-times are accepted (a ``Z`` suffix is rewritten to
    ``+00:00``). If the whole value does not parse, the first ten characters
    of the stripped value are tried as ``YYYY-MM-DD``. Values without a UTC
    offset are interpreted as UTC.

    Returns:
        int epoch seconds, or ``None`` for empty or unparseable input.
    """
    if not ts:
        return None
    s = ts.strip().replace("Z", "+00:00")
    try:
        dt = datetime.fromisoformat(s)
    except ValueError:
        # Date-only fallback. Slice the *stripped* value so leading
        # whitespace cannot push the date out of the ten-character window.
        try:
            dt = datetime.strptime(s[:10], "%Y-%m-%d")
        except ValueError:
            return None
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return int(dt.timestamp())
|
||||
|
||||
|
||||
def _point_id(chunk_key: str) -> str:
    """Return the deterministic uuid5 (under ``_NS``) of ``chunk_key`` as a string."""
    point_uuid = uuid.uuid5(_NS, chunk_key)
    return str(point_uuid)
|
||||
|
||||
|
||||
def _mk(chunk_key, lp_id, lp_name, person_id, doc_type, date_ts, text, source_model, source_id):
    """Assemble one chunk dict, or return ``None`` when it should be skipped.

    A chunk is skipped when its text is empty after stripping or when no
    ``lp_id`` was resolved. The point id is derived from ``chunk_key``.
    """
    body = (text or "").strip()
    if not (body and lp_id):
        return None
    return dict(
        chunk_key=chunk_key,
        point_id=_point_id(chunk_key),
        lp_id=lp_id,
        lp_name=lp_name,
        person_id=person_id,
        doc_type=doc_type,
        date_ts=date_ts,
        text=body,
        source_model=source_model,
        source_id=source_id,
    )
|
||||
|
||||
|
||||
def _canon_maps(conn):
|
||||
"""Resolution lookups from entity_links / canonical_entities."""
|
||||
person_canon, org_canon, inv_canon = {}, {}, {}
|
||||
for r in conn.execute("SELECT source_model, source_id, canonical_id FROM entity_links"):
|
||||
if r["source_model"] == "contacts":
|
||||
person_canon[r["source_id"]] = r["canonical_id"]
|
||||
elif r["source_model"] == "organizations":
|
||||
org_canon[r["source_id"]] = r["canonical_id"]
|
||||
elif r["source_model"] == "fundraising_investors":
|
||||
inv_canon[r["source_id"]] = r["canonical_id"]
|
||||
name = {r["id"]: r["display_name"] for r in conn.execute("SELECT id, display_name FROM canonical_entities")}
|
||||
contact_org = {r["id"]: r["organization_id"] for r in conn.execute("SELECT id, organization_id FROM contacts")}
|
||||
return person_canon, org_canon, inv_canon, name, contact_org
|
||||
|
||||
|
||||
def _contact_lp(cid, person_canon, org_canon, name, contact_org):
|
||||
"""Best lp_id for a contact-anchored chunk: the firm if known, else the person."""
|
||||
person = person_canon.get(cid)
|
||||
firm = org_canon.get(contact_org.get(cid))
|
||||
lp = firm or person
|
||||
return lp, name.get(lp), person
|
||||
|
||||
|
||||
def build_chunks(conn):
    """Build every retrieval chunk from the CRM tables reachable through ``conn``.

    Rows are read by column name, so ``conn`` must use a row factory that
    supports that (callers in this file set ``sqlite3.Row``). Each source is
    visited in turn and converted with ``_mk``; candidates for which ``_mk``
    returns ``None`` (empty text or unresolved lp) are dropped at the end.

    Returns:
        list of chunk dicts, in the order the sources are visited.
    """
    person_canon, org_canon, inv_canon, name, contact_org = _canon_maps(conn)
    chunks = []

    # communications
    for r in conn.execute("""SELECT id, contact_id, type, subject, body, outcome, next_action, communication_date
                             FROM communications"""):
        lp, lp_name, person = _contact_lp(r["contact_id"], person_canon, org_canon, name, contact_org)
        # Only non-blank fields contribute to the embedded text.
        parts = [p for p in (r["subject"], r["body"], r["outcome"], r["next_action"]) if (p or "").strip()]
        chunks.append(_mk(f"communications:{r['id']}", lp, lp_name, person,
                          r["type"] or "note", to_epoch(r["communication_date"]),
                          "\n".join(parts), "communications", r["id"]))

    # contacts.notes
    for r in conn.execute("SELECT id, notes, updated_at FROM contacts WHERE notes IS NOT NULL AND notes <> ''"):
        lp, lp_name, person = _contact_lp(r["id"], person_canon, org_canon, name, contact_org)
        chunks.append(_mk(f"contacts.notes:{r['id']}", lp, lp_name, person,
                          "contact_note", to_epoch(r["updated_at"]), r["notes"], "contacts", r["id"]))

    # lp_profiles.notes
    for r in conn.execute("""SELECT lp.id, lp.contact_id, lp.notes, lp.updated_at
                             FROM lp_profiles lp WHERE lp.notes IS NOT NULL AND lp.notes <> ''"""):
        lp, lp_name, person = _contact_lp(r["contact_id"], person_canon, org_canon, name, contact_org)
        chunks.append(_mk(f"lp_profiles.notes:{r['id']}", lp, lp_name, person,
                          "lp_note", to_epoch(r["updated_at"]), r["notes"], "lp_profiles", r["id"]))

    # opportunities (description + next_step)
    for r in conn.execute("""SELECT id, contact_id, name, description, next_step, updated_at
                             FROM opportunities"""):
        lp, lp_name, person = _contact_lp(r["contact_id"], person_canon, org_canon, name, contact_org)
        # The opportunity name is embedded together with description and next_step.
        parts = [p for p in (r["name"], r["description"], r["next_step"]) if (p or "").strip()]
        chunks.append(_mk(f"opportunities:{r['id']}", lp, lp_name, person,
                          "opportunity", to_epoch(r["updated_at"]), "\n".join(parts), "opportunities", r["id"]))

    # organizations.description
    for r in conn.execute("""SELECT id, description, updated_at FROM organizations
                             WHERE description IS NOT NULL AND description <> ''"""):
        lp = org_canon.get(r["id"])
        chunks.append(_mk(f"organizations.description:{r['id']}", lp, name.get(lp), None,
                          "org_note", to_epoch(r["updated_at"]), r["description"], "organizations", r["id"]))

    # fundraising_investors.notes — running outreach log, split per non-empty line
    for r in conn.execute("""SELECT id, notes, updated_at FROM fundraising_investors
                             WHERE notes IS NOT NULL AND notes <> ''"""):
        lp = inv_canon.get(r["id"])
        # The chunk key uses the raw line index (blank lines counted), and every
        # line of a row shares that row's updated_at as its date.
        # NOTE(review): inserting a line mid-log shifts the keys of later lines,
        # so their point ids change on re-ingest — confirm this is acceptable.
        for i, line in enumerate(str(r["notes"]).splitlines()):
            if line.strip():
                chunks.append(_mk(f"fundraising_investors.notes:{r['id']}:{i}", lp, name.get(lp), None,
                                  "outreach_note", to_epoch(r["updated_at"]), line, "fundraising_investors", r["id"]))

    # MATCHED emails (only matched rows carry a body; key lp via email_investor_links)
    if _has_table(conn, "emails") and _has_table(conn, "email_investor_links"):
        for r in conn.execute("""SELECT id, subject, body_text, snippet, sent_at FROM emails WHERE is_matched=1"""):
            # One extra lookup query per email to find its best-confidence link.
            lp, lp_name = _email_lp(conn, r["id"], inv_canon, org_canon, person_canon, name)
            # snippet is used only when body_text is empty.
            text = "\n".join(p for p in (r["subject"], r["body_text"] or r["snippet"]) if (p or "").strip())
            chunks.append(_mk(f"emails:{r['id']}", lp, lp_name, None, "email",
                              to_epoch(r["sent_at"]), text, "emails", r["id"]))

    # Drop the None placeholders produced by _mk for skipped candidates.
    return [c for c in chunks if c]
|
||||
|
||||
|
||||
def _has_table(conn, name):
|
||||
return conn.execute("SELECT 1 FROM sqlite_master WHERE type='table' AND name=?", (name,)).fetchone() is not None
|
||||
|
||||
|
||||
def _email_lp(conn, email_id, inv_canon, org_canon, person_canon, name):
|
||||
"""Resolve a matched email's lp_id via email_investor_links, precedence:
|
||||
fundraising_investor -> contact -> organization."""
|
||||
row = conn.execute("""SELECT fundraising_investor_id, contact_id, organization_id
|
||||
FROM email_investor_links WHERE email_id=? ORDER BY match_confidence DESC LIMIT 1""",
|
||||
(email_id,)).fetchone()
|
||||
if not row:
|
||||
return None, None
|
||||
lp = (inv_canon.get(row["fundraising_investor_id"]) or person_canon.get(row["contact_id"])
|
||||
or org_canon.get(row["organization_id"]))
|
||||
return lp, name.get(lp)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Diagnostic entry point: build the chunks and print a per-doc_type summary.
    import argparse
    from collections import Counter
    from config import DEFAULT_DB

    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=DEFAULT_DB)
    args = ap.parse_args()
    conn = sqlite3.connect(args.db)
    try:
        conn.row_factory = sqlite3.Row
        chunks = build_chunks(conn)
    finally:
        # Close the handle whether or not chunking succeeded.
        conn.close()
    print(f"{len(chunks)} chunks from {args.db}")
    for dt, n in Counter(c["doc_type"] for c in chunks).most_common():
        print(f" {dt:<16} {n}")
    unresolved = sum(1 for c in chunks if not c["lp_id"])
    print(f" (all chunks have an lp_id: {unresolved == 0})")
    if chunks:
        # Guarded: an empty or unresolved CRM yields no chunks to sample.
        print("\nSample chunk:")
        s = chunks[0]
        print({k: (v[:80] + '…' if k == 'text' and v and len(v) > 80 else v) for k, v in s.items()})
|
||||
@@ -0,0 +1,28 @@
|
||||
"""Ingest config — loads .env and exposes the Spark/Qdrant/CRM settings."""
|
||||
import os
|
||||
|
||||
# Directory three levels above this file; load_env() and DEFAULT_DB resolve
# the default ".env" and "data/crm_dev.db" paths against it.
_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
|
||||
def load_env(path=None):
    """Load ``KEY=VALUE`` lines from a .env file into ``os.environ``.

    Args:
        path: File to read; defaults to ``.env`` under ``_ROOT``.

    Blank lines, ``#`` comments, lines without ``=`` and lines with an empty
    key are skipped. One pair of matching surrounding quotes is removed from
    the value. Variables already present in the environment are not
    overwritten. A missing file is not an error.
    """
    path = path or os.path.join(_ROOT, ".env")
    if not os.path.exists(path):
        return
    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            line = raw.strip()
            if not line or line.startswith("#"):
                continue
            key, sep, value = line.partition("=")
            key, value = key.strip(), value.strip()
            if not sep or not key:
                # An empty variable name cannot be set in the environment.
                continue
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                # KEY="value" / KEY='value' -> value
                value = value[1:-1]
            os.environ.setdefault(key, value)
|
||||
|
||||
|
||||
# Import-time side effect: read the default .env so the settings below can see it.
load_env()

# Spark Control base URL with trailing slashes removed ("" when unset).
SPARK_CONTROL_URL = os.environ.get("SPARK_CONTROL_URL", "").rstrip("/")
# TLS verification for Spark Control is OFF unless the variable is 1/true/yes/on.
SPARK_VERIFY_TLS = os.environ.get("SPARK_CONTROL_VERIFY_TLS", "false").lower() in ("1", "true", "yes", "on")
# Qdrant base URL with trailing slashes removed ("" when unset).
QDRANT_URL = os.environ.get("QDRANT_URL", "").rstrip("/")
# Name of the Qdrant collection that holds the CRM chunks.
COLLECTION = os.environ.get("CRM_QDRANT_COLLECTION", "crm_chunks")
# Model name sent with embedding requests.
EMBED_MODEL = os.environ.get("CRM_EMBED_MODEL", "BAAI/bge-m3")
# Dense vector size; int() raises ValueError at import if the variable is not numeric.
DENSE_DIM = int(os.environ.get("CRM_EMBED_DIM", "1024"))
# Default CRM SQLite path used by the command-line entry points.
DEFAULT_DB = os.environ.get("CRM_DEV_DB_PATH", os.path.join(_ROOT, "data", "crm_dev.db"))
|
||||
@@ -0,0 +1,17 @@
|
||||
"""Dense embeddings via Spark Control /v1/embeddings (BAAI/bge-m3, 1024-d)."""
|
||||
import config
|
||||
import http_util
|
||||
|
||||
|
||||
def dense_embed(texts, batch=32):
    """Embed ``texts`` via Spark Control ``/v1/embeddings``.

    Args:
        texts: Sequence of strings to embed.
        batch: Number of texts sent per request.

    Returns:
        list of embeddings, one per input text, in input order.

    Raises:
        ValueError: If ``batch`` is smaller than 1.
        RuntimeError: On a non-200 response, or when a response does not
            contain exactly one embedding per text sent.
    """
    if batch < 1:
        raise ValueError(f"batch must be >= 1, got {batch}")

    out = []
    for i in range(0, len(texts), batch):
        group = texts[i:i + batch]
        status, data = http_util.request(
            "POST", f"{config.SPARK_CONTROL_URL}/v1/embeddings",
            {"input": group, "model": config.EMBED_MODEL}, verify=config.SPARK_VERIFY_TLS)
        if status != 200:
            raise RuntimeError(f"/v1/embeddings -> {status}: {data}")
        # The response rows carry an "index"; sort so output follows input order.
        rows = sorted((data or {}).get("data") or [], key=lambda d: d["index"])
        if len(rows) != len(group):
            # A short response would silently pair texts with the wrong vectors downstream.
            raise RuntimeError(
                f"/v1/embeddings returned {len(rows)} embeddings for {len(group)} inputs")
        out.extend(r["embedding"] for r in rows)
    return out
|
||||
@@ -0,0 +1,258 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Phase-0 Workstream B3 / A4 — entity resolution (deterministic tier).
|
||||
|
||||
Collapses the CRM's two parallel investor models into the canonical identity
|
||||
layer created by migration 0001:
|
||||
|
||||
organizations ─┐
|
||||
fundraising_investors ─┴─► canonical_entities (entity_kind = lp | organization)
|
||||
contacts ─┐
|
||||
fundraising_contacts ─┴─► canonical_entities (entity_kind = person)
|
||||
lp_profiles ───► linked to its contact's person entity
|
||||
|
||||
Every source row is recorded in `entity_links` so any name variant resolves to
|
||||
one canonical id. This is the DETERMINISTIC tier — it merges only what we can
|
||||
prove (exact email; exact normalized name within the same canonical org). The
|
||||
HARD cases (nicknames like "Jon" vs "Jonathan", typos) are NOT guessed; they are
|
||||
emitted as *fuzzy candidates* for the local-Qwen tier (Spark Control
|
||||
/v1/chat/completions) to adjudicate later. Honest separation: we never silently
|
||||
merge on a guess.
|
||||
|
||||
Properties:
|
||||
* Local-only, read-mostly: reads CRM source tables, writes only the derived
|
||||
canonical_entities / entity_links and an interaction_log audit row. Never
|
||||
mutates a CRM source record (guardrail #2/#3).
|
||||
* Idempotent: canonical ids are deterministic (sha1 of the resolution key), so
|
||||
re-running upserts in place and keeps ids stable across runs — which keeps
|
||||
downstream Qdrant point ids valid (no churn on re-embed).
|
||||
* Logged: writes one interaction_log row per run (guardrail #5).
|
||||
|
||||
Usage:
|
||||
python3 backend/ingest/entity_resolution.py --db data/crm_dev.db
|
||||
python3 backend/ingest/entity_resolution.py --db data/crm_dev.db --show-candidates
|
||||
"""
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
# ── normalization ─────────────────────────────────────────────────────────────
|
||||
|
||||
def norm_text(s: str) -> str:
    """Lower-case ``s``, replace punctuation with spaces and collapse whitespace.

    ``None`` and empty input normalize to the empty string.
    """
    lowered = (s or "").strip().lower()
    without_punct = re.sub(r"[^\w\s]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", without_punct)
    return collapsed.strip()
|
||||
|
||||
|
||||
def norm_email(s: str) -> str:
    """Return ``s`` stripped and lower-cased; falsy input yields ``""``."""
    if not s:
        return ""
    return s.strip().lower()
|
||||
|
||||
|
||||
def _eid(prefix: str, key: str) -> str:
|
||||
"""Deterministic canonical id: stable across runs for the same resolution key."""
|
||||
return f"{prefix}_{hashlib.sha1(key.encode('utf-8')).hexdigest()[:12]}"
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _split_name(full: str):
    """Return ``(first, last)`` tokens of the normalized name.

    ``last`` is ``""`` for a single-token name; both are ``""`` for an empty name.
    """
    tokens = norm_text(full).split()
    if not tokens:
        return "", ""
    first = tokens[0]
    last = tokens[-1] if len(tokens) > 1 else ""
    return first, last
|
||||
|
||||
|
||||
# ── upsert helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
def _upsert_entity(conn, eid, kind, display_name, primary_email):
    """Insert a canonical entity, or refresh it when ``eid`` already exists.

    On conflict the display name, kind and ``updated_at`` are overwritten; the
    stored primary email is kept when the new one is empty. ``created_at`` is
    only written on first insert.
    """
    # One clock read, so a freshly inserted row has created_at == updated_at.
    stamp = _now()
    conn.execute(
        """
        INSERT INTO canonical_entities (id, entity_kind, display_name, primary_email, source, created_at, updated_at)
        VALUES (?, ?, ?, ?, 'entity_resolution', ?, ?)
        ON CONFLICT(id) DO UPDATE SET
            display_name = excluded.display_name,
            primary_email = COALESCE(excluded.primary_email, canonical_entities.primary_email),
            entity_kind = excluded.entity_kind,
            updated_at = excluded.updated_at
        """,
        # Empty email is stored as NULL so COALESCE can keep an existing value.
        (eid, kind, display_name, primary_email or None, stamp, stamp),
    )
|
||||
|
||||
|
||||
def _link(conn, canonical_id, source_model, source_id, match_value, match_kind, confidence):
    """Record that a source row resolves to ``canonical_id``.

    A link is unique per (source_model, source_id, match_value); an existing
    link gets its canonical id, match kind and confidence overwritten, while
    its id and created_at stay as first written.
    """
    values = (
        str(uuid.uuid4()),  # only used when the row is actually inserted
        canonical_id,
        source_model,
        source_id,
        match_value,
        match_kind,
        confidence,
        _now(),
    )
    statement = """
        INSERT INTO entity_links (id, canonical_id, source_model, source_id, match_value, match_kind, confidence, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(source_model, source_id, match_value) DO UPDATE SET
            canonical_id = excluded.canonical_id,
            match_kind = excluded.match_kind,
            confidence = excluded.confidence
        """
    conn.execute(statement, values)
|
||||
|
||||
|
||||
# ── resolution passes ─────────────────────────────────────────────────────────
|
||||
|
||||
def resolve_organizations(conn):
    """Merge organizations + fundraising_investors by normalized name.

    Rows whose normalized name is empty are skipped. Each remaining group
    becomes one canonical entity, and every member row gets an entity link.

    Returns (org_canon_by_orgid, org_canon_by_fundinv) so the people pass can
    attach each person to their firm's canonical id.
    """
    # One group per normalized name: member ids plus the display name/email chosen so far.
    groups = defaultdict(lambda: {"orgs": [], "investors": [], "name": "", "email": ""})

    for r in conn.execute("SELECT id, name, email FROM organizations"):
        key = norm_text(r["name"])
        if not key:
            continue
        g = groups[key]
        g["orgs"].append(r["id"])
        # Keep the longest raw spelling as the display name (first one wins ties).
        if len(r["name"] or "") > len(g["name"]):
            g["name"] = r["name"]
        # First non-blank email seen for the group wins.
        if not g["email"] and (r["email"] or "").strip():
            g["email"] = r["email"].strip()

    for r in conn.execute("SELECT id, investor_name FROM fundraising_investors"):
        key = norm_text(r["investor_name"])
        if not key:
            continue
        g = groups[key]
        g["investors"].append(r["id"])
        # The investor name is only used when no organization row supplied one.
        if not g["name"]:
            g["name"] = r["investor_name"]

    org_canon_by_orgid, org_canon_by_fundinv = {}, {}
    for key, g in groups.items():
        # An org we are actively raising from (has a fundraising row) is an 'lp';
        # otherwise a plain 'organization'.
        kind = "lp" if g["investors"] else "organization"
        # NOTE(review): the id prefix follows the kind, so a name that gains its
        # first fundraising row gets a different canonical id on the next run —
        # confirm this is intended given the stable-id goal.
        cid = _eid("lp" if kind == "lp" else "org", key)
        _upsert_entity(conn, cid, kind, g["name"], g["email"])
        for oid in g["orgs"]:
            _link(conn, cid, "organizations", oid, key, "exact_name", 1.0)
            org_canon_by_orgid[oid] = cid
        for iid in g["investors"]:
            _link(conn, cid, "fundraising_investors", iid, key, "exact_name", 1.0)
            org_canon_by_fundinv[iid] = cid

    return org_canon_by_orgid, org_canon_by_fundinv
|
||||
|
||||
|
||||
def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv):
    """Merge contacts + fundraising_contacts by exact email, else exact name within
    the same canonical org.

    Rows with neither an email nor a name are skipped. lp_profiles rows are
    linked to the person entity of their contact.

    Returns:
        person_meta: canonical person id -> {"org", "last", "name", "email"},
        the input expected by ``find_fuzzy_candidates``.
    """
    # gather (model, source_id, full_name, email, org_canon)
    people = []
    for r in conn.execute("SELECT id, first_name, last_name, email, organization_id FROM contacts"):
        full = f"{r['first_name'] or ''} {r['last_name'] or ''}".strip()
        people.append(("contacts", r["id"], full, norm_email(r["email"]),
                       org_canon_by_orgid.get(r["organization_id"])))
    for r in conn.execute("SELECT id, full_name, email, investor_id FROM fundraising_contacts"):
        people.append(("fundraising_contacts", r["id"], r["full_name"] or "", norm_email(r["email"]),
                       org_canon_by_fundinv.get(r["investor_id"])))

    contact_to_person = {}  # contacts.id -> canonical person id (used for lp_profiles below)
    person_meta = {}  # canonical_id -> {"org": org_canon, "last": last_norm, "name": display, "email": email}

    for model, sid, full, email, org_canon in people:
        name_norm = norm_text(full)
        if email:
            # Email is the strongest key: the same address always maps to one person.
            key = f"e|{email}"
            match_kind, conf, match_value = "exact_email", 1.0, email
        elif name_norm:
            # NOTE(review): with no org the key ends in an empty org, so every
            # email-less, org-less row with the same name shares one entity.
            key = f"n|{name_norm}|{org_canon or ''}"
            match_kind, conf, match_value = "name_org", 0.8, name_norm
        else:
            continue
        cid = _eid("per", key)
        display = full.strip() or email
        # Later rows for the same cid overwrite the stored display name.
        _upsert_entity(conn, cid, "person", display, email)
        _link(conn, cid, model, sid, match_value, match_kind, conf)
        if model == "contacts":
            contact_to_person[sid] = cid
        # The first row seen for a cid fixes its metadata ...
        meta = person_meta.setdefault(cid, {"org": org_canon, "last": _split_name(full)[1],
                                            "name": display, "email": email})
        # ... except that a missing org is filled in from a later row.
        if org_canon and not meta["org"]:
            meta["org"] = org_canon

    # lp_profiles -> the person entity of its contact
    for r in conn.execute("SELECT id, contact_id FROM lp_profiles"):
        cid = contact_to_person.get(r["contact_id"])
        if cid:
            _link(conn, cid, "lp_profiles", r["id"], r["contact_id"], "contact_fk", 1.0)

    return person_meta
|
||||
|
||||
|
||||
def find_fuzzy_candidates(person_meta):
    """List groups of distinct person entities that share an org and a surname.

    Entities lacking either an org or a surname are ignored. Each result is
    ``{"org", "surname", "members"}`` with members as ``(id, name, email)``
    tuples; only groups of two or more are returned. Nothing is merged here.
    """
    grouped = {}
    for person_id, meta in person_meta.items():
        org, surname = meta["org"], meta["last"]
        if not (org and surname):
            continue
        grouped.setdefault((org, surname), []).append((person_id, meta["name"], meta["email"]))

    candidates = []
    for (org, surname), members in grouped.items():
        if len(members) > 1:
            candidates.append({"org": org, "surname": surname, "members": members})
    return candidates
|
||||
|
||||
|
||||
def run(db_path: str):
    """Run deterministic entity resolution against the CRM at ``db_path``.

    Resolves organizations, then people, commits after each pass, collects
    fuzzy candidates and writes one ``interaction_log`` audit row holding the
    resulting counts.

    Returns:
        (counts, candidates): the counts dict that was logged and the list
        produced by ``find_fuzzy_candidates``.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys=ON")

        org_by_oid, org_by_inv = resolve_organizations(conn)
        conn.commit()
        person_meta = resolve_people(conn, org_by_oid, org_by_inv)
        conn.commit()
        candidates = find_fuzzy_candidates(person_meta)

        def scalar(sql):
            # First column of the first row of a COUNT(*) query.
            return conn.execute(sql).fetchone()[0]

        counts = {
            "canonical_total": scalar("SELECT COUNT(*) FROM canonical_entities"),
            "lp": scalar("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp'"),
            "organization": scalar("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization'"),
            "person": scalar("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person'"),
            "links": scalar("SELECT COUNT(*) FROM entity_links"),
            "fuzzy_candidates": len(candidates),
        }

        # One clock read so the audit row's ts and created_at agree.
        stamp = _now()
        conn.execute(
            """
            INSERT INTO interaction_log (id, ts, actor_type, actor_id, action, target_type, payload, source, created_at)
            VALUES (?, ?, 'system', 'entity_resolver', 'entity_resolution.run', 'canonical_entities', ?, 'ingest', ?)
            """,
            (str(uuid.uuid4()), stamp, json.dumps(counts), stamp),
        )
        conn.commit()
    finally:
        # Always release the handle; uncommitted work is discarded on failure.
        conn.close()
    return counts, candidates
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: run resolution and print the counts.

    With ``--show-candidates`` the fuzzy merge candidates are listed as well.
    """
    parser = argparse.ArgumentParser(description="Deterministic entity resolution into the canonical layer.")
    parser.add_argument("--db", default="data/crm_dev.db", help="path to the CRM SQLite DB")
    parser.add_argument("--show-candidates", action="store_true", help="print fuzzy merge candidates")
    opts = parser.parse_args()

    counts, candidates = run(opts.db)
    print(f"Entity resolution on {opts.db}:")
    for label, value in counts.items():
        print(f" {label:<18} {value}")

    if not (opts.show_candidates and candidates):
        return
    print("\nFuzzy candidates (same org + surname, different person — for the local-Qwen tier):")
    for c in candidates:
        names = ", ".join(f"{n!r}{(' <'+e+'>') if e else ''}" for _, n, e in c["members"])
        print(f" [{c['surname']}] {names}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,35 @@
|
||||
"""Tiny stdlib JSON HTTP client (no third-party deps).
|
||||
|
||||
Handles the Spark Control self-signed cert (verify=False) and plain-HTTP Qdrant.
|
||||
"""
|
||||
import json
|
||||
import ssl
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
|
||||
def _ctx(verify: bool):
|
||||
if verify:
|
||||
return None
|
||||
ctx = ssl.create_default_context()
|
||||
ctx.check_hostname = False
|
||||
ctx.verify_mode = ssl.CERT_NONE
|
||||
return ctx
|
||||
|
||||
|
||||
def _decode_payload(raw: bytes):
    """Decode a response body as JSON; wrap a non-JSON body as ``{"raw": text}``."""
    if not raw:
        return {}
    try:
        return json.loads(raw)
    except ValueError:
        # Covers JSONDecodeError and undecodable bytes.
        return {"raw": raw.decode("utf-8", "replace")}


def request(method: str, url: str, body=None, verify: bool = True, timeout: int = 180):
    """Send a JSON request and return ``(status, payload)``.

    Args:
        method: HTTP method.
        url: Target URL.
        body: JSON-serializable request body, or ``None`` for no body.
        verify: For https URLs, whether to verify the server certificate.
        timeout: Socket timeout in seconds.

    Returns:
        ``(status, payload)`` for both success and HTTP error responses. The
        payload is the decoded JSON body, ``{}`` for an empty body, or
        ``{"raw": text}`` when the body is not JSON.

    Connection-level failures (``urllib.error.URLError``) are not caught.
    """
    data = json.dumps(body).encode("utf-8") if body is not None else None
    req = urllib.request.Request(url, data=data, method=method,
                                 headers={"Content-Type": "application/json"})
    ctx = _ctx(verify) if url.lower().startswith("https") else None
    try:
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            # Success bodies get the same tolerant decoding as error bodies.
            return resp.status, _decode_payload(resp.read())
    except urllib.error.HTTPError as exc:
        return exc.code, _decode_payload(exc.read())
|
||||
@@ -0,0 +1,50 @@
|
||||
"""Minimal Qdrant REST client for the ingest pipeline (direct to QDRANT_URL).
|
||||
|
||||
Creates the crm_chunks collection per EMBEDDINGS.md: a named dense vector
|
||||
(1024, Cosine) + a named sparse vector with modifier:idf, plus payload indexes.
|
||||
"""
|
||||
import config
|
||||
import http_util
|
||||
|
||||
# Short aliases captured once at import time: Qdrant base URL and collection name.
Q = config.QDRANT_URL
COL = config.COLLECTION
|
||||
|
||||
|
||||
def _req(method, path, body=None):
    """Send one request to Qdrant at ``path``; returns ``(status, payload)``.

    Certificate verification is always disabled for these calls.
    """
    url = f"{Q}{path}"
    return http_util.request(method, url, body, verify=False)
|
||||
|
||||
|
||||
def exists() -> bool:
    """Return True when a GET on the collection answers with status 200."""
    response = _req("GET", f"/collections/{COL}")
    return response[0] == 200
|
||||
|
||||
|
||||
def create_collection(recreate=False, dim=config.DENSE_DIM):
    """Create the collection with a named dense and a named sparse vector.

    Returns ``"exists"`` when the collection is already there and ``recreate``
    is false; otherwise any existing collection is deleted, a new one is
    created, and ``"created"`` is returned.

    Raises:
        RuntimeError: If the create request does not answer 200 or 201.
    """
    already_there = exists()
    if already_there and not recreate:
        return "exists"
    if already_there:
        _req("DELETE", f"/collections/{COL}")

    schema = {
        "vectors": {"dense": {"size": dim, "distance": "Cosine"}},
        "sparse_vectors": {"sparse": {"modifier": "idf"}},
    }
    status, data = _req("PUT", f"/collections/{COL}", schema)
    if status not in (200, 201):
        raise RuntimeError(f"create collection -> {status}: {data}")
    return "created"
|
||||
|
||||
|
||||
def ensure_indexes():
    """Request payload indexes for lp_id, doc_type and date_ts.

    Responses are not inspected, so a failed index request goes unnoticed.
    """
    wanted = {"lp_id": "keyword", "doc_type": "keyword", "date_ts": "integer"}
    for field_name, field_schema in wanted.items():
        body = {"field_name": field_name, "field_schema": field_schema}
        _req("PUT", f"/collections/{COL}/index", body)
|
||||
|
||||
|
||||
def upsert(points):
    """Upsert ``points`` into the collection, waiting for completion.

    Returns the decoded response payload.

    Raises:
        RuntimeError: If the request does not answer 200 or 201.
    """
    endpoint = f"/collections/{COL}/points?wait=true"
    status, data = _req("PUT", endpoint, {"points": points})
    if status in (200, 201):
        return data
    raise RuntimeError(f"upsert -> {status}: {data}")
|
||||
|
||||
|
||||
def count():
    """Return the exact number of points in the collection.

    Returns ``None`` when the response carries no usable ``result`` object
    (missing, or ``null`` as in an error response).
    """
    _, data = _req("POST", f"/collections/{COL}/points/count", {"exact": True})
    # "result" may be absent or null; neither should raise here.
    result = (data or {}).get("result") or {}
    return result.get("count")
|
||||
@@ -0,0 +1,109 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Phase-0 retrieval — thin wrappers over Spark Control /api/search.
|
||||
|
||||
These are the retrieval modes the CRM MCP server (Workstream C) will expose:
|
||||
* semantic_search — dense only (omit sparse), high recall
|
||||
* hybrid_search — dense + BM25 sparse (RRF) + rerank; best for entity queries
|
||||
* keyword_search — lean on the sparse leg
|
||||
All support a Qdrant `filter` (e.g. lp_id / date_ts range) to pre-filter.
|
||||
|
||||
`--demo` runs an entity-heavy query in dense-only vs hybrid to show the BM25
|
||||
lexical leg surfacing the right LP. The query's sparse vector uses the SAME
|
||||
encoder as ingest (sparse.encode).
|
||||
"""
|
||||
import argparse
|
||||
|
||||
import config
|
||||
import http_util
|
||||
import sparse
|
||||
|
||||
|
||||
def _search(query, sparse_vec=None, rerank=False, top_k=5, lp_id=None, retrieve_n=80, filt=None):
    """POST one search to Spark Control ``/api/search`` and return its hits.

    Args:
        query: Query text.
        sparse_vec: Optional ``{"indices", "values"}`` sparse query vector;
            omitted from the request when ``None``.
        rerank: Sent as the request's ``rerank`` flag.
        top_k: Number of results requested.
        lp_id: When set (and ``filt`` is not), restricts hits to this lp.
        retrieve_n: Sent as the request's ``retrieve_n``.
        filt: Raw Qdrant filter; takes precedence over ``lp_id``.

    Raises:
        RuntimeError: If the response status is not 200.
    """
    request_body = {
        "query": query,
        "collection": config.COLLECTION,
        "top_k": top_k,
        "retrieve_n": retrieve_n,
        "fusion": "rrf",
        "text_field": "text",
        "with_payload": True,
        "rerank": rerank,
    }
    if sparse_vec is not None:
        request_body["sparse"] = {part: sparse_vec[part] for part in ("indices", "values")}

    # An explicit raw Qdrant filter (filt) wins; otherwise build one from lp_id.
    if filt is not None:
        request_body["filter"] = filt
    elif lp_id:
        request_body["filter"] = {"must": [{"key": "lp_id", "match": {"value": lp_id}}]}

    endpoint = f"{config.SPARK_CONTROL_URL}/api/search"
    status, data = http_util.request("POST", endpoint, request_body, verify=config.SPARK_VERIFY_TLS)
    if status != 200:
        raise RuntimeError(f"/api/search -> {status}: {data}")
    return data.get("data", [])
|
||||
|
||||
|
||||
def semantic_search(query, **kw):
    """Dense-only search: no sparse vector is sent; rerank defaults to off."""
    use_rerank = kw.pop("rerank", False)
    return _search(query, sparse_vec=None, rerank=use_rerank, **kw)
|
||||
|
||||
|
||||
def hybrid_search(query, **kw):
    """Dense + sparse search: sends the encoded query; rerank defaults to on."""
    query_sparse = sparse.encode(query)
    use_rerank = kw.pop("rerank", True)
    return _search(query, sparse_vec=query_sparse, rerank=use_rerank, **kw)
|
||||
|
||||
|
||||
def keyword_search(query, **kw):
    """Keyword-oriented search: sends the encoded query; rerank defaults to on.

    NOTE(review): the request built here is the same one hybrid_search builds;
    nothing yet weights the sparse leg more heavily — confirm whether intended.
    """
    query_sparse = sparse.encode(query)
    use_rerank = kw.pop("rerank", True)
    return _search(query, sparse_vec=query_sparse, rerank=use_rerank, **kw)
|
||||
|
||||
|
||||
def _row(r):
|
||||
p = r.get("payload", {}) or {}
|
||||
text = (r.get("text") or p.get("text") or "").replace("\n", " ")
|
||||
return f"{p.get('lp_name', '?'):<22} [{p.get('doc_type', '?'):<13}] {text[:58]}"
|
||||
|
||||
|
||||
def _print(title, rows):
|
||||
print(f"\n {title}")
|
||||
if not rows:
|
||||
print(" (no results)")
|
||||
for i, r in enumerate(rows, 1):
|
||||
print(f" {i}. score={r.get('score', 0):+.3f} {_row(r)}")
|
||||
|
||||
|
||||
def demo():
    """Run one entity-heavy query in dense-only and hybrid mode and compare.

    Prints both result lists, the rank of the first hit for the target LP in
    each, and — if the hybrid results contain the target — a third run
    restricted to that LP's chunks.
    """
    target = "Cedar Point Capital"
    q = "Fund III diligence and wire timeline for Cedar Point"
    print(f"QUERY: {q!r}\nTarget LP: {target}")

    dense = semantic_search(q, top_k=5)
    hybrid = hybrid_search(q, top_k=5, rerank=False)  # rerank off to isolate the BM25 leg
    _print("dense-only (semantic):", dense)
    _print("hybrid (dense + BM25 RRF):", hybrid)

    def payload_of(hit):
        return hit.get("payload", {}) or {}

    def first_rank(rows):
        # 1-based position of the first hit for the target LP, else None.
        return next(
            (pos for pos, hit in enumerate(rows, 1) if payload_of(hit).get("lp_name") == target),
            None,
        )

    print(f"\n First '{target}' chunk — dense rank: {first_rank(dense)}, hybrid rank: {first_rank(hybrid)}")

    # Pre-filter demo: same query, restricted to one LP's chunks.
    lp_id = next(
        (payload_of(hit).get("lp_id") for hit in hybrid if payload_of(hit).get("lp_name") == target),
        None,
    )
    if lp_id:
        _print(f"hybrid + payload pre-filter (lp_id={lp_id}):",
               hybrid_search(q, top_k=5, rerank=True, lp_id=lp_id))
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: run the demo, or one search in the chosen mode.

    The demo runs when ``--demo`` is given or no query is supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("query", nargs="?")
    parser.add_argument("--mode", choices=["semantic", "hybrid", "keyword"], default="hybrid")
    parser.add_argument("--top-k", type=int, default=5)
    parser.add_argument("--lp-id")
    parser.add_argument("--demo", action="store_true")
    opts = parser.parse_args()

    if opts.demo or not opts.query:
        return demo()

    modes = {"semantic": semantic_search, "hybrid": hybrid_search, "keyword": keyword_search}
    search = modes[opts.mode]
    hits = search(opts.query, top_k=opts.top_k, lp_id=opts.lp_id)
    _print(f"{opts.mode}: {opts.query!r}", hits)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,40 @@
|
||||
"""Client-side BM25 sparse vectors.
|
||||
|
||||
EMBEDDINGS.md specifies FastEmbed `Qdrant/bm25` so Qdrant applies IDF (via the
|
||||
sparse vector's `modifier: idf`) over OUR corpus. FastEmbed pulls onnxruntime,
|
||||
which has no wheel for this Python (3.14) yet, so this module provides a
|
||||
dependency-free BM25 term-frequency encoder with the same contract:
|
||||
`encode(text) -> {"indices": [...], "values": [...]}`.
|
||||
|
||||
Qdrant computes IDF server-side from the stored sparse vectors regardless of how
|
||||
indices are assigned, so this is a legitimate corpus-IDF BM25 leg. The ONLY hard
|
||||
requirement is that ingest and query use the SAME encoder — they both import this
|
||||
one. For production, swap `encode()` for FastEmbed `Qdrant/bm25` (and re-index, so
|
||||
ingest and query stay on the same tokenizer).
|
||||
"""
|
||||
import hashlib
|
||||
import math
|
||||
import re
|
||||
|
||||
# A token is a maximal run of ASCII lowercase letters and digits. tokenize()
# lowercases first, so punctuation, whitespace and non-ASCII letters all act
# as separators.
_TOKEN_RE = re.compile(r"[a-z0-9]+")
|
||||
|
||||
|
||||
def tokenize(text: str):
    """Return the list of ``_TOKEN_RE`` matches in the lower-cased text.

    ``None`` and empty input give an empty list.
    """
    lowered = (text or "").lower()
    return _TOKEN_RE.findall(lowered)
|
||||
|
||||
|
||||
def _index(token: str) -> int:
|
||||
# Stable unsigned 32-bit index for a token (Qdrant sparse indices are u32).
|
||||
return int.from_bytes(hashlib.md5(token.encode("utf-8")).digest()[:4], "big")
|
||||
|
||||
|
||||
def encode(text: str):
    """Encode ``text`` as a sparse vector ``{"indices": [...], "values": [...]}``.

    Each distinct token contributes the value ``1 + ln(tf)`` at its hashed
    index; IDF is applied by Qdrant via modifier:idf. If two tokens hash to
    the same index, the later token's value replaces the earlier one.
    """
    counts = {}
    for token in tokenize(text):
        counts[token] = counts.get(token, 0) + 1

    weights = {_index(token): 1.0 + math.log(n) for token, n in counts.items()}
    return {"indices": list(weights), "values": list(weights.values())}
|
||||
Reference in New Issue
Block a user