Phase 0 foundation: canonical schema, ingest pipeline, CRM MCP server

Workstream A–C substrate for the Ten31 agentic system:

- A1: docs/crm-overview.md; CLAUDE.md conventions + guardrail #9
- A2: additive/reversible core migration (canonical_entities, entity_links, interaction_log, relationship_edges, soft-delete) + ledgered runner
- B1/B3: chunking + deterministic entity resolution (backend/ingest)
- B2: dense (bge-m3) + BM25 sparse ingest to Qdrant crm_chunks
- C: CRM MCP server (reads, retrieval modes, logged writes) — no outbound tools
- docs: redaction/re-hydration, Gmail enablement runbook
- synthetic test data; .env.example; housekeeping (.gitignore, untrack crm.db, drop legacy files + start9/0.3.5)

Verified end-to-end on synthetic data + live Sparks (hybrid > dense on entity queries). Real backfill runs on Ten31 infra; index holds synthetic data only. Branch snapshot also captures pre-existing working-tree changes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 08:11:28 -05:00
parent 7027efd777
commit c7ce44d963
99 changed files with 10676 additions and 7817 deletions
@@ -0,0 +1,5 @@
+"""Ten31 Phase-0 ingest pipeline (entity resolution, chunking, embed, Qdrant upsert).
+
+All modules are local-only and read the CRM by SQLite file path (CRM is canonical;
+the canonical/vector layers are derived). No real data is sent to Claude here.
+"""
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+"""Phase-0 Workstream B — backfill the CRM into Qdrant.
+
+Chunk -> dense (bge-m3 via Spark Control) + sparse (BM25 client-side) -> upsert
+to Qdrant `crm_chunks` with payload. Idempotent: deterministic point ids mean
+re-running upserts in place. Reads the CRM by file path; never sends data to Claude.
+
+    python3 backend/ingest/backfill.py --db data/crm_dev.db --recreate
+"""
+import argparse
+import sqlite3
+
+import chunking
+import config
+import embed
+import qdrant_io
+import sparse
+
+
+def run(db, recreate=False, batch=32):
+    """Chunk the CRM at ``db``, embed each chunk, and upsert the points to Qdrant.
+
+    Args:
+        db: Path to the CRM SQLite file; it is opened only to build the chunks.
+        recreate: Forwarded to ``qdrant_io.create_collection``.
+        batch: Number of chunks embedded and upserted per round trip (>= 1).
+
+    Raises:
+        ValueError: If ``batch`` is smaller than 1.
+        RuntimeError: If the embedder returns a different number of vectors
+            than texts submitted for a batch.
+    """
+    # Validate before touching Qdrant: with recreate=True a bad batch size
+    # would otherwise only be noticed after the collection was dropped.
+    if batch < 1:
+        raise ValueError(f"batch must be >= 1, got {batch}")
+
+    conn = sqlite3.connect(db)
+    try:
+        conn.row_factory = sqlite3.Row
+        chunks = chunking.build_chunks(conn)
+    finally:
+        # Release the SQLite handle even when chunk building fails.
+        conn.close()
+    print(f"Built {len(chunks)} chunks from {db}")
+
+    state = qdrant_io.create_collection(recreate=recreate)
+    qdrant_io.ensure_indexes()
+    print(f"Collection '{config.COLLECTION}': {state}")
+
+    total = 0
+    for i in range(0, len(chunks), batch):
+        group = chunks[i:i + batch]
+        dense = embed.dense_embed([c["text"] for c in group])
+        # zip() would silently drop chunks if the embedder came back short.
+        if len(dense) != len(group):
+            raise RuntimeError(
+                f"embedder returned {len(dense)} vectors for {len(group)} chunks")
+        points = []
+        for c, dv in zip(group, dense):
+            sv = sparse.encode(c["text"])
+            points.append({
+                "id": c["point_id"],
+                "vector": {"dense": dv, "sparse": {"indices": sv["indices"], "values": sv["values"]}},
+                "payload": {
+                    "lp_id": c["lp_id"], "lp_name": c["lp_name"], "person_id": c["person_id"],
+                    "doc_type": c["doc_type"], "date_ts": c["date_ts"], "text": c["text"],
+                    "source_model": c["source_model"], "source_id": c["source_id"], "chunk_key": c["chunk_key"],
+                },
+            })
+        qdrant_io.upsert(points)
+        total += len(points)
+        print(f"  upserted {total}/{len(chunks)}")
+
+    print(f"Done. Qdrant '{config.COLLECTION}' now holds {qdrant_io.count()} points.")
+
+
+def main():
+    """Read the command-line flags and hand them to ``run``."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--db", default=config.DEFAULT_DB)
+    parser.add_argument("--recreate", action="store_true", help="drop & recreate the collection first")
+    parser.add_argument("--batch", type=int, default=32)
+    opts = parser.parse_args()
+    run(opts.db, recreate=opts.recreate, batch=opts.batch)
+
+
+# Run the backfill CLI only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,184 @@
+"""Phase-0 Workstream B1 — chunk the CRM for retrieval.
+
+Maps each CRM record type to one or more chunks per docs/EMBEDDINGS.md:
+  * one chunk per communications row          (doc_type = the comm type)
+  * one chunk per MATCHED email               (doc_type = email; body only when matched)
+  * one chunk per fundraising_investors notes LINE (the outreach log; split per line)
+  * one chunk each for free-text fields: contacts.notes, lp_profiles.notes,
+    opportunities (description + next_step), organizations.description
+
+Each chunk carries a canonical `lp_id` (resolved via entity_links) and a `date_ts`
+(epoch of the EVENT time, not created_at) so Qdrant can pre-filter and recency-rank.
+Entities/names/dates/types are payload (filterable); only prose is embedded.
+
+A chunk's stable `chunk_key` -> deterministic point id (uuid5), so re-ingest
+upserts in place (idempotent).
+"""
+import sqlite3
+import uuid
+from datetime import datetime, timezone
+
+# Fixed namespace for chunk ids; this value equals the standard uuid.NAMESPACE_URL.
+_NS = uuid.UUID("6ba7b811-9dad-11d1-80b4-00c04fd430c8")  # uuid5 namespace for chunk ids
+
+
+def to_epoch(ts: str):
+    """Convert a timestamp string to integer epoch seconds.
+
+    The stripped string is parsed as ISO-8601 first (a ``Z`` suffix is read as
+    UTC). If that fails, its first ten characters are tried as ``YYYY-MM-DD``.
+    Values without an offset are treated as UTC.
+
+    Returns:
+        Epoch seconds as an int, or None when ``ts`` is empty or unparseable.
+    """
+    if not ts:
+        return None
+    s = ts.strip().replace("Z", "+00:00")
+    if not s:
+        return None
+    try:
+        dt = datetime.fromisoformat(s)
+    except ValueError:
+        # Date-only fallback. Slice the STRIPPED text so leading whitespace
+        # cannot push the date out of the ten-character window.
+        try:
+            dt = datetime.strptime(s[:10], "%Y-%m-%d")
+        except ValueError:
+            return None
+    if dt.tzinfo is None:
+        dt = dt.replace(tzinfo=timezone.utc)
+    return int(dt.timestamp())
+
+
+def _point_id(chunk_key: str) -> str:
+    """Return the uuid5 of ``chunk_key`` under ``_NS``, rendered as a string."""
+    point_uuid = uuid.uuid5(_NS, chunk_key)
+    return str(point_uuid)
+
+
+def _mk(chunk_key, lp_id, lp_name, person_id, doc_type, date_ts, text, source_model, source_id):
+    """Assemble one chunk dict, or return None when there is nothing to index.
+
+    A chunk is dropped when its stripped text is empty or when ``lp_id`` is
+    falsy; otherwise the dict carries the inputs plus a derived ``point_id``.
+    """
+    body = (text or "").strip()
+    if not (body and lp_id):
+        return None
+    return dict(
+        chunk_key=chunk_key,
+        point_id=_point_id(chunk_key),
+        lp_id=lp_id,
+        lp_name=lp_name,
+        person_id=person_id,
+        doc_type=doc_type,
+        date_ts=date_ts,
+        text=body,
+        source_model=source_model,
+        source_id=source_id,
+    )
+
+
+def _canon_maps(conn):
+    """Load the lookup tables that map source rows onto canonical ids.
+
+    Returns a 5-tuple ``(person_canon, org_canon, inv_canon, name, contact_org)``:
+    the first three map a source id to its canonical id for the ``contacts``,
+    ``organizations`` and ``fundraising_investors`` links respectively; ``name``
+    maps canonical id to display name; ``contact_org`` maps contact id to its
+    organization id. Links from any other source model are ignored.
+    """
+    by_model = {"contacts": {}, "organizations": {}, "fundraising_investors": {}}
+    links = conn.execute("SELECT source_model, source_id, canonical_id FROM entity_links")
+    for link in links:
+        bucket = by_model.get(link["source_model"])
+        if bucket is not None:
+            bucket[link["source_id"]] = link["canonical_id"]
+
+    name = {}
+    for entity in conn.execute("SELECT id, display_name FROM canonical_entities"):
+        name[entity["id"]] = entity["display_name"]
+
+    contact_org = {}
+    for contact in conn.execute("SELECT id, organization_id FROM contacts"):
+        contact_org[contact["id"]] = contact["organization_id"]
+
+    return (by_model["contacts"], by_model["organizations"],
+            by_model["fundraising_investors"], name, contact_org)
+
+
+def _contact_lp(cid, person_canon, org_canon, name, contact_org):
+    """Pick the lp_id for a contact-anchored chunk.
+
+    The contact's firm wins when it resolves to a canonical id; otherwise the
+    contact's own person entity is used. Returns ``(lp_id, lp_name, person_id)``.
+    """
+    person = person_canon.get(cid)
+    org_id = contact_org.get(cid)
+    firm = org_canon.get(org_id)
+    lp = firm if firm else person
+    return lp, name.get(lp), person
+
+
+def build_chunks(conn):
+    """Read every chunkable CRM record from ``conn`` and return the chunk dicts.
+
+    ``conn`` must yield rows addressable by column name (the callers in this
+    file set ``sqlite3.Row``). Each source table below contributes candidates
+    through ``_mk``, which returns None for a candidate with empty text or no
+    resolved lp_id; those are filtered out before returning. The email section
+    runs only when both ``emails`` and ``email_investor_links`` tables exist.
+    """
+    person_canon, org_canon, inv_canon, name, contact_org = _canon_maps(conn)
+    chunks = []
+
+    # communications
+    for r in conn.execute("""SELECT id, contact_id, type, subject, body, outcome, next_action, communication_date
+                             FROM communications"""):
+        lp, lp_name, person = _contact_lp(r["contact_id"], person_canon, org_canon, name, contact_org)
+        # Join only the non-blank fields; a row with none yields empty text and is dropped by _mk.
+        parts = [p for p in (r["subject"], r["body"], r["outcome"], r["next_action"]) if (p or "").strip()]
+        chunks.append(_mk(f"communications:{r['id']}", lp, lp_name, person,
+                          r["type"] or "note", to_epoch(r["communication_date"]),
+                          "\n".join(parts), "communications", r["id"]))
+
+    # contacts.notes
+    for r in conn.execute("SELECT id, notes, updated_at FROM contacts WHERE notes IS NOT NULL AND notes <> ''"):
+        lp, lp_name, person = _contact_lp(r["id"], person_canon, org_canon, name, contact_org)
+        chunks.append(_mk(f"contacts.notes:{r['id']}", lp, lp_name, person,
+                          "contact_note", to_epoch(r["updated_at"]), r["notes"], "contacts", r["id"]))
+
+    # lp_profiles.notes
+    for r in conn.execute("""SELECT lp.id, lp.contact_id, lp.notes, lp.updated_at
+                             FROM lp_profiles lp WHERE lp.notes IS NOT NULL AND lp.notes <> ''"""):
+        lp, lp_name, person = _contact_lp(r["contact_id"], person_canon, org_canon, name, contact_org)
+        chunks.append(_mk(f"lp_profiles.notes:{r['id']}", lp, lp_name, person,
+                          "lp_note", to_epoch(r["updated_at"]), r["notes"], "lp_profiles", r["id"]))
+
+    # opportunities (description + next_step)
+    for r in conn.execute("""SELECT id, contact_id, name, description, next_step, updated_at
+                             FROM opportunities"""):
+        lp, lp_name, person = _contact_lp(r["contact_id"], person_canon, org_canon, name, contact_org)
+        # The opportunity name is embedded too, ahead of description and next_step.
+        parts = [p for p in (r["name"], r["description"], r["next_step"]) if (p or "").strip()]
+        chunks.append(_mk(f"opportunities:{r['id']}", lp, lp_name, person,
+                          "opportunity", to_epoch(r["updated_at"]), "\n".join(parts), "opportunities", r["id"]))
+
+    # organizations.description
+    for r in conn.execute("""SELECT id, description, updated_at FROM organizations
+                             WHERE description IS NOT NULL AND description <> ''"""):
+        lp = org_canon.get(r["id"])
+        chunks.append(_mk(f"organizations.description:{r['id']}", lp, name.get(lp), None,
+                          "org_note", to_epoch(r["updated_at"]), r["description"], "organizations", r["id"]))
+
+    # fundraising_investors.notes — running outreach log, split per non-empty line
+    for r in conn.execute("""SELECT id, notes, updated_at FROM fundraising_investors
+                             WHERE notes IS NOT NULL AND notes <> ''"""):
+        lp = inv_canon.get(r["id"])
+        # The line index is part of the chunk key, so inserting or removing a
+        # line shifts the keys of every later line in the same notes field.
+        for i, line in enumerate(str(r["notes"]).splitlines()):
+            if line.strip():
+                chunks.append(_mk(f"fundraising_investors.notes:{r['id']}:{i}", lp, name.get(lp), None,
+                                  "outreach_note", to_epoch(r["updated_at"]), line, "fundraising_investors", r["id"]))
+
+    # MATCHED emails (only matched rows carry a body; key lp via email_investor_links)
+    if _has_table(conn, "emails") and _has_table(conn, "email_investor_links"):
+        for r in conn.execute("""SELECT id, subject, body_text, snippet, sent_at FROM emails WHERE is_matched=1"""):
+            lp, lp_name = _email_lp(conn, r["id"], inv_canon, org_canon, person_canon, name)
+            # The snippet stands in for the body when body_text is empty.
+            text = "\n".join(p for p in (r["subject"], r["body_text"] or r["snippet"]) if (p or "").strip())
+            chunks.append(_mk(f"emails:{r['id']}", lp, lp_name, None, "email",
+                              to_epoch(r["sent_at"]), text, "emails", r["id"]))
+
+    # Drop the None placeholders returned by _mk.
+    return [c for c in chunks if c]
+
+
+def _has_table(conn, name):
+    """Return True when sqlite_master lists a table called ``name``."""
+    hit = conn.execute("SELECT 1 FROM sqlite_master WHERE type='table' AND name=?", (name,)).fetchone()
+    return hit is not None
+
+
+def _email_lp(conn, email_id, inv_canon, org_canon, person_canon, name):
+    """Resolve the lp_id for a matched email from its highest-confidence link.
+
+    Only the email_investor_links row with the greatest match_confidence is
+    consulted. Within that row the first id that resolves wins, in the order
+    fundraising_investor, contact, organization. Returns ``(lp_id, lp_name)``,
+    or ``(None, None)`` when the email has no link row.
+    """
+    link = conn.execute("""SELECT fundraising_investor_id, contact_id, organization_id
+                          FROM email_investor_links WHERE email_id=? ORDER BY match_confidence DESC LIMIT 1""",
+                        (email_id,)).fetchone()
+    if not link:
+        return None, None
+    lp = inv_canon.get(link["fundraising_investor_id"])
+    if not lp:
+        lp = person_canon.get(link["contact_id"])
+    if not lp:
+        lp = org_canon.get(link["organization_id"])
+    return lp, name.get(lp)
+
+
+if __name__ == "__main__":
+    # Smoke-test CLI: build the chunks for one DB and print a per-doc_type
+    # summary plus one sample chunk.
+    import argparse
+    from collections import Counter
+    from config import DEFAULT_DB
+
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--db", default=DEFAULT_DB)
+    args = ap.parse_args()
+    conn = sqlite3.connect(args.db)
+    try:
+        conn.row_factory = sqlite3.Row
+        chunks = build_chunks(conn)
+    finally:
+        # Close the handle whether or not chunk building succeeded.
+        conn.close()
+    print(f"{len(chunks)} chunks from {args.db}")
+    for dt, n in Counter(c["doc_type"] for c in chunks).most_common():
+        print(f"  {dt:<16} {n}")
+    unresolved = sum(1 for c in chunks if not c["lp_id"])
+    print(f"  (all chunks have an lp_id: {unresolved == 0})")
+    # An empty or fully unresolved DB yields no chunks; skip the sample
+    # instead of failing on chunks[0].
+    if chunks:
+        print("\nSample chunk:")
+        s = chunks[0]
+        print({k: (v[:80] + '…' if k == 'text' and v and len(v) > 80 else v) for k, v in s.items()})
@@ -0,0 +1,28 @@
+"""Ingest config — loads .env and exposes the Spark/Qdrant/CRM settings."""
+import os
+
+# Directory three levels above this file; .env and the default data/ path hang off it.
+_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+
+def load_env(path=None):
+    """Load ``KEY=VALUE`` lines from a .env file into ``os.environ``.
+
+    Variables already present in the environment are left untouched. Blank
+    lines, ``#`` comments, lines without ``=`` and lines with an empty key are
+    skipped. One pair of matching surrounding quotes is removed from a value.
+
+    Args:
+        path: File to read; defaults to ``.env`` under ``_ROOT``. A missing
+            file is not an error.
+    """
+    path = path or os.path.join(_ROOT, ".env")
+    if not os.path.exists(path):
+        return
+    with open(path, "r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line or line.startswith("#") or "=" not in line:
+                continue
+            k, v = line.split("=", 1)
+            k, v = k.strip(), v.strip()
+            if not k:
+                # "=value" has no variable name; setting an empty name can raise.
+                continue
+            if len(v) >= 2 and v[0] == v[-1] and v[0] in "\"'":
+                # KEY="value" / KEY='value': the quotes are syntax, not data.
+                v = v[1:-1]
+            os.environ.setdefault(k, v)
+
+
+# Populate os.environ from the .env file before the settings below are read.
+load_env()
+
+# Base URLs have trailing "/" removed; they default to "" when unset.
+SPARK_CONTROL_URL = os.environ.get("SPARK_CONTROL_URL", "").rstrip("/")
+# TLS verification for Spark Control is OFF unless the variable is 1/true/yes/on.
+SPARK_VERIFY_TLS = os.environ.get("SPARK_CONTROL_VERIFY_TLS", "false").lower() in ("1", "true", "yes", "on")
+QDRANT_URL = os.environ.get("QDRANT_URL", "").rstrip("/")
+COLLECTION = os.environ.get("CRM_QDRANT_COLLECTION", "crm_chunks")
+EMBED_MODEL = os.environ.get("CRM_EMBED_MODEL", "BAAI/bge-m3")
+# int() raises ValueError at import time if CRM_EMBED_DIM is not an integer.
+DENSE_DIM = int(os.environ.get("CRM_EMBED_DIM", "1024"))
+DEFAULT_DB = os.environ.get("CRM_DEV_DB_PATH", os.path.join(_ROOT, "data", "crm_dev.db"))
@@ -0,0 +1,17 @@
+"""Dense embeddings via Spark Control /v1/embeddings (BAAI/bge-m3, 1024-d)."""
+import config
+import http_util
+
+
+def dense_embed(texts, batch=32):
+    """Embed ``texts`` via Spark Control ``/v1/embeddings``, ``batch`` at a time.
+
+    Args:
+        texts: Sequence of strings to embed.
+        batch: Number of texts per request (>= 1).
+
+    Returns:
+        One embedding per input text, in input order.
+
+    Raises:
+        ValueError: If ``batch`` is smaller than 1.
+        RuntimeError: On a non-200 response, or when a response does not carry
+            exactly one embedding per submitted text.
+    """
+    if batch < 1:
+        raise ValueError(f"batch must be >= 1, got {batch}")
+    out = []
+    for i in range(0, len(texts), batch):
+        group = texts[i:i + batch]
+        status, data = http_util.request(
+            "POST", f"{config.SPARK_CONTROL_URL}/v1/embeddings",
+            {"input": group, "model": config.EMBED_MODEL}, verify=config.SPARK_VERIFY_TLS)
+        if status != 200:
+            raise RuntimeError(f"/v1/embeddings -> {status}: {data}")
+        # Rows carry their own position; sort so output order matches input order.
+        rows = sorted(data.get("data") or [], key=lambda d: d["index"])
+        # Callers zip embeddings against their inputs, so a short response
+        # must fail loudly rather than misalign or drop items.
+        if len(rows) != len(group):
+            raise RuntimeError(
+                f"/v1/embeddings returned {len(rows)} embeddings for {len(group)} inputs")
+        out.extend(r["embedding"] for r in rows)
+    return out
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""Phase-0 Workstream B3 / A4 — entity resolution (deterministic tier).
+
+Collapses the CRM's two parallel investor models into the canonical identity
+layer created by migration 0001:
+
+    organizations          ─┐
+    fundraising_investors  ─┴─►  canonical_entities (entity_kind = lp | organization)
+    contacts               ─┐
+    fundraising_contacts   ─┴─►  canonical_entities (entity_kind = person)
+    lp_profiles            ───►  linked to its contact's person entity
+
+Every source row is recorded in `entity_links` so any name variant resolves to
+one canonical id. This is the DETERMINISTIC tier — it merges only what we can
+prove (exact email; exact normalized name within the same canonical org). The
+HARD cases (nicknames like "Jon" vs "Jonathan", typos) are NOT guessed; they are
+emitted as *fuzzy candidates* for the local-Qwen tier (Spark Control
+/v1/chat/completions) to adjudicate later. Honest separation: we never silently
+merge on a guess.
+
+Properties:
+  * Local-only, read-mostly: reads CRM source tables, writes only the derived
+    canonical_entities / entity_links and an interaction_log audit row. Never
+    mutates a CRM source record (guardrail #2/#3).
+  * Idempotent: canonical ids are deterministic (sha1 of the resolution key), so
+    re-running upserts in place and keeps ids stable across runs — which keeps
+    downstream Qdrant point ids valid (no churn on re-embed).
+  * Logged: writes one interaction_log row per run (guardrail #5).
+
+Usage:
+    python3 backend/ingest/entity_resolution.py --db data/crm_dev.db
+    python3 backend/ingest/entity_resolution.py --db data/crm_dev.db --show-candidates
+"""
+import argparse
+import hashlib
+import json
+import re
+import sqlite3
+import uuid
+from collections import defaultdict
+from datetime import datetime, timezone
+
+
+# ── normalization ─────────────────────────────────────────────────────────────
+
+def norm_text(s: str) -> str:
+    """Lower-case ``s``, replace punctuation with spaces, and collapse whitespace."""
+    lowered = (s or "").strip().lower()
+    depunctuated = re.sub(r"[^\w\s]", " ", lowered)
+    collapsed = re.sub(r"\s+", " ", depunctuated)
+    return collapsed.strip()
+
+
+def norm_email(s: str) -> str:
+    """Return ``s`` trimmed and lower-cased; falsy input becomes ""."""
+    if not s:
+        return ""
+    return s.strip().lower()
+
+
+def _eid(prefix: str, key: str) -> str:
+    """Build ``<prefix>_<first 12 hex chars of sha1(key)>``; same key, same id."""
+    digest = hashlib.sha1(key.encode("utf-8")).hexdigest()
+    return f"{prefix}_{digest[:12]}"
+
+
+def _now() -> str:
+    """Return the current UTC time as an ISO-8601 string with offset."""
+    current = datetime.now(tz=timezone.utc)
+    return current.isoformat()
+
+
+def _split_name(full: str):
+    """Return ``(first, last)`` tokens of the normalized name.
+
+    A single-token name yields ``(token, "")``; an empty name yields ``("", "")``.
+    """
+    tokens = norm_text(full).split()
+    if not tokens:
+        return "", ""
+    first = tokens[0]
+    last = tokens[-1] if len(tokens) > 1 else ""
+    return first, last
+
+
+# ── upsert helpers ────────────────────────────────────────────────────────────
+
+def _upsert_entity(conn, eid, kind, display_name, primary_email):
+    """Insert the canonical entity ``eid`` or refresh it when it already exists.
+
+    On conflict the display name, kind and updated_at are overwritten; the
+    stored primary_email is kept when the new one is empty. ``created_at`` is
+    written only on first insert. The caller is responsible for committing.
+    """
+    # One clock read, so a freshly inserted row has created_at == updated_at.
+    stamp = _now()
+    conn.execute(
+        """
+        INSERT INTO canonical_entities (id, entity_kind, display_name, primary_email, source, created_at, updated_at)
+        VALUES (?, ?, ?, ?, 'entity_resolution', ?, ?)
+        ON CONFLICT(id) DO UPDATE SET
+            display_name  = excluded.display_name,
+            primary_email = COALESCE(excluded.primary_email, canonical_entities.primary_email),
+            entity_kind   = excluded.entity_kind,
+            updated_at    = excluded.updated_at
+        """,
+        # Empty email is stored as NULL so COALESCE can fall back to the old value.
+        (eid, kind, display_name, primary_email or None, stamp, stamp),
+    )
+
+
+def _link(conn, canonical_id, source_model, source_id, match_value, match_kind, confidence):
+    """Record that a source row resolves to ``canonical_id``.
+
+    A link is unique per ``(source_model, source_id, match_value)``; writing it
+    again repoints the existing row and refreshes match_kind and confidence.
+    The caller is responsible for committing.
+    """
+    statement = """
+        INSERT INTO entity_links (id, canonical_id, source_model, source_id, match_value, match_kind, confidence, created_at)
+        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+        ON CONFLICT(source_model, source_id, match_value) DO UPDATE SET
+            canonical_id = excluded.canonical_id,
+            match_kind   = excluded.match_kind,
+            confidence   = excluded.confidence
+        """
+    link_id = str(uuid.uuid4())
+    params = (link_id, canonical_id, source_model, source_id, match_value, match_kind, confidence, _now())
+    conn.execute(statement, params)
+
+
+# ── resolution passes ─────────────────────────────────────────────────────────
+
+def resolve_organizations(conn):
+    """Merge organizations + fundraising_investors by normalized name.
+
+    Rows whose normalized name is empty are skipped. Each remaining name group
+    becomes one canonical entity, and every member row gets an entity_links
+    row with match_kind ``exact_name`` and confidence 1.0.
+
+    Returns (org_canon_by_orgid, org_canon_by_fundinv) so the people pass can
+    attach each person to their firm's canonical id.
+    """
+    # normalized name -> member ids plus the display name/email chosen so far
+    groups = defaultdict(lambda: {"orgs": [], "investors": [], "name": "", "email": ""})
+
+    for r in conn.execute("SELECT id, name, email FROM organizations"):
+        key = norm_text(r["name"])
+        if not key:
+            continue
+        g = groups[key]
+        g["orgs"].append(r["id"])
+        # Keep the longest raw spelling as the display name.
+        if len(r["name"] or "") > len(g["name"]):
+            g["name"] = r["name"]
+        # First non-blank organization email wins.
+        if not g["email"] and (r["email"] or "").strip():
+            g["email"] = r["email"].strip()
+
+    for r in conn.execute("SELECT id, investor_name FROM fundraising_investors"):
+        key = norm_text(r["investor_name"])
+        if not key:
+            continue
+        g = groups[key]
+        g["investors"].append(r["id"])
+        # The investor spelling is used only when no organization supplied a name.
+        if not g["name"]:
+            g["name"] = r["investor_name"]
+
+    org_canon_by_orgid, org_canon_by_fundinv = {}, {}
+    for key, g in groups.items():
+        # An org we are actively raising from (has a fundraising row) is an 'lp';
+        # otherwise a plain 'organization'.
+        kind = "lp" if g["investors"] else "organization"
+        # NOTE(review): the id prefix depends on kind, so a group that later
+        # gains its first fundraising row gets a new "lp_" id while the earlier
+        # "org_" entity row is left in place — confirm this is intended.
+        cid = _eid("lp" if kind == "lp" else "org", key)
+        _upsert_entity(conn, cid, kind, g["name"], g["email"])
+        for oid in g["orgs"]:
+            _link(conn, cid, "organizations", oid, key, "exact_name", 1.0)
+            org_canon_by_orgid[oid] = cid
+        for iid in g["investors"]:
+            _link(conn, cid, "fundraising_investors", iid, key, "exact_name", 1.0)
+            org_canon_by_fundinv[iid] = cid
+
+    return org_canon_by_orgid, org_canon_by_fundinv
+
+
+def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv):
+    """Merge contacts + fundraising_contacts by exact email, else exact name within
+    the same canonical org.
+
+    Rows with neither an email nor a name are skipped. lp_profiles rows are
+    linked to the person entity of their contact.
+
+    Returns ``person_meta``: canonical person id -> {"org", "last", "name",
+    "email"}, which feeds ``find_fuzzy_candidates``. (The contact_id -> person
+    id map is used only internally, for the lp_profiles links.)
+    """
+    # gather (model, source_id, full_name, email, org_canon)
+    people = []
+    for r in conn.execute("SELECT id, first_name, last_name, email, organization_id FROM contacts"):
+        full = f"{r['first_name'] or ''} {r['last_name'] or ''}".strip()
+        people.append(("contacts", r["id"], full, norm_email(r["email"]),
+                       org_canon_by_orgid.get(r["organization_id"])))
+    for r in conn.execute("SELECT id, full_name, email, investor_id FROM fundraising_contacts"):
+        people.append(("fundraising_contacts", r["id"], r["full_name"] or "", norm_email(r["email"]),
+                       org_canon_by_fundinv.get(r["investor_id"])))
+
+    contact_to_person = {}
+    person_meta = {}  # canonical_id -> {"org": org_canon, "last": last_norm, "name": display, "email": email}
+
+    for model, sid, full, email, org_canon in people:
+        name_norm = norm_text(full)
+        # An email, when present, is the resolution key on its own; the
+        # name+org key is used only for rows without one.
+        if email:
+            key = f"e|{email}"
+            match_kind, conf, match_value = "exact_email", 1.0, email
+        elif name_norm:
+            key = f"n|{name_norm}|{org_canon or ''}"
+            match_kind, conf, match_value = "name_org", 0.8, name_norm
+        else:
+            continue
+        cid = _eid("per", key)
+        display = full.strip() or email
+        # Each row sharing this id overwrites display_name; the last one processed wins.
+        _upsert_entity(conn, cid, "person", display, email)
+        _link(conn, cid, model, sid, match_value, match_kind, conf)
+        if model == "contacts":
+            contact_to_person[sid] = cid
+        # person_meta keeps the FIRST row's name/email/surname for an id ...
+        meta = person_meta.setdefault(cid, {"org": org_canon, "last": _split_name(full)[1],
+                                             "name": display, "email": email})
+        # ... but backfills the org from a later row when the first had none.
+        if org_canon and not meta["org"]:
+            meta["org"] = org_canon
+
+    # lp_profiles -> the person entity of its contact
+    for r in conn.execute("SELECT id, contact_id FROM lp_profiles"):
+        cid = contact_to_person.get(r["contact_id"])
+        if cid:
+            _link(conn, cid, "lp_profiles", r["id"], r["contact_id"], "contact_fk", 1.0)
+
+    return person_meta
+
+
+def find_fuzzy_candidates(person_meta):
+    """List groups of distinct person entities that share an org and a surname.
+
+    Such entities may be one individual under a name variant (e.g.
+    Jon/Jonathan). They are only reported for the local-Qwen tier, never
+    merged here. Entities without an org or without a surname are ignored,
+    and a group needs at least two members to be reported.
+    """
+    buckets = defaultdict(list)
+    for cid, meta in person_meta.items():
+        org, surname = meta["org"], meta["last"]
+        if not (org and surname):
+            continue
+        buckets[(org, surname)].append((cid, meta["name"], meta["email"]))
+
+    candidates = []
+    for (org, surname), members in buckets.items():
+        if len(members) > 1:
+            candidates.append({"org": org, "surname": surname, "members": members})
+    return candidates
+
+
+def run(db_path: str):
+    """Run both resolution passes against ``db_path`` and log the outcome.
+
+    Organizations are resolved and committed first, then people. Afterwards
+    the entity/link counts are written as one interaction_log row.
+
+    Returns:
+        ``(counts, candidates)``: the summary dict that was logged, and the
+        fuzzy merge candidates from ``find_fuzzy_candidates``.
+    """
+    conn = sqlite3.connect(db_path)
+    try:
+        conn.row_factory = sqlite3.Row
+        conn.execute("PRAGMA foreign_keys=ON")
+
+        org_by_oid, org_by_inv = resolve_organizations(conn)
+        conn.commit()
+        person_meta = resolve_people(conn, org_by_oid, org_by_inv)
+        conn.commit()
+        candidates = find_fuzzy_candidates(person_meta)
+
+        counts = {
+            "canonical_total": conn.execute("SELECT COUNT(*) FROM canonical_entities").fetchone()[0],
+            "lp": conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp'").fetchone()[0],
+            "organization": conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization'").fetchone()[0],
+            "person": conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person'").fetchone()[0],
+            "links": conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0],
+            "fuzzy_candidates": len(candidates),
+        }
+
+        # One clock read so the audit row's ts and created_at agree.
+        logged_at = _now()
+        conn.execute(
+            """
+            INSERT INTO interaction_log (id, ts, actor_type, actor_id, action, target_type, payload, source, created_at)
+            VALUES (?, ?, 'system', 'entity_resolver', 'entity_resolution.run', 'canonical_entities', ?, 'ingest', ?)
+            """,
+            (str(uuid.uuid4()), logged_at, json.dumps(counts), logged_at),
+        )
+        conn.commit()
+    finally:
+        # Release the handle even when a pass or the audit insert fails.
+        conn.close()
+    return counts, candidates
+
+
+def main():
+    """CLI entry point: run the resolver and print counts (and candidates on request)."""
+    parser = argparse.ArgumentParser(description="Deterministic entity resolution into the canonical layer.")
+    parser.add_argument("--db", default="data/crm_dev.db", help="path to the CRM SQLite DB")
+    parser.add_argument("--show-candidates", action="store_true", help="print fuzzy merge candidates")
+    opts = parser.parse_args()
+
+    counts, candidates = run(opts.db)
+    print(f"Entity resolution on {opts.db}:")
+    for label, value in counts.items():
+        print(f"  {label:<18} {value}")
+
+    if not (opts.show_candidates and candidates):
+        return
+    print("\nFuzzy candidates (same org + surname, different person — for the local-Qwen tier):")
+    for cand in candidates:
+        rendered = []
+        for _, person_name, person_email in cand["members"]:
+            # Append " <email>" only when the member has one.
+            suffix = (' <' + person_email + '>') if person_email else ''
+            rendered.append(f"{person_name!r}{suffix}")
+        names = ", ".join(rendered)
+        print(f"  [{cand['surname']}] {names}")
+
+
+# Run the resolver CLI only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,35 @@
+"""Tiny stdlib JSON HTTP client (no third-party deps).
+
+Handles the Spark Control self-signed cert (verify=False) and plain-HTTP Qdrant.
+"""
+import json
+import ssl
+import urllib.error
+import urllib.request
+
+
+def _ctx(verify: bool):
+    """Return the SSL context to use for a request.
+
+    ``None`` (the library default) when certificates should be verified;
+    otherwise a context with hostname checking and verification switched off.
+    """
+    if not verify:
+        insecure = ssl.create_default_context()
+        # check_hostname must be cleared before verify_mode can be CERT_NONE.
+        insecure.check_hostname = False
+        insecure.verify_mode = ssl.CERT_NONE
+        return insecure
+    return None
+
+
+def _json_or_raw(raw: bytes):
+    """Decode a response body: parsed JSON, {} when empty, else {"raw": text}."""
+    if not raw:
+        return {}
+    try:
+        return json.loads(raw)
+    except ValueError:
+        # Covers JSONDecodeError and undecodable bytes (both ValueError subclasses).
+        return {"raw": raw.decode("utf-8", "replace")}
+
+
+def request(method: str, url: str, body=None, verify: bool = True, timeout: int = 180):
+    """Send a JSON request and return ``(status, payload)``.
+
+    Args:
+        method: HTTP method name.
+        url: Absolute URL.
+        body: JSON-serializable request body, or None for no body.
+        verify: When False, certificate checks are disabled for https URLs.
+        timeout: Socket timeout in seconds.
+
+    Returns:
+        The HTTP status and the decoded body. HTTP error statuses are returned,
+        not raised. A body that is not valid JSON is returned as
+        ``{"raw": <text>}`` on success and error responses alike.
+
+    Raises:
+        urllib.error.URLError: When the server cannot be reached at all.
+    """
+    data = json.dumps(body).encode("utf-8") if body is not None else None
+    req = urllib.request.Request(url, data=data, method=method,
+                                 headers={"Content-Type": "application/json"})
+    ctx = _ctx(verify) if url.lower().startswith("https") else None
+    try:
+        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
+            status, raw = resp.status, resp.read()
+    except urllib.error.HTTPError as exc:
+        # 4xx/5xx: hand the status and body back to the caller instead of raising.
+        status, raw = exc.code, exc.read()
+    return status, _json_or_raw(raw)
@@ -0,0 +1,50 @@
+"""Minimal Qdrant REST client for the ingest pipeline (direct to QDRANT_URL).
+
+Creates the crm_chunks collection per EMBEDDINGS.md: a named dense vector
+(1024, Cosine) + a named sparse vector with modifier:idf, plus payload indexes.
+"""
+import config
+import http_util
+
+# Short aliases, copied from config once when this module is imported.
+Q = config.QDRANT_URL
+COL = config.COLLECTION
+
+
+def _req(method, path, body=None):
+    """Send ``method`` to the Qdrant path ``path`` and return ``(status, payload)``.
+
+    Certificate verification is always disabled for these calls.
+    """
+    url = f"{Q}{path}"
+    return http_util.request(method, url, body, verify=False)
+
+
+def exists() -> bool:
+    """Return True when a GET on the collection answers 200."""
+    status = _req("GET", f"/collections/{COL}")[0]
+    return status == 200
+
+
+def create_collection(recreate=False, dim=config.DENSE_DIM):
+    """Create the collection with a named dense and a named sparse vector.
+
+    Args:
+        recreate: When the collection already exists, drop it first instead of
+            leaving it untouched.
+        dim: Size of the ``dense`` vector.
+
+    Returns:
+        ``"exists"`` when the collection was already there and kept, otherwise
+        ``"created"``.
+
+    Raises:
+        RuntimeError: If the drop or the create request is rejected.
+    """
+    if exists():
+        if not recreate:
+            return "exists"
+        status, data = _req("DELETE", f"/collections/{COL}")
+        # A failed drop would otherwise surface as a confusing create error.
+        # 404 is tolerated: the collection is already gone.
+        if status >= 300 and status != 404:
+            raise RuntimeError(f"delete collection -> {status}: {data}")
+    status, data = _req("PUT", f"/collections/{COL}", {
+        "vectors": {"dense": {"size": dim, "distance": "Cosine"}},
+        "sparse_vectors": {"sparse": {"modifier": "idf"}},
+    })
+    if status not in (200, 201):
+        raise RuntimeError(f"create collection -> {status}: {data}")
+    return "created"
+
+
+def ensure_indexes():
+    """Request payload indexes on lp_id, doc_type and date_ts.
+
+    The responses are not inspected, so a rejected request passes silently.
+    """
+    wanted = {"lp_id": "keyword", "doc_type": "keyword", "date_ts": "integer"}
+    for field, schema in wanted.items():
+        _req("PUT", f"/collections/{COL}/index", {"field_name": field, "field_schema": schema})
+
+
+def upsert(points):
+    """Write ``points`` to the collection, waiting for the write to complete.
+
+    Returns the response payload; raises RuntimeError on any status other
+    than 200 or 201.
+    """
+    status, data = _req("PUT", f"/collections/{COL}/points?wait=true", {"points": points})
+    if status in (200, 201):
+        return data
+    raise RuntimeError(f"upsert -> {status}: {data}")
+
+
+def count():
+    """Return the exact number of points in the collection.
+
+    Returns None when the response carries no count (for example an error
+    payload), rather than raising.
+    """
+    _status, data = _req("POST", f"/collections/{COL}/points/count", {"exact": True})
+    # "result" may be absent or null in an error payload; treat both as empty.
+    result = (data or {}).get("result") or {}
+    return result.get("count")
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""Phase-0 retrieval — thin wrappers over Spark Control /api/search.
+
+These are the retrieval modes the CRM MCP server (Workstream C) will expose:
+  * semantic_search  — dense only (omit sparse), high recall
+  * hybrid_search    — dense + BM25 sparse (RRF) + rerank; best for entity queries
+  * keyword_search   — lean on the sparse leg
+All support a Qdrant `filter` (e.g. lp_id / date_ts range) to pre-filter.
+
+`--demo` runs an entity-heavy query in dense-only vs hybrid to show the BM25
+lexical leg surfacing the right LP. The query's sparse vector uses the SAME
+encoder as ingest (sparse.encode).
+"""
+import argparse
+
+import config
+import http_util
+import sparse
+
+
+def _search(query, sparse_vec=None, rerank=False, top_k=5, lp_id=None, retrieve_n=80, filt=None):
+    """POST one search to Spark Control ``/api/search`` and return its ``data`` list.
+
+    ``sparse_vec`` (``{"indices", "values"}``) is sent only when given. An
+    explicit raw Qdrant filter ``filt`` takes precedence; otherwise a truthy
+    ``lp_id`` becomes a ``must`` match on the ``lp_id`` payload field.
+
+    Raises RuntimeError on any non-200 response.
+    """
+    payload = dict(
+        query=query, collection=config.COLLECTION, top_k=top_k,
+        retrieve_n=retrieve_n, fusion="rrf", text_field="text",
+        with_payload=True, rerank=rerank,
+    )
+    if sparse_vec is not None:
+        payload["sparse"] = {key: sparse_vec[key] for key in ("indices", "values")}
+    if filt is None and lp_id:
+        filt = {"must": [{"key": "lp_id", "match": {"value": lp_id}}]}
+    if filt is not None:
+        payload["filter"] = filt
+
+    endpoint = f"{config.SPARK_CONTROL_URL}/api/search"
+    status, data = http_util.request("POST", endpoint, payload, verify=config.SPARK_VERIFY_TLS)
+    if status != 200:
+        raise RuntimeError(f"/api/search -> {status}: {data}")
+    return data.get("data", [])
+
+
+def semantic_search(query, **kw):
+    """Dense-only search: no sparse vector is sent; rerank defaults to off."""
+    rerank = kw.pop("rerank", False)
+    return _search(query, sparse_vec=None, rerank=rerank, **kw)
+
+
+def hybrid_search(query, **kw):
+    """Dense + sparse search: sends the query's sparse encoding; rerank defaults to on."""
+    query_sparse = sparse.encode(query)
+    rerank = kw.pop("rerank", True)
+    return _search(query, sparse_vec=query_sparse, rerank=rerank, **kw)
+
+
+def keyword_search(query, **kw):
+    """Sparse-leaning search: sends the query's sparse encoding; rerank defaults to on.
+
+    NOTE(review): this currently issues exactly the same request as
+    ``hybrid_search``; nothing here weights the sparse leg more heavily.
+    """
+    query_sparse = sparse.encode(query)
+    rerank = kw.pop("rerank", True)
+    return _search(query, sparse_vec=query_sparse, rerank=rerank, **kw)
+
+
+def _row(r):
+    """Format one search hit as ``<lp_name> [<doc_type>] <first 58 chars of text>``.
+
+    Newlines in the text become spaces. A missing or null ``lp_name`` /
+    ``doc_type`` is shown as ``?``.
+    """
+    p = r.get("payload") or {}
+    text = (r.get("text") or p.get("text") or "").replace("\n", " ")
+    # Chunks may carry an explicit null lp_name; None cannot take a width
+    # format spec, so fall back to "?" for null as well as for missing keys.
+    lp_name = p.get("lp_name") or "?"
+    doc_type = p.get("doc_type") or "?"
+    return f"{lp_name:<22} [{doc_type:<13}] {text[:58]}"
+
+
+def _print(title, rows):
+    """Print ``title`` followed by one ranked, scored line per hit in ``rows``."""
+    print(f"\n  {title}")
+    if not rows:
+        print("    (no results)")
+    rank = 0
+    for hit in rows:
+        rank += 1
+        score = hit.get('score', 0)
+        print(f"    {rank}. score={score:+.3f}  {_row(hit)}")
+
+
+def demo():
+    """Run one entity-heavy query dense-only and hybrid, and print both rankings.
+
+    Also prints the rank at which the target LP first appears in each list,
+    and, when the hybrid results contain the target, a third search
+    restricted to that LP's chunks.
+    """
+    target = "Cedar Point Capital"
+    q = "Fund III diligence and wire timeline for Cedar Point"
+    print(f"QUERY: {q!r}\nTarget LP: {target}")
+
+    dense = semantic_search(q, top_k=5)
+    hybrid = hybrid_search(q, top_k=5, rerank=False)   # rerank off to isolate the BM25 leg
+    _print("dense-only (semantic):", dense)
+    _print("hybrid (dense + BM25 RRF):", hybrid)
+
+    def payload_of(hit):
+        return hit.get("payload", {}) or {}
+
+    def first_rank(rows):
+        # 1-based position of the first hit for the target LP, or None.
+        return next((pos for pos, hit in enumerate(rows, 1)
+                     if payload_of(hit).get("lp_name") == target), None)
+
+    print(f"\n  First '{target}' chunk — dense rank: {first_rank(dense)}, hybrid rank: {first_rank(hybrid)}")
+
+    # Pre-filter demo: same query, restricted to one LP's chunks.
+    lp_id = next((payload_of(hit).get("lp_id") for hit in hybrid
+                  if payload_of(hit).get("lp_name") == target), None)
+    if lp_id:
+        _print(f"hybrid + payload pre-filter (lp_id={lp_id}):",
+               hybrid_search(q, top_k=5, rerank=True, lp_id=lp_id))
+
+
+def main():
+    """CLI entry point: run the demo, or one search in the chosen mode."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("query", nargs="?")
+    parser.add_argument("--mode", choices=["semantic", "hybrid", "keyword"], default="hybrid")
+    parser.add_argument("--top-k", type=int, default=5)
+    parser.add_argument("--lp-id")
+    parser.add_argument("--demo", action="store_true")
+    opts = parser.parse_args()
+
+    # No query behaves like --demo.
+    if opts.demo or not opts.query:
+        return demo()
+
+    modes = {"semantic": semantic_search, "hybrid": hybrid_search, "keyword": keyword_search}
+    search_fn = modes[opts.mode]
+    hits = search_fn(opts.query, top_k=opts.top_k, lp_id=opts.lp_id)
+    _print(f"{opts.mode}: {opts.query!r}", hits)
+
+
+# Run the search CLI only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,40 @@
+"""Client-side BM25 sparse vectors.
+
+EMBEDDINGS.md specifies FastEmbed `Qdrant/bm25` so Qdrant applies IDF (via the
+sparse vector's `modifier: idf`) over OUR corpus. FastEmbed pulls onnxruntime,
+which has no wheel for this Python (3.14) yet, so this module provides a
+dependency-free BM25 term-frequency encoder with the same contract:
+`encode(text) -> {"indices": [...], "values": [...]}`.
+
+Qdrant computes IDF server-side from the stored sparse vectors regardless of how
+indices are assigned, so this is a legitimate corpus-IDF BM25 leg. The ONLY hard
+requirement is that ingest and query use the SAME encoder — they both import this
+one. For production, swap `encode()` for FastEmbed `Qdrant/bm25` (and re-index, so
+ingest and query stay on the same tokenizer).
+"""
+import hashlib
+import math
+import re
+
+# A token is a run of ASCII lowercase letters/digits; everything else separates tokens.
+_TOKEN_RE = re.compile(r"[a-z0-9]+")
+
+
+def tokenize(text: str):
+    """Lower-case ``text`` and return its ``_TOKEN_RE`` matches in order ([] for falsy input)."""
+    lowered = text.lower() if text else ""
+    return _TOKEN_RE.findall(lowered)
+
+
+def _index(token: str) -> int:
+    """Map a token to a stable unsigned 32-bit index (Qdrant sparse indices are u32).
+
+    The index is the first four bytes of the token's MD5 digest, big-endian.
+    """
+    digest = hashlib.md5(token.encode("utf-8")).digest()
+    return int.from_bytes(digest[:4], "big")
+
+
+def encode(text: str):
+    """Encode ``text`` as a sparse vector ``{"indices": [...], "values": [...]}``.
+
+    Each distinct token contributes its hashed index with weight
+    ``1 + ln(tf)`` (sublinear term frequency); IDF is applied by Qdrant via
+    modifier:idf. If two tokens hash to the same index, the later token's
+    weight replaces the earlier one.
+    """
+    counts = {}
+    for token in tokenize(text):
+        counts.setdefault(token, 0)
+        counts[token] += 1
+
+    weights = {}
+    for token, occurrences in counts.items():
+        weights[_index(token)] = 1.0 + math.log(occurrences)
+    return {"indices": list(weights), "values": list(weights.values())}