Phase 0 foundation: canonical schema, ingest pipeline, CRM MCP server

Workstream A–C substrate for the Ten31 agentic system:
- A1: docs/crm-overview.md; CLAUDE.md conventions + guardrail #9
- A2: additive/reversible core migration (canonical_entities, entity_links,
  interaction_log, relationship_edges, soft-delete) + ledgered runner
- B1/B3: chunking + deterministic entity resolution (backend/ingest)
- B2: dense (bge-m3) + BM25 sparse ingest to Qdrant crm_chunks
- C: CRM MCP server (reads, retrieval modes, logged writes) — no outbound tools
- docs: redaction/re-hydration, Gmail enablement runbook
- synthetic test data; .env.example; housekeeping (.gitignore, untrack crm.db,
  drop legacy files + start9/0.3.5)

Verified end-to-end on synthetic data + live Sparks (hybrid > dense on entity
queries). Real backfill runs on Ten31 infra; index holds synthetic data only.
Branch snapshot also captures pre-existing working-tree changes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Keysat
2026-06-05 08:11:28 -05:00
parent 7027efd777
commit c7ce44d963
99 changed files with 10676 additions and 7817 deletions
+5
View File
@@ -0,0 +1,5 @@
"""Ten31 Phase-0 ingest pipeline (entity resolution, chunking, embed, Qdrant upsert).
All modules are local-only and read the CRM by SQLite file path (CRM is canonical;
the canonical/vector layers are derived). No real data is sent to Claude here.
"""
+64
View File
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""Phase-0 Workstream B — backfill the CRM into Qdrant.
Chunk -> dense (bge-m3 via Spark Control) + sparse (BM25 client-side) -> upsert
to Qdrant `crm_chunks` with payload. Idempotent: deterministic point ids mean
re-running upserts in place. Reads the CRM by file path; never sends data to Claude.
python3 backend/ingest/backfill.py --db data/crm_dev.db --recreate
"""
import argparse
import sqlite3
import chunking
import config
import embed
import qdrant_io
import sparse
def run(db, recreate=False, batch=32):
    """Chunk the CRM database and upsert every chunk into the Qdrant collection.

    Builds chunks from the SQLite file at ``db``, makes sure the collection and
    its payload indexes exist, then embeds and upserts the chunks ``batch`` at
    a time. Each point's id is the chunk's own ``point_id``.

    Args:
        db: Path to the CRM SQLite database.
        recreate: Forwarded to ``qdrant_io.create_collection``.
        batch: Number of chunks embedded and upserted per round trip.

    Raises:
        ValueError: If ``batch`` is smaller than 1.
        RuntimeError: If the embedder returns a different number of vectors
            than texts sent. Errors raised by the embed / Qdrant helpers
            propagate unchanged.
    """
    if batch < 1:
        raise ValueError(f"batch must be >= 1, got {batch}")

    conn = sqlite3.connect(db)
    try:
        conn.row_factory = sqlite3.Row
        chunks = chunking.build_chunks(conn)
    finally:
        # Release the SQLite handle even when chunking fails part-way.
        conn.close()
    print(f"Built {len(chunks)} chunks from {db}")

    state = qdrant_io.create_collection(recreate=recreate)
    qdrant_io.ensure_indexes()
    print(f"Collection '{config.COLLECTION}': {state}")

    total = 0
    for i in range(0, len(chunks), batch):
        group = chunks[i:i + batch]
        dense = embed.dense_embed([c["text"] for c in group])
        if len(dense) != len(group):
            # zip() below would silently drop the unmatched chunks otherwise.
            raise RuntimeError(
                f"embedding count mismatch: sent {len(group)} texts, got {len(dense)} vectors")
        points = []
        for c, dv in zip(group, dense):
            sv = sparse.encode(c["text"])
            points.append({
                "id": c["point_id"],
                "vector": {"dense": dv, "sparse": {"indices": sv["indices"], "values": sv["values"]}},
                "payload": {
                    "lp_id": c["lp_id"], "lp_name": c["lp_name"], "person_id": c["person_id"],
                    "doc_type": c["doc_type"], "date_ts": c["date_ts"], "text": c["text"],
                    "source_model": c["source_model"], "source_id": c["source_id"], "chunk_key": c["chunk_key"],
                },
            })
        qdrant_io.upsert(points)
        total += len(points)
        print(f" upserted {total}/{len(chunks)}")
    print(f"Done. Qdrant '{config.COLLECTION}' now holds {qdrant_io.count()} points.")
def main():
    """Read the command-line flags and start the backfill with them."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--db", default=config.DEFAULT_DB)
    parser.add_argument("--recreate", action="store_true", help="drop & recreate the collection first")
    parser.add_argument("--batch", type=int, default=32)
    opts = parser.parse_args()
    run(opts.db, recreate=opts.recreate, batch=opts.batch)


if __name__ == "__main__":
    main()
+184
View File
@@ -0,0 +1,184 @@
"""Phase-0 Workstream B1 — chunk the CRM for retrieval.
Maps each CRM record type to one or more chunks per docs/EMBEDDINGS.md:
* one chunk per communications row (doc_type = the comm type)
* one chunk per MATCHED email (doc_type = email; body only when matched)
* one chunk per fundraising_investors notes LINE (the outreach log; split per line)
* one chunk each for free-text fields: contacts.notes, lp_profiles.notes,
opportunities (description + next_step), organizations.description
Each chunk carries a canonical `lp_id` (resolved via entity_links) and a `date_ts`
(epoch of the EVENT time, not created_at) so Qdrant can pre-filter and recency-rank.
Entities/names/dates/types are payload (filterable); only prose is embedded.
A chunk's stable `chunk_key` -> deterministic point id (uuid5), so re-ingest
upserts in place (idempotent).
"""
import sqlite3
import uuid
from datetime import datetime, timezone
# Namespace fed to uuid5 by _point_id. The value equals uuid.NAMESPACE_URL; changing it
# changes every point id derived from a chunk_key.
_NS = uuid.UUID("6ba7b811-9dad-11d1-80b4-00c04fd430c8") # uuid5 namespace for chunk ids
def to_epoch(ts: str):
    """Convert a timestamp string to integer epoch seconds.

    The stripped value is parsed as ISO-8601 (a trailing ``Z`` is read as
    ``+00:00``). When that fails, the first ten characters of the stripped
    value are tried as a ``YYYY-MM-DD`` date. Values without an offset are
    treated as UTC.

    Args:
        ts: Timestamp text; may be None or empty.

    Returns:
        Epoch seconds as an int, or None when ``ts`` is empty or unparseable.
    """
    if not ts:
        return None
    s = ts.strip().replace("Z", "+00:00")
    try:
        dt = datetime.fromisoformat(s)
    except (ValueError, OverflowError):
        # Not a full ISO timestamp: fall back to the leading date of the
        # *stripped* text, so surrounding whitespace cannot defeat the slice.
        try:
            dt = datetime.strptime(s[:10], "%Y-%m-%d")
        except ValueError:
            return None
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return int(dt.timestamp())
def _point_id(chunk_key: str) -> str:
    """Return the uuid5 of ``chunk_key`` under ``_NS``, rendered as a string."""
    derived = uuid.uuid5(_NS, chunk_key)
    return str(derived)
def _mk(chunk_key, lp_id, lp_name, person_id, doc_type, date_ts, text, source_model, source_id):
    """Assemble one chunk dict, or return None when the chunk cannot be used.

    A chunk is rejected when its text is blank after stripping or when
    ``lp_id`` is falsy. The stored text is the stripped text, and ``point_id``
    is derived from ``chunk_key``.
    """
    body = (text or "").strip()
    if not (body and lp_id):
        return None
    chunk = {
        "chunk_key": chunk_key,
        "point_id": _point_id(chunk_key),
        "lp_id": lp_id,
        "lp_name": lp_name,
        "person_id": person_id,
        "doc_type": doc_type,
        "date_ts": date_ts,
        "text": body,
        "source_model": source_model,
        "source_id": source_id,
    }
    return chunk
def _canon_maps(conn):
    """Load the lookup tables used to resolve CRM rows to canonical entities.

    Returns a 5-tuple of dicts:
        person_canon   contacts.id               -> canonical id
        org_canon      organizations.id          -> canonical id
        inv_canon      fundraising_investors.id  -> canonical id
        name           canonical_entities.id     -> display_name
        contact_org    contacts.id               -> organization_id
    Links whose source_model is none of the three above are ignored.
    """
    person_canon, org_canon, inv_canon = {}, {}, {}
    by_model = {
        "contacts": person_canon,
        "organizations": org_canon,
        "fundraising_investors": inv_canon,
    }
    links = conn.execute("SELECT source_model, source_id, canonical_id FROM entity_links")
    for link in links:
        target = by_model.get(link["source_model"])
        # "is not None" on purpose: the target dicts start out empty (falsy).
        if target is not None:
            target[link["source_id"]] = link["canonical_id"]

    name = {}
    for row in conn.execute("SELECT id, display_name FROM canonical_entities"):
        name[row["id"]] = row["display_name"]

    contact_org = {}
    for row in conn.execute("SELECT id, organization_id FROM contacts"):
        contact_org[row["id"]] = row["organization_id"]

    return person_canon, org_canon, inv_canon, name, contact_org
def _contact_lp(cid, person_canon, org_canon, name, contact_org):
    """Pick the lp for a contact-anchored chunk.

    Returns ``(lp_id, lp_name, person_id)``: the lp is the canonical id of the
    contact's organization when one resolves, otherwise the contact's own
    person id; ``person_id`` is always the contact's person id (or None).
    """
    person = person_canon.get(cid)
    org_id = contact_org.get(cid)
    firm = org_canon.get(org_id)
    lp = firm if firm else person
    return lp, name.get(lp), person
def build_chunks(conn):
    """Collect every usable chunk from the CRM database behind ``conn``.

    Sources, in output order: communications, contacts.notes,
    lp_profiles.notes, opportunities, organizations.description,
    fundraising_investors.notes (one chunk per non-blank line) and, when both
    the ``emails`` and ``email_investor_links`` tables exist, matched emails.
    Rows are read by column name, so ``conn.row_factory`` must yield
    name-addressable rows.

    Returns:
        A list of chunk dicts as built by ``_mk``; candidates that ``_mk``
        rejects (it returns None for them) are left out.
    """
    person_canon, org_canon, inv_canon, name, contact_org = _canon_maps(conn)
    found = []

    def keep(candidate):
        # _mk hands back None for rejected chunks; only real chunks are stored.
        if candidate:
            found.append(candidate)

    def anchor(contact_id):
        return _contact_lp(contact_id, person_canon, org_canon, name, contact_org)

    def joined(*fields):
        # Newline-join the fields holding something other than whitespace.
        return "\n".join(f for f in fields if (f or "").strip())

    # communications
    comm_sql = (
        "SELECT id, contact_id, type, subject, body, outcome, next_action, communication_date\n"
        "FROM communications"
    )
    for row in conn.execute(comm_sql):
        lp, lp_name, person = anchor(row["contact_id"])
        keep(_mk(f"communications:{row['id']}", lp, lp_name, person,
                 row["type"] or "note", to_epoch(row["communication_date"]),
                 joined(row["subject"], row["body"], row["outcome"], row["next_action"]),
                 "communications", row["id"]))

    # contacts.notes
    for row in conn.execute("SELECT id, notes, updated_at FROM contacts WHERE notes IS NOT NULL AND notes <> ''"):
        lp, lp_name, person = anchor(row["id"])
        keep(_mk(f"contacts.notes:{row['id']}", lp, lp_name, person,
                 "contact_note", to_epoch(row["updated_at"]), row["notes"], "contacts", row["id"]))

    # lp_profiles.notes
    profile_sql = (
        "SELECT lp.id, lp.contact_id, lp.notes, lp.updated_at\n"
        "FROM lp_profiles lp WHERE lp.notes IS NOT NULL AND lp.notes <> ''"
    )
    for row in conn.execute(profile_sql):
        lp, lp_name, person = anchor(row["contact_id"])
        keep(_mk(f"lp_profiles.notes:{row['id']}", lp, lp_name, person,
                 "lp_note", to_epoch(row["updated_at"]), row["notes"], "lp_profiles", row["id"]))

    # opportunities (name + description + next_step)
    opp_sql = (
        "SELECT id, contact_id, name, description, next_step, updated_at\n"
        "FROM opportunities"
    )
    for row in conn.execute(opp_sql):
        lp, lp_name, person = anchor(row["contact_id"])
        keep(_mk(f"opportunities:{row['id']}", lp, lp_name, person,
                 "opportunity", to_epoch(row["updated_at"]),
                 joined(row["name"], row["description"], row["next_step"]),
                 "opportunities", row["id"]))

    # organizations.description
    org_sql = (
        "SELECT id, description, updated_at FROM organizations\n"
        "WHERE description IS NOT NULL AND description <> ''"
    )
    for row in conn.execute(org_sql):
        lp = org_canon.get(row["id"])
        keep(_mk(f"organizations.description:{row['id']}", lp, name.get(lp), None,
                 "org_note", to_epoch(row["updated_at"]), row["description"], "organizations", row["id"]))

    # fundraising_investors.notes: running outreach log, one chunk per non-blank line
    investor_sql = (
        "SELECT id, notes, updated_at FROM fundraising_investors\n"
        "WHERE notes IS NOT NULL AND notes <> ''"
    )
    for row in conn.execute(investor_sql):
        lp = inv_canon.get(row["id"])
        for line_no, line in enumerate(str(row["notes"]).splitlines()):
            if not line.strip():
                continue
            keep(_mk(f"fundraising_investors.notes:{row['id']}:{line_no}", lp, name.get(lp), None,
                     "outreach_note", to_epoch(row["updated_at"]), line,
                     "fundraising_investors", row["id"]))

    # matched emails, keyed to an lp through email_investor_links
    if _has_table(conn, "emails") and _has_table(conn, "email_investor_links"):
        email_sql = "SELECT id, subject, body_text, snippet, sent_at FROM emails WHERE is_matched=1"
        for row in conn.execute(email_sql):
            lp, lp_name = _email_lp(conn, row["id"], inv_canon, org_canon, person_canon, name)
            text = joined(row["subject"], row["body_text"] or row["snippet"])
            keep(_mk(f"emails:{row['id']}", lp, lp_name, None, "email",
                     to_epoch(row["sent_at"]), text, "emails", row["id"]))

    return found
def _has_table(conn, name):
    """Return True when ``sqlite_master`` lists a table called ``name``."""
    hit = conn.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?", (name,)
    ).fetchone()
    return hit is not None
def _email_lp(conn, email_id, inv_canon, org_canon, person_canon, name):
    """Resolve ``(lp_id, lp_name)`` for an email from its highest-confidence link.

    Only the ``email_investor_links`` row with the greatest match_confidence is
    consulted. Within that row the first id that resolves wins, tried in the
    order fundraising investor, contact, organization. Returns
    ``(None, None)`` when the email has no link row.
    """
    query = (
        "SELECT fundraising_investor_id, contact_id, organization_id\n"
        "FROM email_investor_links WHERE email_id=? ORDER BY match_confidence DESC LIMIT 1"
    )
    link = conn.execute(query, (email_id,)).fetchone()
    if not link:
        return None, None
    lp = inv_canon.get(link["fundraising_investor_id"])
    if not lp:
        lp = person_canon.get(link["contact_id"])
    if not lp:
        lp = org_canon.get(link["organization_id"])
    return lp, name.get(lp)
if __name__ == "__main__":
    # Ad-hoc inspection: print chunk counts per doc_type and one sample chunk.
    import argparse
    from collections import Counter
    from config import DEFAULT_DB

    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=DEFAULT_DB)
    args = ap.parse_args()

    conn = sqlite3.connect(args.db)
    try:
        conn.row_factory = sqlite3.Row
        chunks = build_chunks(conn)
    finally:
        # Close the database handle whether or not chunking succeeded.
        conn.close()

    print(f"{len(chunks)} chunks from {args.db}")
    for dt, n in Counter(c["doc_type"] for c in chunks).most_common():
        print(f" {dt:<16} {n}")
    unresolved = sum(1 for c in chunks if not c["lp_id"])
    print(f" (all chunks have an lp_id: {unresolved == 0})")
    if chunks:
        print("\nSample chunk:")
        s = chunks[0]
        # Show at most the first 80 characters of the text field.
        print({k: (v[:80] if k == "text" and v else v) for k, v in s.items()})
    else:
        # An empty database yields no chunks; there is nothing to sample.
        print("\n(no chunks to sample)")
+28
View File
@@ -0,0 +1,28 @@
"""Ingest config — loads .env and exposes the Spark/Qdrant/CRM settings."""
import os
# Absolute path of the directory two levels above the one holding this file;
# used below to locate the default .env file and the default database.
_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def load_env(path=None):
    """Load ``KEY=VALUE`` lines from a .env file into ``os.environ``.

    Variables already present in the environment are never overwritten. Blank
    lines, ``#`` comment lines and lines without ``=`` are skipped, as are
    lines whose key is empty. A leading ``export `` on the key is dropped and
    one pair of matching surrounding quotes is removed from the value.

    Args:
        path: File to read; defaults to ``.env`` under ``_ROOT``. A missing
            file is not an error.
    """
    path = path or os.path.join(_ROOT, ".env")
    if not os.path.exists(path):
        return
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            key = key.strip()
            if key.startswith("export "):
                # Shell-style "export KEY=value" lines name the variable KEY.
                key = key[len("export "):].strip()
            if not key:
                # An empty variable name is not a valid os.environ key.
                continue
            value = value.strip()
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                # KEY="value" should yield value, not the quoted literal.
                value = value[1:-1]
            os.environ.setdefault(key, value)
# Runs at import time: seed os.environ from the default .env file (if present)
# before the settings below are read. Existing variables take precedence.
load_env()
# Spark Control base URL with any trailing "/" removed; empty string when unset.
SPARK_CONTROL_URL = os.environ.get("SPARK_CONTROL_URL", "").rstrip("/")
# TLS verification for Spark Control is OFF unless the variable is 1/true/yes/on (case-insensitive).
SPARK_VERIFY_TLS = os.environ.get("SPARK_CONTROL_VERIFY_TLS", "false").lower() in ("1", "true", "yes", "on")
# Qdrant base URL with any trailing "/" removed; empty string when unset.
QDRANT_URL = os.environ.get("QDRANT_URL", "").rstrip("/")
# Name of the Qdrant collection the pipeline reads and writes.
COLLECTION = os.environ.get("CRM_QDRANT_COLLECTION", "crm_chunks")
# Model name sent with embedding requests.
EMBED_MODEL = os.environ.get("CRM_EMBED_MODEL", "BAAI/bge-m3")
# Dense vector size; int() raises ValueError at import if the variable is not an integer.
DENSE_DIM = int(os.environ.get("CRM_EMBED_DIM", "1024"))
# Default SQLite database path for the command-line tools.
DEFAULT_DB = os.environ.get("CRM_DEV_DB_PATH", os.path.join(_ROOT, "data", "crm_dev.db"))
+17
View File
@@ -0,0 +1,17 @@
"""Dense embeddings via Spark Control /v1/embeddings (BAAI/bge-m3, 1024-d)."""
import config
import http_util
def dense_embed(texts, batch=32):
    """Embed ``texts`` via Spark Control ``/v1/embeddings``.

    Texts are sent ``batch`` at a time. Within each response the rows are
    ordered by their ``index`` field before their ``embedding`` values are
    appended, so the returned list follows the input order.

    Raises:
        RuntimeError: If any request returns a status other than 200.
    """
    endpoint = f"{config.SPARK_CONTROL_URL}/v1/embeddings"
    vectors = []
    for offset in range(0, len(texts), batch):
        payload = {"input": texts[offset:offset + batch], "model": config.EMBED_MODEL}
        status, data = http_util.request("POST", endpoint, payload, verify=config.SPARK_VERIFY_TLS)
        if status != 200:
            raise RuntimeError(f"/v1/embeddings -> {status}: {data}")
        ordered = sorted(data["data"], key=lambda item: item["index"])
        vectors += [item["embedding"] for item in ordered]
    return vectors
+258
View File
@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""Phase-0 Workstream B3 / A4 — entity resolution (deterministic tier).
Collapses the CRM's two parallel investor models into the canonical identity
layer created by migration 0001:
organizations ─┐
fundraising_investors ─┴─► canonical_entities (entity_kind = lp | organization)
contacts ─┐
fundraising_contacts ─┴─► canonical_entities (entity_kind = person)
lp_profiles ───► linked to its contact's person entity
Every source row is recorded in `entity_links` so any name variant resolves to
one canonical id. This is the DETERMINISTIC tier — it merges only what we can
prove (exact email; exact normalized name within the same canonical org). The
HARD cases (nicknames like "Jon" vs "Jonathan", typos) are NOT guessed; they are
emitted as *fuzzy candidates* for the local-Qwen tier (Spark Control
/v1/chat/completions) to adjudicate later. Honest separation: we never silently
merge on a guess.
Properties:
* Local-only, read-mostly: reads CRM source tables, writes only the derived
canonical_entities / entity_links and an interaction_log audit row. Never
mutates a CRM source record (guardrail #2/#3).
* Idempotent: canonical ids are deterministic (sha1 of the resolution key), so
re-running upserts in place and keeps ids stable across runs — which keeps
downstream Qdrant point ids valid (no churn on re-embed).
* Logged: writes one interaction_log row per run (guardrail #5).
Usage:
python3 backend/ingest/entity_resolution.py --db data/crm_dev.db
python3 backend/ingest/entity_resolution.py --db data/crm_dev.db --show-candidates
"""
import argparse
import hashlib
import json
import re
import sqlite3
import uuid
from collections import defaultdict
from datetime import datetime, timezone
# ── normalization ─────────────────────────────────────────────────────────────
def norm_text(s: str) -> str:
    """Normalize text for matching: lowercase, punctuation to spaces, whitespace collapsed.

    None and empty input both give ``""``.
    """
    lowered = (s or "").strip().lower()
    without_punct = re.sub(r"[^\w\s]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", without_punct)
    return collapsed.strip()
def norm_email(s: str) -> str:
    """Return the email stripped and lowercased; ``""`` for None or empty input."""
    if not s:
        return ""
    return s.strip().lower()
def _eid(prefix: str, key: str) -> str:
    """Build a canonical id: ``prefix``, an underscore, then the first 12 hex
    digits of the SHA-1 of ``key`` (UTF-8). The same key always gives the same id."""
    digest = hashlib.sha1(key.encode("utf-8")).hexdigest()
    return "{}_{}".format(prefix, digest[:12])
def _now() -> str:
    """Return the current UTC time as an ISO-8601 string with offset."""
    stamp = datetime.now(tz=timezone.utc)
    return stamp.isoformat()
def _split_name(full: str):
    """Split a full name into ``(first, last)`` using its normalized tokens.

    ``last`` is ``""`` for a single-token name; both are ``""`` when the name
    normalizes to nothing.
    """
    tokens = norm_text(full).split()
    if len(tokens) == 0:
        return "", ""
    first = tokens[0]
    last = tokens[-1] if len(tokens) > 1 else ""
    return first, last
# ── upsert helpers ────────────────────────────────────────────────────────────
def _upsert_entity(conn, eid, kind, display_name, primary_email):
    """Insert a canonical entity, or refresh it when ``eid`` already exists.

    On conflict the display name, kind and updated_at are overwritten; the
    stored primary_email is kept when the new one is empty (passed as NULL).
    """
    sql = (
        "\n"
        "INSERT INTO canonical_entities (id, entity_kind, display_name, primary_email, source, created_at, updated_at)\n"
        "VALUES (?, ?, ?, ?, 'entity_resolution', ?, ?)\n"
        "ON CONFLICT(id) DO UPDATE SET\n"
        "display_name = excluded.display_name,\n"
        "primary_email = COALESCE(excluded.primary_email, canonical_entities.primary_email),\n"
        "entity_kind = excluded.entity_kind,\n"
        "updated_at = excluded.updated_at\n"
    )
    email_or_null = primary_email or None
    created_at = _now()
    updated_at = _now()
    conn.execute(sql, (eid, kind, display_name, email_or_null, created_at, updated_at))
def _link(conn, canonical_id, source_model, source_id, match_value, match_kind, confidence):
    """Record that a source row resolves to ``canonical_id``.

    A new link gets a random uuid4 id. When a link with the same
    (source_model, source_id, match_value) exists, its canonical_id,
    match_kind and confidence are overwritten instead.
    """
    sql = (
        "\n"
        "INSERT INTO entity_links (id, canonical_id, source_model, source_id, match_value, match_kind, confidence, created_at)\n"
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?)\n"
        "ON CONFLICT(source_model, source_id, match_value) DO UPDATE SET\n"
        "canonical_id = excluded.canonical_id,\n"
        "match_kind = excluded.match_kind,\n"
        "confidence = excluded.confidence\n"
    )
    link_id = str(uuid.uuid4())
    params = (link_id, canonical_id, source_model, source_id,
              match_value, match_kind, confidence, _now())
    conn.execute(sql, params)
# ── resolution passes ─────────────────────────────────────────────────────────
def resolve_organizations(conn):
    """Group organizations and fundraising_investors by normalized name and
    write one canonical entity plus links per group.

    A group containing at least one fundraising_investors row becomes an
    ``lp`` entity (id prefix ``lp``); any other group becomes an
    ``organization`` (id prefix ``org``). Rows whose name normalizes to
    nothing are skipped.

    Returns:
        ``(org_canon_by_orgid, org_canon_by_fundinv)``: canonical id keyed by
        organizations.id and by fundraising_investors.id respectively.
    """
    groups = {}

    def bucket(norm_key):
        return groups.setdefault(norm_key, {"orgs": [], "investors": [], "name": "", "email": ""})

    for org in conn.execute("SELECT id, name, email FROM organizations"):
        key = norm_text(org["name"])
        if not key:
            continue
        g = bucket(key)
        g["orgs"].append(org["id"])
        # Keep the longest spelling seen as the display name.
        if len(org["name"] or "") > len(g["name"]):
            g["name"] = org["name"]
        # First non-blank email wins.
        if not g["email"] and (org["email"] or "").strip():
            g["email"] = org["email"].strip()

    for inv in conn.execute("SELECT id, investor_name FROM fundraising_investors"):
        key = norm_text(inv["investor_name"])
        if not key:
            continue
        g = bucket(key)
        g["investors"].append(inv["id"])
        # Only used as display name when no organization supplied one.
        if not g["name"]:
            g["name"] = inv["investor_name"]

    org_canon_by_orgid = {}
    org_canon_by_fundinv = {}
    for key, g in groups.items():
        if g["investors"]:
            kind, prefix = "lp", "lp"
        else:
            kind, prefix = "organization", "org"
        cid = _eid(prefix, key)
        _upsert_entity(conn, cid, kind, g["name"], g["email"])
        for org_id in g["orgs"]:
            _link(conn, cid, "organizations", org_id, key, "exact_name", 1.0)
            org_canon_by_orgid[org_id] = cid
        for inv_id in g["investors"]:
            _link(conn, cid, "fundraising_investors", inv_id, key, "exact_name", 1.0)
            org_canon_by_fundinv[inv_id] = cid
    return org_canon_by_orgid, org_canon_by_fundinv
def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv):
    """Write person entities and links for contacts and fundraising_contacts.

    A row with an email is keyed by that email; otherwise by its normalized
    name together with its canonical org id (empty when unknown). Rows with
    neither are skipped. Each lp_profiles row is then linked to the person
    entity of its contact, when that contact was resolved.

    Returns:
        ``person_meta``: canonical person id -> dict with ``org``, ``last``
        (normalized surname), ``name`` (display name) and ``email``.
    """
    # (source model, source id, full name, normalized email, canonical org id)
    people = []
    for c in conn.execute("SELECT id, first_name, last_name, email, organization_id FROM contacts"):
        full_name = "{} {}".format(c["first_name"] or "", c["last_name"] or "").strip()
        people.append((
            "contacts", c["id"], full_name, norm_email(c["email"]),
            org_canon_by_orgid.get(c["organization_id"]),
        ))
    for fc in conn.execute("SELECT id, full_name, email, investor_id FROM fundraising_contacts"):
        people.append((
            "fundraising_contacts", fc["id"], fc["full_name"] or "", norm_email(fc["email"]),
            org_canon_by_fundinv.get(fc["investor_id"]),
        ))

    contact_to_person = {}
    person_meta = {}
    for model, sid, full, email, org_canon in people:
        name_norm = norm_text(full)
        if email:
            key, match_kind, conf, match_value = f"e|{email}", "exact_email", 1.0, email
        elif name_norm:
            key, match_kind, conf, match_value = f"n|{name_norm}|{org_canon or ''}", "name_org", 0.8, name_norm
        else:
            continue

        cid = _eid("per", key)
        display = full.strip() or email
        _upsert_entity(conn, cid, "person", display, email)
        _link(conn, cid, model, sid, match_value, match_kind, conf)
        if model == "contacts":
            contact_to_person[sid] = cid

        # The first row seen for an entity fixes its metadata...
        if cid not in person_meta:
            person_meta[cid] = {"org": org_canon, "last": _split_name(full)[1],
                                "name": display, "email": email}
        meta = person_meta[cid]
        # ...except that a later row may fill in a missing org.
        if org_canon and not meta["org"]:
            meta["org"] = org_canon

    for profile in conn.execute("SELECT id, contact_id FROM lp_profiles"):
        person_id = contact_to_person.get(profile["contact_id"])
        if not person_id:
            continue
        _link(conn, person_id, "lp_profiles", profile["id"], profile["contact_id"], "contact_fk", 1.0)
    return person_meta
def find_fuzzy_candidates(person_meta):
    """List groups of distinct person entities sharing both org and surname.

    Entities lacking an org or a surname are ignored. Nothing is merged here;
    each returned dict has ``org``, ``surname`` and ``members``, a list of
    ``(canonical_id, name, email)`` tuples with more than one entry.
    """
    buckets = defaultdict(list)
    for cid, meta in person_meta.items():
        if not meta["org"] or not meta["last"]:
            continue
        buckets[(meta["org"], meta["last"])].append((cid, meta["name"], meta["email"]))

    candidates = []
    for (org, surname), members in buckets.items():
        if len(members) > 1:
            candidates.append({"org": org, "surname": surname, "members": members})
    return candidates
def run(db_path: str):
    """Run the deterministic resolution passes against the database at ``db_path``.

    Resolves organizations, then people, committing after each pass; collects
    fuzzy candidates; writes one ``interaction_log`` row whose payload is the
    JSON of the summary counts; commits again.

    The connection is always closed, including when a pass or a count query
    raises, so a failed run does not leave the database handle open.

    Returns:
        ``(counts, candidates)``: the summary count dict and the list from
        ``find_fuzzy_candidates``.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys=ON")

        org_by_oid, org_by_inv = resolve_organizations(conn)
        conn.commit()
        person_meta = resolve_people(conn, org_by_oid, org_by_inv)
        conn.commit()
        candidates = find_fuzzy_candidates(person_meta)

        def scalar(sql):
            # First column of the first row of a single-value query.
            return conn.execute(sql).fetchone()[0]

        counts = {
            "canonical_total": scalar("SELECT COUNT(*) FROM canonical_entities"),
            "lp": scalar("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp'"),
            "organization": scalar("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization'"),
            "person": scalar("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person'"),
            "links": scalar("SELECT COUNT(*) FROM entity_links"),
            "fuzzy_candidates": len(candidates),
        }

        # One audit row per run, carrying the counts as its payload.
        conn.execute(
            "\n"
            "INSERT INTO interaction_log (id, ts, actor_type, actor_id, action, target_type, payload, source, created_at)\n"
            "VALUES (?, ?, 'system', 'entity_resolver', 'entity_resolution.run', 'canonical_entities', ?, 'ingest', ?)\n",
            (str(uuid.uuid4()), _now(), json.dumps(counts), _now()),
        )
        conn.commit()
    finally:
        conn.close()
    return counts, candidates
def main():
    """Command-line entry point: run the resolver and print its summary counts,
    plus the fuzzy candidates when ``--show-candidates`` is given."""
    parser = argparse.ArgumentParser(description="Deterministic entity resolution into the canonical layer.")
    parser.add_argument("--db", default="data/crm_dev.db", help="path to the CRM SQLite DB")
    parser.add_argument("--show-candidates", action="store_true", help="print fuzzy merge candidates")
    opts = parser.parse_args()

    counts, candidates = run(opts.db)
    print(f"Entity resolution on {opts.db}:")
    for label, value in counts.items():
        print(f" {label:<18} {value}")

    if not (opts.show_candidates and candidates):
        return
    print("\nFuzzy candidates (same org + surname, different person — for the local-Qwen tier):")
    for cand in candidates:
        rendered = []
        for _, person_name, email in cand["members"]:
            suffix = (' <' + email + '>') if email else ''
            rendered.append(f"{person_name!r}{suffix}")
        names = ", ".join(rendered)
        print(f" [{cand['surname']}] {names}")


if __name__ == "__main__":
    main()
+35
View File
@@ -0,0 +1,35 @@
"""Tiny stdlib JSON HTTP client (no third-party deps).
Handles the Spark Control self-signed cert (verify=False) and plain-HTTP Qdrant.
"""
import json
import ssl
import urllib.error
import urllib.request
def _ctx(verify: bool):
    """Return the SSL context for a request.

    None when ``verify`` is true (the caller then uses the library default);
    otherwise a context with hostname checking and certificate verification
    both turned off.
    """
    if not verify:
        insecure = ssl.create_default_context()
        insecure.check_hostname = False
        insecure.verify_mode = ssl.CERT_NONE
        return insecure
    return None
def request(method: str, url: str, body=None, verify: bool = True, timeout: int = 180):
    """Send a JSON request and return ``(status, parsed_body)``.

    ``body`` (when not None) is JSON-encoded as the request payload. The
    ``verify`` flag only matters for URLs starting with "https". A successful
    response is parsed as JSON (empty body -> ``{}``). An HTTP error response
    is returned, not raised: its body is parsed as JSON when possible,
    otherwise wrapped as ``{"raw": <text>}``. Other exceptions propagate.
    """
    payload = None if body is None else json.dumps(body).encode("utf-8")
    req = urllib.request.Request(
        url,
        data=payload,
        method=method,
        headers={"Content-Type": "application/json"},
    )
    is_https = url.lower().startswith("https")
    ssl_context = _ctx(verify) if is_https else None
    try:
        with urllib.request.urlopen(req, timeout=timeout, context=ssl_context) as resp:
            raw = resp.read()
            parsed = json.loads(raw) if raw else {}
            return resp.status, parsed
    except urllib.error.HTTPError as exc:
        raw = exc.read()
        if not raw:
            return exc.code, {}
        try:
            return exc.code, json.loads(raw)
        except Exception:
            # Error bodies are not always JSON; hand back the text instead.
            return exc.code, {"raw": raw.decode("utf-8", "replace")}
+50
View File
@@ -0,0 +1,50 @@
"""Minimal Qdrant REST client for the ingest pipeline (direct to QDRANT_URL).
Creates the crm_chunks collection per EMBEDDINGS.md: a named dense vector
(1024, Cosine) + a named sparse vector with modifier:idf, plus payload indexes.
"""
import config
import http_util
# Short aliases captured once at import time: the Qdrant base URL and the
# collection name every helper in this module operates on.
Q = config.QDRANT_URL
COL = config.COLLECTION
def _req(method, path, body=None):
    """Send one request to Qdrant at ``Q`` + ``path`` and return ``(status, data)``.

    Certificate verification is always disabled for these calls.
    """
    url = f"{Q}{path}"
    return http_util.request(method, url, body, verify=False)
def exists() -> bool:
    """Return True when fetching the collection answers with status 200."""
    status, _body = _req("GET", f"/collections/{COL}")
    if status == 200:
        return True
    return False
def create_collection(recreate=False, dim=config.DENSE_DIM):
    """Make sure the collection exists; return ``"exists"`` or ``"created"``.

    An existing collection is kept as-is unless ``recreate`` is true, in which
    case it is deleted first. A new collection gets a named dense vector
    (``dim``, Cosine) and a named sparse vector with the idf modifier.

    Raises:
        RuntimeError: If the create request returns a status other than 200/201.
    """
    already_there = exists()
    if already_there and not recreate:
        return "exists"
    if already_there:
        _req("DELETE", f"/collections/{COL}")

    schema = {
        "vectors": {"dense": {"size": dim, "distance": "Cosine"}},
        "sparse_vectors": {"sparse": {"modifier": "idf"}},
    }
    status, data = _req("PUT", f"/collections/{COL}", schema)
    if status in (200, 201):
        return "created"
    raise RuntimeError(f"create collection -> {status}: {data}")
def ensure_indexes():
    """Request payload indexes for lp_id, doc_type (keyword) and date_ts (integer).

    The responses are not inspected.
    """
    wanted = {"lp_id": "keyword", "doc_type": "keyword", "date_ts": "integer"}
    for field_name, field_schema in wanted.items():
        spec = {"field_name": field_name, "field_schema": field_schema}
        _req("PUT", f"/collections/{COL}/index", spec)
def upsert(points):
    """Upsert ``points`` into the collection with ``wait=true`` and return the response body.

    Raises:
        RuntimeError: If the response status is not 200 or 201.
    """
    status, data = _req("PUT", f"/collections/{COL}/points?wait=true", {"points": points})
    if status in (200, 201):
        return data
    raise RuntimeError(f"upsert -> {status}: {data}")
def count():
    """Return the exact point count reported for the collection, or None when
    the response carries no ``result.count``. The status code is not checked."""
    _status, data = _req("POST", f"/collections/{COL}/points/count", {"exact": True})
    body = data or {}
    result = body.get("result", {})
    return result.get("count")
+109
View File
@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""Phase-0 retrieval — thin wrappers over Spark Control /api/search.
These are the retrieval modes the CRM MCP server (Workstream C) will expose:
* semantic_search — dense only (omit sparse), high recall
* hybrid_search — dense + BM25 sparse (RRF) + rerank; best for entity queries
* keyword_search — lean on the sparse leg
All support a Qdrant `filter` (e.g. lp_id / date_ts range) to pre-filter.
`--demo` runs an entity-heavy query in dense-only vs hybrid to show the BM25
lexical leg surfacing the right LP. The query's sparse vector uses the SAME
encoder as ingest (sparse.encode).
"""
import argparse
import config
import http_util
import sparse
def _search(query, sparse_vec=None, rerank=False, top_k=5, lp_id=None, retrieve_n=80, filt=None):
    """POST one search to Spark Control ``/api/search`` and return its ``data`` list.

    A sparse leg is included only when ``sparse_vec`` is given. An explicit
    ``filt`` is sent as the filter unchanged; otherwise a truthy ``lp_id``
    becomes a must-match filter on the ``lp_id`` payload field.

    Raises:
        RuntimeError: If the response status is not 200.
    """
    if filt is not None:
        query_filter = filt
    elif lp_id:
        query_filter = {"must": [{"key": "lp_id", "match": {"value": lp_id}}]}
    else:
        query_filter = None

    payload = {
        "query": query,
        "collection": config.COLLECTION,
        "top_k": top_k,
        "retrieve_n": retrieve_n,
        "fusion": "rrf",
        "text_field": "text",
        "with_payload": True,
        "rerank": rerank,
    }
    if sparse_vec is not None:
        payload["sparse"] = {"indices": sparse_vec["indices"], "values": sparse_vec["values"]}
    if query_filter is not None:
        payload["filter"] = query_filter

    endpoint = f"{config.SPARK_CONTROL_URL}/api/search"
    status, data = http_util.request("POST", endpoint, payload, verify=config.SPARK_VERIFY_TLS)
    if status != 200:
        raise RuntimeError(f"/api/search -> {status}: {data}")
    return data.get("data", [])
def semantic_search(query, **kw):
    """Dense-only search: no sparse vector is sent; rerank defaults to off.

    Remaining keyword arguments are forwarded to ``_search``.
    """
    use_rerank = kw.pop("rerank", False)
    return _search(query, sparse_vec=None, rerank=use_rerank, **kw)
def hybrid_search(query, **kw):
    """Search with the query's sparse encoding attached; rerank defaults to on.

    Remaining keyword arguments are forwarded to ``_search``.
    """
    query_sparse = sparse.encode(query)
    use_rerank = kw.pop("rerank", True)
    return _search(query, sparse_vec=query_sparse, rerank=use_rerank, **kw)
def keyword_search(query, **kw):
    """Search with the query's sparse encoding attached; rerank defaults to on.

    As written this sends the same request as ``hybrid_search``. Remaining
    keyword arguments are forwarded to ``_search``.
    """
    query_sparse = sparse.encode(query)
    use_rerank = kw.pop("rerank", True)
    return _search(query, sparse_vec=query_sparse, rerank=use_rerank, **kw)
def _row(r):
    """Format one search hit as ``<lp_name> [<doc_type>] <text>`` for printing.

    The text comes from the hit's ``text`` field, else the payload's, with
    newlines flattened to spaces and cut to 58 characters. A missing, null or
    empty ``lp_name`` / ``doc_type`` is shown as ``?``; a null value would
    otherwise make the width format spec raise TypeError.
    """
    payload = r.get("payload") or {}
    text = (r.get("text") or payload.get("text") or "").replace("\n", " ")
    lp_name = payload.get("lp_name") or "?"
    doc_type = payload.get("doc_type") or "?"
    return f"{lp_name:<22} [{doc_type:<13}] {text[:58]}"
def _print(title, rows):
    """Print ``title`` followed by one numbered, scored line per hit in ``rows``
    (or a "no results" marker when ``rows`` is empty)."""
    print(f"\n {title}")
    if not rows:
        print(" (no results)")
    rank = 0
    for hit in rows:
        rank += 1
        print(f" {rank}. score={hit.get('score', 0):+.3f} {_row(hit)}")
def demo():
    """Run one fixed entity-heavy query dense-only and hybrid, print both result
    lists and the rank of the first target-LP hit in each, then repeat the
    hybrid search pre-filtered to that LP when its id was found."""
    target = "Cedar Point Capital"
    q = "Fund III diligence and wire timeline for Cedar Point"
    print(f"QUERY: {q!r}\nTarget LP: {target}")

    dense = semantic_search(q, top_k=5)
    # Rerank is switched off here so the effect of the sparse leg is visible.
    hybrid = hybrid_search(q, top_k=5, rerank=False)
    _print("dense-only (semantic):", dense)
    _print("hybrid (dense + BM25 RRF):", hybrid)

    def payload_of(hit):
        return hit.get("payload", {}) or {}

    def first_rank(rows):
        # 1-based position of the first hit for the target LP, or None.
        return next(
            (pos for pos, hit in enumerate(rows, 1) if payload_of(hit).get("lp_name") == target),
            None,
        )

    print(f"\n First '{target}' chunk — dense rank: {first_rank(dense)}, hybrid rank: {first_rank(hybrid)}")

    # Pre-filter demo: same query, restricted to one LP's chunks.
    match = next((payload_of(hit) for hit in hybrid if payload_of(hit).get("lp_name") == target), None)
    lp_id = match.get("lp_id") if match is not None else None
    if lp_id:
        _print(f"hybrid + payload pre-filter (lp_id={lp_id}):",
               hybrid_search(q, top_k=5, rerank=True, lp_id=lp_id))
def main():
    """Command-line entry point: run the demo when ``--demo`` is given or no
    query is supplied, otherwise run the query in the chosen mode and print it."""
    parser = argparse.ArgumentParser()
    parser.add_argument("query", nargs="?")
    parser.add_argument("--mode", choices=["semantic", "hybrid", "keyword"], default="hybrid")
    parser.add_argument("--top-k", type=int, default=5)
    parser.add_argument("--lp-id")
    parser.add_argument("--demo", action="store_true")
    opts = parser.parse_args()

    wants_demo = opts.demo or not opts.query
    if wants_demo:
        return demo()

    searchers = {"semantic": semantic_search, "hybrid": hybrid_search, "keyword": keyword_search}
    search = searchers[opts.mode]
    hits = search(opts.query, top_k=opts.top_k, lp_id=opts.lp_id)
    _print(f"{opts.mode}: {opts.query!r}", hits)


if __name__ == "__main__":
    main()
+40
View File
@@ -0,0 +1,40 @@
"""Client-side BM25 sparse vectors.
EMBEDDINGS.md specifies FastEmbed `Qdrant/bm25` so Qdrant applies IDF (via the
sparse vector's `modifier: idf`) over OUR corpus. FastEmbed pulls onnxruntime,
which has no wheel for this Python (3.14) yet, so this module provides a
dependency-free BM25 term-frequency encoder with the same contract:
`encode(text) -> {"indices": [...], "values": [...]}`.
Qdrant computes IDF server-side from the stored sparse vectors regardless of how
indices are assigned, so this is a legitimate corpus-IDF BM25 leg. The ONLY hard
requirement is that ingest and query use the SAME encoder — they both import this
one. For production, swap `encode()` for FastEmbed `Qdrant/bm25` (and re-index, so
ingest and query stay on the same tokenizer).
"""
import hashlib
import math
import re
# A token is a run of ASCII lowercase letters and digits; every other character
# (including non-ASCII letters) acts as a separator.
_TOKEN_RE = re.compile(r"[a-z0-9]+")
def tokenize(text: str):
    """Lowercase ``text`` and return its ``_TOKEN_RE`` matches in order
    (an empty list for None or empty input)."""
    lowered = (text or "").lower()
    tokens = _TOKEN_RE.findall(lowered)
    return tokens
def _index(token: str) -> int:
    """Map a token to a stable unsigned 32-bit integer: the first four bytes of
    its MD5 digest (UTF-8 input), read big-endian."""
    leading_hex = hashlib.md5(token.encode("utf-8")).hexdigest()[:8]
    return int(leading_hex, 16)
def encode(text: str):
    """Encode ``text`` as a sparse vector ``{"indices": [...], "values": [...]}``.

    Each distinct token contributes its hashed index with weight
    ``1 + ln(count)``. When two tokens hash to the same index, the weight of
    the one counted later replaces the earlier one.
    """
    counts = {}
    for token in tokenize(text):
        counts[token] = counts.get(token, 0) + 1
    weights = {_index(token): 1.0 + math.log(n) for token, n in counts.items()}
    return {"indices": list(weights), "values": list(weights.values())}