Phase 0 complete: fuzzy entity tier, incremental sync, Start9 packaging
- Fuzzy tier (backend/ingest/fuzzy_resolve.py + llm.py): local Qwen adjudicates the deterministic resolver's flagged name-variant candidates; merges are durable via entity_merges (deterministic re-runs respect them), losers soft-deleted, logged. Idempotent. - Incremental sync (backend/ingest/sync.py): re-embeds only rows changed since a watermark (ingest_sync_state); first run / --recreate = full. Tested full→0→1. - Start9 packaging (start9/0.4): Dockerfile bundles ingest+mcp + fastembed/mcp; "Build search index" action runs the init in a subcontainer; MCP shipped as a manual stdio server (not a daemon); version 0.1.0:44. INGEST_PACKAGING.md. - backfill.py: factored embed_and_upsert() shared with sync. Verified end-to-end on synthetic data + live Sparks/Qwen/Qdrant. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+19
-12
@@ -17,17 +17,9 @@ import qdrant_io
|
|||||||
import sparse
|
import sparse
|
||||||
|
|
||||||
|
|
||||||
def run(db, recreate=False, batch=32):
|
def embed_and_upsert(chunks, batch=32, progress=True):
|
||||||
conn = sqlite3.connect(db)
|
"""Embed (dense + sparse) and upsert a list of chunks to Qdrant. Shared by the
|
||||||
conn.row_factory = sqlite3.Row
|
full backfill and the incremental sync. Returns the number of points written."""
|
||||||
chunks = chunking.build_chunks(conn)
|
|
||||||
conn.close()
|
|
||||||
print(f"Built {len(chunks)} chunks from {db}")
|
|
||||||
|
|
||||||
state = qdrant_io.create_collection(recreate=recreate)
|
|
||||||
qdrant_io.ensure_indexes()
|
|
||||||
print(f"Collection '{config.COLLECTION}': {state}")
|
|
||||||
|
|
||||||
total = 0
|
total = 0
|
||||||
for i in range(0, len(chunks), batch):
|
for i in range(0, len(chunks), batch):
|
||||||
group = chunks[i:i + batch]
|
group = chunks[i:i + batch]
|
||||||
@@ -46,8 +38,23 @@ def run(db, recreate=False, batch=32):
|
|||||||
})
|
})
|
||||||
qdrant_io.upsert(points)
|
qdrant_io.upsert(points)
|
||||||
total += len(points)
|
total += len(points)
|
||||||
print(f" upserted {total}/{len(chunks)}")
|
if progress:
|
||||||
|
print(f" upserted {total}/{len(chunks)}")
|
||||||
|
return total
|
||||||
|
|
||||||
|
|
||||||
|
def run(db, recreate=False, batch=32):
|
||||||
|
conn = sqlite3.connect(db)
|
||||||
|
conn.row_factory = sqlite3.Row
|
||||||
|
chunks = chunking.build_chunks(conn)
|
||||||
|
conn.close()
|
||||||
|
print(f"Built {len(chunks)} chunks from {db}")
|
||||||
|
|
||||||
|
state = qdrant_io.create_collection(recreate=recreate)
|
||||||
|
qdrant_io.ensure_indexes()
|
||||||
|
print(f"Collection '{config.COLLECTION}': {state}")
|
||||||
|
|
||||||
|
embed_and_upsert(chunks, batch=batch)
|
||||||
print(f"Done. Qdrant '{config.COLLECTION}' now holds {qdrant_io.count()} points.")
|
print(f"Done. Qdrant '{config.COLLECTION}' now holds {qdrant_io.count()} points.")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -24,5 +24,6 @@ SPARK_VERIFY_TLS = os.environ.get("SPARK_CONTROL_VERIFY_TLS", "false").lower() i
|
|||||||
QDRANT_URL = os.environ.get("QDRANT_URL", "").rstrip("/")
|
QDRANT_URL = os.environ.get("QDRANT_URL", "").rstrip("/")
|
||||||
COLLECTION = os.environ.get("CRM_QDRANT_COLLECTION", "crm_chunks")
|
COLLECTION = os.environ.get("CRM_QDRANT_COLLECTION", "crm_chunks")
|
||||||
EMBED_MODEL = os.environ.get("CRM_EMBED_MODEL", "BAAI/bge-m3")
|
EMBED_MODEL = os.environ.get("CRM_EMBED_MODEL", "BAAI/bge-m3")
|
||||||
|
CHAT_MODEL = os.environ.get("CRM_CHAT_MODEL", "RedHatAI/Qwen3.6-35B-A3B-NVFP4")
|
||||||
DENSE_DIM = int(os.environ.get("CRM_EMBED_DIM", "1024"))
|
DENSE_DIM = int(os.environ.get("CRM_EMBED_DIM", "1024"))
|
||||||
DEFAULT_DB = os.environ.get("CRM_DEV_DB_PATH", os.path.join(_ROOT, "data", "crm_dev.db"))
|
DEFAULT_DB = os.environ.get("CRM_DEV_DB_PATH", os.path.join(_ROOT, "data", "crm_dev.db"))
|
||||||
|
|||||||
@@ -69,6 +69,16 @@ def _split_name(full: str):
|
|||||||
return parts[0], parts[-1] if len(parts) > 1 else ""
|
return parts[0], parts[-1] if len(parts) > 1 else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _redirect(merge_map, eid):
    """Resolve ``eid`` through the durable fuzzy-merge map (entity_merges).

    ``merge_map`` maps a merged-away id to its survivor id. Redirects are
    followed transitively so deterministic re-runs land on the surviving
    entity instead of recreating the merged-away one. An id that is not a
    key of ``merge_map`` is returned unchanged.
    """
    visited = set()
    current = eid
    while current in merge_map:
        # A cycle in the map must not loop forever: stop at the first id
        # that has already been followed.
        if current in visited:
            break
        visited.add(current)
        current = merge_map[current]
    return current
|
||||||
|
|
||||||
|
|
||||||
# ── upsert helpers ────────────────────────────────────────────────────────────
|
# ── upsert helpers ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def _upsert_entity(conn, eid, kind, display_name, primary_email):
|
def _upsert_entity(conn, eid, kind, display_name, primary_email):
|
||||||
@@ -102,12 +112,13 @@ def _link(conn, canonical_id, source_model, source_id, match_value, match_kind,
|
|||||||
|
|
||||||
# ── resolution passes ─────────────────────────────────────────────────────────
|
# ── resolution passes ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def resolve_organizations(conn):
|
def resolve_organizations(conn, merge_map=None):
|
||||||
"""Merge organizations + fundraising_investors by normalized name.
|
"""Merge organizations + fundraising_investors by normalized name.
|
||||||
|
|
||||||
Returns (org_canon_by_orgid, org_canon_by_fundinv) so the people pass can
|
Returns (org_canon_by_orgid, org_canon_by_fundinv) so the people pass can
|
||||||
attach each person to their firm's canonical id.
|
attach each person to their firm's canonical id.
|
||||||
"""
|
"""
|
||||||
|
merge_map = merge_map or {}
|
||||||
groups = defaultdict(lambda: {"orgs": [], "investors": [], "name": "", "email": ""})
|
groups = defaultdict(lambda: {"orgs": [], "investors": [], "name": "", "email": ""})
|
||||||
|
|
||||||
for r in conn.execute("SELECT id, name, email FROM organizations"):
|
for r in conn.execute("SELECT id, name, email FROM organizations"):
|
||||||
@@ -135,7 +146,7 @@ def resolve_organizations(conn):
|
|||||||
# An org we are actively raising from (has a fundraising row) is an 'lp';
|
# An org we are actively raising from (has a fundraising row) is an 'lp';
|
||||||
# otherwise a plain 'organization'.
|
# otherwise a plain 'organization'.
|
||||||
kind = "lp" if g["investors"] else "organization"
|
kind = "lp" if g["investors"] else "organization"
|
||||||
cid = _eid("lp" if kind == "lp" else "org", key)
|
cid = _redirect(merge_map, _eid("lp" if kind == "lp" else "org", key))
|
||||||
_upsert_entity(conn, cid, kind, g["name"], g["email"])
|
_upsert_entity(conn, cid, kind, g["name"], g["email"])
|
||||||
for oid in g["orgs"]:
|
for oid in g["orgs"]:
|
||||||
_link(conn, cid, "organizations", oid, key, "exact_name", 1.0)
|
_link(conn, cid, "organizations", oid, key, "exact_name", 1.0)
|
||||||
@@ -147,9 +158,10 @@ def resolve_organizations(conn):
|
|||||||
return org_canon_by_orgid, org_canon_by_fundinv
|
return org_canon_by_orgid, org_canon_by_fundinv
|
||||||
|
|
||||||
|
|
||||||
def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv):
|
def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv, merge_map=None):
|
||||||
"""Merge contacts + fundraising_contacts by exact email, else exact name within
|
"""Merge contacts + fundraising_contacts by exact email, else exact name within
|
||||||
the same canonical org. Returns contact_id -> person canonical id (for lp_profiles)."""
|
the same canonical org. Returns contact_id -> person canonical id (for lp_profiles)."""
|
||||||
|
merge_map = merge_map or {}
|
||||||
# gather (model, source_id, full_name, email, org_canon)
|
# gather (model, source_id, full_name, email, org_canon)
|
||||||
people = []
|
people = []
|
||||||
for r in conn.execute("SELECT id, first_name, last_name, email, organization_id FROM contacts"):
|
for r in conn.execute("SELECT id, first_name, last_name, email, organization_id FROM contacts"):
|
||||||
@@ -173,7 +185,7 @@ def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv):
|
|||||||
match_kind, conf, match_value = "name_org", 0.8, name_norm
|
match_kind, conf, match_value = "name_org", 0.8, name_norm
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
cid = _eid("per", key)
|
cid = _redirect(merge_map, _eid("per", key))
|
||||||
display = full.strip() or email
|
display = full.strip() or email
|
||||||
_upsert_entity(conn, cid, "person", display, email)
|
_upsert_entity(conn, cid, "person", display, email)
|
||||||
_link(conn, cid, model, sid, match_value, match_kind, conf)
|
_link(conn, cid, model, sid, match_value, match_kind, conf)
|
||||||
@@ -210,17 +222,32 @@ def run(db_path: str):
|
|||||||
conn.row_factory = sqlite3.Row
|
conn.row_factory = sqlite3.Row
|
||||||
conn.execute("PRAGMA foreign_keys=ON")
|
conn.execute("PRAGMA foreign_keys=ON")
|
||||||
|
|
||||||
org_by_oid, org_by_inv = resolve_organizations(conn)
|
# Durable fuzzy-merge map (entity_merges) so deterministic re-runs respect
|
||||||
|
# prior local-Qwen merges instead of recreating merged-away entities.
|
||||||
|
conn.execute("""CREATE TABLE IF NOT EXISTS entity_merges (
|
||||||
|
merged_id TEXT PRIMARY KEY,
|
||||||
|
survivor_id TEXT NOT NULL,
|
||||||
|
confidence REAL,
|
||||||
|
reason TEXT,
|
||||||
|
created_at TEXT DEFAULT (datetime('now'))
|
||||||
|
)""")
|
||||||
|
merge_map = {r["merged_id"]: r["survivor_id"]
|
||||||
|
for r in conn.execute("SELECT merged_id, survivor_id FROM entity_merges")}
|
||||||
|
|
||||||
|
org_by_oid, org_by_inv = resolve_organizations(conn, merge_map)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
person_meta = resolve_people(conn, org_by_oid, org_by_inv)
|
person_meta = resolve_people(conn, org_by_oid, org_by_inv, merge_map)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
candidates = find_fuzzy_candidates(person_meta)
|
candidates = find_fuzzy_candidates(person_meta)
|
||||||
|
|
||||||
|
# Counts report LIVE entities (deleted_at IS NULL); fuzzy-merged losers are
|
||||||
|
# soft-deleted tombstones (guardrail #3) and excluded.
|
||||||
|
live = "deleted_at IS NULL"
|
||||||
counts = {
|
counts = {
|
||||||
"canonical_total": conn.execute("SELECT COUNT(*) FROM canonical_entities").fetchone()[0],
|
"canonical_total": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE {live}").fetchone()[0],
|
||||||
"lp": conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp'").fetchone()[0],
|
"lp": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp' AND {live}").fetchone()[0],
|
||||||
"organization": conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization'").fetchone()[0],
|
"organization": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization' AND {live}").fetchone()[0],
|
||||||
"person": conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person'").fetchone()[0],
|
"person": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0],
|
||||||
"links": conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0],
|
"links": conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0],
|
||||||
"fuzzy_candidates": len(candidates),
|
"fuzzy_candidates": len(candidates),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,116 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Phase-0 Workstream B3 — fuzzy entity-resolution tier (local Qwen).
|
||||||
|
|
||||||
|
The deterministic tier (entity_resolution.py) merges only provable matches and
|
||||||
|
FLAGS the hard name-variant candidates (same firm + surname, different first
|
||||||
|
name/email) without guessing. This tier asks the local Qwen model (Spark Control
|
||||||
|
/v1/chat/completions — sovereign, on Ten31 infra) to adjudicate each candidate
|
||||||
|
and merges the confirmed ones.
|
||||||
|
|
||||||
|
A merge repoints the loser's entity_links to the survivor and soft-deletes the
|
||||||
|
loser canonical entity (deleted_at; never hard-deleted — guardrail #3). Every
|
||||||
|
merge is written to the interaction_log (guardrail #5). Idempotent: re-running
|
||||||
|
finds no new candidates once merged.
|
||||||
|
|
||||||
|
python3 backend/ingest/fuzzy_resolve.py --db data/crm_dev.db
|
||||||
|
python3 backend/ingest/fuzzy_resolve.py --db data/crm_dev.db --dry-run
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
import entity_resolution as er
|
||||||
|
import llm
|
||||||
|
|
||||||
|
_SYSTEM = ("You are an entity-resolution assistant for a CRM. Decide if the listed "
|
||||||
|
"people are the SAME individual recorded under name variants (e.g. nicknames "
|
||||||
|
"like Kate/Katherine, Bill/William), or DIFFERENT people who happen to share a "
|
||||||
|
"surname and firm. Be conservative: only say same when a nickname/abbreviation "
|
||||||
|
"relationship or matching contact info makes it clear.")
|
||||||
|
|
||||||
|
|
||||||
|
def _now():
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
def _ask(members, firm):
    """Ask the local chat model whether ``members`` are one person.

    ``members`` is an iterable of 3-item records whose second and third
    items are the name and email; ``firm`` is the firm's display name (or a
    falsy value, rendered as 'unknown'). Returns the parsed JSON verdict, or
    a not-same / zero-confidence verdict when the reply is empty or
    unparseable.
    """
    described = []
    for _, name, email in members:
        described.append(f"{name} <{email}>" if email else f"{name}")
    people = "; ".join(described)

    prompt = (f"Firm: {firm or 'unknown'}\nPeople: {people}\n\n"
              "Are these the SAME person under name variants? "
              'Answer only JSON: {"same": true|false, "confidence": 0.0-1.0, "reason": "..."}')

    verdict = llm.chat_json(prompt, system=_SYSTEM, max_tokens=160)
    if not verdict:
        # No usable answer: fall back to the conservative "different people".
        return {"same": False, "confidence": 0.0}
    return verdict
|
||||||
|
|
||||||
|
|
||||||
|
def _survivor(members):
    """Choose which member of a merge group is kept.

    A member with an email (third item truthy) outranks one without; among
    those, the longest (most complete) name (second item) wins. On a full
    tie the member that appears first in ``members`` is kept.
    """
    def completeness(member):
        return (bool(member[2]), len(member[1]))

    ranked = list(members)
    ranked.sort(key=completeness, reverse=True)
    return ranked[0]
|
||||||
|
|
||||||
|
|
||||||
|
def run(db, threshold=0.7, dry_run=False):
    """Adjudicate the deterministic tier's fuzzy candidates and merge confirmed groups.

    Re-runs the deterministic resolver to get fresh candidates, asks the
    local model about each candidate group, and, unless ``dry_run`` is set,
    merges every group the model confirms with confidence >= ``threshold``:
    the merge is recorded in ``entity_merges``, the loser's ``entity_links``
    are repointed to the survivor, the loser is soft-deleted (``deleted_at``),
    and the decision is written to ``interaction_log``.

    Args:
        db: path to the SQLite database.
        threshold: minimum model confidence required to merge.
        dry_run: when true, adjudicate and report but write nothing.

    Returns:
        ``(decisions, live_people)``: one decision dict per candidate group
        (merged or not) and the count of person entities that are not
        soft-deleted.
    """
    _counts, candidates = er.run(db)  # ensure deterministic state + fresh candidates
    conn = sqlite3.connect(db)
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys=ON")
        name_of = {r["id"]: r["display_name"]
                   for r in conn.execute("SELECT id, display_name FROM canonical_entities")}

        merges = []
        for cand in candidates:
            members = cand["members"]
            firm = name_of.get(cand["org"])
            verdict = _ask(members, firm)
            if not isinstance(verdict, dict):
                verdict = {}

            # The model's JSON is untrusted: a null / non-numeric confidence
            # must not abort the whole run, it just counts as zero confidence.
            try:
                confidence = float(verdict.get("confidence") or 0.0)
            except (TypeError, ValueError):
                confidence = 0.0
            # Require a real JSON boolean true. bool("false") is True, so plain
            # truthiness would merge on a string answer of "false".
            same = verdict.get("same") is True and confidence >= threshold

            decision = {"surname": cand["surname"], "firm": firm,
                        "members": [{"id": m[0], "name": m[1]} for m in members],
                        "same": same, "confidence": verdict.get("confidence"),
                        "reason": verdict.get("reason")}
            if same:
                keep = _survivor(members)
                losers = [m for m in members if m[0] != keep[0]]
                decision["merged_into"] = {"id": keep[0], "name": keep[1]}
                if not dry_run:
                    ts = _now()
                    for loser in losers:
                        # Record the merge durably so deterministic re-runs respect it.
                        conn.execute("""INSERT INTO entity_merges (merged_id, survivor_id, confidence, reason, created_at)
                                        VALUES (?,?,?,?,?)
                                        ON CONFLICT(merged_id) DO UPDATE SET survivor_id=excluded.survivor_id,
                                        confidence=excluded.confidence, reason=excluded.reason""",
                                     (loser[0], keep[0], confidence, verdict.get("reason"), ts))
                        conn.execute("UPDATE entity_links SET canonical_id=?, match_kind='fuzzy_merge', confidence=? "
                                     "WHERE canonical_id=?", (keep[0], confidence, loser[0]))
                        # Soft-delete only; the loser row is kept as a tombstone.
                        conn.execute("UPDATE canonical_entities SET deleted_at=?, updated_at=? WHERE id=?",
                                     (ts, ts, loser[0]))
                    # One log row per merged group, targeting the survivor.
                    conn.execute("""INSERT INTO interaction_log
                                    (id, ts, actor_type, actor_id, action, target_type, target_id, payload, source, created_at)
                                    VALUES (?,?,?,?,?,?,?,?,?,?)""",
                                 (str(uuid.uuid4()), ts, "agent", "qwen_entity_resolver", "entity.merged",
                                  "canonical_entity", keep[0], json.dumps(decision), "ingest", ts))
            merges.append(decision)

        if not dry_run:
            conn.commit()
        live_people = conn.execute(
            "SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND deleted_at IS NULL"
        ).fetchone()[0]
        return merges, live_people
    finally:
        # Always release the connection; closing without commit discards any
        # partial merge left by an exception.
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: run the fuzzy tier and print each adjudication."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--db", default="data/crm_dev.db")
    parser.add_argument("--threshold", type=float, default=0.7)
    parser.add_argument("--dry-run", action="store_true")
    opts = parser.parse_args()

    merges, live = run(opts.db, threshold=opts.threshold, dry_run=opts.dry_run)

    print(f"Adjudicated {len(merges)} candidate group(s){' (dry run)' if opts.dry_run else ''}:")
    for decision in merges:
        names = " / ".join(person["name"] for person in decision["members"])
        if decision.get("merged_into"):
            verdict = f"MERGE -> {decision['merged_into']['name']}"
        else:
            verdict = "keep separate"
        print(f" [{decision['surname']}] {names}: same={decision['same']} conf={decision['confidence']} => {verdict}")
        if decision.get("reason"):
            print(f" reason: {decision['reason']}")
    print(f"Live person entities now: {live}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
"""Local Qwen chat client via Spark Control /v1/chat/completions.
|
||||||
|
|
||||||
|
Used for the privacy-sensitive, high-volume reasoning that must stay on Ten31
|
||||||
|
infra (entity-resolution adjudication, triage). Frontier reasoning still goes to
|
||||||
|
Claude; this is the local leg. Thinking is disabled for fast structured output.
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
|
import config
|
||||||
|
import http_util
|
||||||
|
|
||||||
|
|
||||||
|
def chat(prompt, system=None, max_tokens=200, temperature=0.0):
    """Send one chat turn to Spark Control's /v1/chat/completions and return the reply text.

    An optional ``system`` message is placed before the user ``prompt``.
    The request disables the model's thinking mode via
    ``chat_template_kwargs``. Raises RuntimeError on any non-200 status;
    returns the stripped message content ('' when the content is empty).
    """
    messages = [{"role": "system", "content": system}] if system else []
    messages.append({"role": "user", "content": prompt})

    body = {
        "model": config.CHAT_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "chat_template_kwargs": {"enable_thinking": False},
    }
    url = f"{config.SPARK_CONTROL_URL}/v1/chat/completions"
    status, data = http_util.request("POST", url, body, verify=config.SPARK_VERIFY_TLS)
    if status != 200:
        raise RuntimeError(f"/v1/chat/completions -> {status}: {data}")

    content = data["choices"][0]["message"].get("content")
    return (content or "").strip()
|
||||||
|
|
||||||
|
|
||||||
|
def chat_json(prompt, system=None, max_tokens=200):
    """Chat and parse the first JSON object from the reply (tolerant of fences).

    Markdown code fences are stripped, then the JSON value starting at the
    first ``{`` is decoded. Text after that object is ignored, so a reply
    with trailing commentary (even commentary containing braces) still
    parses. Returns the decoded dict, or None when the reply has no ``{``
    or the text at the first ``{`` is not valid JSON (e.g. truncated).
    """
    raw = chat(prompt, system=system, max_tokens=max_tokens)
    raw = re.sub(r"^```(json)?|```$", "", raw.strip(), flags=re.MULTILINE).strip()
    start = raw.find("{")
    if start == -1:
        return None
    try:
        # raw_decode stops at the end of the first complete value, unlike a
        # greedy first-'{'-to-last-'}' regex that swallows trailing text.
        obj, _end = json.JSONDecoder().raw_decode(raw, start)
    except json.JSONDecodeError:
        return None
    return obj
|
||||||
+32
-16
@@ -16,25 +16,41 @@ import hashlib
|
|||||||
import math
|
import math
|
||||||
import re
|
import re
|
||||||
|
|
||||||
_TOKEN_RE = re.compile(r"[a-z0-9]+")
|
# Prefer FastEmbed Qdrant/bm25 (the EMBEDDINGS.md-specified encoder) when it is
|
||||||
|
# installable — true on the Start9 box (Python 3.11). Fall back to the
|
||||||
|
# dependency-free encoder below where it is not (e.g. this dev Mac on 3.14).
|
||||||
|
# Whichever is active, ingest and query in the SAME environment use it, so they
|
||||||
|
# stay consistent; production rebuilds the index on the box, so it uses FastEmbed
|
||||||
|
# end-to-end. BACKEND reports which is live.
|
||||||
|
try:
|
||||||
|
from fastembed import SparseTextEmbedding # type: ignore
|
||||||
|
_MODEL = None
|
||||||
|
|
||||||
|
def _model():
|
||||||
|
global _MODEL
|
||||||
|
if _MODEL is None:
|
||||||
|
_MODEL = SparseTextEmbedding(model_name="Qdrant/bm25")
|
||||||
|
return _MODEL
|
||||||
|
|
||||||
def tokenize(text: str):
|
def encode(text: str):
|
||||||
return _TOKEN_RE.findall((text or "").lower())
|
emb = next(_model().embed([text or ""]))
|
||||||
|
return {"indices": [int(i) for i in emb.indices], "values": [float(v) for v in emb.values]}
|
||||||
|
|
||||||
|
BACKEND = "fastembed:Qdrant/bm25"
|
||||||
|
except Exception:
|
||||||
|
BACKEND = "pure-python-bm25"
|
||||||
|
_TOKEN_RE = re.compile(r"[a-z0-9]+")
|
||||||
|
|
||||||
def _index(token: str) -> int:
|
def tokenize(text: str):
|
||||||
# Stable unsigned 32-bit index for a token (Qdrant sparse indices are u32).
|
return _TOKEN_RE.findall((text or "").lower())
|
||||||
return int.from_bytes(hashlib.md5(token.encode("utf-8")).digest()[:4], "big")
|
|
||||||
|
|
||||||
|
def _index(token: str) -> int:
|
||||||
|
# Stable unsigned 32-bit index for a token (Qdrant sparse indices are u32).
|
||||||
|
return int.from_bytes(hashlib.md5(token.encode("utf-8")).digest()[:4], "big")
|
||||||
|
|
||||||
def encode(text: str):
    """Sparse vector {indices, values}; value = 1 + ln(tf). Qdrant applies IDF.

    Term frequency is accumulated per hashed index rather than per token, so
    two tokens whose 32-bit hashes collide are folded into one entry and the
    returned ``indices`` never contain duplicates.
    """
    counts = {}
    for tok in tokenize(text):
        idx = _index(tok)
        counts[idx] = counts.get(idx, 0) + 1
    return {"indices": list(counts), "values": [1.0 + math.log(c) for c in counts.values()]}
|
|
||||||
|
|||||||
@@ -0,0 +1,126 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Phase-0 Workstream B4 — incremental, idempotent CRM -> Qdrant sync.
|
||||||
|
|
||||||
|
One command that keeps the index fresh:
|
||||||
|
1. Re-run deterministic entity resolution (cheap, idempotent, respects durable
|
||||||
|
fuzzy merges). Optionally re-run the local-Qwen fuzzy tier (--fuzzy).
|
||||||
|
2. Re-embed ONLY the source rows changed since the last sync (by updated_at);
|
||||||
|
the first run (or --recreate) is a full backfill.
|
||||||
|
3. Upsert with deterministic point ids (overwrite in place) and advance the
|
||||||
|
watermark. Logged to interaction_log.
|
||||||
|
|
||||||
|
Idempotent: re-running with no CRM changes embeds nothing. Watermark lives in an
|
||||||
|
`ingest_sync_state` table the pipeline owns.
|
||||||
|
|
||||||
|
python3 backend/ingest/sync.py --db data/crm_dev.db # incremental (full on first run)
|
||||||
|
python3 backend/ingest/sync.py --db data/crm_dev.db --recreate # force full rebuild
|
||||||
|
python3 backend/ingest/sync.py --db data/crm_dev.db --fuzzy # also run the Qwen fuzzy tier
|
||||||
|
|
||||||
|
LIMITATION: the CRM hard-deletes today, so a removed row's chunk is not pruned
|
||||||
|
incrementally (no tombstone). Until the DELETE handlers honor `deleted_at`, run a
|
||||||
|
periodic `--recreate` (or `backfill.py --recreate`) to drop orphans. Structural
|
||||||
|
entity-id changes (merges) are likewise best followed by a periodic full rebuild.
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
import backfill
|
||||||
|
import chunking
|
||||||
|
import config
|
||||||
|
import entity_resolution as er
|
||||||
|
import qdrant_io
|
||||||
|
|
||||||
|
_CHANGE_TABLES = [("communications", "communications"), ("contacts", "contacts"),
|
||||||
|
("lp_profiles", "lp_profiles"), ("opportunities", "opportunities"),
|
||||||
|
("organizations", "organizations"), ("fundraising_investors", "fundraising_investors")]
|
||||||
|
|
||||||
|
|
||||||
|
def _now():
    """Return the current UTC time as a naive ISO-8601 string with a 'Z' suffix."""
    # Match the CRM's updated_at format ("...Z") so the watermark compares
    # correctly against source-row updated_at (server.now() in server.py).
    naive_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    return f"{naive_utc.isoformat()}Z"
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_state(conn):
    """Create the ``ingest_sync_state`` key/value table if it does not exist."""
    ddl = """CREATE TABLE IF NOT EXISTS ingest_sync_state (
        key TEXT PRIMARY KEY, value TEXT, updated_at TEXT DEFAULT (datetime('now')))"""
    conn.execute(ddl)
|
||||||
|
|
||||||
|
|
||||||
|
def _state_get(conn, key):
    """Return the stored value for ``key`` in ingest_sync_state, or None if absent."""
    row = conn.execute("SELECT value FROM ingest_sync_state WHERE key=?", (key,)).fetchone()
    if row is None:
        return None
    return row[0]
|
||||||
|
|
||||||
|
|
||||||
|
def _state_set(conn, key, value):
    """Insert or overwrite ``key`` in ingest_sync_state, stamping updated_at with _now()."""
    upsert_sql = """INSERT INTO ingest_sync_state (key, value, updated_at) VALUES (?,?,?)
        ON CONFLICT(key) DO UPDATE SET value=excluded.value, updated_at=excluded.updated_at"""
    params = (key, value, _now())
    conn.execute(upsert_sql, params)
|
||||||
|
|
||||||
|
|
||||||
|
def _changed_source_ids(conn, since):
    """Collect ``(source_model, id)`` pairs for rows with updated_at > ``since``.

    Scans every table listed in ``_CHANGE_TABLES`` and, when an ``emails``
    table exists, also the matched emails (``is_matched=1``). Rows are read
    by column name, so ``conn`` needs a name-addressable row factory.
    """
    changed = set()
    for tbl, model in _CHANGE_TABLES:
        rows = conn.execute(f"SELECT id FROM {tbl} WHERE updated_at > ?", (since,))
        changed.update((model, row["id"]) for row in rows)

    if not chunking._has_table(conn, "emails"):
        return changed

    email_rows = conn.execute("SELECT id FROM emails WHERE updated_at > ? AND is_matched=1", (since,))
    changed.update(("emails", row["id"]) for row in email_rows)
    return changed
|
||||||
|
|
||||||
|
|
||||||
|
def run(db, recreate=False, fuzzy=False, batch=32):
    """Refresh the canonical layer, then sync changed CRM rows into Qdrant.

    Args:
        db: path to the SQLite database.
        recreate: passed to ``qdrant_io.create_collection``; also forces a
            full re-embed regardless of the stored watermark.
        fuzzy: also run the local-model fuzzy merge tier.
        batch: embedding/upsert batch size.

    Returns:
        A summary dict with ``mode`` ("full" or "incremental"),
        ``rows_embedded``, ``total_chunks`` and ``qdrant_points``. The same
        summary is written to ``interaction_log``.
    """
    # 1. refresh the canonical layer (deterministic always; fuzzy on request)
    if fuzzy:
        import fuzzy_resolve
        # fuzzy_resolve.run() starts by running the deterministic resolver
        # itself, so calling er.run() first would only repeat that pass.
        fuzzy_resolve.run(db)
    else:
        er.run(db)

    conn = sqlite3.connect(db)
    try:
        conn.row_factory = sqlite3.Row
        _ensure_state(conn)
        last = _state_get(conn, "last_sync_ts")
        # Taken before chunks are built, so rows modified while this run is
        # in flight stay newer than the watermark and are caught next time.
        run_start = _now()

        qdrant_io.create_collection(recreate=recreate)
        qdrant_io.ensure_indexes()

        # 2. pick what to embed: everything on first run / --recreate,
        #    otherwise only chunks whose source row changed since the watermark
        all_chunks = chunking.build_chunks(conn)
        if last is None or recreate:
            mode, target = "full", all_chunks
        else:
            changed = _changed_source_ids(conn, last)
            mode, target = "incremental", [c for c in all_chunks
                                           if (c["source_model"], c["source_id"]) in changed]

        # 3. upsert, then advance the watermark (committed together with the log row)
        written = backfill.embed_and_upsert(target, batch=batch, progress=False)
        _state_set(conn, "last_sync_ts", run_start)

        summary = {"mode": mode, "rows_embedded": written, "total_chunks": len(all_chunks),
                   "qdrant_points": qdrant_io.count()}
        conn.execute("""INSERT INTO interaction_log
                        (id, ts, actor_type, actor_id, action, target_type, payload, source, created_at)
                        VALUES (?,?,?,?,?,?,?,?,?)""",
                     (str(uuid.uuid4()), _now(), "system", "ingest_sync", "ingest.sync", "crm_chunks",
                      json.dumps(summary), "ingest", _now()))
        conn.commit()
        return summary
    finally:
        # Release the connection even when embedding or Qdrant fails; without
        # a commit the watermark is not advanced, so the next run retries.
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: run one sync and print a one-line summary."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--db", default=config.DEFAULT_DB)
    parser.add_argument("--recreate", action="store_true")
    parser.add_argument("--fuzzy", action="store_true")
    parser.add_argument("--batch", type=int, default=32)
    opts = parser.parse_args()

    summary = run(opts.db, recreate=opts.recreate, fuzzy=opts.fuzzy, batch=opts.batch)
    print(f"Sync ({summary['mode']}): embedded {summary['rows_embedded']} chunk(s); "
          f"{summary['total_chunks']} total; Qdrant now holds {summary['qdrant_points']} points.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,87 @@
|
|||||||
|
# Go-Live Runbook — Phase 0 substrate on the live Start9 box
|
||||||
|
|
||||||
|
*How to take the Phase-0 data substrate from "tested on synthetic data" to "running against the real CRM" on the Start9 server. You run this on your infrastructure; no real LP data goes to Claude/Anthropic (guardrails #1, #9). The live `/data/crm.db` on the box is the canonical source — not the possibly-stale `start9/0.4/seed/` snapshot.*
|
||||||
|
|
||||||
|
Recap of the three moves (see also `docs/crm-overview.md`): (1) ship code → empty new tables appear; (2) run the one-time init → fills the canonical IDs + search index from your real data; (3) run the MCP server.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- Spark Control + Qdrant reachable from the box: `SPARK_CONTROL_URL`, `QDRANT_URL` (see `.env.example`). Verify with `curl -sk $SPARK_CONTROL_URL/api/endpoints`.
|
||||||
|
- The `backend/ingest/` + `backend/mcp/` code present on the box (ships with the package — see "Packaging decision" below).
|
||||||
|
- Python deps in the ingest environment: `fastembed` (BM25; installs cleanly on the box's Python 3.11) and `mcp` (only to run the MCP server). The CRM server itself needs no new deps.
|
||||||
|
|
||||||
|
## Step 1 — Deploy the new CRM version (auto-creates the empty tables)
|
||||||
|
|
||||||
|
1. Bump the package version, rebuild the `.s9pk`, sideload it. StartOS preserves `/data`, so your real data is undisturbed.
|
||||||
|
2. On first boot, `init_db()` runs `backend/core_migrations.py`, which applies `migrations/0001_phase0_foundation.sql` **once** (tracked in `schema_migrations`) — additively creating `canonical_entities`, `entity_links`, `interaction_log`, `relationship_edges`, and the `deleted_at` columns. Nothing existing changes.
|
||||||
|
3. Verify: `sqlite3 /data/crm.db "SELECT filename FROM schema_migrations;"` → should list `0001_phase0_foundation.sql`.
|
||||||
|
|
||||||
|
## Step 2 — Prepare the ingest environment (on the box)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install fastembed # BM25 Qdrant/bm25 (sparse.py auto-detects it)
|
||||||
|
export CRM_DB_PATH=/data/crm.db
|
||||||
|
export SPARK_CONTROL_URL=https://192.168.1.72:62419
|
||||||
|
export SPARK_CONTROL_VERIFY_TLS=false
|
||||||
|
export QDRANT_URL=http://192.168.1.87:6333
|
||||||
|
```
|
||||||
|
|
||||||
|
`sparse.py` will report `BACKEND = fastembed:Qdrant/bm25` here (vs the pure-Python fallback used on the dev Mac). Because the index is built **and** queried on the box, the encoder is consistent end-to-end.
|
||||||
|
|
||||||
|
## Step 3 — Build the canonical IDs from your real data
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 backend/ingest/entity_resolution.py --db /data/crm.db --show-candidates
|
||||||
|
```
|
||||||
|
|
||||||
|
This reads your real contacts / fundraising investors / organizations and fills `canonical_entities` + `entity_links` (the "create entity IDs from existing data" step). It is **read-only on your CRM source tables**, idempotent, and logs a run to `interaction_log`. Review the printed fuzzy candidates — those are the name-variant pairs the deterministic tier wouldn't merge on a guess (the local-Qwen fuzzy tier, still to be built, resolves these).
|
||||||
|
|
||||||
|
## Step 4 — Build the search index
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 backend/ingest/backfill.py --db /data/crm.db --recreate
|
||||||
|
```
|
||||||
|
|
||||||
|
Chunks your real records → dense (bge-m3 via Spark Control) + BM25 sparse → upserts to Qdrant `crm_chunks`. ~8–15 min for a full corpus. Idempotent (deterministic point ids), so re-running is safe. `--recreate` drops and rebuilds the collection; omit it to update in place.
|
||||||
|
|
||||||
|
Note: your live CRM's text is concentrated in the **fundraising grid notes** + grid contacts (the seed snapshot had 0 communications / 0 lp_profiles), plus Gmail once enabled (see `docs/gmail-enablement-runbook.md`). The chunker already handles all of these.
|
||||||
|
|
||||||
|
## Step 5 — Start the MCP server
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install mcp
|
||||||
|
CRM_DB_PATH=/data/crm.db python3 backend/mcp/server.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Register it with the Agent SDK / Claude Code as an stdio MCP server. It exposes reads, the three retrieval modes, and logged writes — **no outbound/contact tools** (Phase 3 gate). For Phase 0 there are no live agents; this is for testing and the internal-only Analyst work later.
|
||||||
|
|
||||||
|
## Step 6 — Incremental sync (NOT YET BUILT — Workstream B4)
|
||||||
|
|
||||||
|
The full backfill is one-shot. Keeping the index fresh as the CRM changes (new grid edits, new emails) needs an incremental, idempotent sync on a schedule. This is the remaining Phase-0 ingest piece; until it's built, re-run Steps 3–4 to refresh.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT entity_kind, COUNT(*) FROM canonical_entities GROUP BY entity_kind; -- IDs built
|
||||||
|
SELECT COUNT(*) FROM entity_links; -- source rows linked
|
||||||
|
```
|
||||||
|
```bash
|
||||||
|
curl -s "$QDRANT_URL/collections/crm_chunks" | python3 -c "import sys,json;print('points:', json.load(sys.stdin)['result']['points_count'])"
|
||||||
|
python3 backend/ingest/search.py "Fund III wire timeline" --mode hybrid # sanity query
|
||||||
|
```
|
||||||
|
|
||||||
|
## Open decision — packaging (how the init + MCP run on the box)
|
||||||
|
|
||||||
|
The ingest scripts read `/data/crm.db` by file path, so they must run **where that file lives** — inside or beside the CRM container (the dev Mac cannot open the container's SQLite file directly). Options, to decide before go-live:
|
||||||
|
|
||||||
|
- **A (recommended): same image.** Bundle `backend/ingest` + `backend/mcp` (+ `fastembed`, `mcp`) into the CRM container image; expose the init as a one-shot Start9 action and run the MCP server as a second daemon in the 0.4 `startos` manifest. The image is already Python 3.11 with the volume mounted.
|
||||||
|
- **B: sidecar container** on the box mounting the same `/data` volume.
|
||||||
|
- **C: co-located host** with a copy of `/data` and LAN access to the Sparks (involves copying the DB — least clean).
|
||||||
|
|
||||||
|
This packaging wiring (and Step 6) is the remaining build work for a fully turn-key go-live.
|
||||||
|
|
||||||
|
## Sovereignty checkpoint
|
||||||
|
|
||||||
|
Every step above runs on Ten31 infrastructure. Real records flow `crm.db → local Spark (bge-m3) → local Qdrant` and never reach Anthropic. The scripts print counts, not records. Keep it that way: don't paste query *results* from real data back into a Claude session (guardrail #9).
|
||||||
+17
-4
@@ -31,14 +31,27 @@ RUN apt-get update \
|
|||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# ── Python dependencies ─────────────────────────────────────────
|
# ── Python dependencies ─────────────────────────────────────────
|
||||||
# Only one hard dep for now: `cryptography` is required by the Gmail
|
# `cryptography` is required by the Gmail integration's RS256 JWT signing
|
||||||
# integration's RS256 JWT signing (DWD bearer tokens). Everything else
|
# (DWD bearer tokens). The two Phase-0 deps are runtime-only for the ingest
|
||||||
# server.py needs is stdlib.
|
# pipeline + MCP server (the CRM web server itself still needs no new deps):
|
||||||
RUN pip install --no-cache-dir cryptography==42.0.5
|
# * fastembed — client-side BM25 (Qdrant/bm25) for the sparse retrieval leg
|
||||||
|
# (backend/ingest/sparse.py auto-detects it).
|
||||||
|
# * mcp — MCP Python SDK, only needed to run backend/mcp/server.py.
|
||||||
|
# Everything else server.py needs is stdlib.
|
||||||
|
RUN pip install --no-cache-dir \
|
||||||
|
cryptography==42.0.5 \
|
||||||
|
fastembed==0.4.2 \
|
||||||
|
mcp==1.2.0
|
||||||
|
|
||||||
# ── Application source ──────────────────────────────────────────
|
# ── Application source ──────────────────────────────────────────
|
||||||
COPY backend/server.py /app/backend/server.py
|
COPY backend/server.py /app/backend/server.py
|
||||||
COPY backend/email_integration /app/backend/email_integration
|
COPY backend/email_integration /app/backend/email_integration
|
||||||
|
# Phase-0 substrate: ingest pipeline (entity resolution + backfill) and the
|
||||||
|
# CRM MCP server. Shipped alongside the web server so the one-time index build
|
||||||
|
# and the (manually-run) MCP server can execute on the box where /data/crm.db
|
||||||
|
# lives. See start9/0.4/INGEST_PACKAGING.md.
|
||||||
|
COPY backend/ingest /app/backend/ingest
|
||||||
|
COPY backend/mcp /app/backend/mcp
|
||||||
COPY frontend /app/frontend
|
COPY frontend /app/frontend
|
||||||
|
|
||||||
# ── StartOS wrapper scripts ─────────────────────────────────────
|
# ── StartOS wrapper scripts ─────────────────────────────────────
|
||||||
|
|||||||
@@ -0,0 +1,142 @@
|
|||||||
|
# Phase-0 ingest packaging (StartOS 0.4)
|
||||||
|
|
||||||
|
How the Phase-0 data substrate — the ingest pipeline (`backend/ingest/`) and the
|
||||||
|
CRM MCP server (`backend/mcp/`) — ships and runs on the live StartOS 0.4 package,
|
||||||
|
**without changing the CRM web server**. This implements **Option A** ("same
|
||||||
|
image") from `docs/go-live-runbook.md` §"Open decision — packaging".
|
||||||
|
|
||||||
|
The CRM web server (`backend/server.py`) is untouched and gains no new
|
||||||
|
dependencies. The `primary` daemon and its `checkPortListening` health check are
|
||||||
|
unchanged.
|
||||||
|
|
||||||
|
## What changed
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
| --- | --- |
|
||||||
|
| `Dockerfile` | `COPY backend/ingest` and `COPY backend/mcp` into the image alongside `backend/server.py`. Added two runtime deps to the existing `pip install`: `fastembed==0.4.2` (client-side BM25 / `Qdrant/bm25` for the sparse retrieval leg) and `mcp==1.2.0` (MCP Python SDK, only for `backend/mcp/server.py`). |
|
||||||
|
| `docker_entrypoint.sh` | Added an export block for the ingest/retrieval env: `CRM_DB_PATH`, `SPARK_CONTROL_URL`, `SPARK_CONTROL_VERIFY_TLS`, `QDRANT_URL`, with LAN-default placeholder values and an operator comment. The CRM web server ignores these; they exist so manual `python3 /app/backend/ingest/...` and `backend/mcp/server.py` runs on the box inherit them. |
|
||||||
|
| `startos/actions/buildSearchIndex.ts` | **New.** A one-shot "Build search index" StartOS action (Steps 3–4 of the runbook). |
|
||||||
|
| `startos/actions/index.ts` | Registered the new action: `sdk.Actions.of().addAction(buildSearchIndex)`. |
|
||||||
|
| `startos/versions/v0.1.0.44.ts` + `versions/index.ts` | New version `0.1.0:44` (image-only change, no data migration) set as `current`; `0.1.0:43` moved to `other`. |
|
||||||
|
| `startos/utils.ts` | Bumped the informational `PACKAGE_VERSION` constant to `0.1.0:44`. |
|
||||||
|
|
||||||
|
### Action registration mechanism (verified)
|
||||||
|
|
||||||
|
Actions are collected in `startos/actions/index.ts` as
|
||||||
|
`export const actions = sdk.Actions.of().addAction(...)`, and that `actions`
|
||||||
|
object is passed into `sdk.setupInit(...)` in `startos/init/index.ts` (and
|
||||||
|
re-exported from `startos/index.ts`). Adding `.addAction(buildSearchIndex)` is
|
||||||
|
the entire registration — no manifest entry is required for actions in the 0.4
|
||||||
|
SDK.
|
||||||
|
|
||||||
|
## How the operator triggers the index build
|
||||||
|
|
||||||
|
1. Build/sideload the new `.s9pk` (version `0.1.0:44`). StartOS preserves
|
||||||
|
`/data`, so live data is undisturbed. On first boot the CRM's own migration
|
||||||
|
runner creates the Phase-0 tables (see runbook Step 1) — that is independent
|
||||||
|
of this packaging change.
|
||||||
|
2. In the StartOS UI, open the **Ten31 Database** service → **Actions** →
|
||||||
|
**Build search index**, and run it. It:
|
||||||
|
- runs `entity_resolution.py --db /data/crm.db` (canonical ids + links), then
|
||||||
|
- runs `backfill.py --db /data/crm.db --recreate` (chunk → dense via Spark
|
||||||
|
Control + BM25 → upsert to Qdrant `crm_chunks`).
|
||||||
|
Both steps are idempotent and read-only on the CRM source tables, so the
|
||||||
|
action is safe to re-run any time to refresh the index. A full re-embed is
|
||||||
|
~8–15 min (the backfill step is allowed up to 30 min before timing out; entity resolution up to 10 min).
|
||||||
|
|
||||||
|
The action runs in its **own subcontainer** with the same `main` volume mounted
|
||||||
|
at `/data`, with `cwd=/app/backend/ingest` (the ingest scripts import their
|
||||||
|
siblings by bare name, e.g. `import config`, so they must run from that
|
||||||
|
directory). It uses `allowedStatuses: 'any'` — SQLite WAL mode makes a
|
||||||
|
concurrently-running CRM safe for these reads/derived writes.
|
||||||
|
|
||||||
|
## Env / config the operator must set (Spark URLs)
|
||||||
|
|
||||||
|
The ingest run reaches out to **Spark Control** (dense embeddings) and **Qdrant**
|
||||||
|
(upserts). Those endpoints are LAN-specific, so they are defined in **two
|
||||||
|
places** that the operator must point at their network. The current values are
|
||||||
|
the Ten31 LAN defaults:
|
||||||
|
|
||||||
|
| Variable | Default | Used by |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `SPARK_CONTROL_URL` | `https://192.168.1.72:62419` | dense embeds (`/v1/embeddings`) |
|
||||||
|
| `SPARK_CONTROL_VERIFY_TLS` | `false` (Spark Control uses a self-signed cert) | TLS verification toggle |
|
||||||
|
| `QDRANT_URL` | `http://192.168.1.87:6333` | Qdrant collection admin + upserts |
|
||||||
|
| `CRM_DB_PATH` | `/data/crm.db` | both scripts + MCP server (already correct) |
|
||||||
|
|
||||||
|
Where to set them:
|
||||||
|
|
||||||
|
- **`docker_entrypoint.sh`** — for manual `python3` / MCP runs via the running
|
||||||
|
container. Edit the `${VAR:-default}` block, or override via the StartOS
|
||||||
|
service environment.
|
||||||
|
- **`startos/actions/buildSearchIndex.ts`** (`ingestEnv`) — for the "Build search
|
||||||
|
index" action, which runs in its own subcontainer and does **not** execute the
|
||||||
|
entrypoint, so it carries its own copy of the values. Edit these to match.
|
||||||
|
|
||||||
|
> Keep the two copies in sync. They are duplicated because the action's
|
||||||
|
> subcontainer never runs `docker_entrypoint.sh`; there is no shared config
|
||||||
|
> store wired into this package today (see "Still needed" below).
|
||||||
|
|
||||||
|
Verify reachability from the box before running the action:
|
||||||
|
`curl -sk $SPARK_CONTROL_URL/api/endpoints` and
|
||||||
|
`curl -s $QDRANT_URL/collections`.
|
||||||
|
|
||||||
|
## MCP server: decision and how to run it
|
||||||
|
|
||||||
|
**Decision: the MCP server is NOT a daemon in this release — it is shipped in the
|
||||||
|
image and run manually.** Rationale:
|
||||||
|
|
||||||
|
- `backend/mcp/server.py` is an **stdio** MCP server (`mcp.run()` with FastMCP):
|
||||||
|
it has no network port to bind, so the StartOS daemon model (a long-running
|
||||||
|
process with a `checkPortListening` health check, like `primary`) does not fit
|
||||||
|
it. There is nothing to port-probe and no meaningful liveness signal.
|
||||||
|
- **Phase 0 has no live agents** (per `CLAUDE.md` and the runbook): nothing on
|
||||||
|
the box would connect to it. An always-on daemon would idle with no client on
|
||||||
|
its stdin and no health semantics.
|
||||||
|
- It exposes reads, the three retrieval modes, and logged writes — **no
|
||||||
|
outbound/contact tools** (Phase 3 compliance gate). It is for testing and
|
||||||
|
later internal-only Analyst work.
|
||||||
|
|
||||||
|
To run it manually on the box (it is present at `/app/backend/mcp/server.py` with
|
||||||
|
`mcp` already installed):
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# from inside the running container
|
||||||
|
CRM_DB_PATH=/data/crm.db python3 /app/backend/mcp/server.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Then register it with the Agent SDK / Claude Code as an stdio MCP server pointing
|
||||||
|
at that script (it inherits the Spark/Qdrant env exported by the entrypoint).
|
||||||
|
|
||||||
|
If/when a live agent needs it as a persistent service, the cleanest upgrade is to
|
||||||
|
add it as a **second daemon** in `startos/main.ts` mirroring the `primary`
|
||||||
|
daemon — but only after giving it a network transport (e.g. an HTTP/SSE MCP
|
||||||
|
endpoint on its own port) so it has a real `checkPortListening` health check.
|
||||||
|
That is deliberately deferred to a later phase.
|
||||||
|
|
||||||
|
## Still needed for a fully turn-key deploy
|
||||||
|
|
||||||
|
- **MCP-as-a-service** — see above. Deferred until there is a live agent and a
|
||||||
|
network transport; today it is manual/stdio only.
|
||||||
|
- **Incremental sync (runbook Step 6 / Workstream B4)** — the action does a full
|
||||||
|
one-shot rebuild. Keeping the index fresh as the CRM changes needs an
|
||||||
|
incremental, idempotent sync on a schedule. Until that exists, re-running the
|
||||||
|
"Build search index" action is the refresh path. When built, it could be wired
|
||||||
|
as a recurring StartOS action/task rather than a manual re-run.
|
||||||
|
- **Single source of truth for Spark/Qdrant config** — currently duplicated in
|
||||||
|
`docker_entrypoint.sh` and `buildSearchIndex.ts`. A small StartOS config
|
||||||
|
store + input form (the SDK supports `Action.withInput` and a service config)
|
||||||
|
would let the operator set the endpoints once in the UI; deferred to keep this
|
||||||
|
change minimal and reviewable.
|
||||||
|
- **`.env` on the box** — `backend/ingest/config.py` also reads `/app/.env` if
|
||||||
|
present (via `os.environ.setdefault`, so it does not override the exported
|
||||||
|
env). Not required given the exported env above, but available as an
|
||||||
|
alternative if the operator prefers a file.
|
||||||
|
|
||||||
|
## Constraints honored
|
||||||
|
|
||||||
|
- No files under `backend/ingest/`, `backend/mcp/`, `backend/server.py`,
|
||||||
|
`backend/core_migrations.py`, `backend/migrations/`, or `data/` were modified;
|
||||||
|
only `start9/0.4/**` and this new doc.
|
||||||
|
- No build/deploy commands were run. `npx tsc --noEmit` was used only to verify
|
||||||
|
the new TypeScript compiles against the SDK types.
|
||||||
@@ -57,5 +57,22 @@ else
|
|||||||
echo "[entrypoint] Gmail integration: DISABLED (no key at $GMAIL_SA_KEY)"
|
echo "[entrypoint] Gmail integration: DISABLED (no key at $GMAIL_SA_KEY)"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# ── Phase-0 ingest / retrieval env ──────────────────────────────
|
||||||
|
# These are consumed by the ingest pipeline (backend/ingest/) and the MCP
|
||||||
|
# server (backend/mcp/) — NOT by the CRM web server, which ignores them.
|
||||||
|
# The "Build search index" StartOS action does NOT inherit them (it runs in its own subcontainer with its own copy); they are exported here so that any
|
||||||
|
# manual `python3 /app/backend/ingest/...` / `backend/mcp/server.py` run on the
|
||||||
|
# box inherit them.
|
||||||
|
#
|
||||||
|
# OPERATOR: the values below are LAN defaults for the Ten31 deployment. Set the
|
||||||
|
# real ones for your network — either by editing them here before building the
|
||||||
|
# image, or by overriding the env vars in the StartOS service environment.
|
||||||
|
# Point SPARK_CONTROL_URL at the Spark Control gateway (TLS, self-signed by
|
||||||
|
# default → SPARK_CONTROL_VERIFY_TLS=false) and QDRANT_URL at Qdrant on Spark 2.
|
||||||
|
export CRM_DB_PATH="${CRM_DB_PATH:-$DATA_DIR/crm.db}"
|
||||||
|
export SPARK_CONTROL_URL="${SPARK_CONTROL_URL:-https://192.168.1.72:62419}"
|
||||||
|
export SPARK_CONTROL_VERIFY_TLS="${SPARK_CONTROL_VERIFY_TLS:-false}"
|
||||||
|
export QDRANT_URL="${QDRANT_URL:-http://192.168.1.87:6333}"
|
||||||
|
|
||||||
# ── Launch the app ──────────────────────────────────────────────
|
# ── Launch the app ──────────────────────────────────────────────
|
||||||
exec python3 /app/backend/server.py
|
exec python3 /app/backend/server.py
|
||||||
|
|||||||
@@ -0,0 +1,118 @@
|
|||||||
|
import { i18n } from '../i18n'
|
||||||
|
import { sdk } from '../sdk'
|
||||||
|
import { DATA_MOUNT_PATH, IMAGE_ID } from '../utils'
|
||||||
|
|
||||||
|
/**
|
||||||
|
* One-shot "Build search index" action (Phase-0 ingest, go-live Steps 3–4).
|
||||||
|
*
|
||||||
|
* Runs the one-time init that turns the live CRM into the canonical-id layer
|
||||||
|
* and the Qdrant search index, on the box where /data/crm.db lives:
|
||||||
|
*
|
||||||
|
* 1. entity_resolution.py --db /data/crm.db (build canonical ids + links)
|
||||||
|
* 2. backfill.py --db /data/crm.db --recreate (chunk → dense+BM25 → Qdrant)
|
||||||
|
*
|
||||||
|
* Both steps are idempotent (deterministic ids), read-only on the CRM source
|
||||||
|
* tables, and log a row to interaction_log — so re-running is always safe.
|
||||||
|
*
|
||||||
|
* Implementation notes:
|
||||||
|
* - The scripts import their siblings by bare name (`import config`, etc.),
|
||||||
|
* so they must run with cwd = /app/backend/ingest.
|
||||||
|
* - backfill.py talks to Spark Control (dense embeds) and Qdrant (upserts),
|
||||||
|
* so the Spark/Qdrant env must be present. This action runs in its OWN
|
||||||
|
* subcontainer and does NOT go through docker_entrypoint.sh, so it cannot
|
||||||
|
* inherit the entrypoint's exports — the env is passed explicitly below.
|
||||||
|
* - allowedStatuses: 'any' — the action runs in its own subcontainer with the
|
||||||
|
* same /data volume mounted, so it works whether or not the CRM is running.
|
||||||
|
* SQLite WAL mode means a concurrently-running CRM is fine for these
|
||||||
|
* reads/derived writes.
|
||||||
|
*/
|
||||||
|
|
||||||
|
const DB_PATH = `${DATA_MOUNT_PATH}/crm.db`
|
||||||
|
const INGEST_DIR = '/app/backend/ingest'
|
||||||
|
|
||||||
|
// OPERATOR: Spark Control + Qdrant endpoints for the ingest run. These are the
|
||||||
|
// LAN defaults for the Ten31 deployment — edit them for your network. Keep them
|
||||||
|
// in sync with the export block in docker_entrypoint.sh (there is not yet a single source of truth
|
||||||
|
// for the values; this action needs its own copy because it does not run the
|
||||||
|
// entrypoint). Spark Control is TLS with a self-signed cert by default, hence
|
||||||
|
// SPARK_CONTROL_VERIFY_TLS = 'false'.
|
||||||
|
const ingestEnv: { [k: string]: string } = {
|
||||||
|
CRM_DB_PATH: DB_PATH,
|
||||||
|
SPARK_CONTROL_URL: 'https://192.168.1.72:62419',
|
||||||
|
SPARK_CONTROL_VERIFY_TLS: 'false',
|
||||||
|
QDRANT_URL: 'http://192.168.1.87:6333',
|
||||||
|
}
|
||||||
|
|
||||||
|
export const buildSearchIndex = sdk.Action.withoutInput(
|
||||||
|
// id
|
||||||
|
'build-search-index',
|
||||||
|
|
||||||
|
// metadata
|
||||||
|
async ({ effects }) => ({
|
||||||
|
name: i18n('Build search index'),
|
||||||
|
description: i18n(
|
||||||
|
'One-time Phase-0 init: builds the canonical entity ids from your live ' +
|
||||||
|
'CRM (entity_resolution.py), then chunks + embeds every record into ' +
|
||||||
|
'the Qdrant search index (backfill.py --recreate). Both steps are ' +
|
||||||
|
'idempotent and read-only on your CRM source tables. Requires Spark ' +
|
||||||
|
'Control and Qdrant to be reachable (set SPARK_CONTROL_URL / ' +
|
||||||
|
'QDRANT_URL). A full re-embed takes roughly 8–15 minutes.',
|
||||||
|
),
|
||||||
|
warning: i18n(
|
||||||
|
'Rebuilds the Qdrant `crm_chunks` collection (--recreate drops and ' +
|
||||||
|
'recreates it). The index is derived from the CRM and safe to rebuild; ' +
|
||||||
|
'no CRM source data is modified.',
|
||||||
|
),
|
||||||
|
allowedStatuses: 'any',
|
||||||
|
group: null,
|
||||||
|
visibility: 'enabled',
|
||||||
|
}),
|
||||||
|
|
||||||
|
// execution
|
||||||
|
async ({ effects }) => {
|
||||||
|
const env = ingestEnv
|
||||||
|
|
||||||
|
const subcontainer = await sdk.SubContainer.of(
|
||||||
|
effects,
|
||||||
|
{ imageId: IMAGE_ID },
|
||||||
|
sdk.Mounts.of().mountVolume({
|
||||||
|
volumeId: 'main',
|
||||||
|
subpath: null,
|
||||||
|
mountpoint: DATA_MOUNT_PATH,
|
||||||
|
readonly: false,
|
||||||
|
}),
|
||||||
|
'ten31-database-build-search-index',
|
||||||
|
)
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Step 3 — canonical ids from the real data (fast, local-only).
|
||||||
|
await subcontainer.execFail(
|
||||||
|
['python3', 'entity_resolution.py', '--db', DB_PATH],
|
||||||
|
{ cwd: INGEST_DIR, env },
|
||||||
|
// 10 minutes — pure SQLite work, but generous for a large corpus.
|
||||||
|
10 * 60 * 1000,
|
||||||
|
)
|
||||||
|
|
||||||
|
// Step 4 — chunk → dense (Spark Control) + BM25 → Qdrant upsert.
|
||||||
|
await subcontainer.execFail(
|
||||||
|
['python3', 'backfill.py', '--db', DB_PATH, '--recreate'],
|
||||||
|
{ cwd: INGEST_DIR, env },
|
||||||
|
// 30 minutes — a full re-embed is ~8–15 min; leave generous headroom.
|
||||||
|
30 * 60 * 1000,
|
||||||
|
)
|
||||||
|
} finally {
|
||||||
|
await subcontainer.destroy()
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
version: '1',
|
||||||
|
title: i18n('Search index built'),
|
||||||
|
message: i18n(
|
||||||
|
'Canonical entity ids were resolved and the Qdrant `crm_chunks` ' +
|
||||||
|
'collection was rebuilt from your live CRM. You can re-run this ' +
|
||||||
|
'action any time to refresh the index.',
|
||||||
|
),
|
||||||
|
result: null,
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
import { sdk } from '../sdk'
|
import { sdk } from '../sdk'
|
||||||
|
import { buildSearchIndex } from './buildSearchIndex'
|
||||||
|
|
||||||
export const actions = sdk.Actions.of()
|
export const actions = sdk.Actions.of().addAction(buildSearchIndex)
|
||||||
|
|||||||
@@ -3,12 +3,14 @@
|
|||||||
// from manifest/index.ts (id, title) and versions/ (version).
|
// from manifest/index.ts (id, title) and versions/ (version).
|
||||||
export const PACKAGE_ID = 'ten-database'
|
export const PACKAGE_ID = 'ten-database'
|
||||||
export const PACKAGE_TITLE = 'Ten31 Database'
|
export const PACKAGE_TITLE = 'Ten31 Database'
|
||||||
// ExVer form of the current 0.4 wrapper release (upstream 0.1.0, wrapper rev 41).
|
// ExVer form of the current 0.4 wrapper release (upstream 0.1.0, wrapper rev 44).
|
||||||
// * 0.3.5 wrapper: 0.1.0.38 (legacy, aarch64)
|
// * 0.3.5 wrapper: 0.1.0.38 (legacy, aarch64)
|
||||||
// * First 0.4: 0.1.0:39 (shipped seed snapshot for migration)
|
// * First 0.4: 0.1.0:39 (shipped seed snapshot for migration)
|
||||||
// * Cleanup: 0.1.0:40 (seed removed + multi-threaded server + abuser auto-ban)
|
// * Cleanup: 0.1.0:40 (seed removed + multi-threaded server + abuser auto-ban)
|
||||||
// * Current: 0.1.0:41 (frontend persists auth across refreshes)
|
// * 0.1.0:41 (frontend persists auth across refreshes)
|
||||||
export const PACKAGE_VERSION = '0.1.0:41'
|
// * 0.1.0:42 (Gmail integration) / 0.1.0:43 (Gmail POST-body hotfix)
|
||||||
|
// * Current: 0.1.0:44 (Phase-0 ingest + MCP server in image; build-index action)
|
||||||
|
export const PACKAGE_VERSION = '0.1.0:44'
|
||||||
|
|
||||||
export const DATA_MOUNT_PATH = '/data'
|
export const DATA_MOUNT_PATH = '/data'
|
||||||
export const WEB_PORT = 8080
|
export const WEB_PORT = 8080
|
||||||
|
|||||||
@@ -4,8 +4,9 @@ import { v_0_1_0_40 } from './v0.1.0.40'
|
|||||||
import { v_0_1_0_41 } from './v0.1.0.41'
|
import { v_0_1_0_41 } from './v0.1.0.41'
|
||||||
import { v_0_1_0_42 } from './v0.1.0.42'
|
import { v_0_1_0_42 } from './v0.1.0.42'
|
||||||
import { v_0_1_0_43 } from './v0.1.0.43'
|
import { v_0_1_0_43 } from './v0.1.0.43'
|
||||||
|
import { v_0_1_0_44 } from './v0.1.0.44'
|
||||||
|
|
||||||
export const versionGraph = VersionGraph.of({
|
export const versionGraph = VersionGraph.of({
|
||||||
current: v_0_1_0_43,
|
current: v_0_1_0_44,
|
||||||
other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42],
|
other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42, v_0_1_0_43],
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -0,0 +1,47 @@
|
|||||||
|
import { VersionInfo } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
|
// Phase-0 substrate packaging release.
|
||||||
|
//
|
||||||
|
// Context:
|
||||||
|
// * Ships the Phase-0 ingest pipeline (backend/ingest/) and the CRM MCP
|
||||||
|
// server (backend/mcp/) inside the existing CRM container image, alongside
|
||||||
|
// the web server. Two runtime deps are added to the image: `fastembed`
|
||||||
|
// (client-side BM25 for the sparse retrieval leg) and `mcp` (the MCP
|
||||||
|
// Python SDK, used only to run backend/mcp/server.py). The CRM web server
|
||||||
|
// itself gains no new dependencies and is unchanged.
|
||||||
|
// * Adds a one-shot "Build search index" StartOS action that runs the
|
||||||
|
// one-time init on the box where /data/crm.db lives:
|
||||||
|
// entity_resolution.py --db /data/crm.db (canonical ids)
|
||||||
|
// backfill.py --db /data/crm.db --recreate (Qdrant search index)
|
||||||
|
// Both steps are idempotent and read-only on the CRM source tables.
|
||||||
|
// * docker_entrypoint.sh now exports the Spark Control / Qdrant env
|
||||||
|
// (SPARK_CONTROL_URL, SPARK_CONTROL_VERIFY_TLS, QDRANT_URL) with LAN
|
||||||
|
// defaults so manual ingest / MCP runs on the box inherit them.
|
||||||
|
//
|
||||||
|
// The MCP server is intentionally NOT a daemon in this release: it is an
|
||||||
|
// stdio server with no port to bind and (in Phase 0) no live agent on the box
|
||||||
|
// to talk to it, so it is run manually for testing. See
|
||||||
|
// start9/0.4/INGEST_PACKAGING.md.
|
||||||
|
//
|
||||||
|
// No schema changes and no data migration: the SQLite schema is unchanged and
|
||||||
|
// the live /data volume is left exactly as-is. The new tables the ingest
|
||||||
|
// pipeline reads/writes are created by the CRM's own migration runner
|
||||||
|
// (migrations/0001_phase0_foundation.sql), independent of this package change.
|
||||||
|
export const v_0_1_0_44 = VersionInfo.of({
|
||||||
|
version: '0.1.0:44',
|
||||||
|
releaseNotes: {
|
||||||
|
en_US: [
|
||||||
|
'Ships the Phase-0 data substrate inside the CRM image: the ingest',
|
||||||
|
'pipeline (entity resolution + Qdrant backfill) and the CRM MCP server,',
|
||||||
|
'plus the fastembed and mcp runtime dependencies. Adds a one-time',
|
||||||
|
'"Build search index" action that resolves canonical entity ids from',
|
||||||
|
'your live CRM and rebuilds the Qdrant search index — both steps are',
|
||||||
|
'idempotent and read-only on your CRM source data. The CRM web server',
|
||||||
|
'is unchanged and gains no new dependencies. No data migration.',
|
||||||
|
].join(' '),
|
||||||
|
},
|
||||||
|
migrations: {
|
||||||
|
up: async () => {},
|
||||||
|
down: async () => {},
|
||||||
|
},
|
||||||
|
})
|
||||||
Reference in New Issue
Block a user