From f357c23c75eeffde985dbb833d9e84751e037aac Mon Sep 17 00:00:00 2001 From: Keysat Date: Fri, 5 Jun 2026 08:55:12 -0500 Subject: [PATCH] Phase 0 complete: fuzzy entity tier, incremental sync, Start9 packaging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fuzzy tier (backend/ingest/fuzzy_resolve.py + llm.py): local Qwen adjudicates the deterministic resolver's flagged name-variant candidates; merges are durable via entity_merges (deterministic re-runs respect them), losers soft-deleted, logged. Idempotent. - Incremental sync (backend/ingest/sync.py): re-embeds only rows changed since a watermark (ingest_sync_state); first run / --recreate = full. Tested full→0→1. - Start9 packaging (start9/0.4): Dockerfile bundles ingest+mcp + fastembed/mcp; "Build search index" action runs the init in a subcontainer; MCP shipped as a manual stdio server (not a daemon); version 0.1.0:44. INGEST_PACKAGING.md. - backfill.py: factored embed_and_upsert() shared with sync. Verified end-to-end on synthetic data + live Sparks/Qwen/Qdrant. 
Co-Authored-By: Claude Opus 4.8 --- backend/ingest/backfill.py | 31 ++-- backend/ingest/config.py | 1 + backend/ingest/entity_resolution.py | 47 ++++-- backend/ingest/fuzzy_resolve.py | 116 ++++++++++++++ backend/ingest/llm.py | 39 +++++ backend/ingest/sparse.py | 48 ++++-- backend/ingest/sync.py | 126 ++++++++++++++++ docs/go-live-runbook.md | 87 +++++++++++ start9/0.4/Dockerfile | 21 ++- start9/0.4/INGEST_PACKAGING.md | 142 ++++++++++++++++++ start9/0.4/docker_entrypoint.sh | 17 +++ .../0.4/startos/actions/buildSearchIndex.ts | 118 +++++++++++++++ start9/0.4/startos/actions/index.ts | 3 +- start9/0.4/startos/utils.ts | 8 +- start9/0.4/startos/versions/index.ts | 5 +- start9/0.4/startos/versions/v0.1.0.44.ts | 47 ++++++ 16 files changed, 808 insertions(+), 48 deletions(-) create mode 100644 backend/ingest/fuzzy_resolve.py create mode 100644 backend/ingest/llm.py create mode 100644 backend/ingest/sync.py create mode 100644 docs/go-live-runbook.md create mode 100644 start9/0.4/INGEST_PACKAGING.md create mode 100644 start9/0.4/startos/actions/buildSearchIndex.ts create mode 100644 start9/0.4/startos/versions/v0.1.0.44.ts diff --git a/backend/ingest/backfill.py b/backend/ingest/backfill.py index ea158a8..708480d 100644 --- a/backend/ingest/backfill.py +++ b/backend/ingest/backfill.py @@ -17,17 +17,9 @@ import qdrant_io import sparse -def run(db, recreate=False, batch=32): - conn = sqlite3.connect(db) - conn.row_factory = sqlite3.Row - chunks = chunking.build_chunks(conn) - conn.close() - print(f"Built {len(chunks)} chunks from {db}") - - state = qdrant_io.create_collection(recreate=recreate) - qdrant_io.ensure_indexes() - print(f"Collection '{config.COLLECTION}': {state}") - +def embed_and_upsert(chunks, batch=32, progress=True): + """Embed (dense + sparse) and upsert a list of chunks to Qdrant. Shared by the + full backfill and the incremental sync. 
Returns the number of points written.""" total = 0 for i in range(0, len(chunks), batch): group = chunks[i:i + batch] @@ -46,8 +38,23 @@ def run(db, recreate=False, batch=32): }) qdrant_io.upsert(points) total += len(points) - print(f" upserted {total}/{len(chunks)}") + if progress: + print(f" upserted {total}/{len(chunks)}") + return total + +def run(db, recreate=False, batch=32): + conn = sqlite3.connect(db) + conn.row_factory = sqlite3.Row + chunks = chunking.build_chunks(conn) + conn.close() + print(f"Built {len(chunks)} chunks from {db}") + + state = qdrant_io.create_collection(recreate=recreate) + qdrant_io.ensure_indexes() + print(f"Collection '{config.COLLECTION}': {state}") + + embed_and_upsert(chunks, batch=batch) print(f"Done. Qdrant '{config.COLLECTION}' now holds {qdrant_io.count()} points.") diff --git a/backend/ingest/config.py b/backend/ingest/config.py index 4186b77..686ffcf 100644 --- a/backend/ingest/config.py +++ b/backend/ingest/config.py @@ -24,5 +24,6 @@ SPARK_VERIFY_TLS = os.environ.get("SPARK_CONTROL_VERIFY_TLS", "false").lower() i QDRANT_URL = os.environ.get("QDRANT_URL", "").rstrip("/") COLLECTION = os.environ.get("CRM_QDRANT_COLLECTION", "crm_chunks") EMBED_MODEL = os.environ.get("CRM_EMBED_MODEL", "BAAI/bge-m3") +CHAT_MODEL = os.environ.get("CRM_CHAT_MODEL", "RedHatAI/Qwen3.6-35B-A3B-NVFP4") DENSE_DIM = int(os.environ.get("CRM_EMBED_DIM", "1024")) DEFAULT_DB = os.environ.get("CRM_DEV_DB_PATH", os.path.join(_ROOT, "data", "crm_dev.db")) diff --git a/backend/ingest/entity_resolution.py b/backend/ingest/entity_resolution.py index 49a5c04..7523e4b 100644 --- a/backend/ingest/entity_resolution.py +++ b/backend/ingest/entity_resolution.py @@ -69,6 +69,16 @@ def _split_name(full: str): return parts[0], parts[-1] if len(parts) > 1 else "" +def _redirect(merge_map, eid): + """Follow durable fuzzy-merge redirects (entity_merges) so deterministic + re-runs respect prior merges instead of recreating the merged-away entity.""" + seen = set() + while 
eid in merge_map and eid not in seen: + seen.add(eid) + eid = merge_map[eid] + return eid + + # ── upsert helpers ──────────────────────────────────────────────────────────── def _upsert_entity(conn, eid, kind, display_name, primary_email): @@ -102,12 +112,13 @@ def _link(conn, canonical_id, source_model, source_id, match_value, match_kind, # ── resolution passes ───────────────────────────────────────────────────────── -def resolve_organizations(conn): +def resolve_organizations(conn, merge_map=None): """Merge organizations + fundraising_investors by normalized name. Returns (org_canon_by_orgid, org_canon_by_fundinv) so the people pass can attach each person to their firm's canonical id. """ + merge_map = merge_map or {} groups = defaultdict(lambda: {"orgs": [], "investors": [], "name": "", "email": ""}) for r in conn.execute("SELECT id, name, email FROM organizations"): @@ -135,7 +146,7 @@ def resolve_organizations(conn): # An org we are actively raising from (has a fundraising row) is an 'lp'; # otherwise a plain 'organization'. kind = "lp" if g["investors"] else "organization" - cid = _eid("lp" if kind == "lp" else "org", key) + cid = _redirect(merge_map, _eid("lp" if kind == "lp" else "org", key)) _upsert_entity(conn, cid, kind, g["name"], g["email"]) for oid in g["orgs"]: _link(conn, cid, "organizations", oid, key, "exact_name", 1.0) @@ -147,9 +158,10 @@ def resolve_organizations(conn): return org_canon_by_orgid, org_canon_by_fundinv -def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv): +def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv, merge_map=None): """Merge contacts + fundraising_contacts by exact email, else exact name within the same canonical org. 
Returns contact_id -> person canonical id (for lp_profiles).""" + merge_map = merge_map or {} # gather (model, source_id, full_name, email, org_canon) people = [] for r in conn.execute("SELECT id, first_name, last_name, email, organization_id FROM contacts"): @@ -173,7 +185,7 @@ def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv): match_kind, conf, match_value = "name_org", 0.8, name_norm else: continue - cid = _eid("per", key) + cid = _redirect(merge_map, _eid("per", key)) display = full.strip() or email _upsert_entity(conn, cid, "person", display, email) _link(conn, cid, model, sid, match_value, match_kind, conf) @@ -210,17 +222,32 @@ def run(db_path: str): conn.row_factory = sqlite3.Row conn.execute("PRAGMA foreign_keys=ON") - org_by_oid, org_by_inv = resolve_organizations(conn) + # Durable fuzzy-merge map (entity_merges) so deterministic re-runs respect + # prior local-Qwen merges instead of recreating merged-away entities. + conn.execute("""CREATE TABLE IF NOT EXISTS entity_merges ( + merged_id TEXT PRIMARY KEY, + survivor_id TEXT NOT NULL, + confidence REAL, + reason TEXT, + created_at TEXT DEFAULT (datetime('now')) + )""") + merge_map = {r["merged_id"]: r["survivor_id"] + for r in conn.execute("SELECT merged_id, survivor_id FROM entity_merges")} + + org_by_oid, org_by_inv = resolve_organizations(conn, merge_map) conn.commit() - person_meta = resolve_people(conn, org_by_oid, org_by_inv) + person_meta = resolve_people(conn, org_by_oid, org_by_inv, merge_map) conn.commit() candidates = find_fuzzy_candidates(person_meta) + # Counts report LIVE entities (deleted_at IS NULL); fuzzy-merged losers are + # soft-deleted tombstones (guardrail #3) and excluded. 
+ live = "deleted_at IS NULL" counts = { - "canonical_total": conn.execute("SELECT COUNT(*) FROM canonical_entities").fetchone()[0], - "lp": conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp'").fetchone()[0], - "organization": conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization'").fetchone()[0], - "person": conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person'").fetchone()[0], + "canonical_total": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE {live}").fetchone()[0], + "lp": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp' AND {live}").fetchone()[0], + "organization": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization' AND {live}").fetchone()[0], + "person": conn.execute(f"SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND {live}").fetchone()[0], "links": conn.execute("SELECT COUNT(*) FROM entity_links").fetchone()[0], "fuzzy_candidates": len(candidates), } diff --git a/backend/ingest/fuzzy_resolve.py b/backend/ingest/fuzzy_resolve.py new file mode 100644 index 0000000..f7ae93b --- /dev/null +++ b/backend/ingest/fuzzy_resolve.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +"""Phase-0 Workstream B3 — fuzzy entity-resolution tier (local Qwen). + +The deterministic tier (entity_resolution.py) merges only provable matches and +FLAGS the hard name-variant candidates (same firm + surname, different first +name/email) without guessing. This tier asks the local Qwen model (Spark Control +/v1/chat/completions — sovereign, on Ten31 infra) to adjudicate each candidate +and merges the confirmed ones. + +A merge repoints the loser's entity_links to the survivor and soft-deletes the +loser canonical entity (deleted_at; never hard-deleted — guardrail #3). Every +merge is written to the interaction_log (guardrail #5). Idempotent: re-running +finds no new candidates once merged. 
+ + python3 backend/ingest/fuzzy_resolve.py --db data/crm_dev.db + python3 backend/ingest/fuzzy_resolve.py --db data/crm_dev.db --dry-run +""" +import argparse +import json +import sqlite3 +import uuid +from datetime import datetime, timezone + +import entity_resolution as er +import llm + +_SYSTEM = ("You are an entity-resolution assistant for a CRM. Decide if the listed " + "people are the SAME individual recorded under name variants (e.g. nicknames " + "like Kate/Katherine, Bill/William), or DIFFERENT people who happen to share a " + "surname and firm. Be conservative: only say same when a nickname/abbreviation " + "relationship or matching contact info makes it clear.") + + +def _now(): + return datetime.now(timezone.utc).isoformat() + + +def _ask(members, firm): + people = "; ".join(f"{n}" + (f" <{e}>" if e else "") for _, n, e in members) + prompt = (f"Firm: {firm or 'unknown'}\nPeople: {people}\n\n" + "Are these the SAME person under name variants? " + 'Answer only JSON: {"same": true|false, "confidence": 0.0-1.0, "reason": "..."}') + return llm.chat_json(prompt, system=_SYSTEM, max_tokens=160) or {"same": False, "confidence": 0.0} + + +def _survivor(members): + # Prefer a member with an email, then the longest (most complete) name. 
+ return sorted(members, key=lambda m: (bool(m[2]), len(m[1])), reverse=True)[0] + + +def run(db, threshold=0.7, dry_run=False): + counts, candidates = er.run(db) # ensure deterministic state + fresh candidates + conn = sqlite3.connect(db) + conn.row_factory = sqlite3.Row + conn.execute("PRAGMA foreign_keys=ON") + name_of = {r["id"]: r["display_name"] for r in conn.execute("SELECT id, display_name FROM canonical_entities")} + + merges = [] + for cand in candidates: + members = cand["members"] + verdict = _ask(members, name_of.get(cand["org"])) + same = bool(verdict.get("same")) and float(verdict.get("confidence", 0)) >= threshold + decision = {"surname": cand["surname"], "firm": name_of.get(cand["org"]), + "members": [{"id": m[0], "name": m[1]} for m in members], + "same": same, "confidence": verdict.get("confidence"), + "reason": verdict.get("reason")} + if same: + keep = _survivor(members) + losers = [m for m in members if m[0] != keep[0]] + decision["merged_into"] = {"id": keep[0], "name": keep[1]} + if not dry_run: + for loser in losers: + # Record the merge durably so deterministic re-runs respect it. + conn.execute("""INSERT INTO entity_merges (merged_id, survivor_id, confidence, reason, created_at) + VALUES (?,?,?,?,?) + ON CONFLICT(merged_id) DO UPDATE SET survivor_id=excluded.survivor_id, + confidence=excluded.confidence, reason=excluded.reason""", + (loser[0], keep[0], verdict.get("confidence", 0.7), + verdict.get("reason"), _now())) + conn.execute("UPDATE entity_links SET canonical_id=?, match_kind='fuzzy_merge', confidence=? " + "WHERE canonical_id=?", (keep[0], verdict.get("confidence", 0.7), loser[0])) + conn.execute("UPDATE canonical_entities SET deleted_at=?, updated_at=? 
WHERE id=?", + (_now(), _now(), loser[0])) + conn.execute("""INSERT INTO interaction_log + (id, ts, actor_type, actor_id, action, target_type, target_id, payload, source, created_at) + VALUES (?,?,?,?,?,?,?,?,?,?)""", + (str(uuid.uuid4()), _now(), "agent", "qwen_entity_resolver", "entity.merged", + "canonical_entity", keep[0], json.dumps(decision), "ingest", _now())) + merges.append(decision) + + if not dry_run: + conn.commit() + live_people = conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND deleted_at IS NULL").fetchone()[0] + conn.close() + return merges, live_people + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--db", default="data/crm_dev.db") + ap.add_argument("--threshold", type=float, default=0.7) + ap.add_argument("--dry-run", action="store_true") + args = ap.parse_args() + merges, live = run(args.db, threshold=args.threshold, dry_run=args.dry_run) + print(f"Adjudicated {len(merges)} candidate group(s){' (dry run)' if args.dry_run else ''}:") + for m in merges: + names = " / ".join(p["name"] for p in m["members"]) + verdict = f"MERGE -> {m['merged_into']['name']}" if m.get("merged_into") else "keep separate" + print(f" [{m['surname']}] {names}: same={m['same']} conf={m['confidence']} => {verdict}") + if m.get("reason"): + print(f" reason: {m['reason']}") + print(f"Live person entities now: {live}") + + +if __name__ == "__main__": + main() diff --git a/backend/ingest/llm.py b/backend/ingest/llm.py new file mode 100644 index 0000000..a18384d --- /dev/null +++ b/backend/ingest/llm.py @@ -0,0 +1,39 @@ +"""Local Qwen chat client via Spark Control /v1/chat/completions. + +Used for the privacy-sensitive, high-volume reasoning that must stay on Ten31 +infra (entity-resolution adjudication, triage). Frontier reasoning still goes to +Claude; this is the local leg. Thinking is disabled for fast structured output. 
+""" +import json +import re + +import config +import http_util + + +def chat(prompt, system=None, max_tokens=200, temperature=0.0): + messages = [] + if system: + messages.append({"role": "system", "content": system}) + messages.append({"role": "user", "content": prompt}) + body = {"model": config.CHAT_MODEL, "messages": messages, + "temperature": temperature, "max_tokens": max_tokens, + "chat_template_kwargs": {"enable_thinking": False}} + status, data = http_util.request("POST", f"{config.SPARK_CONTROL_URL}/v1/chat/completions", + body, verify=config.SPARK_VERIFY_TLS) + if status != 200: + raise RuntimeError(f"/v1/chat/completions -> {status}: {data}") + return (data["choices"][0]["message"].get("content") or "").strip() + + +def chat_json(prompt, system=None, max_tokens=200): + """Chat and parse the first JSON object from the reply (tolerant of fences).""" + raw = chat(prompt, system=system, max_tokens=max_tokens) + raw = re.sub(r"^```(json)?|```$", "", raw.strip(), flags=re.MULTILINE).strip() + m = re.search(r"\{.*\}", raw, re.DOTALL) + if not m: + return None + try: + return json.loads(m.group(0)) + except json.JSONDecodeError: + return None diff --git a/backend/ingest/sparse.py b/backend/ingest/sparse.py index eac9f60..f591d56 100644 --- a/backend/ingest/sparse.py +++ b/backend/ingest/sparse.py @@ -16,25 +16,41 @@ import hashlib import math import re -_TOKEN_RE = re.compile(r"[a-z0-9]+") +# Prefer FastEmbed Qdrant/bm25 (the EMBEDDINGS.md-specified encoder) when it is +# installable — true on the Start9 box (Python 3.11). Fall back to the +# dependency-free encoder below where it is not (e.g. this dev Mac on 3.14). +# Whichever is active, ingest and query in the SAME environment use it, so they +# stay consistent; production rebuilds the index on the box, so it uses FastEmbed +# end-to-end. BACKEND reports which is live. 
+try: + from fastembed import SparseTextEmbedding # type: ignore + _MODEL = None + def _model(): + global _MODEL + if _MODEL is None: + _MODEL = SparseTextEmbedding(model_name="Qdrant/bm25") + return _MODEL -def tokenize(text: str): - return _TOKEN_RE.findall((text or "").lower()) + def encode(text: str): + emb = next(_model().embed([text or ""])) + return {"indices": [int(i) for i in emb.indices], "values": [float(v) for v in emb.values]} + BACKEND = "fastembed:Qdrant/bm25" +except Exception: + BACKEND = "pure-python-bm25" + _TOKEN_RE = re.compile(r"[a-z0-9]+") -def _index(token: str) -> int: - # Stable unsigned 32-bit index for a token (Qdrant sparse indices are u32). - return int.from_bytes(hashlib.md5(token.encode("utf-8")).digest()[:4], "big") + def tokenize(text: str): + return _TOKEN_RE.findall((text or "").lower()) + def _index(token: str) -> int: + # Stable unsigned 32-bit index for a token (Qdrant sparse indices are u32). + return int.from_bytes(hashlib.md5(token.encode("utf-8")).digest()[:4], "big") -def encode(text: str): - """Return a sparse vector {indices, values}. Value is 1 + ln(tf) (sublinear - term frequency); IDF is applied by Qdrant via modifier:idf.""" - tf = {} - for tok in tokenize(text): - tf[tok] = tf.get(tok, 0) + 1 - idx_val = {} - for tok, count in tf.items(): - idx_val[_index(tok)] = 1.0 + math.log(count) - return {"indices": list(idx_val.keys()), "values": list(idx_val.values())} + def encode(text: str): + """Sparse vector {indices, values}; value = 1 + ln(tf). Qdrant applies IDF.""" + tf = {} + for tok in tokenize(text): + tf[tok] = tf.get(tok, 0) + 1 + return {"indices": [_index(t) for t in tf], "values": [1.0 + math.log(c) for c in tf.values()]} diff --git a/backend/ingest/sync.py b/backend/ingest/sync.py new file mode 100644 index 0000000..a7879b8 --- /dev/null +++ b/backend/ingest/sync.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +"""Phase-0 Workstream B4 — incremental, idempotent CRM -> Qdrant sync. 
+ +One command that keeps the index fresh: + 1. Re-run deterministic entity resolution (cheap, idempotent, respects durable + fuzzy merges). Optionally re-run the local-Qwen fuzzy tier (--fuzzy). + 2. Re-embed ONLY the source rows changed since the last sync (by updated_at); + the first run (or --recreate) is a full backfill. + 3. Upsert with deterministic point ids (overwrite in place) and advance the + watermark. Logged to interaction_log. + +Idempotent: re-running with no CRM changes embeds nothing. Watermark lives in an +`ingest_sync_state` table the pipeline owns. + + python3 backend/ingest/sync.py --db data/crm_dev.db # incremental (full on first run) + python3 backend/ingest/sync.py --db data/crm_dev.db --recreate # force full rebuild + python3 backend/ingest/sync.py --db data/crm_dev.db --fuzzy # also run the Qwen fuzzy tier + +LIMITATION: the CRM hard-deletes today, so a removed row's chunk is not pruned +incrementally (no tombstone). Until the DELETE handlers honor `deleted_at`, run a +periodic `--recreate` (or `backfill.py --recreate`) to drop orphans. Structural +entity-id changes (merges) are likewise best followed by a periodic full rebuild. +""" +import argparse +import json +import sqlite3 +import uuid +from datetime import datetime, timezone + +import backfill +import chunking +import config +import entity_resolution as er +import qdrant_io + +_CHANGE_TABLES = [("communications", "communications"), ("contacts", "contacts"), + ("lp_profiles", "lp_profiles"), ("opportunities", "opportunities"), + ("organizations", "organizations"), ("fundraising_investors", "fundraising_investors")] + + +def _now(): + # Match the CRM's updated_at format ("...Z") so the watermark compares + # correctly against source-row updated_at (server.now() in server.py). 
+ return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z" + + +def _ensure_state(conn): + conn.execute("""CREATE TABLE IF NOT EXISTS ingest_sync_state ( + key TEXT PRIMARY KEY, value TEXT, updated_at TEXT DEFAULT (datetime('now')))""") + + +def _state_get(conn, key): + r = conn.execute("SELECT value FROM ingest_sync_state WHERE key=?", (key,)).fetchone() + return r[0] if r else None + + +def _state_set(conn, key, value): + conn.execute("""INSERT INTO ingest_sync_state (key, value, updated_at) VALUES (?,?,?) + ON CONFLICT(key) DO UPDATE SET value=excluded.value, updated_at=excluded.updated_at""", + (key, value, _now())) + + +def _changed_source_ids(conn, since): + changed = set() + for tbl, model in _CHANGE_TABLES: + for r in conn.execute(f"SELECT id FROM {tbl} WHERE updated_at > ?", (since,)): + changed.add((model, r["id"])) + if chunking._has_table(conn, "emails"): + for r in conn.execute("SELECT id FROM emails WHERE updated_at > ? AND is_matched=1", (since,)): + changed.add(("emails", r["id"])) + return changed + + +def run(db, recreate=False, fuzzy=False, batch=32): + # 1. 
refresh the canonical layer (deterministic always; fuzzy on request) + er.run(db) + if fuzzy: + import fuzzy_resolve + fuzzy_resolve.run(db) + + conn = sqlite3.connect(db) + conn.row_factory = sqlite3.Row + _ensure_state(conn) + last = _state_get(conn, "last_sync_ts") + run_start = _now() + + qdrant_io.create_collection(recreate=recreate) + qdrant_io.ensure_indexes() + + all_chunks = chunking.build_chunks(conn) + if last is None or recreate: + mode, target = "full", all_chunks + else: + changed = _changed_source_ids(conn, last) + mode, target = "incremental", [c for c in all_chunks + if (c["source_model"], c["source_id"]) in changed] + + written = backfill.embed_and_upsert(target, batch=batch, progress=False) + _state_set(conn, "last_sync_ts", run_start) + + summary = {"mode": mode, "rows_embedded": written, "total_chunks": len(all_chunks), + "qdrant_points": qdrant_io.count()} + conn.execute("""INSERT INTO interaction_log + (id, ts, actor_type, actor_id, action, target_type, payload, source, created_at) + VALUES (?,?,?,?,?,?,?,?,?)""", + (str(uuid.uuid4()), _now(), "system", "ingest_sync", "ingest.sync", "crm_chunks", + json.dumps(summary), "ingest", _now())) + conn.commit() + conn.close() + return summary + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--db", default=config.DEFAULT_DB) + ap.add_argument("--recreate", action="store_true") + ap.add_argument("--fuzzy", action="store_true") + ap.add_argument("--batch", type=int, default=32) + args = ap.parse_args() + s = run(args.db, recreate=args.recreate, fuzzy=args.fuzzy, batch=args.batch) + print(f"Sync ({s['mode']}): embedded {s['rows_embedded']} chunk(s); " + f"{s['total_chunks']} total; Qdrant now holds {s['qdrant_points']} points.") + + +if __name__ == "__main__": + main() diff --git a/docs/go-live-runbook.md b/docs/go-live-runbook.md new file mode 100644 index 0000000..87c18de --- /dev/null +++ b/docs/go-live-runbook.md @@ -0,0 +1,87 @@ +# Go-Live Runbook — Phase 0 substrate on the live 
Start9 box + +*How to take the Phase-0 data substrate from "tested on synthetic data" to "running against the real CRM" on the Start9 server. You run this on your infrastructure; no real LP data goes to Claude/Anthropic (guardrails #1, #9). The live `/data/crm.db` on the box is the canonical source — not the possibly-stale `start9/0.4/seed/` snapshot.* + +Recap of the three moves (see also `docs/crm-overview.md`): (1) ship code → empty new tables appear; (2) run the one-time init → fills the canonical IDs + search index from your real data; (3) run the MCP server. + +--- + +## Prerequisites + +- Spark Control + Qdrant reachable from the box: `SPARK_CONTROL_URL`, `QDRANT_URL` (see `.env.example`). Verify with `curl -sk $SPARK_CONTROL_URL/api/endpoints`. +- The `backend/ingest/` + `backend/mcp/` code present on the box (ships with the package — see "Packaging decision" below). +- Python deps in the ingest environment: `fastembed` (BM25; installs cleanly on the box's Python 3.11) and `mcp` (only to run the MCP server). The CRM server itself needs no new deps. + +## Step 1 — Deploy the new CRM version (auto-creates the empty tables) + +1. Bump the package version, rebuild the `.s9pk`, sideload it. StartOS preserves `/data`, so your real data is undisturbed. +2. On first boot, `init_db()` runs `backend/core_migrations.py`, which applies `migrations/0001_phase0_foundation.sql` **once** (tracked in `schema_migrations`) — additively creating `canonical_entities`, `entity_links`, `interaction_log`, `relationship_edges`, and the `deleted_at` columns. Nothing existing changes. +3. Verify: `sqlite3 /data/crm.db "SELECT filename FROM schema_migrations;"` → should list `0001_phase0_foundation.sql`. 
+ +## Step 2 — Prepare the ingest environment (on the box) + +```bash +pip install fastembed # BM25 Qdrant/bm25 (sparse.py auto-detects it) +export CRM_DB_PATH=/data/crm.db +export SPARK_CONTROL_URL=https://192.168.1.72:62419 +export SPARK_CONTROL_VERIFY_TLS=false +export QDRANT_URL=http://192.168.1.87:6333 +``` + +`sparse.py` will report `BACKEND = fastembed:Qdrant/bm25` here (vs the pure-Python fallback used on the dev Mac). Because the index is built **and** queried on the box, the encoder is consistent end-to-end. + +## Step 3 — Build the canonical IDs from your real data + +```bash +python3 backend/ingest/entity_resolution.py --db /data/crm.db --show-candidates +``` + +This reads your real contacts / fundraising investors / organizations and fills `canonical_entities` + `entity_links` (the "create entity IDs from existing data" step). It is **read-only on your CRM source tables**, idempotent, and logs a run to `interaction_log`. Review the printed fuzzy candidates — those are the name-variant pairs the deterministic tier wouldn't merge on a guess (the local-Qwen fuzzy tier, `backend/ingest/fuzzy_resolve.py`, adjudicates these; run it with `--dry-run` first to preview its decisions). + +## Step 4 — Build the search index + +```bash +python3 backend/ingest/backfill.py --db /data/crm.db --recreate +``` + +Chunks your real records → dense (bge-m3 via Spark Control) + BM25 sparse → upserts to Qdrant `crm_chunks`. ~8–15 min for a full corpus. Idempotent (deterministic point ids), so re-running is safe. `--recreate` drops and rebuilds the collection; omit it to update in place. + +Note: your live CRM's text is concentrated in the **fundraising grid notes** + grid contacts (the seed snapshot had 0 communications / 0 lp_profiles), plus Gmail once enabled (see `docs/gmail-enablement-runbook.md`). The chunker already handles all of these. + +## Step 5 — Start the MCP server + +```bash +pip install mcp +CRM_DB_PATH=/data/crm.db python3 backend/mcp/server.py +``` + +Register it with the Agent SDK / Claude Code as an stdio MCP server. 
It exposes reads, the three retrieval modes, and logged writes — **no outbound/contact tools** (Phase 3 gate). For Phase 0 there are no live agents; this is for testing and the internal-only Analyst work later. + +## Step 6 — Incremental sync (Workstream B4) + +The full backfill is one-shot. Keeping the index fresh as the CRM changes (new grid edits, new emails) needs an incremental, idempotent sync on a schedule. `backend/ingest/sync.py` is that sync: `python3 backend/ingest/sync.py --db /data/crm.db` re-runs entity resolution and re-embeds only the rows changed since the last sync (the first run, or `--recreate`, is a full rebuild; `--fuzzy` also runs the Qwen fuzzy tier). Because the CRM hard-deletes today, run a periodic `--recreate` to drop orphaned chunks. + +## Verification + +```sql +SELECT entity_kind, COUNT(*) FROM canonical_entities GROUP BY entity_kind; -- IDs built +SELECT COUNT(*) FROM entity_links; -- source rows linked +``` +```bash +curl -s "$QDRANT_URL/collections/crm_chunks" | python3 -c "import sys,json;print('points:', json.load(sys.stdin)['result']['points_count'])" +python3 backend/ingest/search.py "Fund III wire timeline" --mode hybrid # sanity query +``` + +## Open decision — packaging (how the init + MCP run on the box) + +The ingest scripts read `/data/crm.db` by file path, so they must run **where that file lives** — inside or beside the CRM container (the dev Mac cannot open the container's SQLite file directly). Options, to decide before go-live: + +- **A (recommended): same image.** Bundle `backend/ingest` + `backend/mcp` (+ `fastembed`, `mcp`) into the CRM container image; expose the init as a one-shot Start9 action and run the MCP server as a second daemon in the 0.4 `startos` manifest. The image is already Python 3.11 with the volume mounted. +- **B: sidecar container** on the box mounting the same `/data` volume. +- **C: co-located host** with a copy of `/data` and LAN access to the Sparks (involves copying the DB — least clean). + +Decision taken: this package implements Option A, except that the MCP server ships as a manually-run stdio server rather than a second daemon — see `start9/0.4/INGEST_PACKAGING.md`. + +## Sovereignty checkpoint + +Every step above runs on Ten31 infrastructure. 
Real records flow `crm.db → local Spark (bge-m3) → local Qdrant` and never reach Anthropic. The scripts print counts, not records. Keep it that way: don't paste query *results* over real data back into a Claude session (guardrail #9). diff --git a/start9/0.4/Dockerfile b/start9/0.4/Dockerfile index a020d6f..c67f5aa 100644 --- a/start9/0.4/Dockerfile +++ b/start9/0.4/Dockerfile @@ -31,14 +31,27 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* # ── Python dependencies ───────────────────────────────────────── -# Only one hard dep for now: `cryptography` is required by the Gmail -# integration's RS256 JWT signing (DWD bearer tokens). Everything else -# server.py needs is stdlib. -RUN pip install --no-cache-dir cryptography==42.0.5 +# `cryptography` is required by the Gmail integration's RS256 JWT signing +# (DWD bearer tokens). The two Phase-0 deps are runtime-only for the ingest +# pipeline + MCP server (the CRM web server itself still needs no new deps): +# * fastembed — client-side BM25 (Qdrant/bm25) for the sparse retrieval leg +# (backend/ingest/sparse.py auto-detects it). +# * mcp — MCP Python SDK, only needed to run backend/mcp/server.py. +# Everything else server.py needs is stdlib. +RUN pip install --no-cache-dir \ + cryptography==42.0.5 \ + fastembed==0.4.2 \ + mcp==1.2.0 # ── Application source ────────────────────────────────────────── COPY backend/server.py /app/backend/server.py COPY backend/email_integration /app/backend/email_integration +# Phase-0 substrate: ingest pipeline (entity resolution + backfill) and the +# CRM MCP server. Shipped alongside the web server so the one-time index build +# and the (manually-run) MCP server can execute on the box where /data/crm.db +# lives. See start9/0.4/INGEST_PACKAGING.md. 
+COPY backend/ingest /app/backend/ingest +COPY backend/mcp /app/backend/mcp COPY frontend /app/frontend # ── StartOS wrapper scripts ───────────────────────────────────── diff --git a/start9/0.4/INGEST_PACKAGING.md b/start9/0.4/INGEST_PACKAGING.md new file mode 100644 index 0000000..ee2a0c7 --- /dev/null +++ b/start9/0.4/INGEST_PACKAGING.md @@ -0,0 +1,142 @@ +# Phase-0 ingest packaging (StartOS 0.4) + +How the Phase-0 data substrate — the ingest pipeline (`backend/ingest/`) and the +CRM MCP server (`backend/mcp/`) — ships and runs on the live StartOS 0.4 package, +**without changing the CRM web server**. This implements **Option A** ("same +image") from `docs/go-live-runbook.md` §"Open decision — packaging". + +The CRM web server (`backend/server.py`) is untouched and gains no new +dependencies. The `primary` daemon and its `checkPortListening` health check are +unchanged. + +## What changed + +| File | Change | +| --- | --- | +| `Dockerfile` | `COPY backend/ingest` and `COPY backend/mcp` into the image alongside `backend/server.py`. Added two runtime deps to the existing `pip install`: `fastembed==0.4.2` (client-side BM25 / `Qdrant/bm25` for the sparse retrieval leg) and `mcp==1.2.0` (MCP Python SDK, only for `backend/mcp/server.py`). | +| `docker_entrypoint.sh` | Added an export block for the ingest/retrieval env: `CRM_DB_PATH`, `SPARK_CONTROL_URL`, `SPARK_CONTROL_VERIFY_TLS`, `QDRANT_URL`, with LAN-default placeholder values and an operator comment. The CRM web server ignores these; they exist so manual `python3 /app/backend/ingest/...` and `backend/mcp/server.py` runs on the box inherit them. | +| `startos/actions/buildSearchIndex.ts` | **New.** A one-shot "Build search index" StartOS action (Steps 3–4 of the runbook). | +| `startos/actions/index.ts` | Registered the new action: `sdk.Actions.of().addAction(buildSearchIndex)`. 
| +| `startos/versions/v0.1.0.44.ts` + `versions/index.ts` | New version `0.1.0:44` (image-only change, no data migration) set as `current`; `0.1.0:43` moved to `other`. | +| `startos/utils.ts` | Bumped the informational `PACKAGE_VERSION` constant to `0.1.0:44`. | + +### Action registration mechanism (verified) + +Actions are collected in `startos/actions/index.ts` as +`export const actions = sdk.Actions.of().addAction(...)`, and that `actions` +object is passed into `sdk.setupInit(...)` in `startos/init/index.ts` (and +re-exported from `startos/index.ts`). Adding `.addAction(buildSearchIndex)` is +the entire registration — no manifest entry is required for actions in the 0.4 +SDK. + +## How the operator triggers the index build + +1. Build/sideload the new `.s9pk` (version `0.1.0:44`). StartOS preserves + `/data`, so live data is undisturbed. On first boot the CRM's own migration + runner creates the Phase-0 tables (see runbook Step 1) — that is independent + of this packaging change. +2. In the StartOS UI, open the **Ten31 Database** service → **Actions** → + **Build search index**, and run it. It: + - runs `entity_resolution.py --db /data/crm.db` (canonical ids + links), then + - runs `backfill.py --db /data/crm.db --recreate` (chunk → dense via Spark + Control + BM25 → upsert to Qdrant `crm_chunks`). + Both steps are idempotent and read-only on the CRM source tables, so the + action is safe to re-run any time to refresh the index. A full re-embed is + ~8–15 min (the action allows up to 30 min before timing out). + +The action runs in its **own subcontainer** with the same `main` volume mounted +at `/data`, with `cwd=/app/backend/ingest` (the ingest scripts import their +siblings by bare name, e.g. `import config`, so they must run from that +directory). It uses `allowedStatuses: 'any'` — SQLite WAL mode makes a +concurrently-running CRM safe for these reads/derived writes. 
+ +## Env / config the operator must set (Spark URLs) + +The ingest run reaches out to **Spark Control** (dense embeddings) and **Qdrant** +(upserts). Those endpoints are LAN-specific, so they are defined in **two +places** that the operator must point at their network. The current values are +the Ten31 LAN defaults: + +| Variable | Default | Used by | +| --- | --- | --- | +| `SPARK_CONTROL_URL` | `https://192.168.1.72:62419` | dense embeds (`/v1/embeddings`) | +| `SPARK_CONTROL_VERIFY_TLS` | `false` (Spark Control uses a self-signed cert) | TLS verification toggle | +| `QDRANT_URL` | `http://192.168.1.87:6333` | Qdrant collection admin + upserts | +| `CRM_DB_PATH` | `/data/crm.db` | both scripts + MCP server (already correct) | + +Where to set them: + +- **`docker_entrypoint.sh`** — for manual `python3` / MCP runs via the running + container. Edit the `${VAR:-default}` block, or override via the StartOS + service environment. +- **`startos/actions/buildSearchIndex.ts`** (`ingestEnv`) — for the "Build search + index" action, which runs in its own subcontainer and does **not** execute the + entrypoint, so it carries its own copy of the values. Edit these to match. + +> Keep the two copies in sync. They are duplicated because the action's +> subcontainer never runs `docker_entrypoint.sh`; there is no shared config +> store wired into this package today (see "Still needed" below). + +Verify reachability from the box before running the action: +`curl -sk $SPARK_CONTROL_URL/api/endpoints` and +`curl -s $QDRANT_URL/collections`. + +## MCP server: decision and how to run it + +**Decision: the MCP server is NOT a daemon in this release — it is shipped in the +image and run manually.** Rationale: + +- `backend/mcp/server.py` is an **stdio** MCP server (`mcp.run()` with FastMCP): + it has no network port to bind, so the StartOS daemon model (a long-running + process with a `checkPortListening` health check, like `primary`) does not fit + it. 
There is nothing to port-probe and no meaningful liveness signal. +- **Phase 0 has no live agents** (per `CLAUDE.md` and the runbook): nothing on + the box would connect to it. An always-on daemon would idle with no client on + its stdin and no health semantics. +- It exposes reads, the three retrieval modes, and logged writes — **no + outbound/contact tools** (Phase 3 compliance gate). It is for testing and + later internal-only Analyst work. + +To run it manually on the box (it is present at `/app/backend/mcp/server.py` with +`mcp` already installed): + +```sh +# from inside the running container +CRM_DB_PATH=/data/crm.db python3 /app/backend/mcp/server.py +``` + +Then register it with the Agent SDK / Claude Code as an stdio MCP server pointing +at that script (it inherits the Spark/Qdrant env exported by the entrypoint). + +If/when a live agent needs it as a persistent service, the cleanest upgrade is to +add it as a **second daemon** in `startos/main.ts` mirroring the `primary` +daemon — but only after giving it a network transport (e.g. an HTTP/SSE MCP +endpoint on its own port) so it has a real `checkPortListening` health check. +That is deliberately deferred to a later phase. + +## Still needed for a fully turn-key deploy + +- **MCP-as-a-service** — see above. Deferred until there is a live agent and a + network transport; today it is manual/stdio only. +- **Incremental sync (runbook Step 6 / Workstream B4)** — the action does a full + one-shot rebuild. Keeping the index fresh as the CRM changes needs an + incremental, idempotent sync on a schedule. Until that exists, re-running the + "Build search index" action is the refresh path. When built, it could be wired + as a recurring StartOS action/task rather than a manual re-run. +- **Single source of truth for Spark/Qdrant config** — currently duplicated in + `docker_entrypoint.sh` and `buildSearchIndex.ts`. 
A small StartOS config + store + input form (the SDK supports `Action.withInput` and a service config) + would let the operator set the endpoints once in the UI; deferred to keep this + change minimal and reviewable. +- **`.env` on the box** — `backend/ingest/config.py` also reads `/app/.env` if + present (via `os.environ.setdefault`, so it does not override the exported + env). Not required given the exported env above, but available as an + alternative if the operator prefers a file. + +## Constraints honored + +- No files under `backend/ingest/`, `backend/mcp/`, `backend/server.py`, + `backend/core_migrations.py`, `backend/migrations/`, or `data/` were modified; + only `start9/0.4/**` and this new doc. +- No build/deploy commands were run. `npx tsc --noEmit` was used only to verify + the new TypeScript compiles against the SDK types. diff --git a/start9/0.4/docker_entrypoint.sh b/start9/0.4/docker_entrypoint.sh index eefbbd1..3106701 100755 --- a/start9/0.4/docker_entrypoint.sh +++ b/start9/0.4/docker_entrypoint.sh @@ -57,5 +57,22 @@ else echo "[entrypoint] Gmail integration: DISABLED (no key at $GMAIL_SA_KEY)" fi +# ── Phase-0 ingest / retrieval env ────────────────────────────── +# These are consumed by the ingest pipeline (backend/ingest/) and the MCP +# server (backend/mcp/) — NOT by the CRM web server, which ignores them. +# They are exported here so any manual `python3 /app/backend/ingest/...` / +# `backend/mcp/server.py` run on the box inherits them. The "Build search +# index" StartOS action does NOT run this entrypoint; it passes its own copy. +# +# OPERATOR: the values below are LAN defaults for the Ten31 deployment. Set the +# real ones for your network — either by editing them here before building the +# image, or by overriding the env vars in the StartOS service environment. +# Point SPARK_CONTROL_URL at the Spark Control gateway (TLS, self-signed by +# default → SPARK_CONTROL_VERIFY_TLS=false) and QDRANT_URL at Qdrant on Spark 2.
+export CRM_DB_PATH="${CRM_DB_PATH:-$DATA_DIR/crm.db}" +export SPARK_CONTROL_URL="${SPARK_CONTROL_URL:-https://192.168.1.72:62419}" +export SPARK_CONTROL_VERIFY_TLS="${SPARK_CONTROL_VERIFY_TLS:-false}" +export QDRANT_URL="${QDRANT_URL:-http://192.168.1.87:6333}" + # ── Launch the app ────────────────────────────────────────────── exec python3 /app/backend/server.py diff --git a/start9/0.4/startos/actions/buildSearchIndex.ts b/start9/0.4/startos/actions/buildSearchIndex.ts new file mode 100644 index 0000000..5ee4256 --- /dev/null +++ b/start9/0.4/startos/actions/buildSearchIndex.ts @@ -0,0 +1,118 @@ +import { i18n } from '../i18n' +import { sdk } from '../sdk' +import { DATA_MOUNT_PATH, IMAGE_ID } from '../utils' + +/** + * One-shot "Build search index" action (Phase-0 ingest, go-live Steps 3–4). + * + * Runs the one-time init that turns the live CRM into the canonical-id layer + * and the Qdrant search index, on the box where /data/crm.db lives: + * + * 1. entity_resolution.py --db /data/crm.db (build canonical ids + links) + * 2. backfill.py --db /data/crm.db --recreate (chunk → dense+BM25 → Qdrant) + * + * Both steps are idempotent (deterministic ids), read-only on the CRM source + * tables, and log a row to interaction_log — so re-running is always safe. + * + * Implementation notes: + * - The scripts import their siblings by bare name (`import config`, etc.), + * so they must run with cwd = /app/backend/ingest. + * - backfill.py talks to Spark Control (dense embeds) and Qdrant (upserts), + * so the Spark/Qdrant env must be present. This action runs in its OWN + * subcontainer and does NOT go through docker_entrypoint.sh, so it cannot + * inherit the entrypoint's exports — the env is passed explicitly below. + * - allowedStatuses: 'any' — the action runs in its own subcontainer with the + * same /data volume mounted, so it works whether or not the CRM is running. + * SQLite WAL mode means a concurrently-running CRM is fine for these + * reads/derived writes. 
+ */ + +const DB_PATH = `${DATA_MOUNT_PATH}/crm.db` +const INGEST_DIR = '/app/backend/ingest' + +// OPERATOR: Spark Control + Qdrant endpoints for the ingest run. These are the +// LAN defaults for the Ten31 deployment — edit them for your network. Keep them +// in sync with the export block in docker_entrypoint.sh: there is no single +// source of truth yet, and this action needs its own copy because it does not +// run the entrypoint. Spark Control is TLS with a self-signed cert by default, +// hence SPARK_CONTROL_VERIFY_TLS = 'false'. +const ingestEnv: { [k: string]: string } = { + CRM_DB_PATH: DB_PATH, + SPARK_CONTROL_URL: 'https://192.168.1.72:62419', + SPARK_CONTROL_VERIFY_TLS: 'false', + QDRANT_URL: 'http://192.168.1.87:6333', +} + +export const buildSearchIndex = sdk.Action.withoutInput( + // id + 'build-search-index', + + // metadata + async ({ effects }) => ({ + name: i18n('Build search index'), + description: i18n( + 'One-time Phase-0 init: builds the canonical entity ids from your live ' + + 'CRM (entity_resolution.py), then chunks + embeds every record into ' + + 'the Qdrant search index (backfill.py --recreate). Both steps are ' + + 'idempotent and read-only on your CRM source tables. Requires Spark ' + + 'Control and Qdrant to be reachable (set SPARK_CONTROL_URL / ' + + 'QDRANT_URL). A full re-embed takes roughly 8–15 minutes.', + ), + warning: i18n( + 'Rebuilds the Qdrant `crm_chunks` collection (--recreate drops and ' + + 'recreates it).
The index is derived from the CRM and safe to rebuild; ' + + 'no CRM source data is modified.', + ), + allowedStatuses: 'any', + group: null, + visibility: 'enabled', + }), + + // execution + async ({ effects }) => { + const env = ingestEnv + + const subcontainer = await sdk.SubContainer.of( + effects, + { imageId: IMAGE_ID }, + sdk.Mounts.of().mountVolume({ + volumeId: 'main', + subpath: null, + mountpoint: DATA_MOUNT_PATH, + readonly: false, + }), + 'ten31-database-build-search-index', + ) + + try { + // Step 3 — canonical ids from the real data (fast, local-only). + await subcontainer.execFail( + ['python3', 'entity_resolution.py', '--db', DB_PATH], + { cwd: INGEST_DIR, env }, + // 10 minutes — pure SQLite work, but generous for a large corpus. + 10 * 60 * 1000, + ) + + // Step 4 — chunk → dense (Spark Control) + BM25 → Qdrant upsert. + await subcontainer.execFail( + ['python3', 'backfill.py', '--db', DB_PATH, '--recreate'], + { cwd: INGEST_DIR, env }, + // 30 minutes — a full re-embed is ~8–15 min; leave generous headroom. + 30 * 60 * 1000, + ) + } finally { + await subcontainer.destroy() + } + + return { + version: '1', + title: i18n('Search index built'), + message: i18n( + 'Canonical entity ids were resolved and the Qdrant `crm_chunks` ' + + 'collection was rebuilt from your live CRM. 
You can re-run this ' + + 'action any time to refresh the index.', + ), + result: null, + } + }, +) diff --git a/start9/0.4/startos/actions/index.ts b/start9/0.4/startos/actions/index.ts index c82cb95..366d892 100644 --- a/start9/0.4/startos/actions/index.ts +++ b/start9/0.4/startos/actions/index.ts @@ -1,3 +1,4 @@ import { sdk } from '../sdk' +import { buildSearchIndex } from './buildSearchIndex' -export const actions = sdk.Actions.of() +export const actions = sdk.Actions.of().addAction(buildSearchIndex) diff --git a/start9/0.4/startos/utils.ts b/start9/0.4/startos/utils.ts index 46673d4..c9edbc8 100644 --- a/start9/0.4/startos/utils.ts +++ b/start9/0.4/startos/utils.ts @@ -3,12 +3,14 @@ // from manifest/index.ts (id, title) and versions/ (version). export const PACKAGE_ID = 'ten-database' export const PACKAGE_TITLE = 'Ten31 Database' -// ExVer form of the current 0.4 wrapper release (upstream 0.1.0, wrapper rev 41). +// ExVer form of the current 0.4 wrapper release (upstream 0.1.0, wrapper rev 44). 
// * 0.3.5 wrapper: 0.1.0.38 (legacy, aarch64) // * First 0.4: 0.1.0:39 (shipped seed snapshot for migration) // * Cleanup: 0.1.0:40 (seed removed + multi-threaded server + abuser auto-ban) -// * Current: 0.1.0:41 (frontend persists auth across refreshes) -export const PACKAGE_VERSION = '0.1.0:41' +// * 0.1.0:41 (frontend persists auth across refreshes) +// * 0.1.0:42 (Gmail integration) / 0.1.0:43 (Gmail POST-body hotfix) +// * Current: 0.1.0:44 (Phase-0 ingest + MCP server in image; build-index action) +export const PACKAGE_VERSION = '0.1.0:44' export const DATA_MOUNT_PATH = '/data' export const WEB_PORT = 8080 diff --git a/start9/0.4/startos/versions/index.ts b/start9/0.4/startos/versions/index.ts index 38e4daf..f83e997 100644 --- a/start9/0.4/startos/versions/index.ts +++ b/start9/0.4/startos/versions/index.ts @@ -4,8 +4,9 @@ import { v_0_1_0_40 } from './v0.1.0.40' import { v_0_1_0_41 } from './v0.1.0.41' import { v_0_1_0_42 } from './v0.1.0.42' import { v_0_1_0_43 } from './v0.1.0.43' +import { v_0_1_0_44 } from './v0.1.0.44' export const versionGraph = VersionGraph.of({ - current: v_0_1_0_43, - other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42], + current: v_0_1_0_44, + other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42, v_0_1_0_43], }) diff --git a/start9/0.4/startos/versions/v0.1.0.44.ts b/start9/0.4/startos/versions/v0.1.0.44.ts new file mode 100644 index 0000000..e33089c --- /dev/null +++ b/start9/0.4/startos/versions/v0.1.0.44.ts @@ -0,0 +1,47 @@ +import { VersionInfo } from '@start9labs/start-sdk' + +// Phase-0 substrate packaging release. +// +// Context: +// * Ships the Phase-0 ingest pipeline (backend/ingest/) and the CRM MCP +// server (backend/mcp/) inside the existing CRM container image, alongside +// the web server. Two runtime deps are added to the image: `fastembed` +// (client-side BM25 for the sparse retrieval leg) and `mcp` (the MCP +// Python SDK, used only to run backend/mcp/server.py). 
The CRM web server +// itself gains no new dependencies and is unchanged. +// * Adds a one-shot "Build search index" StartOS action that runs the +// one-time init on the box where /data/crm.db lives: +// entity_resolution.py --db /data/crm.db (canonical ids) +// backfill.py --db /data/crm.db --recreate (Qdrant search index) +// Both steps are idempotent and read-only on the CRM source tables. +// * docker_entrypoint.sh now exports the Spark Control / Qdrant env +// (SPARK_CONTROL_URL, SPARK_CONTROL_VERIFY_TLS, QDRANT_URL) with LAN +// defaults so manual ingest / MCP runs on the box inherit them. +// +// The MCP server is intentionally NOT a daemon in this release: it is an +// stdio server with no port to bind and (in Phase 0) no live agent on the box +// to talk to it, so it is run manually for testing. See +// start9/0.4/INGEST_PACKAGING.md. +// +// No schema changes and no data migration: the SQLite schema is unchanged and +// the live /data volume is left exactly as-is. The new tables the ingest +// pipeline reads/writes are created by the CRM's own migration runner +// (migrations/0001_phase0_foundation.sql), independent of this package change. +export const v_0_1_0_44 = VersionInfo.of({ + version: '0.1.0:44', + releaseNotes: { + en_US: [ + 'Ships the Phase-0 data substrate inside the CRM image: the ingest', + 'pipeline (entity resolution + Qdrant backfill) and the CRM MCP server,', + 'plus the fastembed and mcp runtime dependencies. Adds a one-time', + '"Build search index" action that resolves canonical entity ids from', + 'your live CRM and rebuilds the Qdrant search index — both steps are', + 'idempotent and read-only on your CRM source data. The CRM web server', + 'is unchanged and gains no new dependencies. No data migration.', + ].join(' '), + }, + migrations: { + up: async () => {}, + down: async () => {}, + }, +})