Phase 0 complete: fuzzy entity tier, incremental sync, Start9 packaging

- Fuzzy tier (backend/ingest/fuzzy_resolve.py + llm.py): local Qwen adjudicates
  the deterministic resolver's flagged name-variant candidates; merges are
  durable via entity_merges (deterministic re-runs respect them), losers
  soft-deleted, logged. Idempotent.
- Incremental sync (backend/ingest/sync.py): re-embeds only rows changed since a
  watermark (ingest_sync_state); first run / --recreate = full. Tested full→0→1.
- Start9 packaging (start9/0.4): Dockerfile bundles ingest+mcp + fastembed/mcp;
  "Build search index" action runs the init in a subcontainer; MCP shipped as a
  manual stdio server (not a daemon); version 0.1.0:44. INGEST_PACKAGING.md.
- backfill.py: factored embed_and_upsert() shared with sync.

Verified end-to-end on synthetic data + live Sparks/Qwen/Qdrant.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Keysat
2026-06-05 08:55:12 -05:00
parent c7ce44d963
commit f357c23c75
16 changed files with 808 additions and 48 deletions
@@ -0,0 +1,118 @@
import { i18n } from '../i18n'
import { sdk } from '../sdk'
import { DATA_MOUNT_PATH, IMAGE_ID } from '../utils'
/**
* One-shot "Build search index" action (Phase-0 ingest, go-live Steps 3–4).
*
* Runs the one-time init that turns the live CRM into the canonical-id layer
* and the Qdrant search index, on the box where /data/crm.db lives:
*
* 1. entity_resolution.py --db /data/crm.db (build canonical ids + links)
* 2. backfill.py --db /data/crm.db --recreate (chunk → dense+BM25 → Qdrant)
*
* Both steps are idempotent (deterministic ids), read-only on the CRM source
* tables, and log a row to interaction_log — so re-running is always safe.
*
* Implementation notes:
* - The scripts import their siblings by bare name (`import config`, etc.),
* so they must run with cwd = /app/backend/ingest.
* - backfill.py talks to Spark Control (dense embeds) and Qdrant (upserts),
* so the Spark/Qdrant env must be present. This action runs in its OWN
* subcontainer and does NOT go through docker_entrypoint.sh, so it cannot
* inherit the entrypoint's exports — the env is passed explicitly below.
* - allowedStatuses: 'any' — the action runs in its own subcontainer with the
* same /data volume mounted, so it works whether or not the CRM is running.
* SQLite WAL mode means a concurrently-running CRM is fine for these
* reads/derived writes.
*/
// Location of the CRM SQLite database inside the mounted data volume.
const DB_PATH = DATA_MOUNT_PATH + '/crm.db'

// Working directory for the ingest scripts (they import siblings by bare name).
const INGEST_DIR = '/app/backend/ingest'
// OPERATOR: environment handed to the ingest scripts (Spark Control + Qdrant
// endpoints). The values below are the LAN defaults for the Ten31 deployment;
// change them to match your network and keep them identical to the export
// block in docker_entrypoint.sh. That script stays the source of truth for the
// values, but this action does not run the entrypoint and therefore carries
// its own copy. Spark Control serves TLS with a self-signed certificate by
// default, which is why SPARK_CONTROL_VERIFY_TLS is 'false'.
const ingestEnv: Record<string, string> = {
  CRM_DB_PATH: DB_PATH,
  SPARK_CONTROL_URL: 'https://192.168.1.72:62419',
  SPARK_CONTROL_VERIFY_TLS: 'false',
  QDRANT_URL: 'http://192.168.1.87:6333',
}
// Limits passed as the third argument to execFail, expressed in milliseconds.
// Entity resolution is pure SQLite work; 10 minutes is generous for a large corpus.
const ENTITY_RESOLUTION_TIMEOUT_MS = 10 * 60 * 1000
// A full re-embed is ~8–15 min; 30 minutes leaves generous headroom.
const BACKFILL_TIMEOUT_MS = 30 * 60 * 1000

/**
 * StartOS action `build-search-index`.
 *
 * Takes no input. When executed it creates a dedicated subcontainer from
 * IMAGE_ID with the `main` volume mounted read-write at DATA_MOUNT_PATH, then
 * runs two scripts in sequence from INGEST_DIR with the `ingestEnv` environment:
 *
 *   1. `python3 entity_resolution.py --db <DB_PATH>`
 *   2. `python3 backfill.py --db <DB_PATH> --recreate`
 *
 * The second step only starts once the first call has resolved. The
 * subcontainer is destroyed in a `finally` block, so it is cleaned up whether
 * the scripts succeed or not. On success a static confirmation result is
 * returned; any rejection from `execFail` propagates to the caller
 * (NOTE(review): execFail is presumed to reject on a non-zero exit — confirm
 * against the SDK).
 */
export const buildSearchIndex = sdk.Action.withoutInput(
  // id
  'build-search-index',

  // metadata
  async ({ effects }) => ({
    name: i18n('Build search index'),
    description: i18n(
      'One-time Phase-0 init: builds the canonical entity ids from your live ' +
        'CRM (entity_resolution.py), then chunks + embeds every record into ' +
        'the Qdrant search index (backfill.py --recreate). Both steps are ' +
        'idempotent and read-only on your CRM source tables. Requires Spark ' +
        'Control and Qdrant to be reachable (set SPARK_CONTROL_URL / ' +
        'QDRANT_URL). A full re-embed takes roughly 8–15 minutes.',
    ),
    warning: i18n(
      'Rebuilds the Qdrant `crm_chunks` collection (--recreate drops and ' +
        'recreates it). The index is derived from the CRM and safe to rebuild; ' +
        'no CRM source data is modified.',
    ),
    // Runnable regardless of whether the main service is up.
    allowedStatuses: 'any',
    group: null,
    visibility: 'enabled',
  }),

  // execution
  async ({ effects }) => {
    const subcontainer = await sdk.SubContainer.of(
      effects,
      { imageId: IMAGE_ID },
      sdk.Mounts.of().mountVolume({
        volumeId: 'main',
        subpath: null,
        mountpoint: DATA_MOUNT_PATH,
        readonly: false,
      }),
      'ten31-database-build-search-index',
    )

    try {
      // Step 3 — canonical ids from the real data (fast, local-only).
      await subcontainer.execFail(
        ['python3', 'entity_resolution.py', '--db', DB_PATH],
        { cwd: INGEST_DIR, env: ingestEnv },
        ENTITY_RESOLUTION_TIMEOUT_MS,
      )

      // Step 4 — chunk → dense (Spark Control) + BM25 → Qdrant upsert.
      await subcontainer.execFail(
        ['python3', 'backfill.py', '--db', DB_PATH, '--recreate'],
        { cwd: INGEST_DIR, env: ingestEnv },
        BACKFILL_TIMEOUT_MS,
      )
    } finally {
      // Always tear the subcontainer down, including when a step failed.
      await subcontainer.destroy()
    }

    return {
      version: '1',
      title: i18n('Search index built'),
      message: i18n(
        'Canonical entity ids were resolved and the Qdrant `crm_chunks` ' +
          'collection was rebuilt from your live CRM. You can re-run this ' +
          'action any time to refresh the index.',
      ),
      result: null,
    }
  },
)
+2 -1
View File
@@ -1,3 +1,4 @@
import { sdk } from '../sdk'
import { buildSearchIndex } from './buildSearchIndex'
export const actions = sdk.Actions.of()
export const actions = sdk.Actions.of().addAction(buildSearchIndex)
+5 -3
View File
@@ -3,12 +3,14 @@
// from manifest/index.ts (id, title) and versions/ (version).
export const PACKAGE_ID = 'ten-database'
export const PACKAGE_TITLE = 'Ten31 Database'
// ExVer form of the current 0.4 wrapper release (upstream 0.1.0, wrapper rev 41).
// ExVer form of the current 0.4 wrapper release (upstream 0.1.0, wrapper rev 44).
// * 0.3.5 wrapper: 0.1.0.38 (legacy, aarch64)
// * First 0.4: 0.1.0:39 (shipped seed snapshot for migration)
// * Cleanup: 0.1.0:40 (seed removed + multi-threaded server + abuser auto-ban)
// * Current: 0.1.0:41 (frontend persists auth across refreshes)
export const PACKAGE_VERSION = '0.1.0:41'
// * 0.1.0:41 (frontend persists auth across refreshes)
// * 0.1.0:42 (Gmail integration) / 0.1.0:43 (Gmail POST-body hotfix)
// * Current: 0.1.0:44 (Phase-0 ingest + MCP server in image; build-index action)
export const PACKAGE_VERSION = '0.1.0:44'
export const DATA_MOUNT_PATH = '/data'
export const WEB_PORT = 8080
+3 -2
View File
@@ -4,8 +4,9 @@ import { v_0_1_0_40 } from './v0.1.0.40'
import { v_0_1_0_41 } from './v0.1.0.41'
import { v_0_1_0_42 } from './v0.1.0.42'
import { v_0_1_0_43 } from './v0.1.0.43'
import { v_0_1_0_44 } from './v0.1.0.44'
export const versionGraph = VersionGraph.of({
current: v_0_1_0_43,
other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42],
current: v_0_1_0_44,
other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42, v_0_1_0_43],
})
+47
View File
@@ -0,0 +1,47 @@
import { VersionInfo } from '@start9labs/start-sdk'
// Phase-0 substrate packaging release.
//
// Context:
// * Ships the Phase-0 ingest pipeline (backend/ingest/) and the CRM MCP
// server (backend/mcp/) inside the existing CRM container image, alongside
// the web server. Two runtime deps are added to the image: `fastembed`
// (client-side BM25 for the sparse retrieval leg) and `mcp` (the MCP
// Python SDK, used only to run backend/mcp/server.py). The CRM web server
// itself gains no new dependencies and is unchanged.
// * Adds a one-shot "Build search index" StartOS action that runs the
// one-time init on the box where /data/crm.db lives:
// entity_resolution.py --db /data/crm.db (canonical ids)
// backfill.py --db /data/crm.db --recreate (Qdrant search index)
// Both steps are idempotent and read-only on the CRM source tables.
// * docker_entrypoint.sh now exports the Spark Control / Qdrant env
// (SPARK_CONTROL_URL, SPARK_CONTROL_VERIFY_TLS, QDRANT_URL) with LAN
// defaults so manual ingest / MCP runs on the box inherit them.
//
// The MCP server is intentionally NOT a daemon in this release: it is an
// stdio server with no port to bind and (in Phase 0) no live agent on the box
// to talk to it, so it is run manually for testing. See
// start9/0.4/INGEST_PACKAGING.md.
//
// No schema changes and no data migration: the SQLite schema is unchanged and
// the live /data volume is left exactly as-is. The new tables the ingest
// pipeline reads/writes are created by the CRM's own migration runner
// (migrations/0001_phase0_foundation.sql), independent of this package change.
// English release notes for 0.1.0:44, assembled into a single paragraph with
// one space between the source fragments.
const releaseNotesEn =
  'Ships the Phase-0 data substrate inside the CRM image: the ingest ' +
  'pipeline (entity resolution + Qdrant backfill) and the CRM MCP server, ' +
  'plus the fastembed and mcp runtime dependencies. Adds a one-time ' +
  '"Build search index" action that resolves canonical entity ids from ' +
  'your live CRM and rebuilds the Qdrant search index — both steps are ' +
  'idempotent and read-only on your CRM source data. The CRM web server ' +
  'is unchanged and gains no new dependencies. No data migration.'

/**
 * Version entry for wrapper release 0.1.0:44.
 *
 * Both migration hooks are intentionally empty: this release performs no work
 * when moving up to, or down from, this version.
 */
export const v_0_1_0_44 = VersionInfo.of({
  version: '0.1.0:44',
  releaseNotes: { en_US: releaseNotesEn },
  migrations: {
    // Nothing to migrate in either direction.
    up: async () => {},
    down: async () => {},
  },
})