Phase 0 complete: fuzzy entity tier, incremental sync, Start9 packaging
- Fuzzy tier (backend/ingest/fuzzy_resolve.py + llm.py): local Qwen adjudicates the deterministic resolver's flagged name-variant candidates; merges are durable via entity_merges (deterministic re-runs respect them), losers soft-deleted, logged. Idempotent.
- Incremental sync (backend/ingest/sync.py): re-embeds only rows changed since a watermark (ingest_sync_state); first run / --recreate = full. Tested full→0→1.
- Start9 packaging (start9/0.4): Dockerfile bundles ingest+mcp + fastembed/mcp; "Build search index" action runs the init in a subcontainer; MCP shipped as a manual stdio server (not a daemon); version 0.1.0:44. INGEST_PACKAGING.md.
- backfill.py: factored embed_and_upsert() shared with sync.

Verified end-to-end on synthetic data + live Sparks/Qwen/Qdrant.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,118 @@
|
||||
import { i18n } from '../i18n'
|
||||
import { sdk } from '../sdk'
|
||||
import { DATA_MOUNT_PATH, IMAGE_ID } from '../utils'
|
||||
|
||||
/**
|
||||
* One-shot "Build search index" action (Phase-0 ingest, go-live Steps 3–4).
|
||||
*
|
||||
* Runs the one-time init that turns the live CRM into the canonical-id layer
|
||||
* and the Qdrant search index, on the box where /data/crm.db lives:
|
||||
*
|
||||
* 1. entity_resolution.py --db /data/crm.db (build canonical ids + links)
|
||||
* 2. backfill.py --db /data/crm.db --recreate (chunk → dense+BM25 → Qdrant)
|
||||
*
|
||||
* Both steps are idempotent (deterministic ids), read-only on the CRM source
|
||||
* tables, and log a row to interaction_log — so re-running is always safe.
|
||||
*
|
||||
* Implementation notes:
|
||||
* - The scripts import their siblings by bare name (`import config`, etc.),
|
||||
* so they must run with cwd = /app/backend/ingest.
|
||||
* - backfill.py talks to Spark Control (dense embeds) and Qdrant (upserts),
|
||||
* so the Spark/Qdrant env must be present. This action runs in its OWN
|
||||
* subcontainer and does NOT go through docker_entrypoint.sh, so it cannot
|
||||
* inherit the entrypoint's exports — the env is passed explicitly below.
|
||||
* - allowedStatuses: 'any' — the action runs in its own subcontainer with the
|
||||
* same /data volume mounted, so it works whether or not the CRM is running.
|
||||
* SQLite WAL mode means a concurrently-running CRM is fine for these
|
||||
* reads/derived writes.
|
||||
*/
|
||||
|
||||
/** Location of the CRM database file inside the mounted data volume. */
const DB_PATH = DATA_MOUNT_PATH + '/crm.db'

/**
 * Working directory for the ingest scripts. They import their siblings by
 * bare name, so every invocation has to run with this as its cwd.
 */
const INGEST_DIR = '/app/backend/ingest'
|
||||
|
||||
/**
 * Environment handed explicitly to every ingest command run by this action.
 *
 * OPERATOR: the Spark Control and Qdrant endpoints below are the LAN defaults
 * for the Ten31 deployment; change them to match your network. The same
 * values are exported by docker_entrypoint.sh, which is the source of truth —
 * this copy exists only because the action does not run the entrypoint, so
 * keep the two in sync.
 *
 * Spark Control is served over TLS with a self-signed certificate by default,
 * which is why SPARK_CONTROL_VERIFY_TLS is 'false'.
 */
const ingestEnv: Record<string, string> = {
  CRM_DB_PATH: DB_PATH,
  SPARK_CONTROL_URL: 'https://192.168.1.72:62419',
  SPARK_CONTROL_VERIFY_TLS: 'false',
  QDRANT_URL: 'http://192.168.1.87:6333',
}
|
||||
|
||||
/**
 * "Build search index" action (no input).
 *
 * Spins up a dedicated subcontainer with the `main` volume mounted read-write
 * at DATA_MOUNT_PATH, then runs the two ingest scripts in order from
 * INGEST_DIR with the explicit ingest environment:
 *
 *   1. entity_resolution.py --db <DB_PATH>
 *   2. backfill.py --db <DB_PATH> --recreate
 *
 * `execFail` is used for both, so a failing step aborts the action. The
 * subcontainer is destroyed whether the steps succeed or throw.
 */
export const buildSearchIndex = sdk.Action.withoutInput(
  // id
  'build-search-index',

  // metadata
  async ({ effects }) => {
    const name = i18n('Build search index')

    const description = i18n(
      'One-time Phase-0 init: builds the canonical entity ids from your live ' +
        'CRM (entity_resolution.py), then chunks + embeds every record into ' +
        'the Qdrant search index (backfill.py --recreate). Both steps are ' +
        'idempotent and read-only on your CRM source tables. Requires Spark ' +
        'Control and Qdrant to be reachable (set SPARK_CONTROL_URL / ' +
        'QDRANT_URL). A full re-embed takes roughly 8–15 minutes.',
    )

    const warning = i18n(
      'Rebuilds the Qdrant `crm_chunks` collection (--recreate drops and ' +
        'recreates it). The index is derived from the CRM and safe to rebuild; ' +
        'no CRM source data is modified.',
    )

    return {
      name,
      description,
      warning,
      // Runs in its own subcontainer, so the service state does not matter.
      allowedStatuses: 'any',
      group: null,
      visibility: 'enabled',
    }
  },

  // execution
  async ({ effects }) => {
    const MINUTE_MS = 60 * 1000

    // Ordered ingest pipeline: command line plus the time budget for each step.
    const steps = [
      {
        // Step 3 — canonical ids from the real data (fast, local-only).
        // Pure SQLite work, but the budget is generous for a large corpus.
        argv: ['python3', 'entity_resolution.py', '--db', DB_PATH],
        timeoutMs: 10 * MINUTE_MS,
      },
      {
        // Step 4 — chunk → dense (Spark Control) + BM25 → Qdrant upsert.
        // A full re-embed is ~8–15 min; leave generous headroom.
        argv: ['python3', 'backfill.py', '--db', DB_PATH, '--recreate'],
        timeoutMs: 30 * MINUTE_MS,
      },
    ]

    const dataVolume = sdk.Mounts.of().mountVolume({
      volumeId: 'main',
      subpath: null,
      mountpoint: DATA_MOUNT_PATH,
      readonly: false,
    })

    const sandbox = await sdk.SubContainer.of(
      effects,
      { imageId: IMAGE_ID },
      dataVolume,
      'ten31-database-build-search-index',
    )

    try {
      for (const { argv, timeoutMs } of steps) {
        await sandbox.execFail(
          argv,
          { cwd: INGEST_DIR, env: ingestEnv },
          timeoutMs,
        )
      }
    } finally {
      // Always tear the subcontainer down, even when a step failed.
      await sandbox.destroy()
    }

    return {
      version: '1',
      title: i18n('Search index built'),
      message: i18n(
        'Canonical entity ids were resolved and the Qdrant `crm_chunks` ' +
          'collection was rebuilt from your live CRM. You can re-run this ' +
          'action any time to refresh the index.',
      ),
      result: null,
    }
  },
)
|
||||
@@ -1,3 +1,4 @@
|
||||
import { sdk } from '../sdk'
|
||||
import { buildSearchIndex } from './buildSearchIndex'
|
||||
|
||||
/**
 * Action registry for the package. Registers the "Build search index" action.
 *
 * Declared exactly once: a second `export const actions` in the same module
 * is an illegal redeclaration of a block-scoped binding.
 */
export const actions = sdk.Actions.of().addAction(buildSearchIndex)
|
||||
|
||||
Reference in New Issue
Block a user