import { i18n } from '../i18n'
import { sdk } from '../sdk'
import { DATA_MOUNT_PATH, IMAGE_ID } from '../utils'

/**
 * One-shot "Build search index" action (Phase-0 ingest, go-live Steps 3–4).
 *
 * Runs the one-time init that turns the live CRM into the canonical-id layer
 * and the Qdrant search index, on the box where /data/crm.db lives:
 *
 *   1. entity_resolution.py --db /data/crm.db   (build canonical ids + links)
 *   2. backfill.py --db /data/crm.db --recreate (chunk → dense+BM25 → Qdrant)
 *
 * Both steps are idempotent (deterministic ids), read-only on the CRM source
 * tables, and log a row to interaction_log — so re-running is always safe.
 *
 * Implementation notes:
 *   - The scripts import their siblings by bare name (`import config`, etc.),
 *     so they must run with cwd = /app/backend/ingest.
 *   - backfill.py talks to Spark Control (dense embeds) and Qdrant (upserts),
 *     so the Spark/Qdrant env must be present. This action runs in its OWN
 *     subcontainer and does NOT go through docker_entrypoint.sh, so it cannot
 *     inherit the entrypoint's exports — the env is passed explicitly below.
 *   - allowedStatuses: 'any' — the action runs in its own subcontainer with the
 *     same /data volume mounted, so it works whether or not the CRM is running.
 *     SQLite WAL mode means a concurrently-running CRM is fine for these
 *     reads/derived writes.
 */

/** Location of the CRM database file (`crm.db`) under the data mount path. */
const DB_PATH = `${DATA_MOUNT_PATH}/crm.db`

/**
 * Directory holding the ingest scripts. It is used as the working directory
 * for both script runs because the scripts import their siblings by bare name.
 */
const INGEST_DIR = '/app/backend/ingest'

// OPERATOR: Spark Control + Qdrant endpoints for the ingest run. These are the
// LAN defaults for the Ten31 deployment — edit them for your network. Keep them
// in sync with the export block in docker_entrypoint.sh, which is the source of
// truth for the values; this action needs its own copy because it does not run
// the entrypoint. Spark Control is TLS with a self-signed cert by default, hence
// SPARK_CONTROL_VERIFY_TLS = 'false'.
// Environment passed explicitly to every ingest script run by the action below.
const ingestEnv: Record<string, string> = {
  CRM_DB_PATH: DB_PATH,
  SPARK_CONTROL_URL: 'https://192.168.1.72:62419',
  SPARK_CONTROL_VERIFY_TLS: 'false',
  QDRANT_URL: 'http://192.168.1.87:6333',
}

/**
 * "Build search index" action.
 *
 * Creates a dedicated subcontainer with the `main` volume mounted read-write
 * at the data mount path, then runs the two ingest scripts one after the other
 * from the ingest directory with the explicit ingest environment:
 *
 *   1. `entity_resolution.py --db <db>`
 *   2. `backfill.py --db <db> --recreate`
 *
 * Each script is run with `execFail`, so a failing first script prevents the
 * second from starting. The subcontainer is destroyed whether or not the
 * scripts succeed. On success a confirmation message is returned.
 */
export const buildSearchIndex = sdk.Action.withoutInput(
  // id
  'build-search-index',

  // metadata
  async ({ effects }) => ({
    name: i18n('Build search index'),
    description: i18n(
      'One-time Phase-0 init: builds the canonical entity ids from your live ' +
        'CRM (entity_resolution.py), then chunks + embeds every record into ' +
        'the Qdrant search index (backfill.py --recreate). Both steps are ' +
        'idempotent and read-only on your CRM source tables. Requires Spark ' +
        'Control and Qdrant to be reachable (set SPARK_CONTROL_URL / ' +
        'QDRANT_URL). A full re-embed takes roughly 8–15 minutes.',
    ),
    warning: i18n(
      'Rebuilds the Qdrant `crm_chunks` collection (--recreate drops and ' +
        'recreates it). The index is derived from the CRM and safe to rebuild; ' +
        'no CRM source data is modified.',
    ),
    allowedStatuses: 'any',
    group: null,
    visibility: 'enabled',
  }),

  // execution
  async ({ effects }) => {
    const minuteMs = 60 * 1000

    // Ordered ingest pipeline: each entry is one script run with its own
    // time budget.
    const pipeline: { argv: string[]; timeoutMs: number }[] = [
      {
        // Step 3 — canonical ids from the real data (fast, local-only).
        // 10 minutes — pure SQLite work, but generous for a large corpus.
        argv: ['python3', 'entity_resolution.py', '--db', DB_PATH],
        timeoutMs: 10 * minuteMs,
      },
      {
        // Step 4 — chunk → dense (Spark Control) + BM25 → Qdrant upsert.
        // 30 minutes — a full re-embed is ~8–15 min; leave generous headroom.
        argv: ['python3', 'backfill.py', '--db', DB_PATH, '--recreate'],
        timeoutMs: 30 * minuteMs,
      },
    ]

    const dataMounts = sdk.Mounts.of().mountVolume({
      volumeId: 'main',
      subpath: null,
      mountpoint: DATA_MOUNT_PATH,
      readonly: false,
    })

    const worker = await sdk.SubContainer.of(
      effects,
      { imageId: IMAGE_ID },
      dataMounts,
      'ten31-database-build-search-index',
    )

    try {
      // Strictly sequential: execFail throws on failure, which skips the
      // remaining steps and falls through to the cleanup below.
      for (const { argv, timeoutMs } of pipeline) {
        await worker.execFail(
          argv,
          { cwd: INGEST_DIR, env: ingestEnv },
          timeoutMs,
        )
      }
    } finally {
      await worker.destroy()
    }

    return {
      version: '1',
      title: i18n('Search index built'),
      message: i18n(
        'Canonical entity ids were resolved and the Qdrant `crm_chunks` ' +
          'collection was rebuilt from your live CRM. You can re-run this ' +
          'action any time to refresh the index.',
      ),
      result: null,
    }
  },
)