Files
ten31-database/start9/0.4/startos/actions/buildSearchIndex.ts
T
Keysat f357c23c75 Phase 0 complete: fuzzy entity tier, incremental sync, Start9 packaging
- Fuzzy tier (backend/ingest/fuzzy_resolve.py + llm.py): local Qwen adjudicates
  the deterministic resolver's flagged name-variant candidates; merges are
  durable via entity_merges (deterministic re-runs respect them), losers
  soft-deleted, logged. Idempotent.
- Incremental sync (backend/ingest/sync.py): re-embeds only rows changed since a
  watermark (ingest_sync_state); first run / --recreate = full. Tested full→0→1.
- Start9 packaging (start9/0.4): Dockerfile bundles ingest+mcp + fastembed/mcp;
  "Build search index" action runs the init in a subcontainer; MCP shipped as a
  manual stdio server (not a daemon); version 0.1.0:44. INGEST_PACKAGING.md.
- backfill.py: factored embed_and_upsert() shared with sync.

Verified end-to-end on synthetic data + live Sparks/Qwen/Qdrant.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 08:55:12 -05:00

119 lines
4.4 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { i18n } from '../i18n'
import { sdk } from '../sdk'
import { DATA_MOUNT_PATH, IMAGE_ID } from '../utils'
/**
* One-shot "Build search index" action (Phase-0 ingest, go-live Steps 3–4).
*
* Runs the one-time init that turns the live CRM into the canonical-id layer
* and the Qdrant search index, on the box where /data/crm.db lives:
*
* 1. entity_resolution.py --db /data/crm.db (build canonical ids + links)
* 2. backfill.py --db /data/crm.db --recreate (chunk → dense+BM25 → Qdrant)
*
* Both steps are idempotent (deterministic ids), read-only on the CRM source
* tables, and log a row to interaction_log — so re-running is always safe.
*
* Implementation notes:
* - The scripts import their siblings by bare name (`import config`, etc.),
* so they must run with cwd = /app/backend/ingest.
* - backfill.py talks to Spark Control (dense embeds) and Qdrant (upserts),
* so the Spark/Qdrant env must be present. This action runs in its OWN
* subcontainer and does NOT go through docker_entrypoint.sh, so it cannot
* inherit the entrypoint's exports — the env is passed explicitly below.
* - allowedStatuses: 'any' — the action runs in its own subcontainer with the
* same /data volume mounted, so it works whether or not the CRM is running.
* SQLite WAL mode means a concurrently-running CRM is fine for these
* reads/derived writes.
*/
/** Location of the CRM SQLite database inside the mounted data volume. */
const DB_PATH = DATA_MOUNT_PATH + '/crm.db'

/** Directory that holds the ingest scripts. */
const INGEST_DIR = '/app/backend/ingest'

// OPERATOR: endpoints the ingest run uses to reach Spark Control and Qdrant.
// The values below are the LAN defaults of the Ten31 deployment; change them
// to match your own network. docker_entrypoint.sh exports the same settings
// and is the source of truth for them — this file carries a duplicate only
// because the action never executes the entrypoint, so update both together.
// Spark Control serves TLS with a self-signed certificate by default, which
// is why SPARK_CONTROL_VERIFY_TLS is 'false'.
const ingestEnv: Record<string, string> = {
  CRM_DB_PATH: DB_PATH,
  SPARK_CONTROL_URL: 'https://192.168.1.72:62419',
  SPARK_CONTROL_VERIFY_TLS: 'false',
  QDRANT_URL: 'http://192.168.1.87:6333',
}
/**
 * "Build search index" action (no user input).
 *
 * Creates a dedicated subcontainer from IMAGE_ID with the `main` volume
 * mounted read-write at DATA_MOUNT_PATH, then runs two Python scripts in
 * sequence from INGEST_DIR with the explicit `ingestEnv` environment:
 *
 *   1. `entity_resolution.py --db <DB_PATH>`
 *   2. `backfill.py --db <DB_PATH> --recreate`
 *
 * The subcontainer is always destroyed afterwards, including when a step
 * fails. `execFail` presumably rejects on a failed command (TODO confirm
 * against the SDK), in which case the second step is skipped and the error
 * propagates to the caller.
 *
 * @returns A version-'1' action result with a success title/message and no
 *          structured `result` payload.
 */
export const buildSearchIndex = sdk.Action.withoutInput(
  // id
  'build-search-index',

  // metadata
  async ({ effects }) => ({
    name: i18n('Build search index'),
    description: i18n(
      'One-time Phase-0 init: builds the canonical entity ids from your live ' +
        'CRM (entity_resolution.py), then chunks + embeds every record into ' +
        'the Qdrant search index (backfill.py --recreate). Both steps are ' +
        'idempotent and read-only on your CRM source tables. Requires Spark ' +
        'Control and Qdrant to be reachable (set SPARK_CONTROL_URL / ' +
        'QDRANT_URL). A full re-embed takes roughly 8–15 minutes.',
    ),
    warning: i18n(
      'Rebuilds the Qdrant `crm_chunks` collection (--recreate drops and ' +
        'recreates it). The index is derived from the CRM and safe to rebuild; ' +
        'no CRM source data is modified.',
    ),
    // Runnable regardless of whether the main service is started or stopped.
    allowedStatuses: 'any',
    group: null,
    visibility: 'enabled',
  }),

  // execution
  async ({ effects }) => {
    // Own subcontainer: the action does not go through the service entrypoint,
    // so the data volume and the environment are wired up explicitly here.
    const subcontainer = await sdk.SubContainer.of(
      effects,
      { imageId: IMAGE_ID },
      sdk.Mounts.of().mountVolume({
        volumeId: 'main',
        subpath: null,
        mountpoint: DATA_MOUNT_PATH,
        readonly: false,
      }),
      'ten31-database-build-search-index',
    )

    try {
      // Step 3 — canonical ids from the real data (fast, local-only).
      await subcontainer.execFail(
        ['python3', 'entity_resolution.py', '--db', DB_PATH],
        { cwd: INGEST_DIR, env: ingestEnv },
        // 10 minutes — pure SQLite work, but generous for a large corpus.
        10 * 60 * 1000,
      )

      // Step 4 — chunk → dense (Spark Control) + BM25 → Qdrant upsert.
      await subcontainer.execFail(
        ['python3', 'backfill.py', '--db', DB_PATH, '--recreate'],
        { cwd: INGEST_DIR, env: ingestEnv },
        // 30 minutes — a full re-embed is ~8–15 min; leave generous headroom.
        30 * 60 * 1000,
      )
    } finally {
      // Tear the subcontainer down on success and on failure alike.
      await subcontainer.destroy()
    }

    return {
      version: '1',
      title: i18n('Search index built'),
      message: i18n(
        'Canonical entity ids were resolved and the Qdrant `crm_chunks` ' +
          'collection was rebuilt from your live CRM. You can re-run this ' +
          'action any time to refresh the index.',
      ),
      result: null,
    }
  },
)