import { i18n } from '../i18n'
import { sdk } from '../sdk'
import { DATA_MOUNT_PATH, IMAGE_ID } from '../utils'

/**
 * "Resolve duplicate names" — the manually triggered action for the fuzzy
 * tier of Phase-0 entity resolution.
 *
 * It executes, on the box where /data/crm.db lives:
 *
 *   fuzzy_resolve.py --db /data/crm.db   (local-Qwen merge of name-variant dupes)
 *
 * Background: the deterministic tier (entity_resolution.py, run by "Build
 * search index") merges exact/normalized matches and only FLAGS the harder
 * name-variant pairs it refuses to merge on a guess (e.g. "Kate" /
 * "Katherine"). This action is the second tier: the local Qwen model (reached
 * through Spark Control) decides which flagged pairs are really the same LP
 * and merges those into one canonical id. The script is idempotent, read-only
 * on the CRM source tables, and logs a row to interaction_log, so running it
 * again is always safe.
 *
 * Implementation notes:
 *   - The ingest scripts import their siblings by bare name (`import config`,
 *     etc.), which is why the command runs with cwd = /app/backend/ingest.
 *   - fuzzy_resolve.py calls Spark Control (the Qwen chat completion that
 *     adjudicates each candidate pair), so the Spark/Qdrant env has to be
 *     present. The action runs in its OWN subcontainer and does NOT pass
 *     through docker_entrypoint.sh, so none of the entrypoint's exports are
 *     inherited — the env is handed over explicitly below. If Spark Control is
 *     unreachable the Qwen call, and therefore the run, fails.
 *   - allowedStatuses: 'any' — because the action has its own subcontainer
 *     with the same /data volume mounted, it works whether or not the CRM is
 *     running. SQLite WAL mode makes a concurrently running CRM fine for these
 *     reads/derived writes.
 */

/** SQLite database the resolver operates on, inside the mounted data volume. */
const DB_PATH = `${DATA_MOUNT_PATH}/crm.db`

/** Working directory for the ingest scripts (they import siblings by bare name). */
const INGEST_DIR = '/app/backend/ingest'

/**
 * Upper bound for one resolver run: 30 minutes. Each flagged pair costs one
 * Qwen call, so a large backlog of candidates needs generous headroom.
 */
const FUZZY_RESOLVE_TIMEOUT_MS = 30 * 60 * 1000

// OPERATOR: Spark Control + Qdrant endpoints used by the ingest run. The values
// below are the LAN defaults for the Ten31 deployment — change them to match
// your network, and keep them in sync with the export block in
// docker_entrypoint.sh (the single source of truth for the values; this action
// carries its own copy only because it never runs the entrypoint). Spark
// Control serves TLS with a self-signed cert by default, which is why
// SPARK_CONTROL_VERIFY_TLS is 'false'.
const ingestEnv: Record<string, string> = {
  CRM_DB_PATH: DB_PATH,
  SPARK_CONTROL_URL: 'https://192.168.1.72:62419',
  SPARK_CONTROL_VERIFY_TLS: 'false',
  QDRANT_URL: 'http://192.168.1.87:6333',
}

/**
 * Input-less action that runs the fuzzy resolver in a throwaway subcontainer
 * and reports completion. A failing script surfaces as a rejected
 * `execFail` call; the subcontainer is destroyed either way.
 */
export const resolveDuplicates = sdk.Action.withoutInput(
  // id
  'resolve-duplicate-names',

  // metadata
  async ({ effects }) => {
    return {
      name: i18n('Resolve duplicate names'),
      description: i18n(
        'Use the local Qwen model to merge name-variant duplicates the ' +
          'deterministic resolver flagged (e.g. Kate/Katherine). Idempotent; runs ' +
          'entirely on your infrastructure. Requires Spark Control to be reachable ' +
          '(for the Qwen call; set SPARK_CONTROL_URL).',
      ),
      warning: null,
      allowedStatuses: 'any',
      group: null,
      visibility: 'enabled',
    }
  },

  // execution
  async ({ effects }) => {
    // Dedicated subcontainer with the main volume mounted read-write at the
    // data mount path, so the script sees the same crm.db as the service.
    const dataMounts = sdk.Mounts.of().mountVolume({
      volumeId: 'main',
      subpath: null,
      mountpoint: DATA_MOUNT_PATH,
      readonly: false,
    })
    const resolver = await sdk.SubContainer.of(
      effects,
      { imageId: IMAGE_ID },
      dataMounts,
      'ten31-database-resolve-duplicate-names',
    )

    try {
      // Fuzzy entity-resolution tier: local Qwen (via Spark Control) judges
      // each flagged name-variant pair and merges true duplicates into one
      // canonical id (idempotent; no CRM source data is modified).
      const command = ['python3', 'fuzzy_resolve.py', '--db', DB_PATH]
      await resolver.execFail(
        command,
        { cwd: INGEST_DIR, env: ingestEnv },
        FUZZY_RESOLVE_TIMEOUT_MS,
      )
    } finally {
      // Always tear the subcontainer down, even when the script failed.
      await resolver.destroy()
    }

    const outcome = {
      version: '1',
      title: i18n('Duplicate names resolved'),
      message: i18n(
        'The local Qwen model reviewed the flagged name-variant pairs and ' +
          'merged true duplicates into one canonical id. You can re-run this ' +
          'action any time.',
      ),
      result: null,
    }
    return outcome
  },
)