// ten31-database/start9/0.4/startos/actions/resolveDuplicates.ts

import { i18n } from '../i18n'
import { sdk } from '../sdk'
import { DATA_MOUNT_PATH, IMAGE_ID } from '../utils'

/**
 * Manual "Resolve duplicate names" action (Phase-0 entity resolution, fuzzy tier).
 *
 * Runs the fuzzy entity-resolution tier on the box where /data/crm.db lives:
 *
 *   fuzzy_resolve.py --db /data/crm.db   (local-Qwen merge of name-variant dupes)
 *
 * The deterministic tier (entity_resolution.py, run by "Build search index")
 * merges exact/normalized matches and FLAGS the harder name-variant pairs it
 * won't merge on a guess (e.g. "Kate" / "Katherine"). This action runs the
 * second tier: it asks the local Qwen model (via Spark Control) to decide which
 * flagged pairs are truly the same LP and merges them into one canonical id.
 * It is idempotent, read-only on the CRM source tables, and logs a row to
 * interaction_log — so re-running is always safe.
 *
 * Implementation notes:
 *   - The scripts import their siblings by bare name (`import config`, etc.),
 *     so they must run with cwd = /app/backend/ingest.
 *   - fuzzy_resolve.py talks to Spark Control (the Qwen chat completion that
 *     adjudicates each candidate pair), so the Spark/Qdrant env must be present.
 *     This action runs in its OWN subcontainer and does NOT go through
 *     docker_entrypoint.sh, so it cannot inherit the entrypoint's exports — the
 *     env is passed explicitly below. Spark Control must be reachable for the
 *     Qwen call, or the run will fail.
 *   - allowedStatuses: 'any' — the action runs in its own subcontainer with the
 *     same /data volume mounted, so it works whether or not the CRM is running.
 *     SQLite WAL mode means a concurrently-running CRM is fine for these
 *     reads/derived writes.
 */

// Directory the ingest scripts are launched from. They import their siblings
// by bare module name, so this has to be the process cwd.
const INGEST_DIR = '/app/backend/ingest'

// Location of the CRM SQLite database on the mounted data volume.
const DB_PATH = `${DATA_MOUNT_PATH}/crm.db`

// OPERATOR: environment handed to the ingest run. The Spark Control and Qdrant
// endpoints below are the LAN defaults for the Ten31 deployment — change them to
// match your network. docker_entrypoint.sh's export block is the source of truth
// for these values; they are duplicated here only because this action does not
// run the entrypoint, so keep the two in sync.
const ingestEnv: Record<string, string> = {
  // Database the scripts operate on.
  CRM_DB_PATH: DB_PATH,
  // Spark Control endpoint (served over TLS).
  SPARK_CONTROL_URL: 'https://192.168.1.72:62419',
  // Spark Control uses a self-signed certificate by default, so certificate
  // verification is switched off.
  SPARK_CONTROL_VERIFY_TLS: 'false',
  // Qdrant endpoint.
  QDRANT_URL: 'http://192.168.1.87:6333',
}

/**
 * Input-less StartOS action, registered under the id `resolve-duplicate-names`.
 *
 * When triggered it creates a dedicated subcontainer from the service image,
 * mounts the `main` volume read-write at DATA_MOUNT_PATH, and runs
 * `python3 fuzzy_resolve.py --db <DB_PATH>` from INGEST_DIR with `ingestEnv` as
 * the process environment. The subcontainer is destroyed afterwards whether or
 * not the script succeeded. If the script run rejects, the rejection propagates
 * to the caller and no success result is returned.
 */
export const resolveDuplicates = sdk.Action.withoutInput(
  // id
  'resolve-duplicate-names',

  // metadata
  async ({ effects }) => {
    const actionName = i18n('Resolve duplicate names')
    const actionDescription = i18n(
      'Use the local Qwen model to merge name-variant duplicates the ' +
        'deterministic resolver flagged (e.g. Kate/Katherine). Idempotent; runs ' +
        'entirely on your infrastructure. Requires Spark Control to be reachable ' +
        '(for the Qwen call; set SPARK_CONTROL_URL).',
    )

    return {
      name: actionName,
      description: actionDescription,
      warning: null,
      // Offered regardless of whether the service is running or stopped.
      allowedStatuses: 'any',
      group: null,
      visibility: 'enabled',
    }
  },

  // execution
  async ({ effects }) => {
    // Half an hour: every flagged pair costs one Qwen call, so a large backlog
    // of candidates needs generous headroom.
    const resolveTimeoutMs = 30 * 60 * 1000

    // The script works on the database in the data volume, so the volume is
    // mounted writable at the same path used for DB_PATH.
    const dataMounts = sdk.Mounts.of().mountVolume({
      volumeId: 'main',
      subpath: null,
      mountpoint: DATA_MOUNT_PATH,
      readonly: false,
    })

    const runner = await sdk.SubContainer.of(
      effects,
      { imageId: IMAGE_ID },
      dataMounts,
      'ten31-database-resolve-duplicate-names',
    )

    try {
      // Fuzzy tier of entity resolution: fuzzy_resolve.py has the local Qwen
      // model (reached through Spark Control) judge each flagged name-variant
      // pair. The environment is supplied explicitly because this subcontainer
      // never runs docker_entrypoint.sh.
      await runner.execFail(
        ['python3', 'fuzzy_resolve.py', '--db', DB_PATH],
        { cwd: INGEST_DIR, env: ingestEnv },
        resolveTimeoutMs,
      )
    } finally {
      // Tear the subcontainer down on success and on failure alike.
      await runner.destroy()
    }

    const successTitle = i18n('Duplicate names resolved')
    const successMessage = i18n(
      'The local Qwen model reviewed the flagged name-variant pairs and ' +
        'merged true duplicates into one canonical id. You can re-run this ' +
        'action any time.',
    )

    return {
      version: '1',
      title: successTitle,
      message: successMessage,
      result: null,
    }
  },
)