Phase 0 go-live polish: hands-off incremental sync + refresh action

- backend/ingest/sync_scheduler.py: periodic incremental-sync loop (every
  CRM_INGEST_SYNC_INTERVAL_MIN min); resilient, --once for testing.
- start9/0.4: "Refresh search index" action (incremental sync.py); entrypoint
  launches the scheduler as a background process when Spark/Qdrant are set;
  CRM_INGEST_SYNC_INTERVAL_MIN env; pre-release note on fastembed/mcp pins.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Keysat
2026-06-05 09:36:06 -05:00
parent f357c23c75
commit 6be2e40f54
6 changed files with 298 additions and 19 deletions
+55
View File
@@ -0,0 +1,55 @@
#!/usr/bin/env python3
"""Hands-off periodic incremental-sync loop.
Runs `sync.run()` every CRM_INGEST_SYNC_INTERVAL_MIN minutes so the Qdrant index
tracks CRM changes without manual action. Mirrors the email-sync / backup
scheduler pattern already used in this codebase. Resilient: a failed cycle is
logged and the loop continues. Intended to be launched as a background process
by the StartOS docker_entrypoint.sh (only when Spark/Qdrant are configured).
python3 backend/ingest/sync_scheduler.py --db /data/crm.db
python3 backend/ingest/sync_scheduler.py --db data/crm_dev.db --once # one cycle (test)
"""
import argparse
import os
import sys
import time
import traceback
import config
import sync
def _log(msg):
    """Emit one scheduler log line on stderr and flush it straight away.

    The line is prefixed with ``[ingest-scheduler]`` and newline-terminated.
    Flushing after every write keeps the output current when stderr is
    redirected to a file.
    """
    line = f"[ingest-scheduler] {msg}\n"
    err = sys.stderr
    err.write(line)
    err.flush()
def loop(db, interval_min, fuzzy):
    """Run the incremental sync forever, sleeping between cycles.

    Args:
        db: database path, passed to ``sync.run`` unchanged.
        interval_min: requested minutes between cycles. Converted with
            ``int`` and floored at 60 seconds, so 0 or a negative value still
            yields one cycle per minute rather than a busy loop.
        fuzzy: forwarded as ``sync.run(..., fuzzy=fuzzy)``.

    Never returns normally. Any ``Exception`` raised inside a cycle (by
    ``sync.run`` or while formatting its summary) is logged with a traceback
    and the loop carries on with the next cycle.
    """
    requested_s = int(interval_min) * 60
    interval = max(60, requested_s)
    _log(f"started: every {interval_min} min on {db} (fuzzy={fuzzy})")
    if interval != requested_s:
        # The floor kicked in: say so, otherwise the "every N min" line above
        # misreports how often the sync actually runs.
        _log(f"interval {interval_min} min is below the 60 s floor; "
             f"sleeping {interval} s between cycles")
    while True:
        try:
            s = sync.run(db, fuzzy=fuzzy)
            _log(f"{s['mode']}: embedded {s['rows_embedded']} chunk(s); {s['qdrant_points']} points")
        except Exception as exc:
            # Deliberately broad: one bad cycle must not kill the scheduler.
            _log(f"sync FAILED (continuing): {exc}\n{traceback.format_exc()}")
        # Sleep after every cycle, successful or not.
        time.sleep(interval)
def _default_interval_min(fallback=60):
    """Return the CRM_INGEST_SYNC_INTERVAL_MIN env value as an int.

    An unset or blank variable yields ``fallback``. A non-integer value is
    reported via ``_log`` and also yields ``fallback``, so a typo in the
    service environment does not stop the scheduler before it has started.
    """
    raw = os.environ.get("CRM_INGEST_SYNC_INTERVAL_MIN", "")
    if not raw.strip():
        return fallback
    try:
        return int(raw)
    except ValueError:
        _log(f"ignoring invalid CRM_INGEST_SYNC_INTERVAL_MIN={raw!r}; using {fallback}")
        return fallback


def main():
    """Command-line entry point.

    Options:
        --db            database path (default: ``config.DEFAULT_DB``).
        --interval-min  minutes between cycles (default: the
                        CRM_INGEST_SYNC_INTERVAL_MIN env var, else 60).
        --fuzzy         forwarded to ``sync.run`` as ``fuzzy=True``.
        --once          run a single cycle, print its result, and exit.

    Without ``--once`` this hands over to ``loop`` and does not return.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=config.DEFAULT_DB)
    ap.add_argument("--interval-min", type=int,
                    default=_default_interval_min())
    ap.add_argument("--fuzzy", action="store_true", help="also run the local-Qwen fuzzy tier each cycle")
    ap.add_argument("--once", action="store_true", help="run a single cycle and exit (testing)")
    args = ap.parse_args()
    if args.once:
        # Single cycle for testing: errors propagate so the exit status
        # reflects failure, unlike the resilient loop.
        print(sync.run(args.db, fuzzy=args.fuzzy))
        return
    loop(args.db, args.interval_min, args.fuzzy)


if __name__ == "__main__":
    main()
+8
View File
@@ -38,6 +38,14 @@ RUN apt-get update \
# (backend/ingest/sparse.py auto-detects it). # (backend/ingest/sparse.py auto-detects it).
# * mcp — MCP Python SDK, only needed to run backend/mcp/server.py. # * mcp — MCP Python SDK, only needed to run backend/mcp/server.py.
# Everything else server.py needs is stdlib. # Everything else server.py needs is stdlib.
#
# ⚠️ PRE-RELEASE — VERIFY THESE PINS BUILD ON BOTH ARCHES BEFORE RELEASE ⚠️
# fastembed==0.4.2 and mcp==1.2.0 were pinned best-effort and have NOT been
# confirmed to build on both x86_64 AND aarch64 (StartOS runs arm64). fastembed
# in particular pulls onnxruntime (no wheel on some arches → source build) and
# downloads a model on first use. Build the image on aarch64 and run the ingest
# once to confirm before cutting a release. See INGEST_PACKAGING.md "Pre-release
# checks". Do not bump these without re-verifying on both arches.
RUN pip install --no-cache-dir \ RUN pip install --no-cache-dir \
cryptography==42.0.5 \ cryptography==42.0.5 \
fastembed==0.4.2 \ fastembed==0.4.2 \
+105 -18
View File
@@ -13,10 +13,11 @@ unchanged.
| File | Change | | File | Change |
| --- | --- | | --- | --- |
| `Dockerfile` | `COPY backend/ingest` and `COPY backend/mcp` into the image alongside `backend/server.py`. Added two runtime deps to the existing `pip install`: `fastembed==0.4.2` (client-side BM25 / `Qdrant/bm25` for the sparse retrieval leg) and `mcp==1.2.0` (MCP Python SDK, only for `backend/mcp/server.py`). | | `Dockerfile` | `COPY backend/ingest` and `COPY backend/mcp` into the image alongside `backend/server.py`. Added two runtime deps to the existing `pip install`: `fastembed==0.4.2` (client-side BM25 / `Qdrant/bm25` for the sparse retrieval leg) and `mcp==1.2.0` (MCP Python SDK, only for `backend/mcp/server.py`). **These two pins carry a pre-release multi-arch verification requirement — see "Pre-release checks" below.** |
| `docker_entrypoint.sh` | Added an export block for the ingest/retrieval env: `CRM_DB_PATH`, `SPARK_CONTROL_URL`, `SPARK_CONTROL_VERIFY_TLS`, `QDRANT_URL`, with LAN-default placeholder values and an operator comment. The CRM web server ignores these; they exist so manual `python3 /app/backend/ingest/...` and `backend/mcp/server.py` runs on the box inherit them. | | `docker_entrypoint.sh` | Added an export block for the ingest/retrieval env: `CRM_DB_PATH`, `SPARK_CONTROL_URL`, `SPARK_CONTROL_VERIFY_TLS`, `QDRANT_URL`, `CRM_INGEST_SYNC_INTERVAL_MIN`, with LAN-default placeholder values and an operator comment. The CRM web server ignores these; they exist so manual `python3 /app/backend/ingest/...` and `backend/mcp/server.py` runs on the box inherit them. Also launches the **background ingest sync scheduler** (`sync_scheduler.py`) before `exec`-ing the web server, guarded so it only starts when Spark Control + Qdrant are configured — see "Automatic scheduled refresh" below. |
| `startos/actions/buildSearchIndex.ts` | **New.** A one-shot "Build search index" StartOS action (Steps 3–4 of the runbook). | | `startos/actions/buildSearchIndex.ts` | **New.** A one-shot "Build search index" StartOS action (Steps 3–4 of the runbook) — full rebuild with `--recreate`. |
| `startos/actions/index.ts` | Registered the new action: `sdk.Actions.of().addAction(buildSearchIndex)`. | | `startos/actions/refreshSearchIndex.ts` | **New.** A manual "Refresh search index" action — incremental, idempotent `sync.py` (no `--recreate`); the manual counterpart to the background scheduler. |
| `startos/actions/index.ts` | Registered both actions: `sdk.Actions.of().addAction(buildSearchIndex).addAction(refreshSearchIndex)`. |
| `startos/versions/v0.1.0.44.ts` + `versions/index.ts` | New version `0.1.0:44` (image-only change, no data migration) set as `current`; `0.1.0:43` moved to `other`. | | `startos/versions/v0.1.0.44.ts` + `versions/index.ts` | New version `0.1.0:44` (image-only change, no data migration) set as `current`; `0.1.0:43` moved to `other`. |
| `startos/utils.ts` | Bumped the informational `PACKAGE_VERSION` constant to `0.1.0:44`. | | `startos/utils.ts` | Bumped the informational `PACKAGE_VERSION` constant to `0.1.0:44`. |
@@ -50,6 +51,63 @@ siblings by bare name, e.g. `import config`, so they must run from that
directory). It uses `allowedStatuses: 'any'` — SQLite WAL mode makes a directory). It uses `allowedStatuses: 'any'` — SQLite WAL mode makes a
concurrently-running CRM safe for these reads/derived writes. concurrently-running CRM safe for these reads/derived writes.
## Keeping the index fresh (hands-off refresh)
The "Build search index" action above is a full one-shot rebuild. To keep the
index current as the CRM changes, there are now two incremental paths — both run
`sync.py` (chunk → dense+BM25 → Qdrant upsert) for **changed records only**, with
NO `--recreate`, so they never drop the collection and are safe to run any time.
### Manual: "Refresh search index" action
`startos/actions/refreshSearchIndex.ts` adds a second StartOS action,
**Refresh search index** (id `refresh-search-index`). It mirrors
`buildSearchIndex.ts` exactly — same subcontainer, same `/data` mount, same
explicit `ingestEnv` — but runs `python3 sync.py --db /data/crm.db` (no
`--recreate`) with `cwd = /app/backend/ingest`. An incremental delta is usually
seconds to a few minutes (the action allows up to 30 min of headroom). Use it
for an on-demand refresh; use "Build search index" only for a full rebuild.
### Automatic: background sync scheduler
For hands-off freshness, `docker_entrypoint.sh` launches
`backend/ingest/sync_scheduler.py` as a **background process** just before it
`exec`s the web server. `sync_scheduler.py` loops the incremental sync every
`CRM_INGEST_SYNC_INTERVAL_MIN` minutes (default **60**, exported in the
entrypoint's env block with an operator comment). It logs to
`/data/ingest-sync.log`.
The launch is **guarded**: it only starts when both `SPARK_CONTROL_URL` and
`QDRANT_URL` are set (both are exported just above it, so the default LAN values
satisfy the guard; an operator who clears them to disable ingest also disables
the scheduler). The entrypoint prints `STARTED` or `SKIPPED (Spark/Qdrant not
configured)` so the choice is visible in the service logs.
#### Why a background process and not a StartOS daemon
The prior agent deliberately avoided adding the stdio MCP server as a daemon
because StartOS daemons are built around a network port + `checkPortListening`
health check, and a portless process has no liveness signal to probe (see "MCP
server" below). `sync_scheduler.py` is the same shape — a long-running loop with
no port — so adding it as a second daemon in `main.ts` would hit the same
mismatch.
Launching it as a child of the entrypoint sidesteps that entirely:
- **Pro:** no portless-daemon contortion; it shares the `primary` container's
`/data` and inherited env; the existing `primary` daemon and its
`checkPortListening` health check are untouched.
- **Con:** StartOS does not supervise it independently. If the scheduler dies it
is not auto-restarted on its own (the container as a whole is still
health-checked via the web server), and it has no separate status tile in the
UI. Crashes surface only in `/data/ingest-sync.log`. The manual "Refresh
search index" action is the always-available fallback.
If a future phase wants first-class supervision/visibility, promote it to a real
StartOS daemon — but, as with the MCP server, only after giving the work a
network transport (e.g. a tiny HTTP health endpoint) so it has a meaningful
`checkPortListening` probe.
## Env / config the operator must set (Spark URLs) ## Env / config the operator must set (Spark URLs)
The ingest run reaches out to **Spark Control** (dense embeddings) and **Qdrant** The ingest run reaches out to **Spark Control** (dense embeddings) and **Qdrant**
@@ -63,18 +121,22 @@ the Ten31 LAN defaults:
| `SPARK_CONTROL_VERIFY_TLS` | `false` (Spark Control uses a self-signed cert) | TLS verification toggle | | `SPARK_CONTROL_VERIFY_TLS` | `false` (Spark Control uses a self-signed cert) | TLS verification toggle |
| `QDRANT_URL` | `http://192.168.1.87:6333` | Qdrant collection admin + upserts | | `QDRANT_URL` | `http://192.168.1.87:6333` | Qdrant collection admin + upserts |
| `CRM_DB_PATH` | `/data/crm.db` | both scripts + MCP server (already correct) | | `CRM_DB_PATH` | `/data/crm.db` | both scripts + MCP server (already correct) |
| `CRM_INGEST_SYNC_INTERVAL_MIN` | `60` | background sync scheduler loop interval (entrypoint only) |
Where to set them: Where to set them:
- **`docker_entrypoint.sh`** — for manual `python3` / MCP runs via the running - **`docker_entrypoint.sh`** — for manual `python3` / MCP runs via the running
container. Edit the `${VAR:-default}` block, or override via the StartOS container and for the background sync scheduler. Edit the `${VAR:-default}`
service environment. block, or override via the StartOS service environment.
- **`startos/actions/buildSearchIndex.ts`** (`ingestEnv`) — for the "Build search - **`startos/actions/buildSearchIndex.ts`** and
index" action, which runs in its own subcontainer and does **not** execute the **`startos/actions/refreshSearchIndex.ts`** (`ingestEnv`) — for the "Build
entrypoint, so it carries its own copy of the values. Edit these to match. search index" and "Refresh search index" actions, which run in their own
subcontainers and do **not** execute the entrypoint, so each carries its own
copy of the values. Edit these to match. (`CRM_INGEST_SYNC_INTERVAL_MIN` only
matters to the entrypoint's scheduler loop, not to the actions.)
> Keep the two copies in sync. They are duplicated because the action's > Keep the copies in sync. They are duplicated because the actions'
> subcontainer never runs `docker_entrypoint.sh`; there is no shared config > subcontainers never run `docker_entrypoint.sh`; there is no shared config
> store wired into this package today (see "Still needed" below). > store wired into this package today (see "Still needed" below).
Verify reachability from the box before running the action: Verify reachability from the box before running the action:
@@ -118,13 +180,16 @@ That is deliberately deferred to a later phase.
- **MCP-as-a-service** — see above. Deferred until there is a live agent and a - **MCP-as-a-service** — see above. Deferred until there is a live agent and a
network transport; today it is manual/stdio only. network transport; today it is manual/stdio only.
- **Incremental sync (runbook Step 6 / Workstream B4)** — the action does a full - ~~**Incremental sync (runbook Step 6 / Workstream B4)**~~**done.** The
one-shot rebuild. Keeping the index fresh as the CRM changes needs an background sync scheduler (`sync_scheduler.py`, started by the entrypoint) keeps
incremental, idempotent sync on a schedule. Until that exists, re-running the the index fresh automatically, and the manual "Refresh search index" action
"Build search index" action is the refresh path. When built, it could be wired provides an on-demand incremental sync. See "Keeping the index fresh" above. A
as a recurring StartOS action/task rather than a manual re-run. future enhancement could still promote the scheduler to a first-class StartOS
daemon (with a network transport for a real health check) for independent
supervision/visibility.
- **Single source of truth for Spark/Qdrant config** — currently duplicated in - **Single source of truth for Spark/Qdrant config** — currently duplicated in
`docker_entrypoint.sh` and `buildSearchIndex.ts`. A small StartOS config `docker_entrypoint.sh`, `buildSearchIndex.ts`, and `refreshSearchIndex.ts`. A
small StartOS config
store + input form (the SDK supports `Action.withInput` and a service config) store + input form (the SDK supports `Action.withInput` and a service config)
would let the operator set the endpoints once in the UI; deferred to keep this would let the operator set the endpoints once in the UI; deferred to keep this
change minimal and reviewable. change minimal and reviewable.
@@ -133,10 +198,32 @@ That is deliberately deferred to a later phase.
env). Not required given the exported env above, but available as an env). Not required given the exported env above, but available as an
alternative if the operator prefers a file. alternative if the operator prefers a file.
## Pre-release checks
Verify before cutting a release:
- **Multi-arch dependency build (BLOCKER).** The `fastembed==0.4.2` and
`mcp==1.2.0` pins in `Dockerfile` were chosen best-effort and have **not** been
confirmed to build on **both** `x86_64` and `aarch64`. StartOS targets arm64,
and `fastembed` pulls `onnxruntime` (which may have no prebuilt arm64 wheel and
fall back to a slow source build) plus downloads a model on first use. Build the
image on aarch64 and run the ingest once end-to-end before release. Do not bump
either pin without re-verifying on both arches. (Flagged inline above the pip
line in `Dockerfile`.)
- **Scheduler smoke test.** With Spark Control + Qdrant reachable, start the
container and confirm the entrypoint logs
`[entrypoint] ingest sync scheduler: STARTED`, that `/data/ingest-sync.log`
accumulates sync output, and that clearing one of the endpoints flips the log to
`SKIPPED`.
- **Actions present.** Confirm both **Build search index** and **Refresh search
index** appear under the service's Actions in the StartOS UI and run to success.
## Constraints honored ## Constraints honored
- No files under `backend/ingest/`, `backend/mcp/`, `backend/server.py`, - No files under `backend/ingest/`, `backend/mcp/`, `backend/server.py`,
`backend/core_migrations.py`, `backend/migrations/`, or `data/` were modified; `backend/core_migrations.py`, `backend/migrations/`, or `data/` were modified;
only `start9/0.4/**` and this new doc. only `start9/0.4/**` and this doc. The entrypoint and the refresh action
reference `backend/ingest/sync_scheduler.py` and `backend/ingest/sync.py` by
path only — those scripts are owned/created by a separate process.
- No build/deploy commands were run. `npx tsc --noEmit` was used only to verify - No build/deploy commands were run. `npx tsc --noEmit` was used only to verify
the new TypeScript compiles against the SDK types. the new TypeScript compiles against the SDK types.
+17
View File
@@ -73,6 +73,23 @@ export CRM_DB_PATH="${CRM_DB_PATH:-$DATA_DIR/crm.db}"
export SPARK_CONTROL_URL="${SPARK_CONTROL_URL:-https://192.168.1.72:62419}" export SPARK_CONTROL_URL="${SPARK_CONTROL_URL:-https://192.168.1.72:62419}"
export SPARK_CONTROL_VERIFY_TLS="${SPARK_CONTROL_VERIFY_TLS:-false}" export SPARK_CONTROL_VERIFY_TLS="${SPARK_CONTROL_VERIFY_TLS:-false}"
export QDRANT_URL="${QDRANT_URL:-http://192.168.1.87:6333}" export QDRANT_URL="${QDRANT_URL:-http://192.168.1.87:6333}"
# OPERATOR: how often (minutes) the background sync scheduler re-runs the
# incremental ingest sync to keep the Qdrant search index fresh. Default 60.
export CRM_INGEST_SYNC_INTERVAL_MIN="${CRM_INGEST_SYNC_INTERVAL_MIN:-60}"
# ── Background ingest sync scheduler ────────────────────────────
# Keep the Qdrant search index fresh hands-off: sync_scheduler.py loops the
# incremental sync every CRM_INGEST_SYNC_INTERVAL_MIN minutes. It runs as a
# BACKGROUND process (not a StartOS daemon) — see INGEST_PACKAGING.md for the
# daemon-vs-background-process tradeoff. Started only when ingest is configured,
# i.e. both Spark Control and Qdrant endpoints are set; otherwise the loop would
# just error every interval with nothing to talk to.
#
# The scheduler is pointed at the exported CRM_DB_PATH (not a hard-coded path)
# so it always indexes the same database the rest of this entrypoint configured.
#
# NOTE(review): the exports above use ${VAR:-default}, which substitutes the
# default for an EMPTY value as well as an unset one, so as written both URLs
# are always non-empty here and the SKIPPED branch cannot be reached by clearing
# a variable. Confirm whether ${VAR-default} (no colon) was intended.
if [ -n "${SPARK_CONTROL_URL:-}" ] && [ -n "${QDRANT_URL:-}" ]; then
  # Subshell + trailing '&': the scheduler is detached from this shell, which
  # is about to be replaced by the web server. stdout and stderr are appended
  # to the log file.
  (cd /app/backend/ingest && python3 sync_scheduler.py --db "$CRM_DB_PATH" >> /data/ingest-sync.log 2>&1 &)
  # "STARTED" means "launched"; a later crash is only visible in the log file.
  echo "[entrypoint] ingest sync scheduler: STARTED"
else
  echo "[entrypoint] ingest sync scheduler: SKIPPED (Spark/Qdrant not configured)"
fi
# ── Launch the app ────────────────────────────────────────────── # ── Launch the app ──────────────────────────────────────────────
exec python3 /app/backend/server.py exec python3 /app/backend/server.py
+4 -1
View File
@@ -1,4 +1,7 @@
import { sdk } from '../sdk' import { sdk } from '../sdk'
import { buildSearchIndex } from './buildSearchIndex' import { buildSearchIndex } from './buildSearchIndex'
import { refreshSearchIndex } from './refreshSearchIndex'
export const actions = sdk.Actions.of().addAction(buildSearchIndex) export const actions = sdk.Actions.of()
.addAction(buildSearchIndex)
.addAction(refreshSearchIndex)
@@ -0,0 +1,109 @@
import { i18n } from '../i18n'
import { sdk } from '../sdk'
import { DATA_MOUNT_PATH, IMAGE_ID } from '../utils'
/**
* Manual "Refresh search index" action (Phase-0 ingest, incremental sync).
*
* Runs an incremental, idempotent sync that brings the Qdrant search index up
* to date with CRM changes since the last sync, on the box where /data/crm.db
* lives:
*
* sync.py --db /data/crm.db (chunk → dense+BM25 → Qdrant, changed records only)
*
* Unlike "Build search index" this does NOT pass --recreate: it does not drop
* the collection, only upserts the delta. It is fast and safe to re-run any
* time; it is the manual counterpart to the background sync scheduler that runs
* automatically when ingest is configured (see INGEST_PACKAGING.md).
*
* Implementation notes:
* - The scripts import their siblings by bare name (`import config`, etc.),
* so they must run with cwd = /app/backend/ingest.
* - sync.py talks to Spark Control (dense embeds) and Qdrant (upserts), so the
* Spark/Qdrant env must be present. This action runs in its OWN subcontainer
* and does NOT go through docker_entrypoint.sh, so it cannot inherit the
* entrypoint's exports — the env is passed explicitly below.
* - allowedStatuses: 'any' — the action runs in its own subcontainer with the
* same /data volume mounted, so it works whether or not the CRM is running.
* SQLite WAL mode means a concurrently-running CRM is fine for these
* reads/derived writes.
*/
/** Path of the CRM SQLite database inside the mounted data volume. */
const DB_PATH = DATA_MOUNT_PATH + '/crm.db'

/** Directory the ingest scripts are run from (used as the command's cwd). */
const INGEST_DIR = '/app/backend/ingest'

// OPERATOR: endpoints used by the ingest run (Spark Control + Qdrant). The
// values below are the Ten31 LAN defaults; change them to suit your network and
// keep them identical to the export block in docker_entrypoint.sh. That block
// is the reference copy; this action carries a duplicate only because it never
// runs the entrypoint. Spark Control serves TLS with a self-signed certificate
// by default, which is why SPARK_CONTROL_VERIFY_TLS is 'false'.
const ingestEnv: Record<string, string> = {
  CRM_DB_PATH: DB_PATH,
  SPARK_CONTROL_URL: 'https://192.168.1.72:62419',
  SPARK_CONTROL_VERIFY_TLS: 'false',
  QDRANT_URL: 'http://192.168.1.87:6333',
}
/**
 * StartOS action `refresh-search-index`.
 *
 * Creates a subcontainer from the service image with the `main` volume mounted
 * read-write at DATA_MOUNT_PATH, runs `python3 sync.py --db <DB_PATH>` from the
 * ingest directory with the explicit ingest env, and always destroys the
 * subcontainer afterwards, whether or not the command succeeded.
 */
export const refreshSearchIndex = sdk.Action.withoutInput(
  // Action id
  'refresh-search-index',

  // Metadata shown for the action
  async () => ({
    name: i18n('Refresh search index'),
    description: i18n(
      'Incrementally update the search index with CRM changes since the last ' +
        'sync; fast, idempotent. Runs sync.py (chunk → embed → upsert) for only ' +
        'the records that changed, without dropping the Qdrant `crm_chunks` ' +
        'collection. Requires Spark Control and Qdrant to be reachable (set ' +
        'SPARK_CONTROL_URL / QDRANT_URL). Use "Build search index" instead for a ' +
        'full rebuild from scratch.',
    ),
    warning: null,
    allowedStatuses: 'any',
    group: null,
    visibility: 'enabled',
  }),

  // Execution
  async ({ effects }) => {
    // 30 minutes. An incremental delta usually takes seconds to minutes; the
    // generous ceiling leaves room for a large backlog of changes.
    const timeoutMs = 30 * 60 * 1000

    const dataMount = sdk.Mounts.of().mountVolume({
      volumeId: 'main',
      subpath: null,
      mountpoint: DATA_MOUNT_PATH,
      readonly: false,
    })

    const sandbox = await sdk.SubContainer.of(
      effects,
      { imageId: IMAGE_ID },
      dataMount,
      'ten31-database-refresh-search-index',
    )

    try {
      // Incremental sync only (no --recreate): chunk → dense (Spark Control)
      // + BM25 → Qdrant upsert for the records that changed.
      await sandbox.execFail(
        ['python3', 'sync.py', '--db', DB_PATH],
        { cwd: INGEST_DIR, env: ingestEnv },
        timeoutMs,
      )
    } finally {
      // Tear the subcontainer down on success and on failure alike.
      await sandbox.destroy()
    }

    return {
      version: '1',
      title: i18n('Search index refreshed'),
      message: i18n(
        'The Qdrant `crm_chunks` collection was incrementally updated with CRM ' +
          'changes since the last sync. You can re-run this action any time.',
      ),
      result: null,
    }
  },
)