Phase 1: dual approval default, web-UI index jobs + merge review queue, thesis v2

- Dual sign-off is now the default (thesis_required_approvals defaults to 2).
- Entity-merge review queue (migration 0003): the fuzzy/Qwen tier no longer
  auto-merges — it writes CANDIDATES (entity_merge_candidates) with a same/different
  suggestion + confidence + reason for a human to approve (merge) or reject (keep
  separate). entity_merge.py applies/rejects (durable via entity_merges, soft-delete,
  repoint links+edges); decided pairs aren't re-surfaced.
- entity_jobs.py: UI-triggered background index jobs (rebuild/update/find-duplicates)
  as subprocesses with a one-at-a-time lock; status in /api/system/status.
- server.py: /api/index/{rebuild,update}, /api/entities/find-duplicates,
  /api/entities/merge-candidates [+ /{id} decide] — admin-gated.
- docs/thesis-seed-v2.md: concrete, plain-English rewrite per Grant's feedback.

Backend verified end-to-end on synthetic data (candidate gen -> approve/reject).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Keysat
2026-06-05 11:14:12 -05:00
parent fa2a5ce95f
commit cd3cca725c
8 changed files with 336 additions and 65 deletions
+65
View File
@@ -0,0 +1,65 @@
"""Background index / entity-resolution jobs, triggered from the CRM web UI.
Replaces the StartOS actions with one-click web-UI buttons. Runs the ingest
scripts as SUBPROCESSES (cwd backend/ingest) so the heavy deps (fastembed, the
Spark clients) are not imported into the CRM server process, and so each run is
isolated. One job at a time (a process-level lock); progress/result is exposed
via get_status() and surfaced in /api/system/status.
Jobs:
rebuild_index — full re-chunk + re-embed into Qdrant (sync.py --recreate)
update_index — incremental sync (sync.py)
find_duplicates — deterministic resolution + Qwen suggestions -> review queue
"""
import os
import subprocess
import threading
from datetime import datetime, timezone
# Directory of the ingest scripts (an "ingest" folder next to this module);
# used as the working directory for every job subprocess.
INGEST_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ingest")
# Job kind -> base argv. _run() appends ["--db", <db_path>] before launching.
_JOBS = {
    "rebuild_index": ["python3", "sync.py", "--recreate"],
    "update_index": ["python3", "sync.py"],
    "find_duplicates": ["python3", "fuzzy_resolve.py"],
}
# Guards every read and write of _state.
_lock = threading.Lock()
# Status of the current / most recent job. "running" doubles as the
# one-job-at-a-time flag; "result" is "ok" or an "error..." string and "tail"
# holds the end of the job's combined output (or the exception text).
_state = {"running": False, "kind": None, "started_at": None,
          "finished_at": None, "result": None, "tail": None}
def _now():
    """Return the current UTC time as a naive ISO-8601 string suffixed with "Z"."""
    utc_naive = datetime.now(timezone.utc).replace(tzinfo=None)
    return f"{utc_naive.isoformat()}Z"
def get_status():
    """Return a shallow copy of the job state, taken while holding the lock."""
    with _lock:
        snapshot = _state.copy()
    return snapshot
def start(kind, db_path):
    """Launch job `kind` on a background daemon thread, one job at a time.

    Returns {"started": True, "kind": kind} on success,
    {"error": "unknown_job", "allowed": [...]} when `kind` is not a key of
    _JOBS, or {"error": "job_running", "kind": <running kind>} when a job is
    already in flight. `db_path` is handed to the job unchanged.
    """
    if kind not in _JOBS:
        return {"error": "unknown_job", "allowed": list(_JOBS)}

    # Check-and-claim happens under the lock so two requests cannot both win.
    with _lock:
        if _state["running"]:
            return {"error": "job_running", "kind": _state["kind"]}
        _state.update({
            "running": True,
            "kind": kind,
            "started_at": _now(),
            "finished_at": None,
            "result": None,
            "tail": None,
        })

    worker = threading.Thread(target=_run, args=(kind, db_path), daemon=True)
    worker.start()
    return {"started": True, "kind": kind}
def _run(kind, db_path):
    """Thread body: run the subprocess for `kind` and record the outcome in _state.

    The child runs in INGEST_DIR with "--db <db_path>" appended to its argv and
    CRM_DB_PATH set in its environment, capped at 3600 seconds. Afterwards
    _state gets running=False, a finish timestamp, a result string ("ok",
    "error (exit N)" or "error") and the last 1500 characters of combined
    stdout+stderr (or the exception text).
    """
    argv = [*_JOBS[kind], "--db", db_path]
    child_env = {**os.environ, "CRM_DB_PATH": db_path}
    try:
        proc = subprocess.run(argv, cwd=INGEST_DIR, env=child_env,
                              capture_output=True, text=True, timeout=3600)
        output = (proc.stdout + proc.stderr).strip()[-1500:]
        if proc.returncode == 0:
            outcome = "ok"
        else:
            outcome = f"error (exit {proc.returncode})"
    except Exception as err:  # noqa: BLE001
        # Launch failures and timeouts land here; the job slot must still be freed.
        outcome = "error"
        output = str(err)
    with _lock:
        _state.update({
            "running": False,
            "finished_at": _now(),
            "result": outcome,
            "tail": output,
        })
+79
View File
@@ -0,0 +1,79 @@
"""Human-decided entity-merge candidates (Phase 1). Called by server.py routes —
NOT an agent tool. A partner approves (merge) or rejects (keep separate) each
candidate the fuzzy tier surfaced. Approvals apply the merge and record it in
entity_merges (durable, so deterministic resolution respects it); rejections are
remembered so the pair is not re-surfaced. Everything is logged.
"""
import json
import sqlite3
import uuid
from datetime import datetime, timezone
def _now():
    """Current UTC time as an ISO-8601 string without offset, ending in "Z"."""
    moment = datetime.now(timezone.utc)
    return moment.replace(tzinfo=None).isoformat() + "Z"
def _conn(db):
    """Open a SQLite connection to `db` with foreign keys enforced and Row results."""
    handle = sqlite3.connect(db)
    handle.execute("PRAGMA foreign_keys=ON")
    handle.row_factory = sqlite3.Row
    return handle
def _log(c, actor_id, action, target_id, payload):
    """Insert one interaction_log row for a human action taken in the CRM UI.

    The row is written with actor_type "human", target_type "canonical_entity"
    and source "crm_ui". `payload` is stored as JSON text, or NULL when it is
    None. Nothing is committed here; the caller owns the transaction.
    """
    payload_json = None if payload is None else json.dumps(payload)
    row = (
        str(uuid.uuid4()), _now(), "human", actor_id, action,
        "canonical_entity", target_id, payload_json, "crm_ui", _now(),
    )
    c.execute("""INSERT INTO interaction_log
        (id, ts, actor_type, actor_id, action, target_type, target_id, payload, source, created_at)
        VALUES (?,?,?,?,?,?,?,?,?,?)""", row)
def list_candidates(db, status="pending"):
    """Return merge candidates with the given status, highest confidence first.

    Args:
        db: path of the SQLite database.
        status: value matched against entity_merge_candidates.status.

    Returns:
        {"candidates": [row dicts], "count": number of rows}; ties on
        confidence are ordered newest first.
    """
    c = _conn(db)
    try:
        rows = [dict(r) for r in c.execute(
            "SELECT * FROM entity_merge_candidates WHERE status=? ORDER BY confidence DESC, created_at DESC",
            (status,))]
    finally:
        # Close even when the query raises (e.g. the table is missing), so a
        # failing request does not leak the connection.
        c.close()
    return {"candidates": rows, "count": len(rows)}
def _apply_merge(c, survivor, loser, confidence, reason):
    """Fold `loser` into `survivor` on connection `c` (no commit is issued here).

    Steps, in order: upsert the durable entity_merges record, repoint the
    loser's entity_links, repoint relationship edges, delete edges that could
    not be repointed, then soft-delete the loser (never hard-delete —
    guardrail #3).
    """
    # NOTE(review): a falsy confidence (None, but also 0.0) is stored as 0.8 —
    # confirm that 0.0 is meant to be replaced.
    stored_confidence = confidence or 0.8
    merge_row = (loser, survivor, stored_confidence, reason, _now())
    c.execute("""INSERT INTO entity_merges (merged_id, survivor_id, confidence, reason, created_at)
        VALUES (?,?,?,?,?)
        ON CONFLICT(merged_id) DO UPDATE SET survivor_id=excluded.survivor_id,
        confidence=excluded.confidence, reason=excluded.reason""", merge_row)

    c.execute("UPDATE entity_links SET canonical_id=?, match_kind='fuzzy_merge' WHERE canonical_id=?",
              (survivor, loser))

    # Repoint both ends of every edge. OR IGNORE skips rows that would clash
    # with a uniqueness rule; whatever still references the loser afterwards is
    # a duplicate of an edge the survivor already has, so it is deleted.
    # NOTE(review): an edge directly between loser and survivor would end up
    # pointing from the survivor to itself — confirm whether that can occur.
    for column in ("src_id", "dst_id"):
        c.execute(f"UPDATE OR IGNORE relationship_edges SET {column}=? WHERE {column}=?",
                  (survivor, loser))
    c.execute("DELETE FROM relationship_edges WHERE src_id=? OR dst_id=?", (loser, loser))

    c.execute("UPDATE canonical_entities SET deleted_at=?, updated_at=? WHERE id=?",
              (_now(), _now(), loser))
def decide(db, candidate_id, decision, decided_by):
    """Record a human decision on one merge candidate, applying the merge on approval.

    Args:
        db: path of the SQLite database.
        candidate_id: entity_merge_candidates.id of the candidate.
        decision: "approve" (merge entity_b into entity_a) or "reject".
        decided_by: id of the deciding user; stored on the candidate and in the log.

    Returns:
        {"id", "decision", "merged"} on success, where "merged" is the loser's
        id for an approval and None for a rejection. Otherwise an error dict:
        "bad_decision", "not_found" or "already_decided".

    The merge, the status update and the log row are committed together; if
    any step raises, nothing is committed and the connection is still closed.
    """
    if decision not in ("approve", "reject"):
        return {"error": "bad_decision", "allowed": ["approve", "reject"]}
    approved = decision == "approve"
    status = "approved" if approved else "rejected"

    c = _conn(db)
    try:
        cand = c.execute("SELECT * FROM entity_merge_candidates WHERE id=?", (candidate_id,)).fetchone()
        if not cand:
            return {"error": "not_found", "candidate_id": candidate_id}
        if cand["status"] != "pending":
            return {"error": "already_decided", "status": cand["status"]}
        if approved:
            _apply_merge(c, cand["entity_a"], cand["entity_b"], cand["confidence"], cand["reason"])
        c.execute("UPDATE entity_merge_candidates SET status=?, decided_by=?, decided_at=? WHERE id=?",
                  (status, decided_by, _now(), candidate_id))
        # Build the action from the status word: appending "d" to the decision
        # produced "entity.merge_rejectd" for rejections.
        _log(c, decided_by, f"entity.merge_{status}", cand["entity_a"],
             {"survivor": cand["entity_a"], "loser": cand["entity_b"],
              "names": [cand["name_a"], cand["name_b"]]})
        c.commit()
    finally:
        # Runs on every exit path, including the early error returns above.
        c.close()
    return {"id": candidate_id, "decision": decision,
            "merged": (cand["entity_b"] if approved else None)}
+47 -63
View File
@@ -1,19 +1,15 @@
#!/usr/bin/env python3
"""Phase-0 Workstream B3 — fuzzy entity-resolution tier (local Qwen).
"""Phase-1 — fuzzy entity-resolution tier (local Qwen), REVIEW-QUEUE mode.
The deterministic tier (entity_resolution.py) merges only provable matches and
FLAGS the hard name-variant candidates (same firm + surname, different first
name/email) without guessing. This tier asks the local Qwen model (Spark Control
/v1/chat/completions — sovereign, on Ten31 infra) to adjudicate each candidate
and merges the confirmed ones.
A merge repoints the loser's entity_links to the survivor and soft-deletes the
loser canonical entity (deleted_at; never hard-deleted — guardrail #3). Every
merge is written to the interaction_log (guardrail #5). Idempotent: re-running
finds no new candidates once merged.
The deterministic tier (entity_resolution.py) flags hard name-variant candidates
(same firm + surname, different first name/email) without guessing. This tier asks
the local Qwen model (Spark Control — sovereign) for a SUGGESTION on each, and
writes a CANDIDATE row to entity_merge_candidates for a human to approve (merge)
or reject (keep separate) in the CRM web UI. It NO LONGER auto-merges — uncertainty
is surfaced, not applied (the human decides). Already-decided pairs and
already-merged entities are skipped, so re-running is safe and quiet.
python3 backend/ingest/fuzzy_resolve.py --db data/crm_dev.db
python3 backend/ingest/fuzzy_resolve.py --db data/crm_dev.db --dry-run
"""
import argparse
import json
@@ -32,7 +28,7 @@ _SYSTEM = ("You are an entity-resolution assistant for a CRM. Decide if the list
def _now():
return datetime.now(timezone.utc).isoformat()
return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
def _ask(members, firm):
@@ -40,76 +36,64 @@ def _ask(members, firm):
prompt = (f"Firm: {firm or 'unknown'}\nPeople: {people}\n\n"
"Are these the SAME person under name variants? "
'Answer only JSON: {"same": true|false, "confidence": 0.0-1.0, "reason": "..."}')
return llm.chat_json(prompt, system=_SYSTEM, max_tokens=160) or {"same": False, "confidence": 0.0}
return llm.chat_json(prompt, system=_SYSTEM, max_tokens=160) or {"same": False, "confidence": 0.0, "reason": ""}
def _survivor(members):
# Prefer a member with an email, then the longest (most complete) name.
return sorted(members, key=lambda m: (bool(m[2]), len(m[1])), reverse=True)[0]
def run(db, threshold=0.7, dry_run=False):
counts, candidates = er.run(db) # ensure deterministic state + fresh candidates
def run(db, db_path=None):
db = db_path or db
counts, candidates = er.run(db) # deterministic state (respects prior merges) + fresh candidates
conn = sqlite3.connect(db)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA foreign_keys=ON")
name_of = {r["id"]: r["display_name"] for r in conn.execute("SELECT id, display_name FROM canonical_entities")}
merges = []
decided = {frozenset((r["entity_a"], r["entity_b"]))
for r in conn.execute("SELECT entity_a, entity_b FROM entity_merge_candidates")}
merged = {r[0] for r in conn.execute("SELECT merged_id FROM entity_merges")}
created = skipped = 0
for cand in candidates:
members = cand["members"]
verdict = _ask(members, name_of.get(cand["org"]))
same = bool(verdict.get("same")) and float(verdict.get("confidence", 0)) >= threshold
decision = {"surname": cand["surname"], "firm": name_of.get(cand["org"]),
"members": [{"id": m[0], "name": m[1]} for m in members],
"same": same, "confidence": verdict.get("confidence"),
"reason": verdict.get("reason")}
if same:
keep = _survivor(members)
losers = [m for m in members if m[0] != keep[0]]
decision["merged_into"] = {"id": keep[0], "name": keep[1]}
if not dry_run:
for loser in losers:
# Record the merge durably so deterministic re-runs respect it.
conn.execute("""INSERT INTO entity_merges (merged_id, survivor_id, confidence, reason, created_at)
VALUES (?,?,?,?,?)
ON CONFLICT(merged_id) DO UPDATE SET survivor_id=excluded.survivor_id,
confidence=excluded.confidence, reason=excluded.reason""",
(loser[0], keep[0], verdict.get("confidence", 0.7),
verdict.get("reason"), _now()))
conn.execute("UPDATE entity_links SET canonical_id=?, match_kind='fuzzy_merge', confidence=? "
"WHERE canonical_id=?", (keep[0], verdict.get("confidence", 0.7), loser[0]))
conn.execute("UPDATE canonical_entities SET deleted_at=?, updated_at=? WHERE id=?",
(_now(), _now(), loser[0]))
conn.execute("""INSERT INTO interaction_log
(id, ts, actor_type, actor_id, action, target_type, target_id, payload, source, created_at)
VALUES (?,?,?,?,?,?,?,?,?,?)""",
(str(uuid.uuid4()), _now(), "agent", "qwen_entity_resolver", "entity.merged",
"canonical_entity", keep[0], json.dumps(decision), "ingest", _now()))
merges.append(decision)
keep = _survivor(members)
losers = [m for m in members if m[0] != keep[0]]
verdict = _ask(members, name_of.get(cand["org"])) # one Qwen call per group
for loser in losers:
pair = frozenset((keep[0], loser[0]))
if pair in decided or loser[0] in merged or keep[0] in merged:
skipped += 1
continue
conn.execute("""
INSERT INTO entity_merge_candidates
(id, entity_a, entity_b, name_a, name_b, email_a, email_b, context, verdict, confidence, reason, status, created_at)
VALUES (?,?,?,?,?,?,?,?,?,?,?, 'pending', ?)
ON CONFLICT(entity_a, entity_b) DO NOTHING
""", (str(uuid.uuid4()), keep[0], loser[0], keep[1], loser[1], keep[2], loser[2],
f"{cand['surname']} @ {name_of.get(cand['org']) or 'unknown'}",
'same' if verdict.get('same') else 'different', verdict.get('confidence'),
verdict.get('reason'), _now()))
decided.add(pair)
created += 1
if not dry_run:
conn.commit()
live_people = conn.execute("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person' AND deleted_at IS NULL").fetchone()[0]
conn.execute("""INSERT INTO interaction_log
(id, ts, actor_type, actor_id, action, target_type, payload, source, created_at)
VALUES (?,?,?,?,?,?,?,?,?)""",
(str(uuid.uuid4()), _now(), "agent", "qwen_entity_resolver", "entity.candidates_generated",
"canonical_entities", json.dumps({"created": created, "skipped": skipped}), "ingest", _now()))
conn.commit()
conn.close()
return merges, live_people
return {"candidates_created": created, "skipped_existing": skipped, "flagged_groups": len(candidates)}
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--db", default="data/crm_dev.db")
ap.add_argument("--threshold", type=float, default=0.7)
ap.add_argument("--dry-run", action="store_true")
args = ap.parse_args()
merges, live = run(args.db, threshold=args.threshold, dry_run=args.dry_run)
print(f"Adjudicated {len(merges)} candidate group(s){' (dry run)' if args.dry_run else ''}:")
for m in merges:
names = " / ".join(p["name"] for p in m["members"])
verdict = f"MERGE -> {m['merged_into']['name']}" if m.get("merged_into") else "keep separate"
print(f" [{m['surname']}] {names}: same={m['same']} conf={m['confidence']} => {verdict}")
if m.get("reason"):
print(f" reason: {m['reason']}")
print(f"Live person entities now: {live}")
s = run(args.db)
print(f"Fuzzy review: {s['candidates_created']} new candidate(s) for review, "
f"{s['skipped_existing']} already decided ({s['flagged_groups']} flagged groups).")
if __name__ == "__main__":
@@ -0,0 +1,2 @@
-- Reverse of 0003_entity_merge_review.sql: drop the review-queue table and
-- remove the migration's row from schema_migrations (presumably so the
-- migration runner treats 0003 as not applied — confirm against the runner).
DROP TABLE IF EXISTS entity_merge_candidates;
DELETE FROM schema_migrations WHERE filename = '0003_entity_merge_review.sql';
@@ -0,0 +1,25 @@
-- Phase 1 — entity-merge review queue.
--
-- ADDITIVE/REVERSIBLE. The fuzzy (local-Qwen) tier no longer auto-merges; it
-- writes CANDIDATES here for a human to approve (same entity -> merge) or reject
-- (different entities -> keep separate), surfaced in the CRM web UI. Approved
-- candidates apply the merge and are recorded in entity_merges (durable);
-- rejected pairs are remembered so they are not re-surfaced.
-- One row per suggested (survivor, loser) pair, together with its review state.
CREATE TABLE IF NOT EXISTS entity_merge_candidates (
    id TEXT PRIMARY KEY,
    entity_a TEXT NOT NULL, -- survivor (kept) canonical id
    entity_b TEXT NOT NULL, -- would be merged INTO entity_a
    name_a TEXT, name_b TEXT,
    email_a TEXT, email_b TEXT,
    context TEXT, -- firm / surname context for the reviewer
    verdict TEXT, -- 'same' | 'different' (local-Qwen suggestion)
    confidence REAL,
    reason TEXT, -- Qwen's reasoning (why it thinks same/different)
    status TEXT NOT NULL DEFAULT 'pending', -- pending | approved | rejected
    decided_by TEXT, -- users.id of the partner who decided
    decided_at TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    -- Order-sensitive: (a, b) and (b, a) are different pairs to this constraint.
    UNIQUE(entity_a, entity_b)
);
-- The review queue is read filtered by status.
CREATE INDEX IF NOT EXISTS idx_merge_candidates_status ON entity_merge_candidates(status);
+58
View File
@@ -43,6 +43,16 @@ try:
except Exception:
thesis_review = None
# Phase-1: entity-merge review + UI-triggered index jobs (guarded).
try:
import entity_merge # type: ignore
except Exception:
entity_merge = None
try:
import entity_jobs # type: ignore
except Exception:
entity_jobs = None
# ─── Configuration ────────────────────────────────────────────────────────────
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -1749,6 +1759,10 @@ class CRMHandler(BaseHTTPRequestHandler):
if re.match(r'^/api/thesis/[^/]+/canonical$', path):
return self.handle_get_canonical_thesis(user, path.split('/')[-2])
# ─── Entity-merge review queue ───
if path == '/api/entities/merge-candidates':
return self.handle_list_merge_candidates(user, params)
self.send_error_json("Not found", 404)
def do_POST(self):
@@ -1817,6 +1831,16 @@ class CRMHandler(BaseHTTPRequestHandler):
if re.match(r'^/api/thesis/versions/[^/]+/review$', path):
return self.handle_thesis_review(user, path.split('/')[-2], body)
# ─── UI-triggered index jobs + entity-merge decisions (Phase 1) ───
if path == '/api/index/rebuild':
return self.handle_index_job(user, 'rebuild_index')
if path == '/api/index/update':
return self.handle_index_job(user, 'update_index')
if path == '/api/entities/find-duplicates':
return self.handle_index_job(user, 'find_duplicates')
if re.match(r'^/api/entities/merge-candidates/[^/]+$', path):
return self.handle_decide_merge_candidate(user, path.split('/')[-1], body)
self.send_error_json("Not found", 404)
def do_PUT(self):
@@ -3462,9 +3486,43 @@ class CRMHandler(BaseHTTPRequestHandler):
"SELECT ts, actor_type, actor_id, action FROM interaction_log ORDER BY ts DESC LIMIT 12")]
except Exception:
out['recent_activity'] = []
try:
out['pending_merge_candidates'] = conn.execute(
"SELECT COUNT(*) FROM entity_merge_candidates WHERE status='pending'").fetchone()[0]
except Exception:
out['pending_merge_candidates'] = None
out['index_job'] = entity_jobs.get_status() if entity_jobs else None
conn.close()
self.send_json({"data": out})
# ─── UI-triggered index jobs + entity-merge review (Phase 1) ───
def handle_index_job(self, user, kind):
    """Admin-only: start background job `kind` and report whether it launched.

    Responds 403 for non-admins, 503 when the entity_jobs module failed to
    import, 409 with the error code when the job could not be started, and
    otherwise {"data": <start result>}.
    """
    if not require_admin(user):
        return self.send_error_json("Admin required", 403)
    if entity_jobs is None:
        return self.send_error_json("Jobs unavailable", 503)

    outcome = entity_jobs.start(kind, DB_PATH)
    failure = outcome.get('error')
    if not failure:
        return self.send_json({"data": outcome})
    return self.send_error_json(failure, 409)
def handle_list_merge_candidates(self, user, params):
    """Admin-only: list merge candidates, filtered by ?status= (default "pending").

    Responds 403 for non-admins and 503 when the entity_merge module failed to
    import. The listing is sent as returned by entity_merge.list_candidates,
    without a {"data": ...} wrapper.
    """
    if not require_admin(user):
        return self.send_error_json("Admin required", 403)
    if entity_merge is None:
        return self.send_error_json("Unavailable", 503)

    # NOTE(review): assumes params maps each key to a single string, not a
    # list of values — confirm against how the query string is parsed.
    wanted_status = params.get('status', 'pending')
    listing = entity_merge.list_candidates(DB_PATH, wanted_status)
    return self.send_json(listing)
def handle_decide_merge_candidate(self, user, candidate_id, body):
    """Admin-only: approve or reject one merge candidate.

    Reads "decision" from the JSON body and records it for the current user.
    Responds 403 for non-admins, 503 when the entity_merge module failed to
    import, and otherwise {"data": <decide result>}. Errors from decide() are
    mapped to a status that matches their meaning: 404 for "not_found",
    409 for "already_decided", 400 for anything else (e.g. "bad_decision").
    """
    if not require_admin(user):
        return self.send_error_json("Admin required", 403)
    if entity_merge is None:
        return self.send_error_json("Unavailable", 503)

    decision = (body or {}).get('decision')
    res = entity_merge.decide(DB_PATH, candidate_id, decision, user['user_id'])
    error = res.get('error')
    if error:
        # A missing candidate or a lost race is not a malformed request, so do
        # not report them as 400.
        status_by_error = {"not_found": 404, "already_decided": 409}
        return self.send_error_json(error, status_by_error.get(error, 400))
    return self.send_json({"data": res})
# ─── Architect thesis (Phase 1) ───
def handle_list_thesis_lines(self, user):
if thesis_review is None:
+4 -2
View File
@@ -37,11 +37,13 @@ def _log(c, actor_id, action, target_id, payload):
def required_approvals(c):
# Default 2 = dual sign-off (both partners). Override via app_settings
# 'thesis_required_approvals' if you ever want single-approver.
r = c.execute("SELECT value_json FROM app_settings WHERE key='thesis_required_approvals'").fetchone()
try:
return max(1, int(json.loads(r[0]))) if r else 1
return max(1, int(json.loads(r[0]))) if r else 2
except Exception:
return 1
return 2
def _approver_count(c, version_id):