cd3cca725c
- Dual sign-off is now the default (thesis_required_approvals defaults to 2).
- Entity-merge review queue (migration 0003): the fuzzy/Qwen tier no longer
auto-merges — it writes CANDIDATES (entity_merge_candidates) with a same/different
suggestion + confidence + reason for a human to approve (merge) or reject (keep
separate). entity_merge.py applies/rejects (durable via entity_merges, soft-delete,
repoint links+edges); decided pairs aren't re-surfaced.
- entity_jobs.py: UI-triggered background index jobs (rebuild/update/find-duplicates)
as subprocesses with a one-at-a-time lock; status in /api/system/status.
- server.py: /api/index/{rebuild,update}, /api/entities/find-duplicates,
/api/entities/merge-candidates [+ /{id} decide] — admin-gated.
- docs/thesis-seed-v2.md: concrete, plain-English rewrite per Grant's feedback.
Backend verified end-to-end on synthetic data (candidate gen -> approve/reject).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
101 lines
4.5 KiB
Python
101 lines
4.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Phase-1 — fuzzy entity-resolution tier (local Qwen), REVIEW-QUEUE mode.

The deterministic tier (entity_resolution.py) flags hard name-variant candidates
(same firm + surname, different first name/email) without guessing. This tier asks
the local Qwen model (Spark Control — sovereign) for a SUGGESTION on each, and
writes a CANDIDATE row to entity_merge_candidates for a human to approve (merge)
or reject (keep separate) in the CRM web UI. It NO LONGER auto-merges — uncertainty
is surfaced, not applied (the human decides). Already-decided pairs and
already-merged entities are skipped, so re-running is safe and quiet.

    python3 backend/ingest/fuzzy_resolve.py --db data/crm_dev.db
"""
|
|
import argparse
|
|
import json
|
|
import sqlite3
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
|
|
import entity_resolution as er
|
|
import llm
|
|
|
|
_SYSTEM = ("You are an entity-resolution assistant for a CRM. Decide if the listed "
|
|
"people are the SAME individual recorded under name variants (e.g. nicknames "
|
|
"like Kate/Katherine, Bill/William), or DIFFERENT people who happen to share a "
|
|
"surname and firm. Be conservative: only say same when a nickname/abbreviation "
|
|
"relationship or matching contact info makes it clear.")
|
|
|
|
|
|
def _now():
|
|
return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
|
|
|
|
|
|
def _ask(members, firm):
    """Ask the model whether the people in one flagged group are the same person.

    Args:
        members: iterable of 3-item records; the second item is rendered as the
            person's name and the third (when truthy) as their email.
        firm: firm name shown in the prompt; a falsy value is shown as "unknown".

    Returns:
        Whatever ``llm.chat_json`` returns when it is truthy; otherwise a default
        verdict of ``{"same": False, "confidence": 0.0, "reason": ""}``.
    """
    labels = []
    for _, name, email in members:
        # Email is appended in angle brackets only when one is present.
        labels.append(f"{name} <{email}>" if email else f"{name}")
    people = "; ".join(labels)

    prompt = (
        f"Firm: {firm or 'unknown'}\nPeople: {people}\n\n"
        "Are these the SAME person under name variants? "
        'Answer only JSON: {"same": true|false, "confidence": 0.0-1.0, "reason": "..."}'
    )

    answer = llm.chat_json(prompt, system=_SYSTEM, max_tokens=160)
    if not answer:
        # No usable reply: fall back to the conservative "different" verdict.
        return {"same": False, "confidence": 0.0, "reason": ""}
    return answer
|
|
|
|
|
|
def _survivor(members):
|
|
return sorted(members, key=lambda m: (bool(m[2]), len(m[1])), reverse=True)[0]
|
|
|
|
|
|
def run(db, db_path=None):
    """Queue fuzzy-tier merge suggestions for human review; never merges anything.

    Runs the deterministic tier (``er.run``) to get the flagged groups, picks a
    survivor per group, and for each (survivor, other member) pair that is not
    already known writes one 'pending' row to ``entity_merge_candidates`` carrying
    the model's same/different suggestion, confidence and reason. A summary row is
    appended to ``interaction_log`` and everything is committed once at the end.

    Args:
        db: path of the SQLite database.
        db_path: optional alternative path; when truthy it takes precedence over
            ``db``.

    Returns:
        dict with ``candidates_created``, ``skipped_existing`` and
        ``flagged_groups`` counts.
    """
    db = db_path or db
    # Deterministic state (respects prior merges) + fresh candidates; only the
    # candidate groups are needed here.
    _counts, candidates = er.run(db)
    conn = sqlite3.connect(db)
    try:
        conn.row_factory = sqlite3.Row
        name_of = {r["id"]: r["display_name"]
                   for r in conn.execute("SELECT id, display_name FROM canonical_entities")}

        # Pairs are stored unordered so (a, b) and (b, a) count as the same pair.
        decided = {frozenset((r["entity_a"], r["entity_b"]))
                   for r in conn.execute("SELECT entity_a, entity_b FROM entity_merge_candidates")}
        merged = {r[0] for r in conn.execute("SELECT merged_id FROM entity_merges")}

        created = skipped = 0
        for cand in candidates:
            members = cand["members"]
            keep = _survivor(members)

            # Work out which pairs still need a suggestion before calling the model.
            pending = []
            for member in members:
                if member[0] == keep[0]:
                    continue
                pair = frozenset((keep[0], member[0]))
                if pair in decided or member[0] in merged or keep[0] in merged:
                    skipped += 1
                    continue
                decided.add(pair)
                pending.append(member)

            if not pending:
                # Nothing new in this group: skip the model call so re-runs stay quiet.
                continue

            org_name = name_of.get(cand["org"])
            verdict = _ask(members, org_name)  # one Qwen call per group
            context = f"{cand['surname']} @ {org_name or 'unknown'}"
            label = 'same' if verdict.get('same') else 'different'

            for loser in pending:
                conn.execute("""
                    INSERT INTO entity_merge_candidates
                    (id, entity_a, entity_b, name_a, name_b, email_a, email_b, context, verdict, confidence, reason, status, created_at)
                    VALUES (?,?,?,?,?,?,?,?,?,?,?, 'pending', ?)
                    ON CONFLICT(entity_a, entity_b) DO NOTHING
                    """, (str(uuid.uuid4()), keep[0], loser[0], keep[1], loser[1], keep[2], loser[2],
                          context, label, verdict.get('confidence'),
                          verdict.get('reason'), _now()))
                created += 1

        conn.execute("""INSERT INTO interaction_log
            (id, ts, actor_type, actor_id, action, target_type, payload, source, created_at)
            VALUES (?,?,?,?,?,?,?,?,?)""",
                     (str(uuid.uuid4()), _now(), "agent", "qwen_entity_resolver", "entity.candidates_generated",
                      "canonical_entities", json.dumps({"created": created, "skipped": skipped}), "ingest", _now()))
        conn.commit()
    finally:
        # Always release the connection; if an error happened before commit(),
        # the uncommitted rows are discarded with it.
        conn.close()
    return {"candidates_created": created, "skipped_existing": skipped, "flagged_groups": len(candidates)}
|
|
|
|
|
|
def main():
    """Command-line entry point: run the fuzzy review pass and print a summary.

    Accepts ``--db`` (default ``data/crm_dev.db``) and passes it to ``run``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--db", default="data/crm_dev.db")
    options = parser.parse_args()

    summary = run(options.db)
    created = summary['candidates_created']
    already = summary['skipped_existing']
    groups = summary['flagged_groups']
    print(f"Fuzzy review: {created} new candidate(s) for review, "
          f"{already} already decided ({groups} flagged groups).")


if __name__ == "__main__":
    main()
|