#!/usr/bin/env python3
"""Phase-1 — fuzzy entity-resolution tier (local Qwen), REVIEW-QUEUE mode.

The deterministic tier (entity_resolution.py) flags hard name-variant
candidates (same firm + surname, different first name/email) without guessing.
This tier asks the local Qwen model (Spark Control — sovereign) for a
SUGGESTION on each, and writes a CANDIDATE row to entity_merge_candidates for a
human to approve (merge) or reject (keep separate) in the CRM web UI. It NO
LONGER auto-merges — uncertainty is surfaced, not applied (the human decides).

Already-decided pairs and already-merged entities are skipped, so re-running is
safe and quiet: a group with nothing left to queue does not trigger a model
call.

    python3 backend/ingest/fuzzy_resolve.py --db data/crm_dev.db
"""
import argparse
import json
import sqlite3
import uuid
from contextlib import closing
from datetime import datetime, timezone

import entity_resolution as er
import llm

_SYSTEM = ("You are an entity-resolution assistant for a CRM. Decide if the listed "
           "people are the SAME individual recorded under name variants (e.g. nicknames "
           "like Kate/Katherine, Bill/William), or DIFFERENT people who happen to share a "
           "surname and firm. Be conservative: only say same when a nickname/abbreviation "
           "relationship or matching contact info makes it clear.")

# ON CONFLICT leaves an existing (entity_a, entity_b) row untouched instead of
# raising, so a pair that is already queued is never overwritten.
_INSERT_CANDIDATE_SQL = """
    INSERT INTO entity_merge_candidates
        (id, entity_a, entity_b, name_a, name_b, email_a, email_b, context,
         verdict, confidence, reason, status, created_at)
    VALUES (?,?,?,?,?,?,?,?,?,?,?, 'pending', ?)
    ON CONFLICT(entity_a, entity_b) DO NOTHING
"""

_INSERT_LOG_SQL = """
    INSERT INTO interaction_log
        (id, ts, actor_type, actor_id, action, target_type, payload, source,
         created_at)
    VALUES (?,?,?,?,?,?,?,?,?)
"""


def _now():
    """Return the current UTC time as an ISO-8601 string with a ``Z`` suffix."""
    # Strip tzinfo first so isoformat() does not emit "+00:00" before the "Z".
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"


def _ask(members, firm):
    """Ask the model whether ``members`` are one person under name variants.

    Args:
        members: iterable of ``(entity_id, name, email)`` tuples; ``email`` may
            be empty/None, in which case it is left out of the prompt.
        firm: display name of the shared firm, or a falsy value if unknown.

    Returns:
        The model's reply as a dict. When the reply is empty, falsy, or not a
        dict, a conservative fallback is returned instead:
        ``{"same": False, "confidence": 0.0, "reason": ""}``.
    """
    people = "; ".join(
        f"{name} <{email}>" if email else f"{name}"
        for _, name, email in members
    )
    prompt = (f"Firm: {firm or 'unknown'}\nPeople: {people}\n\n"
              "Are these the SAME person under name variants? "
              'Answer only JSON: {"same": true|false, "confidence": 0.0-1.0, "reason": "..."}')
    answer = llm.chat_json(prompt, system=_SYSTEM, max_tokens=160)
    # The caller uses .get() on the result, so anything that is not a non-empty
    # dict (None, a list, a bare string, ...) is treated as "no verdict".
    if isinstance(answer, dict) and answer:
        return answer
    return {"same": False, "confidence": 0.0, "reason": ""}


def _survivor(members):
    """Pick the member that would be kept if the group were merged.

    Members with an email beat members without one; among those, the longest
    name wins. On a tie the earliest member in ``members`` is chosen.
    """
    return max(members, key=lambda m: (bool(m[2]), len(m[1] or "")))


def run(db, db_path=None):
    """Queue LLM-suggested merge candidates for human review.

    Runs the deterministic tier via ``er.run`` to obtain the flagged groups,
    then for every group pairs the survivor with each other member. Pairs that
    already have a row in ``entity_merge_candidates``, or that involve an id
    listed in ``entity_merges.merged_id``, are counted as skipped. The model is
    consulted once per group, and only when the group still has at least one
    pair to queue. One summary row is appended to ``interaction_log``.

    Args:
        db: path to the SQLite database.
        db_path: optional alternative spelling of ``db``; takes precedence when
            truthy.

    Returns:
        dict with ``candidates_created``, ``skipped_existing`` and
        ``flagged_groups``.
    """
    db = db_path or db
    # Deterministic state (respects prior merges) + fresh candidates.
    _counts, candidates = er.run(db)

    created = skipped = 0
    # closing() guarantees the connection is released even if the model call or
    # a query raises; nothing is committed in that case.
    with closing(sqlite3.connect(db)) as conn:
        conn.row_factory = sqlite3.Row
        name_of = {
            row["id"]: row["display_name"]
            for row in conn.execute("SELECT id, display_name FROM canonical_entities")
        }
        decided = {
            frozenset((row["entity_a"], row["entity_b"]))
            for row in conn.execute("SELECT entity_a, entity_b FROM entity_merge_candidates")
        }
        merged = {row[0] for row in conn.execute("SELECT merged_id FROM entity_merges")}

        for cand in candidates:
            members = cand["members"]
            if not members:
                continue
            keep = _survivor(members)

            # Work out which pairs still need a row BEFORE calling the model,
            # so fully-handled groups cost nothing on a re-run.
            pending = []
            for loser in members:
                if loser[0] == keep[0]:
                    continue
                pair = frozenset((keep[0], loser[0]))
                if pair in decided or loser[0] in merged or keep[0] in merged:
                    skipped += 1
                    continue
                # Record immediately so a repeated id within this group is
                # skipped rather than queued twice.
                decided.add(pair)
                pending.append(loser)
            if not pending:
                continue

            firm = name_of.get(cand["org"])
            verdict = _ask(members, firm)  # one Qwen call per group
            label = 'same' if verdict.get('same') else 'different'
            context = f"{cand['surname']} @ {firm or 'unknown'}"
            for loser in pending:
                conn.execute(_INSERT_CANDIDATE_SQL, (
                    str(uuid.uuid4()), keep[0], loser[0], keep[1], loser[1],
                    keep[2], loser[2], context, label,
                    verdict.get('confidence'), verdict.get('reason'), _now(),
                ))
                created += 1

        # A single timestamp keeps ts and created_at identical for this event.
        logged_at = _now()
        conn.execute(_INSERT_LOG_SQL, (
            str(uuid.uuid4()), logged_at, "agent", "qwen_entity_resolver",
            "entity.candidates_generated", "canonical_entities",
            json.dumps({"created": created, "skipped": skipped}),
            "ingest", logged_at,
        ))
        conn.commit()

    return {"candidates_created": created, "skipped_existing": skipped,
            "flagged_groups": len(candidates)}


def main():
    """CLI entry point: run the fuzzy tier against ``--db`` and print a summary."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default="data/crm_dev.db")
    args = ap.parse_args()
    s = run(args.db)
    print(f"Fuzzy review: {s['candidates_created']} new candidate(s) for review, "
          f"{s['skipped_existing']} already decided ({s['flagged_groups']} flagged groups).")


if __name__ == "__main__":
    main()