Matrix intake: fuzzy investor matching + conversational in-thread edits (v0.1.0:86)

Close the two locked post-deploy enhancements for the Matrix intake bot. Fuzzy matching (server-side, ships in the s9pk): new find_intake_candidates in server.py returns ranked deterministic near-matches (difflib name similarity + token-set Jaccard, legal-suffix-aware, + email Levenshtein <= 2); GET /api/intake/match now returns {match, candidates}. The bot surfaces a numbered shortlist so a near-duplicate (Charlie/Charles, Acme Capital vs Acme Capital LLC, a one-char email typo) is confirmed by a human instead of silently creating a second investor. Exact match still auto-attaches; fuzzy candidates are never auto-attached. The optional LLM-judge re-rank is deferred. Conversational edits (bot-side, ships on the Spark): any in-thread reply that isn't yes/no/edit field=value is treated as a natural-language revision and re-run through local Qwen (parse.revise). Email integrity is preserved -- a changed address must literally appear in the instruction; the model's email field is structurally unreachable. No-op revisions re-prompt. Docs/current-state brought current; 27/27 backend tests green.
2026-06-17 18:50:58 -05:00
parent fa6c9da0e6
commit 0b893295e1
15 changed files with 734 additions and 41 deletions
@@ -15,6 +15,7 @@ import uuid
 import csv
 import io
 import re
+import difflib
 import base64
 import threading
 from datetime import datetime, timedelta
@@ -1254,6 +1255,124 @@ def find_intake_match(conn, q, email=None):
    return email_hit


+def _email_edit_distance(a, b):
+    """Levenshtein distance between two short strings (emails). Stdlib-only DP; used to flag
+    near-miss emails (a one- or two-character typo) for the intake fuzzy matcher."""
+    a = (a or '').strip().lower()  # None/empty → ''; comparison ignores case and padding
+    b = (b or '').strip().lower()
+    if a == b:
+        return 0
+    if not a or not b:
+        return max(len(a), len(b))  # one side empty: distance is the other side's length
+    prev = list(range(len(b) + 1))  # row for the empty prefix of a
+    for i, ca in enumerate(a, 1):
+        cur = [i]  # column for the empty prefix of b
+        for j, cb in enumerate(b, 1):
+            cost = 0 if ca == cb else 1
+            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + cost))  # delete / insert / substitute
+        prev = cur  # only the previous and current rows are ever kept
+    return prev[-1]
+
+
+# Legal-entity suffixes stripped before name comparison so 'Acme Capital' ~ 'Acme Capital LLC'.
+# Deliberately tight: only true entity types, NOT name-type words (Capital/Ventures/Partners),
+# which are distinctive enough to keep. Intentionally EXCLUDES 'sa'/'ag' (Société Anonyme /
+# Aktiengesellschaft) — niche for this portfolio and ambiguous enough as coincidental trailing
+# tokens that stripping them inflates false 1.0 matches across distinct firms.
+_LEGAL_SUFFIX = {"llc", "lp", "llp", "inc", "incorporated", "ltd", "limited", "co", "corp",
+                 "corporation", "company", "plc", "gmbh", "pte"}  # lowercase: compared to [a-z0-9]+ tokens
+
+
+def _strip_legal_suffix(normalized_name):
+    """Drop trailing legal-entity suffix tokens (llc/lp/inc/…) from an already-normalized name."""
+    toks = re.findall(r"[a-z0-9]+", normalized_name)  # anything outside a-z0-9 acts as a separator
+    while toks and toks[-1] in _LEGAL_SUFFIX:  # loop so stacked suffixes ('co ltd') are all removed
+        toks.pop()
+    return " ".join(toks)  # '' when every token was a suffix
+
+
+def _name_similarity(a, b):
+    """0..1 fuzzy similarity between two investor names: the max of difflib's sequence ratio
+    (catches near-spellings — 'Charlie'/'Charles') and token-set Jaccard overlap (catches
+    word-order differences). Legal-entity suffixes are stripped first, so two names differing
+    only by 'LLC'/'LP'/'Inc' score 1.0 (a near-certain duplicate to surface — find_intake_match
+    won't have caught it, since it compares the full string). Favors recall: a shared common
+    name-word ('… Capital') can lift unrelated firms into the 0.6–0.8 band — acceptable noise in
+    a ranked, human-confirmed shortlist; semantic pruning is the deferred LLM-judge's job."""
+    a = _normalize_text(a)
+    b = _normalize_text(b)
+    if not a or not b:
+        return 0.0
+    if a == b:
+        return 1.0
+    sa = _strip_legal_suffix(a) or a  # a name made only of suffix tokens falls back to itself
+    sb = _strip_legal_suffix(b) or b
+    if sa == sb:
+        return 1.0
+    ratio = difflib.SequenceMatcher(None, sa, sb).ratio()
+    ta = set(re.findall(r"[a-z0-9]+", sa))
+    tb = set(re.findall(r"[a-z0-9]+", sb))
+    jaccard = len(ta & tb) / len(ta | tb) if (ta or tb) else 0.0  # no tokens on either side → 0.0, not 0/0
+    return max(ratio, jaccard)  # either signal alone is enough to rank the pair
+
+
+def find_intake_candidates(conn, q, email=None, limit=5, min_score=0.62, max_email_distance=2):
+    """Ranked fuzzy near-matches for the intake bot's disambiguation prompt.
+
+    Complements find_intake_match (which is exact-after-normalization): when the exact matcher
+    misses, this returns the closest existing grid investors so the bot can surface them
+    in-thread and the human can attach to one — instead of unknowingly creating a duplicate.
+    Deterministic (stdlib difflib + token overlap + email edit distance), no LLM. Scans the same
+    canonical grid blob as find_intake_match, so candidate ids are grid row ids the write targets.
+    Skips names equal to q after normalization (find_intake_match's job); returns best score first."""
+    row = conn.execute("SELECT grid_json FROM fundraising_state WHERE id = 'main'").fetchone()
+    if not row or not row['grid_json']:
+        return []
+    try:
+        grid = json.loads(row['grid_json'])
+    except Exception:  # unparseable grid_json → no candidates
+        return []
+    rows = grid.get('rows', []) if isinstance(grid, dict) else []
+    wanted_name = _normalize_text(q) if q else ''
+    wanted_email = (email or '').strip().lower()
+    scored = {}
+    for r in rows:
+        if not isinstance(r, dict):
+            continue
+        rid = str(r.get('id') or '').strip()
+        if not rid:
+            continue
+        name = str(r.get('investor_name') or '').strip()
+        # Normalized-equal name = find_intake_match's hit; suffix-only variants (1.0) still surface.
+        if wanted_name and _normalize_text(name) == wanted_name:
+            continue
+        name_score = _name_similarity(wanted_name, name) if (wanted_name and name) else 0.0
+        email_score = 0.0
+        if wanted_email:
+            contacts = r.get('contacts')
+            if isinstance(contacts, list):
+                for c in contacts:
+                    if not isinstance(c, dict):
+                        continue
+                    ce = str(c.get('email') or '').strip().lower()
+                    if not ce:
+                        continue
+                    dist = _email_edit_distance(wanted_email, ce)
+                    # dist 0 is an exact email (find_intake_match's); 1→0.9, 2→0.8 are near-misses
+                    if 0 < dist <= max_email_distance:
+                        email_score = max(email_score, 1.0 - 0.1 * dist)
+        score = max(name_score, email_score)
+        if score < min_score:  # too weak to be a useful suggestion
+            continue
+        matched_on = 'email' if email_score >= name_score else 'name'  # ties report 'email'
+        # score already folds name+email; this guard only dedupes a repeated row id (best wins)
+        if rid not in scored or score > scored[rid]['score']:
+            scored[rid] = {"id": rid, "investor_name": name,
+                           "score": round(score, 3), "matched_on": matched_on}
+    out = sorted(scored.values(), key=lambda x: x['score'], reverse=True)  # stable: ties keep first-seen order
+    return out[:limit]
+
+
 def ensure_fundraising_state_row(conn):
    existing = conn.execute("SELECT * FROM fundraising_state WHERE id = 'main'").fetchone()
    if not existing:
@@ -2950,7 +3069,12 @@ class CRMHandler(BaseHTTPRequestHandler):
    def handle_intake_match(self, user, params):
        """Read-only: does an investor matching this intake already exist? Used by the
        Matrix intake bot to label its in-thread proposal new-vs-existing. Returns the
-        grid row id so an approved note lands on exactly that investor."""
+        grid row id so an approved note lands on exactly that investor.
+
+        `match` is the confident exact match (auto-attached by the bot). When there is no
+        exact match, `candidates` carries ranked fuzzy near-matches so the bot can surface
+        a disambiguation shortlist in-thread (the human picks one or creates new) — closing
+        the duplicate-investor hole the exact-only matcher leaves open."""
        q = str(params.get('q') or '').strip()
        email = str(params.get('email') or '').strip()
        if not q and not email:
@@ -2958,9 +3082,10 @@ class CRMHandler(BaseHTTPRequestHandler):
        conn = get_db()
        try:
            match = find_intake_match(conn, q, email)
+            candidates = find_intake_candidates(conn, q, email) if match is None else []
        finally:
            conn.close()
-        return self.send_json({"data": {"match": match}})
+        return self.send_json({"data": {"match": match, "candidates": candidates}})

    def handle_update_communication(self, user, comm_id, body):
        conn = get_db()