Matrix intake: fuzzy investor matching + conversational in-thread edits (v0.1.0:86)
Close the two locked post-deploy enhancements for the Matrix intake bot.
Fuzzy matching (server-side, ships in the s9pk): new find_intake_candidates in
server.py returns ranked deterministic near-matches (difflib name similarity +
token-set Jaccard, legal-suffix-aware, + email Levenshtein <= 2); GET
/api/intake/match now returns {match, candidates}. The bot surfaces a numbered
shortlist so a near-duplicate (Charlie/Charles, Acme Capital vs Acme Capital LLC,
a one-char email typo) is confirmed by a human instead of silently creating a
second investor. Exact match still auto-attaches; fuzzy candidates are never
auto-attached. The optional LLM-judge re-rank is deferred.
Conversational edits (bot-side, ships on the Spark): any in-thread reply that
isn't yes/no/edit field=value is treated as a natural-language revision and
re-run through local Qwen (parse.revise). Email integrity is preserved -- a
changed address must literally appear in the instruction; the model's email
field is structurally unreachable. No-op revisions re-prompt.
Docs/current-state brought current; 27/27 backend tests green.
This commit is contained in:
+127
-2
@@ -15,6 +15,7 @@ import uuid
|
||||
import csv
|
||||
import io
|
||||
import re
|
||||
import difflib
|
||||
import base64
|
||||
import threading
|
||||
from datetime import datetime, timedelta
|
||||
@@ -1254,6 +1255,124 @@ def find_intake_match(conn, q, email=None):
|
||||
return email_hit
|
||||
|
||||
|
||||
def _email_edit_distance(a, b):
    """Return the Levenshtein distance between two short strings (emails).

    Both inputs are stripped and lower-cased first; a falsy input is treated as
    the empty string. Pure-stdlib row-by-row dynamic programme, used by the
    intake fuzzy matcher to spot addresses that are a one- or two-character
    typo away from a known contact.
    """
    left = (a or '').strip().lower()
    right = (b or '').strip().lower()
    if left == right:
        return 0
    if not (left and right):
        # One side is empty: every character of the other is an insertion.
        return max(len(left), len(right))

    width = len(right)
    # above[j] = distance between the processed prefix of `left` and right[:j].
    above = list(range(width + 1))
    for row_no, ch_left in enumerate(left, start=1):
        row = [row_no] + [0] * width
        for col in range(1, width + 1):
            substitute = above[col - 1] + (0 if ch_left == right[col - 1] else 1)
            delete = above[col] + 1
            insert = row[col - 1] + 1
            row[col] = min(delete, insert, substitute)
        above = row
    return above[-1]
|
||||
|
||||
|
||||
# Trailing legal-entity markers removed before names are compared, so that
# 'Acme Capital' and 'Acme Capital LLC' are treated as the same name.
# The list is kept deliberately narrow: genuine entity types only. Descriptive
# name words (Capital/Ventures/Partners) are distinctive and are NOT stripped.
# 'sa'/'ag' (Société Anonyme / Aktiengesellschaft) are intentionally left out —
# they are niche for this portfolio and ambiguous enough as coincidental
# trailing tokens that stripping them inflates false 1.0 matches across
# distinct firms.
_LEGAL_SUFFIX = {
    "co", "company", "corp", "corporation",
    "gmbh",
    "inc", "incorporated",
    "limited", "llc", "llp", "lp", "ltd",
    "plc", "pte",
}
|
||||
|
||||
|
||||
def _strip_legal_suffix(normalized_name):
    """Return an already-normalized name without its trailing legal-entity tokens.

    The name is split into lowercase alphanumeric tokens; tokens are removed
    from the end for as long as the last one is in ``_LEGAL_SUFFIX``
    (llc/lp/inc/…). The survivors are re-joined with single spaces, so the
    result is an empty string when every token was a suffix.
    """
    tokens = re.findall(r"[a-z0-9]+", normalized_name)
    keep = len(tokens)
    # Walk back from the end; suffix words in the middle of a name are kept.
    while keep > 0 and tokens[keep - 1] in _LEGAL_SUFFIX:
        keep -= 1
    return " ".join(tokens[:keep])
|
||||
|
||||
|
||||
def _name_similarity(a, b):
    """Score how alike two investor names are, from 0.0 to 1.0.

    Both names go through ``_normalize_text`` and then lose any trailing
    legal-entity suffix, so names that differ only by 'LLC'/'LP'/'Inc' score
    1.0 — a near-certain duplicate worth surfacing. Otherwise the score is the
    larger of:

    * difflib's sequence ratio (near-spellings such as 'Charlie'/'Charles'), and
    * token-set Jaccard overlap (the same words in a different order).

    The measure favors recall: a shared common word ('… Capital') can lift
    unrelated firms into roughly the 0.6–0.8 band. That is acceptable noise in
    a ranked, human-confirmed shortlist; semantic pruning is left to the
    deferred LLM judge.
    """
    left = _normalize_text(a)
    right = _normalize_text(b)
    if not (left and right):
        return 0.0
    if left == right:
        return 1.0

    # Fall back to the full name if stripping would leave nothing behind.
    core_left = _strip_legal_suffix(left) or left
    core_right = _strip_legal_suffix(right) or right
    if core_left == core_right:
        return 1.0

    tokens_left = set(re.findall(r"[a-z0-9]+", core_left))
    tokens_right = set(re.findall(r"[a-z0-9]+", core_right))
    union = tokens_left | tokens_right
    if union:
        overlap = len(tokens_left & tokens_right) / len(union)
    else:
        overlap = 0.0

    spelling = difflib.SequenceMatcher(None, core_left, core_right).ratio()
    return max(spelling, overlap)
|
||||
|
||||
|
||||
def _best_near_miss_email_score(contacts, wanted_email, max_email_distance):
    """Return the best near-miss email score among one grid row's contacts (0.0 if none).

    A contact whose email is 1..max_email_distance edits away from
    ``wanted_email`` scores ``1.0 - 0.1 * distance`` (1 -> 0.9, 2 -> 0.8).
    Distance 0 is an exact email and is deliberately ignored here: exact hits
    are the exact matcher's responsibility, not a fuzzy suggestion.
    """
    if not isinstance(contacts, list):
        return 0.0
    best = 0.0
    for contact in contacts:
        if not isinstance(contact, dict):
            continue
        contact_email = str(contact.get('email') or '').strip().lower()
        if not contact_email:
            continue
        dist = _email_edit_distance(wanted_email, contact_email)
        if 0 < dist <= max_email_distance:
            best = max(best, 1.0 - 0.1 * dist)
    return best


def find_intake_candidates(conn, q, email=None, limit=5, min_score=0.62, max_email_distance=2):
    """Ranked fuzzy near-matches for the intake bot's disambiguation prompt.

    Complements find_intake_match: when the exact matcher misses, this returns
    the closest existing grid investors so the bot can surface them in-thread
    and a human can attach to one instead of unknowingly creating a duplicate.
    Deterministic (difflib + token overlap + email edit distance), no LLM.
    Reads the grid blob stored in ``fundraising_state`` (row id 'main'), so the
    returned ids are grid row ids.

    Rows whose normalized name equals the normalized query are skipped (an
    exact name belongs to find_intake_match). Names that differ only by a
    legal-entity suffix are NOT skipped: they score 1.0 and are surfaced.

    Args:
        conn: DB connection whose ``execute(...).fetchone()`` yields a row
            indexable by column name.
        q: Investor name to match; falsy disables name scoring.
        email: Contact email to match; falsy disables email scoring.
        limit: Maximum number of candidates. ``None`` means no cap; a negative
            value is treated as 0.
        min_score: Candidates scoring below this are dropped.
        max_email_distance: Largest email edit distance still counted as a
            near-miss.

    Returns:
        A list of ``{"id", "investor_name", "score", "matched_on"}`` dicts,
        highest score first (ties keep grid order). Empty when the state row is
        missing, unparseable, or not shaped as ``{"rows": [...]}``.
    """
    row = conn.execute("SELECT grid_json FROM fundraising_state WHERE id = 'main'").fetchone()
    if not row or not row['grid_json']:
        return []
    try:
        grid = json.loads(row['grid_json'])
    except (TypeError, ValueError, RecursionError):
        # Malformed / wrongly typed / pathologically nested blob: best-effort, no candidates.
        return []
    rows = grid.get('rows') if isinstance(grid, dict) else None
    if not isinstance(rows, list):
        # Covers {"rows": null} and other non-list shapes that would otherwise
        # raise while iterating.
        return []

    wanted_name = _normalize_text(q) if q else ''
    wanted_email = (email or '').strip().lower()

    scored = {}
    for r in rows:
        if not isinstance(r, dict):
            continue
        rid = str(r.get('id') or '').strip()
        if not rid:
            continue
        name = str(r.get('investor_name') or '').strip()
        # An exact name match belongs to find_intake_match — never echo it back as a candidate.
        if wanted_name and _normalize_text(name) == wanted_name:
            continue
        name_score = _name_similarity(wanted_name, name) if (wanted_name and name) else 0.0
        email_score = 0.0
        if wanted_email:
            email_score = _best_near_miss_email_score(
                r.get('contacts'), wanted_email, max_email_distance)
        score = max(name_score, email_score)
        if score < min_score:  # too weak to be a useful suggestion
            continue
        matched_on = 'email' if email_score >= name_score else 'name'
        rounded = round(score, 3)
        # Guard against duplicate row ids in the grid: keep the highest-scoring entry.
        if rid not in scored or rounded > scored[rid]['score']:
            scored[rid] = {"id": rid, "investor_name": name,
                           "score": rounded, "matched_on": matched_on}

    out = sorted(scored.values(), key=lambda x: x['score'], reverse=True)
    if limit is None:
        return out
    # A negative limit would slice from the end and silently drop candidates; clamp to 0.
    return out[:max(0, limit)]
|
||||
|
||||
|
||||
def ensure_fundraising_state_row(conn):
|
||||
existing = conn.execute("SELECT * FROM fundraising_state WHERE id = 'main'").fetchone()
|
||||
if not existing:
|
||||
@@ -2950,7 +3069,12 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
def handle_intake_match(self, user, params):
|
||||
"""Read-only: does an investor matching this intake already exist? Used by the
|
||||
Matrix intake bot to label its in-thread proposal new-vs-existing. Returns the
|
||||
grid row id so an approved note lands on exactly that investor."""
|
||||
grid row id so an approved note lands on exactly that investor.
|
||||
|
||||
`match` is the confident exact match (auto-attached by the bot). When there is no
|
||||
exact match, `candidates` carries ranked fuzzy near-matches so the bot can surface
|
||||
a disambiguation shortlist in-thread (the human picks one or creates new) — closing
|
||||
the duplicate-investor hole the exact-only matcher leaves open."""
|
||||
q = str(params.get('q') or '').strip()
|
||||
email = str(params.get('email') or '').strip()
|
||||
if not q and not email:
|
||||
@@ -2958,9 +3082,10 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
conn = get_db()
|
||||
try:
|
||||
match = find_intake_match(conn, q, email)
|
||||
candidates = find_intake_candidates(conn, q, email) if match is None else []
|
||||
finally:
|
||||
conn.close()
|
||||
return self.send_json({"data": {"match": match}})
|
||||
return self.send_json({"data": {"match": match, "candidates": candidates}})
|
||||
|
||||
def handle_update_communication(self, user, comm_id, body):
|
||||
conn = get_db()
|
||||
|
||||
Reference in New Issue
Block a user