Matrix intake: fuzzy investor matching + conversational in-thread edits (v0.1.0:86)

Close the two locked post-deploy enhancements for the Matrix intake bot.

Fuzzy matching (server-side, ships in the s9pk): new find_intake_candidates in
server.py returns ranked deterministic near-matches (difflib name similarity +
token-set Jaccard, legal-suffix-aware, + email Levenshtein <= 2); GET
/api/intake/match now returns {match, candidates}. The bot surfaces a numbered
shortlist so a near-duplicate (Charlie/Charles, Acme Capital vs Acme Capital LLC,
a one-char email typo) is confirmed by a human instead of silently creating a
second investor. Exact match still auto-attaches; fuzzy candidates are never
auto-attached. The optional LLM-judge re-rank is deferred.

Conversational edits (bot-side, ships on the Spark): any in-thread reply that
isn't yes/no/edit field=value is treated as a natural-language revision and
re-run through local Qwen (parse.revise). Email integrity is preserved -- a
changed address must literally appear in the instruction; the model's email
field is structurally unreachable. No-op revisions re-prompt.

Docs/current-state brought current; 27/27 backend tests green.
This commit is contained in:
Keysat
2026-06-17 18:50:58 -05:00
parent fa6c9da0e6
commit 0b893295e1
15 changed files with 734 additions and 41 deletions
+127 -2
View File
@@ -15,6 +15,7 @@ import uuid
import csv
import io
import re
import difflib
import base64
import threading
from datetime import datetime, timedelta
@@ -1254,6 +1255,124 @@ def find_intake_match(conn, q, email=None):
return email_hit
def _email_edit_distance(a, b):
"""Levenshtein distance between two short strings (emails). Stdlib-only DP; used to flag
near-miss emails (a one- or two-character typo) for the intake fuzzy matcher."""
a = (a or '').strip().lower()
b = (b or '').strip().lower()
if a == b:
return 0
if not a or not b:
return max(len(a), len(b))
prev = list(range(len(b) + 1))
for i, ca in enumerate(a, 1):
cur = [i]
for j, cb in enumerate(b, 1):
cost = 0 if ca == cb else 1
cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + cost))
prev = cur
return prev[-1]
# Legal-entity suffixes stripped before name comparison so 'Acme Capital' ~ 'Acme Capital LLC'.
# Deliberately tight: only true entity types, NOT name-type words (Capital/Ventures/Partners),
# which are distinctive enough to keep. Intentionally EXCLUDES 'sa'/'ag' (Société Anonyme /
# Aktiengesellschaft) — niche for this portfolio and ambiguous enough as coincidental trailing
# tokens that stripping them inflates false 1.0 matches across distinct firms.
_LEGAL_SUFFIX = {"llc", "lp", "llp", "inc", "incorporated", "ltd", "limited", "co", "corp",
"corporation", "company", "plc", "gmbh", "pte"}
def _strip_legal_suffix(normalized_name):
    """Remove trailing legal-entity tokens from an already-normalized name.

    The name is split into lower-case alphanumeric tokens; tokens are then
    dropped from the right for as long as the last one is listed in
    ``_LEGAL_SUFFIX``. Suffix words that appear in the middle of the name are
    left alone.

    Args:
        normalized_name: A name that the caller has already normalized.

    Returns:
        The remaining tokens joined by single spaces. This is the empty string
        when every token was a legal suffix (or there were no tokens at all).
    """
    tokens = re.findall(r"[a-z0-9]+", normalized_name)
    keep = len(tokens)
    # Walk backwards over the tail while it still consists of suffix tokens.
    while keep > 0 and tokens[keep - 1] in _LEGAL_SUFFIX:
        keep -= 1
    return " ".join(tokens[:keep])
def _name_similarity(a, b):
    """Score how alike two investor names are, on a 0..1 scale.

    Both names are passed through ``_normalize_text`` and then have trailing
    legal-entity suffixes removed, so two names that differ only by
    'LLC'/'LP'/'Inc' score 1.0. That is a near-certain duplicate worth
    surfacing, and the exact matcher compares the full string so will not have
    caught it.

    Otherwise the score is the larger of:
      * difflib's sequence ratio, which catches near-spellings such as
        'Charlie' / 'Charles'; and
      * the Jaccard overlap of the two token sets, which catches word-order
        differences.

    The measure favors recall: a shared common name-word ('... Capital') can
    lift unrelated firms into the 0.6-0.8 band. That is acceptable noise in a
    ranked, human-confirmed shortlist; semantic pruning is left to the deferred
    LLM-judge re-rank.

    Args:
        a: First investor name.
        b: Second investor name.

    Returns:
        A float in [0.0, 1.0]; 0.0 when either name is empty after
        normalization.
    """
    left = _normalize_text(a)
    right = _normalize_text(b)
    if not (left and right):
        return 0.0
    if left == right:
        return 1.0

    # Fall back to the full name when stripping would leave nothing behind.
    core_left = _strip_legal_suffix(left) or left
    core_right = _strip_legal_suffix(right) or right
    if core_left == core_right:
        return 1.0

    sequence_ratio = difflib.SequenceMatcher(None, core_left, core_right).ratio()

    tokens_left = set(re.findall(r"[a-z0-9]+", core_left))
    tokens_right = set(re.findall(r"[a-z0-9]+", core_right))
    combined = tokens_left | tokens_right
    if combined:
        jaccard = len(tokens_left & tokens_right) / len(combined)
    else:
        jaccard = 0.0
    return max(sequence_ratio, jaccard)
def find_intake_candidates(conn, q, email=None, limit=5, min_score=0.62, max_email_distance=2):
    """Return ranked fuzzy near-matches for the intake bot's disambiguation prompt.

    Complements find_intake_match (which is exact-after-normalization): when the exact
    matcher misses, this returns the closest existing grid investors so the bot can surface
    them in-thread and the human can attach to one, instead of unknowingly creating a
    duplicate. Deterministic (stdlib difflib + token overlap + email edit distance), no LLM.
    Reads the grid blob stored in fundraising_state row 'main', so candidate ids are grid
    row ids.

    Rows whose normalized name equals the normalized query are skipped (those belong to
    find_intake_match), and an identical contact email (edit distance 0) contributes no
    email score. Names that differ only by a legal suffix still score 1.0 and ARE returned.

    Args:
        conn: Database connection whose rows support access by column name.
        q: Investor name to look for; may be empty when only an email is known.
        email: Optional contact email to compare against each row's contact emails.
        limit: Maximum number of candidates to return.
        min_score: Candidates scoring below this are dropped.
        max_email_distance: Largest email edit distance still treated as a near-miss.

    Returns:
        A list of dicts ``{"id", "investor_name", "score", "matched_on"}`` sorted by score
        (highest first), at most ``limit`` long. Returns ``[]`` when the state row is
        missing or empty, the blob is not valid JSON, or it holds no list of rows.
    """
    row = conn.execute("SELECT grid_json FROM fundraising_state WHERE id = 'main'").fetchone()
    if not row or not row['grid_json']:
        return []
    try:
        grid = json.loads(row['grid_json'])
    except Exception:
        # Best-effort read: an unreadable blob simply yields no suggestions.
        return []
    rows = grid.get('rows') if isinstance(grid, dict) else None
    # A present-but-malformed 'rows' value (null, a number, ...) must not crash the lookup.
    if not isinstance(rows, list):
        return []

    wanted_name = _normalize_text(q) if q else ''
    wanted_email = (email or '').strip().lower()
    scored = {}
    for r in rows:
        if not isinstance(r, dict):
            continue
        rid = str(r.get('id') or '').strip()
        if not rid:
            continue
        name = str(r.get('investor_name') or '').strip()
        # An exact name match belongs to find_intake_match: never echo it back as a candidate.
        if wanted_name and _normalize_text(name) == wanted_name:
            continue
        name_score = _name_similarity(wanted_name, name) if (wanted_name and name) else 0.0

        email_score = 0.0
        contacts = r.get('contacts') if wanted_email else None
        if isinstance(contacts, list):
            for c in contacts:
                if not isinstance(c, dict):
                    continue
                ce = str(c.get('email') or '').strip().lower()
                if not ce:
                    continue
                dist = _email_edit_distance(wanted_email, ce)
                # dist 0 is an exact email (find_intake_match's); 1 -> 0.9, 2 -> 0.8 are near-misses
                if 0 < dist <= max_email_distance:
                    email_score = max(email_score, 1.0 - 0.1 * dist)

        score = max(name_score, email_score)
        if score < min_score:  # too weak to be a useful suggestion
            continue
        matched_on = 'email' if email_score >= name_score else 'name'
        # If the grid contains the same row id more than once, keep the highest-scoring entry.
        if rid not in scored or score > scored[rid]['score']:
            scored[rid] = {"id": rid, "investor_name": name,
                           "score": round(score, 3), "matched_on": matched_on}

    out = sorted(scored.values(), key=lambda x: x['score'], reverse=True)
    return out[:limit]
def ensure_fundraising_state_row(conn):
existing = conn.execute("SELECT * FROM fundraising_state WHERE id = 'main'").fetchone()
if not existing:
@@ -2950,7 +3069,12 @@ class CRMHandler(BaseHTTPRequestHandler):
def handle_intake_match(self, user, params):
"""Read-only: does an investor matching this intake already exist? Used by the
Matrix intake bot to label its in-thread proposal new-vs-existing. Returns the
grid row id so an approved note lands on exactly that investor."""
grid row id so an approved note lands on exactly that investor.
`match` is the confident exact match (auto-attached by the bot). When there is no
exact match, `candidates` carries ranked fuzzy near-matches so the bot can surface
a disambiguation shortlist in-thread (the human picks one or creates new) — closing
the duplicate-investor hole the exact-only matcher leaves open."""
q = str(params.get('q') or '').strip()
email = str(params.get('email') or '').strip()
if not q and not email:
@@ -2958,9 +3082,10 @@ class CRMHandler(BaseHTTPRequestHandler):
conn = get_db()
try:
match = find_intake_match(conn, q, email)
candidates = find_intake_candidates(conn, q, email) if match is None else []
finally:
conn.close()
return self.send_json({"data": {"match": match}})
return self.send_json({"data": {"match": match, "candidates": candidates}})
def handle_update_communication(self, user, comm_id, body):
conn = get_db()