Matrix intake: fuzzy investor matching + conversational in-thread edits (v0.1.0:86)

Close the two locked post-deploy enhancements for the Matrix intake bot.

Fuzzy matching (server-side, ships in the s9pk): new find_intake_candidates in
server.py returns ranked deterministic near-matches (difflib name similarity +
token-set Jaccard, legal-suffix-aware, + email Levenshtein <= 2); GET
/api/intake/match now returns {match, candidates}. The bot surfaces a numbered
shortlist so a near-duplicate (Charlie/Charles, Acme Capital vs Acme Capital LLC,
a one-char email typo) is confirmed by a human instead of silently creating a
second investor. Exact match still auto-attaches; fuzzy candidates are never
auto-attached. The optional LLM-judge re-rank is deferred.

Conversational edits (bot-side, ships on the Spark): any in-thread reply that
isn't yes/no/edit field=value is treated as a natural-language revision and
re-run through local Qwen (parse.revise). Email integrity is preserved -- a
changed address must literally appear in the instruction; the model's email
field is structurally unreachable. No-op revisions re-prompt.

Docs/current-state brought current; 27/27 backend tests green.
This commit is contained in:
Keysat
2026-06-17 18:50:58 -05:00
parent fa6c9da0e6
commit 0b893295e1
15 changed files with 734 additions and 41 deletions
+71 -13
View File
@@ -46,30 +46,49 @@ async def main():
try:
proposal = await asyncio.to_thread(parse.parse_message, text)
except Exception as exc: # Spark/Qwen unreachable or bad response
await say(room_id, f"⚠️ couldn't reach the local parser: {exc}", root)
await say(room_id, f"⚠️ couldn't reach the local parser: {str(exc)[:200]}", root)
return
if proposal["intent"] == "unclear":
await say(room_id, UNCLEAR_HELP, root)
return
# Confirm new-vs-existing against the CRM matcher (read-only). Degrade gracefully if
# the CRM is unreachable — still propose, just without the "looks like existing" hint.
hint = ""
# Resolve new-vs-existing against the CRM matcher (read-only). Degrade gracefully if the
# CRM is unreachable — still propose as new, just without match/candidate hints.
match, candidates = None, []
try:
match = await asyncio.to_thread(crm_client.match, proposal)
if match:
proposal["intent"] = "meeting_note"
proposal["_match_id"] = match["id"]
hint = f"\n\n🔎 Looks like an existing investor: **{match['name']}** — this will append a note to them."
res = await asyncio.to_thread(crm_client.match, proposal)
match = res.get("match")
candidates = res.get("candidates") or []
except Exception:
pass
if match:
# Confident exact match → auto-attach the note to that investor (no disambiguation).
proposal["intent"] = "meeting_note"
proposal["_match_id"] = match["id"]
proposal["_stage"] = "approval"
store.put(root, proposal)
hint = (f"\n\n🔎 Looks like an existing investor: **{match['name']}** — "
"this will append a note to them.")
await say(room_id, proposals.render(proposal) + hint, root)
await nudge(room_id, proposals.summary_line(proposal), root)
return
if candidates:
# No exact match but near-misses exist → make the human pick one or confirm "new",
# so a typo'd/near-duplicate name can't silently create a second investor.
proposal["_stage"] = "disambiguate"
proposal["_candidates"] = candidates
store.put(root, proposal)
await say(room_id, proposals.render_disambiguation(proposal), root)
await nudge(room_id, proposals.disambiguation_nudge(proposal), root)
return
# Genuinely new — straight to the new-investor approval card.
proposal["_stage"] = "approval"
store.put(root, proposal)
await say(room_id, proposals.render(proposal) + hint, root)
await say(room_id, proposals.render(proposal), root)
# Also drop a brief, un-threaded reply in the main timeline so the proposal isn't
# easy to miss inside a thread (the full card + yes/edit/no stay in the thread).
await nudge(room_id, proposals.summary_line(proposal), root)
async def handle_reply(room_id, root, text):
action, payload = proposals.interpret_reply(text)
# Claim the proposal synchronously — BEFORE any await — so a second reply that
# arrives while a commit is in flight can't double-process it. asyncio is
# cooperative: nothing else runs between here and the first await below, so the
@@ -77,6 +96,11 @@ async def main():
proposal = store.pop(root)
if proposal is None:
return
if proposal.get("_stage") == "disambiguate":
await handle_disambiguation(room_id, root, text, proposal)
return
action, payload = proposals.interpret_reply(text)
if action == "approve":
try:
summary = await asyncio.to_thread(crm_client.commit, proposal)
@@ -92,9 +116,43 @@ async def main():
proposal = proposals.apply_edit(proposal, field, value)
store.put(root, proposal) # keep it pending (edited) for the next reply
await say(room_id, "✏️ Updated:\n\n" + proposals.render(proposal), root)
else: # unrecognized reply — leave the proposal pending
else:
# Not yes/no/edit-grammar → treat it as a natural-language revision instruction and
# re-run it through local Qwen (no Claude, no scrub). The human still approves the
# revised card, so the draft→approve gate holds.
try:
revised = await asyncio.to_thread(parse.revise, proposal, text)
except Exception as exc:
store.put(root, proposal)
await say(room_id, f"⚠️ couldn't apply that change ({str(exc)[:200]}).\n\nReply **yes** "
"to commit, **no** to discard, **edit field=value**, or rephrase.", root)
return
if proposals.same_fields(proposal, revised):
store.put(root, proposal)
await say(room_id, "I didn't catch a change there. Reply **yes** to commit, **no** "
"to discard, **edit field=value**, or tell me what to change.", root)
return
store.put(root, revised)
await say(room_id, "✏️ Updated:\n\n" + proposals.render(revised), root)
async def handle_disambiguation(room_id, root, text, proposal):
cands = proposal.get("_candidates") or []
action, payload = proposals.interpret_disambiguation(text, len(cands))
if action == "pick":
updated = proposals.attach_to_candidate(proposal, cands[payload])
store.put(root, updated)
await say(room_id, "✏️ Will log against the existing investor:\n\n"
+ proposals.render(updated), root)
elif action == "new":
updated = proposals.promote_to_new(proposal)
store.put(root, updated)
await say(room_id, " OK — adding as a new investor:\n\n"
+ proposals.render(updated), root)
elif action == "reject":
await say(room_id, "🗑️ Discarded — nothing written.", root)
else: # unrecognized — re-show the shortlist
store.put(root, proposal)
await say(room_id, "Reply **yes** to commit, **edit field=value**, or **no**.", root)
await say(room_id, "I didn't catch that.\n\n" + proposals.render_disambiguation(proposal), root)
async def on_message(room: MatrixRoom, event: RoomMessageText):
if event.sender == mx["user_id"]:
+19 -6
View File
@@ -70,19 +70,32 @@ def _authed(method, path, body=None):
def match(proposal):
"""Return {'id', 'name'} for an existing investor matching this proposal, else None."""
"""Resolve new-vs-existing for this proposal against the CRM matcher.
Returns {'match': {...}|None, 'candidates': [...]}:
- `match` is a confident EXACT existing investor — {'id', 'name'} — that the bot
auto-attaches a note to (no human disambiguation needed).
- `candidates` is a ranked list of fuzzy NEAR-matches — each {'id', 'name', 'score',
'matched_on'} — surfaced in-thread for the human to pick from (or confirm "new")
when there is no exact match, so a typo'd/near-duplicate name doesn't silently
create a second investor."""
q = proposal.get("investor_name") or proposal.get("contact_name") or ""
email = proposal.get("contact_email") or ""
if not q and not email:
return None
return {"match": None, "candidates": []}
qs = urlencode({"q": q, "email": email})
status, data = _authed("GET", f"/api/intake/match?{qs}")
if status != 200:
raise RuntimeError(f"intake match failed ({status}): {data.get('error') or data}")
m = (data.get("data") or {}).get("match")
if not m:
return None
return {"id": m["id"], "name": m.get("investor_name") or q}
payload = data.get("data") or {}
m = payload.get("match")
match_out = {"id": m["id"], "name": m.get("investor_name") or q} if m else None
candidates = [
{"id": c["id"], "name": c.get("investor_name") or "?",
"score": c.get("score"), "matched_on": c.get("matched_on")}
for c in (payload.get("candidates") or []) if c.get("id")
]
return {"match": match_out, "candidates": candidates}
def build_commit_payload(proposal):
+57 -1
View File
@@ -2,7 +2,13 @@
The model only EXTRACTS structure; it never decides to write anything. New-vs-existing is
finalized in M2 against the CRM matcher — here `intent` is the model's first read.
`revise()` is the conversational-edit leg: a free-form correction the human types in the
proposal thread (e.g. "add that we met June 14") is applied to the pending proposal via the
same local Qwen — no Claude, no scrub. Email integrity is preserved: a changed address must
literally appear in the instruction the human typed; the model can never mint one. (The
original message is stashed on the proposal as `_source_text`, but the revision merge itself
only searches the instruction.)
"""
import json
import re
import spark
@@ -60,4 +66,54 @@ def parse_message(text, parse_fn=spark.parse_json):
"""Parse one intake message. `parse_fn` is injectable for tests (defaults to Spark/Qwen).
Returns a normalized proposal dict. On a model/transport failure, raises (caller decides)."""
raw = parse_fn(text, system=SYSTEM, max_tokens=400)
return normalize(raw, source_text=text)
proposal = normalize(raw, source_text=text)
# Stash the original message so a later revise() can re-check email integrity against it.
proposal["_source_text"] = text
return proposal
REVISE_SYSTEM = (
"You revise a structured investor-intake proposal from a short correction a venture-fund "
"team member typed. You are given the CURRENT proposal as JSON and an INSTRUCTION. Apply "
"the instruction and reply with ONLY the full revised JSON object, these keys:\n"
' "investor_name", "contact_name", "contact_email", "contact_title", "note".\n'
"Change ONLY what the instruction asks; copy every other field through unchanged. Use null "
"for a field the instruction clears or that is genuinely absent. Never invent an email "
"address. Output JSON only."
)
_REVISABLE = ("investor_name", "contact_name", "contact_title", "note")
def _apply_revision(proposal, model_out, instruction):
    """Overlay the model's revised fields on a copy of `proposal` and return the copy.

    Pure (no I/O), so it can be tested offline. Every key of `proposal` is copied through,
    which is how control keys such as _match_id / _stage / intent / _source_text survive;
    only the keys listed in `_REVISABLE` are taken from `model_out`, each passed through
    `_clean`.

    Email integrity: `contact_email` is never read from `model_out`. It changes only when
    an address literally occurs in the human's INSTRUCTION text (first `_EMAIL_RE` hit,
    trailing punctuation trimmed); otherwise the proposal's existing address is kept.
    """
    fields = model_out or {}
    merged = dict(proposal)
    merged.update({key: _clean(fields.get(key)) for key in _REVISABLE if key in fields})

    # The only route by which the address can change: the human typed it.
    found = _EMAIL_RE.search(instruction or "")
    if found is not None:
        merged["contact_email"] = found.group(0).rstrip(".,;:!?)]}>\"'")

    # A revision must not leave the proposal with neither an investor nor a contact name;
    # if it would, restore both names from the incoming proposal.
    if not (merged.get("investor_name") or merged.get("contact_name")):
        merged["investor_name"] = proposal.get("investor_name")
        merged["contact_name"] = proposal.get("contact_name")
    return merged
def revise(proposal, instruction, parse_fn=spark.parse_json):
    """Run a natural-language correction against a pending proposal and return the result.

    The five content fields of `proposal` are serialized as JSON and sent, together with the
    stripped `instruction`, to `parse_fn` under the `REVISE_SYSTEM` prompt. The reply is then
    merged back via `_apply_revision`. `parse_fn` is injectable for tests and defaults to
    `spark.parse_json`.
    """
    shown_keys = ("investor_name", "contact_name", "contact_email", "contact_title", "note")
    snapshot = {}
    for key in shown_keys:
        snapshot[key] = proposal.get(key)
    current_json = json.dumps(snapshot, ensure_ascii=False)
    ask = (instruction or "").strip()
    prompt = "CURRENT:\n" + current_json + "\n\nINSTRUCTION:\n" + ask
    model_out = parse_fn(prompt, system=REVISE_SYSTEM, max_tokens=400)
    # The raw (unstripped) instruction is what the email-integrity check inspects.
    return _apply_revision(proposal, model_out, instruction)
+78
View File
@@ -5,7 +5,12 @@ Matrix thread root (the bot's proposal lives in a thread rooted at the user's me
the user replies inside that thread). In-memory and ephemeral by design — a restart drops
pending proposals (the user just re-sends), matching matrix-bridge's stateless-by-default
ethos. Nothing here writes to the CRM; the bot calls the CRM client only after `approve`.
A proposal carries a `_stage`: "approval" (the normal yes/edit/no card) or "disambiguate"
(a fuzzy-match shortlist the human must resolve — pick a number / "new" / "no" — before it
becomes an approval-stage proposal). The shortlist itself rides on `_candidates`.
"""
import re
# field aliases accepted in `edit <field>=<value>`
_EDIT_ALIASES = {
@@ -18,6 +23,10 @@ _EDIT_ALIASES = {
_YES = {"yes", "y", "approve", "approved", "ok", "confirm", "go", "👍", ""}
_NO = {"no", "n", "cancel", "discard", "reject", "stop", "👎", ""}
# "create a new investor anyway" replies to a disambiguation shortlist
_NEW = {"new", "none", "new investor", "none of these", "create", "create new", "add new", "neither"}
_CONTENT_FIELDS = ("intent", "investor_name", "contact_name", "contact_email", "contact_title", "note")
class ProposalStore:
@@ -84,6 +93,75 @@ def apply_edit(proposal, field, value):
return updated
def same_fields(a, b):
    """Report whether two proposals agree on every key in `_CONTENT_FIELDS`.

    Keys outside `_CONTENT_FIELDS` are not compared, and a falsy proposal is treated as an
    empty dict. Used to spot a natural-language revision that changed nothing, so the bot
    doesn't announce 'Updated' for a no-op.
    """
    left = a or {}
    right = b or {}
    for key in _CONTENT_FIELDS:
        if left.get(key) != right.get(key):
            return False
    return True
def interpret_disambiguation(text, n_candidates):
    """Classify a human reply to a fuzzy-match shortlist.

    Returns one of:
      ("pick", index)   - a bare 1-based number (optionally '#'-prefixed) within range;
                          `index` is 0-based
      ("new", None)     - the reply is one of the `_NEW` phrases
      ("reject", None)  - the reply is one of the `_NO` words
      ("unknown", None) - anything else, including blank and out-of-range numbers

    Matching is done on the stripped, lower-cased reply.
    """
    reply = (text or "").strip().lower()
    if reply == "":
        # Checked before the word sets, so a blank reply never classifies as reject/new.
        return ("unknown", None)
    if reply in _NO:
        return ("reject", None)
    if reply in _NEW:
        return ("new", None)
    numbered = re.fullmatch(r"#?\s*(\d{1,2})", reply)
    if numbered is not None:
        position = int(numbered.group(1))
        if 1 <= position <= n_candidates:
            return ("pick", position - 1)
    return ("unknown", None)
def attach_to_candidate(proposal, candidate):
    """Turn a shortlist pick into an approval-stage meeting note on the chosen investor.

    Returns a new dict; `proposal` itself is not modified. The copy loses `_candidates`,
    gets `_stage="approval"`, `_match_id=candidate["id"]` and `intent="meeting_note"`, and
    takes the candidate's name as `investor_name` when the candidate has a non-empty one.
    """
    promoted = {key: value for key, value in proposal.items() if key != "_candidates"}
    promoted.update(_stage="approval", _match_id=candidate["id"], intent="meeting_note")
    canonical = candidate.get("name")
    if canonical:
        # Show the existing investor's own name rather than what the human typed.
        promoted["investor_name"] = canonical
    return promoted
def promote_to_new(proposal):
    """Handle the disambiguation answer 'new': proceed as a new-investor proposal.

    Returns a copy of `proposal` without `_candidates` and `_match_id`, and with `_stage`
    set to "approval". The input dict is left untouched.
    """
    dropped = ("_candidates", "_match_id")
    fresh = {key: value for key, value in proposal.items() if key not in dropped}
    fresh["_stage"] = "approval"
    return fresh
def render_disambiguation(proposal):
    """Build the in-thread shortlist message a human resolves before a new investor is added.

    The header names the proposal's investor (falling back to the contact name, then "?").
    Each entry of `_candidates` becomes a 1-based numbered line, followed by a blank line and
    the reply instructions. Lines are joined with newlines.
    """
    subject = proposal.get("investor_name") or proposal.get("contact_name") or "?"
    header = f"🔎 Before adding **{subject}** as new — these existing investors look similar:"
    numbered = [
        f" **{position}.** {cand.get('name') or '?'}"
        for position, cand in enumerate(proposal.get("_candidates") or [], 1)
    ]
    footer = ("Reply a **number** to log this against that investor, **new** to add it as a "
              "new investor, or **no** to discard.")
    return "\n".join([header, *numbered, "", footer])
def disambiguation_nudge(proposal):
    """Return the short main-timeline pointer for a proposal awaiting disambiguation.

    The message names the investor (or the contact, or "?" when neither is set) and sends
    the reader to the thread, where the shortlist lives.
    """
    name = proposal.get("investor_name")
    if not name:
        name = proposal.get("contact_name") or "?"
    return (f"🔎 **{name}** may match an existing investor — open the **thread** to pick one "
            "or confirm it's new.")
def render(proposal):
"""Render a proposal as the in-thread message a human approves."""
if proposal.get("intent") == "meeting_note":
+55
View File
@@ -58,6 +58,61 @@ def test_subject_blank_when_note_present_else_provenance_label():
assert no_note["subject"] == "Intake (Matrix)"
def _with_stub_authed(reply, capture=None):
"""Swap crm_client._authed for a canned (status, data); return a restorer."""
orig = crm_client._authed
def fake(method, path, body=None):
if capture is not None:
capture["path"] = path
return reply
crm_client._authed = fake
return orig
def test_match_parses_exact_match():
cap = {}
orig = _with_stub_authed((200, {"data": {
"match": {"id": "rowAcme", "investor_name": "Acme Capital", "matched_on": "name"},
"candidates": [],
}}), cap)
try:
res = crm_client.match({"investor_name": "Acme Capital", "contact_email": ""})
finally:
crm_client._authed = orig
assert res["match"] == {"id": "rowAcme", "name": "Acme Capital"}
assert res["candidates"] == []
assert "q=Acme" in cap["path"] # the query was forwarded
def test_match_returns_ranked_candidates_when_no_exact():
orig = _with_stub_authed((200, {"data": {"match": None, "candidates": [
{"id": "rowCharlie", "investor_name": "Charlie Brown", "score": 0.92, "matched_on": "name"},
{"id": "rowBeta", "investor_name": "Beta Capital LLC", "score": 0.86, "matched_on": "name"},
]}}))
try:
res = crm_client.match({"investor_name": "Charles Brown"})
finally:
crm_client._authed = orig
assert res["match"] is None
assert [c["id"] for c in res["candidates"]] == ["rowCharlie", "rowBeta"]
assert res["candidates"][0]["name"] == "Charlie Brown"
assert res["candidates"][0]["matched_on"] == "name"
def test_match_no_query_skips_network():
def boom(*a, **k):
raise AssertionError("should not hit the network when there's nothing to match on")
orig = crm_client._authed
crm_client._authed = boom
try:
res = crm_client.match({"investor_name": None, "contact_name": None, "contact_email": None})
finally:
crm_client._authed = orig
assert res == {"match": None, "candidates": []}
if __name__ == "__main__":
fns = [v for k, v in sorted(globals().items()) if k.startswith("test_") and callable(v)]
for fn in fns:
+59
View File
@@ -102,6 +102,65 @@ def test_none_model_reply_is_unclear():
assert p["intent"] == "unclear"
def test_parse_message_stashes_source_text():
p = parse.parse_message("Acme Capital, Jane jane@acme.com",
parse_fn=_stub({"intent": "new_investor", "investor_name": "Acme Capital",
"contact_name": "Jane", "contact_email": "jane@acme.com"}))
assert p["_source_text"] == "Acme Capital, Jane jane@acme.com"
def test_revise_applies_note_change_and_preserves_control_keys():
proposal = parse.parse_message(
"New investor Acme Capital, Jane Doe jane@acme.com",
parse_fn=_stub({"intent": "new_investor", "investor_name": "Acme Capital",
"contact_name": "Jane Doe", "contact_email": "jane@acme.com",
"contact_title": None, "note": None}))
revised = parse.revise(
proposal, "add that we met on June 14",
parse_fn=_stub({"investor_name": "Acme Capital", "contact_name": "Jane Doe",
"contact_email": "jane@acme.com", "contact_title": None,
"note": "met on June 14"}))
assert revised["note"] == "met on June 14"
assert revised["investor_name"] == "Acme Capital"
assert revised["intent"] == "new_investor" # control key preserved
assert revised["_source_text"] == proposal["_source_text"] # preserved for email integrity
def test_revise_email_taken_only_from_instruction():
proposal = {"intent": "new_investor", "investor_name": "Acme", "contact_name": "Jane",
"contact_email": "jane@acme.com", "contact_title": None, "note": None,
"_source_text": "Acme, Jane jane@acme.com"}
# instruction literally carries the new address → accepted
r1 = parse.revise(proposal, "her email is jane@newfirm.com",
parse_fn=_stub({"contact_email": "jane@newfirm.com"}))
assert r1["contact_email"] == "jane@newfirm.com"
# model tries to change the email but the instruction has no address → keep the existing one
r2 = parse.revise(proposal, "set her title to GP",
parse_fn=_stub({"contact_email": "totally@madeup.test", "contact_title": "GP"}))
assert r2["contact_email"] == "jane@acme.com" # model's email ignored (not in instruction)
assert r2["contact_title"] == "GP"
def test_revise_preserves_match_id():
proposal = {"intent": "meeting_note", "investor_name": "Acme", "contact_name": None,
"contact_email": None, "contact_title": None, "note": "old",
"_match_id": "rowAcme", "_stage": "approval", "_source_text": "note for Acme: old"}
revised = parse.revise(proposal, "change the note to: sent the deck",
parse_fn=_stub({"note": "sent the deck"}))
assert revised["note"] == "sent the deck"
assert revised["_match_id"] == "rowAcme"
assert revised["intent"] == "meeting_note"
def test_revise_cannot_empty_the_proposal():
proposal = {"intent": "new_investor", "investor_name": "Acme", "contact_name": "Jane",
"contact_email": None, "contact_title": None, "note": "x", "_source_text": "Acme Jane"}
revised = parse.revise(proposal, "clear it",
parse_fn=_stub({"investor_name": None, "contact_name": None,
"contact_title": None, "note": None}))
assert revised["investor_name"] == "Acme" and revised["contact_name"] == "Jane"
if __name__ == "__main__":
fns = [v for k, v in sorted(globals().items()) if k.startswith("test_") and callable(v)]
for fn in fns:
+74
View File
@@ -1,4 +1,5 @@
"""Tests for the proposal store + approval state machine (pure logic, no network)."""
import copy
import os
import sys
@@ -105,6 +106,79 @@ def test_summary_line_new_vs_note():
assert "thread" in new_line.lower()
# --- fuzzy-match disambiguation + conversational-revision helpers ---
DISAMBIG = {"intent": "new_investor", "investor_name": "Charles Brown",
"contact_name": "Charles Brown", "contact_email": None, "contact_title": None,
"note": "met at conf", "_stage": "disambiguate",
"_candidates": [{"id": "rowCharlie", "name": "Charlie Brown", "score": 0.92, "matched_on": "name"},
{"id": "rowBeta", "name": "Beta Capital LLC", "score": 0.7, "matched_on": "name"}]}
def test_interpret_disambiguation_pick_number():
assert proposals.interpret_disambiguation("1", 2) == ("pick", 0)
assert proposals.interpret_disambiguation(" 2 ", 2) == ("pick", 1)
assert proposals.interpret_disambiguation("#1", 2) == ("pick", 0)
def test_interpret_disambiguation_out_of_range_is_unknown():
assert proposals.interpret_disambiguation("3", 2)[0] == "unknown"
assert proposals.interpret_disambiguation("0", 2)[0] == "unknown"
def test_interpret_disambiguation_new_and_no():
assert proposals.interpret_disambiguation("new", 2)[0] == "new"
assert proposals.interpret_disambiguation("none of these", 2)[0] == "new"
assert proposals.interpret_disambiguation("no", 2)[0] == "reject"
def test_interpret_disambiguation_freeform_is_unknown():
# a free-form reply in the shortlist stage isn't guessed at — re-prompt instead
assert proposals.interpret_disambiguation("the first one", 2)[0] == "unknown"
def test_attach_to_candidate_promotes_to_meeting_note():
out = proposals.attach_to_candidate(DISAMBIG, DISAMBIG["_candidates"][0])
assert out["_match_id"] == "rowCharlie"
assert out["intent"] == "meeting_note"
assert out["_stage"] == "approval"
assert out["investor_name"] == "Charlie Brown" # canonical existing name shown
assert "_candidates" not in out
assert "_candidates" in DISAMBIG # original untouched
def test_promote_to_new_clears_shortlist_and_match():
out = proposals.promote_to_new(dict(DISAMBIG, _match_id="rowX"))
assert out["_stage"] == "approval"
assert "_candidates" not in out
assert "_match_id" not in out
def test_disambiguation_pick_then_yes_reaches_approval():
# Closes the seam between the two state machines: a shortlist pick promotes the proposal to
# approval stage carrying the chosen investor's row id, and a following 'yes' classifies as
# approve (the normal commit path) — so pick -> yes lands the note on the existing investor.
picked = proposals.attach_to_candidate(copy.deepcopy(DISAMBIG), DISAMBIG["_candidates"][0])
assert picked["_stage"] == "approval"
assert picked["_match_id"] == "rowCharlie"
assert picked["intent"] == "meeting_note"
assert proposals.interpret_reply("yes") == ("approve", None)
def test_render_disambiguation_lists_numbered_candidates():
text = proposals.render_disambiguation(DISAMBIG)
assert "Charlie Brown" in text and "Beta Capital LLC" in text
assert "1." in text and "2." in text
assert "new" in text.lower() and "no" in text.lower()
def test_same_fields_ignores_control_keys():
a = dict(SAMPLE)
assert proposals.same_fields(a, dict(a))
assert not proposals.same_fields(a, dict(a, note="different"))
assert proposals.same_fields(a, dict(a, _match_id="r1", _stage="approval"))
if __name__ == "__main__":
fns = [v for k, v in sorted(globals().items()) if k.startswith("test_") and callable(v)]
for fn in fns:
+127 -2
View File
@@ -15,6 +15,7 @@ import uuid
import csv
import io
import re
import difflib
import base64
import threading
from datetime import datetime, timedelta
@@ -1254,6 +1255,124 @@ def find_intake_match(conn, q, email=None):
return email_hit
def _email_edit_distance(a, b):
"""Levenshtein distance between two short strings (emails). Stdlib-only DP; used to flag
near-miss emails (a one- or two-character typo) for the intake fuzzy matcher."""
a = (a or '').strip().lower()
b = (b or '').strip().lower()
if a == b:
return 0
if not a or not b:
return max(len(a), len(b))
prev = list(range(len(b) + 1))
for i, ca in enumerate(a, 1):
cur = [i]
for j, cb in enumerate(b, 1):
cost = 0 if ca == cb else 1
cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + cost))
prev = cur
return prev[-1]
# Legal-entity suffixes stripped before name comparison so 'Acme Capital' ~ 'Acme Capital LLC'.
# Deliberately tight: only true entity types, NOT name-type words (Capital/Ventures/Partners),
# which are distinctive enough to keep. Intentionally EXCLUDES 'sa'/'ag' (Société Anonyme /
# Aktiengesellschaft) — niche for this portfolio and ambiguous enough as coincidental trailing
# tokens that stripping them inflates false 1.0 matches across distinct firms.
_LEGAL_SUFFIX = {"llc", "lp", "llp", "inc", "incorporated", "ltd", "limited", "co", "corp",
"corporation", "company", "plc", "gmbh", "pte"}
def _strip_legal_suffix(normalized_name):
"""Drop trailing legal-entity suffix tokens (llc/lp/inc/…) from an already-normalized name."""
toks = re.findall(r"[a-z0-9]+", normalized_name)
while toks and toks[-1] in _LEGAL_SUFFIX:
toks.pop()
return " ".join(toks)
def _name_similarity(a, b):
    """Score how alike two investor names are, from 0.0 to 1.0.

    Both names go through `_normalize_text`, then lose any trailing legal-entity suffix. The
    score is the larger of:
      - difflib's sequence ratio, which catches near-spellings ('Charlie'/'Charles'), and
      - token-set Jaccard overlap, which catches word-order differences.

    Names that differ only by 'LLC'/'LP'/'Inc' score 1.0 — a near-certain duplicate worth
    surfacing. This favors recall: a shared common name-word ('… Capital') can lift unrelated
    firms into the 0.6–0.8 band, which is acceptable noise in a ranked, human-confirmed
    shortlist; semantic pruning is left to the deferred LLM judge.
    """
    left = _normalize_text(a)
    right = _normalize_text(b)
    if not (left and right):
        return 0.0
    if left == right:
        return 1.0

    # Fall back to the full name when stripping would leave nothing (e.g. a name of 'LLC').
    core_left = _strip_legal_suffix(left) or left
    core_right = _strip_legal_suffix(right) or right
    if core_left == core_right:
        return 1.0

    words_left = set(re.findall(r"[a-z0-9]+", core_left))
    words_right = set(re.findall(r"[a-z0-9]+", core_right))
    union = words_left | words_right
    overlap = len(words_left & words_right) / len(union) if union else 0.0
    spelling = difflib.SequenceMatcher(None, core_left, core_right).ratio()
    return max(spelling, overlap)
def find_intake_candidates(conn, q, email=None, limit=5, min_score=0.62, max_email_distance=2):
    """Return ranked fuzzy near-matches for the intake bot's disambiguation prompt.

    Complements find_intake_match: when the exact matcher misses, this lists the closest
    existing grid investors so a human can attach to one instead of unknowingly creating a
    duplicate. Deterministic (difflib + token overlap + email edit distance), no LLM.

    Reads the `grid_json` blob of the 'main' fundraising_state row and scores each row dict:
      - name score: `_name_similarity(q, investor_name)`; rows whose normalized name equals
        the normalized query are skipped outright. A name differing only by a legal suffix
        is not skipped and can therefore score 1.0.
      - email score: `1.0 - 0.1 * distance` for any contact email whose edit distance from
        `email` is between 1 and `max_email_distance`; distance 0 does not score here.
    A row's score is the larger of the two, and rows below `min_score` are dropped.

    Returns at most `limit` dicts {"id", "investor_name", "score", "matched_on"}, highest
    score first, one per row id. Returns [] when the state row or blob is missing or the
    blob is not valid JSON.

    NOTE(review): `conn` is presumably a sqlite3 connection whose rows support name
    indexing (row['grid_json']) — confirm against get_db().
    """
    state = conn.execute("SELECT grid_json FROM fundraising_state WHERE id = 'main'").fetchone()
    if not state or not state['grid_json']:
        return []
    try:
        grid = json.loads(state['grid_json'])
    except Exception:
        return []
    rows = grid.get('rows', []) if isinstance(grid, dict) else []

    wanted_name = _normalize_text(q) if q else ''
    wanted_email = (email or '').strip().lower()

    def near_miss_email_score(contacts):
        """Best near-miss score across one row's contact emails (0.0 when none qualify)."""
        if not isinstance(contacts, list):
            return 0.0
        best = 0.0
        for contact in contacts:
            if not isinstance(contact, dict):
                continue
            address = str(contact.get('email') or '').strip().lower()
            if not address:
                continue
            dist = _email_edit_distance(wanted_email, address)
            # dist 0 is an exact email (find_intake_match's job); 1 -> 0.9, 2 -> 0.8
            if 0 < dist <= max_email_distance:
                best = max(best, 1.0 - 0.1 * dist)
        return best

    best_by_row = {}
    for entry in rows:
        if not isinstance(entry, dict):
            continue
        row_id = str(entry.get('id') or '').strip()
        if not row_id:
            continue
        name = str(entry.get('investor_name') or '').strip()
        # An exact name match belongs to find_intake_match — never echo it as a candidate.
        if wanted_name and _normalize_text(name) == wanted_name:
            continue

        name_score = 0.0
        if wanted_name and name:
            name_score = _name_similarity(wanted_name, name)
        email_score = near_miss_email_score(entry.get('contacts')) if wanted_email else 0.0

        score = max(name_score, email_score)
        if score < min_score:
            continue  # too weak to be a useful suggestion

        matched_on = 'email' if email_score >= name_score else 'name'
        # The same row id may appear more than once; keep its highest-scoring read.
        previous = best_by_row.get(row_id)
        if previous is None or score > previous['score']:
            best_by_row[row_id] = {"id": row_id, "investor_name": name,
                                   "score": round(score, 3), "matched_on": matched_on}

    ranked = list(best_by_row.values())
    ranked.sort(key=lambda item: item['score'], reverse=True)
    return ranked[:limit]
def ensure_fundraising_state_row(conn):
existing = conn.execute("SELECT * FROM fundraising_state WHERE id = 'main'").fetchone()
if not existing:
@@ -2950,7 +3069,12 @@ class CRMHandler(BaseHTTPRequestHandler):
def handle_intake_match(self, user, params):
"""Read-only: does an investor matching this intake already exist? Used by the
Matrix intake bot to label its in-thread proposal new-vs-existing. Returns the
grid row id so an approved note lands on exactly that investor."""
grid row id so an approved note lands on exactly that investor.
`match` is the confident exact match (auto-attached by the bot). When there is no
exact match, `candidates` carries ranked fuzzy near-matches so the bot can surface
a disambiguation shortlist in-thread (the human picks one or creates new) — closing
the duplicate-investor hole the exact-only matcher leaves open."""
q = str(params.get('q') or '').strip()
email = str(params.get('email') or '').strip()
if not q and not email:
@@ -2958,9 +3082,10 @@ class CRMHandler(BaseHTTPRequestHandler):
conn = get_db()
try:
match = find_intake_match(conn, q, email)
candidates = find_intake_candidates(conn, q, email) if match is None else []
finally:
conn.close()
return self.send_json({"data": {"match": match}})
return self.send_json({"data": {"match": match, "candidates": candidates}})
def handle_update_communication(self, user, comm_id, body):
conn = get_db()
+59
View File
@@ -71,6 +71,10 @@ GRID = {
"rows": [
{"id": "rowAcme", "investor_name": "Acme Capital", "notes": "",
"contacts": [{"name": "Jane Doe", "email": "jane@acme.com", "title": "GP"}]},
{"id": "rowCharlie", "investor_name": "Charlie Brown", "notes": "",
"contacts": [{"name": "Charlie Brown", "email": "cb@brown.fund", "title": ""}]},
{"id": "rowBeta", "investor_name": "Beta Capital LLC", "notes": "",
"contacts": [{"name": "Pat Roe", "email": "pat@beta.com", "title": ""}]},
],
}
@@ -119,6 +123,61 @@ def main():
check(st == 200 and (d or {}).get("data", {}).get("match") is None,
f"no match -> null (got {st}, {d})")
print("\n[fuzzy: exact match returns no candidates (bot auto-attaches)]")
st, d = _req(port, "GET", "/api/intake/match?q=Acme%20Capital", token)
data = (d or {}).get("data", {})
check(st == 200 and data.get("match") and data.get("candidates") == [],
f"exact match -> match set, candidates empty (got {data})")
print("\n[fuzzy: near-spelling surfaces a candidate (Charles Brown ~ Charlie Brown)]")
st, d = _req(port, "GET", "/api/intake/match?q=Charles%20Brown", token)
data = (d or {}).get("data", {})
cids = [c["id"] for c in data.get("candidates", [])]
check(data.get("match") is None and "rowCharlie" in cids,
f"near-spelling -> candidate rowCharlie, no exact (got {data})")
print("\n[fuzzy: legal-suffix difference surfaces a candidate (Beta Capital ~ Beta Capital LLC)]")
st, d = _req(port, "GET", "/api/intake/match?q=Beta%20Capital", token)
data = (d or {}).get("data", {})
cids = [c["id"] for c in data.get("candidates", [])]
check(data.get("match") is None and "rowBeta" in cids,
f"legal-suffix -> candidate rowBeta, no exact (got {data})")
print("\n[fuzzy: legal-suffix-only difference ranks as a top candidate (Acme Capital LLC ~ Acme Capital)]")
st, d = _req(port, "GET", "/api/intake/match?q=Acme%20Capital%20LLC", token)
data = (d or {}).get("data", {})
top = (data.get("candidates") or [None])[0]
check(data.get("match") is None and top and top["id"] == "rowAcme" and top["score"] == 1.0,
f"legal-suffix-only -> rowAcme top candidate @1.0, no exact (got {data})")
print("\n[fuzzy: one-character email typo surfaces a candidate by email]")
st, d = _req(port, "GET", "/api/intake/match?email=jhane@acme.com", token)
data = (d or {}).get("data", {})
cands = data.get("candidates", [])
hit = next((c for c in cands if c["id"] == "rowAcme"), None)
check(data.get("match") is None and hit and hit["matched_on"] == "email",
f"email typo -> candidate rowAcme matched_on email (got {data})")
print("\n[fuzzy: two-character email typo (distance 2) still surfaces]")
st, d = _req(port, "GET", "/api/intake/match?email=jane@acne.con", token) # acme->acne, com->con
data = (d or {}).get("data", {})
hit = next((c for c in data.get("candidates", []) if c["id"] == "rowAcme"), None)
check(data.get("match") is None and hit and hit["matched_on"] == "email" and hit["score"] == 0.8,
f"dist-2 email -> rowAcme @0.8 (got {data})")
print("\n[fuzzy: a row matching on BOTH name and email appears once (deduped)]")
st, d = _req(port, "GET", "/api/intake/match?q=Acme%20Capitol&email=jhane@acme.com", token)
data = (d or {}).get("data", {})
acme_hits = [c for c in data.get("candidates", []) if c["id"] == "rowAcme"]
check(data.get("match") is None and len(acme_hits) == 1,
f"name+email both match rowAcme -> single deduped entry (got {data})")
print("\n[fuzzy: nothing close -> empty candidates]")
st, d = _req(port, "GET", "/api/intake/match?q=Zphq%20Nobody%20LP", token)
data = (d or {}).get("data", {})
check(st == 200 and data.get("match") is None and data.get("candidates") == [],
f"unrelated query -> no match, no candidates (got {data})")
print("\n[match: missing q and email -> 400]")
st, _ = _req(port, "GET", "/api/intake/match", token)
check(st == 400, f"no params -> 400 (got {st})")