Files
ten31-database/backend/matrix_intake/parse.py
T
Keysat 0b893295e1 Matrix intake: fuzzy investor matching + conversational in-thread edits (v0.1.0:86)
Close the two locked post-deploy enhancements for the Matrix intake bot.

Fuzzy matching (server-side, ships in the s9pk): new find_intake_candidates in
server.py returns ranked deterministic near-matches (difflib name similarity +
token-set Jaccard, legal-suffix-aware, + email Levenshtein <= 2); GET
/api/intake/match now returns {match, candidates}. The bot surfaces a numbered
shortlist so a near-duplicate (Charlie/Charles, Acme Capital vs Acme Capital LLC,
a one-char email typo) is confirmed by a human instead of silently creating a
second investor. Exact match still auto-attaches; fuzzy candidates are never
auto-attached. The optional LLM-judge re-rank is deferred.

Conversational edits (bot-side, ships on the Spark): any in-thread reply that
isn't yes/no/edit field=value is treated as a natural-language revision and
re-run through local Qwen (parse.revise). Email integrity is preserved -- a
changed address must literally appear in the instruction; the model's email
field is structurally unreachable. No-op revisions re-prompt.

Docs/current-state brought current; 27/27 backend tests green.
2026-06-17 18:50:58 -05:00

120 lines
5.8 KiB
Python

"""Turn a free-text intake message into a normalized proposal via local Qwen.
The model only EXTRACTS structure; it never decides to write anything. New-vs-existing is
finalized in M2 against the CRM matcher — here `intent` is the model's first read.
`revise()` is the conversational-edit leg: a free-form correction the human types in the
proposal thread (e.g. "add that we met June 14") is applied to the pending proposal via the
same local Qwen — no Claude, no scrub. Email integrity is preserved: a changed address must
literally appear in the instruction; otherwise the address already checked against the
original message is kept. The model can never mint one.
"""
import json
import re
import spark
# System prompt for the extraction call made by parse_message(). It asks the model for a
# single JSON object carrying the same six keys as _FIELDS, with null for anything absent.
# The text is sent to the model verbatim, so edits here change model behavior.
SYSTEM = (
"You extract structured investor-intake data from a short message a venture-fund "
"team member typed. Reply with ONLY a JSON object, no prose, with these keys:\n"
' "intent": "new_investor" if the message introduces a new investor or prospect, '
'"meeting_note" if it logs a note/update about an investor, else "unclear".\n'
' "investor_name": the investing firm or entity name (e.g. "Acme Capital"), or null.\n'
' "contact_name": the individual person mentioned, or null.\n'
' "contact_email": the person\'s email if explicitly present, else null. Never invent one.\n'
' "contact_title": the person\'s role/title if stated, else null.\n'
' "note": any meeting note, context, or next step, else null.\n'
"Use null (not empty string) for anything not present. Output JSON only."
)
# Pragmatic email matcher (not a full RFC 5322 parser): local part, "@", dotted domain, and
# an alphabetic TLD of at least two letters. Because the pattern ends on letters, a match
# never includes trailing sentence punctuation.
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}")
# The only intent labels a proposal may carry; normalize() maps anything else to "unclear".
_VALID_INTENTS = {"new_investor", "meeting_note", "unclear"}
# Every key a normalized proposal is guaranteed to contain.
_FIELDS = ("intent", "investor_name", "contact_name", "contact_email", "contact_title", "note")
def _clean(v):
if v is None:
return None
s = str(v).strip()
if not s or s.lower() in ("null", "none", "n/a", "na", "unknown"):
return None
return s
def normalize(raw, source_text=""):
    """Coerce the model's dict into a stable proposal shape.

    `raw` is the model's parsed JSON object (None/empty is treated as "no data").
    `source_text` is the message the human typed; it is the ONLY place an email address may
    come from. Returns a new dict with all _FIELDS keys, each value a cleaned string or None,
    and `intent` always one of _VALID_INTENTS.

    Email selection: every address is extracted from `source_text`. If the model's
    `contact_email` is (case-insensitively) one of those literal addresses, that one is used —
    so a message quoting several addresses attaches the contact's, not merely the first one
    typed. Otherwise the first address in the message is used (salvaging one the model
    missed), or None when the message contains none (rejecting a hallucinated one)."""
    raw = raw or {}
    out = {k: _clean(raw.get(k)) for k in _FIELDS}

    # Tolerate "New Investor" / "meeting-note" style spellings before validating.
    intent = (out["intent"] or "").lower().replace("-", "_").replace(" ", "_")
    out["intent"] = intent if intent in _VALID_INTENTS else "unclear"

    # Email integrity: only accept an address that literally appears in the source message.
    # The model is unreliable for verbatim strings and must never mint an address — anything
    # not present in what the human typed is dropped (a wrong email in the CRM is worse than
    # none). The model may only *select* among addresses the human actually typed.
    # The rstrip is a defensive no-op while _EMAIL_RE ends on letters.
    typed = [
        m.group(0).rstrip(".,;:!?)]}>\"'")
        for m in _EMAIL_RE.finditer(source_text or "")
    ]
    suggested = (out["contact_email"] or "").lower()
    out["contact_email"] = next(
        (addr for addr in typed if addr.lower() == suggested),
        typed[0] if typed else None,
    )

    # An intake with no firm AND no person is not actionable.
    if not out["investor_name"] and not out["contact_name"]:
        out["intent"] = "unclear"
    return out
def parse_message(text, parse_fn=spark.parse_json):
    """Run one intake message through the extractor and return a normalized proposal dict.

    `parse_fn` is called as ``parse_fn(text, system=SYSTEM, max_tokens=400)`` and is
    injectable for tests (the default is Spark/Qwen). Nothing is caught here: whatever
    `parse_fn` raises on a model/transport failure propagates to the caller."""
    model_reply = parse_fn(text, system=SYSTEM, max_tokens=400)
    # The original message rides along under "_source_text" so later steps can re-check
    # email integrity against it (its consumer is not visible in this module).
    return {**normalize(model_reply, source_text=text), "_source_text": text}
# System prompt for the conversational-edit call made by revise(). The model receives the
# current proposal as JSON plus the human's instruction and must return the full revised
# object. The text is sent to the model verbatim, so edits here change model behavior.
REVISE_SYSTEM = (
"You revise a structured investor-intake proposal from a short correction a venture-fund "
"team member typed. You are given the CURRENT proposal as JSON and an INSTRUCTION. Apply "
"the instruction and reply with ONLY the full revised JSON object, these keys:\n"
' "investor_name", "contact_name", "contact_email", "contact_title", "note".\n'
"Change ONLY what the instruction asks; copy every other field through unchanged. Use null "
"for a field the instruction clears or that is genuinely absent. Never invent an email "
"address. Output JSON only."
)
# Fields a revision may overwrite from the model's reply. contact_email is absent on purpose
# (the prompt still asks for it, but _apply_revision() takes a new address only from the
# human's instruction text); intent is absent so it is carried through unchanged.
_REVISABLE = ("investor_name", "contact_name", "contact_title", "note")
def _apply_revision(proposal, model_out, instruction):
    """Merge the model's revised fields onto the proposal. Pure + offline-testable.

    `proposal` is the pending proposal dict (not mutated; a shallow copy is returned).
    `model_out` is the model's parsed reply; anything that is not a dict is treated as
    "no changes" instead of crashing. `instruction` is the text the human typed.

    Only the keys in _REVISABLE are taken from the model; every other key (control keys such
    as _match_id / _stage / intent / _source_text) is copied through untouched.

    Email integrity: a revised address is taken only if it literally appears in the
    INSTRUCTION the human typed; otherwise the existing (already integrity-checked) address
    is kept. The model's own email field is never trusted — it must not mint an address.
    When the instruction quotes several addresses (e.g. "change bob@acme.com to bob@acme.co"),
    the first one that differs from the current address wins, so quoting the old address
    alongside the new one does not turn the correction into a no-op."""
    # A non-dict reply (string, list, ...) carries no usable fields.
    if not isinstance(model_out, dict):
        model_out = {}

    out = dict(proposal)
    for k in _REVISABLE:
        if k in model_out:
            out[k] = _clean(model_out.get(k))

    # The rstrip is a defensive no-op while _EMAIL_RE ends on letters.
    typed = [
        m.group(0).rstrip(".,;:!?)]}>\"'")
        for m in _EMAIL_RE.finditer(instruction or "")
    ]
    if typed:
        current = str(proposal.get("contact_email") or "").lower()
        # Prefer the address that is actually new; fall back to the first one typed when
        # every quoted address equals the current one.
        out["contact_email"] = next(
            (addr for addr in typed if addr.lower() != current),
            typed[0],
        )
    # else: keep proposal's current contact_email (already copied by dict() above).

    # Don't let a revision strip the proposal down to nothing actionable.
    if not out.get("investor_name") and not out.get("contact_name"):
        out["investor_name"] = proposal.get("investor_name")
        out["contact_name"] = proposal.get("contact_name")
    return out
def revise(proposal, instruction, parse_fn=spark.parse_json):
    """Apply a free-form correction to a pending proposal via local Qwen.

    Builds a prompt holding the five editable fields of `proposal` as JSON followed by the
    stripped `instruction`, sends it through `parse_fn` (injectable for tests; defaults to
    Spark/Qwen) with REVISE_SYSTEM, and returns the result of merging the reply onto the
    proposal with _apply_revision(). Exceptions from `parse_fn` propagate to the caller."""
    shown_keys = ("investor_name", "contact_name", "contact_email", "contact_title", "note")
    snapshot = {}
    for key in shown_keys:
        snapshot[key] = proposal.get(key)

    typed = (instruction or "").strip()
    prompt = "".join((
        "CURRENT:\n",
        json.dumps(snapshot, ensure_ascii=False),
        "\n\nINSTRUCTION:\n",
        typed,
    ))

    model_reply = parse_fn(prompt, system=REVISE_SYSTEM, max_tokens=400)
    # The raw instruction (not the stripped copy) is what the email check inspects.
    return _apply_revision(proposal, model_reply, instruction)