Files
ten31-database/backend/matrix_intake/parse.py
T
Keysat 7ad0ee7624 Add Matrix intake bot (M1+M2): typed message → approved fundraising-grid write
New backend/matrix_intake/ runs as its own process (matrix-nio isolated from the
stdlib CRM): local-Qwen parse via Spark Control → in-thread human approval
(yes/edit/no) → write through the CRM's own log-communication endpoint, tagged
source=matrix_intake. Adds read-only GET /api/intake/match (returns grid row id,
no-duplicate contract); threads provenance through handle_log_fundraising_communication.
Reviewer-passed: pop-before-commit closes a double-approve race; edit-grammar fix.
Text-only v1; business-card photo (M3) deferred (no Spark vision model).
26/26 tests green; live Matrix smoke pending deploy.
2026-06-17 07:51:27 -05:00

64 lines
2.9 KiB
Python

"""Turn a free-text intake message into a normalized proposal via local Qwen.
The model only EXTRACTS structure; it never decides to write anything. New-vs-existing is
finalized in M2 against the CRM matcher — here `intent` is the model's first read.
"""
import re
import spark
# System prompt passed as `system=` on every parse request (see parse_message).
# It asks the model for a bare JSON object whose six keys mirror _FIELDS, with JSON
# null for anything absent. The text is part of the model contract: changing any
# character changes what the model is asked to do.
SYSTEM = (
"You extract structured investor-intake data from a short message a venture-fund "
"team member typed. Reply with ONLY a JSON object, no prose, with these keys:\n"
' "intent": "new_investor" if the message introduces a new investor or prospect, '
'"meeting_note" if it logs a note/update about an investor, else "unclear".\n'
' "investor_name": the investing firm or entity name (e.g. "Acme Capital"), or null.\n'
' "contact_name": the individual person mentioned, or null.\n'
' "contact_email": the person\'s email if explicitly present, else null. Never invent one.\n'
' "contact_title": the person\'s role/title if stated, else null.\n'
' "note": any meeting note, context, or next step, else null.\n'
"Use null (not empty string) for anything not present. Output JSON only."
)
# Deliberately loose address shape: <non-space, non-@ run> @ <run> . <run>.
# Because the runs accept any non-whitespace character, a match can include
# punctuation that was touching the address in the message (brackets, commas, ...).
_EMAIL_RE = re.compile(r"[^@\s]+@[^@\s]+\.[^@\s]+")
# The only intent values a normalized proposal may carry; anything else becomes "unclear".
_VALID_INTENTS = {"new_investor", "meeting_note", "unclear"}
# Keys every normalized proposal dict contains, in the order the SYSTEM prompt lists them.
_FIELDS = ("intent", "investor_name", "contact_name", "contact_email", "contact_title", "note")
def _clean(v):
if v is None:
return None
s = str(v).strip()
if not s or s.lower() in ("null", "none", "n/a", "na", "unknown"):
return None
return s
def _source_emails(text):
    """Return every e-mail address found in *text*, in order, without wrapper punctuation."""
    found = []
    for match in _EMAIL_RE.finditer(text or ""):
        # _EMAIL_RE is loose and swallows punctuation touching the address on either
        # side, e.g. "(jane@acme.com)," or "<mailto:jane@acme.com>". Trim both edges.
        addr = match.group(0).rstrip(".,;:!?)]}>\"'").lstrip("([{<\"'")
        if addr[:7].lower() == "mailto:":
            addr = addr[7:]
        # Trimming can leave a non-address fragment ("a@b" from "a@b.)"); drop those.
        if _EMAIL_RE.fullmatch(addr):
            found.append(addr)
    return found


def normalize(raw, source_text=""):
    """Coerce the model's dict into a stable proposal shape.

    Args:
        raw: The dict decoded from the model's JSON reply, or None/empty. Missing keys
            and placeholder values are treated as absent.
        source_text: The message the human typed. It is the only trusted source of
            e-mail addresses.

    Returns:
        A dict containing every key in _FIELDS. Values are stripped strings or None,
        except "intent", which is always one of _VALID_INTENTS.

    E-mail handling: the model is unreliable for verbatim strings and must never mint an
    address, so only addresses that literally appear in *source_text* are accepted (a
    wrong e-mail in the CRM is worse than none). If the model's address is one of those
    it is kept, so a message containing several addresses resolves to the one the model
    attributed to the contact; otherwise the first address in the message is used, which
    both salvages a missed address and replaces a hallucinated one. With no address in
    the message the field is None.
    """
    raw = raw or {}
    out = {k: _clean(raw.get(k)) for k in _FIELDS}

    # Tolerate cosmetic variants of the intent label ("New-Investor", "meeting note").
    intent = (out["intent"] or "").lower().replace("-", "_").replace(" ", "_")
    out["intent"] = intent if intent in _VALID_INTENTS else "unclear"

    candidates = _source_emails(source_text)
    claimed = (out["contact_email"] or "").lower()
    # Compare case-insensitively but store the human's own spelling from the message.
    verified = [addr for addr in candidates if addr.lower() == claimed]
    if verified:
        out["contact_email"] = verified[0]
    else:
        out["contact_email"] = candidates[0] if candidates else None

    # An intake with no firm AND no person is not actionable.
    if not out["investor_name"] and not out["contact_name"]:
        out["intent"] = "unclear"
    return out
def parse_message(text, parse_fn=spark.parse_json):
    """Run one intake message through the model and return the normalized proposal.

    `parse_fn` exists so tests can substitute a stub for the Spark/Qwen call; it is
    invoked as ``parse_fn(text, system=SYSTEM, max_tokens=400)`` and its result is handed
    to normalize() together with the original text. Nothing is caught here: whatever
    `parse_fn` raises on a model/transport failure propagates to the caller.
    """
    return normalize(
        parse_fn(text, system=SYSTEM, max_tokens=400),
        source_text=text,
    )