"""Turn a free-text intake message into a normalized proposal via local Qwen. The model only EXTRACTS structure; it never decides to write anything. New-vs-existing is finalized in M2 against the CRM matcher — here `intent` is the model's first read. """ import re import spark SYSTEM = ( "You extract structured investor-intake data from a short message a venture-fund " "team member typed. Reply with ONLY a JSON object, no prose, with these keys:\n" ' "intent": "new_investor" if the message introduces a new investor or prospect, ' '"meeting_note" if it logs a note/update about an investor, else "unclear".\n' ' "investor_name": the investing firm or entity name (e.g. "Acme Capital"), or null.\n' ' "contact_name": the individual person mentioned, or null.\n' ' "contact_email": the person\'s email if explicitly present, else null. Never invent one.\n' ' "contact_title": the person\'s role/title if stated, else null.\n' ' "note": any meeting note, context, or next step, else null.\n' "Use null (not empty string) for anything not present. Output JSON only." ) _EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}") _VALID_INTENTS = {"new_investor", "meeting_note", "unclear"} _FIELDS = ("intent", "investor_name", "contact_name", "contact_email", "contact_title", "note") def _clean(v): if v is None: return None s = str(v).strip() if not s or s.lower() in ("null", "none", "n/a", "na", "unknown"): return None return s def normalize(raw, source_text=""): """Coerce the model's dict into a stable proposal shape; salvage an email from the source text if the model missed one. Returns a dict with all _FIELDS keys.""" raw = raw or {} out = {k: _clean(raw.get(k)) for k in _FIELDS} intent = (out["intent"] or "").lower().replace("-", "_").replace(" ", "_") out["intent"] = intent if intent in _VALID_INTENTS else "unclear" # Email integrity: only accept an address that literally appears in the source message. # The model is unreliable for verbatim strings and must never mint an address — anything # not present in what the human typed is dropped (a wrong email in the CRM is worse than # none). This both salvages a missed address and rejects a hallucinated one. m = _EMAIL_RE.search(source_text or "") out["contact_email"] = m.group(0).rstrip(".,;:!?)]}>\"'") if m else None # An intake with no firm AND no person is not actionable. if not out["investor_name"] and not out["contact_name"]: out["intent"] = "unclear" return out def parse_message(text, parse_fn=spark.parse_json): """Parse one intake message. `parse_fn` is injectable for tests (defaults to Spark/Qwen). Returns a normalized proposal dict. On a model/transport failure, raises (caller decides).""" raw = parse_fn(text, system=SYSTEM, max_tokens=400) return normalize(raw, source_text=text)