"""Turn a free-text intake message into a normalized proposal via local Qwen. The model only EXTRACTS structure; it never decides to write anything. New-vs-existing is finalized in M2 against the CRM matcher — here `intent` is the model's first read. `revise()` is the conversational-edit leg: a free-form correction the human types in the proposal thread (e.g. "add that we met June 14") is applied to the pending proposal via the same local Qwen — no Claude, no scrub. Email integrity is preserved: a changed address must literally appear in the instruction (or the original message); the model can never mint one. """ import json import re import spark SYSTEM = ( "You extract structured investor-intake data from a short message a venture-fund " "team member typed about their fundraising outreach. The message is a note FROM a " "team member ABOUT an investor or prospect they are contacting. Reply with ONLY a JSON " "object, no prose, with these keys:\n" ' "intent": "new_investor" if the message introduces a new investor or prospect, ' '"meeting_note" if it logs a note/update about an investor, else "unclear".\n' ' "investor_name": the investing firm or entity name (e.g. "Acme Capital"), or null.\n' ' "contact_name": the individual person mentioned, or null.\n' ' "contact_email": the person\'s email if explicitly present, else null. Never invent one.\n' ' "contact_title": the person\'s role/title if stated, else null.\n' ' "note": any meeting note, context, or next step, else null.\n' "Use null (not empty string) for anything not present." ) # Appended when the team roster is known, so the model reads a teammate's name as the person # DOING the outreach, not the investor — fixes "Jonathan is chatting with Wyoming" extracting # the teammate instead of the prospect. Names come from settings.team_roster() (INTAKE_TEAM_ROSTER). ROSTER_FRAME = ( "These names and initials (case-insensitive) are our OWN team members — the people doing " "the outreach, NOT investors or prospects. Never extract one as investor_name or " "contact_name: {names}. When a team member is described talking with, meeting, or chasing " 'someone (e.g. "Jonathan is chatting with Wyoming"), the OTHER party (here "Wyoming") is ' "the investor or prospect to extract." ) def build_system(roster=None, base=SYSTEM): """Assemble the extraction system prompt. With a `roster` (team-member names) it appends the outreach frame so a teammate's name is read as the person doing outreach, not the investor. JSON-only stays the last line for recency. Pure + offline-testable.""" parts = [base] if roster: parts.append(ROSTER_FRAME.format(names=", ".join(roster))) parts.append("Output JSON only.") return "\n".join(parts) _EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}") _VALID_INTENTS = {"new_investor", "meeting_note", "unclear"} _FIELDS = ("intent", "investor_name", "contact_name", "contact_email", "contact_title", "note") def _clean(v): if v is None: return None s = str(v).strip() if not s or s.lower() in ("null", "none", "n/a", "na", "unknown"): return None return s def normalize(raw, source_text=""): """Coerce the model's dict into a stable proposal shape; salvage an email from the source text if the model missed one. Returns a dict with all _FIELDS keys.""" raw = raw or {} out = {k: _clean(raw.get(k)) for k in _FIELDS} intent = (out["intent"] or "").lower().replace("-", "_").replace(" ", "_") out["intent"] = intent if intent in _VALID_INTENTS else "unclear" # Email integrity: only accept an address that literally appears in the source message. # The model is unreliable for verbatim strings and must never mint an address — anything # not present in what the human typed is dropped (a wrong email in the CRM is worse than # none). This both salvages a missed address and rejects a hallucinated one. m = _EMAIL_RE.search(source_text or "") out["contact_email"] = m.group(0).rstrip(".,;:!?)]}>\"'") if m else None # An intake with no firm AND no person is not actionable. if not out["investor_name"] and not out["contact_name"]: out["intent"] = "unclear" return out def parse_message(text, parse_fn=spark.parse_json, roster=None): """Parse one intake message. `parse_fn` is injectable for tests (defaults to Spark/Qwen); `roster` is the team-member names that frame the extraction (see build_system). Returns a normalized proposal dict. On a model/transport failure, raises (caller decides).""" raw = parse_fn(text, system=build_system(roster), max_tokens=400) proposal = normalize(raw, source_text=text) # Stash the original message so a later revise() can re-check email integrity against it. proposal["_source_text"] = text return proposal REVISE_SYSTEM = ( "You revise a structured investor-intake proposal from a short correction a venture-fund " "team member typed. You are given the CURRENT proposal as JSON and an INSTRUCTION. Apply " "the instruction and reply with ONLY the full revised JSON object, these keys:\n" ' "investor_name", "contact_name", "contact_email", "contact_title", "note".\n' "Change ONLY what the instruction asks; copy every other field through unchanged. Use null " "for a field the instruction clears or that is genuinely absent. Never invent an email " "address." ) _REVISABLE = ("investor_name", "contact_name", "contact_title", "note") def _apply_revision(proposal, model_out, instruction): """Merge the model's revised fields onto the proposal. Pure + offline-testable. Preserves control keys (_match_id / _stage / intent / _source_text). Enforces email integrity: a revised address is taken only if it literally appears in the INSTRUCTION the human typed; otherwise the existing (already integrity-checked) address is kept. The model's own email field is never trusted — it must not mint an address.""" model_out = model_out or {} out = dict(proposal) for k in _REVISABLE: if k in model_out: out[k] = _clean(model_out.get(k)) m = _EMAIL_RE.search(instruction or "") if m: out["contact_email"] = m.group(0).rstrip(".,;:!?)]}>\"'") # else: keep proposal's current contact_email (untouched above; control key copied by dict()) # Don't let a revision strip the proposal down to nothing actionable. if not out.get("investor_name") and not out.get("contact_name"): out["investor_name"] = proposal.get("investor_name") out["contact_name"] = proposal.get("contact_name") return out def revise(proposal, instruction, parse_fn=spark.parse_json, roster=None): """Apply a natural-language correction to a pending proposal via local Qwen; return the revised proposal dict. `parse_fn` is injectable for tests (defaults to Spark/Qwen); `roster` frames the revision the same way parse_message does (see build_system).""" current = {k: proposal.get(k) for k in ("investor_name", "contact_name", "contact_email", "contact_title", "note")} prompt = ("CURRENT:\n" + json.dumps(current, ensure_ascii=False) + "\n\nINSTRUCTION:\n" + (instruction or "").strip()) raw = parse_fn(prompt, system=build_system(roster, base=REVISE_SYSTEM), max_tokens=400) return _apply_revision(proposal, raw, instruction)