e824ff2206
Completes business-card contact capture. The transcription prompt now labels Phone/Mobile/Fax on separate lines, and the extractor maps an office/main number -> phone and a cell -> mobile, never a fax. Both carry the same digit-in-source integrity rule as email/LinkedIn: a number is kept only if its digits literally appear in the source (or, on revise, the instruction) -- never minted. The proposal card shows Phone + Mobile and they're editable (aliases phone/tel/office, mobile/cell). Server: _upsert_contact_from_fundraising now accepts contact.phone + contact.mobile and writes them to the canonical contact record (contact-level, not grid pills), shipped in s9pk v0.1.0:98. No schema change -- the contacts columns already exist. 41/41 backend suite green + the matrix_intake units; card flow end-to-end is live-smoke.
188 lines
9.9 KiB
Python
188 lines
9.9 KiB
Python
"""Turn a free-text intake message into a normalized proposal via local Qwen.
|
|
|
|
The model only EXTRACTS structure; it never decides to write anything. New-vs-existing is
|
|
finalized in M2 against the CRM matcher — here `intent` is the model's first read.
|
|
|
|
`revise()` is the conversational-edit leg: a free-form correction the human types in the
|
|
proposal thread (e.g. "add that we met June 14") is applied to the pending proposal via the
|
|
same local Qwen — no Claude, no scrub. Email integrity is preserved: a changed address must
|
|
literally appear in the instruction (or the original message); the model can never mint one.
|
|
"""
|
|
import json
|
|
import re
|
|
|
|
import spark
|
|
|
|
# System prompt for first-pass extraction: frames the message, then describes each JSON key
# the model must return. The text is sent to the model as-is, so wording changes alter behavior.
SYSTEM = (
    # Framing: who wrote the message, whom it is about, and the JSON-only reply contract.
    "You extract structured investor-intake data from a short message a venture-fund team "
    "member typed about their fundraising outreach. The message is a note FROM a team member "
    "ABOUT an investor or prospect they are contacting. Reply with ONLY a JSON object, no "
    "prose, with these keys:\n"
    # Classification of the message.
    ' "intent": "new_investor" if the message introduces a new investor or prospect, '
    '"meeting_note" if it logs a note/update about an investor, else "unclear".\n'
    # Who: the firm and the individual.
    ' "investor_name": the investing firm or entity name (e.g. "Acme Capital"), or null.\n'
    ' "contact_name": the individual person mentioned, or null.\n'
    ' "contact_email": the person\'s email if explicitly present, else null. '
    'Never invent one.\n'
    ' "contact_title": the person\'s role/title if stated, else null.\n'
    ' "city": the person\'s city or location if stated (e.g. "New York"), else null.\n'
    ' "linkedin_url": the person\'s LinkedIn URL if explicitly present, else null. '
    'Never invent one.\n'
    # Phone numbers: office/main line vs. cell; a fax is never wanted in either key.
    ' "phone": the office/main/direct phone number if present (a line labeled '
    'Phone/Tel/Office/Direct, or a single unlabeled number); never a fax or a cell. Else null.\n'
    ' "mobile": the cell/mobile number if present (a line labeled Cell/Mobile); '
    'never a fax. Else null.\n'
    # Free text, then the null convention for missing values.
    ' "note": any meeting note, context, or next step, else null.\n'
    "Use null (not empty string) for anything not present."
)
|
|
|
|
# Optional add-on to the system prompt, used when the team roster is known. It tells the model
# that a teammate's name marks the person DOING the outreach rather than the investor, which
# fixes messages like "Jonathan is chatting with Wyoming" yielding the teammate instead of the
# prospect. Names come from settings.team_roster() (INTAKE_TEAM_ROSTER). `{names}` is filled in
# with str.format by build_system.
ROSTER_FRAME = (
    "These names and initials (case-insensitive) are our OWN team members — the people "
    "doing the outreach, NOT investors or prospects. Never extract one as investor_name "
    "or contact_name: {names}. When a team member is described talking with, meeting, or "
    'chasing someone (e.g. "Jonathan is chatting with Wyoming"), the OTHER party (here '
    '"Wyoming") is the investor or prospect to extract.'
)
|
|
|
|
|
|
def build_system(roster=None, base=SYSTEM):
    """Compose the system prompt sent along with a model request.

    Starts from `base`. When `roster` (team-member names) is non-empty, ROSTER_FRAME is added
    with the names comma-joined, so a teammate reads as the person doing the outreach rather
    than the investor. An "Output JSON only." reminder is always the final line (recency).
    Does no I/O and makes no model call, so it can be tested offline.
    """
    roster_lines = [ROSTER_FRAME.format(names=", ".join(roster))] if roster else []
    return "\n".join([base, *roster_lines, "Output JSON only."])
|
|
|
|
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}")
|
|
_LINKEDIN_RE = re.compile(r"(?:https?://)?(?:[a-z]{2,3}\.)?linkedin\.com/[A-Za-z0-9_%/\-.]+", re.I)
|
|
_VALID_INTENTS = {"new_investor", "meeting_note", "unclear"}
|
|
_FIELDS = ("intent", "investor_name", "contact_name", "contact_email", "contact_title",
|
|
"city", "linkedin_url", "phone", "mobile", "note")
|
|
|
|
|
|
def _digits(s):
|
|
"""Bare digit run of a string (drops spaces/dashes/parens/dots), for phone-integrity checks."""
|
|
return re.sub(r"\D", "", str(s or ""))
|
|
|
|
|
|
def _clean(v):
|
|
if v is None:
|
|
return None
|
|
s = str(v).strip()
|
|
if not s or s.lower() in ("null", "none", "n/a", "na", "unknown"):
|
|
return None
|
|
return s
|
|
|
|
|
|
def _verbatim_match(pattern, source_text, preferred=None):
    """Return a `pattern` match copied verbatim from `source_text`, or None if there is none.

    Every candidate is taken from the source, so the result can never be a model-minted value.
    When the source holds several candidates and `preferred` (the model's pick) equals one of
    them case-insensitively, that candidate is returned; otherwise the first match is.
    """
    trailing = ".,;:!?)]}>\"'"  # sentence punctuation that can cling to the end of a match
    found = [m.group(0).rstrip(trailing) for m in pattern.finditer(source_text or "")]
    if not found:
        return None
    wanted = (preferred or "").rstrip(trailing).casefold()
    if wanted:
        for candidate in found:
            if candidate.casefold() == wanted:
                return candidate
    return found[0]


def normalize(raw, source_text=""):
    """Coerce the model's reply into a stable proposal dict holding every key in _FIELDS.

    Args:
        raw: the model's parsed JSON reply. Anything that is not a dict (None, a list, a bare
            string) is treated as an empty reply instead of raising.
        source_text: the message the human typed; the ground truth for integrity checks.

    Returns:
        A dict with all _FIELDS keys. Values are cleaned strings or None, and `intent` is
        always one of _VALID_INTENTS.

    Integrity rules applied here:
        * contact_email / linkedin_url: only a value literally present in `source_text` is
          kept. This salvages one the model missed and drops one it hallucinated. If the
          source contains several, the model's pick wins when it is one of them; otherwise
          the first match in the source is used.
        * phone / mobile: kept (in the model's formatting) only when the digit run is at
          least 7 digits long and occurs in the digits of `source_text`.
        * intent is forced to "unclear" when neither a firm nor a person was extracted.
    """
    if not isinstance(raw, dict):
        # A non-object reply has no keys to read; fall through to source-only salvage.
        raw = {}
    out = {k: _clean(raw.get(k)) for k in _FIELDS}

    intent = (out["intent"] or "").lower().replace("-", "_").replace(" ", "_")
    out["intent"] = intent if intent in _VALID_INTENTS else "unclear"

    # Email integrity: the model is unreliable for verbatim strings and must never mint an
    # address (a wrong email in the CRM is worse than none). Its value is used only to choose
    # among addresses that literally appear in what the human typed.
    out["contact_email"] = _verbatim_match(_EMAIL_RE, source_text, out["contact_email"])

    # LinkedIn integrity: same rule as email. A profile URL identifies a specific person, so
    # keep only a linkedin.com URL literally present in the source.
    out["linkedin_url"] = _verbatim_match(_LINKEDIN_RE, source_text, out["linkedin_url"])

    # Phone integrity: the model must never mint or "complete" a number. Which number is the
    # office line and which the cell is the model's call (prompted); this only validates that
    # the digits are real. (>= 7 digits avoids matching a stray short run.)
    src_digits = _digits(source_text)
    for f in ("phone", "mobile"):
        d = _digits(out[f])
        if len(d) < 7 or d not in src_digits:
            out[f] = None

    # City is left as a plain extracted field (no source gate): a wrong city is low-harm and
    # the human sees it on the card before approving, unlike a wrong email/LinkedIn.

    # An intake with no firm AND no person is not actionable.
    if not out["investor_name"] and not out["contact_name"]:
        out["intent"] = "unclear"
    return out
|
|
|
|
|
|
def parse_message(text, parse_fn=spark.parse_json, roster=None):
    """Extract a normalized proposal from one free-text intake message.

    Args:
        text: the message as typed.
        parse_fn: callable invoked as parse_fn(text, system=..., max_tokens=400). Defaults to
            Spark/Qwen and is injectable for tests.
        roster: team-member names used to frame the extraction (see build_system).

    Returns:
        The dict produced by normalize(), plus a "_source_text" key holding `text` so a later
        revise() can re-check email integrity against the original message.

    Nothing is caught here: whatever `parse_fn` raises reaches the caller, who decides.
    """
    system_prompt = build_system(roster)
    extracted = parse_fn(text, system=system_prompt, max_tokens=400)
    return {**normalize(extracted, source_text=text), "_source_text": text}
|
|
|
|
|
|
REVISE_SYSTEM = (
|
|
"You revise a structured investor-intake proposal from a short correction a venture-fund "
|
|
"team member typed. You are given the CURRENT proposal as JSON and an INSTRUCTION. Apply "
|
|
"the instruction and reply with ONLY the full revised JSON object, these keys:\n"
|
|
' "investor_name", "contact_name", "contact_email", "contact_title", "city", '
|
|
'"linkedin_url", "phone", "mobile", "note".\n'
|
|
"Change ONLY what the instruction asks; copy every other field through unchanged. Use null "
|
|
"for a field the instruction clears or that is genuinely absent. Never invent an email "
|
|
"address, a LinkedIn URL, or a phone number."
|
|
)
|
|
|
|
_REVISABLE = ("investor_name", "contact_name", "contact_title", "city", "note")
|
|
|
|
|
|
def _apply_revision(proposal, model_out, instruction):
    """Overlay the model's revised fields on a copy of `proposal` and return the copy.

    Makes no model call and does no I/O, so it can be tested offline. Keys this function does
    not handle (e.g. _match_id / _stage / intent / _source_text) are carried over untouched.

    Rules:
        * Fields in _REVISABLE are taken from `model_out` whenever the key is present there.
        * contact_email and linkedin_url are never read from `model_out`. Each changes only
          when a matching value literally appears in `instruction`; otherwise the existing
          (already integrity-checked) value stays.
        * phone / mobile take the model's value only when its digit run is at least 7 digits
          long and occurs in the digits of `instruction`; otherwise the existing value stays.
        * If the result would have neither investor_name nor contact_name, both are restored
          from the original proposal.
    """
    revised = dict(proposal)
    reply = model_out or {}
    typed = instruction or ""
    trailing = ".,;:!?)]}>\"'"  # sentence punctuation that can cling to the end of a match

    # Plain fields: trust the model, but only for keys it actually returned.
    revised.update({key: _clean(reply.get(key)) for key in _REVISABLE if key in reply})

    # Identity-bearing strings come from the human's own words, never from the model.
    for field, pattern in (("contact_email", _EMAIL_RE), ("linkedin_url", _LINKEDIN_RE)):
        hit = pattern.search(typed)
        if hit is not None:
            revised[field] = hit.group(0).rstrip(trailing)

    # Phone numbers: the model proposes, the instruction's digits must confirm.
    typed_digits = _digits(instruction)
    for field in ("phone", "mobile"):
        if field not in reply:
            continue
        candidate = _clean(reply.get(field))
        run = _digits(candidate)
        if candidate and len(run) >= 7 and run in typed_digits:
            revised[field] = candidate
        else:
            # Keep the current value; the key is created as None if it was missing.
            revised.setdefault(field, None)

    # Don't let a revision strip the proposal down to nothing actionable.
    if not (revised.get("investor_name") or revised.get("contact_name")):
        revised["investor_name"] = proposal.get("investor_name")
        revised["contact_name"] = proposal.get("contact_name")
    return revised
|
|
|
|
|
|
def revise(proposal, instruction, parse_fn=spark.parse_json, roster=None):
    """Apply a natural-language correction to a pending proposal and return the revised dict.

    Args:
        proposal: the pending proposal dict (as produced by parse_message or an earlier revise).
        instruction: the free-form correction the human typed; None is treated as "".
        parse_fn: callable invoked as parse_fn(prompt, system=..., max_tokens=400). Defaults to
            Spark/Qwen and is injectable for tests.
        roster: team-member names that frame the revision the same way parse_message does
            (see build_system).

    Returns:
        The result of _apply_revision(proposal, model_reply, instruction).

    Nothing is caught here: whatever `parse_fn` raises reaches the caller.
    """
    # Show the model every field REVISE_SYSTEM asks it to send back. If a field such as `city`
    # were left out of CURRENT, the model could not "copy it through unchanged" and would
    # answer null, and _apply_revision would then overwrite the stored value with None.
    shown_keys = ("investor_name", "contact_name", "contact_email", "contact_title", "city",
                  "linkedin_url", "phone", "mobile", "note")
    current = {k: proposal.get(k) for k in shown_keys}
    prompt = ("CURRENT:\n" + json.dumps(current, ensure_ascii=False)
              + "\n\nINSTRUCTION:\n" + (instruction or "").strip())
    raw = parse_fn(prompt, system=build_system(roster, base=REVISE_SYSTEM), max_tokens=400)
    return _apply_revision(proposal, raw, instruction)
|