From 8b2eb01a65c77b4883b210b7042833cdfc9c0dad Mon Sep 17 00:00:00 2001 From: Keysat Date: Sat, 20 Jun 2026 11:07:17 -0500 Subject: [PATCH] Capture city + LinkedIn on card intake; sharpen the transcription prompt The card transcription prompt now reads emails/URLs/phones character-by-character, explicitly forbids autocompleting toward a plausible domain (the mara.com -> marac.com failure), and emits labeled lines (which also feeds the field extractor cleaner input). The extractor gains city + linkedin_url. city is a plain field (low-harm if wrong; the human sees it on the card). linkedin_url follows the email-integrity rule: kept only if it literally appears in the source / a revise instruction, never minted -- a wrong profile URL points at the wrong person. Both flow to the contact via the existing log-communication upsert (city also syncs to the grid contact pill). Phone is intentionally NOT included yet: the bot's write path can't store it until a small server-side change lands (next s9pk). See the matrix-intake guide. 
--- backend/matrix_intake/crm_client.py | 4 +++ backend/matrix_intake/parse.py | 25 +++++++++++++--- backend/matrix_intake/proposals.py | 7 ++++- backend/matrix_intake/spark.py | 21 ++++++++----- backend/matrix_intake/test_crm_client.py | 12 +++++++- backend/matrix_intake/test_parse.py | 38 ++++++++++++++++++++++++ backend/matrix_intake/test_proposals.py | 14 +++++++++ docs/guides/matrix-intake.md | 12 ++++++++ 8 files changed, 120 insertions(+), 13 deletions(-) diff --git a/backend/matrix_intake/crm_client.py b/backend/matrix_intake/crm_client.py index dfb1bd6..7d73b34 100644 --- a/backend/matrix_intake/crm_client.py +++ b/backend/matrix_intake/crm_client.py @@ -162,6 +162,10 @@ def build_commit_payload(proposal): "name": proposal.get("contact_name") or proposal.get("investor_name") or "", "email": proposal.get("contact_email") or "", "title": proposal.get("contact_title") or "", + # city + linkedin_url are already honored by the server's contact upsert + # (_upsert_contact_from_fundraising); city also syncs to the grid contact pill. + "city": proposal.get("city") or "", + "linkedin_url": proposal.get("linkedin_url") or "", } note = proposal.get("note") or "" # The CRM's grid note line uses subject-or-body for its one-line summary, so a non-empty diff --git a/backend/matrix_intake/parse.py b/backend/matrix_intake/parse.py index 683bfa1..f02ab7e 100644 --- a/backend/matrix_intake/parse.py +++ b/backend/matrix_intake/parse.py @@ -24,6 +24,8 @@ SYSTEM = ( ' "contact_name": the individual person mentioned, or null.\n' ' "contact_email": the person\'s email if explicitly present, else null. Never invent one.\n' ' "contact_title": the person\'s role/title if stated, else null.\n' + ' "city": the person\'s city or location if stated (e.g. "New York"), else null.\n' + ' "linkedin_url": the person\'s LinkedIn URL if explicitly present, else null. 
Never invent one.\n' ' "note": any meeting note, context, or next step, else null.\n' "Use null (not empty string) for anything not present." ) @@ -51,8 +53,10 @@ def build_system(roster=None, base=SYSTEM): return "\n".join(parts) _EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}") +_LINKEDIN_RE = re.compile(r"(?:https?://)?(?:[a-z]{2,3}\.)?linkedin\.com/[A-Za-z0-9_%/\-.]+", re.I) _VALID_INTENTS = {"new_investor", "meeting_note", "unclear"} -_FIELDS = ("intent", "investor_name", "contact_name", "contact_email", "contact_title", "note") +_FIELDS = ("intent", "investor_name", "contact_name", "contact_email", "contact_title", + "city", "linkedin_url", "note") def _clean(v): @@ -80,6 +84,14 @@ def normalize(raw, source_text=""): m = _EMAIL_RE.search(source_text or "") out["contact_email"] = m.group(0).rstrip(".,;:!?)]}>\"'") if m else None + # LinkedIn integrity: same rule as email β€” a profile URL identifies a specific person, so + # never let the model mint one; keep only a linkedin.com URL literally present in the source. + lm = _LINKEDIN_RE.search(source_text or "") + out["linkedin_url"] = lm.group(0).rstrip(".,;:!?)]}>\"'") if lm else None + + # City is left as a plain extracted field (no source gate): a wrong city is low-harm and the + # human sees it on the card before approving, unlike a wrong email/LinkedIn. + # An intake with no firm AND no person is not actionable. if not out["investor_name"] and not out["contact_name"]: out["intent"] = "unclear" @@ -101,13 +113,14 @@ REVISE_SYSTEM = ( "You revise a structured investor-intake proposal from a short correction a venture-fund " "team member typed. You are given the CURRENT proposal as JSON and an INSTRUCTION. 
Apply " "the instruction and reply with ONLY the full revised JSON object, these keys:\n" - ' "investor_name", "contact_name", "contact_email", "contact_title", "note".\n' + ' "investor_name", "contact_name", "contact_email", "contact_title", "city", ' + '"linkedin_url", "note".\n' "Change ONLY what the instruction asks; copy every other field through unchanged. Use null " "for a field the instruction clears or that is genuinely absent. Never invent an email " - "address." + "address or a LinkedIn URL." ) -_REVISABLE = ("investor_name", "contact_name", "contact_title", "note") +_REVISABLE = ("investor_name", "contact_name", "contact_title", "city", "note") def _apply_revision(proposal, model_out, instruction): @@ -126,6 +139,10 @@ def _apply_revision(proposal, model_out, instruction): if m: out["contact_email"] = m.group(0).rstrip(".,;:!?)]}>\"'") # else: keep proposal's current contact_email (untouched above; control key copied by dict()) + # LinkedIn follows the same rule: a revised URL is taken only if it appears in the instruction. + lm = _LINKEDIN_RE.search(instruction or "") + if lm: + out["linkedin_url"] = lm.group(0).rstrip(".,;:!?)]}>\"'") # Don't let a revision strip the proposal down to nothing actionable. 
if not out.get("investor_name") and not out.get("contact_name"): out["investor_name"] = proposal.get("investor_name") diff --git a/backend/matrix_intake/proposals.py b/backend/matrix_intake/proposals.py index 8893952..c64079e 100644 --- a/backend/matrix_intake/proposals.py +++ b/backend/matrix_intake/proposals.py @@ -18,6 +18,8 @@ _EDIT_ALIASES = { "contact": "contact_name", "person": "contact_name", "email": "contact_email", "title": "contact_title", "role": "contact_title", + "city": "city", "location": "city", + "linkedin": "linkedin_url", "linkedin_url": "linkedin_url", "li": "linkedin_url", "note": "note", } @@ -26,7 +28,8 @@ _NO = {"no", "n", "cancel", "discard", "reject", "stop", "πŸ‘Ž", "❌"} # "create a new investor anyway" replies to a disambiguation shortlist _NEW = {"new", "none", "new investor", "none of these", "create", "create new", "add new", "neither"} -_CONTENT_FIELDS = ("intent", "investor_name", "contact_name", "contact_email", "contact_title", "note") +_CONTENT_FIELDS = ("intent", "investor_name", "contact_name", "contact_email", "contact_title", + "city", "linkedin_url", "note") class ProposalStore: @@ -174,6 +177,8 @@ def render(proposal): ("Contact", proposal.get("contact_name")), ("Email", proposal.get("contact_email")), ("Title", proposal.get("contact_title")), + ("City", proposal.get("city")), + ("LinkedIn", proposal.get("linkedin_url")), ("Note", proposal.get("note")), ] for label, val in fields: diff --git a/backend/matrix_intake/spark.py b/backend/matrix_intake/spark.py index fde8584..c776ecc 100644 --- a/backend/matrix_intake/spark.py +++ b/backend/matrix_intake/spark.py @@ -27,13 +27,20 @@ def parse_json(prompt, system=None, max_tokens=400): # email-integrity check runs against, so the "only keep an address that literally appears in the # source, never let the model mint one" rule (parse.normalize) protects card intake too. CARD_SYSTEM = ( - "You are transcribing a photo of a business card for a venture-fund team. 
Read every line of " - "text on the card and write it out exactly as printed β€” the person's name, job title, company " - "or firm name, email address, phone number(s), website, and mailing address. Copy the email " - "address and phone numbers character-for-character; never guess, complete, or correct them. Do " - "not summarize, translate, or add anything that is not printed on the card. If the image is not " - "a readable business card, reply with the single word NONE. Output only the transcription, one " - "item per line." + "You are transcribing a photo of a business card. Copy the text EXACTLY as printed β€” never " + "paraphrase, translate, complete, normalize, or correct anything.\n" + "Read each of these character-by-character and reproduce every glyph precisely. Do NOT 'fix' " + "them toward a more common spelling or a well-known company's domain, and never add or drop a " + "character:\n" + " - Email: check the local part, the @, and the domain separately (transcribe 'mara.com' as " + "'mara.com', never 'marac.com').\n" + " - Phone number(s).\n" + " - Website / LinkedIn URL.\n" + "Then list, each on its own labeled line and ONLY if present on the card:\n" + " Name: Title: Company: Email: Phone: LinkedIn: City:\n" + "If a character is genuinely ambiguous, give your single best reading β€” never invent extra " + "characters to fill a gap. If the image is not a readable business card, reply with the single " + "word NONE. Output only the labeled lines, nothing else." 
) diff --git a/backend/matrix_intake/test_crm_client.py b/backend/matrix_intake/test_crm_client.py index 2132146..9956217 100644 --- a/backend/matrix_intake/test_crm_client.py +++ b/backend/matrix_intake/test_crm_client.py @@ -15,11 +15,21 @@ def test_new_investor_payload(): assert out["investor_name"] == "Acme Capital" assert out["create_investor_if_missing"] is True assert "row_id" not in out - assert out["contact"] == {"name": "Jane Doe", "email": "jane@acme.com", "title": "GP"} + assert out["contact"] == {"name": "Jane Doe", "email": "jane@acme.com", "title": "GP", + "city": "", "linkedin_url": ""} assert out["body"] == "met at conf" assert out["source"] == "matrix_intake" +def test_contact_carries_city_and_linkedin_when_present(): + p = {"intent": "new_investor", "investor_name": "Acme Capital", "contact_name": "Jane Doe", + "contact_email": "jane@acme.com", "city": "New York", + "linkedin_url": "linkedin.com/in/janedoe", "note": "met at conf"} + out = crm_client.build_commit_payload(p) + assert out["contact"]["city"] == "New York" + assert out["contact"]["linkedin_url"] == "linkedin.com/in/janedoe" + + def test_existing_investor_uses_row_id_not_create(): p = {"intent": "meeting_note", "investor_name": "Acme Capital", "contact_name": "Jane Doe", "contact_email": None, "note": "wants Q3 deck", diff --git a/backend/matrix_intake/test_parse.py b/backend/matrix_intake/test_parse.py index 149186a..8e76d48 100644 --- a/backend/matrix_intake/test_parse.py +++ b/backend/matrix_intake/test_parse.py @@ -195,6 +195,44 @@ def test_revise_injects_roster_into_system_prompt(): assert "doing the outreach" in seen["system"] +def test_city_kept_as_plain_field_and_linkedin_salvaged_from_source(): + # A card transcription carries labeled lines; city is kept as-is, LinkedIn is salvaged from + # the source text (verbatim) the same way email is. 
+ src = ("New investor β€” from a business card:\nName: Jane Doe\nCompany: Acme Capital\n" + "Email: jane@acme.com\nLinkedIn: linkedin.com/in/janedoe\nCity: New York") + p = parse.parse_message( + src, + parse_fn=_stub({"intent": "new_investor", "investor_name": "Acme Capital", + "contact_name": "Jane Doe", "contact_email": "jane@acme.com", + "city": "New York", "linkedin_url": None}), # model missed the URL + ) + assert p["city"] == "New York" + assert p["linkedin_url"] == "linkedin.com/in/janedoe" # salvaged from source + + +def test_fabricated_linkedin_dropped_when_not_in_source(): + p = parse.parse_message( + "new prospect Gamma Partners, talked to their GP", + parse_fn=_stub({"intent": "new_investor", "investor_name": "Gamma Partners", + "contact_name": "their GP", "linkedin_url": "linkedin.com/in/madeup"}), + ) + assert p["linkedin_url"] is None # model invented a URL not in the source β†’ dropped + + +def test_revise_linkedin_taken_only_from_instruction(): + proposal = {"intent": "new_investor", "investor_name": "Acme", "contact_name": "Jane", + "contact_email": "jane@acme.com", "contact_title": None, "city": None, + "linkedin_url": None, "note": None, "_source_text": "Acme Jane jane@acme.com"} + r1 = parse.revise(proposal, "her linkedin is linkedin.com/in/janedoe", + parse_fn=_stub({"linkedin_url": "linkedin.com/in/janedoe"})) + assert r1["linkedin_url"] == "linkedin.com/in/janedoe" + # model tries to set a URL but the instruction carries none β†’ keep existing (None) + r2 = parse.revise(proposal, "set her title to GP", + parse_fn=_stub({"linkedin_url": "linkedin.com/in/fake", "contact_title": "GP"})) + assert r2["linkedin_url"] is None + assert r2["contact_title"] == "GP" + + def test_revise_cannot_empty_the_proposal(): proposal = {"intent": "new_investor", "investor_name": "Acme", "contact_name": "Jane", "contact_email": None, "contact_title": None, "note": "x", "_source_text": "Acme Jane"} diff --git a/backend/matrix_intake/test_proposals.py 
b/backend/matrix_intake/test_proposals.py index a56c8f1..508f179 100644 --- a/backend/matrix_intake/test_proposals.py +++ b/backend/matrix_intake/test_proposals.py @@ -54,6 +54,20 @@ def test_interpret_edit_colon_and_alias(): assert payload == ("investor_name", "Acme Capital LLC") +def test_interpret_edit_city_and_linkedin_aliases(): + a1, p1 = proposals.interpret_reply("city: New York") + assert (a1, p1) == ("edit", ("city", "New York")) + a2, p2 = proposals.interpret_reply("linkedin=linkedin.com/in/jane") + assert (a2, p2) == ("edit", ("linkedin_url", "linkedin.com/in/jane")) + + +def test_render_shows_city_and_linkedin_when_present(): + p = {**SAMPLE, "city": "New York", "linkedin_url": "linkedin.com/in/jane"} + out = proposals.render(p) + assert "City: New York" in out + assert "LinkedIn: linkedin.com/in/jane" in out + + def test_interpret_unknown(): assert proposals.interpret_reply("maybe later")[0] == "unknown" diff --git a/docs/guides/matrix-intake.md b/docs/guides/matrix-intake.md index d01e4ea..14f303b 100644 --- a/docs/guides/matrix-intake.md +++ b/docs/guides/matrix-intake.md @@ -98,6 +98,18 @@ existing flow (parse → match → disambiguate → approve → `log-communicati - **Provenance:** a card commit tags `source="matrix_card"` (vs `"matrix_intake"` for a typed note) in the audit log, threaded via the proposal's `_source` control key (`handle_intake(…, source=…)` → `crm_client.build_commit_payload`, which defaults to `"matrix_intake"` when absent). +- **Fields captured** (`parse._FIELDS`): investor, contact, email, title, **city**, **linkedin_url**, + note. `city` is a plain extracted field (low-harm if wrong; the human sees it); `linkedin_url` + follows the **email-integrity rule** — kept only if it literally appears in the source/instruction, + never minted (a wrong profile URL points at the wrong person). 
Both ride to the contact via the + existing `log-communication` upsert (`_upsert_contact_from_fundraising` already honors them; `city` + also syncs to the grid contact pill, `linkedin_url` lands on the canonical contact record). +- **Phone is NOT captured yet (pending a server change).** The transcription reads the phone, but the + bot's write path (`_upsert_contact_from_fundraising`) doesn't accept a `contact.phone` today, so + phone is deliberately left off the card to avoid showing a field that won't persist. Enabling it is + a small additive server change (read `contact.phone` → write `contacts.phone`; contact-level, not a + grid pill field) that ships in an **s9pk** (version bump + install), paired with adding `phone` to + the bot's extractor/card/payload in the same deploy. Tracked agreed 2026-06-20. - **UX:** the bot acks `📇 Reading the card…` before the (slower) vision call; an unreadable image (model replies `NONE`, or transcription < 5 chars) gets a "try a clearer, well-lit photo" reply instead of a garbage proposal.