diff --git a/.env.example b/.env.example index da159a0..6b96022 100644 --- a/.env.example +++ b/.env.example @@ -64,3 +64,7 @@ CRM_BOT_USERNAME= CRM_BOT_PASSWORD= # Set to false only if CRM_API_BASE is https with a self-signed cert. CRM_API_VERIFY_TLS=true +# Ten31 team-member names (comma-separated), fed to the parser so a teammate's name reads as +# the person DOING outreach, not the prospect ("Jonathan is chatting with Wyoming" → Wyoming). +# Optional; first names as actually used in the room. Leave empty to disable the framing. +INTAKE_TEAM_ROSTER= diff --git a/backend/matrix_intake/bot.py b/backend/matrix_intake/bot.py index b811195..741bade 100644 --- a/backend/matrix_intake/bot.py +++ b/backend/matrix_intake/bot.py @@ -34,6 +34,9 @@ async def main(): nudge = matrix_io.make_reply(client) store = proposals.ProposalStore() intake_room = mx["intake_room"] + roster = settings.team_roster() # frames the parse: teammates do outreach, aren't prospects + if roster: + print(f"matrix-intake: team roster loaded ({len(roster)} names)", flush=True) async def handle_intake(room_id, root, text): # A bare yes/no/approve typed in the MAIN timeline (not inside a proposal's thread) is @@ -44,7 +47,7 @@ async def main(): "and reply there — the note is in the thread.", root) return try: - proposal = await asyncio.to_thread(parse.parse_message, text) + proposal = await asyncio.to_thread(parse.parse_message, text, roster=roster) except Exception as exc: # Spark/Qwen unreachable or bad response await say(room_id, f"⚠️ couldn't reach the local parser: {str(exc)[:200]}", root) return @@ -121,7 +124,7 @@ async def main(): # re-run it through local Qwen (no Claude, no scrub). The human still approves the # revised card, so the draft→approve gate holds. 
try: - revised = await asyncio.to_thread(parse.revise, proposal, text) + revised = await asyncio.to_thread(parse.revise, proposal, text, roster=roster) except Exception as exc: store.put(root, proposal) await say(room_id, f"⚠️ couldn't apply that change ({str(exc)[:200]}).\n\nReply **yes** " diff --git a/backend/matrix_intake/parse.py b/backend/matrix_intake/parse.py index 1a7bacf..683bfa1 100644 --- a/backend/matrix_intake/parse.py +++ b/backend/matrix_intake/parse.py @@ -15,7 +15,9 @@ import spark SYSTEM = ( "You extract structured investor-intake data from a short message a venture-fund " - "team member typed. Reply with ONLY a JSON object, no prose, with these keys:\n" + "team member typed about their fundraising outreach. The message is a note FROM a " + "team member ABOUT an investor or prospect they are contacting. Reply with ONLY a JSON " + "object, no prose, with these keys:\n" ' "intent": "new_investor" if the message introduces a new investor or prospect, ' '"meeting_note" if it logs a note/update about an investor, else "unclear".\n' ' "investor_name": the investing firm or entity name (e.g. "Acme Capital"), or null.\n' @@ -23,9 +25,31 @@ SYSTEM = ( ' "contact_email": the person\'s email if explicitly present, else null. Never invent one.\n' ' "contact_title": the person\'s role/title if stated, else null.\n' ' "note": any meeting note, context, or next step, else null.\n' - "Use null (not empty string) for anything not present. Output JSON only." + "Use null (not empty string) for anything not present." ) +# Appended when the team roster is known, so the model reads a teammate's name as the person +# DOING the outreach, not the investor — fixes "Jonathan is chatting with Wyoming" extracting +# the teammate instead of the prospect. Names come from settings.team_roster() (INTAKE_TEAM_ROSTER). +ROSTER_FRAME = ( + "These names and initials (case-insensitive) are our OWN team members — the people doing " + "the outreach, NOT investors or prospects. 
Never extract one as investor_name or " + "contact_name: {names}. When a team member is described talking with, meeting, or chasing " + 'someone (e.g. "Jonathan is chatting with Wyoming"), the OTHER party (here "Wyoming") is ' + "the investor or prospect to extract." +) + + +def build_system(roster=None, base=SYSTEM): + """Assemble the extraction system prompt. With a `roster` (team-member names) it appends + the outreach frame so a teammate's name is read as the person doing outreach, not the + investor. JSON-only stays the last line for recency. Pure + offline-testable.""" + parts = [base] + if roster: + parts.append(ROSTER_FRAME.format(names=", ".join(roster))) + parts.append("Output JSON only.") + return "\n".join(parts) + _EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}") _VALID_INTENTS = {"new_investor", "meeting_note", "unclear"} _FIELDS = ("intent", "investor_name", "contact_name", "contact_email", "contact_title", "note") @@ -62,10 +86,11 @@ def normalize(raw, source_text=""): return out -def parse_message(text, parse_fn=spark.parse_json): - """Parse one intake message. `parse_fn` is injectable for tests (defaults to Spark/Qwen). +def parse_message(text, parse_fn=spark.parse_json, roster=None): + """Parse one intake message. `parse_fn` is injectable for tests (defaults to Spark/Qwen); + `roster` is the team-member names that frame the extraction (see build_system). Returns a normalized proposal dict. On a model/transport failure, raises (caller decides).""" - raw = parse_fn(text, system=SYSTEM, max_tokens=400) + raw = parse_fn(text, system=build_system(roster), max_tokens=400) proposal = normalize(raw, source_text=text) # Stash the original message so a later revise() can re-check email integrity against it. 
proposal["_source_text"] = text @@ -79,7 +104,7 @@ REVISE_SYSTEM = ( ' "investor_name", "contact_name", "contact_email", "contact_title", "note".\n' "Change ONLY what the instruction asks; copy every other field through unchanged. Use null " "for a field the instruction clears or that is genuinely absent. Never invent an email " - "address. Output JSON only." + "address." ) _REVISABLE = ("investor_name", "contact_name", "contact_title", "note") @@ -108,12 +133,13 @@ def _apply_revision(proposal, model_out, instruction): return out -def revise(proposal, instruction, parse_fn=spark.parse_json): +def revise(proposal, instruction, parse_fn=spark.parse_json, roster=None): """Apply a natural-language correction to a pending proposal via local Qwen; return the - revised proposal dict. `parse_fn` is injectable for tests (defaults to Spark/Qwen).""" + revised proposal dict. `parse_fn` is injectable for tests (defaults to Spark/Qwen); + `roster` frames the revision the same way parse_message does (see build_system).""" current = {k: proposal.get(k) for k in ("investor_name", "contact_name", "contact_email", "contact_title", "note")} prompt = ("CURRENT:\n" + json.dumps(current, ensure_ascii=False) + "\n\nINSTRUCTION:\n" + (instruction or "").strip()) - raw = parse_fn(prompt, system=REVISE_SYSTEM, max_tokens=400) + raw = parse_fn(prompt, system=build_system(roster, base=REVISE_SYSTEM), max_tokens=400) return _apply_revision(proposal, raw, instruction) diff --git a/backend/matrix_intake/settings.py b/backend/matrix_intake/settings.py index 7856730..c070c94 100644 --- a/backend/matrix_intake/settings.py +++ b/backend/matrix_intake/settings.py @@ -54,3 +54,10 @@ def crm_settings(): "password": os.environ.get("CRM_BOT_PASSWORD", ""), "verify_tls": os.environ.get("CRM_API_VERIFY_TLS", "true").lower() in ("1", "true", "yes", "on"), } + + +# Team-member names (comma-separated in INTAKE_TEAM_ROSTER), fed to the parser so a teammate's +# name reads as the person DOING outreach, not the 
investor (see parse.build_system). Optional — +# unset/empty just means no roster framing, i.e. the prior behavior. +def team_roster(): + return [n.strip() for n in os.environ.get("INTAKE_TEAM_ROSTER", "").split(",") if n.strip()] diff --git a/backend/matrix_intake/test_parse.py b/backend/matrix_intake/test_parse.py index c97da58..149186a 100644 --- a/backend/matrix_intake/test_parse.py +++ b/backend/matrix_intake/test_parse.py @@ -152,6 +152,49 @@ def test_revise_preserves_match_id(): assert revised["intent"] == "meeting_note" +def test_build_system_appends_roster_frame_only_when_roster_given(): + base = parse.build_system() + assert base.strip().endswith("Output JSON only.") + assert "doing the outreach" not in base # no roster → no outreach frame + + framed = parse.build_system(["Grant", "Jonathan", "Marty"]) + assert "Grant" in framed and "Jonathan" in framed and "Marty" in framed + assert "doing the outreach" in framed # the outreach frame is present + assert framed.strip().endswith("Output JSON only.") # JSON-only stays last for recency + + +def test_parse_message_injects_roster_into_system_prompt(): + # Capture the system prompt the model is handed, and confirm the teammate ("jonathan") + # is framed as outreach while the prospect ("wyoming") is what gets extracted. 
+ seen = {} + + def cap(text, system=None, max_tokens=400): + seen["system"] = system + return {"intent": "meeting_note", "investor_name": "Wyoming", "contact_name": None, + "note": "jonathan chatting with them"} + + p = parse.parse_message("jonathan is chatting with wyoming", parse_fn=cap, + roster=["Grant", "Jonathan", "Marty"]) + assert "Jonathan" in seen["system"] + assert "doing the outreach" in seen["system"] + assert p["investor_name"] == "Wyoming" + + +def test_revise_injects_roster_into_system_prompt(): + proposal = {"intent": "meeting_note", "investor_name": "Wyoming", "contact_name": None, + "contact_email": None, "contact_title": None, "note": "x", + "_source_text": "jonathan is chatting with wyoming"} + seen = {} + + def cap(prompt, system=None, max_tokens=400): + seen["system"] = system + return {"note": "sent the deck"} + + parse.revise(proposal, "note: sent the deck", parse_fn=cap, roster=["Grant", "Jonathan"]) + assert "Jonathan" in seen["system"] + assert "doing the outreach" in seen["system"] + + def test_revise_cannot_empty_the_proposal(): proposal = {"intent": "new_investor", "investor_name": "Acme", "contact_name": "Jane", "contact_email": None, "contact_title": None, "note": "x", "_source_text": "Acme Jane"} diff --git a/docs/guides/matrix-intake.md b/docs/guides/matrix-intake.md index ba0e099..7163b27 100644 --- a/docs/guides/matrix-intake.md +++ b/docs/guides/matrix-intake.md @@ -36,7 +36,12 @@ Spark). See *Fuzzy matching* below. Tests green (27/27 backend + the offline bot Control** (`spark.py` reuses `backend/ingest/llm.py`; temp 0, JSON only) extracts `{intent, investor_name, contact_name, contact_email, contact_title, note}`. The original message text is stashed on the proposal as `_source_text` (needed later for `revise`'s - email-integrity check). + email-integrity check). 
The system prompt is built by `parse.build_system(roster)`, which — + when a **team roster** is configured (`INTAKE_TEAM_ROSTER`, see *Config*) — appends an + **outreach frame**: those names are our own team members *doing* the outreach, so the model is + told never to extract a teammate's name as the investor/contact and to treat the *other* party as the prospect. Fixes + the live-smoke gripe where *"jonathan is chatting with wyoming"* picked the teammate, not the + prospect. `revise` gets the same framing. Roster unset → prior behavior (no frame). 2. `crm_client.match` (`GET /api/intake/match`) resolves new-vs-existing. It returns **both** an exact `match` (returns the **grid row id** so an approved note lands on exactly that investor, no duplicate) **and**, when there's no exact match, a ranked list of fuzzy `candidates` (see @@ -168,3 +173,9 @@ All in `.env` (names in `.env.example`): `MATRIX_HOMESERVER`, `MATRIX_USER`, `MATRIX_ACCESS_TOKEN`, `MATRIX_DEVICE_ID`, `MATRIX_INTAKE_ROOM`; `CRM_API_BASE`, `CRM_BOT_USERNAME`, `CRM_BOT_PASSWORD`, `CRM_API_VERIFY_TLS`. Spark settings are inherited from the ingest client (`SPARK_CONTROL_URL`, `CRM_CHAT_MODEL`). + +- **`INTAKE_TEAM_ROSTER`** (optional, comma-separated) — Ten31 team-member names that frame the + parse (see *Flow* step 1). Use the **first names as actually typed in the room** ("Grant, + Jonathan, …"). Read once at startup by `settings.team_roster()`, so **a roster change needs a + bot restart**. It lives only in the Spark's `.env` (bot-side) — no s9pk change. Empty/unset + disables the framing.