From 536358093ff7fd06d5a66c3c4b08cedd95da4cc8 Mon Sep 17 00:00:00 2001 From: Keysat Date: Sat, 20 Jun 2026 10:26:27 -0500 Subject: [PATCH] Add business-card photo intake to the Matrix bot (M3) The intake bot now accepts a photo of a business card in the intake room and turns it into the same new-investor proposal a typed note would. The only new step is image -> text; everything downstream (parse, fuzzy match, in-thread approval, log-communication write) is reused unchanged. M3 was deferred only because Spark Control had no vision model. That blocker is gone: the daily-driver Qwen is vision-capable under the same model id, and the gateway forwards OpenAI multimodal content untouched, so no gateway/server/s9pk change is needed -- this ships bot-only (git pull + rebuild on the Spark). Transcribe-then-reuse (not vision-straight-to-JSON) is deliberate: the transcription becomes the source text the email-integrity rule checks against, so a mis-read address can't reach the CRM unapproved -- same guarantee as the text path. Card commits tag source="matrix_card" for the audit log. 
def chat_vision(prompt, image_b64, mime="image/jpeg", system=None, max_tokens=600, temperature=0.0):
    """Multimodal chat: one text prompt plus one base64 image, via Spark Control.

    Posts to the same ``/v1/chat/completions`` endpoint and with the same model id
    (``config.CHAT_MODEL``) as ``chat()``. The only difference is that the user
    message's ``content`` is the OpenAI multimodal array: a text part followed by
    an ``image_url`` part holding a ``data:`` URI. Thinking is switched off through
    ``chat_template_kwargs`` so the reply is fast and literal.

    Args:
        prompt: Instruction text placed in the user message's text part.
        image_b64: Base64-encoded image, without a ``data:`` prefix. ``str`` or
            ``bytes``; embedded whitespace (line-wrapped base64) is removed.
        mime: Media type written into the data-URI; a falsy value falls back to
            ``"image/jpeg"``.
        system: Optional system prompt, sent as a leading system message.
        max_tokens: Completion token cap forwarded to the endpoint.
        temperature: Sampling temperature forwarded to the endpoint.

    Returns:
        The reply's ``content`` with surrounding whitespace stripped, or ``""``
        when the model returned no content.

    Raises:
        ValueError: ``image_b64`` is empty, so there is nothing to send.
        RuntimeError: the endpoint answered with a non-200 status, or with a 200
            whose body has no ``choices[0].message``.
    """
    # Tolerate bytes: interpolating them directly would put "b'...'" in the data-URI.
    if isinstance(image_b64, (bytes, bytearray)):
        image_b64 = bytes(image_b64).decode("ascii")
    # Line-wrapped base64 is common; whitespace has no place inside the data-URI.
    payload = "".join(image_b64.split()) if image_b64 else ""
    if not payload:
        # Fail before the network round-trip instead of posting an empty image.
        raise ValueError("chat_vision: image_b64 is empty")
    mime = mime or "image/jpeg"

    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": [
        {"type": "text", "text": prompt},
        {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{payload}"}},
    ]})
    body = {"model": config.CHAT_MODEL, "messages": messages,
            "temperature": temperature, "max_tokens": max_tokens,
            "chat_template_kwargs": {"enable_thinking": False}}
    status, data = http_util.request("POST", f"{config.SPARK_CONTROL_URL}/v1/chat/completions",
                                     body, verify=config.SPARK_VERIFY_TLS)
    if status != 200:
        raise RuntimeError(f"/v1/chat/completions (vision) -> {status}: {data}")
    try:
        message = data["choices"][0]["message"]
    except (KeyError, IndexError, TypeError) as exc:
        # A 200 with an unexpected body should read as a gateway failure, not a bare KeyError.
        raise RuntimeError(f"/v1/chat/completions (vision) -> malformed response: {data}") from exc
    return (message.get("content") or "").strip()
async def handle_card(room_id, event):
    """Turn a business-card photo into a typed-note style intake.

    Downloads the image behind ``event.url``, base64-encodes it, and runs
    ``spark.transcribe_card`` in a worker thread so the blocking vision call does
    not stall the event loop. The transcription is then handed to
    ``handle_intake`` with ``source="matrix_card"``; image -> text is the only new
    step, and everything after it is the existing intake flow. The transcription
    is also the source text that the downstream email-integrity check runs against.

    Args:
        room_id: Room the image arrived in; every reply goes back to this room.
        event: The image event. Its ``event_id`` is used as the thread root for
            all replies and for the resulting proposal.

    Returns:
        None. All outcomes, including failures, are reported in the thread.
    """
    mxc = getattr(event, "url", None)
    if not mxc:
        # Unencrypted images carry a plain mxc:// url; an encrypted room delivers a different
        # event class entirely (not registered for), so this only guards the odd case.
        await say(room_id, "📇 I can only read unencrypted images right now.", event.event_id)
        return
    await say(room_id, "📇 Reading the card…", event.event_id)  # vision is slower — ack first
    try:
        resp = await client.download(mxc=mxc)
        data = getattr(resp, "body", None)
        # A DownloadError carries no bytes; a zero-byte body is just as unusable, so treat both
        # as a failed download rather than spending a vision call on an empty image.
        if not isinstance(data, (bytes, bytearray)) or not data:
            raise RuntimeError(getattr(resp, "message", None) or "image download failed")
        # Keep only the bare media type: header parameters ("; charset=...") do not belong in
        # the data-URI that the vision call builds from this value.
        raw_mime = getattr(resp, "content_type", None) or ""
        mime = raw_mime.split(";", 1)[0].strip().lower() or "image/jpeg"
        b64 = base64.b64encode(data).decode("ascii")
        text = await asyncio.to_thread(spark.transcribe_card, b64, mime)
    except Exception as exc:
        # Handler boundary: any failure becomes a visible reply instead of killing the callback.
        detail = str(exc) or type(exc).__name__
        await say(room_id, f"⚠️ couldn't read the card: {detail[:200]}", event.event_id)
        return
    text = (text or "").strip()
    if len(text) < 5:
        await say(room_id, "📇 I couldn't read any text on that card — try a clearer, "
                           "well-lit photo taken straight-on.", event.event_id)
        return
    # Frame the raw transcription so the existing extractor reads it as a new-investor intake;
    # the transcription itself is what email-integrity is checked against.
    framed = "New investor — from a business card:\n" + text
    await handle_intake(room_id, event.event_id, framed, source="matrix_card")
async def on_image(room: MatrixRoom, event: RoomMessageImage):
    """Route a fresh image in the intake room to the business-card handler.

    Card capture is intake-only, so the event is dropped when any of these hold
    (checked in this order): the bot itself sent it, it arrived in a room other
    than the intake room, or it sits inside an existing thread rather than being
    a new top-level message.
    """
    should_ignore = (
        event.sender == mx["user_id"]           # our own upload
        or room.room_id != intake_room          # Q&A / email-review rooms
        or matrix_io.thread_root_of(event)      # image dropped into an existing thread
    )
    if should_ignore:
        return
    await handle_card(room.room_id, event)
# The vision model only TRANSCRIBES the card; the existing text-parse flow then extracts the
# structured proposal from that transcription. Keeping the two steps separate (vs. asking the
# vision model for JSON directly) is deliberate: the transcription becomes the source text the
# email-integrity check runs against, so the "only keep an address that literally appears in the
# source, never let the model mint one" rule protects card intake too.
CARD_SYSTEM = (
    "You are transcribing a photo of a business card for a venture-fund team. Read every line of "
    "text on the card and write it out exactly as printed — the person's name, job title, company "
    "or firm name, email address, phone number(s), website, and mailing address. Copy the email "
    "address and phone numbers character-for-character; never guess, complete, or correct them. Do "
    "not summarize, translate, or add anything that is not printed on the card. If the image is not "
    "a readable business card, reply with the single word NONE. Output only the transcription, one "
    "item per line."
)

# Decoration a model commonly wraps around the lone NONE sentinel ("NONE.", '"None"', `NONE`).
_NONE_DECORATION = " \t\r\n.!\"'`*"


def transcribe_card(image_b64, mime="image/jpeg", chat_fn=None):
    """Vision-transcribe a business card to plain text.

    Args:
        image_b64: Base64-encoded image, passed through to the vision call as-is.
        mime: Media type of the image, forwarded as the ``mime`` keyword.
        chat_fn: Callable with the ``llm.chat_vision`` signature; injectable so
            tests can run offline. Defaults to ``llm.chat_vision``.

    Returns:
        The model's transcription with surrounding whitespace stripped, or ``""``
        when the model returned nothing or answered with the NONE sentinel
        (case-insensitive, optionally wrapped in quotes, backticks, or trailing
        punctuation).
    """
    chat_fn = chat_fn or llm.chat_vision
    reply = chat_fn("Transcribe this business card.", image_b64, mime=mime,
                    system=CARD_SYSTEM, max_tokens=600)
    out = (reply or "").strip()
    # Match the sentinel only when it is the WHOLE reply; a card that merely contains the word
    # "none" (a name, an address) must still come through verbatim.
    if out.strip(_NONE_DECORATION).upper() == "NONE":
        return ""
    return out
def test_transcribe_card_none_sentinel_becomes_empty():
    """A NONE reply (unreadable / non-card image) must come back as ''.

    Returning '' lets the bot ask for a clearer photo instead of feeding garbage
    into the intake parser.
    """
    upper_reply = spark.transcribe_card("x", chat_fn=lambda *a, **k: "NONE")
    assert upper_reply == ""
    padded_lower_reply = spark.transcribe_card("x", chat_fn=lambda *a, **k: " none ")
    assert padded_lower_reply == ""


def test_transcribe_card_strips_whitespace():
    """Surrounding whitespace in the model reply is trimmed."""
    result = spark.transcribe_card("x", chat_fn=lambda *a, **k: " Acme\n ")
    assert result == "Acme"


if __name__ == "__main__":
    # Minimal runner: execute every module-level test_* callable in name order.
    fns = []
    for k, v in sorted(globals().items()):
        if k.startswith("test_") and callable(v):
            fns.append(v)
    for fn in fns:
        fn()
        print(f"ok {fn.__name__}")
    print(f"\n{len(fns)} passed")
+CRM endpoints on the box at **v0.1.0:86**; live-smoked 2026-06-17). **M3 (business-card photo) BUILT — +bot-only, awaiting live-smoke** (the prior blocker — "Spark Control has no vision model" — is gone: +the daily-driver model is now vision-capable; see *Business-card capture* below). **Post-deploy UX pass — DEPLOYED & LIVE 2026-06-17:** fuzzy investor matching (server-side, **v0.1.0:86**, installed to the box — `candidates` endpoint verified live) + in-thread @@ -69,6 +70,47 @@ Spark). See *Fuzzy matching* below. Tests green (27/27 backend + the offline bot "reply in the thread" redirect (`store.any_pending()` guard in `handle_intake`), not a misparsed new intake. +## Business-card capture (M3 — image intake) + +Send a **photo of a business card** into the intake room and the bot turns it into the same +new-investor proposal a typed note would. The **only added step is image → text**; from there the +existing flow (parse → match → disambiguate → approve → `log-communication`) runs **unchanged** — +`handle_card` just calls `handle_intake` with the transcription. + +- **Trigger:** a top-level `m.image` event in the intake room (`on_image` → `handle_card` in + `bot.py`; registered via a second `add_event_callback(on_image, RoomMessageImage)`). Images in + the Q&A / email-review rooms, the bot's own uploads, and an image dropped **inside an existing + thread** are ignored. The card's own event becomes the proposal thread root, like a text message. 
+- **The one new call** (`spark.transcribe_card` → `llm.chat_vision`): download the image + (`client.download(mxc=event.url)` — **unencrypted only**; an E2EE room delivers a different event + class we don't register for, so encryption is naturally excluded), base64-encode, and POST an + **OpenAI multimodal** `/v1/chat/completions` to **Spark Control** — *same endpoint, same model id* + (`CRM_CHAT_MODEL`, the daily-driver Qwen, `capabilities: [vision, reasoning]`), with the user + message's `content` an array of a text part + an `image_url` data-URI. Spark Control is a **dumb + passthrough** (`image/app/llm_proxy.py`), so **no gateway change** was needed. The model + **transcribes** the card; it does not emit JSON. +- **Why transcribe-then-reuse (not vision-straight-to-JSON):** the transcription becomes the + **source text** the email-integrity rule checks against — `parse.normalize` only keeps an address + that *literally appears in the source*, never one the model mints. So a mis-read address can't + reach the CRM unapproved, exactly as on the text path, and 100% of parse/match/disambiguation/ + approval is reused. The transcription is framed (`"New investor — from a business card:\n…"`) so + the extractor reads it as a new investor. +- **Provenance:** a card commit tags `source="matrix_card"` (vs `"matrix_intake"` for a typed note) + in the audit log, threaded via the proposal's `_source` control key (`handle_intake(…, source=…)` + → `crm_client.build_commit_payload`, which defaults to `"matrix_intake"` when absent). +- **UX:** the bot acks `📇 Reading the card…` before the (slower) vision call; an unreadable image + (model replies `NONE`, or transcription < 5 chars) gets a "try a clearer, well-lit photo" reply + instead of a garbage proposal. 
+- **Deploy is bot-only** — the change lives in `backend/matrix_intake/` (`bot.py`, `spark.py`) + + `backend/ingest/llm.py` (bundled into the bot image), shipped on the **Spark** via `git pull` + + `docker compose up -d --build`. **No s9pk, no version bump, no new env** (same model; no auth on + the LAN). Contrast with M2 / email-review, whose server endpoints had to ship in the s9pk. +- **Known limits (live-smoke checklist):** ① a StartOS reverse-proxy body cap could `413` a large + photo — the model already downscales server-side (`max_pixels` ≈ 2 MP), so if it trips, add a + client-side resize (would pull Pillow into the bot image); ② iPhone **HEIC** may not decode in + vLLM's PIL — most clients (Element iOS) transcode to JPEG on upload, but confirm on-device; ③ the + offline tests stub the vision call (`test_spark.py`); the download + real OCR are **live-smoke only**. + ## Fuzzy matching (server-side, ships in the s9pk) `GET /api/intake/match` returns `{match, candidates}`. `find_intake_match` is unchanged —