Patch summary (free text ahead of the first "diff --git" header; patch tools skip it).

  backend/server.py
    CRMHandler._ground_feedback_corpus now builds the grounding corpus from three
    queries: matched email bodies (each truncated to 4000 characters by SUBSTR),
    communications bodies, and fundraising_investors notes. Each query is limited to
    `limit` rows; a query that raises contributes an empty bucket. The buckets are
    merged round-robin (first row of each source, then second row of each, ...) and
    the merge stops as soon as `limit` items have been collected.

  backend/test_ground_corpus.py (new)
    Stand-alone script that exercises the method against throwaway SQLite databases:
    matched-only email selection, round-robin order, behaviour when the emails table
    is missing, and the 4000-character cap.

  Reviewer note: the line structure below was rebuilt from a whitespace-collapsed copy
  of this patch. Line counts agree with the hunk headers, but indentation widths were
  reconstructed by convention; confirm the context and removed lines match
  backend/server.py exactly (or apply with whitespace tolerance) before relying on it.

diff --git a/backend/server.py b/backend/server.py
index 5c48777..56fddd0 100644
--- a/backend/server.py
+++ b/backend/server.py
@@ -3788,17 +3788,36 @@ class CRMHandler(BaseHTTPRequestHandler):
         return self.send_json({"data": res})
 
     def _ground_feedback_corpus(self, conn, limit=60):
-        """Raw LP-feedback prose for grounding (communications + grid notes). Sensitive
-        Tier-2-heavy text; ONLY ever passed into the redaction boundary, never to Claude
-        directly."""
-        items = []
-        for q in ("SELECT body FROM communications WHERE body IS NOT NULL AND TRIM(body)<>'' ORDER BY communication_date DESC LIMIT ?",
-                  "SELECT notes FROM fundraising_investors WHERE notes IS NOT NULL AND TRIM(notes)<>'' LIMIT ?"):
+        """Raw LP-feedback prose for grounding, newest-first, balanced across sources:
+        matched email bodies (the richest objection signal), logged communications, and
+        fundraising grid notes. Sensitive Tier-2-heavy text; ONLY ever passed into the
+        redaction boundary, never to Claude directly."""
+        # Email bodies are capped per item (long threads/quote-chains) to keep the local
+        # minimize tractable; only `matched` emails (tied to a known investor/contact) are
+        # pulled. Sources are round-robin merged so email is always represented even when
+        # communications/notes are plentiful, rather than crowded out by a flat LIMIT.
+        sources = (
+            "SELECT SUBSTR(body_text,1,4000) FROM emails WHERE match_status='matched' "
+            "AND body_text IS NOT NULL AND TRIM(body_text)<>'' ORDER BY sent_at DESC LIMIT ?",
+            "SELECT body FROM communications WHERE body IS NOT NULL AND TRIM(body)<>'' "
+            "ORDER BY communication_date DESC LIMIT ?",
+            "SELECT notes FROM fundraising_investors WHERE notes IS NOT NULL AND TRIM(notes)<>'' LIMIT ?",
+        )
+        buckets = []
+        for q in sources:
             try:
-                items += [r[0] for r in conn.execute(q, (limit,))]
+                buckets.append([r[0] for r in conn.execute(q, (limit,))])
             except Exception:
-                pass
-        return items[:limit]
+                buckets.append([])  # table absent (e.g. email integration not migrated) -> skip
+        items, i = [], 0
+        while len(items) < limit and any(i < len(b) for b in buckets):
+            for b in buckets:
+                if i < len(b):
+                    items.append(b[i])
+                    if len(items) >= limit:
+                        break
+            i += 1
+        return items
 
     def handle_architect_ground(self, user, body):
         """Ground an objection register in real LP feedback THROUGH the redaction boundary
diff --git a/backend/test_ground_corpus.py b/backend/test_ground_corpus.py
new file mode 100644
index 0000000..ff91a46
--- /dev/null
+++ b/backend/test_ground_corpus.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+"""Test the grounding feedback corpus assembly (CRMHandler._ground_feedback_corpus).
+
+Verifies email bodies are wired in: matched-only, balanced round-robin merge across
+email/communications/grid-notes, per-item length cap on long threads, and graceful
+degradation when the email tables are absent. Synthetic data only (guardrail #9).
+Run: cd backend && python3 test_ground_corpus.py
+"""
+import os
+import sqlite3
+import sys
+import tempfile
+
+os.environ.setdefault("CRM_DB_PATH", os.path.join(tempfile.mkdtemp(), "import.db"))
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import server  # noqa: E402
+
+corpus = server.CRMHandler._ground_feedback_corpus  # unbound; method uses only (conn, limit)
+FAILS = []
+
+
+def check(cond, msg):
+    print((" PASS " if cond else " FAIL ") + msg)
+    if not cond:
+        FAILS.append(msg)
+
+
+def mkdb(with_email=True):
+    db = os.path.join(tempfile.mkdtemp(), "t.db")
+    c = sqlite3.connect(db)
+    c.execute("CREATE TABLE communications(body TEXT, communication_date TEXT)")
+    c.execute("CREATE TABLE fundraising_investors(notes TEXT)")
+    c.executemany("INSERT INTO communications VALUES(?,?)", [(f"comm {i}", f"2026-01-{i+1:02d}") for i in range(5)])
+    c.executemany("INSERT INTO fundraising_investors VALUES(?)", [(f"note {i}",) for i in range(5)])
+    if with_email:
+        c.execute("CREATE TABLE emails(body_text TEXT, match_status TEXT, sent_at TEXT)")
+        c.executemany("INSERT INTO emails VALUES(?,?,?)",
+                      [(f"EMAIL {i}", "matched", f"2026-02-{i+1:02d}") for i in range(5)] +
+                      [("UNMATCHED should not appear", "unmatched", "2026-03-01")])
+    c.commit()
+    return c
+
+
+def main():
+    c = mkdb(True)
+    items = corpus(None, c, limit=9)
+    check(any(x.startswith("EMAIL") for x in items), "matched email is represented in the corpus")
+    check(all("UNMATCHED" not in x for x in items), "unmatched emails are excluded")
+    check(any(x.startswith("comm") for x in items) and any(x.startswith("note") for x in items),
+          "communications and grid notes still represented")
+    check(items[0].startswith("EMAIL") and items[1].startswith("comm") and items[2].startswith("note"),
+          "sources are round-robin balanced (email not crowded out)")
+
+    items2 = corpus(None, mkdb(False), limit=20)
+    check(len(items2) == 10 and all("EMAIL" not in x for x in items2),
+          "degrades gracefully to comms+notes when email table absent")
+
+    c3 = sqlite3.connect(os.path.join(tempfile.mkdtemp(), "t3.db"))
+    c3.execute("CREATE TABLE communications(body TEXT, communication_date TEXT)")
+    c3.execute("CREATE TABLE fundraising_investors(notes TEXT)")
+    c3.execute("CREATE TABLE emails(body_text TEXT, match_status TEXT, sent_at TEXT)")
+    c3.execute("INSERT INTO emails VALUES(?,?,?)", ("X" * 9000, "matched", "2026-02-01"))
+    c3.commit()
+    check(len(corpus(None, c3, limit=5)[0]) == 4000, "long email body capped at 4000 chars for local minimize")
+
+    if FAILS:
+        print(f"\nFAILED ({len(FAILS)})")
+        for f in FAILS:
+            print(" - " + f)
+        sys.exit(1)
+    print("\nALL PASS (email grounding corpus wiring)")
+
+
+if __name__ == "__main__":
+    main()