#!/usr/bin/env python3
"""Test the grounding feedback corpus assembly (CRMHandler._ground_feedback_corpus).

Verifies email bodies are wired in: matched-only, balanced round-robin merge
across email/communications/grid-notes, per-item length cap on long threads,
and graceful degradation when the email tables are absent.

Synthetic data only (guardrail #9).

Run: cd backend && python3 test_ground_corpus.py
"""
import os
import sqlite3
import sys
import tempfile
from contextlib import closing

# Set a throwaway CRM_DB_PATH before `server` is imported (presumably the
# module reads it at import time -- confirm in server.py). setdefault leaves
# an externally supplied value untouched.
os.environ.setdefault("CRM_DB_PATH", os.path.join(tempfile.mkdtemp(), "import.db"))
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import server  # noqa: E402

corpus = server.CRMHandler._ground_feedback_corpus  # unbound; method uses only (conn, limit)
FAILS = []


def check(cond, msg):
    """Print a PASS/FAIL line for ``msg`` and record it in ``FAILS`` when ``cond`` is falsy."""
    print((" PASS " if cond else " FAIL ") + msg)
    if not cond:
        FAILS.append(msg)


def mkdb(with_email=True):
    """Build a fresh on-disk SQLite database of synthetic rows and return its connection.

    The database always holds five ``communications`` rows ("comm 0".."comm 4")
    and five ``fundraising_investors`` rows ("note 0".."note 4").

    Args:
        with_email: when true, also create the ``emails`` table with five
            "matched" rows ("EMAIL 0".."EMAIL 4") and one "unmatched" row.

    Returns:
        An open ``sqlite3.Connection``; the caller is responsible for closing
        it. The temporary directory holding the file is not removed here.
    """
    db = os.path.join(tempfile.mkdtemp(), "t.db")
    c = sqlite3.connect(db)
    c.execute("CREATE TABLE communications(body TEXT, communication_date TEXT)")
    c.execute("CREATE TABLE fundraising_investors(notes TEXT)")
    c.executemany(
        "INSERT INTO communications VALUES(?,?)",
        [(f"comm {i}", f"2026-01-{i+1:02d}") for i in range(5)],
    )
    c.executemany(
        "INSERT INTO fundraising_investors VALUES(?)",
        [(f"note {i}",) for i in range(5)],
    )
    if with_email:
        c.execute("CREATE TABLE emails(body_text TEXT, match_status TEXT, sent_at TEXT)")
        c.executemany(
            "INSERT INTO emails VALUES(?,?,?)",
            [(f"EMAIL {i}", "matched", f"2026-02-{i+1:02d}") for i in range(5)]
            + [("UNMATCHED should not appear", "unmatched", "2026-03-01")],
        )
    c.commit()
    return c


def _check_matched_and_balanced():
    """Check that all three sources appear, unmatched emails do not, and order is round-robin."""
    with closing(mkdb(True)) as c:
        items = corpus(None, c, limit=9)
        check(any(x.startswith("EMAIL") for x in items),
              "matched email is represented in the corpus")
        check(all("UNMATCHED" not in x for x in items),
              "unmatched emails are excluded")
        check(any(x.startswith("comm") for x in items)
              and any(x.startswith("note") for x in items),
              "communications and grid notes still represented")
        # Guard the length first so a short corpus is reported as a FAIL
        # instead of aborting the whole run with an IndexError.
        check(len(items) >= 3
              and items[0].startswith("EMAIL")
              and items[1].startswith("comm")
              and items[2].startswith("note"),
              "sources are round-robin balanced (email not crowded out)")


def _check_degrades_without_email():
    """Check that a database without the emails table still yields all comms and notes."""
    with closing(mkdb(False)) as c:
        items2 = corpus(None, c, limit=20)
        check(len(items2) == 10 and all("EMAIL" not in x for x in items2),
              "degrades gracefully to comms+notes when email table absent")


def _check_long_body_capped():
    """Check that a single 9000-character matched email body comes back as 4000 characters."""
    db = os.path.join(tempfile.mkdtemp(), "t3.db")
    with closing(sqlite3.connect(db)) as c3:
        c3.execute("CREATE TABLE communications(body TEXT, communication_date TEXT)")
        c3.execute("CREATE TABLE fundraising_investors(notes TEXT)")
        c3.execute("CREATE TABLE emails(body_text TEXT, match_status TEXT, sent_at TEXT)")
        c3.execute("INSERT INTO emails VALUES(?,?,?)", ("X" * 9000, "matched", "2026-02-01"))
        c3.commit()
        capped = corpus(None, c3, limit=5)
        # An empty result is a failed check, not a crash.
        check(bool(capped) and len(capped[0]) == 4000,
              "long email body capped at 4000 chars for local minimize")


def main():
    """Run every corpus check, print a summary, and exit with status 1 if any check failed."""
    _check_matched_and_balanced()
    _check_degrades_without_email()
    _check_long_body_capped()
    if FAILS:
        print(f"\nFAILED ({len(FAILS)})")
        for failure in FAILS:
            print(" - " + failure)
        sys.exit(1)
    print("\nALL PASS (email grounding corpus wiring)")


if __name__ == "__main__":
    main()