Files
ten31-database/backend/test_ground_corpus.py
Keysat bf829b784a grounding: wire matched email bodies into the LP-feedback corpus
_ground_feedback_corpus now pulls matched email bodies (the richest objection
signal) alongside communications and grid notes, round-robin merged so email is
never crowded out by a flat LIMIT, per-item capped at 4000 chars to keep the local
minimize tractable on long threads, and degrading gracefully when the email tables
are absent. Email remains Tier-2-sensitive: it only ever enters the redaction
boundary, never Claude directly. Inert until Gmail capture is enrolled. Not yet
deployed (bundles into the next release with the meeting-notes work).
Test: test_ground_corpus.py.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 20:30:29 -05:00

76 lines
3.2 KiB
Python

#!/usr/bin/env python3
"""Test the grounding feedback corpus assembly (CRMHandler._ground_feedback_corpus).
Verifies email bodies are wired in: matched-only, balanced round-robin merge across
email/communications/grid-notes, per-item length cap on long threads, and graceful
degradation when the email tables are absent. Synthetic data only (guardrail #9).
Run: cd backend && python3 test_ground_corpus.py
"""
import os
import sqlite3
import sys
import tempfile
# Give CRM_DB_PATH a throwaway temp-dir value unless the environment already defines it.
# NOTE(review): this runs before `import server` below -- presumably server reads
# CRM_DB_PATH at import time; confirm before reordering.
os.environ.setdefault("CRM_DB_PATH", os.path.join(tempfile.mkdtemp(), "import.db"))
# Put this file's own directory first on sys.path so a module located next to it can be imported.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import server # noqa: E402
# Function under test, taken straight off the class; main() calls it with None in the self slot.
corpus = server.CRMHandler._ground_feedback_corpus # unbound; method uses only (conn, limit)
# Messages of every failed check(); main() prints them and exits 1 when the list is non-empty.
FAILS = []
def check(cond, msg):
    """Print a PASS/FAIL line for one assertion and remember the failures.

    cond -- truthy when the expectation held.
    msg  -- human-readable description of the expectation; printed either
            way and appended to the module-level FAILS list on failure.
    """
    if cond:
        print(" PASS " + msg)
        return
    print(" FAIL " + msg)
    FAILS.append(msg)
def mkdb(with_email=True):
    """Create a fresh on-disk SQLite database of synthetic rows and return its connection.

    The database always gets a `communications` table (5 rows, "comm 0".."comm 4")
    and a `fundraising_investors` table (5 rows, "note 0".."note 4").

    with_email -- when true, also create an `emails` table holding 5 rows with
                  match_status "matched" plus one "unmatched" row; when false
                  the `emails` table is not created at all.

    Returns the open sqlite3 connection with all inserts committed.
    """
    path = os.path.join(tempfile.mkdtemp(), "t.db")
    conn = sqlite3.connect(path)

    comm_rows = [(f"comm {n}", f"2026-01-{n+1:02d}") for n in range(5)]
    note_rows = [(f"note {n}",) for n in range(5)]

    conn.execute("CREATE TABLE communications(body TEXT, communication_date TEXT)")
    conn.execute("CREATE TABLE fundraising_investors(notes TEXT)")
    conn.executemany("INSERT INTO communications VALUES(?,?)", comm_rows)
    conn.executemany("INSERT INTO fundraising_investors VALUES(?)", note_rows)

    if not with_email:
        conn.commit()
        return conn

    # Five matched emails followed by a single unmatched one.
    email_rows = [(f"EMAIL {n}", "matched", f"2026-02-{n+1:02d}") for n in range(5)]
    email_rows.append(("UNMATCHED should not appear", "unmatched", "2026-03-01"))

    conn.execute("CREATE TABLE emails(body_text TEXT, match_status TEXT, sent_at TEXT)")
    conn.executemany("INSERT INTO emails VALUES(?,?,?)", email_rows)
    conn.commit()
    return conn
def main():
    """Run all corpus-assembly checks, print a summary, and exit 1 if any failed.

    Three scenarios are exercised against the function bound to `corpus`:
      1. communications, grid notes and emails all populated (via mkdb(True));
      2. no `emails` table at all (via mkdb(False));
      3. a single matched email whose body is 9000 characters long.

    Every expectation goes through check(), so a failing expectation is
    recorded in FAILS rather than raised. Positional lookups into the returned
    list are length-guarded for the same reason: a short or empty corpus must
    show up as a FAIL line, not abort the run with IndexError before the
    summary is printed.
    """
    # Scenario 1: all three sources present, plus one unmatched email row.
    c = mkdb(True)
    items = corpus(None, c, limit=9)
    check(any(x.startswith("EMAIL") for x in items), "matched email is represented in the corpus")
    check(all("UNMATCHED" not in x for x in items), "unmatched emails are excluded")
    check(any(x.startswith("comm") for x in items) and any(x.startswith("note") for x in items),
          "communications and grid notes still represented")
    # Length guard first so fewer than three items is reported as a failure.
    check(len(items) >= 3
          and items[0].startswith("EMAIL")
          and items[1].startswith("comm")
          and items[2].startswith("note"),
          "sources are round-robin balanced (email not crowded out)")

    # Scenario 2: the emails table does not exist; only the 5 + 5 other rows remain.
    items2 = corpus(None, mkdb(False), limit=20)
    check(len(items2) == 10 and all("EMAIL" not in x for x in items2),
          "degrades gracefully to comms+notes when email table absent")

    # Scenario 3: one oversized matched email and otherwise empty tables.
    c3 = sqlite3.connect(os.path.join(tempfile.mkdtemp(), "t3.db"))
    c3.execute("CREATE TABLE communications(body TEXT, communication_date TEXT)")
    c3.execute("CREATE TABLE fundraising_investors(notes TEXT)")
    c3.execute("CREATE TABLE emails(body_text TEXT, match_status TEXT, sent_at TEXT)")
    c3.execute("INSERT INTO emails VALUES(?,?,?)", ("X" * 9000, "matched", "2026-02-01"))
    c3.commit()
    capped = corpus(None, c3, limit=5)
    # An empty result is a failure of this expectation, not a crash.
    check(len(capped) > 0 and len(capped[0]) == 4000,
          "long email body capped at 4000 chars for local minimize")

    if FAILS:
        print(f"\nFAILED ({len(FAILS)})")
        for f in FAILS:
            print(" - " + f)
        sys.exit(1)
    print("\nALL PASS (email grounding corpus wiring)")


if __name__ == "__main__":
    main()