Email search/query + windowed digest preview (v0.1.0:83)

Communications tab (search/query roadmap items 1 & 2):
- Fix the investor dropdown: the facet only listed grid investors, so it
  came back empty whenever email matched a classic contact or org domain
  (no grid id — the common case). It now mirrors the email list, resolving
  each link to a typed identity (fund:/org:/contact:/addr:) with precedence
  grid -> org -> contact -> address; investor_id accepts the typed key
  (bare id = fund: for back-compat) and an unknown prefix matches nothing.
- Add a date-range filter and a click-to-expand full-body view
  (GET /api/email/detail, admin, soft-delete-gated; body_text only, never
  raw remote HTML).
- Add a "Search content" mode: GET /api/email/search wraps the ingest
  hybrid_search over the Qdrant email index (doc_type=email), hydrated and
  soft-delete-filtered against SQLite (canonical), 503 if Spark/Qdrant down.

Daily digest:
- Settings -> Admin builds a digest over a chosen window (last 24h or since
  a date) as an in-app preview before sending (POST /api/admin/digest/preview),
  so the local-Spark summarizer can be verified on demand even on a quiet day.
  Manual send uses the same window; neither advances the daily cursor, so a
  preview never suppresses the scheduled digest.

Code-only, migrations no-op. 22/22 backend tests, render-smoke pass.
This commit is contained in:
Keysat
2026-06-16 20:46:15 -05:00
parent c29ac2f2ee
commit c7b74a2704
14 changed files with 989 additions and 138 deletions
+95
View File
@@ -34,6 +34,8 @@ _GET_ROUTES = {
"/api/email/status": "status",
"/api/email/accounts": "list_accounts",
"/api/email/activity": "activity",
"/api/email/detail": "detail",
"/api/email/search": "search",
"/api/email/threads": "list_threads",
"/api/email/oauth/start": "oauth_start",
"/api/email/oauth/callback": "oauth_callback",
@@ -208,6 +210,8 @@ def _h_activity(handler):
account_id=(q.get("account_id") or "").strip() or None,
search=(q.get("q") or q.get("search") or "").strip() or None,
direction=(q.get("direction") or "").strip() or None,
since=(q.get("since") or "").strip() or None,
until=(q.get("until") or "").strip() or None,
limit=limit,
)
finally:
@@ -215,6 +219,97 @@ def _h_activity(handler):
handler.send_json(result)
def _h_detail(handler):
    """Return the stored detail record for a single captured email.

    Admin-only: the full body and recipients of a captured email are
    admin-scoped, the same as the activity list this view expands from.

    Reads the ``id`` query parameter. Replies with HTTP 400 when it is missing
    or blank, HTTP 404 when ``_db.query_email_detail`` yields ``None``, and
    otherwise sends the record as JSON.
    """
    # _require_admin returns a falsy value when the caller is not allowed;
    # in that case there is nothing more to do here.
    if not _require_admin(handler):
        return

    params = handler.get_query_params()
    email_id = (params.get("id") or "").strip()
    if not email_id:
        return handler.send_error_json("id required", 400)

    # Keep the connection open only for the duration of the lookup.
    db_conn = _conn()
    try:
        record = _db.query_email_detail(db_conn, email_id)
    finally:
        db_conn.close()

    if record is None:
        return handler.send_error_json("Not found", 404)
    handler.send_json(record)
def _semantic_email_search(query: str, top_k: int) -> list:
    """Run the ingest stack's hybrid search restricted to email documents.

    Calls ``hybrid_search`` from ``ingest/search.py`` with reranking enabled
    and a filter that requires ``doc_type == "email"``, and returns its result
    unchanged (raw ranked hits; per the ingest side, payloads carry
    source_id=email_id, lp_name, date_ts, text).

    The ingest module is imported lazily from the sibling ``ingest`` directory
    because that stack (Spark Control + Qdrant + the sparse encoder) only
    ships in the Docker image. On a bare CRM the import raises, and the caller
    is expected to turn that into a 503.
    """
    import os
    import sys

    # <parent of this file's directory>/ingest
    this_file = os.path.abspath(__file__)
    parent_dir = os.path.dirname(os.path.dirname(this_file))
    ingest_dir = os.path.join(parent_dir, "ingest")

    # Make the ingest modules importable, without adding the path twice.
    if ingest_dir not in sys.path:
        sys.path.insert(0, ingest_dir)

    import search as _ingest_search  # ingest/search.py

    email_only = {"must": [{"key": "doc_type", "match": {"value": "email"}}]}
    return _ingest_search.hybrid_search(
        query,
        top_k=top_k,
        rerank=True,
        filt=email_only,
    )
def _h_search(handler):
    """Admin-only semantic search over captured email *content* (bodies).

    This is distinct from the structured subject/sender filters handled by
    ``_h_activity``. Hits come from ``_semantic_email_search`` and are then
    hydrated against SQLite, which is treated as canonical: a hit whose email
    is not returned by ``_db.search_hit_emails`` (e.g. deleted since it was
    indexed) is dropped, so a stale index entry never surfaces.

    Query parameters:
        q / query: free-text query. Blank yields an empty result set.
        top_k: number of hits to request, clamped to 1..50. Defaults to 25,
            and falls back to 25 when it is not an integer.

    Responses:
        200: ``{"query": ..., "results": [...], "count": N}`` in rank order.
        503: retrieval raised (Spark Control / Qdrant unreachable, or the
            ingest stack is not installed).
    """
    user = _require_admin(handler)
    if not user:
        return

    q = handler.get_query_params()
    query = (q.get("q") or q.get("query") or "").strip()
    if not query:
        # Same shape as the populated response, so clients can always read
        # "count" without special-casing the blank query.
        return handler.send_json({"query": "", "results": [], "count": 0})

    try:
        top_k = min(50, max(1, int(q.get("top_k", 25))))
    except (TypeError, ValueError):
        top_k = 25

    try:
        hits = _semantic_email_search(query, top_k)
    except Exception as exc:
        # Spark Control / Qdrant unreachable, or the ingest stack isn't installed.
        # Log server-side (an error can carry a URL/host); give the UI a clean 503.
        import sys
        print(f"[email-search] retrieval failed: {type(exc).__name__}: {exc}", file=sys.stderr)
        return handler.send_error_json("Content search is unavailable (Spark/Qdrant not reachable).", 503)

    # Pair each hit with its payload once; rank order is preserved throughout.
    payloads = [(h.get("payload") or {}, h) for h in hits]

    # Only real ids go to SQLite: skip hits without a source_id and collapse
    # repeats (several hits can point at the same email) while keeping order.
    ids = list(dict.fromkeys(
        p["source_id"] for p, _ in payloads if p.get("source_id") is not None
    ))

    # Hydrate + soft-delete-filter against SQLite (canonical).
    live = {}
    if ids:
        conn = _conn()
        try:
            live = _db.search_hit_emails(conn, ids)
        finally:
            conn.close()

    results = []
    for p, h in payloads:
        eid = p.get("source_id")
        row = live.get(eid)
        if not row:
            continue  # deleted since indexing, or not matched-resolvable -> drop
        results.append({
            "email_id": eid,
            "subject": row["subject"],
            "from_name": row["from_name"],
            "from_email": row["from_email"],
            "sent_at": row["sent_at"],
            "direction": row["direction"],
            "has_attachments": row["has_attachments"],
            "lp_name": p.get("lp_name"),
            "score": h.get("score"),
            # Single-line preview: flatten newlines, trim, cap at 300 chars.
            "excerpt": (h.get("text") or p.get("text") or "").replace("\n", " ").strip()[:300],
        })
    handler.send_json({"query": query, "results": results, "count": len(results)})
def _h_list_threads(handler):
user = _require_auth(handler)
if not user: