Phase 0 foundation: canonical schema, ingest pipeline, CRM MCP server

Workstream A–C substrate for the Ten31 agentic system:

- A1: docs/crm-overview.md; CLAUDE.md conventions + guardrail #9
- A2: additive/reversible core migration (canonical_entities, entity_links, interaction_log, relationship_edges, soft-delete) + ledgered runner
- B1/B3: chunking + deterministic entity resolution (backend/ingest)
- B2: dense (bge-m3) + BM25 sparse ingest to Qdrant crm_chunks
- C: CRM MCP server (reads, retrieval modes, logged writes) — no outbound tools
- docs: redaction/re-hydration, Gmail enablement runbook
- synthetic test data; .env.example; housekeeping (.gitignore, untrack crm.db, drop legacy files + start9/0.3.5)

Verified end-to-end on synthetic data + live Sparks (hybrid > dense on entity queries). Real backfill runs on Ten31 infra; index holds synthetic data only. Branch snapshot also captures pre-existing working-tree changes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 08:11:28 -05:00
parent 7027efd777
commit c7ce44d963
99 changed files with 10676 additions and 7817 deletions
@@ -0,0 +1,283 @@
+"""
+Parse a Gmail `users.messages.get` response (format=full) into a flat dict
+ready for db.insert_email().
+
+Input shape (abbreviated):
+  {
+    "id": "...",                     # Gmail message id
+    "threadId": "...",
+    "labelIds": ["INBOX","IMPORTANT",...],
+    "snippet": "...",
+    "historyId": "...",
+    "internalDate": "1713657600000", # ms epoch; fallback when the Date header is missing/unparsable
+    "sizeEstimate": 12345,
+    "payload": {
+      "headers": [{"name":"Subject","value":"..."}, ...],
+      "mimeType": "multipart/mixed",
+      "parts": [...recursive...],
+      "body": {"data": "<base64url>", "size": ...}
+    }
+  }
+"""
+
+import base64
+import email.utils
+import email.header
+import re
+from datetime import datetime, timezone
+from typing import Any, Iterable, Optional
+from html.parser import HTMLParser
+
+
+# ---------------------------------------------------------------------------- public
+
+def parse(message: dict, *, owning_account_address: Optional[str] = None) -> dict:
+    """Parse a Gmail message payload into our canonical dict shape.
+
+    Args:
+        message: A `users.messages.get` response (format=full).
+        owning_account_address: Copied verbatim into "owning_account".
+
+    Returns:
+        A flat dict holding the Gmail ids, RFC threading fields, lower-cased
+        sender/recipient addresses, a UTC "sent_at" string, size-capped
+        bodies, attachment descriptors, labels, and the header map (keyed by
+        lower-cased header name) under "raw_headers".
+    """
+    # `or {}` instead of a .get() default: the default only covers a missing
+    # key, while an explicit `"payload": null` would make the following
+    # .get() call raise AttributeError.
+    payload = message.get("payload") or {}
+    headers = _header_map(payload.get("headers") or [])
+
+    from_name, from_email = _split_addr(headers.get("from", ""))
+    to_list = _parse_address_list(headers.get("to", ""))
+    cc_list = _parse_address_list(headers.get("cc", ""))
+    bcc_list = _parse_address_list(headers.get("bcc", ""))
+    reply_to = _split_addr(headers.get("reply-to", ""))[1] or None
+
+    sent_at = _parse_date_header(headers.get("date"), fallback_ms=message.get("internalDate"))
+
+    # Messages without a Message-ID header get a synthetic one derived from
+    # the Gmail id so that rfc_message_id is never empty.
+    rfc_mid = headers.get("message-id", "").strip() or f"synthetic-{message.get('id')}@ten31.local"
+    rfc_mid = _strip_angle_brackets(rfc_mid)
+    in_reply_to = _strip_angle_brackets(headers.get("in-reply-to", "").strip()) or None
+    references = _split_references(headers.get("references", ""))
+    # Thread root: first References entry, else the parent, else the message itself.
+    rfc_thread_root_id = references[0] if references else (in_reply_to or rfc_mid)
+
+    body_text, body_html, attachments = _walk_payload(payload)
+
+    subject = _decode_rfc2047(headers.get("subject") or "")
+
+    labels = message.get("labelIds") or []
+    is_sent = "SENT" in labels
+
+    return {
+        "gmail_message_id": message.get("id"),
+        "gmail_thread_id": message.get("threadId"),
+        "rfc_message_id": rfc_mid,
+        "rfc_thread_root_id": rfc_thread_root_id,
+        "in_reply_to": in_reply_to,
+        "references": references,
+        "subject": subject,
+        "from_email": (from_email or "").lower(),
+        "from_name": from_name,
+        "to": [{"email": e.lower(), "name": n} for n, e in to_list if e],
+        "cc": [{"email": e.lower(), "name": n} for n, e in cc_list if e],
+        "bcc": [{"email": e.lower(), "name": n} for n, e in bcc_list if e],
+        "reply_to": reply_to.lower() if reply_to else None,
+        "sent_at": sent_at,
+        "body_text": _cap_text(body_text),
+        "body_html": _cap_text(body_html),
+        "snippet": message.get("snippet"),
+        "attachments": attachments,
+        "size_estimate": message.get("sizeEstimate"),
+        "labels": labels,
+        "is_sent": is_sent,
+        "raw_headers": headers,
+        "owning_account": owning_account_address,
+    }
+
+
+# ---------------------------------------------------------------------------- headers
+
+def _header_map(header_list: Iterable[dict]) -> dict[str, str]:
+    """Collapse Gmail header entries into a dict keyed by lower-cased name.
+
+    A missing or null name/value becomes "". When a name repeats, the value
+    of its last occurrence is the one kept.
+    """
+    return {
+        (entry.get("name") or "").lower(): entry.get("value") or ""
+        for entry in header_list
+    }
+
+
+def _decode_rfc2047(s: str) -> str:
+    """Decode RFC 2047 encoded-words in a header value to plain text.
+
+    Returns "" for empty input. Byte chunks are decoded with their declared
+    charset (UTF-8 when none is declared or the charset is unknown), with
+    undecodable bytes replaced. If decoding fails outright, the input is
+    returned unchanged.
+    """
+    if not s:
+        return ""
+
+    def as_text(chunk, charset):
+        # decode_header yields str for plain headers and bytes otherwise.
+        if not isinstance(chunk, bytes):
+            return chunk
+        try:
+            return chunk.decode(charset or "utf-8", errors="replace")
+        except LookupError:
+            # Unknown charset label: fall back to UTF-8.
+            return chunk.decode("utf-8", errors="replace")
+
+    try:
+        decoded = email.header.decode_header(s)
+        return "".join([as_text(chunk, charset) for chunk, charset in decoded])
+    except Exception:
+        return s
+
+
+def _split_addr(raw: str) -> tuple[Optional[str], Optional[str]]:
+    """Split a single address header into (display name, address).
+
+    The display name is RFC 2047-decoded. Either element is None when it is
+    absent, and an empty header yields (None, None).
+    """
+    if raw:
+        display, address = email.utils.parseaddr(raw)
+        return (_decode_rfc2047(display) or None, address or None)
+    return (None, None)
+
+
+def _parse_address_list(raw: str) -> list[tuple[Optional[str], Optional[str]]]:
+    """Parse a multi-address header into (display name, address) pairs.
+
+    Entries without an address are dropped. Display names are RFC 2047-decoded
+    and become None when empty. An empty header yields [].
+    """
+    if not raw:
+        return []
+    pairs = []
+    for display, address in email.utils.getaddresses([raw]):
+        if not address:
+            continue
+        pairs.append((_decode_rfc2047(display) or None, address))
+    return pairs
+
+
+def _parse_date_header(raw: Optional[str], *, fallback_ms: Optional[str]) -> str:
+    """Return the message time as a UTC `YYYY-MM-DDTHH:MM:SSZ` string.
+
+    Sources are tried in order: the RFC Date header, then Gmail's
+    internalDate (epoch milliseconds), then the current time.
+
+    Args:
+        raw: Value of the Date header, if any.
+        fallback_ms: Gmail internalDate in epoch milliseconds, if any.
+
+    Returns:
+        The formatted timestamp. This never raises on malformed input; an
+        unusable source simply falls through to the next one.
+    """
+    fmt = "%Y-%m-%dT%H:%M:%SZ"
+    if raw:
+        try:
+            dt = email.utils.parsedate_to_datetime(raw)
+            if dt is not None:
+                # A naive result (no usable zone in the header) is taken as UTC.
+                if dt.tzinfo is None:
+                    dt = dt.replace(tzinfo=timezone.utc)
+                return dt.astimezone(timezone.utc).strftime(fmt)
+        except (TypeError, ValueError, OverflowError):
+            # OverflowError: converting a boundary date (e.g. year 9999 with
+            # a negative offset) to UTC leaves datetime's supported range.
+            pass
+    if fallback_ms:
+        try:
+            dt = datetime.fromtimestamp(int(fallback_ms) / 1000.0, tz=timezone.utc)
+            return dt.strftime(fmt)
+        except (TypeError, ValueError, OverflowError, OSError):
+            # fromtimestamp() raises OverflowError/OSError for timestamps the
+            # platform cannot represent.
+            pass
+    return datetime.now(tz=timezone.utc).strftime(fmt)
+
+
+def _split_references(raw: str) -> list[str]:
+    """Split a References header into message ids without angle brackets.
+
+    Ids are separated on whitespace and returned in header order. An empty
+    header yields [].
+    """
+    # str.split() with no separator never yields empty or blank tokens.
+    tokens = raw.split() if raw else []
+    return [_strip_angle_brackets(token) for token in tokens]
+
+
+def _strip_angle_brackets(s: str) -> str:
+    """Trim whitespace and remove one enclosing `<...>` pair, if present.
+
+    Falsy input yields "". A value carrying only one of the two brackets is
+    returned trimmed but otherwise untouched.
+    """
+    cleaned = s.strip() if s else ""
+    wrapped = cleaned[:1] == "<" and cleaned[-1:] == ">"
+    return cleaned[1:-1] if wrapped else cleaned
+
+
+# ---------------------------------------------------------------------------- MIME walk
+
+def _walk_payload(payload: dict) -> tuple[Optional[str], Optional[str], list[dict]]:
+    """Collect (body_text, body_html, attachments) from a payload tree.
+
+    The tree is walked depth-first, each part before its children. The first
+    non-attachment text/plain part that decodes supplies body_text, and the
+    first such text/html part supplies body_html. A part with a filename or
+    an attachment disposition is recorded as an attachment and never
+    contributes a body, though its children are still visited. When only
+    HTML was found, body_text is derived from it by stripping the markup.
+    """
+    found: dict[str, Optional[str]] = {"text/plain": None, "text/html": None}
+    attachments: list[dict] = []
+
+    def visit(node: dict) -> None:
+        mime = (node.get("mimeType") or "").lower()
+        filename = node.get("filename") or ""
+        body = node.get("body") or {}
+        part_headers = _header_map(node.get("headers") or [])
+        disposition = (part_headers.get("content-disposition") or "").lower()
+
+        if filename or disposition.startswith("attachment"):
+            attachments.append({
+                "filename": filename or f"unnamed.{_ext_for(mime)}",
+                "mime_type": mime or "application/octet-stream",
+                "size": body.get("size"),
+                "gmail_attachment_id": body.get("attachmentId"),
+                # Small attachments may arrive inlined as base64 in body.data
+                # with no attachmentId; sync.py handles both.
+                "inline_data_b64": body.get("data"),
+                "content_disposition": "inline" if disposition.startswith("inline") else "attachment",
+            })
+        elif mime in found and found[mime] is None:
+            # First decodable part of each kind wins; a part that fails to
+            # decode leaves the slot open for a later one.
+            found[mime] = _decode_body(body)
+
+        for child in node.get("parts") or []:
+            visit(child)
+
+    visit(payload)
+
+    body_text, body_html = found["text/plain"], found["text/html"]
+    if body_text is None and body_html:
+        body_text = _strip_html(body_html)
+
+    return body_text, body_html, attachments
+
+
+def _decode_body(body: dict) -> Optional[str]:
+    """Decode a part's base64url `data` field into text.
+
+    Returns None when there is no data or it cannot be decoded. Bytes are
+    read as UTF-8 with undecodable sequences replaced, and CRLF line endings
+    are normalized to LF.
+    """
+    encoded = body.get("data")
+    if not encoded:
+        return None
+    try:
+        # Restore any missing "=" padding so the length is a multiple of four.
+        padded = encoded + "=" * (-len(encoded) % 4)
+        decoded = base64.urlsafe_b64decode(padded.encode("ascii"))
+        return decoded.decode("utf-8", errors="replace").replace("\r\n", "\n")
+    except Exception:
+        return None
+
+
+# ---------------------------------------------------------------------------- HTML stripping
+
+class _HTMLToText(HTMLParser):
+    """Minimal HTML-to-plain-text converter.
+
+    Text inside <script>/<style> is dropped, a newline is emitted around
+    block-level tags, and runs of three or more newlines are collapsed to
+    two. Feed markup with feed(), then read the result with text().
+    """
+
+    _SKIPPED = ("script", "style")
+    _BREAK_ON_OPEN = ("br", "p", "div", "tr", "li")
+    _BREAK_ON_CLOSE = ("p", "div", "tr")
+
+    def __init__(self):
+        super().__init__()
+        self._parts: list[str] = []
+        # > 0 while inside a <script>/<style> element.
+        self._skip_depth = 0
+
+    def handle_starttag(self, tag, attrs):
+        if tag in self._SKIPPED:
+            self._skip_depth += 1
+        if tag in self._BREAK_ON_OPEN:
+            self._parts.append("\n")
+
+    def handle_endtag(self, tag):
+        if tag in self._SKIPPED:
+            # Clamp at zero so a stray closing tag cannot unbalance the count.
+            self._skip_depth = max(0, self._skip_depth - 1)
+        if tag in self._BREAK_ON_CLOSE:
+            self._parts.append("\n")
+
+    def handle_data(self, data):
+        if self._skip_depth == 0:
+            self._parts.append(data)
+
+    def text(self) -> str:
+        """Return the accumulated plain text, stripped of outer whitespace."""
+        # feed() can hold back trailing input (for instance text ending in an
+        # unterminated "&" reference such as "AT&T"); close() forces the
+        # parser to emit it so the tail is not lost.
+        self.close()
+        raw = "".join(self._parts)
+        return re.sub(r"\n{3,}", "\n\n", raw).strip()
+
+
+def _strip_html(html: str) -> str:
+    """Convert an HTML body to plain text.
+
+    Uses the HTML parser; if parsing raises, falls back to replacing every
+    tag with a space.
+    """
+    parser = _HTMLToText()
+    try:
+        parser.feed(html)
+        # feed() may buffer trailing input (e.g. text ending in a bare "&"
+        # reference like "AT&T"); close() flushes it so the tail is kept.
+        parser.close()
+        return parser.text()
+    except Exception:
+        return re.sub(r"<[^>]+>", " ", html)
+
+
+def _ext_for(mime: str) -> str:
+    """Return the text after the last "/" of a MIME type, or "bin" if there is no "/"."""
+    _, slash, subtype = mime.rpartition("/")
+    if not slash:
+        return "bin"
+    return subtype
+
+
+# ---------------------------------------------------------------------------- caps
+
+# Keep bodies bounded to avoid a pathological 500MB message exploding the DB.
+_BODY_CAP_BYTES = 10 * 1024 * 1024  # 10MB
+
+
+def _cap_text(s: Optional[str]) -> Optional[str]:
+    """Bound a body to `_BODY_CAP_BYTES` of UTF-8.
+
+    Args:
+        s: Body text, or None.
+
+    Returns:
+        None for None; `s` unchanged when its UTF-8 encoding fits the cap;
+        otherwise the longest prefix whose encoding fits the cap, followed by
+        a truncation marker.
+    """
+    if s is None:
+        return None
+    # A code point never needs more than 4 UTF-8 bytes, so short strings can
+    # skip the full encode.
+    if len(s) * 4 <= _BODY_CAP_BYTES:
+        return s
+    encoded = s.encode("utf-8", errors="ignore")
+    if len(encoded) <= _BODY_CAP_BYTES:
+        return s
+    # Truncate on the byte budget rather than a character count, so that
+    # multi-byte text cannot exceed the cap; errors="ignore" drops a
+    # character that the cut split in half.
+    kept = encoded[:_BODY_CAP_BYTES].decode("utf-8", errors="ignore")
+    return kept + "\n\n[TRUNCATED BY CRM — body exceeded 10MB]"