Files
ten31-database/backend/email_integration/parser.py
T
Keysat c7ce44d963 Phase 0 foundation: canonical schema, ingest pipeline, CRM MCP server
Workstream A–C substrate for the Ten31 agentic system:
- A1: docs/crm-overview.md; CLAUDE.md conventions + guardrail #9
- A2: additive/reversible core migration (canonical_entities, entity_links,
  interaction_log, relationship_edges, soft-delete) + ledgered runner
- B1/B3: chunking + deterministic entity resolution (backend/ingest)
- B2: dense (bge-m3) + BM25 sparse ingest to Qdrant crm_chunks
- C: CRM MCP server (reads, retrieval modes, logged writes) — no outbound tools
- docs: redaction/re-hydration, Gmail enablement runbook
- synthetic test data; .env.example; housekeeping (.gitignore, untrack crm.db,
  drop legacy files + start9/0.3.5)

Verified end-to-end on synthetic data + live Sparks (hybrid > dense on entity
queries). Real backfill runs on Ten31 infra; index holds synthetic data only.
Branch snapshot also captures pre-existing working-tree changes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 08:13:35 -05:00

284 lines
9.5 KiB
Python

"""
Parse a Gmail `users.messages.get` response (format=full) into a flat dict
ready for db.insert_email().
Input shape (abbreviated):
{
"id": "...", # Gmail message id
"threadId": "...",
"labelIds": ["INBOX","IMPORTANT",...],
"snippet": "...",
"historyId": "...",
"internalDate": "1713657600000", # ms epoch, authoritative
"sizeEstimate": 12345,
"payload": {
"headers": [{"name":"Subject","value":"..."}, ...],
"mimeType": "multipart/mixed",
"parts": [...recursive...],
"body": {"data": "<base64url>", "size": ...}
}
}
"""
import base64
import email.utils
import email.header
import re
from datetime import datetime, timezone
from typing import Any, Iterable, Optional
from html.parser import HTMLParser
# ---------------------------------------------------------------------------- public
def parse(message: dict, *, owning_account_address: Optional[str] = None) -> dict:
    """Parse a Gmail message resource into our canonical flat dict.

    Args:
        message: A Gmail ``users.messages.get`` response (``format=full``).
        owning_account_address: Stored unchanged under ``owning_account``.

    Returns:
        A dict holding the Gmail ids, RFC message/thread ids, decoded
        subject, lower-cased sender/recipient addresses, the UTC send time,
        size-capped text and HTML bodies, attachment descriptors, labels,
        and the lower-cased header map (``raw_headers``).
    """
    # Treat a missing *and* an explicitly null payload as empty, so a
    # stripped-down message resource does not raise AttributeError here.
    payload = message.get("payload") or {}
    headers = _header_map(payload.get("headers") or [])

    from_name, from_email = _split_addr(headers.get("from", ""))
    to_list = _parse_address_list(headers.get("to", ""))
    cc_list = _parse_address_list(headers.get("cc", ""))
    bcc_list = _parse_address_list(headers.get("bcc", ""))
    reply_to = _split_addr(headers.get("reply-to", ""))[1] or None

    sent_at = _parse_date_header(headers.get("date"), fallback_ms=message.get("internalDate"))

    # Messages without a Message-ID header get a synthetic id derived from
    # the Gmail id.
    rfc_mid = headers.get("message-id", "").strip() or f"synthetic-{message.get('id')}@ten31.local"
    rfc_mid = _strip_angle_brackets(rfc_mid)
    in_reply_to = _strip_angle_brackets(headers.get("in-reply-to", "").strip()) or None
    references = _split_references(headers.get("references", ""))
    # Thread root: first References entry, else the parent, else the message itself.
    rfc_thread_root_id = references[0] if references else (in_reply_to or rfc_mid)

    body_text, body_html, attachments = _walk_payload(payload)
    subject = _decode_rfc2047(headers.get("subject") or "")
    labels = message.get("labelIds") or []
    is_sent = "SENT" in labels

    return {
        "gmail_message_id": message.get("id"),
        "gmail_thread_id": message.get("threadId"),
        "rfc_message_id": rfc_mid,
        "rfc_thread_root_id": rfc_thread_root_id,
        "in_reply_to": in_reply_to,
        "references": references,
        "subject": subject,
        "from_email": (from_email or "").lower(),
        "from_name": from_name,
        "to": [{"email": e.lower(), "name": n} for n, e in to_list if e],
        "cc": [{"email": e.lower(), "name": n} for n, e in cc_list if e],
        "bcc": [{"email": e.lower(), "name": n} for n, e in bcc_list if e],
        "reply_to": reply_to.lower() if reply_to else None,
        "sent_at": sent_at,
        "body_text": _cap_text(body_text),
        "body_html": _cap_text(body_html),
        "snippet": message.get("snippet"),
        "attachments": attachments,
        "size_estimate": message.get("sizeEstimate"),
        "labels": labels,
        "is_sent": is_sent,
        "raw_headers": headers,
        "owning_account": owning_account_address,
    }
# ---------------------------------------------------------------------------- headers
def _header_map(header_list: Iterable[dict]) -> dict[str, str]:
    """Collapse ``{"name", "value"}`` header entries into a single dict.

    Keys are the lower-cased header names. When a name repeats, the value
    seen last replaces the earlier one. Missing or null names and values
    become empty strings.
    """
    return {
        (entry.get("name") or "").lower(): entry.get("value") or ""
        for entry in header_list
    }
def _decode_rfc2047(s: str) -> str:
    """Decode RFC 2047 encoded-words in a header value to plain text.

    Returns ``""`` for empty input. If decoding fails for any reason the
    original string is returned unchanged (best effort).
    """
    if not s:
        return ""

    def as_text(chunk, charset) -> str:
        # Chunks that are already str are passed through untouched.
        if not isinstance(chunk, bytes):
            return chunk
        try:
            return chunk.decode(charset or "utf-8", errors="replace")
        except LookupError:
            # Unknown charset label: fall back to UTF-8.
            return chunk.decode("utf-8", errors="replace")

    try:
        decoded = email.header.decode_header(s)
        return "".join(as_text(chunk, charset) for chunk, charset in decoded)
    except Exception:
        return s
def _split_addr(raw: str) -> tuple[Optional[str], Optional[str]]:
    """Split one address header value into ``(display_name, address)``.

    Either element is ``None`` when absent; empty input gives
    ``(None, None)``. The display name is RFC 2047-decoded.
    """
    if raw:
        display, address = email.utils.parseaddr(raw)
        return (_decode_rfc2047(display) or None, address or None)
    return (None, None)
def _parse_address_list(raw: str) -> list[tuple[Optional[str], Optional[str]]]:
    """Parse a To/Cc/Bcc-style header into ``(display_name, address)`` pairs.

    Entries without an address are dropped. Display names are RFC
    2047-decoded and become ``None`` when empty.
    """
    if not raw:
        return []
    pairs = []
    for display, address in email.utils.getaddresses([raw]):
        if not address:
            continue
        pairs.append((_decode_rfc2047(display) or None, address))
    return pairs
def _parse_date_header(raw: Optional[str], *, fallback_ms: Optional[str]) -> str:
    """Return the message time as a UTC ``YYYY-MM-DDTHH:MM:SSZ`` string.

    Sources are tried in order: the RFC ``Date`` header, then Gmail's
    ``internalDate`` (epoch milliseconds), then the current time.

    Args:
        raw: Value of the ``Date`` header, if any.
        fallback_ms: Gmail ``internalDate`` as a string of epoch milliseconds.

    Returns:
        The formatted timestamp. This never raises for malformed or
        out-of-range input; unusable sources are skipped.
    """
    fmt = "%Y-%m-%dT%H:%M:%SZ"

    if raw:
        try:
            dt = email.utils.parsedate_to_datetime(raw)
            if dt is not None:
                # A naive result (no usable zone in the header) is taken as UTC.
                if dt.tzinfo is None:
                    dt = dt.replace(tzinfo=timezone.utc)
                return dt.astimezone(timezone.utc).strftime(fmt)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: a date near datetime.max/min cannot be shifted to
            # UTC. Garbage Date headers must not abort the whole parse.
            pass

    if fallback_ms:
        try:
            dt = datetime.fromtimestamp(int(fallback_ms) / 1000.0, tz=timezone.utc)
            return dt.strftime(fmt)
        except (TypeError, ValueError, OverflowError, OSError):
            # fromtimestamp raises OverflowError/OSError for out-of-range values.
            pass

    return datetime.now(tz=timezone.utc).strftime(fmt)
def _split_references(raw: str) -> list[str]:
    """Split a References header into message ids without angle brackets.

    The header is split on whitespace; empty input yields an empty list.
    """
    if not raw:
        return []
    # str.split() with no argument never yields empty or blank tokens.
    return list(map(_strip_angle_brackets, raw.split()))
def _strip_angle_brackets(s: str) -> str:
    """Trim whitespace and remove one enclosing ``<...>`` pair, if present.

    ``None`` or empty input yields ``""``. A value that is not bracketed on
    both ends is returned trimmed but otherwise unchanged.
    """
    trimmed = (s or "").strip()
    bracketed = trimmed.startswith("<") and trimmed.endswith(">")
    return trimmed[1:-1] if bracketed else trimmed
# ---------------------------------------------------------------------------- MIME walk
def _walk_payload(payload: dict) -> tuple[Optional[str], Optional[str], list[dict]]:
    """Return ``(body_text, body_html, attachments)`` for a Gmail payload.

    Parts are visited depth-first, parent before children. The first
    text/plain part that decodes becomes body_text; the first text/html part
    that decodes becomes body_html. Any part with a filename or an
    ``attachment`` content disposition is recorded as an attachment instead.
    When no plain-text body is found, one is derived from the HTML body.
    """

    def preorder(node: dict):
        # Yield the part itself, then each child subtree in order.
        yield node
        for child in node.get("parts") or []:
            yield from preorder(child)

    plain: Optional[str] = None
    rich: Optional[str] = None
    found: list[dict] = []

    for node in preorder(payload):
        mime = (node.get("mimeType") or "").lower()
        filename = node.get("filename") or ""
        body = node.get("body") or {}
        part_headers = _header_map(node.get("headers") or [])
        disposition = (part_headers.get("content-disposition") or "").lower()

        if filename or disposition.startswith("attachment"):
            found.append({
                "filename": filename or f"unnamed.{_ext_for(mime)}",
                "mime_type": mime or "application/octet-stream",
                "size": body.get("size"),
                "gmail_attachment_id": body.get("attachmentId"),
                # Some tiny attachments come inlined as base64; attachmentId is
                # then missing and data is in body.data. sync.py handles both.
                "inline_data_b64": body.get("data"),
                "content_disposition": "inline" if disposition.startswith("inline") else "attachment",
            })
        elif mime == "text/plain" and plain is None:
            plain = _decode_body(body)
        elif mime == "text/html" and rich is None:
            rich = _decode_body(body)

    # Derive a plain-text body from HTML if only HTML exists.
    if plain is None and rich:
        plain = _strip_html(rich)
    return plain, rich, found
def _decode_body(body: dict) -> Optional[str]:
    """Decode a part body's base64url ``data`` field into text.

    Returns ``None`` when there is no data or it cannot be decoded. Bytes
    are read as UTF-8 with replacement characters, and CRLF line endings are
    normalised to LF.
    """
    encoded = body.get("data")
    if not encoded:
        return None
    try:
        # Restore the '=' padding needed to reach a multiple of four characters.
        padded = encoded + "=" * (-len(encoded) % 4)
        decoded = base64.urlsafe_b64decode(padded.encode("ascii"))
        return decoded.decode("utf-8", errors="replace").replace("\r\n", "\n")
    except Exception:
        return None
# ---------------------------------------------------------------------------- HTML stripping
class _HTMLToText(HTMLParser):
    """Minimal HTML-to-text converter.

    Collects text content, drops everything inside ``<script>``/``<style>``,
    and inserts newlines around common block-level tags. Feed markup with
    ``feed()`` and read the result with ``text()``.
    """

    def __init__(self):
        super().__init__()
        self._parts: list[str] = []  # collected text fragments and newlines
        self._skip_depth = 0  # > 0 while inside <script>/<style>

    def handle_starttag(self, tag, attrs):
        if tag in ("script", "style"):
            self._skip_depth += 1
        if tag in ("br", "p", "div", "tr", "li"):
            self._parts.append("\n")

    def handle_endtag(self, tag):
        if tag in ("script", "style"):
            # Clamp at zero so a stray closing tag cannot go negative.
            self._skip_depth = max(0, self._skip_depth - 1)
        if tag in ("p", "div", "tr"):
            self._parts.append("\n")

    def handle_data(self, data):
        if self._skip_depth == 0:
            self._parts.append(data)

    def text(self) -> str:
        """Return the collected text with runs of 3+ newlines collapsed to 2."""
        # Flush whatever the parser is still buffering. Without close(),
        # trailing text with an unterminated "&" (e.g. "AT&T" at the very end
        # of the input) is held back waiting for more data and would be lost.
        self.close()
        raw = "".join(self._parts)
        return re.sub(r"\n{3,}", "\n\n", raw).strip()
def _strip_html(html: str) -> str:
    """Convert an HTML body to plain text.

    Uses the ``_HTMLToText`` parser. If parsing raises, falls back to
    replacing every ``<...>`` tag with a single space.
    """
    extractor = _HTMLToText()
    try:
        extractor.feed(html)
        plain = extractor.text()
    except Exception:
        plain = re.sub(r"<[^>]+>", " ", html)
    return plain
def _ext_for(mime: str) -> str:
    """Return the text after the last ``/`` of a MIME type, or ``"bin"`` if it has no ``/``."""
    _, slash, subtype = mime.rpartition("/")
    if not slash:
        return "bin"
    return subtype
# ---------------------------------------------------------------------------- caps
# Keep bodies bounded to avoid a pathological 500MB message exploding the DB.
_BODY_CAP_BYTES = 10 * 1024 * 1024  # 10MB


def _cap_text(s: Optional[str]) -> Optional[str]:
    """Bound a body to ``_BODY_CAP_BYTES`` bytes of UTF-8.

    Args:
        s: Body text, or ``None``.

    Returns:
        ``None`` for ``None``; ``s`` unchanged when its UTF-8 encoding fits
        the cap; otherwise a truncated prefix followed by a truncation
        marker, with the whole result fitting within the cap.
    """
    if s is None:
        return None
    encoded = s.encode("utf-8", errors="ignore")
    if len(encoded) <= _BODY_CAP_BYTES:
        return s

    marker = "\n\n[TRUNCATED BY CRM — body exceeded 10MB]"
    # Cut on the encoded bytes, not on characters: a character-count slice can
    # still exceed the cap for multi-byte text and over-truncates ASCII.
    budget = max(0, _BODY_CAP_BYTES - len(marker.encode("utf-8")))
    # errors="ignore" drops a multi-byte character split by the cut.
    kept = encoded[:budget].decode("utf-8", errors="ignore")
    return kept + marker