c7ce44d963
Workstream A–C substrate for the Ten31 agentic system: - A1: docs/crm-overview.md; CLAUDE.md conventions + guardrail #9 - A2: additive/reversible core migration (canonical_entities, entity_links, interaction_log, relationship_edges, soft-delete) + ledgered runner - B1/B3: chunking + deterministic entity resolution (backend/ingest) - B2: dense (bge-m3) + BM25 sparse ingest to Qdrant crm_chunks - C: CRM MCP server (reads, retrieval modes, logged writes) — no outbound tools - docs: redaction/re-hydration, Gmail enablement runbook - synthetic test data; .env.example; housekeeping (.gitignore, untrack crm.db, drop legacy files + start9/0.3.5) Verified end-to-end on synthetic data + live Sparks (hybrid > dense on entity queries). Real backfill runs on Ten31 infra; index holds synthetic data only. Branch snapshot also captures pre-existing working-tree changes. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
284 lines
9.5 KiB
Python
284 lines
9.5 KiB
Python
"""
|
|
Parse a Gmail `users.messages.get` response (format=full) into a flat dict
|
|
ready for db.insert_email().
|
|
|
|
Input shape (abbreviated):
|
|
{
|
|
"id": "...", # Gmail message id
|
|
"threadId": "...",
|
|
"labelIds": ["INBOX","IMPORTANT",...],
|
|
"snippet": "...",
|
|
"historyId": "...",
|
|
"internalDate": "1713657600000", # ms epoch, authoritative
|
|
"sizeEstimate": 12345,
|
|
"payload": {
|
|
"headers": [{"name":"Subject","value":"..."}, ...],
|
|
"mimeType": "multipart/mixed",
|
|
"parts": [...recursive...],
|
|
"body": {"data": "<base64url>", "size": ...}
|
|
}
|
|
}
|
|
"""
|
|
|
|
import base64
|
|
import email.utils
|
|
import email.header
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from typing import Any, Iterable, Optional
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
# ---------------------------------------------------------------------------- public
|
|
|
|
def parse(message: dict, *, owning_account_address: Optional[str] = None) -> dict:
    """Flatten a Gmail ``users.messages.get`` (format=full) response.

    Args:
        message: The decoded JSON message resource.
        owning_account_address: Stored verbatim under ``"owning_account"``.

    Returns:
        A flat dict with identifiers, threading info, decoded subject,
        lower-cased addresses, a UTC ``sent_at`` string, size-capped bodies,
        attachment descriptors, labels and the lower-cased raw header map.
    """
    payload = message.get("payload", {})
    headers = _header_map(payload.get("headers") or [])

    def recipients(header_name: str) -> list[dict]:
        # One {"email", "name"} entry per parsed address; emails lower-cased.
        pairs = _parse_address_list(headers.get(header_name, ""))
        return [{"email": addr.lower(), "name": name} for name, addr in pairs if addr]

    sender_name, sender_email = _split_addr(headers.get("from", ""))
    reply_to_addr = _split_addr(headers.get("reply-to", ""))[1] or None

    timestamp = _parse_date_header(
        headers.get("date"), fallback_ms=message.get("internalDate")
    )

    # Messages without a Message-ID header get a synthetic, per-message id.
    message_id = headers.get("message-id", "").strip()
    if not message_id:
        message_id = f"synthetic-{message.get('id')}@ten31.local"
    message_id = _strip_angle_brackets(message_id)

    parent_id = _strip_angle_brackets(headers.get("in-reply-to", "").strip()) or None
    reference_ids = _split_references(headers.get("references", ""))

    # Thread root: first References entry, else In-Reply-To, else the message itself.
    if reference_ids:
        thread_root = reference_ids[0]
    else:
        thread_root = parent_id or message_id

    plain_body, html_body, attachment_list = _walk_payload(payload)
    label_ids = message.get("labelIds") or []

    return {
        "gmail_message_id": message.get("id"),
        "gmail_thread_id": message.get("threadId"),
        "rfc_message_id": message_id,
        "rfc_thread_root_id": thread_root,
        "in_reply_to": parent_id,
        "references": reference_ids,
        "subject": _decode_rfc2047(headers.get("subject") or ""),
        "from_email": (sender_email or "").lower(),
        "from_name": sender_name,
        "to": recipients("to"),
        "cc": recipients("cc"),
        "bcc": recipients("bcc"),
        "reply_to": reply_to_addr.lower() if reply_to_addr else None,
        "sent_at": timestamp,
        "body_text": _cap_text(plain_body),
        "body_html": _cap_text(html_body),
        "snippet": message.get("snippet"),
        "attachments": attachment_list,
        "size_estimate": message.get("sizeEstimate"),
        "labels": label_ids,
        "is_sent": "SENT" in label_ids,
        "raw_headers": headers,
        "owning_account": owning_account_address,
    }
|
|
|
|
|
|
# ---------------------------------------------------------------------------- headers
|
|
|
|
def _header_map(header_list: Iterable[dict]) -> dict[str, str]:
    """Collapse a list of ``{"name", "value"}`` dicts into one mapping.

    Names are lower-cased so lookups are case-insensitive. When a header
    repeats, the last occurrence wins. Missing names or values become "".
    """
    return {
        (entry.get("name") or "").lower(): entry.get("value") or ""
        for entry in header_list
    }
|
|
|
|
|
|
def _decode_rfc2047(s: str) -> str:
    """Decode RFC 2047 encoded-words (``=?charset?q?...?=``) in a header value.

    Args:
        s: Raw header value; may mix plain text and encoded-words.

    Returns:
        The decoded text. An empty or None input yields "". If decoding fails
        for any reason the input is returned unchanged (best effort).
    """
    if not s:
        return ""
    try:
        pieces = []
        for text, charset in email.header.decode_header(s):
            if not isinstance(text, bytes):
                # Header contained no encoded-words at all: already a str.
                pieces.append(text)
            elif charset is None:
                # Un-encoded run inside a mixed header. decode_header hands
                # these back as raw-unicode-escape bytes, so decoding them as
                # UTF-8 would corrupt every non-ASCII character. Reverse the
                # same codec instead.
                try:
                    pieces.append(text.decode("raw-unicode-escape"))
                except UnicodeDecodeError:
                    # A literal backslash sequence (e.g. "C:\users") is not a
                    # valid escape; latin-1 keeps such text intact.
                    pieces.append(text.decode("latin-1"))
            else:
                try:
                    pieces.append(text.decode(charset or "utf-8", errors="replace"))
                except LookupError:
                    # Unknown or misspelled charset label: fall back to UTF-8.
                    pieces.append(text.decode("utf-8", errors="replace"))
        return "".join(pieces)
    except Exception:
        # Deliberate best effort: a malformed header must not abort the parse.
        return s
|
|
|
|
|
|
def _split_addr(raw: str) -> tuple[Optional[str], Optional[str]]:
    """Split a single-address header into ``(display_name, address)``.

    The display name is RFC 2047-decoded. Either element is None when it is
    empty, and an empty input yields ``(None, None)``.
    """
    if not raw:
        return (None, None)
    display_name, address = email.utils.parseaddr(raw)
    decoded_name = _decode_rfc2047(display_name)
    return (decoded_name or None, address or None)
|
|
|
|
|
|
def _parse_address_list(raw: str) -> list[tuple[Optional[str], Optional[str]]]:
    """Parse a multi-address header into ``(display_name, address)`` pairs.

    Entries without an address are dropped. Display names are RFC
    2047-decoded, with None standing in for an empty name.
    """
    if not raw:
        return []
    entries = []
    for display_name, address in email.utils.getaddresses([raw]):
        if not address:
            continue
        entries.append((_decode_rfc2047(display_name) or None, address))
    return entries
|
|
|
|
|
|
def _parse_date_header(raw: Optional[str], *, fallback_ms: Optional[str]) -> str:
    """Return the message timestamp as a UTC ``YYYY-MM-DDTHH:MM:SSZ`` string.

    Resolution order:
      1. The RFC ``Date`` header (``raw``); a value without a usable timezone
         is treated as UTC.
      2. ``fallback_ms`` (Gmail ``internalDate``, epoch milliseconds).
      3. The current time, if neither of the above can be used.

    Unparseable or out-of-range values never raise; they fall through to the
    next source.
    """
    stamp_format = "%Y-%m-%dT%H:%M:%SZ"

    if raw:
        try:
            dt = email.utils.parsedate_to_datetime(raw)
            if dt is not None:
                if dt.tzinfo is None:
                    dt = dt.replace(tzinfo=timezone.utc)
                return dt.astimezone(timezone.utc).strftime(stamp_format)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: converting to UTC can push an extreme date (e.g.
            # year 9999 with a negative offset) outside datetime's range.
            pass

    if fallback_ms:
        try:
            dt = datetime.fromtimestamp(int(fallback_ms) / 1000.0, tz=timezone.utc)
            return dt.strftime(stamp_format)
        except (TypeError, ValueError, OverflowError, OSError):
            # fromtimestamp raises OverflowError/OSError for timestamps the
            # platform cannot represent.
            pass

    return datetime.now(tz=timezone.utc).strftime(stamp_format)
|
|
|
|
|
|
def _split_references(raw: str) -> list[str]:
    """Split a ``References`` header into bare message ids, oldest first.

    Ids are whitespace-separated; surrounding angle brackets are removed.
    """
    if not raw:
        return []
    # str.split() with no argument never yields empty or blank tokens.
    return list(map(_strip_angle_brackets, raw.split()))
|
|
|
|
|
|
def _strip_angle_brackets(s: str) -> str:
    """Trim whitespace and remove one enclosing ``<...>`` pair, if present.

    None is treated as the empty string. A value bracketed on only one side
    is returned trimmed but otherwise untouched.
    """
    trimmed = (s or "").strip()
    is_bracketed = trimmed.startswith("<") and trimmed.endswith(">")
    return trimmed[1:-1] if is_bracketed else trimmed
|
|
|
|
|
|
# ---------------------------------------------------------------------------- MIME walk
|
|
|
|
def _walk_payload(payload: dict) -> tuple[Optional[str], Optional[str], list[dict]]:
    """Collect ``(body_text, body_html, attachments)`` from a MIME part tree.

    Parts are visited depth-first in document order. The first text/plain
    part that decodes wins for body_text, and the first text/html part
    likewise for body_html. Any part with a filename or an ``attachment``
    disposition is recorded as an attachment instead. If no plain-text part
    exists, body_text is derived from the HTML body.
    """
    plain: Optional[str] = None
    rich: Optional[str] = None
    found: list[dict] = []

    # Explicit stack; children are pushed reversed so they pop in document order.
    pending = [payload]
    while pending:
        node = pending.pop()
        mime = (node.get("mimeType") or "").lower()
        filename = node.get("filename") or ""
        body = node.get("body") or {}
        part_headers = _header_map(node.get("headers") or [])
        disposition = (part_headers.get("content-disposition") or "").lower()

        if filename or disposition.startswith("attachment"):
            found.append({
                "filename": filename or f"unnamed.{_ext_for(mime)}",
                "mime_type": mime or "application/octet-stream",
                "size": body.get("size"),
                "gmail_attachment_id": body.get("attachmentId"),
                # Some tiny attachments come inlined as base64; attachmentId is
                # then missing and data is in body.data. sync.py handles both.
                "inline_data_b64": body.get("data"),
                "content_disposition": "inline" if disposition.startswith("inline") else "attachment",
            })
        elif mime == "text/plain" and plain is None:
            plain = _decode_body(body)
        elif mime == "text/html" and rich is None:
            rich = _decode_body(body)

        # Children are walked even when the part itself is an attachment.
        pending.extend(reversed(node.get("parts") or []))

    # Derive a plain-text body from HTML if only HTML exists.
    if plain is None and rich:
        plain = _strip_html(rich)

    return plain, rich, found
|
|
|
|
|
|
def _decode_body(body: dict) -> Optional[str]:
    """Decode a part's base64url ``body.data`` into text.

    Missing padding is restored before decoding. Bytes are interpreted as
    UTF-8 with undecodable sequences replaced, and CRLF is normalised to LF.
    Returns None when there is no data or it cannot be base64-decoded.
    """
    encoded = body.get("data")
    if not encoded:
        return None
    try:
        # Pad to a multiple of four characters.
        padded = encoded + "=" * (-len(encoded) % 4)
        decoded = base64.urlsafe_b64decode(padded.encode("ascii"))
    except Exception:
        return None
    return decoded.decode("utf-8", errors="replace").replace("\r\n", "\n")
|
|
|
|
|
|
# ---------------------------------------------------------------------------- HTML stripping
|
|
|
|
class _HTMLToText(HTMLParser):
    """Minimal HTML-to-text converter built on the stdlib ``HTMLParser``.

    Text inside ``<script>``/``<style>`` is discarded. Block-ish tags insert
    newlines so paragraphs and rows stay separated. Callers must ``close()``
    the parser before reading ``text()`` so buffered trailing text is flushed.
    """

    def __init__(self):
        super().__init__()
        self._parts: list[str] = []  # text fragments in document order
        self._skip_depth = 0  # > 0 while inside <script>/<style>

    def handle_starttag(self, tag, attrs):
        if tag in ("script", "style"):
            self._skip_depth += 1
        if tag in ("br", "p", "div", "tr", "li"):
            self._parts.append("\n")

    def handle_endtag(self, tag):
        if tag in ("script", "style"):
            # Clamp at zero so a stray closing tag cannot unbalance the count.
            self._skip_depth = max(0, self._skip_depth - 1)
        if tag in ("p", "div", "tr"):
            self._parts.append("\n")

    def handle_data(self, data):
        if self._skip_depth == 0:
            self._parts.append(data)

    def text(self) -> str:
        """Return the collected text with runs of 3+ newlines collapsed to 2."""
        raw = "".join(self._parts)
        return re.sub(r"\n{3,}", "\n\n", raw).strip()


def _strip_html(html: str) -> str:
    """Convert an HTML fragment to plain text.

    Falls back to a crude tag-stripping regex if the parser raises.
    """
    p = _HTMLToText()
    try:
        p.feed(html)
        # feed() alone withholds trailing text that might contain a partial
        # character reference (e.g. "AT&T" at the very end). close() flushes
        # it, so that text is no longer silently dropped.
        p.close()
        return p.text()
    except Exception:
        return re.sub(r"<[^>]+>", " ", html)
|
|
|
|
|
|
def _ext_for(mime: str) -> str:
    """Guess a file extension from a MIME type.

    Returns whatever follows the last "/", or "bin" when the type has no "/".
    """
    _, slash, subtype = mime.rpartition("/")
    if not slash:
        return "bin"
    return subtype
|
|
|
|
|
|
# ---------------------------------------------------------------------------- caps
|
|
|
|
# Keep bodies bounded to avoid a pathological 500MB message exploding the DB.
_BODY_CAP_BYTES = 10 * 1024 * 1024  # 10MB

# Appended to any body that had to be cut down to fit the cap.
_TRUNCATION_NOTICE = "\n\n[TRUNCATED BY CRM — body exceeded 10MB]"


def _cap_text(s: Optional[str]) -> Optional[str]:
    """Bound a body to ``_BODY_CAP_BYTES`` of UTF-8, notice included.

    Args:
        s: Body text, or None when the message has no such body.

    Returns:
        None for None. ``s`` unchanged when its UTF-8 encoding fits the cap.
        Otherwise the longest prefix that, together with the truncation
        notice, still fits the cap.
    """
    if s is None:
        return None
    encoded = s.encode("utf-8", errors="ignore")
    if len(encoded) <= _BODY_CAP_BYTES:
        return s
    # Cut in bytes, not characters: a character-count slice leaves multi-byte
    # text (CJK, emoji) far over the cap. errors="ignore" drops a code point
    # split by the cut.
    budget = _BODY_CAP_BYTES - len(_TRUNCATION_NOTICE.encode("utf-8"))
    head = encoded[:budget].decode("utf-8", errors="ignore")
    return head + _TRUNCATION_NOTICE
|