Phase 0 foundation: canonical schema, ingest pipeline, CRM MCP server
Workstream A–C substrate for the Ten31 agentic system:
- A1: docs/crm-overview.md; CLAUDE.md conventions + guardrail #9
- A2: additive/reversible core migration (canonical_entities, entity_links, interaction_log, relationship_edges, soft-delete) + ledgered runner
- B1/B3: chunking + deterministic entity resolution (backend/ingest)
- B2: dense (bge-m3) + BM25 sparse ingest to Qdrant crm_chunks
- C: CRM MCP server (reads, retrieval modes, logged writes) — no outbound tools
- docs: redaction/re-hydration, Gmail enablement runbook
- synthetic test data; .env.example; housekeeping (.gitignore, untrack crm.db, drop legacy files + start9/0.3.5)

Verified end-to-end on synthetic data + live Sparks (hybrid > dense on entity queries). Real backfill runs on Ten31 infra; the index holds synthetic data only. The branch snapshot also captures pre-existing working-tree changes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,283 @@
|
||||
"""
|
||||
Parse a Gmail `users.messages.get` response (format=full) into a flat dict
|
||||
ready for db.insert_email().
|
||||
|
||||
Input shape (abbreviated):
|
||||
{
|
||||
"id": "...", # Gmail message id
|
||||
"threadId": "...",
|
||||
"labelIds": ["INBOX","IMPORTANT",...],
|
||||
"snippet": "...",
|
||||
"historyId": "...",
|
||||
"internalDate": "1713657600000", # ms epoch, authoritative
|
||||
"sizeEstimate": 12345,
|
||||
"payload": {
|
||||
"headers": [{"name":"Subject","value":"..."}, ...],
|
||||
"mimeType": "multipart/mixed",
|
||||
"parts": [...recursive...],
|
||||
"body": {"data": "<base64url>", "size": ...}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
import base64
|
||||
import email.utils
|
||||
import email.header
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Iterable, Optional
|
||||
from html.parser import HTMLParser
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- public
|
||||
|
||||
def parse(message: dict, *, owning_account_address: Optional[str] = None) -> dict:
    """Flatten a Gmail ``users.messages.get`` (format=full) response.

    Args:
        message: The decoded JSON message resource.
        owning_account_address: Passed through unchanged as ``owning_account``.

    Returns:
        A flat dict with ids, threading info, decoded subject, lower-cased
        addresses, capped bodies, attachment descriptors, labels and the
        lower-cased header map (``raw_headers``).
    """
    payload = message.get("payload", {})
    headers = _header_map(payload.get("headers") or [])

    def recipients(field: str) -> list:
        # Each entry is a (display name, address) pair from the header.
        pairs = _parse_address_list(headers.get(field, ""))
        return [{"email": addr.lower(), "name": name} for name, addr in pairs if addr]

    sender_name, sender_addr = _split_addr(headers.get("from", ""))
    _, reply_addr = _split_addr(headers.get("reply-to", ""))
    reply_to = reply_addr or None

    sent_at = _parse_date_header(
        headers.get("date"), fallback_ms=message.get("internalDate")
    )

    # Messages without a Message-ID header get a synthetic one derived from
    # the Gmail id so every row still has an RFC-style identifier.
    raw_message_id = headers.get("message-id", "").strip()
    if not raw_message_id:
        raw_message_id = f"synthetic-{message.get('id')}@ten31.local"
    message_id = _strip_angle_brackets(raw_message_id)

    parent_id = _strip_angle_brackets(headers.get("in-reply-to", "").strip()) or None
    references = _split_references(headers.get("references", ""))

    # Thread root: first References entry, else the parent, else this message.
    if references:
        thread_root = references[0]
    else:
        thread_root = parent_id or message_id

    plain_body, html_body, attachments = _walk_payload(payload)
    labels = message.get("labelIds") or []

    return {
        "gmail_message_id": message.get("id"),
        "gmail_thread_id": message.get("threadId"),
        "rfc_message_id": message_id,
        "rfc_thread_root_id": thread_root,
        "in_reply_to": parent_id,
        "references": references,
        "subject": _decode_rfc2047(headers.get("subject") or ""),
        "from_email": (sender_addr or "").lower(),
        "from_name": sender_name,
        "to": recipients("to"),
        "cc": recipients("cc"),
        "bcc": recipients("bcc"),
        "reply_to": reply_to.lower() if reply_to else None,
        "sent_at": sent_at,
        "body_text": _cap_text(plain_body),
        "body_html": _cap_text(html_body),
        "snippet": message.get("snippet"),
        "attachments": attachments,
        "size_estimate": message.get("sizeEstimate"),
        "labels": labels,
        "is_sent": "SENT" in labels,
        "raw_headers": headers,
        "owning_account": owning_account_address,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- headers
|
||||
|
||||
def _header_map(header_list: Iterable[dict]) -> dict[str, str]:
|
||||
"""Case-insensitive keys. Last-write-wins for duplicates (rare)."""
|
||||
out: dict[str, str] = {}
|
||||
for h in header_list:
|
||||
name = (h.get("name") or "").lower()
|
||||
out[name] = h.get("value") or ""
|
||||
return out
|
||||
|
||||
|
||||
def _decode_rfc2047(s: str) -> str:
|
||||
if not s:
|
||||
return ""
|
||||
try:
|
||||
parts = email.header.decode_header(s)
|
||||
pieces = []
|
||||
for text, charset in parts:
|
||||
if isinstance(text, bytes):
|
||||
try:
|
||||
pieces.append(text.decode(charset or "utf-8", errors="replace"))
|
||||
except LookupError:
|
||||
pieces.append(text.decode("utf-8", errors="replace"))
|
||||
else:
|
||||
pieces.append(text)
|
||||
return "".join(pieces)
|
||||
except Exception:
|
||||
return s
|
||||
|
||||
|
||||
def _split_addr(raw: str) -> tuple[Optional[str], Optional[str]]:
    """Split a single address header into ``(display name, address)``.

    Either element is ``None`` when it is absent; empty input yields
    ``(None, None)``. The display name is RFC 2047-decoded.
    """
    if raw:
        display, address = email.utils.parseaddr(raw)
        return (_decode_rfc2047(display) or None, address or None)
    return (None, None)
|
||||
|
||||
|
||||
def _parse_address_list(raw: str) -> list[tuple[Optional[str], Optional[str]]]:
    """Parse a multi-address header into ``(display name, address)`` pairs.

    Entries without an address are dropped. Display names are RFC 2047-decoded
    and become ``None`` when empty.
    """
    pairs: list[tuple[Optional[str], Optional[str]]] = []
    if raw:
        for display, address in email.utils.getaddresses([raw]):
            if address:
                pairs.append((_decode_rfc2047(display) or None, address))
    return pairs
|
||||
|
||||
|
||||
def _parse_date_header(raw: Optional[str], *, fallback_ms: Optional[str]) -> str:
|
||||
# Prefer RFC Date header, fall back to Gmail internalDate (epoch ms).
|
||||
if raw:
|
||||
try:
|
||||
dt = email.utils.parsedate_to_datetime(raw)
|
||||
if dt is not None:
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc)
|
||||
return dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
if fallback_ms:
|
||||
try:
|
||||
dt = datetime.fromtimestamp(int(fallback_ms) / 1000.0, tz=timezone.utc)
|
||||
return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
return datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
def _split_references(raw: str) -> list[str]:
    """Split a ``References`` header into message ids without angle brackets.

    Ids are taken as whitespace-separated tokens, in header order. Empty
    input yields an empty list.
    """
    # str.split() with no argument never yields blank tokens, so no extra
    # filtering is needed.
    return [_strip_angle_brackets(token) for token in raw.split()] if raw else []
|
||||
|
||||
|
||||
def _strip_angle_brackets(s: str) -> str:
|
||||
s = (s or "").strip()
|
||||
if s.startswith("<") and s.endswith(">"):
|
||||
return s[1:-1]
|
||||
return s
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- MIME walk
|
||||
|
||||
def _walk_payload(payload: dict) -> tuple[Optional[str], Optional[str], list[dict]]:
    """Collect ``(body_text, body_html, attachments)`` from a MIME tree.

    The tree is visited depth-first in document order. The first non-attachment
    ``text/plain`` part that decodes supplies ``body_text``; the first such
    ``text/html`` part supplies ``body_html``. Any part with a filename or an
    ``attachment`` Content-Disposition is recorded as an attachment entry
    instead. If no plain text is found, it is derived from the HTML body.
    """
    plain: Optional[str] = None
    markup: Optional[str] = None
    found: list[dict] = []

    # Explicit stack; children are pushed reversed so they pop in order.
    pending = [payload]
    while pending:
        node = pending.pop()
        mime_type = (node.get("mimeType") or "").lower()
        file_name = node.get("filename") or ""
        content = node.get("body") or {}
        part_headers = _header_map(node.get("headers") or [])
        disp = (part_headers.get("content-disposition") or "").lower()

        if file_name or disp.startswith("attachment"):
            found.append({
                "filename": file_name or f"unnamed.{_ext_for(mime_type)}",
                "mime_type": mime_type or "application/octet-stream",
                "size": content.get("size"),
                "gmail_attachment_id": content.get("attachmentId"),
                # Small attachments may arrive inlined as base64 in body.data
                # with no attachmentId; both fields are passed along.
                "inline_data_b64": content.get("data"),
                "content_disposition": "inline" if disp.startswith("inline") else "attachment",
            })
        elif mime_type == "text/plain" and plain is None:
            plain = _decode_body(content)
        elif mime_type == "text/html" and markup is None:
            markup = _decode_body(content)

        children = node.get("parts") or []
        pending.extend(reversed(children))

    # HTML-only message: synthesise a plain-text body from the markup.
    if plain is None and markup:
        plain = _strip_html(markup)

    return plain, markup, found
|
||||
|
||||
|
||||
def _decode_body(body: dict) -> Optional[str]:
|
||||
data = body.get("data")
|
||||
if not data:
|
||||
return None
|
||||
try:
|
||||
padding = 4 - (len(data) % 4)
|
||||
if padding != 4:
|
||||
data = data + ("=" * padding)
|
||||
raw = base64.urlsafe_b64decode(data.encode("ascii"))
|
||||
return raw.decode("utf-8", errors="replace").replace("\r\n", "\n")
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- HTML stripping
|
||||
|
||||
class _HTMLToText(HTMLParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._parts: list[str] = []
|
||||
self._skip_depth = 0
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag in ("script", "style"):
|
||||
self._skip_depth += 1
|
||||
if tag in ("br", "p", "div", "tr", "li"):
|
||||
self._parts.append("\n")
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag in ("script", "style"):
|
||||
self._skip_depth = max(0, self._skip_depth - 1)
|
||||
if tag in ("p", "div", "tr"):
|
||||
self._parts.append("\n")
|
||||
|
||||
def handle_data(self, data):
|
||||
if self._skip_depth == 0:
|
||||
self._parts.append(data)
|
||||
|
||||
def text(self) -> str:
|
||||
raw = "".join(self._parts)
|
||||
return re.sub(r"\n{3,}", "\n\n", raw).strip()
|
||||
|
||||
|
||||
def _strip_html(html: str) -> str:
|
||||
p = _HTMLToText()
|
||||
try:
|
||||
p.feed(html)
|
||||
return p.text()
|
||||
except Exception:
|
||||
return re.sub(r"<[^>]+>", " ", html)
|
||||
|
||||
|
||||
def _ext_for(mime: str) -> str:
|
||||
return mime.split("/")[-1] if "/" in mime else "bin"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- caps
|
||||
|
||||
# Keep bodies bounded to avoid a pathological 500MB message exploding the DB.
|
||||
_BODY_CAP_BYTES = 10 * 1024 * 1024 # 10MB
|
||||
|
||||
|
||||
def _cap_text(s: Optional[str]) -> Optional[str]:
|
||||
if s is None:
|
||||
return None
|
||||
if len(s.encode("utf-8", errors="ignore")) <= _BODY_CAP_BYTES:
|
||||
return s
|
||||
return s[: _BODY_CAP_BYTES // 2] + "\n\n[TRUNCATED BY CRM — body exceeded 10MB]"
|
||||
Reference in New Issue
Block a user