Phase 0 foundation: canonical schema, ingest pipeline, CRM MCP server

Workstream A–C substrate for the Ten31 agentic system: - A1: docs/crm-overview.md; CLAUDE.md conventions + guardrail #9 - A2: additive/reversible core migration (canonical_entities, entity_links, interaction_log, relationship_edges, soft-delete) + ledgered runner - B1/B3: chunking + deterministic entity resolution (backend/ingest) - B2: dense (bge-m3) + BM25 sparse ingest to Qdrant crm_chunks - C: CRM MCP server (reads, retrieval modes, logged writes) — no outbound tools - docs: redaction/re-hydration, Gmail enablement runbook - synthetic test data; .env.example; housekeeping (.gitignore, untrack crm.db, drop legacy files + start9/0.3.5) Verified end-to-end on synthetic data + live Sparks (hybrid > dense on entity queries). Real backfill runs on Ten31 infra; index holds synthetic data only. Branch snapshot also captures pre-existing working-tree changes. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 08:11:28 -05:00
parent 7027efd777
commit c7ce44d963
99 changed files with 10676 additions and 7817 deletions
@@ -0,0 +1,234 @@
+"""
+Attachment download + on-disk storage.
+
+Two usage patterns:
+
+  1. During message parsing we call `register_stubs(conn, email_id, parsed)`
+     which writes pending rows to email_attachments.
+
+  2. A separate worker (kicked off by sync after each account completes)
+     calls `drain_pending()` which fetches attachment bytes from Gmail and
+     writes them to disk under CONFIG.attachments_dir.
+
+Files are named: <CRM_DATA_DIR>/email_attachments/<email_id[:2]>/<email_id>/<attachment_id>-<sanitized_filename>
+
+Sanitization prevents path traversal and keeps cross-platform-safe names.
+"""
+
+import base64
+import hashlib
+import os
+import re
+import sqlite3
+from typing import Iterable, Optional
+
+from . import config as _cfg
+from . import db as _db
+from . import errors as _errors
+from . import gmail_client as _gmail
+
+
+_MAX_FILENAME_LEN = 200
+_BAD_FILENAME_CHARS = re.compile(r'[/\\\x00-\x1f\x7f:*?"<>|]+')
+
+
+def _sanitize_filename(name: str) -> str:
+    if not name:
+        return "unnamed.bin"
+    # strip path components first
+    name = os.path.basename(name.replace("\\", "/"))
+    name = _BAD_FILENAME_CHARS.sub("_", name).strip(" .")
+    if not name:
+        name = "unnamed.bin"
+    if len(name) > _MAX_FILENAME_LEN:
+        stem, dot, ext = name.rpartition(".")
+        if dot:
+            name = stem[: _MAX_FILENAME_LEN - len(ext) - 1] + "." + ext
+        else:
+            name = name[:_MAX_FILENAME_LEN]
+    return name
+
+
+def _storage_path_for(email_id: str, attachment_id: str, sanitized_filename: str) -> str:
+    root = _cfg.CONFIG.attachments_dir
+    bucket = email_id[:2] or "_0"
+    dir_ = os.path.join(root, bucket, email_id)
+    os.makedirs(dir_, exist_ok=True)
+    return os.path.join(dir_, f"{attachment_id}-{sanitized_filename}")
+
+
+# ---------------------------------------------------------------------------- phase 1: register stubs
+
+def register_stubs(conn: sqlite3.Connection, *, email_id: str,
+                   parsed_attachments: Iterable[dict]) -> list[str]:
+    """Write pending attachment rows from parsed message data.
+
+    Also handles tiny inline attachments whose bytes arrived with the message
+    body (body.data present, no separate attachmentId) by writing them
+    directly and marking as downloaded.
+
+    Returns list of attachment ids created.
+    """
+    max_bytes = _cfg.CONFIG.max_attachment_mb * 1024 * 1024
+    ids = []
+
+    for att in parsed_attachments:
+        filename = att.get("filename") or "unnamed.bin"
+        sanitized = _sanitize_filename(filename)
+        gmail_att_id = att.get("gmail_attachment_id") or ""
+        mime = att.get("mime_type")
+        size = att.get("size")
+
+        # Determine storage path (we write the path whether or not the download
+        # succeeded; missing files surface via download_status).
+        att_row_id = _db.insert_attachment_stub(
+            conn,
+            email_id=email_id,
+            gmail_attachment_id=gmail_att_id,
+            filename=filename,
+            sanitized_filename=sanitized,
+            mime_type=mime,
+            size_bytes=size,
+            storage_path=_storage_path_for(email_id, gmail_att_id or att_row_id_fallback(), sanitized),
+        )
+        ids.append(att_row_id)
+
+        # Oversize guard.
+        if isinstance(size, int) and size > max_bytes:
+            conn.execute(
+                "UPDATE email_attachments SET download_status = 'skipped', "
+                "download_error = ? WHERE id = ?",
+                (f"exceeds max size {_cfg.CONFIG.max_attachment_mb}MB", att_row_id),
+            )
+            continue
+
+        # Inline data fast-path.
+        inline_b64 = att.get("inline_data_b64")
+        if inline_b64:
+            try:
+                raw = base64.urlsafe_b64decode(_pad(inline_b64).encode("ascii"))
+                path = _storage_path_for(email_id, att_row_id, sanitized)
+                _write_bytes(path, raw)
+                sha = hashlib.sha256(raw).hexdigest()
+                conn.execute(
+                    "UPDATE email_attachments SET storage_path = ? WHERE id = ?",
+                    (path, att_row_id),
+                )
+                _db.mark_attachment_downloaded(
+                    conn, att_row_id, sha256_hex=sha, size_bytes=len(raw)
+                )
+            except Exception as e:
+                _db.mark_attachment_failed(conn, att_row_id, error=f"inline decode: {e}")
+
+    return ids
+
+
+def att_row_id_fallback() -> str:
+    # Placeholder so the path template always produces something if gmail_att_id
+    # was missing at stub time; the real path is rewritten when the worker
+    # picks it up.
+    import uuid
+    return uuid.uuid4().hex
+
+
+# ---------------------------------------------------------------------------- phase 2: worker
+
+def drain_pending(conn_factory, client: _gmail.GmailClient, account_id: str,
+                  *, limit: int = 50) -> int:
+    """Download up to `limit` pending attachments for `account_id`.
+
+    Returns count of successfully downloaded attachments. Called after each
+    account's sync completes so large files don't block the sync loop.
+    """
+    conn = conn_factory()
+    try:
+        pending = _db.pending_attachments(conn, limit=limit)
+    finally:
+        conn.close()
+
+    downloaded = 0
+    for row in pending:
+        if row["account_id"] != account_id:
+            continue
+        conn = conn_factory()
+        try:
+            ok = _download_one(conn, client, row)
+            if ok:
+                downloaded += 1
+            conn.commit()
+        finally:
+            conn.close()
+    return downloaded
+
+
+def _download_one(conn: sqlite3.Connection, client: _gmail.GmailClient, row) -> bool:
+    try:
+        resp = client.get_attachment(row["gmail_message_id"], row["gmail_attachment_id"])
+    except _errors.RETRYABLE as e:
+        _db.mark_attachment_failed(conn, row["id"], error=f"transient: {type(e).__name__}")
+        return False
+    except _errors.GmailError as e:
+        _db.mark_attachment_failed(conn, row["id"], error=f"{type(e).__name__}: {e}")
+        return False
+
+    data_b64 = resp.get("data")
+    if not data_b64:
+        _db.mark_attachment_failed(conn, row["id"], error="empty data in response")
+        return False
+
+    try:
+        raw = base64.urlsafe_b64decode(_pad(data_b64).encode("ascii"))
+    except Exception as e:
+        _db.mark_attachment_failed(conn, row["id"], error=f"decode: {e}")
+        return False
+
+    sha = hashlib.sha256(raw).hexdigest()
+    # If an existing attachment has the same SHA, re-point storage_path and skip write.
+    existing = _find_existing_by_sha(conn, sha, exclude_id=row["id"])
+    if existing:
+        conn.execute(
+            "UPDATE email_attachments SET storage_path = ? WHERE id = ?",
+            (existing["storage_path"], row["id"]),
+        )
+        _db.mark_attachment_downloaded(conn, row["id"], sha256_hex=sha, size_bytes=len(raw))
+        return True
+
+    path = _storage_path_for(row["email_id"], row["id"], row["sanitized_filename"])
+    try:
+        _write_bytes(path, raw)
+    except OSError as e:
+        _db.mark_attachment_failed(conn, row["id"], error=f"disk: {e}")
+        return False
+
+    conn.execute(
+        "UPDATE email_attachments SET storage_path = ? WHERE id = ?",
+        (path, row["id"]),
+    )
+    _db.mark_attachment_downloaded(conn, row["id"], sha256_hex=sha, size_bytes=len(raw))
+    return True
+
+
+def _find_existing_by_sha(conn: sqlite3.Connection, sha: str, *, exclude_id: str) -> Optional[sqlite3.Row]:
+    cur = conn.cursor()
+    cur.execute(
+        "SELECT * FROM email_attachments WHERE sha256_hex = ? AND id != ? "
+        "AND download_status = 'downloaded' LIMIT 1",
+        (sha, exclude_id),
+    )
+    return cur.fetchone()
+
+
+# ---------------------------------------------------------------------------- utils
+
+def _pad(b64: str) -> str:
+    pad = 4 - (len(b64) % 4)
+    return b64 + ("=" * pad if pad != 4 else "")
+
+
+def _write_bytes(path: str, data: bytes) -> None:
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    tmp = path + ".tmp"
+    with open(tmp, "wb") as f:
+        f.write(data)
+    os.chmod(tmp, 0o600)
+    os.replace(tmp, path)