ea036f49a6
insert_email's recipients loop did `for a in parsed.get(kind, [])`, but the parser sets reply_to=None when there is no Reply-To header, so .get returns None (key present) and the loop raised 'NoneType' object is not iterable — aborting the entire Gmail backfill on the first such email (i.e. almost immediately). Fixed with `or []`. Regression test test_insert_email.py (reply_to=None, all-None recipients, happy path). Because the scheduler intentionally skips error-status accounts (no retry storms), an errored mailbox would never resume on its own. "Sync now" now clears error status first, so it is an explicit retry; backfill resumes from its saved cursor and dedups by Message-ID, so nothing is re-captured. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
420 lines
15 KiB
Python
420 lines
15 KiB
Python
"""
|
|
Data-access layer for the email_integration module.
|
|
|
|
All SQL touching emails_* tables lives here. Other modules call named
|
|
helpers — they never write SQL inline. This keeps schema changes contained.
|
|
|
|
Connection pattern matches server.py get_db():
|
|
- WAL mode, foreign keys on, busy_timeout
|
|
- sqlite3.Row row_factory
|
|
The caller is responsible for committing / closing.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sqlite3
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from typing import Iterable, Optional
|
|
|
|
|
|
# ------------------------------------------------------------------ migrations
|
|
|
|
def apply_migrations(cursor: sqlite3.Cursor) -> None:
|
|
"""Apply all .sql migration files in migrations/ in lexicographic order.
|
|
|
|
Called from server.init_db(). Idempotent. Does not log past migrations in
|
|
a table yet — each file is guarded by CREATE ... IF NOT EXISTS etc. If
|
|
we ever need more complex migrations, add a schema_migrations table.
|
|
"""
|
|
here = os.path.dirname(os.path.abspath(__file__))
|
|
mdir = os.path.join(here, "migrations")
|
|
if not os.path.isdir(mdir):
|
|
return
|
|
for name in sorted(os.listdir(mdir)):
|
|
if not name.endswith(".sql"):
|
|
continue
|
|
path = os.path.join(mdir, name)
|
|
with open(path, "r") as f:
|
|
sql = f.read()
|
|
cursor.executescript(sql)
|
|
|
|
|
|
# ------------------------------------------------------------------ utils
|
|
|
|
def _uuid() -> str:
|
|
return str(uuid.uuid4())
|
|
|
|
|
|
def _now_iso() -> str:
|
|
return datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
|
|
|
|
def _json(v) -> str:
|
|
return json.dumps(v, separators=(",", ":"))
|
|
|
|
|
|
# ------------------------------------------------------------------ email_accounts
|
|
|
|
def list_sync_ready_accounts(conn: sqlite3.Connection) -> list[sqlite3.Row]:
|
|
cur = conn.cursor()
|
|
cur.execute(
|
|
"SELECT * FROM email_accounts "
|
|
"WHERE sync_enabled = 1 AND sync_status IN ('pending','active') "
|
|
"ORDER BY last_synced_at IS NOT NULL, last_synced_at"
|
|
)
|
|
return cur.fetchall()
|
|
|
|
|
|
def get_account_by_email(conn: sqlite3.Connection, email_address: str) -> Optional[sqlite3.Row]:
|
|
cur = conn.cursor()
|
|
cur.execute("SELECT * FROM email_accounts WHERE email_address = ?", (email_address,))
|
|
return cur.fetchone()
|
|
|
|
|
|
def upsert_account(conn: sqlite3.Connection, *, user_id: str, email_address: str,
|
|
auth_method: str) -> str:
|
|
existing = get_account_by_email(conn, email_address)
|
|
if existing:
|
|
return existing["id"]
|
|
account_id = _uuid()
|
|
conn.execute(
|
|
"INSERT INTO email_accounts (id, user_id, email_address, auth_method) "
|
|
"VALUES (?, ?, ?, ?)",
|
|
(account_id, user_id, email_address, auth_method),
|
|
)
|
|
return account_id
|
|
|
|
|
|
def set_account_status(conn: sqlite3.Connection, account_id: str, *,
|
|
status: str, error: Optional[str] = None) -> None:
|
|
conn.execute(
|
|
"UPDATE email_accounts SET sync_status = ?, sync_error = ?, "
|
|
"updated_at = datetime('now') WHERE id = ?",
|
|
(status, error, account_id),
|
|
)
|
|
|
|
|
|
def set_account_checkpoint(conn: sqlite3.Connection, account_id: str, *,
|
|
history_id: Optional[str] = None,
|
|
backfill_cursor: Optional[str] = None,
|
|
backfill_complete: Optional[bool] = None,
|
|
last_synced_at: Optional[str] = None) -> None:
|
|
sets, params = [], []
|
|
if history_id is not None:
|
|
sets.append("last_history_id = ?"); params.append(history_id)
|
|
if backfill_cursor is not None:
|
|
sets.append("backfill_cursor = ?"); params.append(backfill_cursor)
|
|
if backfill_complete is not None:
|
|
sets.append("backfill_complete = ?"); params.append(1 if backfill_complete else 0)
|
|
if last_synced_at is not None:
|
|
sets.append("last_synced_at = ?"); params.append(last_synced_at)
|
|
if not sets:
|
|
return
|
|
sets.append("updated_at = datetime('now')")
|
|
params.append(account_id)
|
|
conn.execute(f"UPDATE email_accounts SET {', '.join(sets)} WHERE id = ?", params)
|
|
|
|
|
|
# ------------------------------------------------------------------ emails
|
|
|
|
def find_email_by_rfc_id(conn: sqlite3.Connection, rfc_message_id: str) -> Optional[sqlite3.Row]:
|
|
cur = conn.cursor()
|
|
cur.execute("SELECT * FROM emails WHERE rfc_message_id = ?", (rfc_message_id,))
|
|
return cur.fetchone()
|
|
|
|
|
|
def find_email_id_by_any_rfc_id(conn: sqlite3.Connection,
|
|
rfc_ids: Iterable[str]) -> Optional[str]:
|
|
ids = [r for r in rfc_ids if r]
|
|
if not ids:
|
|
return None
|
|
placeholders = ",".join("?" for _ in ids)
|
|
cur = conn.cursor()
|
|
cur.execute(
|
|
f"SELECT id FROM emails WHERE rfc_message_id IN ({placeholders}) "
|
|
"ORDER BY sent_at ASC LIMIT 1",
|
|
ids,
|
|
)
|
|
row = cur.fetchone()
|
|
return row["id"] if row else None
|
|
|
|
|
|
def insert_email(conn: sqlite3.Connection, *, parsed: dict, match_status: str) -> str:
|
|
"""Insert a fresh emails row. Returns email_id.
|
|
|
|
Caller must ensure no row exists for parsed['rfc_message_id']; use
|
|
find_email_by_rfc_id first.
|
|
"""
|
|
email_id = _uuid()
|
|
conn.execute(
|
|
"""INSERT INTO emails
|
|
(id, rfc_message_id, gmail_thread_id, rfc_thread_root_id, subject,
|
|
from_email, from_name, to_emails_json, cc_emails_json, bcc_emails_json,
|
|
reply_to, sent_at, body_text, body_html, snippet, in_reply_to,
|
|
references_json, has_attachments, size_estimate, is_matched,
|
|
match_status, raw_headers_json)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
|
(
|
|
email_id,
|
|
parsed["rfc_message_id"],
|
|
parsed.get("gmail_thread_id"),
|
|
parsed.get("rfc_thread_root_id"),
|
|
parsed.get("subject"),
|
|
parsed["from_email"],
|
|
parsed.get("from_name"),
|
|
_json(parsed.get("to", [])),
|
|
_json(parsed.get("cc", [])),
|
|
_json(parsed.get("bcc", [])),
|
|
parsed.get("reply_to"),
|
|
parsed["sent_at"],
|
|
parsed.get("body_text"),
|
|
parsed.get("body_html"),
|
|
parsed.get("snippet"),
|
|
parsed.get("in_reply_to"),
|
|
_json(parsed.get("references", [])),
|
|
1 if parsed.get("attachments") else 0,
|
|
parsed.get("size_estimate"),
|
|
1 if match_status == "matched" else 0,
|
|
match_status,
|
|
_json(parsed.get("raw_headers", {})) if parsed.get("raw_headers") else None,
|
|
),
|
|
)
|
|
# recipients
|
|
for kind in ("from", "to", "cc", "bcc", "reply_to"):
|
|
addrs = []
|
|
if kind == "from" and parsed.get("from_email"):
|
|
addrs = [(parsed["from_email"], parsed.get("from_name"))]
|
|
elif kind == "reply_to" and parsed.get("reply_to"):
|
|
addrs = [(parsed["reply_to"], None)]
|
|
else:
|
|
# `or []` (not get(kind, [])): the key is often present with value None
|
|
# (e.g. reply_to when there is no Reply-To header), and `for a in None`
|
|
# would raise TypeError and abort the whole backfill.
|
|
for a in (parsed.get(kind) or []):
|
|
if isinstance(a, dict):
|
|
addrs.append((a.get("email"), a.get("name")))
|
|
else:
|
|
addrs.append((a, None))
|
|
for address, name in addrs:
|
|
if not address:
|
|
continue
|
|
conn.execute(
|
|
"INSERT INTO email_recipients (id, email_id, address, display_name, kind) "
|
|
"VALUES (?, ?, ?, ?, ?)",
|
|
(_uuid(), email_id, address.lower().strip(), name, kind),
|
|
)
|
|
return email_id
|
|
|
|
|
|
def set_email_thread(conn: sqlite3.Connection, email_id: str, thread_id: str) -> None:
|
|
conn.execute(
|
|
"UPDATE emails SET thread_id = ?, updated_at = datetime('now') WHERE id = ?",
|
|
(thread_id, email_id),
|
|
)
|
|
|
|
|
|
# ------------------------------------------------------------------ sightings
|
|
|
|
def upsert_sighting(conn: sqlite3.Connection, *, email_id: str, account_id: str,
|
|
gmail_message_id: str, gmail_thread_id: str,
|
|
labels: list[str], is_sent: bool) -> None:
|
|
conn.execute(
|
|
"""INSERT OR IGNORE INTO email_account_messages
|
|
(id, email_id, account_id, gmail_message_id, gmail_thread_id,
|
|
labels_json, is_sent)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?)""",
|
|
(_uuid(), email_id, account_id, gmail_message_id, gmail_thread_id,
|
|
_json(labels), 1 if is_sent else 0),
|
|
)
|
|
|
|
|
|
def update_sighting_labels(conn: sqlite3.Connection, *, account_id: str,
|
|
gmail_message_id: str, labels: list[str]) -> None:
|
|
conn.execute(
|
|
"UPDATE email_account_messages SET labels_json = ? "
|
|
"WHERE account_id = ? AND gmail_message_id = ?",
|
|
(_json(labels), account_id, gmail_message_id),
|
|
)
|
|
|
|
|
|
def tombstone_sighting(conn: sqlite3.Connection, *, account_id: str,
|
|
gmail_message_id: str) -> None:
|
|
conn.execute(
|
|
"UPDATE email_account_messages SET deleted_at = datetime('now') "
|
|
"WHERE account_id = ? AND gmail_message_id = ?",
|
|
(account_id, gmail_message_id),
|
|
)
|
|
|
|
|
|
# ------------------------------------------------------------------ attachments
|
|
|
|
def insert_attachment_stub(conn: sqlite3.Connection, *, email_id: str,
|
|
gmail_attachment_id: str, filename: str,
|
|
sanitized_filename: str, mime_type: Optional[str],
|
|
size_bytes: Optional[int], storage_path: str) -> str:
|
|
att_id = _uuid()
|
|
conn.execute(
|
|
"""INSERT INTO email_attachments
|
|
(id, email_id, gmail_attachment_id, filename, sanitized_filename,
|
|
mime_type, size_bytes, storage_path)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
|
|
(att_id, email_id, gmail_attachment_id, filename, sanitized_filename,
|
|
mime_type, size_bytes, storage_path),
|
|
)
|
|
return att_id
|
|
|
|
|
|
def mark_attachment_downloaded(conn: sqlite3.Connection, attachment_id: str, *,
|
|
sha256_hex: str, size_bytes: int) -> None:
|
|
conn.execute(
|
|
"UPDATE email_attachments SET download_status = 'downloaded', "
|
|
"sha256_hex = ?, size_bytes = ?, downloaded_at = datetime('now') "
|
|
"WHERE id = ?",
|
|
(sha256_hex, size_bytes, attachment_id),
|
|
)
|
|
|
|
|
|
def mark_attachment_failed(conn: sqlite3.Connection, attachment_id: str, *,
|
|
error: str) -> None:
|
|
conn.execute(
|
|
"UPDATE email_attachments SET download_status = 'failed', "
|
|
"download_attempts = download_attempts + 1, download_error = ? "
|
|
"WHERE id = ?",
|
|
(error, attachment_id),
|
|
)
|
|
|
|
|
|
def pending_attachments(conn: sqlite3.Connection, limit: int = 50) -> list[sqlite3.Row]:
|
|
cur = conn.cursor()
|
|
cur.execute(
|
|
"SELECT a.*, eam.gmail_message_id, eam.account_id "
|
|
"FROM email_attachments a "
|
|
"JOIN email_account_messages eam ON eam.email_id = a.email_id "
|
|
"WHERE a.download_status = 'pending' AND a.download_attempts < 5 "
|
|
"LIMIT ?",
|
|
(limit,),
|
|
)
|
|
return cur.fetchall()
|
|
|
|
|
|
# ------------------------------------------------------------------ threads
|
|
|
|
def find_thread_by_gmail_id(conn: sqlite3.Connection, gmail_thread_id: str) -> Optional[sqlite3.Row]:
|
|
cur = conn.cursor()
|
|
cur.execute(
|
|
"SELECT * FROM email_threads WHERE gmail_thread_id = ?",
|
|
(gmail_thread_id,),
|
|
)
|
|
return cur.fetchone()
|
|
|
|
|
|
def find_thread_by_rfc_root(conn: sqlite3.Connection, rfc_root: str) -> Optional[sqlite3.Row]:
|
|
cur = conn.cursor()
|
|
cur.execute(
|
|
"SELECT * FROM email_threads WHERE rfc_thread_root_id = ?",
|
|
(rfc_root,),
|
|
)
|
|
return cur.fetchone()
|
|
|
|
|
|
def create_thread(conn: sqlite3.Connection, *, gmail_thread_id: Optional[str],
|
|
rfc_thread_root_id: Optional[str], subject_normalized: Optional[str],
|
|
first_message_at: Optional[str]) -> str:
|
|
thread_id = _uuid()
|
|
conn.execute(
|
|
"""INSERT INTO email_threads
|
|
(id, gmail_thread_id, rfc_thread_root_id, subject_normalized,
|
|
first_message_at, last_message_at, message_count)
|
|
VALUES (?, ?, ?, ?, ?, ?, 0)""",
|
|
(thread_id, gmail_thread_id, rfc_thread_root_id, subject_normalized,
|
|
first_message_at, first_message_at),
|
|
)
|
|
return thread_id
|
|
|
|
|
|
def rollup_thread(conn: sqlite3.Connection, thread_id: str) -> None:
|
|
"""Recompute count / last_message_at / participants from member emails.
|
|
|
|
Cheap at 5-person team volumes. For larger deployments swap to triggers.
|
|
"""
|
|
cur = conn.cursor()
|
|
cur.execute(
|
|
"SELECT COUNT(*) AS n, MIN(sent_at) AS first, MAX(sent_at) AS last, "
|
|
"MAX(is_matched) AS matched FROM emails WHERE thread_id = ?",
|
|
(thread_id,),
|
|
)
|
|
row = cur.fetchone()
|
|
if not row or row["n"] == 0:
|
|
return
|
|
cur.execute(
|
|
"SELECT DISTINCT address FROM email_recipients er "
|
|
"JOIN emails e ON e.id = er.email_id WHERE e.thread_id = ?",
|
|
(thread_id,),
|
|
)
|
|
participants = [r["address"] for r in cur.fetchall()]
|
|
conn.execute(
|
|
"UPDATE email_threads SET message_count = ?, first_message_at = ?, "
|
|
"last_message_at = ?, participant_count = ?, participants_json = ?, "
|
|
"is_matched = ?, updated_at = datetime('now') WHERE id = ?",
|
|
(row["n"], row["first"], row["last"], len(participants),
|
|
_json(participants), int(row["matched"] or 0), thread_id),
|
|
)
|
|
|
|
|
|
# ------------------------------------------------------------------ investor links
|
|
|
|
def insert_investor_link(conn: sqlite3.Connection, *, email_id: str,
|
|
link: dict) -> None:
|
|
conn.execute(
|
|
"""INSERT INTO email_investor_links
|
|
(id, email_id, fundraising_investor_id, fundraising_contact_id,
|
|
contact_id, organization_id, matched_address, match_kind,
|
|
match_confidence)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
|
(
|
|
_uuid(),
|
|
email_id,
|
|
link.get("fundraising_investor_id"),
|
|
link.get("fundraising_contact_id"),
|
|
link.get("contact_id"),
|
|
link.get("organization_id"),
|
|
link["matched_address"],
|
|
link["match_kind"],
|
|
float(link.get("match_confidence", 1.0)),
|
|
),
|
|
)
|
|
|
|
|
|
# ------------------------------------------------------------------ sync runs
|
|
|
|
def start_sync_run(conn: sqlite3.Connection, *, account_id: str, kind: str) -> str:
|
|
run_id = _uuid()
|
|
conn.execute(
|
|
"INSERT INTO email_sync_runs (id, account_id, kind, started_at, status) "
|
|
"VALUES (?, ?, ?, ?, 'running')",
|
|
(run_id, account_id, kind, _now_iso()),
|
|
)
|
|
return run_id
|
|
|
|
|
|
def finish_sync_run(conn: sqlite3.Connection, run_id: str, *, status: str,
|
|
stats: Optional[dict] = None, error: Optional[str] = None) -> None:
|
|
stats = stats or {}
|
|
conn.execute(
|
|
"""UPDATE email_sync_runs
|
|
SET finished_at = ?, status = ?, messages_seen = ?, messages_stored = ?,
|
|
attachments_saved = ?, api_calls = ?, retries = ?, error = ?
|
|
WHERE id = ?""",
|
|
(
|
|
_now_iso(), status,
|
|
int(stats.get("messages_seen", 0)),
|
|
int(stats.get("messages_stored", 0)),
|
|
int(stats.get("attachments_saved", 0)),
|
|
int(stats.get("api_calls", 0)),
|
|
int(stats.get("retries", 0)),
|
|
error,
|
|
run_id,
|
|
),
|
|
)
|