Phase 0 foundation: canonical schema, ingest pipeline, CRM MCP server
Workstream A–C substrate for the Ten31 agentic system: - A1: docs/crm-overview.md; CLAUDE.md conventions + guardrail #9 - A2: additive/reversible core migration (canonical_entities, entity_links, interaction_log, relationship_edges, soft-delete) + ledgered runner - B1/B3: chunking + deterministic entity resolution (backend/ingest) - B2: dense (bge-m3) + BM25 sparse ingest to Qdrant crm_chunks - C: CRM MCP server (reads, retrieval modes, logged writes) — no outbound tools - docs: redaction/re-hydration, Gmail enablement runbook - synthetic test data; .env.example; housekeeping (.gitignore, untrack crm.db, drop legacy files + start9/0.3.5) Verified end-to-end on synthetic data + live Sparks (hybrid > dense on entity queries). Real backfill runs on Ten31 infra; index holds synthetic data only. Branch snapshot also captures pre-existing working-tree changes. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Binary file not shown.
@@ -0,0 +1,90 @@
|
||||
"""Core-schema migration runner for the Ten31 CRM.
|
||||
|
||||
Phase 0 (Workstream A2) introduces ordered, reviewable SQL migrations for the
|
||||
*core* schema, generalizing the pattern the Gmail subsystem already uses
|
||||
(email_integration/db.py). Unlike the Gmail runner, this one keeps a
|
||||
`schema_migrations` ledger so each file runs exactly once and the applied set is
|
||||
auditable.
|
||||
|
||||
Design rules (CLAUDE.md guardrails):
|
||||
* Additive and reversible only — migrations add tables / nullable columns;
|
||||
they never drop or rewrite existing data. Each NNNN_*.sql may ship a paired
|
||||
NNNN_*.down.sql for manual rollback (the .down files are never auto-applied).
|
||||
* Idempotent — files use `CREATE TABLE/INDEX IF NOT EXISTS`. For the few
|
||||
non-idempotent `ALTER TABLE ... ADD COLUMN` statements (SQLite has no
|
||||
IF NOT EXISTS for columns), a partial-apply is tolerated by skipping
|
||||
"duplicate column name" errors, matching the existing defensive pattern in
|
||||
server.py:init_db().
|
||||
|
||||
Call `apply_core_migrations(conn)` from init_db() after the base tables exist.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
|
||||
# Absolute path of the `migrations/` directory that sits next to this module.
MIGRATIONS_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "migrations")
)
|
||||
|
||||
|
||||
def _split_statements(sql: str):
|
||||
"""Split a SQL script into individual statements on ';' boundaries.
|
||||
|
||||
Sufficient for our migrations, which contain no procedural blocks or
|
||||
semicolons inside string literals.
|
||||
"""
|
||||
return [s.strip() for s in sql.split(";") if s.strip()]
|
||||
|
||||
|
||||
def _apply_statementwise(cursor, sql: str) -> None:
    """Run a migration script statement by statement.

    Fallback path for a migration that was partially applied earlier: an
    `ALTER TABLE ... ADD COLUMN` that fails with SQLite's "duplicate column
    name" error is treated as already done and skipped. Any other
    `sqlite3.OperationalError` is re-raised.
    """
    for statement in _split_statements(sql):
        try:
            cursor.execute(statement)
        except sqlite3.OperationalError as err:
            already_applied = "duplicate column name" in str(err).lower()
            if not already_applied:
                raise
|
||||
|
||||
|
||||
def apply_core_migrations(conn) -> None:
    """Run every not-yet-applied migration file from MIGRATIONS_DIR, in order.

    Ensures the `schema_migrations` ledger table exists, then executes each
    `*.sql` file (excluding `*.down.sql`, which are for manual rollback) whose
    name is not yet in the ledger. Files run in sorted filename order; each
    one is recorded in the ledger and committed before the next starts.

    A "duplicate column name" error from `executescript` triggers a
    statement-by-statement retry; any other `sqlite3.OperationalError`
    propagates to the caller.
    """
    cursor = conn.cursor()
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS schema_migrations (
            filename TEXT PRIMARY KEY,
            applied_at TEXT DEFAULT (datetime('now'))
        )
        """
    )
    conn.commit()

    # Nothing to do when the migrations directory is absent.
    if not os.path.isdir(MIGRATIONS_DIR):
        return

    cursor.execute("SELECT filename FROM schema_migrations")
    already_applied = {name for (name, *_rest) in cursor.fetchall()}

    todo = []
    for entry in os.listdir(MIGRATIONS_DIR):
        if not entry.endswith(".sql") or entry.endswith(".down.sql"):
            continue
        if entry in already_applied:
            continue
        todo.append(entry)
    todo.sort()

    for filename in todo:
        with open(os.path.join(MIGRATIONS_DIR, filename), "r", encoding="utf-8") as script:
            sql = script.read()

        try:
            cursor.executescript(sql)
        except sqlite3.OperationalError as err:
            if "duplicate column name" not in str(err).lower():
                raise
            # Partially-applied migration: replay tolerantly.
            _apply_statementwise(cursor, sql)

        cursor.execute("INSERT INTO schema_migrations (filename) VALUES (?)", (filename,))
        conn.commit()
        print(f"[migrations] applied {filename}")
|
||||
@@ -0,0 +1,175 @@
|
||||
# `email_integration` — Gmail capture for the Venture CRM
|
||||
|
||||
Scaffolded Phase 1 of the Gmail integration described in
|
||||
`GMAIL_INTEGRATION_ARCHITECTURE.md` (repo root). Everything in this module is
|
||||
isolated from `server.py` until you wire it in explicitly.
|
||||
|
||||
## Contents
|
||||
|
||||
| File | Purpose |
|
||||
|------|---------|
|
||||
| `config.py` | Env-var loader; exposes `CONFIG` singleton. |
|
||||
| `errors.py` | Exception taxonomy used by the retry loop. |
|
||||
| `crypto.py` | AES-GCM wrapper for OAuth refresh-token encryption (only used in OAuth mode). |
|
||||
| `credentials.py` | `CredentialProvider` protocol + `DWDCredentialProvider` / `OAuthCredentialProvider`. |
|
||||
| `gmail_client.py` | Gmail API HTTP wrapper (rate limit, retry, pagination). |
|
||||
| `db.py` | All SQL touching `emails_*` tables. Migrations live under `migrations/`. |
|
||||
| `parser.py` | Gmail payload → canonical dict (headers, body, attachments). |
|
||||
| `matcher.py` | Investor address index + match logic. |
|
||||
| `threads.py` | Thread resolution using Gmail threadId + RFC References. |
|
||||
| `attachments.py` | Stub rows + on-disk storage + download worker. |
|
||||
| `sync.py` | Orchestrator for backfill + incremental sync of one account. |
|
||||
| `scheduler.py` | Background thread that runs `sync.sync_all` on an interval. |
|
||||
| `routes.py` | HTTP handlers under `/api/email/*` compatible with `CRMHandler`. |
|
||||
| `migrations/0001_email_tables.sql` | Table DDL. |
|
||||
|
||||
## Wiring it in
|
||||
|
||||
All changes are in `backend/server.py`, all guarded by an env flag. Each is
|
||||
independently revertible. None run unless `CRM_GMAIL_INTEGRATION_ENABLED=true`.
|
||||
|
||||
**Patch 1 — migrations** (append to `init_db()` after all existing
|
||||
`cursor.executescript(...)` calls, before `conn.commit()`):
|
||||
|
||||
```python
|
||||
try:
|
||||
from email_integration.db import apply_migrations
|
||||
apply_migrations(cursor)
|
||||
except ImportError:
|
||||
pass
|
||||
```
|
||||
|
||||
**Patch 2 — scheduler** (in `main()`, after `start_backup_scheduler()`):
|
||||
|
||||
```python
|
||||
if os.environ.get("CRM_GMAIL_INTEGRATION_ENABLED", "").lower() in ("1", "true", "yes", "on"):
|
||||
from email_integration.scheduler import start_sync_scheduler
|
||||
start_sync_scheduler()
|
||||
```
|
||||
|
||||
**Patch 3 — routes** (add near the top of `CRMHandler.do_GET` and
|
||||
`CRMHandler.do_POST`, after auth/rate-limit pre-checks, before API routing):
|
||||
|
||||
```python
|
||||
try:
|
||||
from email_integration.routes import try_handle
|
||||
if try_handle(self):
|
||||
return
|
||||
except ImportError:
|
||||
pass
|
||||
```
|
||||
|
||||
## Environment variables
|
||||
|
||||
```bash
|
||||
# Master on/off. Default off; scheduler won't start, routes return 503.
|
||||
CRM_GMAIL_INTEGRATION_ENABLED=true
|
||||
|
||||
# Auth method: "dwd" (default, recommended) or "oauth"
|
||||
CRM_GMAIL_AUTH_METHOD=dwd
|
||||
|
||||
# DWD mode
|
||||
CRM_GMAIL_SA_KEY_PATH=/path/to/CRM/data/secrets/gmail-service-account.json
|
||||
CRM_GMAIL_WORKSPACE_DOMAIN=ten31.xyz
|
||||
|
||||
# OAuth mode (fallback; not required for DWD)
|
||||
CRM_GMAIL_OAUTH_CLIENT_ID=...
|
||||
CRM_GMAIL_OAUTH_CLIENT_SECRET=...
|
||||
CRM_GMAIL_OAUTH_REDIRECT_URI=https://crm.ten31.xyz/api/email/oauth/callback
|
||||
CRM_GMAIL_SECRET_KEY=<base64-32-random-bytes> # for encrypting refresh tokens
|
||||
|
||||
# Sync
|
||||
CRM_GMAIL_SYNC_INTERVAL_MIN=180 # default 3h
|
||||
CRM_GMAIL_BACKFILL_PAGE_SIZE=500
|
||||
CRM_GMAIL_MAX_ATTACHMENT_MB=50
|
||||
CRM_GMAIL_ATTACH_CONCURRENCY=4
|
||||
CRM_GMAIL_RATE_UNITS_SEC=150 # per account, leaves 40% headroom
|
||||
CRM_GMAIL_RETRY_MAX=5
|
||||
CRM_GMAIL_HISTORY_STALE_DAYS=5
|
||||
```
|
||||
|
||||
## Google Cloud / Workspace setup (DWD)
|
||||
|
||||
See `GMAIL_INTEGRATION_ARCHITECTURE.md` §3 for the full runbook. Short form:
|
||||
|
||||
1. Create GCP project, enable Gmail API.
|
||||
2. Create service account, download JSON key, enable domain-wide delegation.
|
||||
3. In Google Admin console → Security → API controls → Manage domain-wide
|
||||
delegation, authorize the service account's client ID with scope
|
||||
`https://www.googleapis.com/auth/gmail.readonly`.
|
||||
4. Copy the JSON key to `data/secrets/gmail-service-account.json`, `chmod 600`.
|
||||
5. Set env vars in `.env.beta`, restart CRM.
|
||||
6. As admin, POST `/api/email/accounts/enroll-all` to create `email_accounts`
|
||||
rows for every active user whose email ends in the Workspace domain.
|
||||
|
||||
## Adding the crypto dependency (needed in both DWD and OAuth modes)
|
||||
|
||||
Both auth modes need `cryptography` (OAuth for AES-GCM refresh-token encryption, DWD for JWT signing):
|
||||
|
||||
```
|
||||
cryptography==42.0.5
|
||||
```
|
||||
|
||||
Append to `backend/requirements.txt`. DWD mode also uses `cryptography` for
|
||||
the RSA signing of the JWT bearer token — so if you enable the integration in
|
||||
either mode, add the dep.
|
||||
|
||||
## Rollback
|
||||
|
||||
To disable instantly: set `CRM_GMAIL_INTEGRATION_ENABLED=false` and restart.
|
||||
The scheduler won't start, routes return 503, DB tables remain (unused).
|
||||
|
||||
To remove completely: drop the env var, delete `data/email_attachments/`,
|
||||
drop all `emails_*` tables and `email_*` tables (migration is idempotent
|
||||
create-only; a separate drop script would be required — not provided in
|
||||
Phase 1, since removal was not considered urgent).
|
||||
|
||||
## Local development
|
||||
|
||||
The module has zero network dependencies when imported without the scheduler
|
||||
starting. You can:
|
||||
|
||||
```bash
|
||||
python3 -c "from email_integration.parser import parse; \
|
||||
import json; \
|
||||
print(parse(json.load(open('fixture.json'))))"
|
||||
```
|
||||
|
||||
## Testing checklist (before enabling in production)
|
||||
|
||||
- [ ] Enable `CRM_GMAIL_INTEGRATION_ENABLED=true` on a staging copy of the DB only.
|
||||
- [ ] Verify migrations applied: `emails`, `email_accounts`, etc. present.
|
||||
- [ ] Enroll one account (yours) via `/api/email/accounts/enroll`.
|
||||
- [ ] Trigger `POST /api/email/sync/run-now`.
|
||||
- [ ] Check `email_sync_runs` for `status='ok'`.
|
||||
- [ ] Spot-check `emails` rows against Gmail.
|
||||
- [ ] Verify an attachment downloaded correctly (hash and size).
|
||||
- [ ] Let the scheduler run for 24 hours; monitor `/api/email/status`.
|
||||
- [ ] Enroll remaining 4 teammates.
|
||||
|
||||
## What's scaffolded vs. TODO
|
||||
|
||||
**Scaffolded and complete:**
|
||||
- Schema (migration 0001)
|
||||
- Config and env parsing
|
||||
- Error taxonomy + retry classifier
|
||||
- AES-GCM crypto helpers
|
||||
- DWD JWT minting + access token caching
|
||||
- OAuth refresh + consent flow endpoints
|
||||
- Gmail client (list/get/history/attachments/profile) with rate limit + retry
|
||||
- Full DB data-access layer
|
||||
- MIME parser including RFC 2047 subjects and HTML→text fallback
|
||||
- Investor matcher with exact + domain strategies
|
||||
- Thread resolution (Gmail threadId + RFC References cross-account)
|
||||
- Attachment storage with SHA-256 dedup
|
||||
- Sync orchestrator (backfill + incremental with history-expired fallback)
|
||||
- Scheduler with manual-trigger hook
|
||||
- HTTP routes (status, accounts, threads, enroll, run-now, rematch, oauth)
|
||||
|
||||
**TODO before production (see architecture doc §15):**
|
||||
- Multipart batch metadata fetch in `gmail_client.batch_get_metadata`
|
||||
(currently serial fallback).
|
||||
- Unit tests (fixtures for parser, matcher, threads; integration tests with
|
||||
responses-style HTTP mock).
|
||||
- Frontend UI: a thread list + detail pane in `frontend/index.html`.
|
||||
- Sandboxed HTML rendering for email bodies (out of scope here).
|
||||
@@ -0,0 +1,15 @@
|
||||
"""
|
||||
Gmail Integration for Venture CRM.
|
||||
|
||||
Phase 1 scope: OAuth2/DWD authentication, incremental Gmail sync, MIME parsing,
|
||||
investor matching, threading, attachment storage. All logic isolated to this
|
||||
module; server.py integration is three small patches guarded by
|
||||
CRM_GMAIL_INTEGRATION_ENABLED.
|
||||
|
||||
See GMAIL_INTEGRATION_ARCHITECTURE.md at the repo root for full design.
|
||||
"""
|
||||
|
||||
from . import config # noqa: F401
|
||||
from . import errors # noqa: F401
|
||||
|
||||
__all__ = ["config", "errors"]
|
||||
@@ -0,0 +1,234 @@
|
||||
"""
|
||||
Attachment download + on-disk storage.
|
||||
|
||||
Two usage patterns:
|
||||
|
||||
1. During message parsing we call `register_stubs(conn, email_id, parsed)`
|
||||
which writes pending rows to email_attachments.
|
||||
|
||||
2. A separate worker (kicked off by sync after each account completes)
|
||||
calls `drain_pending()` which fetches attachment bytes from Gmail and
|
||||
writes them to disk under CONFIG.attachments_dir.
|
||||
|
||||
Files are named: <CRM_DATA_DIR>/email_attachments/<email_id[:2]>/<email_id>/<attachment_id>-<sanitized_filename>
|
||||
|
||||
Sanitization prevents path traversal and keeps cross-platform-safe names.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
from typing import Iterable, Optional
|
||||
|
||||
from . import config as _cfg
|
||||
from . import db as _db
|
||||
from . import errors as _errors
|
||||
from . import gmail_client as _gmail
|
||||
|
||||
|
||||
_MAX_FILENAME_LEN = 200
|
||||
_BAD_FILENAME_CHARS = re.compile(r'[/\\\x00-\x1f\x7f:*?"<>|]+')
|
||||
|
||||
|
||||
def _sanitize_filename(name: str) -> str:
|
||||
if not name:
|
||||
return "unnamed.bin"
|
||||
# strip path components first
|
||||
name = os.path.basename(name.replace("\\", "/"))
|
||||
name = _BAD_FILENAME_CHARS.sub("_", name).strip(" .")
|
||||
if not name:
|
||||
name = "unnamed.bin"
|
||||
if len(name) > _MAX_FILENAME_LEN:
|
||||
stem, dot, ext = name.rpartition(".")
|
||||
if dot:
|
||||
name = stem[: _MAX_FILENAME_LEN - len(ext) - 1] + "." + ext
|
||||
else:
|
||||
name = name[:_MAX_FILENAME_LEN]
|
||||
return name
|
||||
|
||||
|
||||
def _storage_path_for(email_id: str, attachment_id: str, sanitized_filename: str) -> str:
    """Build the on-disk path for an attachment and create its directory.

    Layout: <attachments_dir>/<first two chars of email_id>/<email_id>/
    <attachment_id>-<sanitized_filename>. An empty email_id falls into the
    "_0" bucket. The containing directory is created as a side effect.
    """
    shard = email_id[:2]
    if not shard:
        shard = "_0"
    target_dir = os.path.join(_cfg.CONFIG.attachments_dir, shard, email_id)
    os.makedirs(target_dir, exist_ok=True)
    leaf = f"{attachment_id}-{sanitized_filename}"
    return os.path.join(target_dir, leaf)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- phase 1: register stubs
|
||||
|
||||
def register_stubs(conn: sqlite3.Connection, *, email_id: str,
                   parsed_attachments: Iterable[dict]) -> list[str]:
    """Insert one `email_attachments` row per parsed attachment.

    For each attachment dict a stub row is inserted with a provisional
    storage path. Then:

    * if the reported `size` is an int above the configured maximum, the row
      is marked 'skipped' and nothing else happens for it;
    * if the dict carries `inline_data_b64`, the bytes are decoded, written
      to disk immediately and the row is marked downloaded (any failure here
      marks the row failed instead of raising).

    Returns the ids of all rows inserted, in input order.
    """
    size_cap = _cfg.CONFIG.max_attachment_mb * 1024 * 1024
    created: list[str] = []

    for item in parsed_attachments:
        original_name = item.get("filename") or "unnamed.bin"
        safe_name = _sanitize_filename(original_name)
        gmail_id = item.get("gmail_attachment_id") or ""
        reported_size = item.get("size")

        # The path is recorded regardless of download outcome; missing files
        # surface through download_status.
        provisional_path = _storage_path_for(
            email_id, gmail_id or att_row_id_fallback(), safe_name
        )
        row_id = _db.insert_attachment_stub(
            conn,
            email_id=email_id,
            gmail_attachment_id=gmail_id,
            filename=original_name,
            sanitized_filename=safe_name,
            mime_type=item.get("mime_type"),
            size_bytes=reported_size,
            storage_path=provisional_path,
        )
        created.append(row_id)

        # Oversize guard.
        too_big = isinstance(reported_size, int) and reported_size > size_cap
        if too_big:
            conn.execute(
                "UPDATE email_attachments SET download_status = 'skipped', "
                "download_error = ? WHERE id = ?",
                (f"exceeds max size {_cfg.CONFIG.max_attachment_mb}MB", row_id),
            )
            continue

        # Inline data fast-path: bytes arrived with the message itself.
        payload_b64 = item.get("inline_data_b64")
        if not payload_b64:
            continue
        try:
            blob = base64.urlsafe_b64decode(_pad(payload_b64).encode("ascii"))
            final_path = _storage_path_for(email_id, row_id, safe_name)
            _write_bytes(final_path, blob)
            digest = hashlib.sha256(blob).hexdigest()
            conn.execute(
                "UPDATE email_attachments SET storage_path = ? WHERE id = ?",
                (final_path, row_id),
            )
            _db.mark_attachment_downloaded(
                conn, row_id, sha256_hex=digest, size_bytes=len(blob)
            )
        except Exception as e:
            _db.mark_attachment_failed(conn, row_id, error=f"inline decode: {e}")

    return created
|
||||
|
||||
|
||||
def att_row_id_fallback() -> str:
    """Return a random 32-character hex token.

    Used as a stand-in path component when an attachment has no Gmail
    attachment id at stub time, so the path template still yields a unique
    name.
    """
    from uuid import uuid4

    token = uuid4()
    return token.hex
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- phase 2: worker
|
||||
|
||||
def drain_pending(conn_factory, client: _gmail.GmailClient, account_id: str,
                  *, limit: int = 50) -> int:
    """Download pending attachments belonging to `account_id`.

    Reads up to `limit` pending rows on a short-lived connection, then
    processes each row that belongs to `account_id` on its own connection,
    committing after every row. Returns how many downloads succeeded.

    NOTE(review): `limit` is applied before the account filter, so pending
    rows of other accounts can use up the batch — confirm whether
    `_db.pending_attachments` can filter by account instead.
    """
    listing_conn = conn_factory()
    try:
        candidates = _db.pending_attachments(listing_conn, limit=limit)
    finally:
        listing_conn.close()

    succeeded = 0
    # Lazy filter: rows are inspected one at a time, just before processing.
    for row in (r for r in candidates if r["account_id"] == account_id):
        work_conn = conn_factory()
        try:
            if _download_one(work_conn, client, row):
                succeeded += 1
            work_conn.commit()
        finally:
            work_conn.close()
    return succeeded
|
||||
|
||||
|
||||
def _download_one(conn: sqlite3.Connection, client: _gmail.GmailClient, row) -> bool:
    """Fetch, decode and store a single pending attachment.

    On any handled failure (retryable or other Gmail error, empty payload,
    base64 decode error, disk write error) the row is marked failed and False
    is returned. On success the row's storage_path is updated, the row is
    marked downloaded with its SHA-256 and size, and True is returned.

    If another downloaded row already has the same SHA-256, its storage_path
    is reused and no new file is written.
    """
    att_id = row["id"]

    def _fail(reason: str) -> bool:
        _db.mark_attachment_failed(conn, att_id, error=reason)
        return False

    try:
        resp = client.get_attachment(row["gmail_message_id"], row["gmail_attachment_id"])
    except _errors.RETRYABLE as exc:
        return _fail(f"transient: {type(exc).__name__}")
    except _errors.GmailError as exc:
        return _fail(f"{type(exc).__name__}: {exc}")

    encoded = resp.get("data")
    if not encoded:
        return _fail("empty data in response")

    try:
        raw = base64.urlsafe_b64decode(_pad(encoded).encode("ascii"))
    except Exception as exc:
        return _fail(f"decode: {exc}")

    digest = hashlib.sha256(raw).hexdigest()
    twin = _find_existing_by_sha(conn, digest, exclude_id=att_id)
    if twin:
        # Identical content already on disk: point at it, skip the write.
        final_path = twin["storage_path"]
    else:
        final_path = _storage_path_for(row["email_id"], att_id, row["sanitized_filename"])
        try:
            _write_bytes(final_path, raw)
        except OSError as exc:
            return _fail(f"disk: {exc}")

    conn.execute(
        "UPDATE email_attachments SET storage_path = ? WHERE id = ?",
        (final_path, att_id),
    )
    _db.mark_attachment_downloaded(conn, att_id, sha256_hex=digest, size_bytes=len(raw))
    return True
|
||||
|
||||
|
||||
def _find_existing_by_sha(conn: sqlite3.Connection, sha: str, *, exclude_id: str) -> Optional[sqlite3.Row]:
|
||||
cur = conn.cursor()
|
||||
cur.execute(
|
||||
"SELECT * FROM email_attachments WHERE sha256_hex = ? AND id != ? "
|
||||
"AND download_status = 'downloaded' LIMIT 1",
|
||||
(sha, exclude_id),
|
||||
)
|
||||
return cur.fetchone()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- utils
|
||||
|
||||
def _pad(b64: str) -> str:
|
||||
pad = 4 - (len(b64) % 4)
|
||||
return b64 + ("=" * pad if pad != 4 else "")
|
||||
|
||||
|
||||
def _write_bytes(path: str, data: bytes) -> None:
|
||||
os.makedirs(os.path.dirname(path), exist_ok=True)
|
||||
tmp = path + ".tmp"
|
||||
with open(tmp, "wb") as f:
|
||||
f.write(data)
|
||||
os.chmod(tmp, 0o600)
|
||||
os.replace(tmp, path)
|
||||
@@ -0,0 +1,112 @@
|
||||
"""
|
||||
Email integration configuration.
|
||||
|
||||
Reads from the same env-var surface as the rest of the CRM (server.py style),
|
||||
no pydantic/dotenv magic — stdlib only.
|
||||
"""
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
# Reuse the CRM's data dir so backups and email storage live together.
|
||||
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
_DEFAULT_DATA_DIR = os.path.join(_PROJECT_DIR, "data")
|
||||
|
||||
|
||||
def _bool_env(name: str, default: bool = False) -> bool:
|
||||
v = os.environ.get(name, "").strip().lower()
|
||||
if v in ("1", "true", "yes", "on"):
|
||||
return True
|
||||
if v in ("0", "false", "no", "off"):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def _int_env(name: str, default: int) -> int:
|
||||
try:
|
||||
return int(os.environ.get(name, str(default)))
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EmailConfig:
    """Immutable settings snapshot for the Gmail integration."""

    # --- feature switch ----------------------------------------------------
    # When False the scheduler is not started and the routes answer 503.
    # Schema migrations are applied either way so the tables are ready.
    enabled: bool

    # --- authentication ----------------------------------------------------
    # "dwd" (service account with domain-wide delegation, the default) or
    # "oauth" (per-user refresh tokens, the pluggable fallback).
    primary_auth: str

    # Domain-wide-delegation settings.
    dwd_key_path: Optional[str]
    workspace_domain: Optional[str]

    # OAuth settings (fallback path and admin UI).
    oauth_client_id: Optional[str]
    oauth_client_secret: Optional[str]
    oauth_redirect_uri: Optional[str]

    # Base64 key used to encrypt OAuth refresh tokens at rest. Needed
    # whenever the OAuth path is used; the DWD path stores no tokens.
    secret_key_b64: Optional[str]

    # --- sync scheduling ---------------------------------------------------
    sync_interval_sec: int
    backfill_page_size: int
    max_attachment_mb: int
    max_parallel_attachment_downloads: int

    # --- storage locations -------------------------------------------------
    data_dir: str
    attachments_dir: str
    secrets_dir: str

    # --- rate limiting and retry -------------------------------------------
    rate_limit_units_per_sec_per_account: int
    retry_max_attempts: int
    retry_initial_delay_sec: float
    retry_max_delay_sec: float

    # --- history retention -------------------------------------------------
    # Falling this many days behind switches sync to a date-based backfill,
    # because Gmail may already have pruned the history records.
    history_stale_days: int
|
||||
|
||||
|
||||
def _float_env(name: str, default: float) -> float:
    """Read environment variable `name` as a float.

    Returns `default` when the variable is unset or cannot be parsed, so that
    a malformed value does not raise — the same tolerance `_int_env` gives
    integer settings.
    """
    try:
        return float(os.environ.get(name, str(default)))
    except (TypeError, ValueError):
        return default


def load() -> EmailConfig:
    """Build an `EmailConfig` from the current process environment.

    Every setting comes from a `CRM_GMAIL_*` variable, except the data
    directory, which comes from `CRM_DATA_DIR`. Unset or empty string
    settings become None. Malformed numeric settings fall back to their
    defaults instead of raising; this matters because `load()` runs at
    import time. The sync interval is given in minutes and stored in seconds.
    """
    data_dir = os.environ.get("CRM_DATA_DIR", _DEFAULT_DATA_DIR)
    return EmailConfig(
        enabled=_bool_env("CRM_GMAIL_INTEGRATION_ENABLED", False),
        # Tolerate stray whitespace/case so " DWD " still selects DWD.
        primary_auth=os.environ.get("CRM_GMAIL_AUTH_METHOD", "dwd").strip().lower(),
        dwd_key_path=os.environ.get("CRM_GMAIL_SA_KEY_PATH") or None,
        workspace_domain=os.environ.get("CRM_GMAIL_WORKSPACE_DOMAIN") or None,
        oauth_client_id=os.environ.get("CRM_GMAIL_OAUTH_CLIENT_ID") or None,
        oauth_client_secret=os.environ.get("CRM_GMAIL_OAUTH_CLIENT_SECRET") or None,
        oauth_redirect_uri=os.environ.get("CRM_GMAIL_OAUTH_REDIRECT_URI") or None,
        secret_key_b64=os.environ.get("CRM_GMAIL_SECRET_KEY") or None,
        sync_interval_sec=_int_env("CRM_GMAIL_SYNC_INTERVAL_MIN", 180) * 60,
        backfill_page_size=_int_env("CRM_GMAIL_BACKFILL_PAGE_SIZE", 500),
        max_attachment_mb=_int_env("CRM_GMAIL_MAX_ATTACHMENT_MB", 50),
        max_parallel_attachment_downloads=_int_env("CRM_GMAIL_ATTACH_CONCURRENCY", 4),
        data_dir=data_dir,
        attachments_dir=os.path.join(data_dir, "email_attachments"),
        secrets_dir=os.path.join(data_dir, "secrets"),
        rate_limit_units_per_sec_per_account=_int_env("CRM_GMAIL_RATE_UNITS_SEC", 150),
        retry_max_attempts=_int_env("CRM_GMAIL_RETRY_MAX", 5),
        retry_initial_delay_sec=_float_env("CRM_GMAIL_RETRY_INITIAL_SEC", 1.0),
        retry_max_delay_sec=_float_env("CRM_GMAIL_RETRY_MAX_SEC", 60.0),
        history_stale_days=_int_env("CRM_GMAIL_HISTORY_STALE_DAYS", 5),
    )
|
||||
|
||||
|
||||
# Module-level singleton, built once at import time. Call `reload_config()`
# to rebuild it after the environment changes (mostly useful in tests).
CONFIG = load()


def reload_config() -> EmailConfig:
    """Re-read the environment, replace the module-level CONFIG, and return it."""
    global CONFIG
    refreshed = load()
    CONFIG = refreshed
    return refreshed
|
||||
@@ -0,0 +1,297 @@
|
||||
"""
|
||||
Credential providers for Gmail API access.
|
||||
|
||||
Two implementations behind a common protocol:
|
||||
|
||||
- DWDCredentialProvider: signs a JWT with the Workspace-authorized service
|
||||
account, exchanges for a short-lived access token that impersonates a
|
||||
specific user. No per-user persistent state.
|
||||
|
||||
- OAuthCredentialProvider: uses a per-user refresh token (stored encrypted
|
||||
in email_accounts.oauth_refresh_enc) to mint access tokens. Supports the
|
||||
'connect Gmail' UI flow.
|
||||
|
||||
Both provide the same interface:
|
||||
|
||||
provider.access_token_for(email_address: str) -> AccessToken
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Protocol
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
from . import config as _cfg
|
||||
from . import crypto
|
||||
from . import errors
|
||||
|
||||
|
||||
# OAuth scope placed in the JWT `scope` claim: read-only Gmail access.
GMAIL_READONLY_SCOPE = "https://www.googleapis.com/auth/gmail.readonly"
# Google's token endpoint; used both as the POST target and as the JWT `aud`
# claim when minting DWD tokens.
GOOGLE_TOKEN_URL = "https://oauth2.googleapis.com/token"
|
||||
|
||||
|
||||
@dataclass
class AccessToken:
    """A bearer token together with the moment it stops being usable."""

    # The opaque bearer token string.
    token: str
    # Expiry as a Unix timestamp (seconds since the epoch).
    expires_at: float
|
||||
|
||||
|
||||
class CredentialProvider(Protocol):
    """Structural interface shared by the credential providers in this module."""

    def access_token_for(self, email_address: str) -> AccessToken:
        """Return an access token for the given mailbox."""

    def revoke(self, email_address: str) -> None:
        """Discard or invalidate the credentials held for the given mailbox."""
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Domain-wide delegation
|
||||
# ============================================================================
|
||||
|
||||
class DWDCredentialProvider:
    """Mint per-user access tokens with a service-account JWT bearer grant.

    The service-account key file is read once at construction. Tokens are
    cached in memory per email address and reused until they are within 60
    seconds of expiry. No per-user state is persisted.
    """

    def __init__(self, key_path: str):
        with open(key_path, "r") as handle:
            self._key = json.load(handle)
        self._client_email = self._key["client_email"]
        self._private_key_pem = self._key["private_key"].encode("utf-8")
        self._cache: dict[str, AccessToken] = {}
        self._lock = threading.Lock()

    def access_token_for(self, email_address: str) -> AccessToken:
        """Return a cached token for `email_address`, minting one if needed.

        Minting happens while the lock is held, so concurrent callers are
        serialized.
        """
        with self._lock:
            token = self._cache.get(email_address)
            if not (token and token.expires_at - time.time() > 60):
                token = self._mint(email_address)
                self._cache[email_address] = token
            return token

    def revoke(self, email_address: str) -> None:
        """Forget the cached token so the next request mints a fresh one.

        DWD tokens expire by themselves in under an hour; real revocation
        is done in the Admin console.
        """
        with self._lock:
            self._cache.pop(email_address, None)

    # ------------------------------------------------------------------ helpers

    def _signed_assertion(self, subject_email: str) -> bytes:
        """Build and RS256-sign the JWT that impersonates `subject_email`.

        Raises errors.AuthError when the `cryptography` package is missing.
        """
        try:
            from cryptography.hazmat.primitives import hashes, serialization  # type: ignore
            from cryptography.hazmat.primitives.asymmetric import padding  # type: ignore
        except ImportError as e:  # pragma: no cover
            raise errors.AuthError(
                "DWD requires the `cryptography` package. Add to requirements.txt."
            ) from e

        issued_at = int(time.time())
        header = {"alg": "RS256", "typ": "JWT"}
        claims = {
            "iss": self._client_email,
            "sub": subject_email,
            "scope": GMAIL_READONLY_SCOPE,
            "aud": GOOGLE_TOKEN_URL,
            "iat": issued_at,
            "exp": issued_at + 3600,
        }
        signing_input = b".".join((_b64url(_json(header)), _b64url(_json(claims))))

        signer = serialization.load_pem_private_key(self._private_key_pem, password=None)
        signature = signer.sign(signing_input, padding.PKCS1v15(), hashes.SHA256())
        return signing_input + b"." + _b64url(signature)

    def _mint(self, subject_email: str) -> AccessToken:
        """Exchange a signed JWT for an access token at Google's token endpoint.

        HTTP error responses are converted with errors.classify_http. A
        success response without `access_token` raises errors.AuthError. The
        stored expiry is 30 seconds earlier than the server reported.
        """
        assertion = self._signed_assertion(subject_email)

        form = urllib.parse.urlencode({
            "grant_type": "urn:ietf:params:oauth:grant-type:jwt-bearer",
            "assertion": assertion.decode("ascii"),
        }).encode("ascii")
        request = urllib.request.Request(
            GOOGLE_TOKEN_URL,
            data=form,
            headers={"Content-Type": "application/x-www-form-urlencoded"},
        )

        try:
            with urllib.request.urlopen(request, timeout=15) as resp:
                payload = json.loads(resp.read())
        except urllib.error.HTTPError as http_err:
            error_text = http_err.read().decode("utf-8", errors="replace")
            try:
                detail = json.loads(error_text)
            except Exception:
                # Non-JSON error body: hand the raw text to the classifier.
                detail = {"raw": error_text}
            raise errors.classify_http(http_err.code, detail)

        if "access_token" not in payload:
            raise errors.AuthError("DWD token exchange returned no access_token", payload=payload)

        lifetime = float(payload.get("expires_in", 3600))
        return AccessToken(
            token=payload["access_token"],
            expires_at=time.time() + lifetime - 30,
        )
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Per-user OAuth (fallback)
|
||||
# ============================================================================
|
||||
|
||||
class OAuthCredentialProvider:
    """Per-user OAuth credential provider backed by the email_accounts table.

    Serves the cached access token while it has more than 60 seconds of life
    left; otherwise exchanges the stored refresh token for a new access token
    and caches it. Refresh tokens are obtained via the consent-flow routes in
    routes.py and stored in email_accounts.oauth_refresh_enc; both token
    columns are encrypted and decrypted through the ``crypto`` module.
    """

    def __init__(self, db_conn_factory, client_id: str, client_secret: str, secret_key_b64: str):
        """Store configuration; no I/O happens here.

        Args:
            db_conn_factory: Zero-argument callable returning a DB connection.
                Every helper opens its own connection and closes it again.
                Rows are indexed by column name, so the factory presumably
                sets a name-addressable row factory — verify against caller.
            client_id: OAuth client id sent on refresh.
            client_secret: OAuth client secret sent on refresh.
            secret_key_b64: Base64 key material handed to ``crypto``.
        """
        self._db = db_conn_factory
        self._client_id = client_id
        self._client_secret = client_secret
        self._secret_key_b64 = secret_key_b64
        # Serializes access_token_for() so lookups and refreshes do not interleave.
        self._lock = threading.Lock()

    def access_token_for(self, email_address: str) -> AccessToken:
        """Return a usable access token for ``email_address``.

        Raises:
            errors.AuthError: no account row, or no refresh token stored.
        """
        with self._lock:
            row = self._load_account(email_address)
            if row is None:
                raise errors.AuthError(f"no email_accounts row for {email_address}")
            # Cached access token still valid?
            if row["oauth_token_enc"] and row["oauth_token_exp"]:
                try:
                    exp = float(row["oauth_token_exp"])
                except ValueError:
                    # Unparseable expiry is treated as already expired.
                    exp = 0.0
                if exp - time.time() > 60:
                    token = crypto.decrypt(
                        row["oauth_token_enc"], secret_key_b64=self._secret_key_b64
                    ).decode("ascii")
                    return AccessToken(token=token, expires_at=exp)
            # Refresh.
            return self._refresh(email_address, row)

    def revoke(self, email_address: str) -> None:
        """Revoke the stored refresh token remotely (best effort) and wipe it locally.

        Does nothing when the account row or its refresh token is missing.
        """
        row = self._load_account(email_address)
        if not row or not row["oauth_refresh_enc"]:
            return
        refresh = crypto.decrypt(
            row["oauth_refresh_enc"], secret_key_b64=self._secret_key_b64
        ).decode("ascii")
        body = urllib.parse.urlencode({"token": refresh}).encode("ascii")
        req = urllib.request.Request(
            "https://oauth2.googleapis.com/revoke",
            data=body,
            headers={"Content-Type": "application/x-www-form-urlencoded"},
        )
        try:
            # Context manager closes the HTTP response instead of leaking it.
            with urllib.request.urlopen(req, timeout=10) as resp:
                resp.read()
        except Exception:
            pass  # best effort; we zero locally regardless
        self._zero_account(email_address)

    # ------------------------------------------------------------------ helpers

    def _refresh(self, email_address: str, row) -> AccessToken:
        """Exchange the stored refresh token for a new access token and cache it.

        Raises:
            errors.AuthError: no refresh token stored, or the response carries
                no ``access_token``.
            errors.GmailError: subclass chosen by ``errors.classify_http`` when
                the token endpoint answers with an HTTP error.
        """
        if not row["oauth_refresh_enc"]:
            raise errors.AuthError(f"no refresh token stored for {email_address}")
        refresh = crypto.decrypt(
            row["oauth_refresh_enc"], secret_key_b64=self._secret_key_b64
        ).decode("ascii")
        body = urllib.parse.urlencode({
            "grant_type": "refresh_token",
            "refresh_token": refresh,
            "client_id": self._client_id,
            "client_secret": self._client_secret,
        }).encode("ascii")
        req = urllib.request.Request(
            GOOGLE_TOKEN_URL,
            data=body,
            headers={"Content-Type": "application/x-www-form-urlencoded"},
        )
        try:
            with urllib.request.urlopen(req, timeout=15) as resp:
                payload = json.loads(resp.read())
        except urllib.error.HTTPError as e:
            body_text = e.read().decode("utf-8", errors="replace")
            try:
                payload = json.loads(body_text)
            except Exception:
                # Non-JSON error body: keep the raw text for diagnostics.
                payload = {"raw": body_text}
            raise errors.classify_http(e.code, payload)

        if "access_token" not in payload:
            raise errors.AuthError("OAuth refresh returned no access_token", payload=payload)

        token_str = payload["access_token"]
        # Shave 30 s off the advertised lifetime as a safety margin.
        exp = time.time() + float(payload.get("expires_in", 3600)) - 30
        enc_token = crypto.encrypt(token_str.encode("ascii"), secret_key_b64=self._secret_key_b64)
        self._save_token(email_address, enc_token, exp)
        return AccessToken(token=token_str, expires_at=exp)

    def _load_account(self, email_address: str):
        """Fetch the token columns for one account, or None if there is no row."""
        conn = self._db()
        try:
            cur = conn.cursor()
            cur.execute(
                "SELECT id, oauth_refresh_enc, oauth_token_enc, oauth_token_exp "
                "FROM email_accounts WHERE email_address = ?",
                (email_address,),
            )
            return cur.fetchone()
        finally:
            conn.close()

    def _save_token(self, email_address: str, enc_token: bytes, exp: float):
        """Persist the encrypted access token and its expiry (stored as text)."""
        conn = self._db()
        try:
            conn.execute(
                "UPDATE email_accounts SET oauth_token_enc = ?, oauth_token_exp = ?, "
                "updated_at = datetime('now') WHERE email_address = ?",
                (enc_token, str(exp), email_address),
            )
            conn.commit()
        finally:
            conn.close()

    def _zero_account(self, email_address: str):
        """Clear all stored tokens and pause syncing for the account."""
        conn = self._db()
        try:
            conn.execute(
                "UPDATE email_accounts SET oauth_refresh_enc = NULL, oauth_token_enc = NULL, "
                "oauth_token_exp = NULL, sync_enabled = 0, sync_status = 'paused', "
                "updated_at = datetime('now') WHERE email_address = ?",
                (email_address,),
            )
            conn.commit()
        finally:
            conn.close()
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Factory — resolves CONFIG.primary_auth to a concrete provider
|
||||
# ============================================================================
|
||||
|
||||
def build_provider(db_conn_factory) -> CredentialProvider:
    """Build the credential provider selected by ``CONFIG.primary_auth``.

    ``"dwd"`` yields a DWDCredentialProvider for the configured key path;
    ``"oauth"`` yields an OAuthCredentialProvider bound to ``db_conn_factory``.

    Raises:
        errors.AuthError: the key file is missing, an OAuth setting is empty,
            or ``primary_auth`` is not one of the two known modes.
    """
    config = _cfg.CONFIG
    mode = config.primary_auth

    if mode == "dwd":
        key_path = config.dwd_key_path
        if key_path and os.path.exists(key_path):
            return DWDCredentialProvider(key_path)
        raise errors.AuthError(
            f"CRM_GMAIL_SA_KEY_PATH not found: {key_path!r}"
        )

    if mode == "oauth":
        oauth_settings = (
            config.oauth_client_id,
            config.oauth_client_secret,
            config.secret_key_b64,
        )
        if not all(oauth_settings):
            raise errors.AuthError(
                "OAuth mode requires CRM_GMAIL_OAUTH_CLIENT_ID, "
                "CRM_GMAIL_OAUTH_CLIENT_SECRET, and CRM_GMAIL_SECRET_KEY."
            )
        return OAuthCredentialProvider(db_conn_factory, *oauth_settings)

    raise errors.AuthError(f"unknown primary_auth: {mode!r}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- utils
|
||||
|
||||
def _b64url(data: bytes) -> bytes:
    """Return ``data`` as URL-safe base64 with trailing ``=`` padding removed."""
    padded = base64.urlsafe_b64encode(data)
    return padded.rstrip(b"=")
|
||||
|
||||
|
||||
def _json(obj) -> bytes:
    """Serialize ``obj`` to compact JSON (no spaces after separators) as UTF-8 bytes."""
    compact = json.dumps(obj, separators=(",", ":"))
    return compact.encode("utf-8")
|
||||
@@ -0,0 +1,79 @@
|
||||
"""
|
||||
AES-256-GCM encryption for OAuth tokens (refresh tokens and cached access tokens) at rest.
|
||||
|
||||
Key material comes from CONFIG.secret_key_b64 (env: CRM_GMAIL_SECRET_KEY).
|
||||
Must be at least 32 bytes of entropy, base64-encoded.
|
||||
|
||||
Storage format (as stored in BLOB columns):
|
||||
version(1 byte) || nonce(12 bytes) || ciphertext+tag(N bytes)
|
||||
|
||||
version = 1 for AES-GCM-256.
|
||||
|
||||
Uses the `cryptography` library. If not available (optional at scaffold time),
|
||||
the OAuth fallback path is disabled with a clear error — DWD path is unaffected.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import os
|
||||
import secrets
|
||||
from typing import Optional
|
||||
|
||||
try:
|
||||
from cryptography.hazmat.primitives.ciphers.aead import AESGCM # type: ignore
|
||||
_AVAILABLE = True
|
||||
except ImportError: # pragma: no cover
|
||||
AESGCM = None # type: ignore
|
||||
_AVAILABLE = False
|
||||
|
||||
|
||||
# Format version byte prepended to every stored blob (1 = AES-256-GCM).
VERSION = 1
# Length in bytes of the random nonce that follows the version byte.
NONCE_LEN = 12
|
||||
|
||||
|
||||
class CryptoUnavailable(RuntimeError):
    """Token encryption cannot be used: the key or the crypto library is missing or invalid."""
|
||||
|
||||
|
||||
def _load_key(secret_key_b64: Optional[str]) -> bytes:
    """Decode the configured secret and return the 32-byte AES-256 key.

    Only the first 32 decoded bytes are used; longer secrets are truncated.

    Raises:
        CryptoUnavailable: the secret is unset, fails to decode as base64, or
            decodes to fewer than 32 bytes.
    """
    if secret_key_b64:
        try:
            raw = base64.b64decode(secret_key_b64)
        except Exception as exc:
            raise CryptoUnavailable(f"CRM_GMAIL_SECRET_KEY not valid base64: {exc}") from exc
        if len(raw) >= 32:
            return raw[:32]  # AES-256
        raise CryptoUnavailable(
            f"CRM_GMAIL_SECRET_KEY decodes to {len(raw)} bytes; need >= 32."
        )
    raise CryptoUnavailable(
        "CRM_GMAIL_SECRET_KEY not set; cannot encrypt/decrypt OAuth tokens. "
        "DWD auth does not require this."
    )
|
||||
|
||||
|
||||
def encrypt(plaintext: bytes, *, secret_key_b64: Optional[str]) -> bytes:
    """Encrypt ``plaintext`` and return ``version || nonce || ciphertext+tag``.

    A fresh random nonce of NONCE_LEN bytes is drawn on every call.

    Raises:
        CryptoUnavailable: the cryptography library is missing or the key is unusable.
    """
    if not _AVAILABLE:
        raise CryptoUnavailable("cryptography library not installed")
    aes_key = _load_key(secret_key_b64)
    fresh_nonce = secrets.token_bytes(NONCE_LEN)
    sealed = AESGCM(aes_key).encrypt(fresh_nonce, plaintext, None)
    return b"".join((bytes([VERSION]), fresh_nonce, sealed))
|
||||
|
||||
|
||||
def decrypt(blob: bytes, *, secret_key_b64: Optional[str]) -> bytes:
    """Decrypt a blob produced by ``encrypt`` and return the plaintext.

    The blob layout is ``version(1) || nonce(NONCE_LEN) || ciphertext+tag``.

    Raises:
        CryptoUnavailable: the cryptography library is missing or the key is unusable.
        ValueError: the blob is shorter than header + 16 bytes, or carries an
            unsupported version byte.
    """
    if not _AVAILABLE:
        raise CryptoUnavailable("cryptography library not installed")

    header_len = 1 + NONCE_LEN
    # 16 bytes is the minimum payload accepted after the header.
    if not blob or len(blob) < header_len + 16:
        raise ValueError("ciphertext too short")

    version = blob[0]
    if version != VERSION:
        raise ValueError(f"unsupported crypto version: {version}")

    nonce, sealed = blob[1:header_len], blob[header_len:]
    aes_key = _load_key(secret_key_b64)
    return AESGCM(aes_key).decrypt(nonce, sealed, None)
|
||||
|
||||
|
||||
def generate_secret_key_b64() -> str:
    """Setup helper: return a fresh base64-encoded 32-byte key for the env var."""
    random_key = os.urandom(32)
    encoded = base64.b64encode(random_key)
    return encoded.decode("ascii")
|
||||
@@ -0,0 +1,416 @@
|
||||
"""
|
||||
Data-access layer for the email_integration module.
|
||||
|
||||
All SQL touching emails_* tables lives here. Other modules call named
|
||||
helpers — they never write SQL inline. This keeps schema changes contained.
|
||||
|
||||
Connection pattern matches server.py get_db():
|
||||
- WAL mode, foreign keys on, busy_timeout
|
||||
- sqlite3.Row row_factory
|
||||
The caller is responsible for committing / closing.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Iterable, Optional
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ migrations
|
||||
|
||||
def apply_migrations(cursor: sqlite3.Cursor) -> None:
    """Apply all forward .sql migration files in migrations/ in lexicographic order.

    Called from server.init_db(). There is no ledger of applied migrations:
    every file is executed on every call, so each file must be idempotent
    (CREATE ... IF NOT EXISTS etc.). If we ever need more complex migrations,
    add a schema_migrations table.

    Files ending in ``.down.sql`` are rollback scripts and are never executed
    here. Files are read as UTF-8 so the result does not depend on the host
    locale.

    Args:
        cursor: Cursor used to run each file via ``executescript``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    mdir = os.path.join(here, "migrations")
    if not os.path.isdir(mdir):
        return
    for name in sorted(os.listdir(mdir)):
        if not name.endswith(".sql"):
            continue
        if name.endswith(".down.sql"):
            # Rollback scripts are for manual use only; auto-running one
            # would undo the migration it is paired with.
            continue
        path = os.path.join(mdir, name)
        with open(path, "r", encoding="utf-8") as f:
            sql = f.read()
        cursor.executescript(sql)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ utils
|
||||
|
||||
def _uuid() -> str:
    """Return a new random (version 4) UUID in its canonical string form."""
    fresh = uuid.uuid4()
    return str(fresh)
|
||||
|
||||
|
||||
def _now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
def _json(v) -> str:
    """Serialize ``v`` to compact JSON text (no spaces after separators)."""
    compact_separators = (",", ":")
    return json.dumps(v, separators=compact_separators)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ email_accounts
|
||||
|
||||
def list_sync_ready_accounts(conn: sqlite3.Connection) -> list[sqlite3.Row]:
    """Return accounts eligible for syncing, least recently synced first.

    Eligible means ``sync_enabled = 1`` with status 'pending' or 'active'.
    Never-synced accounts (NULL last_synced_at) sort ahead of synced ones.
    """
    query = (
        "SELECT * FROM email_accounts "
        "WHERE sync_enabled = 1 AND sync_status IN ('pending','active') "
        "ORDER BY last_synced_at IS NOT NULL, last_synced_at"
    )
    return conn.execute(query).fetchall()
|
||||
|
||||
|
||||
def get_account_by_email(conn: sqlite3.Connection, email_address: str) -> Optional[sqlite3.Row]:
    """Return the email_accounts row for ``email_address`` (exact match), or None."""
    lookup = conn.execute(
        "SELECT * FROM email_accounts WHERE email_address = ?",
        (email_address,),
    )
    return lookup.fetchone()
|
||||
|
||||
|
||||
def upsert_account(conn: sqlite3.Connection, *, user_id: str, email_address: str,
                   auth_method: str) -> str:
    """Return the account id for ``email_address``, inserting a row if none exists.

    An existing row is returned as-is; its user_id and auth_method are not
    updated. The caller commits.
    """
    found = get_account_by_email(conn, email_address)
    if not found:
        new_id = _uuid()
        conn.execute(
            "INSERT INTO email_accounts (id, user_id, email_address, auth_method) "
            "VALUES (?, ?, ?, ?)",
            (new_id, user_id, email_address, auth_method),
        )
        return new_id
    return found["id"]
|
||||
|
||||
|
||||
def set_account_status(conn: sqlite3.Connection, account_id: str, *,
                       status: str, error: Optional[str] = None) -> None:
    """Set sync_status and sync_error for one account and bump updated_at.

    Omitting ``error`` clears any previously stored error. The caller commits.
    """
    statement = (
        "UPDATE email_accounts SET sync_status = ?, sync_error = ?, "
        "updated_at = datetime('now') WHERE id = ?"
    )
    conn.execute(statement, (status, error, account_id))
|
||||
|
||||
|
||||
def set_account_checkpoint(conn: sqlite3.Connection, account_id: str, *,
                           history_id: Optional[str] = None,
                           backfill_cursor: Optional[str] = None,
                           backfill_complete: Optional[bool] = None,
                           last_synced_at: Optional[str] = None) -> None:
    """Update whichever sync-checkpoint columns were supplied for an account.

    Arguments left as None are not touched; if all are None nothing is
    written. ``backfill_complete`` is stored as 1/0. The caller commits.
    """
    if backfill_complete is None:
        complete_flag = None
    else:
        complete_flag = 1 if backfill_complete else 0

    candidates = (
        ("last_history_id", history_id),
        ("backfill_cursor", backfill_cursor),
        ("backfill_complete", complete_flag),
        ("last_synced_at", last_synced_at),
    )
    supplied = [(column, value) for column, value in candidates if value is not None]
    if not supplied:
        return

    assignments = [f"{column} = ?" for column, _ in supplied]
    assignments.append("updated_at = datetime('now')")
    params = [value for _, value in supplied]
    params.append(account_id)
    conn.execute(f"UPDATE email_accounts SET {', '.join(assignments)} WHERE id = ?", params)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ emails
|
||||
|
||||
def find_email_by_rfc_id(conn: sqlite3.Connection, rfc_message_id: str) -> Optional[sqlite3.Row]:
    """Return the emails row whose rfc_message_id matches exactly, or None."""
    lookup = conn.execute(
        "SELECT * FROM emails WHERE rfc_message_id = ?",
        (rfc_message_id,),
    )
    return lookup.fetchone()
|
||||
|
||||
|
||||
def find_email_id_by_any_rfc_id(conn: sqlite3.Connection,
                                rfc_ids: Iterable[str]) -> Optional[str]:
    """Return the id of the earliest-sent email matching any of ``rfc_ids``.

    Falsy entries are ignored; returns None when nothing usable is supplied
    or no email matches. The connection's rows must be addressable by column
    name (sqlite3.Row).
    """
    wanted = list(filter(None, rfc_ids))
    if not wanted:
        return None
    marks = ",".join("?" * len(wanted))
    match = conn.execute(
        f"SELECT id FROM emails WHERE rfc_message_id IN ({marks}) "
        "ORDER BY sent_at ASC LIMIT 1",
        wanted,
    ).fetchone()
    if match:
        return match["id"]
    return None
|
||||
|
||||
|
||||
def insert_email(conn: sqlite3.Connection, *, parsed: dict, match_status: str) -> str:
    """Insert a fresh emails row plus its email_recipients rows. Returns email_id.

    Caller must ensure no row exists for parsed['rfc_message_id']; use
    find_email_by_rfc_id first. The caller commits.

    Args:
        parsed: Parsed message. ``rfc_message_id``, ``from_email`` and
            ``sent_at`` are required keys; everything else is optional.
            ``to`` / ``cc`` / ``bcc`` entries may be plain address strings or
            dicts with ``email`` and ``name`` keys.
        match_status: Stored verbatim; ``is_matched`` is 1 only for "matched".

    Raises:
        KeyError: a required key is missing from ``parsed``.
    """
    email_id = _uuid()
    conn.execute(
        """INSERT INTO emails
           (id, rfc_message_id, gmail_thread_id, rfc_thread_root_id, subject,
            from_email, from_name, to_emails_json, cc_emails_json, bcc_emails_json,
            reply_to, sent_at, body_text, body_html, snippet, in_reply_to,
            references_json, has_attachments, size_estimate, is_matched,
            match_status, raw_headers_json)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (
            email_id,
            parsed["rfc_message_id"],
            parsed.get("gmail_thread_id"),
            parsed.get("rfc_thread_root_id"),
            parsed.get("subject"),
            parsed["from_email"],
            parsed.get("from_name"),
            _json(parsed.get("to", [])),
            _json(parsed.get("cc", [])),
            _json(parsed.get("bcc", [])),
            parsed.get("reply_to"),
            parsed["sent_at"],
            parsed.get("body_text"),
            parsed.get("body_html"),
            parsed.get("snippet"),
            parsed.get("in_reply_to"),
            _json(parsed.get("references", [])),
            1 if parsed.get("attachments") else 0,
            parsed.get("size_estimate"),
            1 if match_status == "matched" else 0,
            match_status,
            _json(parsed.get("raw_headers", {})) if parsed.get("raw_headers") else None,
        ),
    )
    # recipients
    for kind in ("from", "to", "cc", "bcc", "reply_to"):
        addrs = []
        if kind == "from" and parsed.get("from_email"):
            addrs = [(parsed["from_email"], parsed.get("from_name"))]
        elif kind == "reply_to" and parsed.get("reply_to"):
            addrs = [(parsed["reply_to"], None)]
        else:
            # `or []` guards against a key that is present but None (for
            # example reply_to=None), which used to raise TypeError here.
            for a in parsed.get(kind) or []:
                if isinstance(a, dict):
                    addrs.append((a.get("email"), a.get("name")))
                else:
                    addrs.append((a, None))
        for address, name in addrs:
            if not address:
                continue
            conn.execute(
                "INSERT INTO email_recipients (id, email_id, address, display_name, kind) "
                "VALUES (?, ?, ?, ?, ?)",
                (_uuid(), email_id, address.lower().strip(), name, kind),
            )
    return email_id
|
||||
|
||||
|
||||
def set_email_thread(conn: sqlite3.Connection, email_id: str, thread_id: str) -> None:
    """Attach an email to a thread and bump its updated_at. The caller commits."""
    statement = "UPDATE emails SET thread_id = ?, updated_at = datetime('now') WHERE id = ?"
    conn.execute(statement, (thread_id, email_id))
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ sightings
|
||||
|
||||
def upsert_sighting(conn: sqlite3.Connection, *, email_id: str, account_id: str,
                    gmail_message_id: str, gmail_thread_id: str,
                    labels: list[str], is_sent: bool) -> None:
    """Record that an account saw an email; a conflicting existing row is left untouched.

    Uses INSERT OR IGNORE, so despite the name an existing sighting is not
    updated. Labels are stored as compact JSON. The caller commits.
    """
    sent_flag = 1 if is_sent else 0
    record = (
        _uuid(), email_id, account_id, gmail_message_id, gmail_thread_id,
        _json(labels), sent_flag,
    )
    conn.execute(
        """INSERT OR IGNORE INTO email_account_messages
           (id, email_id, account_id, gmail_message_id, gmail_thread_id,
            labels_json, is_sent)
           VALUES (?, ?, ?, ?, ?, ?, ?)""",
        record,
    )
|
||||
|
||||
|
||||
def update_sighting_labels(conn: sqlite3.Connection, *, account_id: str,
                           gmail_message_id: str, labels: list[str]) -> None:
    """Replace the stored label list for one account's sighting of a message.

    The caller commits.
    """
    labels_text = _json(labels)
    conn.execute(
        "UPDATE email_account_messages SET labels_json = ? "
        "WHERE account_id = ? AND gmail_message_id = ?",
        (labels_text, account_id, gmail_message_id),
    )
|
||||
|
||||
|
||||
def tombstone_sighting(conn: sqlite3.Connection, *, account_id: str,
                       gmail_message_id: str) -> None:
    """Soft-delete one account's sighting by stamping deleted_at with the current time.

    The caller commits.
    """
    statement = (
        "UPDATE email_account_messages SET deleted_at = datetime('now') "
        "WHERE account_id = ? AND gmail_message_id = ?"
    )
    conn.execute(statement, (account_id, gmail_message_id))
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ attachments
|
||||
|
||||
def insert_attachment_stub(conn: sqlite3.Connection, *, email_id: str,
                           gmail_attachment_id: str, filename: str,
                           sanitized_filename: str, mime_type: Optional[str],
                           size_bytes: Optional[int], storage_path: str) -> str:
    """Insert an email_attachments row describing an attachment and return its new id.

    Only the listed metadata columns are written; the remaining columns keep
    their schema defaults. The caller commits.
    """
    new_id = _uuid()
    record = (
        new_id, email_id, gmail_attachment_id, filename, sanitized_filename,
        mime_type, size_bytes, storage_path,
    )
    conn.execute(
        """INSERT INTO email_attachments
           (id, email_id, gmail_attachment_id, filename, sanitized_filename,
            mime_type, size_bytes, storage_path)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
        record,
    )
    return new_id
|
||||
|
||||
|
||||
def mark_attachment_downloaded(conn: sqlite3.Connection, attachment_id: str, *,
                               sha256_hex: str, size_bytes: int) -> None:
    """Mark an attachment 'downloaded' and record its hash, size and download time.

    The caller commits.
    """
    statement = (
        "UPDATE email_attachments SET download_status = 'downloaded', "
        "sha256_hex = ?, size_bytes = ?, downloaded_at = datetime('now') "
        "WHERE id = ?"
    )
    conn.execute(statement, (sha256_hex, size_bytes, attachment_id))
|
||||
|
||||
|
||||
def mark_attachment_failed(conn: sqlite3.Connection, attachment_id: str, *,
                           error: str) -> None:
    """Mark an attachment 'failed', bump its attempt counter and store the error.

    The caller commits.
    """
    statement = (
        "UPDATE email_attachments SET download_status = 'failed', "
        "download_attempts = download_attempts + 1, download_error = ? "
        "WHERE id = ?"
    )
    conn.execute(statement, (error, attachment_id))
|
||||
|
||||
|
||||
def pending_attachments(conn: sqlite3.Connection, limit: int = 50) -> list[sqlite3.Row]:
    """Return up to ``limit`` attachments awaiting download, joined to their sightings.

    Each row carries every email_attachments column plus the sighting's
    gmail_message_id and account_id. An attachment whose email was seen by
    several accounts yields one row per sighting.
    """
    # NOTE(review): only status 'pending' is selected, while
    # mark_attachment_failed sets 'failed' — so the attempts < 5 cap looks
    # unreachable through that path; confirm the intended retry flow.
    query = (
        "SELECT a.*, eam.gmail_message_id, eam.account_id "
        "FROM email_attachments a "
        "JOIN email_account_messages eam ON eam.email_id = a.email_id "
        "WHERE a.download_status = 'pending' AND a.download_attempts < 5 "
        "LIMIT ?"
    )
    return conn.execute(query, (limit,)).fetchall()
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ threads
|
||||
|
||||
def find_thread_by_gmail_id(conn: sqlite3.Connection, gmail_thread_id: str) -> Optional[sqlite3.Row]:
    """Return the email_threads row with this Gmail thread id, or None."""
    query = "SELECT * FROM email_threads WHERE gmail_thread_id = ?"
    return conn.execute(query, (gmail_thread_id,)).fetchone()
|
||||
|
||||
|
||||
def find_thread_by_rfc_root(conn: sqlite3.Connection, rfc_root: str) -> Optional[sqlite3.Row]:
    """Return the email_threads row whose rfc_thread_root_id equals ``rfc_root``, or None."""
    query = "SELECT * FROM email_threads WHERE rfc_thread_root_id = ?"
    return conn.execute(query, (rfc_root,)).fetchone()
|
||||
|
||||
|
||||
def create_thread(conn: sqlite3.Connection, *, gmail_thread_id: Optional[str],
                  rfc_thread_root_id: Optional[str], subject_normalized: Optional[str],
                  first_message_at: Optional[str]) -> str:
    """Insert an empty email_threads row and return its new id.

    last_message_at starts equal to ``first_message_at`` and message_count
    starts at 0. The caller commits.
    """
    new_id = _uuid()
    record = (
        new_id, gmail_thread_id, rfc_thread_root_id, subject_normalized,
        first_message_at, first_message_at,
    )
    conn.execute(
        """INSERT INTO email_threads
           (id, gmail_thread_id, rfc_thread_root_id, subject_normalized,
            first_message_at, last_message_at, message_count)
           VALUES (?, ?, ?, ?, ?, ?, 0)""",
        record,
    )
    return new_id
|
||||
|
||||
|
||||
def rollup_thread(conn: sqlite3.Connection, thread_id: str) -> None:
    """Recompute a thread's aggregate columns from its member emails.

    Refreshes message_count, first/last_message_at, participant_count,
    participants_json and is_matched. A thread with no emails is left
    untouched. Rows must be addressable by column name. The caller commits.

    Cheap at 5-person team volumes. For larger deployments swap to triggers.
    """
    summary = conn.execute(
        "SELECT COUNT(*) AS n, MIN(sent_at) AS first, MAX(sent_at) AS last, "
        "MAX(is_matched) AS matched FROM emails WHERE thread_id = ?",
        (thread_id,),
    ).fetchone()
    if not summary or summary["n"] == 0:
        return

    address_rows = conn.execute(
        "SELECT DISTINCT address FROM email_recipients er "
        "JOIN emails e ON e.id = er.email_id WHERE e.thread_id = ?",
        (thread_id,),
    ).fetchall()
    participants = [r["address"] for r in address_rows]

    # MAX(is_matched) can be NULL; treat that as "not matched".
    matched_flag = int(summary["matched"] or 0)
    conn.execute(
        "UPDATE email_threads SET message_count = ?, first_message_at = ?, "
        "last_message_at = ?, participant_count = ?, participants_json = ?, "
        "is_matched = ?, updated_at = datetime('now') WHERE id = ?",
        (summary["n"], summary["first"], summary["last"], len(participants),
         _json(participants), matched_flag, thread_id),
    )
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ investor links
|
||||
|
||||
def insert_investor_link(conn: sqlite3.Connection, *, email_id: str,
                         link: dict) -> None:
    """Insert one email_investor_links row linking an email to CRM entities.

    ``link`` must contain ``matched_address`` and ``match_kind``; the four
    entity-id keys are optional and ``match_confidence`` defaults to 1.0.
    The caller commits.

    Raises:
        KeyError: ``matched_address`` or ``match_kind`` is missing.
    """
    optional_keys = (
        "fundraising_investor_id",
        "fundraising_contact_id",
        "contact_id",
        "organization_id",
    )
    entity_ids = [link.get(key) for key in optional_keys]
    confidence = float(link.get("match_confidence", 1.0))
    record = (
        _uuid(),
        email_id,
        *entity_ids,
        link["matched_address"],
        link["match_kind"],
        confidence,
    )
    conn.execute(
        """INSERT INTO email_investor_links
           (id, email_id, fundraising_investor_id, fundraising_contact_id,
            contact_id, organization_id, matched_address, match_kind,
            match_confidence)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        record,
    )
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ sync runs
|
||||
|
||||
def start_sync_run(conn: sqlite3.Connection, *, account_id: str, kind: str) -> str:
    """Insert an email_sync_runs row in status 'running' and return its id.

    started_at is the current UTC time. The caller commits.
    """
    new_run_id = _uuid()
    started = _now_iso()
    conn.execute(
        "INSERT INTO email_sync_runs (id, account_id, kind, started_at, status) "
        "VALUES (?, ?, ?, ?, 'running')",
        (new_run_id, account_id, kind, started),
    )
    return new_run_id
|
||||
|
||||
|
||||
def finish_sync_run(conn: sqlite3.Connection, run_id: str, *, status: str,
                    stats: Optional[dict] = None, error: Optional[str] = None) -> None:
    """Close a sync run: record finish time, final status, counters and error.

    Counters missing from ``stats`` are stored as 0. The caller commits.
    """
    counters = stats or {}
    counter_keys = (
        "messages_seen",
        "messages_stored",
        "attachments_saved",
        "api_calls",
        "retries",
    )
    counts = [int(counters.get(key, 0)) for key in counter_keys]
    conn.execute(
        """UPDATE email_sync_runs
           SET finished_at = ?, status = ?, messages_seen = ?, messages_stored = ?,
               attachments_saved = ?, api_calls = ?, retries = ?, error = ?
           WHERE id = ?""",
        (_now_iso(), status, *counts, error, run_id),
    )
|
||||
@@ -0,0 +1,79 @@
|
||||
"""
|
||||
Exception taxonomy for Gmail integration.
|
||||
|
||||
gmail_client._call() maps HTTP status codes to these exception types. The retry
|
||||
loop in gmail_client._with_retry() inspects the class hierarchy to decide
|
||||
whether to back off + retry or fail fast.
|
||||
"""
|
||||
|
||||
|
||||
class GmailError(Exception):
    """Root of the Gmail-integration exception hierarchy.

    Attributes:
        status: HTTP status code associated with the failure (0 if none).
        payload: Decoded response body or other diagnostic data, if any.
    """

    def __init__(self, message: str = "", *, status: int = 0, payload: object = None):
        self.status = status
        self.payload = payload
        super().__init__(message)
|
||||
|
||||
|
||||
class AuthError(GmailError):
    """Authentication/authorization failure: a 401, or a 403 that is not a rate limit.

    Needs operator intervention (bad service account key, revoked OAuth,
    missing DWD scope). Not retried.
    """
|
||||
|
||||
|
||||
class RateLimitError(GmailError):
    """Rate limiting: a 429, or a 403 whose reason names a rate or quota limit.

    Retried with exponential backoff.
    """
|
||||
|
||||
|
||||
class TransientError(GmailError):
    """Temporary failure: a 5xx response or a network error.

    Retried with exponential backoff.
    """
|
||||
|
||||
|
||||
class NotFoundError(GmailError):
    """A 404 response.

    For messages this usually means 'deleted in Gmail after we saw it';
    for history listing it is surfaced as HistoryExpiredError.
    """
|
||||
|
||||
|
||||
class HistoryExpiredError(NotFoundError):
    """A 404 on history.list for the requested startHistoryId.

    Gmail only retains history for a limited window (~7 days). Triggers the
    date-based backfill fallback.
    """
|
||||
|
||||
|
||||
class PermanentError(GmailError):
    """A 400 or other non-retryable failure. Skip and log; do not retry."""
|
||||
|
||||
|
||||
def classify_http(status: int, payload: object) -> GmailError:
    """Map an HTTP error response to the appropriate exception instance.

    The exception is returned, not raised; callers ``raise`` the result.

    `payload` is the decoded JSON body if any; it is used to distinguish
    rate-limit 403s from pure auth 403s via the `reason` field Google
    returns. Two body shapes are understood:

    * Google API errors: ``{"error": {"errors": [{"reason": ...}]}}``
    * OAuth token-endpoint errors: ``{"error": "invalid_grant", ...}``;
      the error code string is used as the reason.

    Any other shape yields an empty reason.

    Args:
        status: HTTP status code of the response.
        payload: Decoded response body (dict) or anything else.

    Returns:
        A GmailError subclass instance carrying ``status`` and ``payload``.
    """
    reason = ""
    if isinstance(payload, dict):
        err = payload.get("error")
        if isinstance(err, dict):
            errs = err.get("errors")
            if isinstance(errs, list) and errs and isinstance(errs[0], dict):
                reason = str(errs[0].get("reason", ""))
        elif isinstance(err, str):
            # Token-endpoint style body: the error code itself is the reason.
            reason = err

    if status == 429:
        return RateLimitError(f"rate limited: {reason}", status=status, payload=payload)
    if status == 403:
        if reason in ("rateLimitExceeded", "userRateLimitExceeded", "quotaExceeded"):
            return RateLimitError(f"quota: {reason}", status=status, payload=payload)
        return AuthError(f"forbidden: {reason}", status=status, payload=payload)
    if status == 401:
        return AuthError("unauthorized", status=status, payload=payload)
    if status == 404:
        return NotFoundError("not found", status=status, payload=payload)
    if 500 <= status < 600:
        return TransientError(f"server error {status}", status=status, payload=payload)
    if 400 <= status < 500:
        return PermanentError(f"client error {status}: {reason}", status=status, payload=payload)
    return GmailError(f"unexpected status {status}", status=status, payload=payload)
|
||||
|
||||
|
||||
# Exception classes documented as "retried with exponential backoff".
RETRYABLE = (RateLimitError, TransientError)
|
||||
@@ -0,0 +1,249 @@
|
||||
"""
|
||||
Thin Gmail API wrapper.
|
||||
|
||||
Responsibilities:
|
||||
- HTTPS calls to https://gmail.googleapis.com/gmail/v1/users/me/*
|
||||
- Per-account access-token injection via CredentialProvider
|
||||
- Rate limiting via token bucket
|
||||
- Retry loop with exponential backoff + jitter for RETRYABLE errors
|
||||
- Batch requests for metadata fetches (multipart/mixed) — sketch provided
|
||||
- Call-count accounting for observability (plumbed to email_sync_runs)
|
||||
|
||||
We call Gmail over raw urllib instead of the google-api-python-client to keep
|
||||
the dependency surface small. If you prefer the Google SDK, replace _call()
|
||||
with client calls; everything else is independent.
|
||||
"""
|
||||
|
||||
import json
|
||||
import random
|
||||
import threading
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Iterator, Optional
|
||||
|
||||
from . import config as _cfg
|
||||
from . import errors
|
||||
|
||||
|
||||
# Base URL of the Gmail v1 "users" REST collection.
BASE = "https://gmail.googleapis.com/gmail/v1/users"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- token bucket
|
||||
|
||||
class _TokenBucket:
    """Simple per-account rate limiter. Call wait(cost) before each API call.

    Tokens refill continuously at ``units_per_sec`` up to ``burst``. The
    bucket starts full.
    """

    def __init__(self, units_per_sec: int, burst: Optional[int] = None):
        """Create a full bucket.

        Args:
            units_per_sec: Refill rate in quota units per second; must be > 0.
            burst: Bucket capacity; defaults to ``units_per_sec``. Must be > 0.

        Raises:
            ValueError: the rate or the capacity is not positive. Such a
                bucket could never grant a request.
        """
        if units_per_sec <= 0:
            raise ValueError("units_per_sec must be positive")
        if burst is not None and burst <= 0:
            raise ValueError("burst must be positive")
        self._rate = float(units_per_sec)
        self._burst = float(burst if burst is not None else units_per_sec)
        self._tokens = self._burst
        self._last = time.monotonic()
        self._lock = threading.Lock()

    def wait(self, cost: float) -> None:
        """Block until ``cost`` tokens are available, then consume them.

        A cost larger than the bucket capacity is charged as a full bucket.
        Without that clamp the request could never be satisfied, because the
        token count is capped at the capacity, and the call would spin forever.
        """
        cost = min(float(cost), self._burst)
        while True:
            with self._lock:
                now = time.monotonic()
                # Credit tokens earned since the last visit, capped at capacity.
                self._tokens = min(self._burst, self._tokens + (now - self._last) * self._rate)
                self._last = now
                if self._tokens >= cost:
                    self._tokens -= cost
                    return
                needed = cost - self._tokens
                sleep_for = needed / self._rate
            # Sleep outside the lock so other threads can make progress.
            time.sleep(sleep_for)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- call stats
|
||||
|
||||
@dataclass
class CallStats:
    """Mutable counters describing the API traffic of one client."""

    # Number of API calls issued.
    api_calls: int = 0
    # Number of retried calls.
    retries: int = 0
    # Bytes received.
    bytes_in: int = 0
    # Error messages collected along the way; a fresh list per instance.
    last_errors: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- client
|
||||
|
||||
class GmailClient:
|
||||
"""Per-account Gmail client. Bind one instance per sync run."""
|
||||
|
||||
def __init__(self, credential_provider, email_address: str, stats: Optional[CallStats] = None):
    """Bind the client to one mailbox.

    Args:
        credential_provider: Source of access tokens for this account.
        email_address: Mailbox this client acts on.
        stats: Counter object to update; a fresh CallStats is used if omitted.
    """
    self._creds = credential_provider
    self._email = email_address
    per_account_rate = _cfg.CONFIG.rate_limit_units_per_sec_per_account
    self._bucket = _TokenBucket(units_per_sec=per_account_rate)
    self.stats = stats if stats else CallStats()
|
||||
|
||||
# -------------------------------------------------------------- messages.*
|
||||
|
||||
def list_messages(self, *, q: str = "", page_token: Optional[str] = None,
|
||||
max_results: int = 500, label_ids: Optional[list[str]] = None) -> dict:
|
||||
"""https://developers.google.com/gmail/api/reference/rest/v1/users.messages/list"""
|
||||
params = {"maxResults": str(max_results)}
|
||||
if q:
|
||||
params["q"] = q
|
||||
if page_token:
|
||||
params["pageToken"] = page_token
|
||||
if label_ids:
|
||||
for lid in label_ids:
|
||||
params.setdefault("labelIds", [])
|
||||
params["labelIds"].append(lid) if False else None
|
||||
return self._get("/messages", params=params, cost=5)
|
||||
|
||||
def get_message(self, message_id: str, *, format: str = "metadata",
|
||||
metadata_headers: Optional[list[str]] = None) -> dict:
|
||||
params = {"format": format}
|
||||
if format == "metadata" and metadata_headers:
|
||||
params["metadataHeaders"] = metadata_headers
|
||||
return self._get(f"/messages/{message_id}", params=params, cost=5)
|
||||
|
||||
def get_attachment(self, message_id: str, attachment_id: str) -> dict:
|
||||
return self._get(
|
||||
f"/messages/{message_id}/attachments/{attachment_id}",
|
||||
params=None,
|
||||
cost=5,
|
||||
)
|
||||
|
||||
# -------------------------------------------------------------- history.*
|
||||
|
||||
def list_history(self, *, start_history_id: str, page_token: Optional[str] = None,
|
||||
history_types: Optional[list[str]] = None) -> dict:
|
||||
params = {"startHistoryId": start_history_id, "maxResults": "500"}
|
||||
if page_token:
|
||||
params["pageToken"] = page_token
|
||||
if history_types:
|
||||
params["historyTypes"] = history_types
|
||||
try:
|
||||
return self._get("/history", params=params, cost=2)
|
||||
except errors.NotFoundError as e:
|
||||
# Gmail returns 404 when startHistoryId is too old. Wrap for callers.
|
||||
raise errors.HistoryExpiredError(
|
||||
"startHistoryId no longer available", status=404, payload=getattr(e, "payload", None)
|
||||
) from e
|
||||
|
||||
# -------------------------------------------------------------- profile
|
||||
|
||||
def get_profile(self) -> dict:
|
||||
return self._get("/profile", params=None, cost=1)
|
||||
|
||||
# -------------------------------------------------------------- iteration helpers
|
||||
|
||||
def iter_messages(self, *, q: str = "") -> Iterator[dict]:
|
||||
page_token: Optional[str] = None
|
||||
while True:
|
||||
resp = self.list_messages(q=q, page_token=page_token,
|
||||
max_results=_cfg.CONFIG.backfill_page_size)
|
||||
for m in resp.get("messages") or []:
|
||||
yield m
|
||||
page_token = resp.get("nextPageToken")
|
||||
if not page_token:
|
||||
return
|
||||
|
||||
def iter_history(self, *, start_history_id: str,
|
||||
history_types: Optional[list[str]] = None) -> Iterator[dict]:
|
||||
page_token: Optional[str] = None
|
||||
while True:
|
||||
resp = self.list_history(
|
||||
start_history_id=start_history_id,
|
||||
page_token=page_token,
|
||||
history_types=history_types,
|
||||
)
|
||||
for h in resp.get("history") or []:
|
||||
yield h
|
||||
page_token = resp.get("nextPageToken")
|
||||
if not page_token:
|
||||
# Cache final historyId for caller to checkpoint.
|
||||
self._last_history_id = resp.get("historyId")
|
||||
return
|
||||
|
||||
@property
|
||||
def last_history_id(self) -> Optional[str]:
|
||||
return getattr(self, "_last_history_id", None)
|
||||
|
||||
# -------------------------------------------------------------- internals
|
||||
|
||||
def _get(self, path: str, *, params: Optional[dict], cost: float) -> dict:
|
||||
return self._with_retry(lambda: self._call("GET", path, params=params, cost=cost))
|
||||
|
||||
def _call(self, method: str, path: str, *, params: Optional[dict] = None,
|
||||
body: Optional[bytes] = None, cost: float = 1.0) -> dict:
|
||||
self._bucket.wait(cost)
|
||||
self.stats.api_calls += 1
|
||||
|
||||
qs = ""
|
||||
if params:
|
||||
# urllib.parse.urlencode with doseq=True handles repeated params
|
||||
# like metadataHeaders=Foo&metadataHeaders=Bar correctly.
|
||||
qs = "?" + urllib.parse.urlencode(params, doseq=True)
|
||||
url = f"{BASE}/me{path}{qs}"
|
||||
|
||||
token = self._creds.access_token_for(self._email)
|
||||
req = urllib.request.Request(url, method=method, data=body)
|
||||
req.add_header("Authorization", f"Bearer {token.token}")
|
||||
req.add_header("Accept", "application/json")
|
||||
if body:
|
||||
req.add_header("Content-Type", "application/json")
|
||||
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=30) as resp:
|
||||
raw = resp.read()
|
||||
self.stats.bytes_in += len(raw)
|
||||
if not raw:
|
||||
return {}
|
||||
return json.loads(raw)
|
||||
except urllib.error.HTTPError as e:
|
||||
raw = e.read() or b""
|
||||
self.stats.bytes_in += len(raw)
|
||||
try:
|
||||
payload = json.loads(raw) if raw else {}
|
||||
except Exception:
|
||||
payload = {"raw": raw.decode("utf-8", errors="replace")}
|
||||
err = errors.classify_http(e.code, payload)
|
||||
# short-message logging hook (redacted of tokens by design)
|
||||
self.stats.last_errors.append(f"{e.code} {type(err).__name__}")
|
||||
self.stats.last_errors = self.stats.last_errors[-10:]
|
||||
raise err
|
||||
except (urllib.error.URLError, TimeoutError) as e:
|
||||
raise errors.TransientError(f"network error: {e}") from e
|
||||
|
||||
def _with_retry(self, fn):
|
||||
cfg = _cfg.CONFIG
|
||||
attempts = 0
|
||||
delay = cfg.retry_initial_delay_sec
|
||||
while True:
|
||||
try:
|
||||
return fn()
|
||||
except errors.RETRYABLE as e:
|
||||
attempts += 1
|
||||
if attempts >= cfg.retry_max_attempts:
|
||||
raise
|
||||
self.stats.retries += 1
|
||||
# Full jitter
|
||||
sleep_for = random.uniform(0, min(delay, cfg.retry_max_delay_sec))
|
||||
time.sleep(sleep_for)
|
||||
delay = min(delay * 2, cfg.retry_max_delay_sec)
|
||||
# Non-retryable errors propagate immediately.
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- batch fetch sketch
|
||||
|
||||
def batch_get_metadata(client: GmailClient, message_ids: list[str],
                       headers: list[str]) -> dict[str, dict]:
    """Fetch metadata for up to ~100 messages.

    TODO: implement using Gmail's multipart/mixed batch endpoint at
    https://www.googleapis.com/batch/gmail/v1 for efficiency. In the scaffold
    we fall back to serial gets so the logic is correct from day 1.

    Returns a dict keyed by message id. Ids whose fetch raises
    ``errors.NotFoundError`` are simply absent from the result.
    """
    fetched: dict[str, dict] = {}
    for message_id in message_ids:
        try:
            metadata = client.get_message(
                message_id, format="metadata", metadata_headers=headers
            )
        except errors.NotFoundError:
            # Message deleted between list and get — skip.
            continue
        fetched[message_id] = metadata
    return fetched
|
||||
@@ -0,0 +1,215 @@
|
||||
"""
|
||||
Investor matching.
|
||||
|
||||
Builds an in-memory index of investor email addresses from:
|
||||
- fundraising_contacts.email
|
||||
- contacts.email
|
||||
- organizations.email + organizations.website (domain only)
|
||||
|
||||
For each synced email, returns a list of investor links. Exact-email matches
|
||||
beat domain matches; if any exact match exists, domain matches are suppressed.
|
||||
|
||||
The index is rebuilt every `REFRESH_INTERVAL_SEC` or on demand via rebuild().
|
||||
"""
|
||||
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# Maximum age of the in-memory index, in seconds, before
# InvestorIndex.rebuild_if_stale() triggers a rebuild.
REFRESH_INTERVAL_SEC = 900  # 15 minutes

# Domains we never domain-match against (personal mailboxes).
# Checked twice: organizations whose domain is listed here are left out of the
# domain index at rebuild time, and addresses on these domains are skipped in
# the domain-fallback pass of InvestorIndex.match(). Exact-email matches are
# not affected by this set.
COMMON_PERSONAL_DOMAINS = {
    "gmail.com", "googlemail.com",
    "outlook.com", "hotmail.com", "live.com", "msn.com",
    "yahoo.com", "yahoo.co.uk", "ymail.com",
    "icloud.com", "me.com", "mac.com",
    "aol.com", "proton.me", "protonmail.com",
    "pm.me", "fastmail.com", "tuta.io", "hey.com",
    "duck.com", "zoho.com",
}
|
||||
|
||||
|
||||
# The team's own domain is also never matched (teammates email each other).
# It is not part of the set above: it is passed to InvestorIndex(own_domain=...)
# by the caller (presumably from CONFIG.workspace_domain).
|
||||
|
||||
|
||||
@dataclass
class MatchTarget:
    """CRM record(s) that a matched address or domain points at.

    All fields are optional; which ones are filled depends on the table the
    index entry was built from (see InvestorIndex.rebuild).
    """
    # Set together for entries built from fundraising_contacts.
    fundraising_investor_id: Optional[str] = None
    fundraising_contact_id: Optional[str] = None
    # Set for entries built from contacts.
    contact_id: Optional[str] = None
    # Set for entries built from contacts and from organizations.
    organization_id: Optional[str] = None
    # Investor name (fundraising entries) or organization name (domain entries).
    investor_name: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class InvestorLink:
    """One match between an address seen on an email and a CRM record."""
    # The lower-cased, stripped address that produced the match.
    matched_address: str
    match_kind: str  # exact_email | domain_match | manual
    # InvestorIndex.match() uses 1.0 for exact_email and 0.6 for domain_match.
    match_confidence: float
    # The record(s) the address resolved to.
    target: MatchTarget
|
||||
|
||||
|
||||
class InvestorIndex:
|
||||
|
||||
def __init__(self, own_domain: Optional[str] = None):
|
||||
self._email_index: dict[str, MatchTarget] = {}
|
||||
self._domain_index: dict[str, list[MatchTarget]] = {}
|
||||
self._own_domain = (own_domain or "").lower() or None
|
||||
self._last_built = 0.0
|
||||
self._lock = threading.Lock()
|
||||
|
||||
# ------------------------------------------------------------------ build
|
||||
|
||||
def rebuild(self, db_conn_factory) -> None:
|
||||
with self._lock:
|
||||
email_idx: dict[str, MatchTarget] = {}
|
||||
domain_idx: dict[str, list[MatchTarget]] = {}
|
||||
|
||||
conn = db_conn_factory()
|
||||
try:
|
||||
cur = conn.cursor()
|
||||
|
||||
# fundraising_contacts
|
||||
cur.execute(
|
||||
"SELECT fc.id, fc.email, fc.investor_id, fi.investor_name "
|
||||
"FROM fundraising_contacts fc "
|
||||
"LEFT JOIN fundraising_investors fi ON fi.id = fc.investor_id "
|
||||
"WHERE fc.email IS NOT NULL AND fc.email != ''"
|
||||
)
|
||||
for r in cur.fetchall():
|
||||
addr = (r["email"] or "").lower().strip()
|
||||
if not _valid_email(addr):
|
||||
continue
|
||||
email_idx[addr] = MatchTarget(
|
||||
fundraising_contact_id=r["id"],
|
||||
fundraising_investor_id=r["investor_id"],
|
||||
investor_name=r["investor_name"],
|
||||
)
|
||||
|
||||
# contacts
|
||||
cur.execute(
|
||||
"SELECT id, email, organization_id FROM contacts "
|
||||
"WHERE email IS NOT NULL AND email != ''"
|
||||
)
|
||||
for r in cur.fetchall():
|
||||
addr = (r["email"] or "").lower().strip()
|
||||
if not _valid_email(addr):
|
||||
continue
|
||||
# Don't overwrite a fundraising_contact match; they're higher signal.
|
||||
email_idx.setdefault(addr, MatchTarget(
|
||||
contact_id=r["id"],
|
||||
organization_id=r["organization_id"],
|
||||
))
|
||||
|
||||
# organizations — domain-only match source
|
||||
cur.execute(
|
||||
"SELECT id, name, email, website FROM organizations "
|
||||
"WHERE (email IS NOT NULL AND email != '') OR (website IS NOT NULL AND website != '')"
|
||||
)
|
||||
for r in cur.fetchall():
|
||||
for d in _domains_for_org(r):
|
||||
if d in COMMON_PERSONAL_DOMAINS:
|
||||
continue
|
||||
if self._own_domain and d == self._own_domain:
|
||||
continue
|
||||
domain_idx.setdefault(d, []).append(MatchTarget(
|
||||
organization_id=r["id"],
|
||||
investor_name=r["name"],
|
||||
))
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
self._email_index = email_idx
|
||||
self._domain_index = domain_idx
|
||||
self._last_built = time.time()
|
||||
|
||||
def rebuild_if_stale(self, db_conn_factory) -> None:
|
||||
if time.time() - self._last_built > REFRESH_INTERVAL_SEC:
|
||||
self.rebuild(db_conn_factory)
|
||||
|
||||
# ------------------------------------------------------------------ query
|
||||
|
||||
def match(self, addresses: set[str], *,
|
||||
exclude_addresses: Optional[set[str]] = None) -> list[InvestorLink]:
|
||||
excl = {a.lower() for a in (exclude_addresses or set())}
|
||||
candidates = {a.lower().strip() for a in addresses if a} - excl
|
||||
|
||||
# Exclude own domain addresses (teammates emailing each other).
|
||||
if self._own_domain:
|
||||
candidates = {a for a in candidates
|
||||
if not a.endswith("@" + self._own_domain)}
|
||||
|
||||
links: list[InvestorLink] = []
|
||||
seen_targets: set[tuple] = set()
|
||||
|
||||
# Exact email matches first.
|
||||
for addr in candidates:
|
||||
t = self._email_index.get(addr)
|
||||
if t:
|
||||
key = (t.fundraising_contact_id, t.contact_id)
|
||||
if key in seen_targets:
|
||||
continue
|
||||
seen_targets.add(key)
|
||||
links.append(InvestorLink(
|
||||
matched_address=addr,
|
||||
match_kind="exact_email",
|
||||
match_confidence=1.0,
|
||||
target=t,
|
||||
))
|
||||
|
||||
if links: # exact hits short-circuit domain matching
|
||||
return links
|
||||
|
||||
# Domain fallback.
|
||||
for addr in candidates:
|
||||
_, _, domain = addr.partition("@")
|
||||
if not domain or domain in COMMON_PERSONAL_DOMAINS:
|
||||
continue
|
||||
for t in self._domain_index.get(domain, []):
|
||||
key = ("org", t.organization_id)
|
||||
if key in seen_targets:
|
||||
continue
|
||||
seen_targets.add(key)
|
||||
links.append(InvestorLink(
|
||||
matched_address=addr,
|
||||
match_kind="domain_match",
|
||||
match_confidence=0.6,
|
||||
target=t,
|
||||
))
|
||||
return links
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- helpers
|
||||
|
||||
# Loose shape check only: one "@", no whitespace, and at least one "." in the
# part after the "@". This is not an RFC 5322 validator.
_EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")


def _valid_email(s: str) -> bool:
    """Return True when ``s`` has the basic ``local@domain.tld`` shape."""
    return _EMAIL_RE.match(s) is not None
|
||||
|
||||
|
||||
def _domains_for_org(row) -> list[str]:
    """Return the distinct domains an organization row can be matched on.

    Args:
        row: Mapping-style row with ``email`` and ``website`` keys (either may
            be empty or None).

    Returns:
        Lower-cased domains, email-derived first, without duplicates. The
        order is deterministic. An email without an ``@`` contributes nothing.
    """
    found: list[str] = []

    # Normalize the same way contact emails are normalized when the index is
    # built (lower + strip), so stray whitespace never ends up in a domain key.
    email = (row["email"] or "").strip().lower()
    if email:
        domain = email.partition("@")[2].strip()
        if domain:
            found.append(domain)

    if row["website"]:
        site_domain = _domain_from_url(row["website"])
        if site_domain:
            found.append(site_domain)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(found))
|
||||
|
||||
|
||||
def _domain_from_url(url: str) -> Optional[str]:
    """Extract the lower-cased host from a website value.

    An optional ``http://`` / ``https://`` scheme and a leading ``www.`` are
    skipped; the host ends at the first ``/``, ``:``, ``?``, ``#`` or
    whitespace. Returns None for empty input or when no host can be found.
    """
    found = None
    if url:
        found = re.match(
            r"^\s*(?:https?://)?(?:www\.)?([^/:?#\s]+)", url.strip(), re.IGNORECASE
        )
    if found is None:
        return None
    host = found.group(1)
    return host.lower()
|
||||
@@ -0,0 +1,192 @@
|
||||
-- Gmail Integration — Phase 1 migration
|
||||
-- Creates all tables for email capture, matching, threading, attachments.
|
||||
-- This migration is IDEMPOTENT: safe to re-run.
|
||||
-- Applied by email_integration.db.apply_migrations() on server startup when
|
||||
-- CRM_GMAIL_INTEGRATION_ENABLED is truthy.
|
||||
--
|
||||
-- DO NOT modify this file in place after it ships. Create 0002_*.sql, etc.
|
||||
|
||||
-- ============================================================================
|
||||
-- email_accounts — one row per enrolled team-member mailbox
|
||||
-- ============================================================================
|
||||
-- email_address is UNIQUE, so each mailbox can be enrolled at most once.
-- NOTE(review): SQLite enforces FOREIGN KEY clauses only when
-- PRAGMA foreign_keys = ON is set on the connection — confirm db.py does so.
CREATE TABLE IF NOT EXISTS email_accounts (
    id TEXT PRIMARY KEY,
    user_id TEXT NOT NULL,
    email_address TEXT NOT NULL UNIQUE,
    auth_method TEXT NOT NULL, -- 'dwd' | 'oauth'
    -- The *_enc BLOB columns presumably hold encrypted token material
    -- (TODO confirm against the crypto module); all three are nullable.
    oauth_refresh_enc BLOB,
    oauth_token_enc BLOB,
    oauth_token_exp TEXT,
    sync_enabled INTEGER NOT NULL DEFAULT 1,
    sync_status TEXT NOT NULL DEFAULT 'pending',
    sync_error TEXT,
    -- Presumably the Gmail historyId checkpoint for incremental sync — verify
    -- against the sync code.
    last_history_id TEXT,
    last_synced_at TEXT,
    backfill_complete INTEGER NOT NULL DEFAULT 0,
    backfill_cursor TEXT,
    -- Both timestamps only default at INSERT; nothing in this file refreshes
    -- updated_at on UPDATE (no trigger), so writers must set it themselves.
    created_at TEXT DEFAULT (datetime('now')),
    updated_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY(user_id) REFERENCES users(id)
);
CREATE INDEX IF NOT EXISTS idx_email_accounts_user ON email_accounts(user_id);
CREATE INDEX IF NOT EXISTS idx_email_accounts_sync ON email_accounts(sync_enabled, sync_status);
|
||||
|
||||
-- ============================================================================
|
||||
-- emails — canonical email record, dedup'd across accounts by RFC Message-ID
|
||||
-- ============================================================================
|
||||
-- rfc_message_id is UNIQUE: this is the de-duplication key across mailboxes.
-- Per-mailbox sightings of the same email live in email_account_messages.
CREATE TABLE IF NOT EXISTS emails (
    id TEXT PRIMARY KEY,
    rfc_message_id TEXT NOT NULL UNIQUE,
    gmail_thread_id TEXT,
    rfc_thread_root_id TEXT,
    -- No FOREIGN KEY clause is declared for thread_id, so the link to
    -- email_threads is by convention only.
    thread_id TEXT, -- FK email_threads.id (populated by threads.py)
    subject TEXT,
    from_email TEXT NOT NULL,
    from_name TEXT,
    -- The *_json columns store JSON text; recipient and reference lists
    -- default to an empty JSON array.
    to_emails_json TEXT NOT NULL DEFAULT '[]',
    cc_emails_json TEXT NOT NULL DEFAULT '[]',
    bcc_emails_json TEXT NOT NULL DEFAULT '[]',
    reply_to TEXT,
    sent_at TEXT NOT NULL,
    body_text TEXT,
    body_html TEXT,
    snippet TEXT,
    in_reply_to TEXT,
    references_json TEXT DEFAULT '[]',
    has_attachments INTEGER NOT NULL DEFAULT 0,
    size_estimate INTEGER,
    is_matched INTEGER NOT NULL DEFAULT 0,
    match_status TEXT NOT NULL DEFAULT 'unmatched', -- unmatched|matched|skipped
    raw_headers_json TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    updated_at TEXT DEFAULT (datetime('now'))
);
CREATE INDEX IF NOT EXISTS idx_emails_thread ON emails(gmail_thread_id);
CREATE INDEX IF NOT EXISTS idx_emails_rfc_thread ON emails(rfc_thread_root_id);
CREATE INDEX IF NOT EXISTS idx_emails_thread_fk ON emails(thread_id);
CREATE INDEX IF NOT EXISTS idx_emails_from ON emails(from_email);
CREATE INDEX IF NOT EXISTS idx_emails_sent_at ON emails(sent_at);
-- Composite index: filter on is_matched, then range/order by sent_at.
CREATE INDEX IF NOT EXISTS idx_emails_matched ON emails(is_matched, sent_at);
CREATE INDEX IF NOT EXISTS idx_emails_in_reply_to ON emails(in_reply_to);
|
||||
|
||||
-- ============================================================================
|
||||
-- email_recipients — denormalized for fast address lookups
|
||||
-- ============================================================================
|
||||
-- One row per (email, address, kind). No uniqueness constraint is declared,
-- so writers are responsible for not inserting duplicates.
CREATE TABLE IF NOT EXISTS email_recipients (
    id TEXT PRIMARY KEY,
    email_id TEXT NOT NULL,
    address TEXT NOT NULL,
    display_name TEXT,
    kind TEXT NOT NULL, -- from|to|cc|bcc|reply_to
    -- Rows are removed together with their parent email.
    FOREIGN KEY(email_id) REFERENCES emails(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_email_recipients_addr ON email_recipients(address);
CREATE INDEX IF NOT EXISTS idx_email_recipients_email ON email_recipients(email_id);
|
||||
|
||||
-- ============================================================================
|
||||
-- email_account_messages — per-mailbox sighting of an email
|
||||
-- ============================================================================
|
||||
-- Links a canonical emails row to the mailbox-specific Gmail ids under which
-- it was seen. UNIQUE(account_id, gmail_message_id) allows at most one
-- sighting row per Gmail message per mailbox.
CREATE TABLE IF NOT EXISTS email_account_messages (
    id TEXT PRIMARY KEY,
    email_id TEXT NOT NULL,
    account_id TEXT NOT NULL,
    gmail_message_id TEXT NOT NULL,
    gmail_thread_id TEXT NOT NULL,
    labels_json TEXT DEFAULT '[]',
    is_sent INTEGER NOT NULL DEFAULT 0,
    first_seen_at TEXT DEFAULT (datetime('now')),
    -- Nullable; presumably a soft-delete marker set when the message
    -- disappears from the mailbox — confirm against the sync code.
    deleted_at TEXT,
    -- Deleting either the email or the account removes the sighting.
    FOREIGN KEY(email_id) REFERENCES emails(id) ON DELETE CASCADE,
    FOREIGN KEY(account_id) REFERENCES email_accounts(id) ON DELETE CASCADE,
    UNIQUE(account_id, gmail_message_id)
);
CREATE INDEX IF NOT EXISTS idx_eam_email ON email_account_messages(email_id);
CREATE INDEX IF NOT EXISTS idx_eam_account ON email_account_messages(account_id);
CREATE INDEX IF NOT EXISTS idx_eam_gmail_msg ON email_account_messages(gmail_message_id);
|
||||
|
||||
-- ============================================================================
|
||||
-- email_attachments — metadata; bytes on disk under data/email_attachments/
|
||||
-- ============================================================================
|
||||
-- Metadata only: the attachment bytes are stored on disk at storage_path.
CREATE TABLE IF NOT EXISTS email_attachments (
    id TEXT PRIMARY KEY,
    email_id TEXT NOT NULL,
    -- NOT NULL here, while the parser can emit attachments without an
    -- attachmentId (tiny inlined parts) — NOTE(review): confirm what writers
    -- store in that case.
    gmail_attachment_id TEXT NOT NULL,
    filename TEXT NOT NULL,
    sanitized_filename TEXT NOT NULL,
    mime_type TEXT,
    size_bytes INTEGER,
    sha256_hex TEXT,
    storage_path TEXT NOT NULL,
    download_status TEXT NOT NULL DEFAULT 'pending', -- pending|downloaded|failed|skipped
    download_attempts INTEGER NOT NULL DEFAULT 0,
    download_error TEXT,
    downloaded_at TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    -- Rows are removed together with their parent email; this does not
    -- touch the file at storage_path.
    FOREIGN KEY(email_id) REFERENCES emails(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_attach_email ON email_attachments(email_id);
CREATE INDEX IF NOT EXISTS idx_attach_sha ON email_attachments(sha256_hex);
CREATE INDEX IF NOT EXISTS idx_attach_status ON email_attachments(download_status);
|
||||
|
||||
-- ============================================================================
|
||||
-- email_threads — thread roll-up for UI
|
||||
-- ============================================================================
|
||||
-- Aggregated per-thread counters and timestamps; nothing in this file keeps
-- them in sync with emails (no triggers), so the application must maintain them.
CREATE TABLE IF NOT EXISTS email_threads (
    id TEXT PRIMARY KEY,
    gmail_thread_id TEXT,
    rfc_thread_root_id TEXT,
    subject_normalized TEXT,
    first_message_at TEXT,
    last_message_at TEXT,
    message_count INTEGER NOT NULL DEFAULT 0,
    participant_count INTEGER NOT NULL DEFAULT 0,
    participants_json TEXT DEFAULT '[]',
    is_matched INTEGER NOT NULL DEFAULT 0,
    created_at TEXT DEFAULT (datetime('now')),
    updated_at TEXT DEFAULT (datetime('now'))
);
-- Partial unique index: at most one thread row per non-NULL gmail_thread_id,
-- while any number of rows may leave gmail_thread_id NULL.
CREATE UNIQUE INDEX IF NOT EXISTS idx_threads_gmail_uniq ON email_threads(gmail_thread_id)
    WHERE gmail_thread_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_threads_rfc_root ON email_threads(rfc_thread_root_id);
CREATE INDEX IF NOT EXISTS idx_threads_last_msg ON email_threads(last_message_at);
|
||||
|
||||
-- ============================================================================
|
||||
-- email_investor_links — matched investors
|
||||
-- ============================================================================
|
||||
-- One row per match between an email and a CRM record. The four target id
-- columns are all nullable and carry no FOREIGN KEY clauses; which ones are
-- filled depends on the kind of record that matched.
-- No uniqueness constraint is declared, so re-running matching can insert
-- duplicate links unless the writer clears or de-duplicates first.
CREATE TABLE IF NOT EXISTS email_investor_links (
    id TEXT PRIMARY KEY,
    email_id TEXT NOT NULL,
    fundraising_investor_id TEXT,
    fundraising_contact_id TEXT,
    contact_id TEXT,
    organization_id TEXT,
    matched_address TEXT NOT NULL,
    match_kind TEXT NOT NULL, -- exact_email|domain_match|manual
    match_confidence REAL NOT NULL DEFAULT 1.0,
    created_at TEXT DEFAULT (datetime('now')),
    -- Rows are removed together with their parent email.
    FOREIGN KEY(email_id) REFERENCES emails(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_eil_email ON email_investor_links(email_id);
CREATE INDEX IF NOT EXISTS idx_eil_investor ON email_investor_links(fundraising_investor_id);
CREATE INDEX IF NOT EXISTS idx_eil_fr_contact ON email_investor_links(fundraising_contact_id);
CREATE INDEX IF NOT EXISTS idx_eil_contact ON email_investor_links(contact_id);
|
||||
|
||||
-- ============================================================================
|
||||
-- email_sync_runs — per-run observability
|
||||
-- ============================================================================
|
||||
-- One row per sync run of one account. started_at and status have no
-- default, so the writer must supply both when the row is inserted.
CREATE TABLE IF NOT EXISTS email_sync_runs (
    id TEXT PRIMARY KEY,
    account_id TEXT NOT NULL,
    kind TEXT NOT NULL, -- backfill|incremental|manual
    started_at TEXT NOT NULL,
    finished_at TEXT,
    status TEXT NOT NULL, -- running|ok|error|partial
    messages_seen INTEGER NOT NULL DEFAULT 0,
    messages_stored INTEGER NOT NULL DEFAULT 0,
    attachments_saved INTEGER NOT NULL DEFAULT 0,
    -- api_calls / retries share their names with the client's call-stats
    -- counters; presumably copied from them at the end of a run — TODO confirm.
    api_calls INTEGER NOT NULL DEFAULT 0,
    retries INTEGER NOT NULL DEFAULT 0,
    error TEXT,
    -- Run history is removed together with its account.
    FOREIGN KEY(account_id) REFERENCES email_accounts(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_sync_runs_account ON email_sync_runs(account_id, started_at);
|
||||
@@ -0,0 +1,283 @@
|
||||
"""
|
||||
Parse a Gmail `users.messages.get` response (format=full) into a flat dict
|
||||
ready for db.insert_email().
|
||||
|
||||
Input shape (abbreviated):
|
||||
{
|
||||
"id": "...", # Gmail message id
|
||||
"threadId": "...",
|
||||
"labelIds": ["INBOX","IMPORTANT",...],
|
||||
"snippet": "...",
|
||||
"historyId": "...",
|
||||
"internalDate": "1713657600000", # ms epoch, authoritative
|
||||
"sizeEstimate": 12345,
|
||||
"payload": {
|
||||
"headers": [{"name":"Subject","value":"..."}, ...],
|
||||
"mimeType": "multipart/mixed",
|
||||
"parts": [...recursive...],
|
||||
"body": {"data": "<base64url>", "size": ...}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
import base64
|
||||
import email.utils
|
||||
import email.header
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Iterable, Optional
|
||||
from html.parser import HTMLParser
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- public
|
||||
|
||||
def parse(message: dict, *, owning_account_address: Optional[str] = None) -> dict:
    """Flatten a Gmail message resource into the canonical dict shape.

    Header names are looked up case-insensitively. Email addresses in the
    result are lower-cased. When the message has no Message-ID header, a
    synthetic id based on the Gmail message id is used. The thread root is the
    first References entry, else In-Reply-To, else the message's own id.

    Args:
        message: Decoded ``users.messages.get`` response.
        owning_account_address: Passed through unchanged as ``owning_account``.

    Returns:
        A dict with message ids, threading ids, addresses, timestamps, capped
        bodies, attachment descriptors, labels and the raw header map.
    """
    def _addr_dicts(pairs):
        # (name, email) tuples -> [{"email": ..., "name": ...}], skipping blanks.
        return [{"email": addr.lower(), "name": name} for name, addr in pairs if addr]

    payload = message.get("payload", {})
    headers = _header_map(payload.get("headers") or [])

    sender_name, sender_email = _split_addr(headers.get("from", ""))
    reply_to_addr = _split_addr(headers.get("reply-to", ""))[1] or None

    # Threading identifiers.
    message_id = headers.get("message-id", "").strip()
    if not message_id:
        message_id = f"synthetic-{message.get('id')}@ten31.local"
    message_id = _strip_angle_brackets(message_id)
    parent_id = _strip_angle_brackets(headers.get("in-reply-to", "").strip()) or None
    reference_ids = _split_references(headers.get("references", ""))
    if reference_ids:
        thread_root = reference_ids[0]
    else:
        thread_root = parent_id or message_id

    plain_body, html_body, attachment_entries = _walk_payload(payload)
    label_ids = message.get("labelIds") or []

    return {
        "gmail_message_id": message.get("id"),
        "gmail_thread_id": message.get("threadId"),
        "rfc_message_id": message_id,
        "rfc_thread_root_id": thread_root,
        "in_reply_to": parent_id,
        "references": reference_ids,
        "subject": _decode_rfc2047(headers.get("subject") or ""),
        "from_email": (sender_email or "").lower(),
        "from_name": sender_name,
        "to": _addr_dicts(_parse_address_list(headers.get("to", ""))),
        "cc": _addr_dicts(_parse_address_list(headers.get("cc", ""))),
        "bcc": _addr_dicts(_parse_address_list(headers.get("bcc", ""))),
        "reply_to": reply_to_addr.lower() if reply_to_addr else None,
        "sent_at": _parse_date_header(
            headers.get("date"), fallback_ms=message.get("internalDate")
        ),
        "body_text": _cap_text(plain_body),
        "body_html": _cap_text(html_body),
        "snippet": message.get("snippet"),
        "attachments": attachment_entries,
        "size_estimate": message.get("sizeEstimate"),
        "labels": label_ids,
        "is_sent": "SENT" in label_ids,
        "raw_headers": headers,
        "owning_account": owning_account_address,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- headers
|
||||
|
||||
def _header_map(header_list: Iterable[dict]) -> dict[str, str]:
    """Case-insensitive keys. Last-write-wins for duplicates (rare).

    Names are lower-cased; a missing or empty name maps to the ``""`` key and
    a missing value becomes the empty string.
    """
    return {
        (header.get("name") or "").lower(): header.get("value") or ""
        for header in header_list
    }
|
||||
|
||||
|
||||
def _decode_rfc2047(s: str) -> str:
    """Decode RFC 2047 encoded-words in a header value.

    Byte chunks are decoded with their declared charset (UTF-8 when none is
    given or the charset is unknown), replacing undecodable bytes. If decoding
    fails altogether, the input is returned unchanged.
    """
    if not s:
        return ""

    def _as_text(chunk, charset):
        if not isinstance(chunk, bytes):
            return chunk
        try:
            return chunk.decode(charset or "utf-8", errors="replace")
        except LookupError:
            # Unknown charset label: fall back to UTF-8.
            return chunk.decode("utf-8", errors="replace")

    try:
        decoded_parts = email.header.decode_header(s)
        return "".join(_as_text(chunk, charset) for chunk, charset in decoded_parts)
    except Exception:
        return s
|
||||
|
||||
|
||||
def _split_addr(raw: str) -> tuple[Optional[str], Optional[str]]:
    """Split a single address header into ``(display_name, address)``.

    Either element is None when absent; the display name is RFC 2047-decoded.
    """
    if raw:
        display, address = email.utils.parseaddr(raw)
        return (_decode_rfc2047(display) or None, address or None)
    return (None, None)
|
||||
|
||||
|
||||
def _parse_address_list(raw: str) -> list[tuple[Optional[str], Optional[str]]]:
    """Parse a multi-address header into ``(display_name, address)`` pairs.

    Entries without an address are dropped; display names are RFC 2047-decoded
    and become None when empty.
    """
    if not raw:
        return []
    pairs: list[tuple[Optional[str], Optional[str]]] = []
    for display, address in email.utils.getaddresses([raw]):
        if not address:
            continue
        pairs.append((_decode_rfc2047(display) or None, address or None))
    return pairs
|
||||
|
||||
|
||||
def _parse_date_header(raw: Optional[str], *, fallback_ms: Optional[str]) -> str:
    """Return the send time as a UTC ``YYYY-MM-DDTHH:MM:SSZ`` string.

    Sources are tried in order: the RFC Date header (a value without a
    timezone is treated as UTC), Gmail's ``internalDate`` in epoch
    milliseconds, and finally the current time.
    """
    stamp = "%Y-%m-%dT%H:%M:%SZ"

    # Prefer RFC Date header, fall back to Gmail internalDate (epoch ms).
    if raw:
        try:
            parsed = email.utils.parsedate_to_datetime(raw)
            if parsed is not None:
                aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
                return aware.astimezone(timezone.utc).strftime(stamp)
        except (TypeError, ValueError):
            pass

    if fallback_ms:
        try:
            seconds = int(fallback_ms) / 1000.0
            return datetime.fromtimestamp(seconds, tz=timezone.utc).strftime(stamp)
        except (TypeError, ValueError):
            pass

    return datetime.now(tz=timezone.utc).strftime(stamp)
|
||||
|
||||
|
||||
def _split_references(raw: str) -> list[str]:
    """Split a References header on whitespace, stripping angle brackets."""
    if not raw:
        return []
    # str.split() with no argument never yields empty or blank tokens.
    return [_strip_angle_brackets(token) for token in raw.split()]
|
||||
|
||||
|
||||
def _strip_angle_brackets(s: str) -> str:
    """Trim whitespace and remove one enclosing ``<...>`` pair, if present.

    None is treated as the empty string.
    """
    text = (s or "").strip()
    is_bracketed = text.startswith("<") and text.endswith(">")
    return text[1:-1] if is_bracketed else text
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- MIME walk
|
||||
|
||||
def _walk_payload(payload: dict) -> tuple[Optional[str], Optional[str], list[dict]]:
    """Returns (body_text, body_html, attachments).

    Parts are visited depth-first, parent before children. The first
    text/plain part that decodes wins for body_text and the first text/html
    part that decodes wins for body_html. Any part with a filename or an
    ``attachment`` Content-Disposition becomes an attachment entry instead.
    If only HTML was found, body_text is derived from it.
    """
    # Keyed by MIME type so both body slots share one code path.
    bodies: dict[str, Optional[str]] = {"text/plain": None, "text/html": None}
    attachments: list[dict] = []

    def _descend(node: dict) -> None:
        mime_type = (node.get("mimeType") or "").lower()
        name = node.get("filename") or ""
        node_body = node.get("body") or {}
        node_headers = _header_map(node.get("headers") or [])
        disposition = (node_headers.get("content-disposition") or "").lower()

        if name or disposition.startswith("attachment"):
            attachments.append({
                "filename": name or f"unnamed.{_ext_for(mime_type)}",
                "mime_type": mime_type or "application/octet-stream",
                "size": node_body.get("size"),
                "gmail_attachment_id": node_body.get("attachmentId"),
                # Some tiny attachments come inlined as base64; attachmentId is
                # then missing and data is in body.data. sync.py handles both.
                "inline_data_b64": node_body.get("data"),
                "content_disposition": "inline" if disposition.startswith("inline") else "attachment",
            })
        elif mime_type in bodies and bodies[mime_type] is None:
            bodies[mime_type] = _decode_body(node_body)

        for child in node.get("parts") or []:
            _descend(child)

    _descend(payload)

    text = bodies["text/plain"]
    html_body = bodies["text/html"]

    # Derive a plain-text body from HTML if only HTML exists.
    if text is None and html_body:
        text = _strip_html(html_body)

    return text, html_body, attachments
|
||||
|
||||
|
||||
def _decode_body(body: dict) -> Optional[str]:
    """Decode a part body's base64url ``data`` field to text.

    Missing padding is restored before decoding. The bytes are decoded as
    UTF-8 with replacement and CRLF is normalized to LF. Returns None when
    there is no data or decoding fails.
    """
    encoded = body.get("data")
    if not encoded:
        return None
    try:
        remainder = len(encoded) % 4
        if remainder:
            # Pad with "=" up to the next multiple of four characters.
            encoded = encoded + "=" * (4 - remainder)
        decoded = base64.urlsafe_b64decode(encoded.encode("ascii"))
        text = decoded.decode("utf-8", errors="replace")
        return text.replace("\r\n", "\n")
    except Exception:
        return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- HTML stripping
|
||||
|
||||
class _HTMLToText(HTMLParser):
    """Collect the visible text of an HTML document.

    Text inside ``<script>`` and ``<style>`` is skipped; block-level tags and
    ``<br>`` are turned into newlines.
    """

    def __init__(self):
        super().__init__()
        self._parts: list[str] = []
        # > 0 while inside <script>/<style>; text is ignored there.
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        if tag in ("script", "style"):
            self._skip_depth += 1
        if tag in ("br", "p", "div", "tr", "li"):
            self._parts.append("\n")

    def handle_endtag(self, tag):
        if tag in ("script", "style"):
            # Clamp at zero so a stray closing tag cannot hide later text.
            self._skip_depth = max(0, self._skip_depth - 1)
        if tag in ("p", "div", "tr"):
            self._parts.append("\n")

    def handle_data(self, data):
        if self._skip_depth == 0:
            self._parts.append(data)

    def text(self) -> str:
        """Return the collected text with runs of 3+ newlines collapsed to 2."""
        raw = "".join(self._parts)
        return re.sub(r"\n{3,}", "\n\n", raw).strip()


def _strip_html(html: str) -> str:
    """Convert HTML to plain text.

    Falls back to a crude tag-stripping regex if the parser raises.
    """
    p = _HTMLToText()
    try:
        p.feed(html)
        # feed() may keep trailing text buffered (e.g. when it ends in a bare
        # "&" such as "AT&T", which could be a truncated character reference).
        # close() forces the buffered remainder to be processed.
        p.close()
        return p.text()
    except Exception:
        return re.sub(r"<[^>]+>", " ", html)
|
||||
|
||||
|
||||
def _ext_for(mime: str) -> str:
    """Return the MIME subtype (text after the last "/"), or "bin" if there is no "/"."""
    _, slash, subtype = mime.rpartition("/")
    if not slash:
        return "bin"
    return subtype
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- caps
|
||||
|
||||
# Keep bodies bounded to avoid a pathological 500MB message exploding the DB.
_BODY_CAP_BYTES = 10 * 1024 * 1024  # 10MB


def _cap_text(s: Optional[str], cap_bytes: int = _BODY_CAP_BYTES) -> Optional[str]:
    """Limit a body to ``cap_bytes`` of UTF-8, appending a truncation notice.

    The limit is applied to the encoded size, so multi-byte text cannot slip
    past it. Truncation happens on a character boundary: a code point split by
    the byte cut is dropped.

    Args:
        s: Body text, or None.
        cap_bytes: Maximum UTF-8 size of the kept text (the notice is added on
            top of it). Defaults to ``_BODY_CAP_BYTES``.

    Returns:
        None for None input, ``s`` unchanged when it fits, otherwise the
        truncated text followed by the notice.
    """
    if s is None:
        return None
    # A code point needs at most 4 bytes in UTF-8, so short strings can skip
    # the encode entirely.
    if len(s) * 4 <= cap_bytes:
        return s
    encoded = s.encode("utf-8", errors="ignore")
    if len(encoded) <= cap_bytes:
        return s
    # errors="ignore" drops a multi-byte sequence cut in half by the slice.
    kept = encoded[:cap_bytes].decode("utf-8", errors="ignore")
    return kept + "\n\n[TRUNCATED BY CRM — body exceeded 10MB]"
|
||||
@@ -0,0 +1,462 @@
|
||||
"""
|
||||
HTTP route handlers for the Gmail integration.
|
||||
|
||||
Designed to plug into server.py's CRMHandler (BaseHTTPRequestHandler) pattern.
|
||||
The hook is a single function call near the top of do_GET / do_POST that
|
||||
lets this module claim any /api/email/* request:
|
||||
|
||||
# in CRMHandler.do_GET and CRMHandler.do_POST, before the 404 fallthrough:
|
||||
from email_integration.routes import try_handle
|
||||
if try_handle(self):
|
||||
return
|
||||
|
||||
`try_handle(handler)` inspects `handler.command` and `handler.get_path()` and
|
||||
returns True if it handled the request (sent a response).
|
||||
|
||||
Every handler respects the same auth / rate-limit model as the rest of server.py
|
||||
by calling handler.get_user() and handler.rate_limited(...).
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
from typing import Optional
|
||||
|
||||
from . import config as _cfg
|
||||
from . import credentials as _creds
|
||||
from . import crypto as _crypto
|
||||
from . import db as _db
|
||||
from . import scheduler as _sched
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- dispatch
|
||||
|
||||
# Exact-match route tables: request path -> handler suffix. try_handle()
# resolves the suffix to the module-level function named "_h_<suffix>".
_GET_ROUTES = {
    "/api/email/status": "status",
    "/api/email/accounts": "list_accounts",
    "/api/email/threads": "list_threads",
    "/api/email/oauth/start": "oauth_start",
    "/api/email/oauth/callback": "oauth_callback",
}

# POST routes; the handlers behind all of these call _require_admin().
_POST_ROUTES = {
    "/api/email/accounts/enroll-all": "enroll_all",
    "/api/email/accounts/enroll": "enroll_one",
    "/api/email/sync/run-now": "run_now",
    "/api/email/rematch": "rematch",
}
|
||||
|
||||
|
||||
def try_handle(handler) -> bool:
    """Claim and serve an ``/api/email/*`` request, if this is one.

    Args:
        handler: The request handler. It must expose ``command``,
            ``get_path()`` and ``send_error_json(message, status)``.

    Returns:
        ``False`` when the path is outside ``/api/email/`` (nothing is sent and
        the caller should keep dispatching); ``True`` once a response has been
        sent, whether success or error.
    """
    path = handler.get_path()
    if not path.startswith("/api/email/"):
        return False

    method = handler.command
    table = _GET_ROUTES if method == "GET" else _POST_ROUTES if method == "POST" else {}
    name = table.get(path)
    if not name:
        # Route is owned by this module but unknown — return a proper 404
        # instead of letting the main dispatcher's 404 abuse counter fire.
        handler.send_error_json("Not found", 404)
        return True

    if not _cfg.CONFIG.enabled:
        handler.send_error_json("Email integration disabled", 503)
        return True

    impl = globals().get(f"_h_{name}")
    if impl is None:
        # The route table names a handler that this module does not define.
        handler.send_error_json("Not implemented", 500)
        return True

    try:
        impl(handler)
    except Exception as e:
        # Keep a server-side record with traceback; previously the failure was
        # only visible to the client.
        import logging

        logging.getLogger("email_integration.routes").exception(
            "unhandled error in %s %s", method, path
        )
        # NOTE(review): the raw exception text is still echoed to the client to
        # keep the response unchanged; consider a generic message instead.
        handler.send_error_json(f"Internal error: {e}", 500)
    return True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- helpers
|
||||
|
||||
def _conn() -> sqlite3.Connection:
    """Open a new SQLite connection to the CRM database.

    The path comes from ``CRM_DB_PATH`` if set, otherwise ``crm.db`` under the
    configured data directory. The connection uses WAL journaling, enforced
    foreign keys, a 5000 ms busy timeout, and ``sqlite3.Row`` rows. The caller
    is responsible for closing it.
    """
    import os

    default_path = os.path.join(_cfg.CONFIG.data_dir, "crm.db")
    database = os.environ.get("CRM_DB_PATH", default_path)

    connection = sqlite3.connect(database)
    for statement in (
        "PRAGMA journal_mode=WAL",
        "PRAGMA foreign_keys=ON",
        "PRAGMA busy_timeout=5000",
    ):
        connection.execute(statement)
    connection.row_factory = sqlite3.Row
    return connection
|
||||
|
||||
|
||||
def _require_auth(handler) -> Optional[dict]:
    """Return the authenticated user, or send a 401 and return ``None``."""
    current = handler.get_user()
    if current:
        return current
    handler.send_error_json("Unauthorized", 401)
    return None
|
||||
|
||||
|
||||
def _require_admin(handler) -> Optional[dict]:
    """Return the authenticated user if their role is ``"admin"``.

    Otherwise an error response has already been sent (401 via
    ``_require_auth`` for anonymous callers, 403 for non-admins) and ``None``
    is returned.
    """
    current = _require_auth(handler)
    if current is None:
        return None
    if current.get("role") == "admin":
        return current
    handler.send_error_json("Admin required", 403)
    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- GET handlers
|
||||
|
||||
def _h_status(handler):
    """GET /api/email/status: scheduler snapshot plus account and match counts.

    Requires an authenticated user. The response is the scheduler's status
    snapshot extended with ``matched_emails`` and ``accounts_summary``.
    """
    if not _require_auth(handler):
        return
    snapshot = _sched.status_snapshot()
    db = _conn()
    try:
        cursor = db.cursor()
        cursor.execute(
            "SELECT COUNT(*) AS n_accounts, "
            "SUM(CASE WHEN sync_status='active' THEN 1 ELSE 0 END) AS n_active, "
            "SUM(CASE WHEN sync_status='error' THEN 1 ELSE 0 END) AS n_error "
            "FROM email_accounts"
        )
        summary_row = cursor.fetchone()
        accounts_summary = dict(summary_row or {})
        cursor.execute("SELECT COUNT(*) AS n FROM emails WHERE match_status = 'matched'")
        snapshot["matched_emails"] = cursor.fetchone()["n"]
    finally:
        db.close()
    snapshot["accounts_summary"] = accounts_summary
    handler.send_json(snapshot)
|
||||
|
||||
|
||||
def _h_list_accounts(handler):
    """GET /api/email/accounts: list enrolled mailboxes.

    Admins receive every account ordered by address; other authenticated
    users receive only the rows whose ``user_id`` matches their own.
    """
    user = _require_auth(handler)
    if not user:
        return
    db = _conn()
    try:
        cursor = db.cursor()
        cursor.execute(
            "SELECT id, user_id, email_address, auth_method, sync_enabled, "
            "sync_status, sync_error, last_synced_at, backfill_complete "
            "FROM email_accounts ORDER BY email_address"
        )
        accounts = [dict(record) for record in cursor.fetchall()]
    finally:
        db.close()
    if user.get("role") != "admin":
        # Non-admins only see their own row.
        accounts = [acct for acct in accounts if acct["user_id"] == user["user_id"]]
    handler.send_json({"accounts": accounts})
|
||||
|
||||
|
||||
def _h_list_threads(handler):
    """GET /api/email/threads: list email threads, newest activity first.

    Query parameters:
        investor_id: Optional. When given, only threads containing an email
            linked to that investor (directly or through one of its
            fundraising contacts) are returned; otherwise all matched threads.
        limit: Optional integer, default 50, clamped to the range 0..500.
            A non-integer value yields a 400.

    Requires an authenticated user.
    """
    user = _require_auth(handler)
    if not user:
        return
    q = handler.get_query_params()
    investor_id = q.get("investor_id")
    try:
        requested = int(q.get("limit", 50))
    except (TypeError, ValueError):
        return handler.send_error_json("limit must be an integer", 400)
    # SQLite treats a negative LIMIT as "no limit", which would bypass the
    # 500-row cap, so clamp from below as well as from above.
    limit = max(0, min(requested, 500))

    conn = _conn()
    try:
        cur = conn.cursor()
        if investor_id:
            cur.execute(
                """SELECT t.*
                   FROM email_threads t
                   JOIN emails e ON e.thread_id = t.id
                   JOIN email_investor_links l ON l.email_id = e.id
                   WHERE l.fundraising_investor_id = ?
                      OR l.fundraising_contact_id IN (
                          SELECT id FROM fundraising_contacts WHERE investor_id = ?
                      )
                   GROUP BY t.id
                   ORDER BY t.last_message_at DESC
                   LIMIT ?""",
                (investor_id, investor_id, limit),
            )
        else:
            cur.execute(
                "SELECT * FROM email_threads WHERE is_matched = 1 "
                "ORDER BY last_message_at DESC LIMIT ?",
                (limit,),
            )
        threads = [dict(r) for r in cur.fetchall()]
    finally:
        conn.close()
    handler.send_json({"threads": threads})
|
||||
|
||||
|
||||
def _h_oauth_start(handler):
    """Begin per-user OAuth consent flow (fallback path).

    Requires an authenticated user, ``primary_auth == "oauth"`` and an
    ``account_email`` query parameter. A random state token is recorded
    against the user and mailbox, and the Google consent URL is returned as
    ``{"redirect_url": ...}``.
    """
    user = _require_auth(handler)
    if not user:
        return
    if _cfg.CONFIG.primary_auth != "oauth":
        return handler.send_error_json(
            "Per-user OAuth disabled (set CRM_GMAIL_AUTH_METHOD=oauth to enable)", 400
        )
    account_email = handler.get_query_params().get("account_email") or ""
    if not account_email:
        return handler.send_error_json("account_email required", 400)

    import secrets
    from urllib.parse import urlencode

    nonce = secrets.token_urlsafe(32)
    _oauth_state_store(nonce, user["user_id"], account_email)

    consent_query = urlencode({
        "client_id": _cfg.CONFIG.oauth_client_id,
        "redirect_uri": _cfg.CONFIG.oauth_redirect_uri,
        "response_type": "code",
        "scope": _creds.GMAIL_READONLY_SCOPE,
        # offline access + forced consent, which the callback relies on to
        # receive a refresh token.
        "access_type": "offline",
        "prompt": "consent",
        "state": nonce,
        "login_hint": account_email,
    })
    handler.send_json(
        {"redirect_url": "https://accounts.google.com/o/oauth2/v2/auth?" + consent_query}
    )
|
||||
|
||||
|
||||
def _h_oauth_callback(handler):
    """Exchange code for tokens, encrypt refresh token, store.

    Validates the single-use ``state`` issued by the start endpoint, trades the
    authorization ``code`` for tokens at Google's token endpoint, encrypts the
    refresh token and saves it on the account row, which is set to 'pending'.
    The caller is identified by the state token alone; no session check is
    made here.
    """
    query = handler.get_query_params()
    code, state = query.get("code"), query.get("state")
    if not (code and state):
        return handler.send_error_json("code and state required", 400)

    pending = _oauth_state_consume(state)
    if not pending:
        return handler.send_error_json("Invalid state", 400)

    import urllib.parse
    import urllib.request

    form = {
        "code": code,
        "client_id": _cfg.CONFIG.oauth_client_id,
        "client_secret": _cfg.CONFIG.oauth_client_secret,
        "redirect_uri": _cfg.CONFIG.oauth_redirect_uri,
        "grant_type": "authorization_code",
    }
    token_request = urllib.request.Request(
        "https://oauth2.googleapis.com/token",
        data=urllib.parse.urlencode(form).encode("ascii"),
        headers={"Content-Type": "application/x-www-form-urlencoded"},
    )
    try:
        with urllib.request.urlopen(token_request, timeout=15) as resp:
            tokens = json.loads(resp.read())
    except Exception as e:
        return handler.send_error_json(f"Token exchange failed: {e}", 500)

    refresh_token = tokens.get("refresh_token")
    if not refresh_token:
        return handler.send_error_json("No refresh_token returned (user may have previously consented; prompt=consent required)", 400)

    encrypted = _crypto.encrypt(
        refresh_token.encode("ascii"), secret_key_b64=_cfg.CONFIG.secret_key_b64
    )

    db = _conn()
    try:
        _db.upsert_account(
            db,
            user_id=pending["user_id"],
            email_address=pending["account_email"],
            auth_method="oauth",
        )
        db.execute(
            "UPDATE email_accounts SET oauth_refresh_enc = ?, sync_status = 'pending', "
            "updated_at = datetime('now') WHERE email_address = ?",
            (encrypted, pending["account_email"]),
        )
        db.commit()
    finally:
        db.close()

    handler.send_json({"ok": True, "account_email": pending["account_email"]})
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- POST handlers
|
||||
|
||||
def _h_enroll_all(handler):
    """Admin: enroll every CRM user whose email is @workspace_domain via DWD.

    Only valid when ``primary_auth`` is "dwd" and a workspace domain is
    configured. Each active user with a matching address is upserted as a
    "dwd" account; the response lists the enrolled accounts and their count.
    """
    if not _require_admin(handler):
        return
    if _cfg.CONFIG.primary_auth != "dwd":
        return handler.send_error_json("enroll-all only valid in DWD mode", 400)
    domain = _cfg.CONFIG.workspace_domain
    if not domain:
        return handler.send_error_json("CRM_GMAIL_WORKSPACE_DOMAIN not set", 400)

    db = _conn()
    try:
        cursor = db.cursor()
        cursor.execute(
            "SELECT id, email FROM users WHERE is_active = 1 AND email LIKE ?",
            (f"%@{domain}",),
        )
        enrolled = []
        for member in cursor.fetchall():
            account_id = _db.upsert_account(
                db,
                user_id=member["id"],
                email_address=member["email"].lower(),
                auth_method="dwd",
            )
            # The response echoes the address as stored on the user row.
            enrolled.append({"account_id": account_id, "email": member["email"]})
        db.commit()
    finally:
        db.close()
    handler.send_json({"enrolled": enrolled, "count": len(enrolled)})
|
||||
|
||||
|
||||
def _h_enroll_one(handler):
    """Admin: enroll a single mailbox for syncing.

    JSON body:
        email / email_address: Mailbox address (required; lower-cased).
        user_id: Optional owning CRM user. When omitted it is resolved from the
            users table by email, falling back to the calling admin's own id.
        auth_method: Optional; defaults to the configured primary auth.

    Responds with ``{"account_id", "email", "user_id"}``, or 400 when the
    address is missing or no user id could be resolved.
    """
    user = _require_admin(handler)
    if not user:
        return
    body = handler.get_body() or {}
    # Accept either `email` or `email_address` for ergonomics.
    email_address = (body.get("email_address") or body.get("email") or "").lower().strip()
    user_id = body.get("user_id")
    auth_method = body.get("auth_method") or _cfg.CONFIG.primary_auth

    if not email_address:
        return handler.send_error_json("email (or email_address) required", 400)

    # If the caller didn't specify a CRM user_id, resolve it from the
    # users table by matching email. Falls back to the authenticated
    # admin's own id (handles the common case of a single admin
    # enrolling themselves without having to paste their UUID).
    if not user_id:
        conn = _conn()
        try:
            cur = conn.cursor()
            cur.execute("SELECT id FROM users WHERE LOWER(email) = ?",
                        (email_address,))
            row = cur.fetchone()
            # The other handlers in this module read the caller's id from
            # "user_id"; "id" is kept as a secondary lookup for compatibility.
            user_id = row["id"] if row else (user.get("user_id") or user.get("id"))
        finally:
            conn.close()

    if not user_id:
        return handler.send_error_json("could not resolve user_id for that email", 400)

    conn = _conn()
    try:
        aid = _db.upsert_account(conn, user_id=user_id,
                                 email_address=email_address,
                                 auth_method=auth_method)
        conn.commit()
    finally:
        conn.close()
    handler.send_json({"account_id": aid, "email": email_address, "user_id": user_id})
|
||||
|
||||
|
||||
def _h_run_now(handler):
    """Admin: run one sync pass immediately and return its result."""
    if not _require_admin(handler):
        return
    # Reuse existing rate limit so admins can't hammer this.
    throttled = handler.rate_limited("email-sync-now", 6)
    if throttled:
        return handler.send_error_json("Too many requests", 429)
    handler.send_json(_sched.trigger_run_now())
|
||||
|
||||
|
||||
def _h_rematch(handler):
    """Re-evaluate unmatched emails against the current investor index.

    Admin only. Scans up to 10000 unmatched emails (newest first, optionally
    restricted by the body's ``since`` timestamp), marks each email that now
    matches and records its investor links. Responds with the number scanned
    and the number newly matched.
    """
    if not _require_admin(handler):
        return
    payload = handler.get_body() or {}
    since = payload.get("since")  # optional ISO8601
    db = _conn()
    scanned = 0
    matched = 0
    try:
        from .matcher import InvestorIndex

        index = InvestorIndex(own_domain=_cfg.CONFIG.workspace_domain)
        index.rebuild(_conn)

        cursor = db.cursor()
        query = ("SELECT id, from_email, to_emails_json, cc_emails_json "
                 "FROM emails WHERE match_status = 'unmatched'")
        args: list = []
        if since:
            query += " AND sent_at >= ?"
            args.append(since)
        query += " ORDER BY sent_at DESC LIMIT 10000"
        cursor.execute(query, args)

        for email_row in cursor.fetchall():
            scanned += 1

            # Collect every participant address, lower-cased.
            addresses = set()
            if email_row["from_email"]:
                addresses.add(email_row["from_email"].lower())
            for column in ("to_emails_json", "cc_emails_json"):
                try:
                    entries = json.loads(email_row[column] or "[]")
                except Exception:
                    entries = []
                for entry in entries:
                    # Entries may be {"email": ...} objects or bare strings.
                    address = entry.get("email") if isinstance(entry, dict) else entry
                    if address:
                        addresses.add(address.lower())

            links = index.match(addresses)
            if not links:
                continue

            matched += 1
            db.execute(
                "UPDATE emails SET match_status='matched', is_matched=1, "
                "updated_at=datetime('now') WHERE id=?",
                (email_row["id"],),
            )
            for link in links:
                _db.insert_investor_link(db, email_id=email_row["id"], link={
                    "matched_address": link.matched_address,
                    "match_kind": link.match_kind,
                    "match_confidence": link.match_confidence,
                    "fundraising_investor_id": link.target.fundraising_investor_id,
                    "fundraising_contact_id": link.target.fundraising_contact_id,
                    "contact_id": link.target.contact_id,
                    "organization_id": link.target.organization_id,
                })
            # NOTE: body is still missing — we only have headers. A follow-up
            # job can re-fetch the full message from Gmail using the sighting's
            # gmail_message_id. Not done inline to keep this endpoint fast.
        db.commit()
    finally:
        db.close()
    handler.send_json({"scanned": scanned, "newly_matched": matched})
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- OAuth state store (in-memory)
|
||||
# For a 5-person CRM the state store doesn't need to be durable — a server
|
||||
# restart between start and callback is rare and just requires a retry.
|
||||
|
||||
# state token -> {"user_id", "account_email", "created" (unix seconds)}
_oauth_states: dict[str, dict] = {}
_oauth_state_lock = __import__("threading").Lock()
# Seconds an issued state token remains valid.
_OAUTH_STATE_TTL_SEC = 600


def _oauth_state_store(state: str, user_id: str, account_email: str) -> None:
    """Remember *state* for a pending OAuth flow and prune expired entries.

    The account email is stored lower-cased and stripped.
    """
    import time

    with _oauth_state_lock:
        # Prune stale entries (>10 min).
        cutoff = time.time() - _OAUTH_STATE_TTL_SEC
        for k, v in list(_oauth_states.items()):
            if v["created"] < cutoff:
                _oauth_states.pop(k, None)
        _oauth_states[state] = {
            "user_id": user_id,
            "account_email": account_email.lower().strip(),
            "created": time.time(),
        }


def _oauth_state_consume(state: str) -> Optional[dict]:
    """Remove and return the entry for *state*; ``None`` if unknown or expired.

    Each state is single-use. Expiry is enforced here as well as in the store:
    pruning only happens when another flow is started, so without this check a
    stale state would stay redeemable indefinitely on a quiet server.
    """
    import time

    with _oauth_state_lock:
        entry = _oauth_states.pop(state, None)
    if entry is None:
        return None
    if entry["created"] < time.time() - _OAUTH_STATE_TTL_SEC:
        return None
    return entry
|
||||
@@ -0,0 +1,143 @@
|
||||
"""
|
||||
Background sync scheduler.
|
||||
|
||||
Runs as a daemon thread started from server.py main(). One thread; it wakes
|
||||
every `sync_interval_sec`, processes all accounts serially, sleeps again.
|
||||
|
||||
Singleton: start_sync_scheduler() is idempotent — calling twice won't spawn
|
||||
a second thread. stop_sync_scheduler() gracefully signals shutdown (not
|
||||
strictly needed since it's daemon, but useful for tests).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
import threading
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
|
||||
from . import config as _cfg
|
||||
from . import credentials as _creds
|
||||
from . import sync as _sync
|
||||
from .matcher import InvestorIndex
|
||||
|
||||
|
||||
log = logging.getLogger("email_integration.scheduler")
|
||||
|
||||
|
||||
# Module-wide scheduler state shared by the functions below.
# start_sync_scheduler() additionally stores "provider", "index" and "factory"
# here once the worker thread has been started.
_state: dict[str, object] = {
    "thread": None,              # worker thread, or None when not running
    "stop": threading.Event(),   # replaced with a fresh Event on each start
    "last_run": 0.0,             # time.time() at the start of the latest loop pass
    "last_result": None,         # summary returned by the latest successful sync_all
    "running_now": False,        # True while the scheduler loop is inside sync_all
}
|
||||
|
||||
|
||||
def _conn_factory_from_env() -> Callable[[], sqlite3.Connection]:
    """Build a get_db() compatible with server.py's pattern.

    server.py is deliberately not imported (circular import / startup order);
    the connection settings are re-implemented here instead. The database path
    is resolved once, when the factory is built: ``CRM_DB_PATH`` if set,
    otherwise ``crm.db`` under the configured data directory.
    """
    import os

    fallback_path = os.path.join(_cfg.CONFIG.data_dir, "crm.db")
    db_path = os.environ.get("CRM_DB_PATH", fallback_path)

    def get_db() -> sqlite3.Connection:
        connection = sqlite3.connect(db_path)
        for statement in (
            "PRAGMA journal_mode=WAL",
            "PRAGMA foreign_keys=ON",
            "PRAGMA busy_timeout=5000",
        ):
            connection.execute(statement)
        connection.row_factory = sqlite3.Row
        return connection

    return get_db
|
||||
|
||||
|
||||
def start_sync_scheduler(conn_factory: Optional[Callable] = None) -> None:
    """Start the background sync thread unless one is already registered.

    Does nothing when the integration is disabled or when the credential
    provider cannot be built. A failed initial investor-index build is logged
    but does not prevent the scheduler from starting.

    Args:
        conn_factory: Zero-argument callable returning a DB connection.
            Defaults to the environment-derived factory.
    """
    if _state["thread"] is not None:
        return  # already running

    if not _cfg.CONFIG.enabled:
        log.info("email_integration not enabled; scheduler will not start")
        return

    factory = conn_factory or _conn_factory_from_env()

    try:
        provider = _creds.build_provider(factory)
    except Exception as e:
        log.exception("cannot build credential provider: %s", e)
        return

    index = InvestorIndex(own_domain=_cfg.CONFIG.workspace_domain)
    try:
        index.rebuild(factory)
    except Exception:
        log.exception("initial investor-index build failed; scheduler continues")

    stop = threading.Event()
    _state["stop"] = stop

    def _run_forever():
        log.info("email sync scheduler started; interval=%ss", _cfg.CONFIG.sync_interval_sec)
        # First cycle: short delay to let server finish startup.
        if stop.wait(10):
            return
        while not stop.is_set():
            _state["running_now"] = True
            started_at = time.time()
            try:
                _state["last_result"] = _sync.sync_all(factory, provider, index)
            except Exception:
                log.exception("sync loop crashed; will retry next cycle")
            finally:
                _state["running_now"] = False
                _state["last_run"] = started_at
            # Sleep until the next cycle, waking early if asked to stop.
            if stop.wait(_cfg.CONFIG.sync_interval_sec):
                return

    worker = threading.Thread(target=_run_forever, name="email-sync", daemon=True)
    worker.start()
    _state.update(thread=worker, provider=provider, index=index, factory=factory)
|
||||
|
||||
|
||||
def stop_sync_scheduler() -> None:
    """Signal the scheduler to stop, wait up to 5 seconds, and clear the thread slot."""
    stop_event: threading.Event = _state["stop"]  # type: ignore
    stop_event.set()
    worker = _state.get("thread")
    if worker:
        try:
            worker.join(timeout=5)
        except Exception:
            # Best effort; a failed join must not block shutdown.
            pass
    _state["thread"] = None
|
||||
|
||||
|
||||
# Held for the duration of a manual pass so that overlapping "sync now"
# requests collapse into a single run.
_manual_run_lock = threading.Lock()


def trigger_run_now() -> dict:
    """Force a single sync pass synchronously (admin 'sync now' endpoint).

    Returns:
        ``{"status": "already_running"}`` when the scheduler loop or another
        manual pass is in progress, ``{"status": "not_initialized"}`` when the
        scheduler has not been started, otherwise the ``sync_all`` summary.
    """
    if _state.get("running_now"):
        return {"status": "already_running"}
    factory = _state.get("factory")
    provider = _state.get("provider")
    index = _state.get("index")
    if not (factory and provider and index):
        return {"status": "not_initialized"}
    # "running_now" is only set by the scheduler loop, so it cannot detect a
    # manual pass that is still running; the lock covers that case.
    if not _manual_run_lock.acquire(blocking=False):
        return {"status": "already_running"}
    try:
        return _sync.sync_all(factory, provider, index)  # type: ignore
    finally:
        _manual_run_lock.release()
|
||||
|
||||
|
||||
def status_snapshot() -> dict:
    """Return the scheduler's current state as a plain dict for the status endpoint."""
    snapshot = {"enabled": _cfg.CONFIG.enabled}
    snapshot["running"] = _state["running_now"]
    snapshot["last_run_unix"] = _state.get("last_run")
    snapshot["last_result"] = _state.get("last_result")
    snapshot["interval_sec"] = _cfg.CONFIG.sync_interval_sec
    return snapshot
|
||||
@@ -0,0 +1,390 @@
|
||||
"""
|
||||
Sync orchestrator.
|
||||
|
||||
Top-level entry points:
|
||||
|
||||
sync_account(conn_factory, credential_provider, account_row, matcher)
|
||||
Full sync pass for one mailbox. Decides backfill vs. incremental based
|
||||
on email_accounts.backfill_complete. Writes a sync_runs row.
|
||||
|
||||
sync_all(conn_factory, credential_provider, matcher)
|
||||
Iterates every sync-enabled account sequentially. Called from
|
||||
scheduler.py every CRM_GMAIL_SYNC_INTERVAL_MIN minutes.
|
||||
|
||||
Design: match-only storage (see architecture doc §7). For each message:
|
||||
1. Fetch metadata (cheap, 5 units).
|
||||
2. Run matcher against participant addresses.
|
||||
3. If matched → fetch full message, parse, persist body + register attachments.
|
||||
4. If unmatched → persist header-only row.
|
||||
5. In both cases, record the per-account sighting.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
import traceback
|
||||
from typing import Optional
|
||||
|
||||
from . import attachments as _attach
|
||||
from . import config as _cfg
|
||||
from . import db as _db
|
||||
from . import errors as _errors
|
||||
from . import gmail_client as _gmail
|
||||
from . import parser as _parser
|
||||
from . import threads as _threads
|
||||
from .matcher import InvestorIndex, InvestorLink
|
||||
|
||||
|
||||
log = logging.getLogger("email_integration.sync")
|
||||
|
||||
|
||||
# Header names requested on the cheap format="metadata" message fetch; the
# parsed result supplies the participant addresses used for matching.
METADATA_HEADERS = [
    "From", "To", "Cc", "Bcc", "Subject", "Date",
    "Message-ID", "In-Reply-To", "References", "Reply-To",
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- public
|
||||
|
||||
def sync_all(conn_factory, credential_provider, index: InvestorIndex) -> dict:
    """Run one pass across all enabled accounts. Returns summary stats.

    The investor index is refreshed if stale, then every sync-ready account is
    synced in turn. An exception escaping one account is logged and counted
    under ``errors`` without stopping the remaining accounts.
    """
    index.rebuild_if_stale(conn_factory)

    db = conn_factory()
    try:
        ready_accounts = _db.list_sync_ready_accounts(db)
    finally:
        db.close()

    summary = {"accounts": 0, "messages_stored": 0, "errors": 0}
    for account in ready_accounts:
        summary["accounts"] += 1
        try:
            result = sync_account(conn_factory, credential_provider, account, index)
            summary["messages_stored"] += result.get("messages_stored", 0)
        except Exception:
            summary["errors"] += 1
            log.exception("sync failed for account %s", account["email_address"])
    return summary
|
||||
|
||||
|
||||
def sync_account(conn_factory, credential_provider, account,
                 index: InvestorIndex) -> dict:
    """Sync a single mailbox. Returns stats dict.

    Runs a backfill when ``account["backfill_complete"]`` is falsy, otherwise
    an incremental history sync, then drains up to 100 pending attachments.
    A sync_runs row is opened before the work and closed afterwards with the
    final status ("ok", "partial" or "error").

    Failure handling:
        * Auth errors and unexpected errors are recorded as "error" and do not
          propagate.
        * An expired history checkpoint triggers a date-based fallback; the
          run is "partial" if that succeeds and "error" if it fails too.
        * ``last_synced_at`` only advances when the run did not end in
          "error". The date fallback uses it as its starting point, so
          advancing it after a failed run would shrink the window that the
          fallback re-fetches.
    """
    email_addr = account["email_address"]
    stats = _gmail.CallStats()
    client = _gmail.GmailClient(credential_provider, email_addr, stats=stats)

    # Mark running
    conn = conn_factory()
    try:
        run_id = _db.start_sync_run(conn,
                                    account_id=account["id"],
                                    kind="backfill" if not account["backfill_complete"] else "incremental")
        _db.set_account_status(conn, account["id"], status="active", error=None)
        conn.commit()
    finally:
        conn.close()

    run_stats = {"messages_seen": 0, "messages_stored": 0, "attachments_saved": 0}
    error_str: Optional[str] = None
    status = "ok"

    try:
        if not account["backfill_complete"]:
            _run_backfill(conn_factory, client, account, index, run_stats)
        else:
            _run_incremental(conn_factory, client, account, index, run_stats)

        # Drain attachments for this account, limited to a few cycles' worth
        # per pass. drain_pending is handed the connection factory, so no
        # connection is opened here.
        run_stats["attachments_saved"] = _attach.drain_pending(
            conn_factory, client, account["id"], limit=100
        )

    except _errors.AuthError as e:
        error_str = f"auth: {e}"
        status = "error"
    except _errors.HistoryExpiredError:
        # Recover: reset to date-based backfill from last_synced_at.
        error_str = "history expired; fallback to date backfill"
        status = "partial"
        try:
            _fallback_date_backfill(conn_factory, client, account, index, run_stats)
        except Exception as e:
            # Record the failed recovery instead of reporting it as "partial".
            error_str = f"history expired; date backfill failed: {type(e).__name__}: {e}"
            status = "error"
            log.exception("date-backfill fallback failed for %s", email_addr)
    except Exception as e:
        error_str = f"unexpected: {type(e).__name__}: {e}"
        status = "error"
        log.exception("unexpected during sync of %s", email_addr)
    finally:
        run_stats["api_calls"] = stats.api_calls
        run_stats["retries"] = stats.retries
        conn = conn_factory()
        try:
            _db.finish_sync_run(conn, run_id, status=status, stats=run_stats, error=error_str)
            _db.set_account_status(conn, account["id"],
                                   status="active" if status == "ok" else status,
                                   error=error_str)
            if status != "error":
                _db.set_account_checkpoint(conn, account["id"],
                                           last_synced_at=_db._now_iso())
            conn.commit()
        finally:
            conn.close()

    return run_stats
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- backfill
|
||||
|
||||
def _run_backfill(conn_factory, client, account, index: InvestorIndex,
                  run_stats: dict) -> None:
    """Initial full-mailbox backfill, resumable via backfill_cursor.

    Walks the mailbox listing page by page starting from the saved cursor.
    After each page the next cursor is persisted (and the backfill marked
    complete when there are no more pages), so an interrupted run resumes
    from the last finished page. Messages that raise a GmailError are logged
    and skipped. Once the last page is done, the mailbox's current historyId
    is stored as the starting point for incremental syncs.
    """
    cursor_token = account["backfill_cursor"]
    while True:
        page = client.list_messages(page_token=cursor_token,
                                    max_results=_cfg.CONFIG.backfill_page_size)
        for ref in page.get("messages") or []:
            run_stats["messages_seen"] += 1
            try:
                _process_one_message(conn_factory, client, account, index,
                                     gmail_message_id=ref["id"], run_stats=run_stats)
            except _errors.GmailError as e:
                log.warning("skip msg %s on %s: %s", ref["id"], account["email_address"], e)

        cursor_token = page.get("nextPageToken")
        db = conn_factory()
        try:
            _db.set_account_checkpoint(db, account["id"],
                                       backfill_cursor=cursor_token,
                                       backfill_complete=(not cursor_token))
            db.commit()
        finally:
            db.close()

        if cursor_token:
            continue

        # Capture current historyId as checkpoint for future incrementals.
        history_id = client.get_profile().get("historyId")
        if history_id:
            db = conn_factory()
            try:
                _db.set_account_checkpoint(db, account["id"], history_id=str(history_id))
                db.commit()
            finally:
                db.close()
        return
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- incremental
|
||||
|
||||
def _run_incremental(conn_factory, client, account, index: InvestorIndex,
                     run_stats: dict) -> None:
    """Apply Gmail history changes recorded since the account's checkpoint.

    For each history record: added messages are processed like new mail,
    deleted messages have their sighting tombstoned, and label changes
    overwrite the sighting's stored labels. Afterwards the new history id is
    saved as the next checkpoint.

    Raises:
        HistoryExpiredError: propagated unchanged so the caller can fall back
            to a date-based backfill.
    """
    start_hid = account["last_history_id"]
    if not start_hid:
        # Safety: if checkpoint is missing, re-enter backfill.
        _run_backfill(conn_factory, client, account, index, run_stats)
        return

    # history_types filter limits bandwidth to what we care about.
    new_hid: Optional[str] = None
    try:
        for h in client.iter_history(
            start_history_id=start_hid,
            history_types=["messageAdded", "messageDeleted", "labelAdded", "labelRemoved"],
        ):
            for ma in h.get("messagesAdded") or []:
                msg = ma.get("message") or {}
                run_stats["messages_seen"] += 1
                try:
                    _process_one_message(conn_factory, client, account, index,
                                         gmail_message_id=msg.get("id"),
                                         run_stats=run_stats)
                except _errors.GmailError as e:
                    # One bad message must not abort the whole history walk.
                    log.warning("skip msg %s on %s: %s", msg.get("id"), account["email_address"], e)

            for md in h.get("messagesDeleted") or []:
                msg = md.get("message") or {}
                conn = conn_factory()
                try:
                    _db.tombstone_sighting(
                        conn,
                        account_id=account["id"],
                        gmail_message_id=msg.get("id"),
                    )
                    conn.commit()
                finally:
                    conn.close()

            for la in (h.get("labelsAdded") or []) + (h.get("labelsRemoved") or []):
                msg = la.get("message") or {}
                # labels are the resulting label set in Gmail's payload after
                # the change. We refresh them wholesale.
                labels = msg.get("labelIds") or []
                conn = conn_factory()
                try:
                    _db.update_sighting_labels(
                        conn,
                        account_id=account["id"],
                        gmail_message_id=msg.get("id"),
                        labels=labels,
                    )
                    conn.commit()
                finally:
                    conn.close()
        # NOTE(review): read once after the history walk; confirm this
        # placement (after the loop rather than inside it) against the
        # repository copy of this file.
        new_hid = client.last_history_id
    except _errors.HistoryExpiredError:
        raise

    if new_hid:
        conn = conn_factory()
        try:
            _db.set_account_checkpoint(conn, account["id"], history_id=str(new_hid))
            conn.commit()
        finally:
            conn.close()
|
||||
|
||||
|
||||
def _fallback_date_backfill(conn_factory, client, account, index, run_stats):
    """Used when startHistoryId has been pruned by Gmail.

    Pulls everything since the day before last_synced_at (or 14 days back if
    that is missing or unparseable), which will hit a large overlap with
    existing data but upserts are idempotent. Afterwards the mailbox's current
    historyId is stored as the new incremental checkpoint.

    The start date is taken from the first ten characters of last_synced_at,
    so both "YYYY-MM-DDTHH:MM:SSZ" and "YYYY-MM-DD HH:MM:SS" work. One extra
    day of overlap is added because Gmail interprets ``after:`` dates as
    midnight Pacific time, which is later than midnight UTC; without the
    margin, mail received between the two would be skipped.
    """
    from datetime import datetime, timedelta, timezone

    start = datetime.now(tz=timezone.utc) - timedelta(days=14)
    last_synced = account["last_synced_at"]
    if last_synced:
        try:
            start = datetime.strptime(str(last_synced)[:10], "%Y-%m-%d").replace(
                tzinfo=timezone.utc
            )
        except ValueError:
            log.warning("unparseable last_synced_at %r; using 14-day window", last_synced)
    start -= timedelta(days=1)

    q = f"after:{start.strftime('%Y/%m/%d')}"
    for m in client.iter_messages(q=q):
        run_stats["messages_seen"] += 1
        try:
            _process_one_message(conn_factory, client, account, index,
                                 gmail_message_id=m["id"], run_stats=run_stats)
        except _errors.GmailError as e:
            log.warning("skip during date-backfill msg %s: %s", m["id"], e)

    prof = client.get_profile()
    hid = prof.get("historyId")
    if hid:
        conn = conn_factory()
        try:
            _db.set_account_checkpoint(conn, account["id"], history_id=str(hid))
            conn.commit()
        finally:
            conn.close()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------- per-message
|
||||
|
||||
def _process_one_message(conn_factory, client, account, index: InvestorIndex,
                         *, gmail_message_id: str, run_stats: dict) -> None:
    """Fetch, match, and persist one Gmail message. Idempotent.

    Flow:
      1. Return early if this (account, gmail_message_id) was already sighted.
      2. Fetch metadata only and match participants against ``index``.
      3. Fetch the full message only when a participant matched; otherwise
         body and attachment fields are blanked before storing.
      4. Insert the email (or upgrade a previously unmatched row), then record
         the per-account sighting and commit.

    Args:
        conn_factory: zero-argument callable returning a fresh DB connection.
        client: Gmail client exposing ``get_message``.
        account: mapping with ``id`` and ``email_address``.
        index: investor index used to match participant addresses.
        gmail_message_id: Gmail id of the message; empty values are ignored.
        run_stats: mutable counters; ``messages_stored`` is incremented once a
            newly inserted email has been committed.
    """
    if not gmail_message_id:
        return

    # Skip if we've already sighted this message for this account.
    conn = conn_factory()
    try:
        cur = conn.cursor()
        cur.execute(
            "SELECT email_id FROM email_account_messages "
            "WHERE account_id = ? AND gmail_message_id = ?",
            (account["id"], gmail_message_id),
        )
        if cur.fetchone():
            return
    finally:
        conn.close()

    # 1. Metadata fetch (cheap).
    meta = client.get_message(gmail_message_id, format="metadata",
                              metadata_headers=METADATA_HEADERS)
    meta_parsed = _parser.parse(meta, owning_account_address=account["email_address"])

    participants = set()
    if meta_parsed.get("from_email"):
        participants.add(meta_parsed["from_email"])
    for kind in ("to", "cc", "bcc"):
        for a in meta_parsed.get(kind, []):
            if isinstance(a, dict) and a.get("email"):
                participants.add(a["email"])

    # Exclude owning account's own address so we don't try to "match" ourselves.
    own = {account["email_address"].lower()}
    links = index.match(participants, exclude_addresses=own)
    is_matched = bool(links)

    # 2. If matched, fetch full and parse for body + attachments.
    if is_matched:
        full = client.get_message(gmail_message_id, format="full")
        parsed = _parser.parse(full, owning_account_address=account["email_address"])
    else:
        parsed = meta_parsed
        # Strip any body fields (metadata fetch shouldn't have them but be safe).
        parsed["body_text"] = None
        parsed["body_html"] = None
        parsed["attachments"] = []

    # 3. Persist (idempotent on rfc_message_id).
    conn = conn_factory()
    try:
        newly_stored = False
        existing = _db.find_email_by_rfc_id(conn, parsed["rfc_message_id"])
        if existing:
            email_id = existing["id"]
            # If the email was previously unmatched but now matches (e.g. user
            # added the investor after first sight), upgrade the row.
            if is_matched and existing["match_status"] == "unmatched":
                conn.execute(
                    "UPDATE emails SET match_status = 'matched', is_matched = 1, "
                    "body_text = ?, body_html = ?, updated_at = datetime('now') "
                    "WHERE id = ?",
                    (parsed.get("body_text"), parsed.get("body_html"), email_id),
                )
                _attach.register_stubs(conn,
                                       email_id=email_id,
                                       parsed_attachments=parsed.get("attachments") or [])
                for link in links:
                    _db.insert_investor_link(conn, email_id=email_id, link=_flatten_link(link))
        else:
            match_status = "matched" if is_matched else "unmatched"
            email_id = _db.insert_email(conn, parsed=parsed, match_status=match_status)
            thread_id = _threads.resolve_thread_id(conn, parsed)
            _db.set_email_thread(conn, email_id, thread_id)
            if is_matched:
                _attach.register_stubs(conn,
                                       email_id=email_id,
                                       parsed_attachments=parsed.get("attachments") or [])
                for link in links:
                    _db.insert_investor_link(conn, email_id=email_id, link=_flatten_link(link))
            _db.rollup_thread(conn, thread_id)
            newly_stored = True

        # Record sighting (always, even if email row was pre-existing).
        _db.upsert_sighting(
            conn,
            email_id=email_id,
            account_id=account["id"],
            gmail_message_id=gmail_message_id,
            gmail_thread_id=parsed.get("gmail_thread_id") or "",
            labels=parsed.get("labels", []),
            is_sent=parsed.get("is_sent", False),
        )
        conn.commit()
        # Count only after the commit succeeded, so a rolled-back insert is
        # never reported as stored.
        if newly_stored:
            run_stats["messages_stored"] += 1
    except sqlite3.IntegrityError as exc:
        # Likely a concurrent insert of the same message. Nothing was
        # committed, so no sighting exists and the next sync will pick the
        # message up again. Roll back explicitly and leave a trace instead of
        # swallowing the error silently.
        conn.rollback()
        log.warning("integrity conflict storing msg %s (will retry on next sync): %s",
                    gmail_message_id, exc)
    finally:
        conn.close()
|
||||
|
||||
|
||||
def _flatten_link(link: InvestorLink) -> dict:
    """Flatten an InvestorLink (match details + target ids) into a plain dict."""
    flat = {
        "matched_address": link.matched_address,
        "match_kind": link.match_kind,
        "match_confidence": link.match_confidence,
    }
    target = link.target
    # Target ids are copied across under the same names, in a fixed order.
    for field in ("fundraising_investor_id", "fundraising_contact_id",
                  "contact_id", "organization_id"):
        flat[field] = getattr(target, field)
    return flat
|
||||
@@ -0,0 +1,75 @@
|
||||
"""
|
||||
Threading resolution.
|
||||
|
||||
Given a freshly-inserted emails row (or its about-to-be-inserted parsed dict),
|
||||
figure out which email_threads row it belongs to. If none exists, create one.
|
||||
|
||||
Priority order (see architecture doc §10):
|
||||
1. Existing email in our DB that shares any RFC Message-ID with this one's
|
||||
References/In-Reply-To chain — inherit its thread.
|
||||
2. Existing thread with the same gmail_thread_id.
|
||||
3. Existing thread with the same rfc_thread_root_id.
|
||||
4. Create a new thread.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sqlite3
|
||||
from typing import Optional
|
||||
|
||||
from . import db as _db
|
||||
|
||||
|
||||
SUBJECT_PREFIX_RE = re.compile(r"^\s*(re|fwd?|aw|sv|antw|回复|fw)\s*:\s*", re.IGNORECASE)


def normalize_subject(s: Optional[str]) -> Optional[str]:
    """Return the subject lower-cased with leading reply/forward markers removed.

    At most five leading prefixes (``Re:``, ``Fwd:``, ``AW:`` ...) are peeled
    off. An empty or missing subject yields ``None``.
    """
    if not s:
        return None
    remaining = s
    stripped_prefixes = 0
    # The pattern is anchored at the start, so each match is exactly one
    # leading prefix; stop at the first non-match or after five removals.
    while stripped_prefixes < 5:
        hit = SUBJECT_PREFIX_RE.match(remaining)
        if hit is None:
            break
        remaining = remaining[hit.end():]
        stripped_prefixes += 1
    return remaining.strip().lower()
|
||||
|
||||
|
||||
def resolve_thread_id(conn: sqlite3.Connection, parsed: dict) -> str:
    """Return the thread id for ``parsed``, creating a thread when none fits.

    Lookup order: (1) thread of any stored email whose RFC id appears in this
    message's References / In-Reply-To, (2) thread with the same
    ``gmail_thread_id``, (3) thread with the same ``rfc_thread_root_id``,
    (4) a newly created thread.
    """
    # Step 1: RFC cross-link.
    rfc_ids = list(parsed.get("references") or [])
    reply_to = parsed.get("in_reply_to")
    if reply_to:
        rfc_ids.append(reply_to)

    linked_email_id = _db.find_email_id_by_any_rfc_id(conn, rfc_ids) if rfc_ids else None
    if linked_email_id:
        cursor = conn.cursor()
        cursor.execute("SELECT thread_id FROM emails WHERE id = ?", (linked_email_id,))
        linked = cursor.fetchone()
        # A linked email with no thread yet falls through to the later steps.
        if linked and linked["thread_id"]:
            return linked["thread_id"]

    # Step 2: gmail_thread_id match.
    gmail_tid = parsed.get("gmail_thread_id")
    if gmail_tid:
        by_gmail = _db.find_thread_by_gmail_id(conn, gmail_tid)
        if by_gmail:
            return by_gmail["id"]

    # Step 3: RFC thread-root match.
    root_id = parsed.get("rfc_thread_root_id")
    if root_id:
        by_root = _db.find_thread_by_rfc_root(conn, root_id)
        if by_root:
            return by_root["id"]

    # Step 4: create.
    new_thread_fields = {
        "gmail_thread_id": gmail_tid,
        "rfc_thread_root_id": root_id,
        "subject_normalized": normalize_subject(parsed.get("subject")),
        "first_message_at": parsed.get("sent_at"),
    }
    return _db.create_thread(conn, **new_thread_fields)
|
||||
@@ -0,0 +1,5 @@
|
||||
"""Ten31 Phase-0 ingest pipeline (entity resolution, chunking, embed, Qdrant upsert).
|
||||
|
||||
All modules are local-only and read the CRM by SQLite file path (CRM is canonical;
|
||||
the canonical/vector layers are derived). No real data is sent to Claude here.
|
||||
"""
|
||||
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Phase-0 Workstream B — backfill the CRM into Qdrant.
|
||||
|
||||
Chunk -> dense (bge-m3 via Spark Control) + sparse (BM25 client-side) -> upsert
|
||||
to Qdrant `crm_chunks` with payload. Idempotent: deterministic point ids mean
|
||||
re-running upserts in place. Reads the CRM by file path; never sends data to Claude.
|
||||
|
||||
python3 backend/ingest/backfill.py --db data/crm_dev.db --recreate
|
||||
"""
|
||||
import argparse
|
||||
import sqlite3
|
||||
|
||||
import chunking
|
||||
import config
|
||||
import embed
|
||||
import qdrant_io
|
||||
import sparse
|
||||
|
||||
|
||||
def run(db, recreate=False, batch=32):
    """Chunk the CRM at ``db`` and upsert dense + sparse vectors into Qdrant.

    Args:
        db: path to the CRM SQLite file.
        recreate: passed to ``qdrant_io.create_collection``.
        batch: number of chunks embedded and upserted per round trip (>= 1).

    Raises:
        ValueError: if ``batch`` is smaller than 1.
        RuntimeError: if the embedder returns a different number of vectors
            than the number of chunks sent.
    """
    # Validate before touching Qdrant: a non-positive batch used to fail (or,
    # when negative, silently do nothing) only after the collection had
    # already been created or recreated.
    if batch < 1:
        raise ValueError(f"batch must be >= 1, got {batch}")

    conn = sqlite3.connect(db)
    try:
        conn.row_factory = sqlite3.Row
        chunks = chunking.build_chunks(conn)
    finally:
        # Always release the SQLite handle, even if chunking raises.
        conn.close()
    print(f"Built {len(chunks)} chunks from {db}")

    state = qdrant_io.create_collection(recreate=recreate)
    qdrant_io.ensure_indexes()
    print(f"Collection '{config.COLLECTION}': {state}")

    total = 0
    for i in range(0, len(chunks), batch):
        group = chunks[i:i + batch]
        dense = embed.dense_embed([c["text"] for c in group])
        # zip() would silently drop chunks on a short response; fail loudly.
        if len(dense) != len(group):
            raise RuntimeError(
                f"embedding count mismatch: sent {len(group)} texts, got {len(dense)} vectors")
        points = []
        for c, dv in zip(group, dense):
            sv = sparse.encode(c["text"])
            points.append({
                "id": c["point_id"],
                "vector": {"dense": dv, "sparse": {"indices": sv["indices"], "values": sv["values"]}},
                "payload": {
                    "lp_id": c["lp_id"], "lp_name": c["lp_name"], "person_id": c["person_id"],
                    "doc_type": c["doc_type"], "date_ts": c["date_ts"], "text": c["text"],
                    "source_model": c["source_model"], "source_id": c["source_id"], "chunk_key": c["chunk_key"],
                },
            })
        qdrant_io.upsert(points)
        total += len(points)
        print(f"  upserted {total}/{len(chunks)}")

    print(f"Done. Qdrant '{config.COLLECTION}' now holds {qdrant_io.count()} points.")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse --db / --recreate / --batch and run the backfill."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--db", default=config.DEFAULT_DB)
    parser.add_argument(
        "--recreate",
        action="store_true",
        help="drop & recreate the collection first",
    )
    parser.add_argument("--batch", type=int, default=32)
    opts = parser.parse_args()
    run(opts.db, recreate=opts.recreate, batch=opts.batch)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,184 @@
|
||||
"""Phase-0 Workstream B1 — chunk the CRM for retrieval.
|
||||
|
||||
Maps each CRM record type to one or more chunks per docs/EMBEDDINGS.md:
|
||||
* one chunk per communications row (doc_type = the comm type)
|
||||
* one chunk per MATCHED email (doc_type = email; body only when matched)
|
||||
* one chunk per fundraising_investors notes LINE (the outreach log; split per line)
|
||||
* one chunk each for free-text fields: contacts.notes, lp_profiles.notes,
|
||||
opportunities (description + next_step), organizations.description
|
||||
|
||||
Each chunk carries a canonical `lp_id` (resolved via entity_links) and a `date_ts`
|
||||
(epoch of the EVENT time, not created_at) so Qdrant can pre-filter and recency-rank.
|
||||
Entities/names/dates/types are payload (filterable); only prose is embedded.
|
||||
|
||||
A chunk's stable `chunk_key` -> deterministic point id (uuid5), so re-ingest
|
||||
upserts in place (idempotent).
|
||||
"""
|
||||
import sqlite3
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
_NS = uuid.UUID("6ba7b811-9dad-11d1-80b4-00c04fd430c8") # uuid5 namespace for chunk ids
|
||||
|
||||
|
||||
def to_epoch(ts: str):
    """Convert a timestamp string to integer epoch seconds, or ``None``.

    Tries a full ISO-8601 parse first (a ``Z`` suffix is read as UTC), then
    falls back to the leading ``YYYY-MM-DD`` of the trimmed string. Values
    without an offset are treated as UTC.

    Args:
        ts: timestamp text; empty or ``None`` gives ``None``.

    Returns:
        Epoch seconds as ``int``, or ``None`` when nothing parses.
    """
    if not ts:
        return None
    text = ts.strip()

    dt = None
    try:
        dt = datetime.fromisoformat(text.replace("Z", "+00:00"))
    except ValueError:
        # Date-only fallback on the *trimmed* text, so leading whitespace no
        # longer defeats the 10-character slice.
        try:
            dt = datetime.strptime(text[:10], "%Y-%m-%d")
        except ValueError:
            return None

    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    try:
        return int(dt.timestamp())
    except (ValueError, OverflowError):
        return None
|
||||
|
||||
|
||||
def _point_id(chunk_key: str) -> str:
    """Derive the deterministic point id (uuid5 under ``_NS``) for a chunk key."""
    derived = uuid.uuid5(_NS, chunk_key)
    return str(derived)
|
||||
|
||||
|
||||
def _mk(chunk_key, lp_id, lp_name, person_id, doc_type, date_ts, text, source_model, source_id):
    """Build one chunk dict, or return ``None`` when it should be dropped.

    A chunk is dropped when its trimmed text is empty or it has no ``lp_id``.
    The point id is derived from ``chunk_key``.
    """
    body = text.strip() if text else ""
    if not (body and lp_id):
        return None
    return dict(
        chunk_key=chunk_key,
        point_id=_point_id(chunk_key),
        lp_id=lp_id,
        lp_name=lp_name,
        person_id=person_id,
        doc_type=doc_type,
        date_ts=date_ts,
        text=body,
        source_model=source_model,
        source_id=source_id,
    )
|
||||
|
||||
|
||||
def _canon_maps(conn):
|
||||
"""Resolution lookups from entity_links / canonical_entities."""
|
||||
person_canon, org_canon, inv_canon = {}, {}, {}
|
||||
for r in conn.execute("SELECT source_model, source_id, canonical_id FROM entity_links"):
|
||||
if r["source_model"] == "contacts":
|
||||
person_canon[r["source_id"]] = r["canonical_id"]
|
||||
elif r["source_model"] == "organizations":
|
||||
org_canon[r["source_id"]] = r["canonical_id"]
|
||||
elif r["source_model"] == "fundraising_investors":
|
||||
inv_canon[r["source_id"]] = r["canonical_id"]
|
||||
name = {r["id"]: r["display_name"] for r in conn.execute("SELECT id, display_name FROM canonical_entities")}
|
||||
contact_org = {r["id"]: r["organization_id"] for r in conn.execute("SELECT id, organization_id FROM contacts")}
|
||||
return person_canon, org_canon, inv_canon, name, contact_org
|
||||
|
||||
|
||||
def _contact_lp(cid, person_canon, org_canon, name, contact_org):
|
||||
"""Best lp_id for a contact-anchored chunk: the firm if known, else the person."""
|
||||
person = person_canon.get(cid)
|
||||
firm = org_canon.get(contact_org.get(cid))
|
||||
lp = firm or person
|
||||
return lp, name.get(lp), person
|
||||
|
||||
|
||||
def build_chunks(conn):
    """Build every retrieval chunk for the CRM reachable through ``conn``.

    Sources, in output order: communications, contacts.notes,
    lp_profiles.notes, opportunities, organizations.description,
    fundraising_investors.notes (one chunk per non-empty line) and, when the
    email tables exist, matched emails. Chunks that ``_mk`` rejects are
    omitted from the returned list.
    """
    person_canon, org_canon, inv_canon, name, contact_org = _canon_maps(conn)
    built = []

    def resolve_contact(contact_id):
        # (lp_id, lp_name, person_id) for a contact-anchored record.
        return _contact_lp(contact_id, person_canon, org_canon, name, contact_org)

    def join_nonblank(*fields):
        return "\n".join(f for f in fields if (f or "").strip())

    # communications
    comm_rows = conn.execute("""SELECT id, contact_id, type, subject, body, outcome, next_action, communication_date
                                FROM communications""")
    for row in comm_rows:
        lp, lp_name, person = resolve_contact(row["contact_id"])
        body = join_nonblank(row["subject"], row["body"], row["outcome"], row["next_action"])
        built.append(_mk(f"communications:{row['id']}", lp, lp_name, person,
                         row["type"] or "note", to_epoch(row["communication_date"]),
                         body, "communications", row["id"]))

    # contacts.notes
    note_rows = conn.execute("SELECT id, notes, updated_at FROM contacts WHERE notes IS NOT NULL AND notes <> ''")
    for row in note_rows:
        lp, lp_name, person = resolve_contact(row["id"])
        built.append(_mk(f"contacts.notes:{row['id']}", lp, lp_name, person,
                         "contact_note", to_epoch(row["updated_at"]), row["notes"], "contacts", row["id"]))

    # lp_profiles.notes
    profile_rows = conn.execute("""SELECT lp.id, lp.contact_id, lp.notes, lp.updated_at
                                   FROM lp_profiles lp WHERE lp.notes IS NOT NULL AND lp.notes <> ''""")
    for row in profile_rows:
        lp, lp_name, person = resolve_contact(row["contact_id"])
        built.append(_mk(f"lp_profiles.notes:{row['id']}", lp, lp_name, person,
                         "lp_note", to_epoch(row["updated_at"]), row["notes"], "lp_profiles", row["id"]))

    # opportunities (description + next_step)
    opp_rows = conn.execute("""SELECT id, contact_id, name, description, next_step, updated_at
                               FROM opportunities""")
    for row in opp_rows:
        lp, lp_name, person = resolve_contact(row["contact_id"])
        body = join_nonblank(row["name"], row["description"], row["next_step"])
        built.append(_mk(f"opportunities:{row['id']}", lp, lp_name, person,
                         "opportunity", to_epoch(row["updated_at"]), body, "opportunities", row["id"]))

    # organizations.description
    org_rows = conn.execute("""SELECT id, description, updated_at FROM organizations
                               WHERE description IS NOT NULL AND description <> ''""")
    for row in org_rows:
        lp = org_canon.get(row["id"])
        built.append(_mk(f"organizations.description:{row['id']}", lp, name.get(lp), None,
                         "org_note", to_epoch(row["updated_at"]), row["description"], "organizations", row["id"]))

    # fundraising_investors.notes — running outreach log, split per non-empty line
    inv_rows = conn.execute("""SELECT id, notes, updated_at FROM fundraising_investors
                               WHERE notes IS NOT NULL AND notes <> ''""")
    for row in inv_rows:
        lp = inv_canon.get(row["id"])
        for line_no, line in enumerate(str(row["notes"]).splitlines()):
            if not line.strip():
                continue
            # The line index is part of the chunk key.
            built.append(_mk(f"fundraising_investors.notes:{row['id']}:{line_no}", lp, name.get(lp), None,
                             "outreach_note", to_epoch(row["updated_at"]), line, "fundraising_investors", row["id"]))

    # MATCHED emails (only matched rows carry a body; key lp via email_investor_links)
    if _has_table(conn, "emails") and _has_table(conn, "email_investor_links"):
        email_rows = conn.execute("""SELECT id, subject, body_text, snippet, sent_at FROM emails WHERE is_matched=1""")
        for row in email_rows:
            lp, lp_name = _email_lp(conn, row["id"], inv_canon, org_canon, person_canon, name)
            body = join_nonblank(row["subject"], row["body_text"] or row["snippet"])
            built.append(_mk(f"emails:{row['id']}", lp, lp_name, None, "email",
                             to_epoch(row["sent_at"]), body, "emails", row["id"]))

    # _mk returns None for rejected chunks; drop those here.
    return [chunk for chunk in built if chunk]
|
||||
|
||||
|
||||
def _has_table(conn, name):
|
||||
return conn.execute("SELECT 1 FROM sqlite_master WHERE type='table' AND name=?", (name,)).fetchone() is not None
|
||||
|
||||
|
||||
def _email_lp(conn, email_id, inv_canon, org_canon, person_canon, name):
|
||||
"""Resolve a matched email's lp_id via email_investor_links, precedence:
|
||||
fundraising_investor -> contact -> organization."""
|
||||
row = conn.execute("""SELECT fundraising_investor_id, contact_id, organization_id
|
||||
FROM email_investor_links WHERE email_id=? ORDER BY match_confidence DESC LIMIT 1""",
|
||||
(email_id,)).fetchone()
|
||||
if not row:
|
||||
return None, None
|
||||
lp = (inv_canon.get(row["fundraising_investor_id"]) or person_canon.get(row["contact_id"])
|
||||
or org_canon.get(row["organization_id"]))
|
||||
return lp, name.get(lp)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Diagnostic entry point: build the chunks for --db and print a per-type
    # summary plus one sample chunk. Nothing is written anywhere.
    import argparse
    from collections import Counter
    from config import DEFAULT_DB

    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=DEFAULT_DB)
    args = ap.parse_args()
    conn = sqlite3.connect(args.db)
    try:
        conn.row_factory = sqlite3.Row
        chunks = build_chunks(conn)
    finally:
        # Release the SQLite handle even when chunking fails.
        conn.close()
    print(f"{len(chunks)} chunks from {args.db}")
    for dt, n in Counter(c["doc_type"] for c in chunks).most_common():
        print(f"  {dt:<16} {n}")
    # _mk already drops chunks without an lp_id, so this is a sanity check on
    # the builder rather than a count of unresolved source records.
    unresolved = sum(1 for c in chunks if not c["lp_id"])
    print(f"  (all chunks have an lp_id: {unresolved == 0})")
    if chunks:
        print("\nSample chunk:")
        s = chunks[0]
        print({k: (v[:80] + '…' if k == 'text' and v and len(v) > 80 else v) for k, v in s.items()})
    else:
        # An empty or unresolved DB used to crash here with IndexError.
        print("\n(no chunks built)")
|
||||
@@ -0,0 +1,28 @@
|
||||
"""Ingest config — loads .env and exposes the Spark/Qdrant/CRM settings."""
|
||||
import os
|
||||
|
||||
_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
|
||||
def load_env(path=None):
    """Load ``KEY=VALUE`` pairs from a .env file into ``os.environ``.

    Variables already present in the environment are left untouched. Blank
    lines, ``#`` comments and lines without ``=`` are skipped. A leading
    ``export `` is accepted, and one pair of matching surrounding quotes is
    removed from the value.

    Args:
        path: file to read; defaults to ``.env`` under the repository root.

    Returns:
        None. A missing file is silently ignored.
    """
    path = path or os.path.join(_ROOT, ".env")
    if not os.path.exists(path):
        return
    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            line = raw.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            # Tolerate shell-style "export KEY=VALUE" lines.
            if line.startswith("export "):
                line = line[len("export "):].lstrip()
            key, _, value = line.partition("=")
            key = key.strip()
            value = value.strip()
            if not key:
                # "=value" has no usable name, and an empty name is rejected
                # by the OS environment.
                continue
            # KEY="value" / KEY='value': drop the quotes, keep the content.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
                value = value[1:-1]
            os.environ.setdefault(key, value)
|
||||
|
||||
|
||||
load_env()
|
||||
|
||||
SPARK_CONTROL_URL = os.environ.get("SPARK_CONTROL_URL", "").rstrip("/")
|
||||
SPARK_VERIFY_TLS = os.environ.get("SPARK_CONTROL_VERIFY_TLS", "false").lower() in ("1", "true", "yes", "on")
|
||||
QDRANT_URL = os.environ.get("QDRANT_URL", "").rstrip("/")
|
||||
COLLECTION = os.environ.get("CRM_QDRANT_COLLECTION", "crm_chunks")
|
||||
EMBED_MODEL = os.environ.get("CRM_EMBED_MODEL", "BAAI/bge-m3")
|
||||
DENSE_DIM = int(os.environ.get("CRM_EMBED_DIM", "1024"))
|
||||
DEFAULT_DB = os.environ.get("CRM_DEV_DB_PATH", os.path.join(_ROOT, "data", "crm_dev.db"))
|
||||
@@ -0,0 +1,17 @@
|
||||
"""Dense embeddings via Spark Control /v1/embeddings (BAAI/bge-m3, 1024-d)."""
|
||||
import config
|
||||
import http_util
|
||||
|
||||
|
||||
def dense_embed(texts, batch=32):
    """Embed ``texts`` through Spark Control ``/v1/embeddings``.

    Texts are sent in groups of ``batch``. Within each response the rows are
    ordered by their ``index`` field, so the output lines up with the input.

    Args:
        texts: sequence of strings to embed.
        batch: maximum number of texts per request (>= 1).

    Returns:
        A list with one embedding per input text, in input order.

    Raises:
        ValueError: if ``batch`` is smaller than 1.
        RuntimeError: on a non-200 response, or when a response does not
            contain exactly one embedding per text sent.
    """
    if batch < 1:
        # A negative step used to yield an empty result without any error.
        raise ValueError(f"batch must be >= 1, got {batch}")

    url = f"{config.SPARK_CONTROL_URL}/v1/embeddings"
    out = []
    for i in range(0, len(texts), batch):
        group = texts[i:i + batch]
        status, data = http_util.request(
            "POST", url,
            {"input": group, "model": config.EMBED_MODEL}, verify=config.SPARK_VERIFY_TLS)
        if status != 200:
            raise RuntimeError(f"/v1/embeddings -> {status}: {data}")
        rows = sorted(data["data"], key=lambda d: d["index"])
        # Callers pair vectors with texts by position; a short or long
        # response would silently misalign them.
        if len(rows) != len(group):
            raise RuntimeError(
                f"/v1/embeddings returned {len(rows)} embeddings for {len(group)} inputs")
        out.extend(r["embedding"] for r in rows)
    return out
|
||||
@@ -0,0 +1,258 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Phase-0 Workstream B3 / A4 — entity resolution (deterministic tier).
|
||||
|
||||
Collapses the CRM's two parallel investor models into the canonical identity
|
||||
layer created by migration 0001:
|
||||
|
||||
organizations ─┐
|
||||
fundraising_investors ─┴─► canonical_entities (entity_kind = lp | organization)
|
||||
contacts ─┐
|
||||
fundraising_contacts ─┴─► canonical_entities (entity_kind = person)
|
||||
lp_profiles ───► linked to its contact's person entity
|
||||
|
||||
Every source row is recorded in `entity_links` so any name variant resolves to
|
||||
one canonical id. This is the DETERMINISTIC tier — it merges only what we can
|
||||
prove (exact email; exact normalized name within the same canonical org). The
|
||||
HARD cases (nicknames like "Jon" vs "Jonathan", typos) are NOT guessed; they are
|
||||
emitted as *fuzzy candidates* for the local-Qwen tier (Spark Control
|
||||
/v1/chat/completions) to adjudicate later. Honest separation: we never silently
|
||||
merge on a guess.
|
||||
|
||||
Properties:
|
||||
* Local-only, read-mostly: reads CRM source tables, writes only the derived
|
||||
canonical_entities / entity_links and an interaction_log audit row. Never
|
||||
mutates a CRM source record (guardrail #2/#3).
|
||||
* Idempotent: canonical ids are deterministic (sha1 of the resolution key), so
|
||||
re-running upserts in place and keeps ids stable across runs — which keeps
|
||||
downstream Qdrant point ids valid (no churn on re-embed).
|
||||
* Logged: writes one interaction_log row per run (guardrail #5).
|
||||
|
||||
Usage:
|
||||
python3 backend/ingest/entity_resolution.py --db data/crm_dev.db
|
||||
python3 backend/ingest/entity_resolution.py --db data/crm_dev.db --show-candidates
|
||||
"""
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
# ── normalization ─────────────────────────────────────────────────────────────
|
||||
|
||||
def norm_text(s: str) -> str:
    """Normalize free text for matching.

    Lower-cases and trims the input, replaces every character that is neither
    a word character nor whitespace with a space, then collapses whitespace
    runs to single spaces. ``None`` is treated as the empty string.
    """
    lowered = (s or "").strip().lower()
    without_punct = re.sub(r"[^\w\s]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", without_punct)
    return collapsed.strip()
|
||||
|
||||
|
||||
def norm_email(s: str) -> str:
    """Return the address trimmed and lower-cased; falsy input gives ``""``."""
    if not s:
        return ""
    return s.strip().lower()
|
||||
|
||||
|
||||
def _eid(prefix: str, key: str) -> str:
|
||||
"""Deterministic canonical id: stable across runs for the same resolution key."""
|
||||
return f"{prefix}_{hashlib.sha1(key.encode('utf-8')).hexdigest()[:12]}"
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _split_name(full: str):
    """Split a full name into normalized ``(first, last)`` tokens.

    The name is normalized with ``norm_text`` first. A single-token name has
    an empty last name; an empty name gives ``("", "")``. Middle tokens are
    ignored.
    """
    tokens = norm_text(full).split()
    if len(tokens) == 0:
        return "", ""
    first = tokens[0]
    last = tokens[-1] if len(tokens) > 1 else ""
    return first, last
|
||||
|
||||
|
||||
# ── upsert helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
def _upsert_entity(conn, eid, kind, display_name, primary_email):
    """Insert a canonical entity, or refresh it when ``eid`` already exists.

    On conflict the display name, kind and ``updated_at`` are overwritten. The
    stored primary email is replaced only when a non-empty one is supplied.
    The caller is responsible for committing.
    """
    upsert_sql = """
        INSERT INTO canonical_entities (id, entity_kind, display_name, primary_email, source, created_at, updated_at)
        VALUES (?, ?, ?, ?, 'entity_resolution', ?, ?)
        ON CONFLICT(id) DO UPDATE SET
            display_name = excluded.display_name,
            primary_email = COALESCE(excluded.primary_email, canonical_entities.primary_email),
            entity_kind = excluded.entity_kind,
            updated_at = excluded.updated_at
        """
    # An empty email is stored as NULL so COALESCE keeps any earlier value.
    email_or_null = primary_email or None
    params = (eid, kind, display_name, email_or_null, _now(), _now())
    conn.execute(upsert_sql, params)
|
||||
|
||||
|
||||
def _link(conn, canonical_id, source_model, source_id, match_value, match_kind, confidence):
    """Record that a source row resolves to ``canonical_id``.

    Rows are unique on ``(source_model, source_id, match_value)``. On conflict
    the canonical id, match kind and confidence are overwritten in place. The
    caller is responsible for committing.
    """
    link_sql = """
        INSERT INTO entity_links (id, canonical_id, source_model, source_id, match_value, match_kind, confidence, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(source_model, source_id, match_value) DO UPDATE SET
            canonical_id = excluded.canonical_id,
            match_kind = excluded.match_kind,
            confidence = excluded.confidence
        """
    # The row id is random; it is only used when the row is newly inserted.
    row_id = str(uuid.uuid4())
    params = (row_id, canonical_id, source_model, source_id,
              match_value, match_kind, confidence, _now())
    conn.execute(link_sql, params)
|
||||
|
||||
|
||||
# ── resolution passes ─────────────────────────────────────────────────────────
|
||||
|
||||
def resolve_organizations(conn):
    """Merge organizations and fundraising_investors that share a normalized name.

    Each distinct normalized name becomes one canonical entity: kind ``lp``
    when at least one fundraising_investors row carries that name, otherwise
    ``organization``. Every contributing source row gets an ``exact_name``
    link at confidence 1.0.

    Returns:
        ``(org_canon_by_orgid, org_canon_by_fundinv)`` — canonical id keyed by
        organizations.id and by fundraising_investors.id — so the people pass
        can attach each person to their firm.
    """
    def _empty_group():
        return {"orgs": [], "investors": [], "name": "", "email": ""}

    groups = defaultdict(_empty_group)

    for org in conn.execute("SELECT id, name, email FROM organizations"):
        key = norm_text(org["name"])
        if not key:
            continue
        group = groups[key]
        group["orgs"].append(org["id"])
        # Prefer the longest spelling seen as the display name.
        if len(org["name"] or "") > len(group["name"]):
            group["name"] = org["name"]
        # First non-blank email wins.
        email = (org["email"] or "").strip()
        if email and not group["email"]:
            group["email"] = email

    for inv in conn.execute("SELECT id, investor_name FROM fundraising_investors"):
        key = norm_text(inv["investor_name"])
        if not key:
            continue
        group = groups[key]
        group["investors"].append(inv["id"])
        # The investor name is only a fallback when no organization named the group.
        if not group["name"]:
            group["name"] = inv["investor_name"]

    org_canon_by_orgid = {}
    org_canon_by_fundinv = {}
    for key, group in groups.items():
        # An org we are actively raising from (has a fundraising row) is an 'lp';
        # otherwise a plain 'organization'.
        # NOTE(review): the id prefix follows the kind, so a name that later
        # gains a fundraising row gets a different canonical id — confirm this
        # is intended.
        is_lp = bool(group["investors"])
        kind = "lp" if is_lp else "organization"
        canonical_id = _eid("lp" if is_lp else "org", key)
        _upsert_entity(conn, canonical_id, kind, group["name"], group["email"])
        for org_id in group["orgs"]:
            _link(conn, canonical_id, "organizations", org_id, key, "exact_name", 1.0)
            org_canon_by_orgid[org_id] = canonical_id
        for investor_id in group["investors"]:
            _link(conn, canonical_id, "fundraising_investors", investor_id, key, "exact_name", 1.0)
            org_canon_by_fundinv[investor_id] = canonical_id

    return org_canon_by_orgid, org_canon_by_fundinv
|
||||
|
||||
|
||||
def resolve_people(conn, org_canon_by_orgid, org_canon_by_fundinv):
    """Merge contacts and fundraising_contacts into person entities.

    A row with an email is keyed on the exact normalized email (confidence
    1.0). Otherwise it is keyed on the normalized name plus the canonical org
    id (confidence 0.8); rows with neither are skipped. Each lp_profiles row
    is then linked to the person entity of its contact.

    Returns:
        ``person_meta``: canonical person id ->
        ``{"org", "last", "name", "email"}``, the input for
        ``find_fuzzy_candidates``.
    """
    # Gather (model, source_id, full_name, email, org_canon) from both models.
    people = [
        ("contacts", row["id"],
         f"{row['first_name'] or ''} {row['last_name'] or ''}".strip(),
         norm_email(row["email"]),
         org_canon_by_orgid.get(row["organization_id"]))
        for row in conn.execute("SELECT id, first_name, last_name, email, organization_id FROM contacts")
    ]
    people += [
        ("fundraising_contacts", row["id"], row["full_name"] or "",
         norm_email(row["email"]),
         org_canon_by_fundinv.get(row["investor_id"]))
        for row in conn.execute("SELECT id, full_name, email, investor_id FROM fundraising_contacts")
    ]

    contact_to_person = {}
    person_meta = {}  # canonical_id -> {"org": org_canon, "last": last_norm, "name": display, "email": email}

    for model, source_id, full, email, org_canon in people:
        name_norm = norm_text(full)
        if email:
            key = f"e|{email}"
            match_value, match_kind, conf = email, "exact_email", 1.0
        elif name_norm:
            # NOTE(review): with no org the key is name-only, so same-named
            # people without an org share one entity — confirm this is intended.
            key = f"n|{name_norm}|{org_canon or ''}"
            match_value, match_kind, conf = name_norm, "name_org", 0.8
        else:
            continue

        person_id = _eid("per", key)
        display = full.strip() or email
        _upsert_entity(conn, person_id, "person", display, email)
        _link(conn, person_id, model, source_id, match_value, match_kind, conf)
        if model == "contacts":
            contact_to_person[source_id] = person_id

        # The first row seen for an entity fixes its metadata; a later row
        # may only fill in a missing org.
        if person_id not in person_meta:
            person_meta[person_id] = {"org": org_canon, "last": _split_name(full)[1],
                                      "name": display, "email": email}
        meta = person_meta[person_id]
        if org_canon and not meta["org"]:
            meta["org"] = org_canon

    # lp_profiles -> the person entity of its contact
    for profile in conn.execute("SELECT id, contact_id FROM lp_profiles"):
        person_id = contact_to_person.get(profile["contact_id"])
        if person_id:
            _link(conn, person_id, "lp_profiles", profile["id"], profile["contact_id"], "contact_fk", 1.0)

    return person_meta
|
||||
|
||||
|
||||
def find_fuzzy_candidates(person_meta):
    """Group distinct person entities that share a canonical org and a surname.

    Such groups are likely one individual under a name variant (e.g.
    Jon/Jonathan). They are only reported here, never merged. People without
    an org or a surname are not considered.

    Returns:
        A list of ``{"org", "surname", "members"}`` dicts, one per group with
        more than one member; ``members`` holds ``(canonical id, name, email)``
        tuples.
    """
    buckets = defaultdict(list)
    for person_id, meta in person_meta.items():
        if not (meta["org"] and meta["last"]):
            continue
        buckets[(meta["org"], meta["last"])].append((person_id, meta["name"], meta["email"]))

    candidates = []
    for (org, surname), members in buckets.items():
        if len(members) > 1:
            candidates.append({"org": org, "surname": surname, "members": members})
    return candidates
|
||||
|
||||
|
||||
def run(db_path: str):
    """Run deterministic entity resolution against the CRM at ``db_path``.

    Resolves organizations, then people (committing after each pass),
    collects fuzzy merge candidates, and writes one ``interaction_log`` row
    summarising the resulting counts.

    Returns:
        ``(counts, candidates)``: ``counts`` holds the canonical/link totals
        plus the number of fuzzy candidates; ``candidates`` is the list from
        ``find_fuzzy_candidates``.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA foreign_keys=ON")

        org_by_oid, org_by_inv = resolve_organizations(conn)
        conn.commit()
        person_meta = resolve_people(conn, org_by_oid, org_by_inv)
        conn.commit()
        candidates = find_fuzzy_candidates(person_meta)

        def _count(sql):
            return conn.execute(sql).fetchone()[0]

        counts = {
            "canonical_total": _count("SELECT COUNT(*) FROM canonical_entities"),
            "lp": _count("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='lp'"),
            "organization": _count("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='organization'"),
            "person": _count("SELECT COUNT(*) FROM canonical_entities WHERE entity_kind='person'"),
            "links": _count("SELECT COUNT(*) FROM entity_links"),
            "fuzzy_candidates": len(candidates),
        }

        # One audit row per run.
        conn.execute(
            """
            INSERT INTO interaction_log (id, ts, actor_type, actor_id, action, target_type, payload, source, created_at)
            VALUES (?, ?, 'system', 'entity_resolver', 'entity_resolution.run', 'canonical_entities', ?, 'ingest', ?)
            """,
            (str(uuid.uuid4()), _now(), json.dumps(counts), _now()),
        )
        conn.commit()
    finally:
        # Previously the handle leaked whenever a pass raised; close it on
        # every path. Uncommitted work from a failed pass is discarded.
        conn.close()
    return counts, candidates
|
||||
|
||||
|
||||
def main():
    """CLI entry point: run resolution on --db and print the resulting counts.

    With --show-candidates the fuzzy merge candidates are listed as well.
    """
    parser = argparse.ArgumentParser(description="Deterministic entity resolution into the canonical layer.")
    parser.add_argument("--db", default="data/crm_dev.db", help="path to the CRM SQLite DB")
    parser.add_argument("--show-candidates", action="store_true", help="print fuzzy merge candidates")
    opts = parser.parse_args()

    counts, candidates = run(opts.db)
    print(f"Entity resolution on {opts.db}:")
    for label, value in counts.items():
        print(f"  {label:<18} {value}")

    if not (opts.show_candidates and candidates):
        return
    print("\nFuzzy candidates (same org + surname, different person — for the local-Qwen tier):")
    for cand in candidates:
        rendered = []
        for _, person_name, person_email in cand["members"]:
            # Append " <email>" only when the member has one.
            rendered.append(f"{person_name!r}{(' <'+person_email+'>') if person_email else ''}")
        names = ", ".join(rendered)
        print(f"  [{cand['surname']}] {names}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,35 @@
|
||||
"""Tiny stdlib JSON HTTP client (no third-party deps).
|
||||
|
||||
Handles the Spark Control self-signed cert (verify=False) and plain-HTTP Qdrant.
|
||||
"""
|
||||
import json
|
||||
import ssl
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
|
||||
def _ctx(verify: bool):
    """Return the SSL context to hand to urlopen.

    With verify=True this is None, so urllib applies its default verification.
    With verify=False it is a context with hostname checking and certificate
    verification switched off (used for the self-signed Spark Control cert).
    """
    if not verify:
        insecure = ssl.create_default_context()
        # check_hostname must be cleared before verify_mode can be CERT_NONE.
        insecure.check_hostname = False
        insecure.verify_mode = ssl.CERT_NONE
        return insecure
    return None
|
||||
|
||||
|
||||
def _decode_json(raw):
    """Parse a response body as JSON, falling back to {"raw": text} when it is not JSON."""
    if not raw:
        return {}
    try:
        return json.loads(raw)
    except ValueError:
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        return {"raw": raw.decode("utf-8", "replace")}


def request(method: str, url: str, body=None, verify: bool = True, timeout: int = 180):
    """Send one JSON HTTP request and return ``(status, payload)``.

    Args:
        method: HTTP verb, e.g. "GET" or "POST".
        url: Absolute URL. TLS settings only apply when it starts with "https".
        body: JSON-serialisable object sent as the request body, or None for no body.
        verify: When False, TLS certificate and hostname checks are disabled.
        timeout: Socket timeout in seconds.

    Returns:
        A tuple of the HTTP status code and the decoded body. An empty body
        decodes to ``{}``; a body that is not valid JSON decodes to
        ``{"raw": <text>}`` on both the success and the HTTP-error path.

    Raises:
        urllib.error.URLError: for transport failures (connection refused,
            DNS, TLS). HTTP error statuses are returned, not raised.
    """
    data = json.dumps(body).encode("utf-8") if body is not None else None
    req = urllib.request.Request(url, data=data, method=method,
                                 headers={"Content-Type": "application/json"})
    ctx = _ctx(verify) if url.lower().startswith("https") else None
    try:
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            return resp.status, _decode_json(resp.read())
    except urllib.error.HTTPError as exc:
        # 4xx/5xx responses still carry a body worth surfacing to the caller.
        return exc.code, _decode_json(exc.read())
|
||||
@@ -0,0 +1,50 @@
|
||||
"""Minimal Qdrant REST client for the ingest pipeline (direct to QDRANT_URL).
|
||||
|
||||
Creates the crm_chunks collection per EMBEDDINGS.md: a named dense vector
|
||||
(1024, Cosine) + a named sparse vector with modifier:idf, plus payload indexes.
|
||||
"""
|
||||
import config
|
||||
import http_util
|
||||
|
||||
Q = config.QDRANT_URL
|
||||
COL = config.COLLECTION
|
||||
|
||||
|
||||
def _req(method, path, body=None):
    """Issue one request against QDRANT_URL + path and return (status, payload).

    TLS verification is always passed as False here.
    """
    target = f"{Q}{path}"
    return http_util.request(method, target, body, verify=False)
|
||||
|
||||
|
||||
def exists() -> bool:
    """Return True when GET /collections/<COL> answers with HTTP 200."""
    code = _req("GET", f"/collections/{COL}")[0]
    return code == 200
|
||||
|
||||
|
||||
def create_collection(recreate=False, dim=config.DENSE_DIM):
    """Create the collection with a named dense and a named sparse vector.

    Args:
        recreate: When the collection already exists, delete and recreate it
            instead of leaving it untouched.
        dim: Size of the "dense" vector (Cosine distance).

    Returns:
        "exists" when the collection was already present and recreate is
        False, otherwise "created".

    Raises:
        RuntimeError: if the delete (on recreate) or the create request does
            not succeed.
    """
    if exists():
        if not recreate:
            return "exists"
        del_status, del_data = _req("DELETE", f"/collections/{COL}")
        # Fail here rather than letting the PUT below report a confusing
        # "already exists" style error for a delete that never happened.
        if not 200 <= del_status < 300:
            raise RuntimeError(f"delete collection -> {del_status}: {del_data}")
    status, data = _req("PUT", f"/collections/{COL}", {
        "vectors": {"dense": {"size": dim, "distance": "Cosine"}},
        "sparse_vectors": {"sparse": {"modifier": "idf"}},
    })
    if status not in (200, 201):
        raise RuntimeError(f"create collection -> {status}: {data}")
    return "created"
|
||||
|
||||
|
||||
def ensure_indexes():
    """Request payload indexes for lp_id and doc_type (keyword) and date_ts (integer).

    The responses are not inspected, so a rejected index request is silent.
    """
    wanted = {"lp_id": "keyword", "doc_type": "keyword", "date_ts": "integer"}
    for name, kind in wanted.items():
        spec = {"field_name": name, "field_schema": kind}
        _req("PUT", f"/collections/{COL}/index", spec)
|
||||
|
||||
|
||||
def upsert(points):
    """PUT a batch of points (wait=true) and return the decoded response.

    Raises RuntimeError when the status is neither 200 nor 201.
    """
    code, payload = _req("PUT", f"/collections/{COL}/points?wait=true", {"points": points})
    if code in (200, 201):
        return payload
    raise RuntimeError(f"upsert -> {code}: {payload}")
|
||||
|
||||
|
||||
def count():
    """Return the exact number of points in the collection, or None.

    None is returned when the response carries no usable ``result.count``
    (for example an error body, or a body whose ``result`` is null).
    """
    _, data = _req("POST", f"/collections/{COL}/points/count", {"exact": True})
    # `or {}` also covers a present-but-null "result", which a plain
    # .get("result", {}) would pass through as None.
    result = (data or {}).get("result") or {}
    return result.get("count")
|
||||
@@ -0,0 +1,109 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Phase-0 retrieval — thin wrappers over Spark Control /api/search.
|
||||
|
||||
These are the retrieval modes the CRM MCP server (Workstream C) will expose:
|
||||
* semantic_search — dense only (omit sparse), high recall
|
||||
* hybrid_search — dense + BM25 sparse (RRF) + rerank; best for entity queries
|
||||
  * keyword_search  — intended to lean on the sparse leg (currently sends the same request as hybrid_search)
|
||||
All support a Qdrant `filter` (e.g. lp_id / date_ts range) to pre-filter.
|
||||
|
||||
`--demo` runs an entity-heavy query in dense-only vs hybrid to show the BM25
|
||||
lexical leg surfacing the right LP. The query's sparse vector uses the SAME
|
||||
encoder as ingest (sparse.encode).
|
||||
"""
|
||||
import argparse
|
||||
|
||||
import config
|
||||
import http_util
|
||||
import sparse
|
||||
|
||||
|
||||
def _search(query, sparse_vec=None, rerank=False, top_k=5, lp_id=None, retrieve_n=80, filt=None):
    """POST one query to Spark Control /api/search and return its "data" list.

    The sparse leg is included only when sparse_vec is given. A raw Qdrant
    filter passed as filt takes precedence over the lp_id shortcut.
    Raises RuntimeError on any status other than 200.
    """
    payload = {
        "query": query,
        "collection": config.COLLECTION,
        "top_k": top_k,
        "retrieve_n": retrieve_n,
        "fusion": "rrf",
        "text_field": "text",
        "with_payload": True,
        "rerank": rerank,
    }
    if sparse_vec is not None:
        payload["sparse"] = {key: sparse_vec[key] for key in ("indices", "values")}

    # An explicit raw Qdrant filter (filt) wins; otherwise build one from lp_id.
    chosen = filt
    if chosen is None and lp_id:
        chosen = {"must": [{"key": "lp_id", "match": {"value": lp_id}}]}
    if chosen is not None:
        payload["filter"] = chosen

    endpoint = f"{config.SPARK_CONTROL_URL}/api/search"
    status, data = http_util.request("POST", endpoint, payload, verify=config.SPARK_VERIFY_TLS)
    if status != 200:
        raise RuntimeError(f"/api/search -> {status}: {data}")
    return data.get("data", [])
|
||||
|
||||
|
||||
def semantic_search(query, **kw):
    """Dense-only search: no sparse vector is sent; rerank defaults to False."""
    use_rerank = kw.pop("rerank", False)
    return _search(query, sparse_vec=None, rerank=use_rerank, **kw)
|
||||
|
||||
|
||||
def hybrid_search(query, **kw):
    """Dense + sparse search: the query is encoded with sparse.encode; rerank defaults to True."""
    query_vec = sparse.encode(query)
    use_rerank = kw.pop("rerank", True)
    return _search(query, sparse_vec=query_vec, rerank=use_rerank, **kw)
|
||||
|
||||
|
||||
def keyword_search(query, **kw):
    """Keyword-oriented search; rerank defaults to True.

    NOTE(review): this issues exactly the same request as hybrid_search —
    nothing here weights the sparse leg differently.
    """
    query_vec = sparse.encode(query)
    use_rerank = kw.pop("rerank", True)
    return _search(query, sparse_vec=query_vec, rerank=use_rerank, **kw)
|
||||
|
||||
|
||||
def _row(r):
    """Format one search hit as a fixed-width display line.

    Columns: LP name (22 wide), doc type in brackets (13 wide), then the first
    58 characters of the hit text with newlines flattened to spaces. A missing
    or None name/doc type is shown as '?'.
    """
    p = r.get("payload") or {}
    text = (r.get("text") or p.get("text") or "").replace("\n", " ")
    lp_name = p.get("lp_name")
    doc_type = p.get("doc_type")
    # A key that is present but None would otherwise hit the width format
    # spec and raise TypeError (None does not accept ':<22').
    lp_name = "?" if lp_name is None else str(lp_name)
    doc_type = "?" if doc_type is None else str(doc_type)
    return f"{lp_name:<22} [{doc_type:<13}] {text[:58]}"
|
||||
|
||||
|
||||
def _print(title, rows):
    """Print a titled, 1-based ranked list of hits (or a 'no results' line)."""
    print(f"\n {title}")
    if not rows:
        print(" (no results)")
    for rank, hit in enumerate(rows, start=1):
        score = hit.get('score', 0)
        line = _row(hit)
        print(f" {rank}. score={score:+.3f} {line}")
|
||||
|
||||
|
||||
def demo():
    """Run one entity-heavy query dense-only and hybrid, and compare the results.

    Prints both result lists, the rank at which the target LP first appears in
    each, and — if the hybrid results contain the target LP — a third run
    restricted to that LP's lp_id.
    """
    target = "Cedar Point Capital"
    q = "Fund III diligence and wire timeline for Cedar Point"
    print(f"QUERY: {q!r}\nTarget LP: {target}")

    dense = semantic_search(q, top_k=5)
    hybrid = hybrid_search(q, top_k=5, rerank=False)  # rerank off to isolate the BM25 leg
    _print("dense-only (semantic):", dense)
    _print("hybrid (dense + BM25 RRF):", hybrid)

    def payload_of(hit):
        return hit.get("payload", {}) or {}

    def first_rank(rows):
        # 1-based position of the first hit belonging to the target LP, else None.
        return next(
            (pos for pos, hit in enumerate(rows, 1) if payload_of(hit).get("lp_name") == target),
            None,
        )

    print(f"\n First '{target}' chunk — dense rank: {first_rank(dense)}, hybrid rank: {first_rank(hybrid)}")

    # Pre-filter demo: same query, restricted to one LP's chunks.
    match = next((payload_of(hit) for hit in hybrid if payload_of(hit).get("lp_name") == target), None)
    lp_id = match.get("lp_id") if match is not None else None
    if lp_id:
        _print(f"hybrid + payload pre-filter (lp_id={lp_id}):",
               hybrid_search(q, top_k=5, rerank=True, lp_id=lp_id))
|
||||
|
||||
|
||||
def main():
    """CLI entry point: run one query in the chosen mode, or the demo.

    The demo runs when --demo is given or when no query is supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("query", nargs="?")
    parser.add_argument("--mode", choices=["semantic", "hybrid", "keyword"], default="hybrid")
    parser.add_argument("--top-k", type=int, default=5)
    parser.add_argument("--lp-id")
    parser.add_argument("--demo", action="store_true")
    opts = parser.parse_args()

    if opts.demo or not opts.query:
        return demo()

    # argparse `choices` guarantees mode is one of the three names below.
    if opts.mode == "semantic":
        runner = semantic_search
    elif opts.mode == "hybrid":
        runner = hybrid_search
    else:
        runner = keyword_search
    hits = runner(opts.query, top_k=opts.top_k, lp_id=opts.lp_id)
    _print(f"{opts.mode}: {opts.query!r}", hits)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,40 @@
|
||||
"""Client-side BM25 sparse vectors.
|
||||
|
||||
EMBEDDINGS.md specifies FastEmbed `Qdrant/bm25` so Qdrant applies IDF (via the
|
||||
sparse vector's `modifier: idf`) over OUR corpus. FastEmbed pulls onnxruntime,
|
||||
which has no wheel for this Python (3.14) yet, so this module provides a
|
||||
dependency-free BM25 term-frequency encoder with the same contract:
|
||||
`encode(text) -> {"indices": [...], "values": [...]}`.
|
||||
|
||||
Qdrant computes IDF server-side from the stored sparse vectors regardless of how
|
||||
indices are assigned, so this is a legitimate corpus-IDF BM25 leg. The ONLY hard
|
||||
requirement is that ingest and query use the SAME encoder — they both import this
|
||||
one. For production, swap `encode()` for FastEmbed `Qdrant/bm25` (and re-index, so
|
||||
ingest and query stay on the same tokenizer).
|
||||
"""
|
||||
import hashlib
|
||||
import math
|
||||
import re
|
||||
|
||||
# A token is a maximal run of ASCII lowercase letters and digits.
_TOKEN_RE = re.compile(r"[a-z0-9]+")


def tokenize(text: str):
    """Lower-case *text* and return its alphanumeric tokens in order.

    None or an empty string yields an empty list.
    """
    lowered = text.lower() if text else ""
    return _TOKEN_RE.findall(lowered)
|
||||
|
||||
|
||||
def _index(token: str) -> int:
    """Map a token to a stable unsigned 32-bit index (Qdrant sparse indices are u32).

    The index is the first four bytes of the token's MD5 digest read as a
    big-endian integer, i.e. the first eight hex digits of the digest.
    """
    digest_hex = hashlib.md5(token.encode("utf-8")).hexdigest()
    return int(digest_hex[:8], 16)
|
||||
|
||||
|
||||
def encode(text: str):
    """Return a sparse vector {indices, values}. Value is 1 + ln(tf) (sublinear
    term frequency); IDF is applied by Qdrant via modifier:idf."""
    counts = {}
    for tok in tokenize(text):
        counts[tok] = counts.get(tok, 0) + 1
    # Keyed by hashed index: if two tokens ever hash to the same index, the
    # later token's weight replaces the earlier one.
    weights = {_index(tok): 1.0 + math.log(n) for tok, n in counts.items()}
    return {"indices": list(weights), "values": list(weights.values())}
|
||||
@@ -0,0 +1,3 @@
|
||||
"""Ten31 CRM MCP server (Workstream C) — wraps CRM reads, retrieval modes, and
|
||||
logged writes for the Claude Agent SDK. No outbound/contact tools (Phase 0 gate).
|
||||
"""
|
||||
@@ -0,0 +1,200 @@
|
||||
"""CRM MCP tool logic (Workstream C) — plain functions, transport-agnostic.
|
||||
|
||||
Kept separate from the MCP server wiring so it is unit-testable without the MCP
|
||||
SDK. Each function returns JSON-serializable dicts. Reads go against the CRM
|
||||
SQLite DB by path; retrieval wraps Spark Control /api/search; writes go through
|
||||
the interaction_log (guardrail #5).
|
||||
|
||||
Tool surface:
|
||||
reads get_entity, search_records, get_interaction_history
|
||||
retrieval semantic_search, hybrid_search, keyword_search
|
||||
writes log_interaction, set_entity_enrichment
|
||||
NO outbound/contact tools — that capability is gated to Phase 3.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "ingest"))
|
||||
import config # noqa: E402
|
||||
import search # noqa: E402
|
||||
|
||||
|
||||
def _conn(db=None):
    """Open a SQLite connection with Row access and foreign keys enforced.

    The path is the first truthy value of: the db argument, the CRM_DB_PATH
    environment variable, config.DEFAULT_DB.
    """
    path = db or os.environ.get("CRM_DB_PATH") or config.DEFAULT_DB
    handle = sqlite3.connect(path)
    handle.row_factory = sqlite3.Row
    handle.execute("PRAGMA foreign_keys=ON")
    return handle
|
||||
|
||||
|
||||
def _now():
    """Return the current UTC time as an ISO-8601 string with a +00:00 offset."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
||||
|
||||
|
||||
# ── read tools ────────────────────────────────────────────────────────────────
|
||||
|
||||
def _contact_ids_for(c, lp_id):
    """Collect the contact ids that belong to one canonical entity.

    That is every contact linked to the entity directly, plus every contact
    whose organization_id is an organization linked to the entity.
    Returns a set (empty when nothing is linked).
    """
    found = set()
    direct = c.execute(
        "SELECT source_id FROM entity_links WHERE canonical_id=? AND source_model='contacts'", (lp_id,))
    for row in direct:
        found.add(row["source_id"])

    linked_orgs = []
    for row in c.execute(
            "SELECT source_id FROM entity_links WHERE canonical_id=? AND source_model='organizations'", (lp_id,)):
        linked_orgs.append(row["source_id"])
    if not linked_orgs:
        return found

    # One "?" placeholder per organization id; the ids themselves stay bound parameters.
    marks = ",".join("?" * len(linked_orgs))
    q = "SELECT id FROM contacts WHERE organization_id IN (%s)" % marks
    for row in c.execute(q, linked_orgs):
        found.add(row["id"])
    return found
|
||||
|
||||
|
||||
def get_entity(lp_id, db=None):
    """Fetch a canonical entity + its linked source rows and interaction count.

    Args:
        lp_id: canonical_entities.id to look up.
        db: Optional SQLite path; see _conn for the fallback order.

    Returns:
        The entity row as a dict with two extra keys: "links" (list of
        entity_links rows) and "interaction_count" (communications rows for
        the entity's contacts). If no row matches, returns
        ``{"error": "not_found", "lp_id": lp_id}``.
    """
    c = _conn(db)
    try:
        e = c.execute("SELECT * FROM canonical_entities WHERE id=?", (lp_id,)).fetchone()
        if not e:
            return {"error": "not_found", "lp_id": lp_id}
        out = dict(e)
        out["links"] = [dict(r) for r in c.execute(
            "SELECT source_model, source_id, match_kind, confidence FROM entity_links WHERE canonical_id=?", (lp_id,))]
        cids = _contact_ids_for(c, lp_id)
        if cids:
            # An empty IN () is a syntax error, hence the guard above.
            out["interaction_count"] = c.execute(
                "SELECT COUNT(*) FROM communications WHERE contact_id IN (%s)" % ",".join("?" * len(cids)),
                list(cids)).fetchone()[0]
        else:
            out["interaction_count"] = 0
        return out
    finally:
        # Close on every path, including a failing query.
        c.close()
|
||||
|
||||
|
||||
def search_records(query=None, entity_kind=None, limit=20, db=None):
    """Structured search over canonical entities (name substring + kind).

    Args:
        query: Literal substring to look for in display_name (case-folded
            with lower()); None or "" disables the name filter.
        entity_kind: Exact entity_kind to match; None or "" disables it.
        limit: Maximum number of rows returned.
        db: Optional SQLite path; see _conn for the fallback order.

    Returns:
        ``{"results": [row dicts], "count": len(results)}``. Soft-deleted
        rows (deleted_at set) are excluded.
    """
    sql = ("SELECT id, entity_kind, display_name, primary_email, segment, warmth_score "
           "FROM canonical_entities WHERE deleted_at IS NULL")
    args = []
    if entity_kind:
        sql += " AND entity_kind=?"
        args.append(entity_kind)
    if query:
        # Escape LIKE metacharacters so "%" and "_" in the query match
        # themselves instead of acting as wildcards.
        literal = (query.lower()
                   .replace("\\", "\\\\")
                   .replace("%", "\\%")
                   .replace("_", "\\_"))
        sql += " AND lower(display_name) LIKE ? ESCAPE '\\'"
        args.append(f"%{literal}%")
    sql += " ORDER BY display_name LIMIT ?"
    args.append(limit)

    c = _conn(db)
    try:
        rows = [dict(r) for r in c.execute(sql, args)]
    finally:
        # Close even when the query raises.
        c.close()
    return {"results": rows, "count": len(rows)}
|
||||
|
||||
|
||||
def get_interaction_history(lp_id, limit=20, db=None):
    """Merged, dated interaction history for an entity: communications + grid notes.

    Args:
        lp_id: canonical_entities.id whose history is wanted.
        limit: Cap on the communications fetched and on the items returned.
        db: Optional SQLite path; see _conn for the fallback order.

    Returns:
        ``{"lp_id", "items", "count"}``. "items" holds at most ``limit``
        entries, newest first. "count" is the number of items gathered before
        that final cut (communications already capped at ``limit``, plus every
        non-empty grid note), so it is not the size of the full history.
    """
    c = _conn(db)
    items = []
    try:
        cids = _contact_ids_for(c, lp_id)
        if cids:
            q = ("SELECT type, subject, body, communication_date FROM communications "
                 "WHERE contact_id IN (%s) ORDER BY communication_date DESC LIMIT ?" % ",".join("?" * len(cids)))
            for r in c.execute(q, [*cids, limit]):
                items.append({"kind": r["type"], "date": r["communication_date"],
                              "subject": r["subject"], "text": (r["body"] or "")[:240]})
        inv_src = [r["source_id"] for r in c.execute(
            "SELECT source_id FROM entity_links WHERE canonical_id=? AND source_model='fundraising_investors'", (lp_id,))]
        if inv_src:
            q = "SELECT notes, updated_at FROM fundraising_investors WHERE id IN (%s)" % ",".join("?" * len(inv_src))
            for r in c.execute(q, inv_src):
                if (r["notes"] or "").strip():
                    items.append({"kind": "grid_note", "date": r["updated_at"],
                                  "subject": "Fundraising grid notes", "text": r["notes"][:300]})
    finally:
        # Close on every path, including a failing query.
        c.close()
    # Dates are compared as strings; a missing date sorts as "" (i.e. last).
    items.sort(key=lambda x: (x["date"] or ""), reverse=True)
    return {"lp_id": lp_id, "items": items[:limit], "count": len(items)}
|
||||
|
||||
|
||||
# ── retrieval tools (wrap /api/search) ────────────────────────────────────────
|
||||
|
||||
def _filter(lp_id=None, doc_type=None, date_from=None, date_to=None):
    """Build a Qdrant ``{"must": [...]}`` filter from the optional criteria.

    lp_id and doc_type become exact-match clauses when truthy; date_from /
    date_to become one inclusive range clause on date_ts when either is not
    None. Returns None when no criterion applies.
    """
    clauses = []
    for key, wanted in (("lp_id", lp_id), ("doc_type", doc_type)):
        if wanted:
            clauses.append({"key": key, "match": {"value": wanted}})

    # `is not None` (not truthiness) so that a bound of 0 is still applied.
    bounds = {op: edge for op, edge in (("gte", date_from), ("lte", date_to)) if edge is not None}
    if bounds:
        clauses.append({"key": "date_ts", "range": bounds})

    if not clauses:
        return None
    return {"must": clauses}
|
||||
|
||||
|
||||
def _shape(rows):
    """Flatten raw search hits into the dicts returned to MCP callers.

    Each result carries score, lp_id, lp_name, doc_type, date_ts, text (the
    hit's own text, else the payload's) and source as
    "<source_model>:<source_id>".
    """
    def _one(hit):
        meta = hit.get("payload", {}) or {}
        return {
            "score": hit.get("score"),
            "lp_id": meta.get("lp_id"),
            "lp_name": meta.get("lp_name"),
            "doc_type": meta.get("doc_type"),
            "date_ts": meta.get("date_ts"),
            "text": hit.get("text") or meta.get("text"),
            "source": f"{meta.get('source_model')}:{meta.get('source_id')}",
        }

    return [_one(hit) for hit in rows]
|
||||
|
||||
|
||||
def hybrid_search(query, top_k=8, lp_id=None, doc_type=None, date_from=None, date_to=None):
    """Dense + BM25 + rerank. Default mode; best for entity-heavy queries."""
    criteria = _filter(lp_id, doc_type, date_from, date_to)
    hits = search.hybrid_search(query, top_k=top_k, filt=criteria)
    return {"mode": "hybrid", "query": query, "results": _shape(hits)}
|
||||
|
||||
|
||||
def semantic_search(query, top_k=8, lp_id=None, doc_type=None, date_from=None, date_to=None):
    """Dense only, high recall."""
    criteria = _filter(lp_id, doc_type, date_from, date_to)
    hits = search.semantic_search(query, top_k=top_k, filt=criteria)
    return {"mode": "semantic", "query": query, "results": _shape(hits)}
|
||||
|
||||
|
||||
def keyword_search(query, top_k=8, lp_id=None, doc_type=None, date_from=None, date_to=None):
    """High-precision lexical (sparse leg + rerank)."""
    criteria = _filter(lp_id, doc_type, date_from, date_to)
    hits = search.keyword_search(query, top_k=top_k, filt=criteria)
    return {"mode": "keyword", "query": query, "results": _shape(hits)}
|
||||
|
||||
|
||||
# ── write tools (every write logged — guardrail #5) ───────────────────────────
|
||||
|
||||
def log_interaction(action, actor_type="agent", actor_id=None, target_id=None,
                    target_type="canonical_entity", payload=None, source="mcp", db=None):
    """Append an entry to the append-only interaction log.

    Args:
        action: Action name stored in interaction_log.action.
        actor_type: Stored as actor_type (defaults to "agent").
        actor_id: Optional actor identifier.
        target_id: Optional id of the row the action concerns.
        target_type: Kind of target (defaults to "canonical_entity").
        payload: Optional JSON-serialisable detail; stored as a JSON string,
            or NULL when None.
        source: Origin tag (defaults to "mcp").
        db: Optional SQLite path; see _conn for the fallback order.

    Returns:
        ``{"id": <new log id>, "logged": True}``.
    """
    iid = str(uuid.uuid4())
    # One timestamp for both columns so ts and created_at agree for this event.
    stamp = _now()
    encoded = json.dumps(payload) if payload is not None else None
    c = _conn(db)
    try:
        c.execute("""INSERT INTO interaction_log
            (id, ts, actor_type, actor_id, action, target_type, target_id, payload, source, created_at)
            VALUES (?,?,?,?,?,?,?,?,?,?)""",
                  (iid, stamp, actor_type, actor_id, action, target_type, target_id,
                   encoded, source, stamp))
        c.commit()
    finally:
        # Close even when the insert fails.
        c.close()
    return {"id": iid, "logged": True}
|
||||
|
||||
|
||||
# Columns of canonical_entities that set_entity_enrichment may write. Keys are
# interpolated into the UPDATE statement, so this whitelist is also what keeps
# that statement safe.
_ENRICH_FIELDS = {"thesis_fit", "segment", "accreditation_status", "qp_status",
                  "warmth_score", "source", "owner_id", "last_touch_at", "notes"}


def set_entity_enrichment(lp_id, fields, actor_id="analyst", db=None):
    """One-way enrichment write INTO the canonical entity (guardrail #8). Logged.

    Args:
        lp_id: canonical_entities.id to update.
        fields: Mapping of column name to new value; keys outside
            _ENRICH_FIELDS are ignored.
        actor_id: Recorded as the log entry's actor_id.
        db: Optional SQLite path; see _conn for the fallback order.

    Returns:
        ``{"lp_id", "updated", "log_id"}`` on success;
        ``{"error": "no_valid_fields", "allowed": [...]}`` when no usable key
        was supplied; ``{"error": "not_found", "lp_id": lp_id}`` when no
        entity has that id (nothing is written or logged in that case).
    """
    upd = {k: v for k, v in (fields or {}).items() if k in _ENRICH_FIELDS}
    if not upd:
        return {"error": "no_valid_fields", "allowed": sorted(_ENRICH_FIELDS)}
    c = _conn(db)
    try:
        stamp = _now()
        sets = ", ".join(f"{k}=?" for k in upd) + ", updated_at=?"
        cur = c.execute(f"UPDATE canonical_entities SET {sets} WHERE id=?", [*upd.values(), stamp, lp_id])
        if cur.rowcount == 0:
            # No such entity: do not record an "enrichment.written" event for
            # a write that never happened.
            return {"error": "not_found", "lp_id": lp_id}
        iid = str(uuid.uuid4())
        c.execute("""INSERT INTO interaction_log
            (id, ts, actor_type, actor_id, action, target_type, target_id, payload, source, created_at)
            VALUES (?,?,?,?,?,?,?,?,?,?)""",
                  (iid, stamp, "agent", actor_id, "enrichment.written", "canonical_entity", lp_id,
                   json.dumps(upd), "mcp", stamp))
        # The update and its log row are committed together.
        c.commit()
    finally:
        c.close()
    return {"lp_id": lp_id, "updated": list(upd.keys()), "log_id": iid}
|
||||
@@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Ten31 CRM MCP server (Workstream C).
|
||||
|
||||
Exposes CRM reads, retrieval modes, and logged writes to the Claude Agent SDK
|
||||
over MCP (stdio). All logic lives in crm_tools.py (tested independently); this
|
||||
file is the thin transport wrapper.
|
||||
|
||||
Run:
|
||||
pip install mcp # one-time (MCP Python SDK)
|
||||
CRM_DB_PATH=/data/crm.db python3 backend/mcp/server.py
|
||||
|
||||
Register with the Agent SDK / Claude Code as an stdio MCP server pointing at this
|
||||
script. NO outbound/contact tools are exposed — that capability is gated to
|
||||
Phase 3 behind the compliance review (CLAUDE.md guardrails #4, #6).
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
import crm_tools as t # noqa: E402
|
||||
|
||||
from mcp.server.fastmcp import FastMCP # noqa: E402
|
||||
|
||||
mcp = FastMCP("ten31-crm")
|
||||
|
||||
|
||||
# ── reads ──
|
||||
@mcp.tool()
def get_entity(lp_id: str) -> dict:
    """Fetch a canonical LP/organization/person entity by id, with its linked
    source records and interaction count."""
    # Thin transport wrapper: all logic lives in crm_tools.
    entity = t.get_entity(lp_id)
    return entity
|
||||
|
||||
|
||||
@mcp.tool()
def search_records(query: str = "", entity_kind: str = "", limit: int = 20) -> dict:
    """Structured search over canonical entities by name substring and kind
    ('lp' | 'organization' | 'person')."""
    # Empty strings mean "no filter"; crm_tools expects None for that.
    name_part = query or None
    kind = entity_kind or None
    return t.search_records(query=name_part, entity_kind=kind, limit=limit)
|
||||
|
||||
|
||||
@mcp.tool()
def get_interaction_history(lp_id: str, limit: int = 20) -> dict:
    """Merged, dated interaction history (communications + fundraising grid notes)
    for a canonical entity."""
    # Thin transport wrapper: all logic lives in crm_tools.
    history = t.get_interaction_history(lp_id, limit=limit)
    return history
|
||||
|
||||
|
||||
# ── retrieval modes ──
|
||||
@mcp.tool()
def hybrid_search(query: str, top_k: int = 8, lp_id: str = "", doc_type: str = "",
                  date_from: int = 0, date_to: int = 0) -> dict:
    """Dense + BM25 + rerank retrieval (default; best for entity-heavy queries).
    Optional filters: lp_id, doc_type, date_from/date_to (epoch seconds)."""
    # Empty-string / zero defaults mean "no filter"; normalise them to None.
    filters = {
        "lp_id": lp_id or None,
        "doc_type": doc_type or None,
        "date_from": date_from or None,
        "date_to": date_to or None,
    }
    return t.hybrid_search(query, top_k=top_k, **filters)
|
||||
|
||||
|
||||
@mcp.tool()
def semantic_search(query: str, top_k: int = 8, lp_id: str = "", doc_type: str = "",
                    date_from: int = 0, date_to: int = 0) -> dict:
    """Dense-only retrieval (high recall).
    Optional filters: lp_id, doc_type, date_from/date_to (epoch seconds)."""
    # Same filter surface as hybrid_search; empty-string / zero defaults mean
    # "no filter" and are passed on as None.
    return t.semantic_search(query, top_k=top_k, lp_id=lp_id or None, doc_type=doc_type or None,
                             date_from=date_from or None, date_to=date_to or None)
|
||||
|
||||
|
||||
@mcp.tool()
def keyword_search(query: str, top_k: int = 8, lp_id: str = "", doc_type: str = "",
                   date_from: int = 0, date_to: int = 0) -> dict:
    """High-precision lexical retrieval (sparse leg + rerank).
    Optional filters: lp_id, doc_type, date_from/date_to (epoch seconds)."""
    # Same filter surface as hybrid_search; empty-string / zero defaults mean
    # "no filter" and are passed on as None.
    return t.keyword_search(query, top_k=top_k, lp_id=lp_id or None, doc_type=doc_type or None,
                            date_from=date_from or None, date_to=date_to or None)
|
||||
|
||||
|
||||
# ── writes (logged) ──
|
||||
@mcp.tool()
def log_interaction(action: str, actor_type: str = "agent", actor_id: str = "",
                    target_id: str = "", payload: dict | None = None, source: str = "mcp") -> dict:
    """Append an entry to the append-only interaction log (guardrail #5)."""
    # payload is annotated as optional so the declared type matches its None
    # default. Empty-string ids mean "not given" and are stored as NULL.
    return t.log_interaction(action, actor_type=actor_type, actor_id=actor_id or None,
                             target_id=target_id or None, payload=payload, source=source)
|
||||
|
||||
|
||||
@mcp.tool()
def set_entity_enrichment(lp_id: str, fields: dict, actor_id: str = "analyst") -> dict:
    """One-way enrichment write into a canonical entity (thesis_fit, segment,
    warmth_score, accreditation_status, etc.). Logged automatically."""
    # Thin transport wrapper: field whitelisting and logging happen in crm_tools.
    outcome = t.set_entity_enrichment(lp_id, fields, actor_id=actor_id)
    return outcome
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
mcp.run()
|
||||
@@ -0,0 +1,23 @@
|
||||
-- Reversal of 0001_phase0_foundation.sql.
--
-- Run manually (it is NOT applied automatically) if the Phase-0 foundation
-- schema needs to be rolled back. Drops the new tables. The added `deleted_at`
-- columns on existing tables are left in place by default because they are
-- inert (nullable, unused unless the DELETE handlers are changed) and dropping
-- columns requires SQLite >= 3.35; uncomment the DROP COLUMN lines to remove
-- them on a modern SQLite.

-- relationship_edges and entity_links declare foreign keys to
-- canonical_entities, so canonical_entities is dropped last.
DROP TABLE IF EXISTS relationship_edges;
DROP TABLE IF EXISTS interaction_log;
DROP TABLE IF EXISTS entity_links;
DROP TABLE IF EXISTS canonical_entities;

-- Requires SQLite >= 3.35.0. Safe to leave the columns if unsure.
-- ALTER TABLE contacts DROP COLUMN deleted_at;
-- ALTER TABLE organizations DROP COLUMN deleted_at;
-- ALTER TABLE opportunities DROP COLUMN deleted_at;
-- ALTER TABLE communications DROP COLUMN deleted_at;
-- ALTER TABLE lp_profiles DROP COLUMN deleted_at;

-- Forget the migration so it can be re-applied:
-- NOTE(review): if the deleted_at columns above were left in place, a re-apply
-- will hit the ADD COLUMN statements again — confirm the runner tolerates that.
DELETE FROM schema_migrations WHERE filename = '0001_phase0_foundation.sql';
|
||||
@@ -0,0 +1,116 @@
|
||||
-- Phase 0 — Workstream A2: foundation schema for the agentic system.
|
||||
--
|
||||
-- ADDITIVE AND REVERSIBLE ONLY (CLAUDE.md guardrail #3): this migration adds
|
||||
-- new tables and new nullable columns alongside the existing CRM. It never
|
||||
-- drops, renames, or rewrites existing data. Its reversal is 0001_phase0_foundation.down.sql.
|
||||
--
|
||||
-- Applied once at startup by backend/core_migrations.py, tracked in the
|
||||
-- schema_migrations ledger. Safe to leave in place; the canonical layer it
|
||||
-- creates starts EMPTY and is populated later by entity resolution (A4/B3).
|
||||
|
||||
-- ============================================================================
-- 1. canonical_entities — the single, model-agnostic identity for an LP /
-- organization / person. Both the classic contacts/lp_profiles model and the
-- fundraising_* grid map INTO this; neither existing model is demoted.
-- IDs are full-length (e.g. 'lp_' + uuid4 hex), NOT the 8-char truncated
-- UUIDs used elsewhere in the CRM, so they are safe as the index/payload key.
-- ============================================================================
-- NOTE(review): entity_kind is free TEXT with no CHECK constraint; the three
-- documented values are enforced only by the code that writes rows.
-- NOTE(review): created_at/updated_at default to datetime('now'), while the
-- MCP write path in this commit stores Python isoformat() strings in
-- updated_at — the two formats differ, so string ordering across them is
-- unreliable.
CREATE TABLE IF NOT EXISTS canonical_entities (
    id TEXT PRIMARY KEY,
    entity_kind TEXT NOT NULL, -- 'lp' | 'organization' | 'person'
    display_name TEXT NOT NULL,
    primary_email TEXT,
    -- Phase-0 LP/prospect fields (model-agnostic home):
    thesis_fit TEXT,
    segment TEXT,
    accreditation_status TEXT, -- free-text until counsel defines the vocabulary (guardrail #6)
    qp_status TEXT,
    warmth_score REAL,
    source TEXT,
    owner_id TEXT REFERENCES users(id),
    last_touch_at TEXT,
    notes TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    updated_at TEXT DEFAULT (datetime('now')),
    deleted_at TEXT -- soft-delete (never hard-delete; guardrail #3)
);
CREATE INDEX IF NOT EXISTS idx_canonical_kind ON canonical_entities(entity_kind);
CREATE INDEX IF NOT EXISTS idx_canonical_email ON canonical_entities(primary_email);
CREATE INDEX IF NOT EXISTS idx_canonical_owner ON canonical_entities(owner_id);
|
||||
|
||||
-- ============================================================================
-- 2. entity_links — resolution map. Every source row (a contacts row, a
-- fundraising_investors row, etc.) and every email/name variant points at the
-- canonical entity it resolves to. This is how name variants collapse to one id.
-- ============================================================================
-- NOTE(review): SQLite treats NULLs as distinct inside a UNIQUE constraint, so
-- the UNIQUE(source_model, source_id, match_value) below does not deduplicate
-- rows whose source_id (or match_value) is NULL — e.g. bare aliases.
CREATE TABLE IF NOT EXISTS entity_links (
    id TEXT PRIMARY KEY,
    canonical_id TEXT NOT NULL REFERENCES canonical_entities(id) ON DELETE CASCADE,
    source_model TEXT NOT NULL, -- contacts|organizations|lp_profiles|fundraising_investors|fundraising_contacts|email_address|alias
    source_id TEXT, -- the local PK in that model (NULL for a bare email/name alias)
    match_value TEXT, -- normalized email or name variant
    match_kind TEXT NOT NULL, -- exact_email|name_variant|domain|manual
    confidence REAL DEFAULT 1.0,
    created_at TEXT DEFAULT (datetime('now')),
    UNIQUE(source_model, source_id, match_value)
);
CREATE INDEX IF NOT EXISTS idx_entity_links_canonical ON entity_links(canonical_id);
CREATE INDEX IF NOT EXISTS idx_entity_links_match ON entity_links(match_value);
CREATE INDEX IF NOT EXISTS idx_entity_links_source ON entity_links(source_model, source_id);
|
||||
|
||||
-- ============================================================================
-- 3. interaction_log — APPEND-ONLY record of every agent action and every human
-- touch (guardrail #5). Distinct from audit_log (which is mutation-diff-only
-- and has no actor/agent dimension). Nothing in this table is ever updated or
-- deleted by convention.
-- ============================================================================
-- NOTE(review): append-only is a convention only — no trigger in this
-- migration blocks UPDATE or DELETE on the table.
-- NOTE(review): target_id carries no foreign key, so it can hold ids from any
-- of the target_type models listed below.
CREATE TABLE IF NOT EXISTS interaction_log (
    id TEXT PRIMARY KEY,
    ts TEXT NOT NULL DEFAULT (datetime('now')), -- event time
    actor_type TEXT NOT NULL, -- human | agent | system
    actor_id TEXT, -- users.id, or an agent name (Scout/Analyst/...)
    action TEXT NOT NULL, -- e.g. note.created | email.matched | enrichment.written | search.run
    target_type TEXT, -- canonical_entity | contact | communication | opportunity | ...
    target_id TEXT, -- canonical_entities.id where possible
    payload TEXT, -- JSON blob with the action detail
    source TEXT, -- crm_ui | mcp | ingest | scout | ...
    created_at TEXT DEFAULT (datetime('now'))
);
CREATE INDEX IF NOT EXISTS idx_interaction_target ON interaction_log(target_type, target_id);
CREATE INDEX IF NOT EXISTS idx_interaction_ts ON interaction_log(ts);
CREATE INDEX IF NOT EXISTS idx_interaction_actor ON interaction_log(actor_type, actor_id);
|
||||
|
||||
-- ============================================================================
-- 4. relationship_edges — derived graph of who-knows-whom between canonical
-- entities. Starts EMPTY; seeded later from email_investor_links + calendar +
-- X follower overlap (Analyst, Phase 2).
-- ============================================================================
-- NOTE(review): the UNIQUE key is ordered (src_id, dst_id, ...), so an edge
-- stored as (a, b) and again as (b, a) counts as two rows even when
-- directed = 0 — writers presumably need to normalise the pair order.
CREATE TABLE IF NOT EXISTS relationship_edges (
    id TEXT PRIMARY KEY,
    src_id TEXT NOT NULL REFERENCES canonical_entities(id) ON DELETE CASCADE,
    dst_id TEXT NOT NULL REFERENCES canonical_entities(id) ON DELETE CASCADE,
    edge_type TEXT NOT NULL, -- email_corr | calendar | x_follow | intro | colleague
    source TEXT NOT NULL, -- provenance of this edge
    strength REAL DEFAULT 0,
    directed INTEGER DEFAULT 0, -- presumably a 0/1 flag; defaults to undirected
    evidence TEXT, -- JSON supporting detail
    first_seen_at TEXT,
    last_seen_at TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    updated_at TEXT DEFAULT (datetime('now')),
    UNIQUE(src_id, dst_id, edge_type, source)
);
CREATE INDEX IF NOT EXISTS idx_rel_src ON relationship_edges(src_id);
CREATE INDEX IF NOT EXISTS idx_rel_dst ON relationship_edges(dst_id);
|
||||
|
||||
-- ============================================================================
-- 5. Soft-delete columns on existing tables. Additive nullable columns; the CRM
-- currently HARD-deletes everywhere (guardrail #3 gap). Adding the column is
-- safe now; switching the DELETE handlers to set it instead of hard-deleting
-- is a separate, reviewed code change.
-- ============================================================================
-- Unlike the CREATE ... IF NOT EXISTS statements above, ADD COLUMN is not
-- idempotent: running it against a table that already has deleted_at fails.
-- NOTE(review): confirm the migration runner handles that case on a re-run.
ALTER TABLE contacts ADD COLUMN deleted_at TEXT;
ALTER TABLE organizations ADD COLUMN deleted_at TEXT;
ALTER TABLE opportunities ADD COLUMN deleted_at TEXT;
ALTER TABLE communications ADD COLUMN deleted_at TEXT;
ALTER TABLE lp_profiles ADD COLUMN deleted_at TEXT;
|
||||
@@ -9,3 +9,4 @@ passlib[bcrypt]==1.7.4
|
||||
python-multipart==0.0.9
|
||||
aiofiles==23.2.1
|
||||
httpx==0.27.0
|
||||
cryptography==42.0.5
|
||||
|
||||
@@ -0,0 +1,279 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Seed a SYNTHETIC dev database for Ten31 CRM ingest/retrieval testing.
|
||||
|
||||
ALL DATA IS FAKE. No real LP/prospect information appears here (CLAUDE.md
|
||||
guardrail #9: Claude works only on synthetic/redacted data). This produces a
|
||||
realistic-shaped corpus so the Phase-0 ingest, chunking, and entity-resolution
|
||||
work can be developed and tested without ever touching the live CRM.
|
||||
|
||||
What it builds (into a SEPARATE dev DB, never crm.db):
|
||||
* The full real schema, via server.init_db() — which also runs the new
|
||||
core migration (backend/migrations/), so the canonical/interaction/graph
|
||||
tables exist.
|
||||
* A classic-model dataset: organizations, contacts (investors + prospects),
|
||||
opportunities across pipeline stages, communications with entity-rich prose
|
||||
notes, and lp_profiles.
|
||||
* A fundraising grid (fundraising_state.grid_json) populated via the real
|
||||
sync_fundraising_relational() code path, so the normalized mirror + the
|
||||
grid->classic bridge behave exactly as in production.
|
||||
* DELIBERATE entity-resolution test cases: several investors appear in BOTH
|
||||
models with NAME VARIANTS (e.g. "Jonathan Reyes" vs grid contact "Jon
|
||||
Reyes"), some with matching email (easy merge) and some without (hard case).
|
||||
|
||||
Usage:
|
||||
python3 backend/scripts/seed_synthetic.py # -> data/crm_dev.db
|
||||
python3 backend/scripts/seed_synthetic.py --db /tmp/x.db
|
||||
"""
|
||||
import argparse
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
|
||||
HERE = os.path.dirname(os.path.abspath(__file__))
|
||||
BACKEND_DIR = os.path.dirname(HERE)
|
||||
PROJECT_DIR = os.path.dirname(BACKEND_DIR)
|
||||
sys.path.insert(0, BACKEND_DIR)
|
||||
|
||||
ORGS = [
|
||||
("Cedar Point Capital", "family_office"),
|
||||
("Harbor & Vine Family Office", "family_office"),
|
||||
("Northwind Endowment", "endowment"),
|
||||
("Granite Peak Partners", "institutional"),
|
||||
("Solano Ventures", "wealth_management"),
|
||||
("Tidewater Holdings", "family_office"),
|
||||
("Brightwater Capital", "institutional"),
|
||||
("Meridian Trust", "foundation"),
|
||||
]
|
||||
|
||||
FIRST = ["Jonathan", "Katherine", "Michael", "William", "Robert", "Elena", "Priya",
|
||||
"David", "Sarah", "James", "Maria", "Thomas", "Laura", "Daniel", "Rachel",
|
||||
"Steven", "Nicole", "Andrew", "Jessica", "Brian"]
|
||||
LAST = ["Reyes", "Calder", "Okonkwo", "Brandt", "Sutter", "Vance", "Mehta", "Ellison",
|
||||
"Cho", "Whitlock", "Santos", "Aldridge", "Kerr", "Nilsson", "Pope", "Devlin",
|
||||
"Frye", "Osei", "Lindqvist", "Marsh"]
|
||||
NICK = {"Jonathan": "Jon", "Katherine": "Kate", "Michael": "Mike", "William": "Bill",
|
||||
"Robert": "Bob", "James": "Jim", "Thomas": "Tom", "Daniel": "Dan",
|
||||
"Steven": "Steve", "Jessica": "Jess"}
|
||||
|
||||
FUND_COLS = ["fund_i", "fund_ii", "fund_iii", "tactical_fund", "pawn_to_e4",
|
||||
"ten31_terahash", "sats_and_stats", "pawn_to_f4", "join_the_fold"]
|
||||
FUND_LABELS = {"fund_i": "Fund I", "fund_ii": "Fund II", "fund_iii": "Fund III",
|
||||
"tactical_fund": "Tactical Fund"}
|
||||
AMOUNTS = [250_000, 500_000, 1_000_000, 2_500_000, 5_000_000]
|
||||
LEADS = ["JK", "Grant", "MB", "Parker"]
|
||||
|
||||
COMM_TEMPLATES = [
|
||||
("call", "Intro call recap",
|
||||
"Spoke with {person} ({org}) for {dur} min about {fund}. Strong interest in the "
|
||||
"bitcoin-energy and AI-infrastructure thesis; wants the latest deck and DPI figures. "
|
||||
"Flagged accreditation paperwork still outstanding. Next: send one-pager and schedule a partner call."),
|
||||
("email", "Follow-up: {fund} allocation",
|
||||
"Sent {person} the {fund} summary and the scarcity/critical-infrastructure memo. "
|
||||
"They asked how Ten31 Terahash relates to the energy thesis. Following up next week on commitment size."),
|
||||
("meeting", "Partner meeting notes",
|
||||
"Met {person} at {org}. Discussed pacing into {fund} and co-invest appetite. "
|
||||
"Concern about lockup; reassured on secondary options. Warm — wants to meet the GP again before committing."),
|
||||
("note", "Diligence status",
|
||||
"{person} is mid-diligence on {fund}. Legal reviewing subscription docs; wire expected within 30 days. "
|
||||
"Keep warm; send the Q update."),
|
||||
("text", "Quick ping",
|
||||
"Texted {person} re: the {fund} close timeline. Said they're 'in for at least a unit' pending IC approval."),
|
||||
]
|
||||
|
||||
|
||||
def main():
    """Build the synthetic dev database and print a row-count summary.

    Steps, in order:
      1. Resolve ``--db`` (default ``<project>/data/crm_dev.db``) and refuse to
         touch any file named ``crm.db``.
      2. Delete a previous DB at that path (plus its ``-wal`` / ``-shm``
         siblings) and make sure the parent directory exists.
      3. Point ``CRM_DB_PATH`` at the file, import ``server`` and run
         ``server.init_db()``.
      4. Insert a dev user, organizations, contacts, opportunities,
         lp_profiles and communications.
      5. Build a fundraising grid and push it through
         ``server.sync_fundraising_relational``.
      6. Commit and print per-table counts.

    ``random`` is seeded with a fixed value and the order of ``random`` calls
    is significant: reordering them changes the generated data set.

    Raises:
        SystemExit: if the target file is named ``crm.db``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=os.path.join(PROJECT_DIR, "data", "crm_dev.db"))
    args = ap.parse_args()
    db = os.path.abspath(args.db)
    if os.path.basename(db) == "crm.db":
        sys.exit("Refusing to seed the real crm.db. Use a dev path like data/crm_dev.db.")

    # Always start from an empty database, including SQLite's sidecar files.
    for ext in ("", "-wal", "-shm"):
        if os.path.exists(db + ext):
            os.remove(db + ext)

    # The target directory may not exist on a fresh checkout; without it the
    # database file cannot be created. `db` is absolute, so dirname is non-empty.
    os.makedirs(os.path.dirname(db), exist_ok=True)

    os.environ["CRM_DB_PATH"] = db
    import server  # noqa: E402 (must follow CRM_DB_PATH assignment)

    server.init_db()
    gen, now = server.generate_id, server.now
    conn = server.get_db()
    random.seed(31)

    def past(days_ago):
        # Same naive-UTC "...Z" text the deprecated utcnow() produced.
        moment = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=days_ago)
        return moment.replace(tzinfo=None).isoformat() + "Z"

    # Loop-invariant values hoisted out of the insert loops.
    contact_sql = (
        "INSERT INTO contacts (id, first_name, last_name, email, title, organization_id, contact_type, "
        "status, source, notes, created_by, updated_at) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)"
    )
    fund_labels = list(FUND_LABELS.values())
    # Strict lookup on purpose: an unknown pipeline stage should fail loudly.
    stage_probability = {"lead": 10, "outreach": 25, "meeting": 40,
                         "due_diligence": 60, "committed": 90, "funded": 100}

    try:
        # ── dev user (FK target for created_by/owner_id) ──
        uid = gen()
        conn.execute(
            "INSERT INTO users (id, username, email, password_hash, full_name, role) VALUES (?,?,?,?,?,?)",
            (uid, "dev_admin", "dev@example.invalid", server.hash_password("devpassword"),
             "Dev Admin", "admin"))

        # ── organizations ──
        org_ids = {}
        for name, otype in ORGS:
            oid = gen()
            org_ids[name] = oid
            conn.execute(
                "INSERT INTO organizations (id, name, type, industry, country, description, created_by, updated_at) "
                "VALUES (?,?,?,?,?,?,?,?)",
                (oid, name, otype, "Investment Management", "USA",
                 f"{name} — synthetic {otype.replace('_', ' ')} used for ingest testing.", uid, now()))

        # ── classic contacts (investors + prospects) ──
        contacts = []       # (cid, first, last, org_name, contact_type)
        overlap_specs = []  # investors we will also place in the grid, with variants
        used = set()
        for i, (org_name, _) in enumerate(ORGS):
            # one "primary" investor contact per org
            first, last = FIRST[i], LAST[i]
            used.add((first, last))
            cid = gen()
            email = f"{first.lower()}.{last.lower()}@{org_name.split()[0].lower()}.invalid"
            conn.execute(
                contact_sql,
                (cid, first, last, email, "Managing Partner", org_ids[org_name], "investor", "active",
                 "referral", f"Primary relationship at {org_name}. Met via conference intro.", uid, now()))
            contacts.append((cid, first, last, org_name, "investor"))
            # mark 5 of 8 for grid overlap with a NAME VARIANT
            if i < 5:
                variant = NICK.get(first, first[0] + ".")  # nickname or initial
                match_email = email if i % 2 == 0 else ""  # half share email (easy), half don't (hard)
                overlap_specs.append((org_name, f"{variant} {last}", match_email))

        # extra prospect contacts (no org sometimes)
        for j in range(12):
            first = FIRST[(j + 8) % len(FIRST)]
            last = LAST[(j + 8) % len(LAST)]
            if (first, last) in used:
                last = LAST[(j + 11) % len(LAST)]
            used.add((first, last))
            org_name = ORGS[j % len(ORGS)][0] if j % 3 else None
            cid = gen()
            email = f"{first.lower()}{last.lower()}@example.invalid"
            conn.execute(
                contact_sql,
                (cid, first, last, email, "Principal", org_ids.get(org_name) if org_name else None,
                 "prospect", "active", random.choice(["inbound", "referral", "conference", "x"]),
                 f"Prospect sourced via {random.choice(['X DM', 'warm intro', 'podcast'])}.", uid, now()))
            contacts.append((cid, first, last, org_name, "prospect"))

        # ── opportunities + lp_profiles + communications ──
        stages = server.PIPELINE_STAGES
        for idx, (cid, first, last, org_name, ctype) in enumerate(contacts):
            person = f"{first} {last}"
            # opportunity for most contacts
            if idx % 5 != 4:
                stage = stages[idx % len(stages)]
                fund_label = random.choice(fund_labels)
                amt = random.choice(AMOUNTS)
                conn.execute(
                    "INSERT INTO opportunities (id, name, contact_id, organization_id, stage, commitment_amount, "
                    "expected_amount, probability, fund_name, description, next_step, owner_id, priority, updated_at) "
                    "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                    (gen(), f"{org_name or person} — {fund_label}", cid,
                     org_ids.get(org_name) if org_name else None,
                     stage, amt if stage in ("committed", "funded") else 0, amt,
                     stage_probability[stage],
                     fund_label, f"Potential {fund_label} allocation for {person}.",
                     random.choice(["Send deck", "Schedule call", "Await IC", "Send subdocs"]),
                     uid, random.choice(["low", "medium", "high"]), now()))

            # lp_profile for ~closed investors
            if ctype == "investor" and idx % 2 == 0:
                amt = random.choice(AMOUNTS)
                conn.execute(
                    "INSERT INTO lp_profiles (id, contact_id, commitment_amount, funded_amount, commitment_date, "
                    "fund_name, investor_type, accredited, legal_docs_signed, wire_received, k1_sent, notes, updated_at) "
                    "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                    (gen(), cid, amt, amt if idx % 4 == 0 else 0, past(120),
                     random.choice(fund_labels),
                     random.choice(["family_office", "institutional", "endowment", "individual"]),
                     1, 1 if idx % 3 else 0, 1 if idx % 4 == 0 else 0, 0,
                     f"Closed LP. Accreditation on file. Primary contact {person}.", now()))

            # 2-4 communications each, entity-rich prose
            for _ in range(random.randint(2, 4)):
                ctype_comm, subj, body = random.choice(COMM_TEMPLATES)
                fund = random.choice(["Fund III", "Tactical Fund", "Ten31 Terahash", "Fund II"])
                conn.execute(
                    "INSERT INTO communications (id, contact_id, type, subject, body, communication_date, "
                    "duration_minutes, outcome, next_action, created_by, updated_at) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
                    (gen(), cid, ctype_comm,
                     subj.format(fund=fund),
                     body.format(person=person, org=org_name or "their firm", fund=fund,
                                 dur=random.choice([20, 30, 45])),
                     past(random.randint(1, 200)), random.choice([20, 30, 45, None]),
                     random.choice(["positive", "neutral", "needs follow-up"]),
                     random.choice(["Send deck", "Schedule call", "Send subdocs", None]),
                     uid, now()))

        # ── fundraising grid (authoritative grid_json -> real sync path) ──
        columns = server.DEFAULT_FUNDRAISING_COLUMNS
        views = server.DEFAULT_GRID_VIEWS
        rows = []

        # (a) overlap investors — same org as a classic investor, but a NAME-VARIANT contact
        for org_name, variant_name, match_email in overlap_specs:
            row = {"id": "row-" + gen(), "investor_name": org_name, "lead": random.choice(LEADS),
                   "lead_source": random.choice(["Conference", "Warm intro", "X"]),
                   "notes": f"[call] {variant_name}: discussed Fund III pacing and co-invest. Warm.\n"
                            f"[email] {variant_name}: sent the energy-thesis memo.",
                   "priority": random.random() < 0.4, "follow_up": random.random() < 0.5, "graveyard": False,
                   "contacts": [{"name": variant_name,
                                 "email": match_email or f"{variant_name.split()[0].lower()}@{org_name.split()[0].lower()}.invalid",
                                 "title": "Managing Partner"}]}
            for fc in random.sample(FUND_COLS, k=random.randint(1, 3)):
                row[fc] = random.choice(AMOUNTS)
            rows.append(row)

        # (b) grid-only investors (no classic counterpart) — exercise the create path
        for n in range(7):
            nm = f"{random.choice(['Slate', 'Copper', 'Ridgeline', 'Anchor', 'Falcon', 'Quarry', 'Beacon'])} " \
                 f"{random.choice(['Capital', 'Partners', 'Holdings', 'Group'])}"
            row = {"id": "row-" + gen(), "investor_name": nm, "lead": random.choice(LEADS),
                   "lead_source": random.choice(["Inbound", "Referral", "Podcast"]),
                   "notes": f"[note] First touch with {nm}. Sourced via X. Gauging thesis fit.",
                   "priority": False, "follow_up": random.random() < 0.6,
                   "graveyard": n >= 5,  # a couple in the graveyard list
                   "contacts": [{"name": f"{random.choice(FIRST)} {random.choice(LAST)}",
                                 "email": f"contact{n}@{nm.split()[0].lower()}.invalid", "title": "Partner"}]}
            for fc in random.sample(FUND_COLS, k=random.randint(0, 2)):
                row[fc] = random.choice(AMOUNTS)
            rows.append(row)

        grid = {"columns": columns, "rows": rows}
        conn.execute(
            "INSERT INTO fundraising_state (id, grid_json, views_json, version, updated_by, created_at, updated_at) "
            "VALUES ('main', ?, ?, 1, ?, ?, ?)",
            (json.dumps(grid), json.dumps(views), uid, now(), now()))
        server.sync_fundraising_relational(conn, grid, views, actor_user_id=uid)
        conn.commit()

        # ── summary ──
        def count(t):
            # Table names come only from the hard-coded tuples below.
            return conn.execute(f"SELECT COUNT(*) FROM {t}").fetchone()[0]

        print(f"\nSynthetic dev DB written to: {db}")
        print(" Classic model:")
        for t in ("organizations", "contacts", "opportunities", "communications", "lp_profiles"):
            print(f" {t:<24} {count(t)}")
        print(" Fundraising grid (after real sync):")
        for t in ("fundraising_investors", "fundraising_contacts", "fundraising_funds",
                  "fundraising_commitments", "fundraising_list_memberships"):
            print(f" {t:<24} {count(t)}")
        print(" Phase-0 foundation tables (from migration, empty until entity resolution):")
        for t in ("canonical_entities", "entity_links", "interaction_log", "relationship_edges"):
            print(f" {t:<24} {count(t)}")
        inv = count("contacts")  # note grid bridge may have created extra investor contacts (the variants)
        print(f"\n Entity-resolution test bed: {len(overlap_specs)} investors intentionally appear in BOTH models "
              f"with name variants; total contacts now {inv} (grid bridge added the variant rows).")
    finally:
        # Release the connection even when an insert or the sync step fails.
        conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+620
-52
@@ -18,7 +18,7 @@ import re
|
||||
import base64
|
||||
import threading
|
||||
from datetime import datetime, timedelta
|
||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||
from http.server import HTTPServer, ThreadingHTTPServer, BaseHTTPRequestHandler
|
||||
from urllib.parse import urlparse, parse_qs, unquote
|
||||
from functools import wraps
|
||||
|
||||
@@ -52,6 +52,14 @@ CORS_ORIGIN = os.environ.get("CRM_CORS_ORIGIN", "*")
|
||||
ENV = os.environ.get("CRM_ENV", "development")
|
||||
LOGIN_RATE_LIMIT_PER_MIN = int(os.environ.get("CRM_LOGIN_RATE_LIMIT_PER_MIN", "20"))
|
||||
WRITE_RATE_LIMIT_PER_MIN = int(os.environ.get("CRM_WRITE_RATE_LIMIT_PER_MIN", "300"))
|
||||
GET_RATE_LIMIT_PER_MIN = int(os.environ.get("CRM_GET_RATE_LIMIT_PER_MIN", "600"))
|
||||
# Auto-ban any IP that racks up too many 404s in a short window — almost always
|
||||
# a vulnerability scanner blasting common paths (/.env, /.git/config, /swagger,
|
||||
# /actuator/env, wp-json, etc.). Banned IPs get instant 429s with no DB or
|
||||
# filesystem work, so they can't keep the single SQLite writer busy.
|
||||
ABUSE_404_THRESHOLD = int(os.environ.get("CRM_ABUSE_404_THRESHOLD", "15"))
|
||||
ABUSE_404_WINDOW_SEC = int(os.environ.get("CRM_ABUSE_404_WINDOW_SEC", "60"))
|
||||
ABUSE_BAN_SEC = int(os.environ.get("CRM_ABUSE_BAN_SEC", "900")) # 15 minutes
|
||||
BACKUP_POLICY_SETTING_KEY = "fundraising_backup_policy"
|
||||
DEFAULT_BACKUP_POLICY = {
|
||||
"enabled": True,
|
||||
@@ -127,6 +135,10 @@ def init_db():
|
||||
tags TEXT DEFAULT '[]',
|
||||
notes TEXT,
|
||||
linkedin_url TEXT,
|
||||
city TEXT,
|
||||
state TEXT,
|
||||
country TEXT,
|
||||
location_query TEXT,
|
||||
preferred_contact TEXT DEFAULT 'email',
|
||||
created_by TEXT REFERENCES users(id),
|
||||
created_at TEXT DEFAULT (datetime('now')),
|
||||
@@ -258,6 +270,7 @@ def init_db():
|
||||
investor_name TEXT NOT NULL,
|
||||
notes TEXT,
|
||||
lead TEXT,
|
||||
lead_source TEXT,
|
||||
priority INTEGER DEFAULT 0,
|
||||
follow_up INTEGER DEFAULT 0,
|
||||
graveyard INTEGER DEFAULT 0,
|
||||
@@ -337,6 +350,30 @@ def init_db():
|
||||
UNIQUE(investor_id, list_key)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS fundraising_presence (
|
||||
user_id TEXT PRIMARY KEY REFERENCES users(id) ON DELETE CASCADE,
|
||||
username TEXT NOT NULL,
|
||||
full_name TEXT,
|
||||
active_view TEXT,
|
||||
row_id TEXT,
|
||||
col_id TEXT,
|
||||
is_editing INTEGER DEFAULT 0,
|
||||
cell_key TEXT,
|
||||
last_seen_at TEXT DEFAULT (datetime('now')),
|
||||
expires_at_epoch INTEGER NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS fundraising_cell_locks (
|
||||
cell_key TEXT PRIMARY KEY,
|
||||
row_id TEXT NOT NULL,
|
||||
col_id TEXT NOT NULL,
|
||||
locked_by_user_id TEXT NOT NULL REFERENCES users(id) ON DELETE CASCADE,
|
||||
locked_by_username TEXT NOT NULL,
|
||||
locked_by_full_name TEXT,
|
||||
last_seen_at TEXT DEFAULT (datetime('now')),
|
||||
expires_at_epoch INTEGER NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS app_settings (
|
||||
key TEXT PRIMARY KEY,
|
||||
value_json TEXT NOT NULL,
|
||||
@@ -363,9 +400,42 @@ def init_db():
|
||||
CREATE INDEX IF NOT EXISTS idx_fr_commitments_fund ON fundraising_commitments(fund_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_fr_automation_runs_created ON fundraising_automation_runs(created_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_fr_memberships_list ON fundraising_list_memberships(list_key);
|
||||
CREATE INDEX IF NOT EXISTS idx_fr_presence_expires ON fundraising_presence(expires_at_epoch);
|
||||
CREATE INDEX IF NOT EXISTS idx_fr_locks_expires ON fundraising_cell_locks(expires_at_epoch);
|
||||
""")
|
||||
|
||||
# Lightweight schema migrations for existing databases.
|
||||
for stmt in [
|
||||
"ALTER TABLE contacts ADD COLUMN city TEXT",
|
||||
"ALTER TABLE contacts ADD COLUMN state TEXT",
|
||||
"ALTER TABLE contacts ADD COLUMN country TEXT",
|
||||
"ALTER TABLE contacts ADD COLUMN location_query TEXT",
|
||||
"ALTER TABLE fundraising_investors ADD COLUMN lead_source TEXT",
|
||||
]:
|
||||
try:
|
||||
conn.execute(stmt)
|
||||
except sqlite3.OperationalError:
|
||||
pass
|
||||
|
||||
# ─── Gmail integration migrations (feature-flag-guarded import) ───
|
||||
try:
|
||||
from email_integration.db import apply_migrations as _email_apply_migrations
|
||||
_email_apply_migrations(cursor)
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception as _e:
|
||||
print(f"[email_integration] migration warning: {_e}")
|
||||
|
||||
conn.commit()
|
||||
|
||||
# ─── Core schema migrations (Phase 0+; ordered .sql files w/ ledger) ───
|
||||
# Additive/reversible only; tracked in schema_migrations. See core_migrations.py.
|
||||
try:
|
||||
from core_migrations import apply_core_migrations as _apply_core_migrations
|
||||
_apply_core_migrations(conn)
|
||||
except Exception as _e:
|
||||
print(f"[migrations] core migration warning: {_e}")
|
||||
|
||||
conn.close()
|
||||
print(f"Database initialized at {DB_PATH}")
|
||||
|
||||
@@ -563,6 +633,16 @@ def _split_full_name(full_name):
|
||||
def _normalize_text(value):
|
||||
return str(value or '').strip().lower()
|
||||
|
||||
def _parse_location_text(text):
|
||||
raw = str(text or '').strip()
|
||||
if not raw:
|
||||
return '', '', '', ''
|
||||
parts = [p.strip() for p in raw.split(',') if p.strip()]
|
||||
city = parts[0] if len(parts) >= 1 else ''
|
||||
state = parts[1] if len(parts) >= 2 else ''
|
||||
country = parts[2] if len(parts) >= 3 else ''
|
||||
return city, state, country, raw
|
||||
|
||||
def ensure_default_automation_rules(conn):
|
||||
defaults = [
|
||||
{
|
||||
@@ -662,6 +742,11 @@ def _upsert_contact_from_fundraising(conn, investor_name, contact, actor_user_id
|
||||
full_name = str(contact.get('name') or '').strip()
|
||||
email = str(contact.get('email') or '').strip()
|
||||
title = str(contact.get('title') or '').strip()
|
||||
source = str(contact.get('source') or '').strip()
|
||||
city = str(contact.get('city') or '').strip()
|
||||
state = str(contact.get('state') or '').strip()
|
||||
country = str(contact.get('country') or '').strip()
|
||||
location_query = str(contact.get('location_query') or '').strip()
|
||||
if not full_name and not email:
|
||||
return None
|
||||
first_name, last_name = _split_full_name(full_name)
|
||||
@@ -700,20 +785,25 @@ def _upsert_contact_from_fundraising(conn, investor_name, contact, actor_user_id
|
||||
next_last = last_name if (last_name or full_name) else str(existing['last_name'] or '')
|
||||
next_email = email or str(existing['email'] or '')
|
||||
next_title = title or str(existing['title'] or '')
|
||||
next_source = source or str(existing['source'] or '')
|
||||
next_city = city or str(existing['city'] or '')
|
||||
next_state = state or str(existing['state'] or '')
|
||||
next_country = country or str(existing['country'] or '')
|
||||
next_location_query = location_query or str(existing['location_query'] or '')
|
||||
next_org = org_id or existing['organization_id']
|
||||
conn.execute("""
|
||||
UPDATE contacts
|
||||
SET first_name = ?, last_name = ?, email = ?, title = ?,
|
||||
organization_id = ?, contact_type = 'investor', updated_at = ?
|
||||
organization_id = ?, source = ?, contact_type = 'investor', city = ?, state = ?, country = ?, location_query = ?, updated_at = ?
|
||||
WHERE id = ?
|
||||
""", (next_first, next_last, next_email, next_title, next_org, now(), existing['id']))
|
||||
""", (next_first, next_last, next_email, next_title, next_org, next_source, next_city, next_state, next_country, next_location_query, now(), existing['id']))
|
||||
return existing['id']
|
||||
|
||||
contact_id = generate_id()
|
||||
conn.execute("""
|
||||
INSERT INTO contacts (
|
||||
id, first_name, last_name, email, title, organization_id, contact_type, status, created_by, updated_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, 'investor', 'active', ?, ?)
|
||||
id, first_name, last_name, email, title, organization_id, source, contact_type, status, city, state, country, location_query, created_by, updated_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, 'investor', 'active', ?, ?, ?, ?, ?, ?)
|
||||
""", (
|
||||
contact_id,
|
||||
first_name or 'Unknown',
|
||||
@@ -721,6 +811,11 @@ def _upsert_contact_from_fundraising(conn, investor_name, contact, actor_user_id
|
||||
email,
|
||||
title,
|
||||
org_id,
|
||||
source,
|
||||
city,
|
||||
state,
|
||||
country,
|
||||
location_query,
|
||||
actor_user_id,
|
||||
now()
|
||||
))
|
||||
@@ -748,6 +843,11 @@ def _sync_contact_to_fundraising_state(conn, contact_row, actor_user_id=None, re
|
||||
email = str(contact_row.get('email') or '').strip()
|
||||
full_name = ' '.join([str(contact_row.get('first_name') or '').strip(), str(contact_row.get('last_name') or '').strip()]).strip()
|
||||
title = str(contact_row.get('title') or '').strip()
|
||||
source = str(contact_row.get('source') or '').strip()
|
||||
city = str(contact_row.get('city') or '').strip()
|
||||
state = str(contact_row.get('state') or '').strip()
|
||||
country = str(contact_row.get('country') or '').strip()
|
||||
location_query = str(contact_row.get('location_query') or '').strip()
|
||||
if not full_name and not email:
|
||||
return
|
||||
|
||||
@@ -814,22 +914,24 @@ def _sync_contact_to_fundraising_state(conn, contact_row, actor_user_id=None, re
|
||||
"name": full_name or existing.get('name') or '',
|
||||
"email": email or existing.get('email') or '',
|
||||
"title": title or existing.get('title') or '',
|
||||
"city": str(existing.get('city') or ''),
|
||||
"state": str(existing.get('state') or ''),
|
||||
"country": str(existing.get('country') or ''),
|
||||
"location_query": str(existing.get('location_query') or '')
|
||||
"city": city or str(existing.get('city') or ''),
|
||||
"state": state or str(existing.get('state') or ''),
|
||||
"country": country or str(existing.get('country') or ''),
|
||||
"location_query": location_query or str(existing.get('location_query') or '')
|
||||
}
|
||||
else:
|
||||
next_contacts.append({
|
||||
"name": full_name,
|
||||
"email": email,
|
||||
"title": title,
|
||||
"city": "",
|
||||
"state": "",
|
||||
"country": "",
|
||||
"location_query": ""
|
||||
"city": city,
|
||||
"state": state,
|
||||
"country": country,
|
||||
"location_query": location_query
|
||||
})
|
||||
inv['contacts'] = next_contacts
|
||||
if source and not str(inv.get('lead_source') or '').strip():
|
||||
inv['lead_source'] = source
|
||||
changed = True
|
||||
|
||||
if not changed:
|
||||
@@ -901,6 +1003,7 @@ def sync_fundraising_relational(conn, grid, views, actor_user_id=None):
|
||||
investor_name = str(row.get('investor_name') or '').strip() or 'Untitled Investor'
|
||||
notes = str(row.get('notes') or '')
|
||||
lead = str(row.get('lead') or '')
|
||||
lead_source = str(row.get('lead_source') or row.get('combined_lead_source') or '').strip()
|
||||
total_invested = 0.0
|
||||
for _, col in fund_columns:
|
||||
total_invested += _to_number(row.get(str(col.get('id'))))
|
||||
@@ -909,13 +1012,17 @@ def sync_fundraising_relational(conn, grid, views, actor_user_id=None):
|
||||
investor_id = existing['id'] if existing else generate_id()
|
||||
conn.execute("""
|
||||
INSERT INTO fundraising_investors (
|
||||
id, investor_name, notes, lead, priority, follow_up, graveyard,
|
||||
id, investor_name, notes, lead, lead_source, priority, follow_up, graveyard,
|
||||
source_row_id, total_invested, updated_at
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(source_row_id) DO UPDATE SET
|
||||
investor_name = excluded.investor_name,
|
||||
notes = excluded.notes,
|
||||
lead = excluded.lead,
|
||||
lead_source = CASE
|
||||
WHEN COALESCE(trim(fundraising_investors.lead_source), '') = '' THEN excluded.lead_source
|
||||
ELSE fundraising_investors.lead_source
|
||||
END,
|
||||
priority = excluded.priority,
|
||||
follow_up = excluded.follow_up,
|
||||
graveyard = excluded.graveyard,
|
||||
@@ -926,6 +1033,7 @@ def sync_fundraising_relational(conn, grid, views, actor_user_id=None):
|
||||
investor_name,
|
||||
notes,
|
||||
lead,
|
||||
lead_source,
|
||||
1 if _to_bool(row.get('priority')) else 0,
|
||||
1 if _to_bool(row.get('follow_up')) else 0,
|
||||
1 if _to_bool(row.get('graveyard')) else 0,
|
||||
@@ -947,7 +1055,10 @@ def sync_fundraising_relational(conn, grid, views, actor_user_id=None):
|
||||
email = str(c.get('email') or '').strip()
|
||||
if not full_name and not email:
|
||||
continue
|
||||
_upsert_contact_from_fundraising(conn, investor_name, c, actor_user_id=actor_user_id)
|
||||
contact_payload = dict(c)
|
||||
if lead_source and not str(contact_payload.get('source') or '').strip():
|
||||
contact_payload['source'] = lead_source
|
||||
_upsert_contact_from_fundraising(conn, investor_name, contact_payload, actor_user_id=actor_user_id)
|
||||
conn.execute("""
|
||||
INSERT INTO fundraising_contacts (
|
||||
id, investor_id, full_name, email, title, city, state, country, location_query, sort_order, updated_at
|
||||
@@ -958,7 +1069,7 @@ def sync_fundraising_relational(conn, grid, views, actor_user_id=None):
|
||||
str(c.get('location_query') or ''), i, now()
|
||||
))
|
||||
elif isinstance(contacts, str) and contacts.strip():
|
||||
_upsert_contact_from_fundraising(conn, investor_name, {"name": contacts.strip(), "email": "", "title": ""}, actor_user_id=actor_user_id)
|
||||
_upsert_contact_from_fundraising(conn, investor_name, {"name": contacts.strip(), "email": "", "title": "", "source": lead_source}, actor_user_id=actor_user_id)
|
||||
conn.execute("""
|
||||
INSERT INTO fundraising_contacts (
|
||||
id, investor_id, full_name, email, title, city, state, country, location_query, sort_order, updated_at
|
||||
@@ -1291,6 +1402,7 @@ DEFAULT_FUNDRAISING_COLUMNS = [
|
||||
{"id": "contacts", "label": "Contacts", "type": "contacts", "width": 260},
|
||||
{"id": "log_action", "label": "Log", "type": "action", "readOnly": True, "width": 90},
|
||||
{"id": "notes", "label": "Notes / Communication / Outreach", "type": "longtext", "width": 420},
|
||||
{"id": "lead_source", "label": "Lead Source", "type": "text", "width": 180},
|
||||
{"id": "notes_last_modified", "label": "Notes Last Modified", "type": "date", "readOnly": True, "width": 180},
|
||||
{"id": "last_communication_date", "label": "Last Communication Date", "type": "date", "readOnly": True, "width": 195},
|
||||
{"id": "priority", "label": "Priority", "type": "checkbox", "width": 110},
|
||||
@@ -1314,7 +1426,12 @@ DEFAULT_FUNDRAISING_ROWS = []
|
||||
|
||||
class CRMHandler(BaseHTTPRequestHandler):
|
||||
"""Main HTTP request handler for the CRM API."""
|
||||
_rate_limit_buckets = {}
|
||||
# Class-level state shared across all handler threads. Protected by
|
||||
# _abuse_lock; see rate_limited() and record_404() for usage.
|
||||
_rate_limit_buckets = {} # (scope, ip) -> [timestamps]
|
||||
_404_buckets = {} # ip -> [timestamps] of recent 404 responses
|
||||
_banned_ips = {} # ip -> ban_until_epoch
|
||||
_abuse_lock = threading.Lock()
|
||||
|
||||
def log_message(self, format, *args):
|
||||
"""Override to use cleaner logging."""
|
||||
@@ -1323,14 +1440,22 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
# ── Request Parsing ──
|
||||
|
||||
def get_body(self):
    """Return the request body parsed as JSON, reading the stream at most once.

    The parsed value is cached on ``self._cached_body`` so repeated calls do
    not try to re-read an already-consumed stream. An absent, zero, negative
    or non-numeric ``Content-Length`` yields ``{}``, as does a body that is
    not valid UTF-8 or not valid JSON.

    NOTE(review): the cache is never reset, so this assumes one handler
    instance serves one request. Confirm that keep-alive (HTTP/1.1
    ``protocol_version``) is not enabled for this handler.
    """
    if hasattr(self, '_cached_body'):
        return self._cached_body

    try:
        content_length = int(self.headers.get('Content-Length', 0))
    except (TypeError, ValueError):
        # Malformed header from the client: treat as "no body", don't raise.
        content_length = 0

    parsed = {}
    # A negative length would make rfile.read() block until EOF, so only
    # read when the client declared a positive size.
    if content_length > 0:
        raw = self.rfile.read(content_length)
        try:
            parsed = json.loads(raw.decode('utf-8'))
        except (UnicodeDecodeError, json.JSONDecodeError):
            # Invalid UTF-8 is not a JSONDecodeError; handle both the same way.
            parsed = {}

    self._cached_body = parsed
    return self._cached_body
|
||||
|
||||
def get_query_params(self):
|
||||
parsed = urlparse(self.path)
|
||||
@@ -1373,15 +1498,57 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
def rate_limited(self, scope, limit_per_minute):
    """Return True when this client has used up its quota for ``scope``.

    Keeps a 60-second sliding window of request timestamps per
    ``"<scope>:<client ip>"`` key in the shared ``_rate_limit_buckets`` dict,
    guarded by ``_abuse_lock``. A request that is allowed is recorded; a
    rejected one is not. Limits below 1 are treated as 1.
    """
    current = time.time()
    key = f"{scope}:{self.get_client_ip()}"
    allowed = max(1, int(limit_per_minute))
    with self._abuse_lock:
        window_start = current - 60.0
        recent = [stamp for stamp in self._rate_limit_buckets.get(key, [])
                  if stamp >= window_start]
        over_limit = len(recent) >= allowed
        if not over_limit:
            recent.append(current)
        # Store the pruned window in both cases so stale entries are dropped.
        self._rate_limit_buckets[key] = recent
    return over_limit
|
||||
|
||||
def is_banned(self):
    """Report whether the client IP is currently on the abuse blacklist.

    Looks the IP up in the shared ``_banned_ips`` dict (ip -> ban-until
    epoch) under ``_abuse_lock``. An entry whose ban window has passed is
    removed on the spot and the client is treated as not banned.
    """
    client = self.get_client_ip()
    current = time.time()
    with self._abuse_lock:
        expiry = self._banned_ips.get(client)
        if expiry is None:
            return False
        still_banned = not (current >= expiry)
        if not still_banned:
            # Ban has lapsed: forget the entry so the dict does not keep it.
            self._banned_ips.pop(client, None)
        return still_banned
|
||||
|
||||
def record_404(self):
    """Count a 404 for this client and ban the IP on a burst.

    Timestamps of recent 404s are kept per IP in ``_404_buckets`` and pruned
    to the last ``ABUSE_404_WINDOW_SEC`` seconds. Once the count, including
    this one, reaches ``ABUSE_404_THRESHOLD``, the IP is written to
    ``_banned_ips`` with an expiry ``ABUSE_BAN_SEC`` seconds from now, its
    404 bucket is discarded, and a line is written to stderr. All shared
    state is touched under ``_abuse_lock``.
    """
    client = self.get_client_ip()
    current = time.time()
    with self._abuse_lock:
        window_start = current - ABUSE_404_WINDOW_SEC
        hits = [stamp for stamp in self._404_buckets.get(client, [])
                if stamp >= window_start]
        hits.append(current)

        if len(hits) < ABUSE_404_THRESHOLD:
            # Below the burst threshold: just remember the pruned window.
            self._404_buckets[client] = hits
            return

        self._banned_ips[client] = current + ABUSE_BAN_SEC
        self._404_buckets.pop(client, None)
        sys.stderr.write(
            f"[abuse] Banning {client} for {ABUSE_BAN_SEC}s after "
            f"{len(hits)} 404s in {ABUSE_404_WINDOW_SEC}s\n"
        )
|
||||
|
||||
# ── Response Helpers ──
|
||||
|
||||
@@ -1395,6 +1562,14 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
self.wfile.write(json.dumps(data, default=str).encode('utf-8'))
|
||||
|
||||
def send_error_json(self, message, status=400):
    """Send ``{"error": message}`` as a JSON response with the given status.

    404 responses are first reported to ``record_404`` so that every code
    path that ends in a 404 (including unknown POST paths) feeds the abuse
    burst counter. This happens on send rather than during routing.

    Args:
        message: Human-readable error text placed under the ``error`` key.
        status: HTTP status code for the response (default 400).
    """
    if status == 404:
        try:
            self.record_404()
        except Exception as exc:
            # Abuse tracking is best-effort and must never block the error
            # response, but a failure here should still leave a trace.
            sys.stderr.write(f"[abuse] record_404 failed: {exc!r}\n")
    self.send_json({"error": message}, status)
|
||||
|
||||
def send_file(self, filepath, content_type='text/html'):
|
||||
@@ -1412,6 +1587,8 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
# ── Routing ──
|
||||
|
||||
def do_OPTIONS(self):
|
||||
if self.is_banned():
|
||||
return self.send_error_json("Too many requests", 429)
|
||||
self.send_response(200)
|
||||
self.send_header('Access-Control-Allow-Origin', CORS_ORIGIN)
|
||||
self.send_header('Access-Control-Allow-Methods', 'GET, POST, PUT, PATCH, DELETE, OPTIONS')
|
||||
@@ -1419,8 +1596,25 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
self.end_headers()
|
||||
|
||||
def do_GET(self):
|
||||
# Short-circuit known abusers before any work, including file reads.
|
||||
if self.is_banned():
|
||||
return self.send_error_json("Too many requests", 429)
|
||||
# Generic per-IP GET rate limit. Generous enough for a normal user
|
||||
# (page load fans out ~15 GETs + heartbeats every ~6s) but blocks a
|
||||
# scanner blasting hundreds of paths per second.
|
||||
if self.rate_limited('get', GET_RATE_LIMIT_PER_MIN):
|
||||
return self.send_error_json("Too many requests", 429)
|
||||
|
||||
path = self.get_path()
|
||||
|
||||
# ─── Gmail integration routes (feature-flag-guarded) ─────────
|
||||
try:
|
||||
from email_integration.routes import try_handle as _email_try_handle
|
||||
if _email_try_handle(self):
|
||||
return
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Serve frontend
|
||||
if path == '/' or path == '/index.html':
|
||||
return self.send_file(os.path.join(FRONTEND_DIR, 'index.html'))
|
||||
@@ -1510,6 +1704,8 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
# Fundraising grid state
|
||||
if path == '/api/fundraising/state':
|
||||
return self.handle_get_fundraising_state(user)
|
||||
if path == '/api/fundraising/collab/state':
|
||||
return self.handle_get_fundraising_collab_state(user)
|
||||
if path == '/api/fundraising/export':
|
||||
return self.handle_export_fundraising_state(user)
|
||||
if path == '/api/fundraising/backups':
|
||||
@@ -1538,12 +1734,23 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
self.send_error_json("Not found", 404)
|
||||
|
||||
def do_POST(self):
|
||||
if self.is_banned():
|
||||
return self.send_error_json("Too many requests", 429)
|
||||
|
||||
path = self.get_path()
|
||||
body = self.get_body()
|
||||
|
||||
if self.rate_limited('write', WRITE_RATE_LIMIT_PER_MIN):
|
||||
return self.send_error_json("Too many requests", 429)
|
||||
|
||||
# ─── Gmail integration routes (feature-flag-guarded) ─────────
|
||||
try:
|
||||
from email_integration.routes import try_handle as _email_try_handle
|
||||
if _email_try_handle(self):
|
||||
return
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Auth (no token needed)
|
||||
if path == '/api/auth/login':
|
||||
if self.rate_limited('login', LOGIN_RATE_LIMIT_PER_MIN):
|
||||
@@ -1573,8 +1780,12 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
return self.handle_create_feature_request(user, body)
|
||||
if path == '/api/fundraising/log-communication':
|
||||
return self.handle_log_fundraising_communication(user, body)
|
||||
if path == '/api/fundraising/collab/heartbeat':
|
||||
return self.handle_fundraising_collab_heartbeat(user, body)
|
||||
if path == '/api/admin/users':
|
||||
return self.handle_admin_create_user(user, body)
|
||||
if path == '/api/admin/reset-all-data':
|
||||
return self.handle_admin_reset_all_data(user, body)
|
||||
if path == '/api/fundraising/backup':
|
||||
return self.handle_backup_fundraising_state(user)
|
||||
if path == '/api/fundraising/restore-preview':
|
||||
@@ -1587,6 +1798,8 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
self.send_error_json("Not found", 404)
|
||||
|
||||
def do_PUT(self):
|
||||
if self.is_banned():
|
||||
return self.send_error_json("Too many requests", 429)
|
||||
path = self.get_path()
|
||||
body = self.get_body()
|
||||
if self.rate_limited('write', WRITE_RATE_LIMIT_PER_MIN):
|
||||
@@ -1611,6 +1824,8 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
self.send_error_json("Not found", 404)
|
||||
|
||||
def do_PATCH(self):
|
||||
if self.is_banned():
|
||||
return self.send_error_json("Too many requests", 429)
|
||||
path = self.get_path()
|
||||
body = self.get_body()
|
||||
if self.rate_limited('write', WRITE_RATE_LIMIT_PER_MIN):
|
||||
@@ -1637,6 +1852,8 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
self.send_error_json("Not found", 404)
|
||||
|
||||
def do_DELETE(self):
|
||||
if self.is_banned():
|
||||
return self.send_error_json("Too many requests", 429)
|
||||
path = self.get_path()
|
||||
if self.rate_limited('write', WRITE_RATE_LIMIT_PER_MIN):
|
||||
return self.send_error_json("Too many requests", 429)
|
||||
@@ -1761,8 +1978,8 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
args.append(params['status'])
|
||||
if params.get('search'):
|
||||
search = f"%{params['search']}%"
|
||||
query += " AND (c.first_name LIKE ? OR c.last_name LIKE ? OR c.email LIKE ? OR o.name LIKE ?)"
|
||||
args.extend([search, search, search, search])
|
||||
query += " AND (c.first_name LIKE ? OR c.last_name LIKE ? OR c.email LIKE ? OR o.name LIKE ? OR c.source LIKE ?)"
|
||||
args.extend([search, search, search, search, search])
|
||||
if params.get('organization_id'):
|
||||
query += " AND c.organization_id = ?"
|
||||
args.append(params['organization_id'])
|
||||
@@ -1772,7 +1989,7 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
|
||||
sort = params.get('sort', 'updated_at')
|
||||
order = 'DESC' if params.get('order', 'desc').lower() == 'desc' else 'ASC'
|
||||
allowed_sorts = ['first_name', 'last_name', 'email', 'created_at', 'updated_at', 'contact_type']
|
||||
allowed_sorts = ['first_name', 'last_name', 'email', 'created_at', 'updated_at', 'contact_type', 'source']
|
||||
if sort in allowed_sorts:
|
||||
query += f" ORDER BY c.{sort} {order}"
|
||||
else:
|
||||
@@ -1837,20 +2054,25 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
|
||||
contact_id = generate_id()
|
||||
conn = get_db()
|
||||
organization_id = body.get('organization_id')
|
||||
if not organization_id and body.get('organization'):
|
||||
organization_id = _ensure_organization_by_name(conn, body.get('organization'), user['user_id'])
|
||||
|
||||
tags = json.dumps(body.get('tags', []))
|
||||
conn.execute("""
|
||||
INSERT INTO contacts (id, first_name, last_name, email, phone, mobile, title,
|
||||
organization_id, contact_type, status, source, tags, notes, linkedin_url,
|
||||
preferred_contact, created_by)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
city, state, country, location_query, preferred_contact, created_by)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", (
|
||||
contact_id, body['first_name'], body['last_name'],
|
||||
body.get('email'), body.get('phone'), body.get('mobile'),
|
||||
body.get('title'), body.get('organization_id'),
|
||||
body.get('title'), organization_id,
|
||||
body.get('contact_type', 'prospect'), body.get('status', 'active'),
|
||||
body.get('source'), tags, body.get('notes'),
|
||||
body.get('linkedin_url'), body.get('preferred_contact', 'email'),
|
||||
body.get('linkedin_url'), body.get('city'), body.get('state'),
|
||||
body.get('country'), body.get('location_query'),
|
||||
body.get('preferred_contact', 'email'),
|
||||
user['user_id']
|
||||
))
|
||||
|
||||
@@ -1879,13 +2101,17 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
|
||||
updatable = ['first_name', 'last_name', 'email', 'phone', 'mobile', 'title',
|
||||
'organization_id', 'contact_type', 'status', 'source', 'notes',
|
||||
'linkedin_url', 'preferred_contact']
|
||||
'linkedin_url', 'city', 'state', 'country', 'location_query', 'preferred_contact']
|
||||
sets = []
|
||||
args = []
|
||||
for field in updatable:
|
||||
if field in body:
|
||||
sets.append(f"{field} = ?")
|
||||
args.append(body[field])
|
||||
if 'organization' in body and 'organization_id' not in body:
|
||||
org_id = _ensure_organization_by_name(conn, body.get('organization'), user['user_id'])
|
||||
sets.append("organization_id = ?")
|
||||
args.append(org_id)
|
||||
|
||||
if 'tags' in body:
|
||||
sets.append("tags = ?")
|
||||
@@ -2876,12 +3102,17 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
entity_type = body.get('entity_type', 'contacts')
|
||||
mapping = body.get('mapping', {})
|
||||
dry_run = body.get('dry_run', False)
|
||||
update_existing = bool(body.get('update_existing', True))
|
||||
action_overrides_raw = body.get('action_overrides', {}) or {}
|
||||
|
||||
if not csv_data:
|
||||
return self.send_error_json("No data provided. Send 'data' as array of objects.")
|
||||
|
||||
conn = get_db()
|
||||
results = {"created": 0, "updated": 0, "skipped": 0, "errors": []}
|
||||
results = {"created": 0, "updated": 0, "skipped": 0, "errors": [], "matches": []}
|
||||
# Keep in-memory email matches so dry-run mirrors real behavior for
|
||||
# duplicate emails appearing multiple times in the same CSV batch.
|
||||
batch_email_matches = {}
|
||||
|
||||
try:
|
||||
for i, row in enumerate(csv_data):
|
||||
@@ -2911,11 +3142,44 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
continue
|
||||
|
||||
email = data.get('email', '').strip()
|
||||
email_key = email.lower()
|
||||
linkedin_url = data.get('linkedin_url', data.get('linkedin', '')).strip()
|
||||
city = data.get('city', '').strip()
|
||||
state = data.get('state', '').strip()
|
||||
country = data.get('country', '').strip()
|
||||
location_query = data.get('location_query', '').strip()
|
||||
raw_location = data.get('location', data.get('city_location', data.get('city/location', ''))).strip()
|
||||
if raw_location:
|
||||
p_city, p_state, p_country, p_query = _parse_location_text(raw_location)
|
||||
city = city or p_city
|
||||
state = state or p_state
|
||||
country = country or p_country
|
||||
location_query = location_query or p_query
|
||||
|
||||
# Check for existing contact by email
|
||||
existing = None
|
||||
existing_summary = None
|
||||
if email:
|
||||
existing = conn.execute("SELECT id FROM contacts WHERE email = ?", (email,)).fetchone()
|
||||
if email_key in batch_email_matches:
|
||||
existing_summary = batch_email_matches[email_key]
|
||||
existing = {"id": existing_summary.get('id')}
|
||||
else:
|
||||
existing = conn.execute("""
|
||||
SELECT c.id, c.first_name, c.last_name, c.email, o.name as organization_name
|
||||
FROM contacts c
|
||||
LEFT JOIN organizations o ON c.organization_id = o.id
|
||||
WHERE lower(c.email) = lower(?)
|
||||
ORDER BY c.updated_at DESC
|
||||
LIMIT 1
|
||||
""", (email,)).fetchone()
|
||||
if existing:
|
||||
existing_summary = {
|
||||
"id": existing['id'],
|
||||
"name": f"{str(existing['first_name'] or '').strip()} {str(existing['last_name'] or '').strip()}".strip(),
|
||||
"email": str(existing['email'] or ''),
|
||||
"organization": str(existing['organization_name'] or '')
|
||||
}
|
||||
batch_email_matches[email_key] = existing_summary
|
||||
|
||||
# Handle organization
|
||||
org_id = None
|
||||
@@ -2931,32 +3195,149 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
(org_id, org_name, user['user_id'])
|
||||
)
|
||||
|
||||
action_override = None
|
||||
if isinstance(action_overrides_raw, dict):
|
||||
action_override = action_overrides_raw.get(str(i + 1)) or action_overrides_raw.get(i + 1)
|
||||
default_action = 'update' if update_existing else 'skip'
|
||||
action = action_override if action_override in ('update', 'skip', 'create_duplicate') else default_action
|
||||
if existing:
|
||||
incoming_name = f"{first_name} {last_name}".strip()
|
||||
results['matches'].append({
|
||||
"row": i + 1,
|
||||
"incoming_name": incoming_name,
|
||||
"incoming_email": email,
|
||||
"incoming_organization": org_name,
|
||||
"existing_id": existing_summary.get('id') if isinstance(existing_summary, dict) else existing['id'],
|
||||
"existing_name": existing_summary.get('name') if isinstance(existing_summary, dict) else '',
|
||||
"existing_email": existing_summary.get('email') if isinstance(existing_summary, dict) else email,
|
||||
"existing_organization": existing_summary.get('organization') if isinstance(existing_summary, dict) else '',
|
||||
"default_action": default_action,
|
||||
"action": action
|
||||
})
|
||||
|
||||
if not dry_run:
|
||||
if existing:
|
||||
conn.execute("""
|
||||
UPDATE contacts SET first_name=?, last_name=?, phone=?, title=?,
|
||||
organization_id=COALESCE(?, organization_id),
|
||||
contact_type=COALESCE(?, contact_type), updated_at=?
|
||||
WHERE id=?
|
||||
""", (first_name, last_name, data.get('phone'),
|
||||
data.get('title'), org_id,
|
||||
data.get('contact_type'), now(), existing['id']))
|
||||
results['updated'] += 1
|
||||
if action == 'update':
|
||||
conn.execute("""
|
||||
UPDATE contacts SET first_name=?, last_name=?, phone=?, title=?,
|
||||
organization_id=COALESCE(?, organization_id),
|
||||
contact_type=COALESCE(?, contact_type),
|
||||
linkedin_url=COALESCE(?, linkedin_url),
|
||||
city=COALESCE(?, city),
|
||||
state=COALESCE(?, state),
|
||||
country=COALESCE(?, country),
|
||||
location_query=COALESCE(?, location_query),
|
||||
updated_at=?
|
||||
WHERE id=?
|
||||
""", (first_name, last_name, data.get('phone'),
|
||||
data.get('title'), org_id,
|
||||
data.get('contact_type'),
|
||||
linkedin_url if linkedin_url else None,
|
||||
city if city else None,
|
||||
state if state else None,
|
||||
country if country else None,
|
||||
location_query if location_query else None,
|
||||
now(), existing['id']))
|
||||
if email:
|
||||
batch_email_matches[email_key] = {
|
||||
"id": existing['id'],
|
||||
"name": f"{first_name} {last_name}".strip(),
|
||||
"email": email,
|
||||
"organization": org_name
|
||||
}
|
||||
updated_contact = row_to_dict(conn.execute("""
|
||||
SELECT c.*, o.name as organization_name
|
||||
FROM contacts c LEFT JOIN organizations o ON c.organization_id = o.id
|
||||
WHERE c.id = ?
|
||||
""", (existing['id'],)).fetchone())
|
||||
_sync_contact_to_fundraising_state(conn, updated_contact, actor_user_id=user['user_id'], remove=False)
|
||||
results['updated'] += 1
|
||||
elif action == 'create_duplicate':
|
||||
contact_id = generate_id()
|
||||
conn.execute("""
|
||||
INSERT INTO contacts (id, first_name, last_name, email, phone,
|
||||
title, organization_id, contact_type, status, source,
|
||||
linkedin_url, city, state, country, location_query, created_by)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'active', 'import', ?, ?, ?, ?, ?, ?)
|
||||
""", (contact_id, first_name, last_name, email,
|
||||
data.get('phone'), data.get('title'), org_id,
|
||||
data.get('contact_type', 'prospect'), linkedin_url,
|
||||
city, state, country, location_query, user['user_id']))
|
||||
if email:
|
||||
batch_email_matches[email_key] = {
|
||||
"id": contact_id,
|
||||
"name": f"{first_name} {last_name}".strip(),
|
||||
"email": email,
|
||||
"organization": org_name
|
||||
}
|
||||
created_contact = row_to_dict(conn.execute("""
|
||||
SELECT c.*, o.name as organization_name
|
||||
FROM contacts c LEFT JOIN organizations o ON c.organization_id = o.id
|
||||
WHERE c.id = ?
|
||||
""", (contact_id,)).fetchone())
|
||||
_sync_contact_to_fundraising_state(conn, created_contact, actor_user_id=user['user_id'], remove=False)
|
||||
results['created'] += 1
|
||||
else:
|
||||
results['skipped'] += 1
|
||||
results['errors'].append(f"Row {i+1}: Existing contact matched by email; skipped")
|
||||
else:
|
||||
contact_id = generate_id()
|
||||
conn.execute("""
|
||||
INSERT INTO contacts (id, first_name, last_name, email, phone,
|
||||
title, organization_id, contact_type, status, source, created_by)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'active', 'import', ?)
|
||||
title, organization_id, contact_type, status, source,
|
||||
linkedin_url, city, state, country, location_query, created_by)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'active', 'import', ?, ?, ?, ?, ?, ?)
|
||||
""", (contact_id, first_name, last_name, email,
|
||||
data.get('phone'), data.get('title'), org_id,
|
||||
data.get('contact_type', 'prospect'), user['user_id']))
|
||||
data.get('contact_type', 'prospect'), linkedin_url,
|
||||
city, state, country, location_query, user['user_id']))
|
||||
if email:
|
||||
batch_email_matches[email_key] = {
|
||||
"id": contact_id,
|
||||
"name": f"{first_name} {last_name}".strip(),
|
||||
"email": email,
|
||||
"organization": org_name
|
||||
}
|
||||
created_contact = row_to_dict(conn.execute("""
|
||||
SELECT c.*, o.name as organization_name
|
||||
FROM contacts c LEFT JOIN organizations o ON c.organization_id = o.id
|
||||
WHERE c.id = ?
|
||||
""", (contact_id,)).fetchone())
|
||||
_sync_contact_to_fundraising_state(conn, created_contact, actor_user_id=user['user_id'], remove=False)
|
||||
results['created'] += 1
|
||||
else:
|
||||
if existing:
|
||||
results['updated'] += 1
|
||||
if action == 'update':
|
||||
results['updated'] += 1
|
||||
if email:
|
||||
batch_email_matches[email_key] = {
|
||||
"id": existing['id'],
|
||||
"name": f"{first_name} {last_name}".strip(),
|
||||
"email": email,
|
||||
"organization": org_name
|
||||
}
|
||||
elif action == 'create_duplicate':
|
||||
results['created'] += 1
|
||||
if email:
|
||||
batch_email_matches[email_key] = {
|
||||
"id": f"dryrun-{i+1}",
|
||||
"name": f"{first_name} {last_name}".strip(),
|
||||
"email": email,
|
||||
"organization": org_name
|
||||
}
|
||||
else:
|
||||
results['skipped'] += 1
|
||||
results['errors'].append(f"Row {i+1}: Existing contact matched by email; would be skipped")
|
||||
else:
|
||||
results['created'] += 1
|
||||
if email:
|
||||
# Simulate that the row now exists for subsequent duplicate-email rows.
|
||||
batch_email_matches[email_key] = {
|
||||
"id": f"dryrun-{i+1}",
|
||||
"name": f"{first_name} {last_name}".strip(),
|
||||
"email": email,
|
||||
"organization": org_name
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
results['errors'].append(f"Row {i+1}: {str(e)}")
|
||||
@@ -3141,6 +3522,58 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
conn.close()
|
||||
return self.send_json({"data": updated})
|
||||
|
||||
def handle_admin_reset_all_data(self, user, body):
    """Wipe CRM data tables and reset the fundraising grid to its defaults.

    Admin-only. The request body must carry ``confirm_phrase`` equal to
    ``RESET ALL DATA``. Before deleting anything, the current
    ``fundraising_state`` row (if any) is written to a backup file. All
    deletes, the grid reset and the audit-log entry are committed together;
    any failure rolls the whole operation back and yields a 500.

    Args:
        user: Authenticated user dict; ``user['user_id']`` is recorded as
            the actor.
        body: Parsed request body (expected to be a dict).

    Returns:
        The result of ``send_json`` / ``send_error_json``.
    """
    if not require_admin(user):
        return self.send_error_json("Admin only", 403)

    # A missing or non-object body is treated as "no confirmation given".
    payload = body if isinstance(body, dict) else {}
    confirm_phrase = str(payload.get('confirm_phrase') or '').strip()
    if confirm_phrase != 'RESET ALL DATA':
        return self.send_error_json("Confirmation phrase must be exactly: RESET ALL DATA", 400)

    conn = get_db()
    failure = None
    pre_backup = None
    try:
        ensure_fundraising_state_row(conn)
        state = conn.execute("SELECT * FROM fundraising_state WHERE id = 'main'").fetchone()
        # NOTE(review): the backup kind is "pre_restore" although this is a
        # reset -- confirm that label is intended.
        pre_backup = create_fundraising_backup_file(state, kind="pre_restore") if state else None

        conn.execute("DELETE FROM communications")
        conn.execute("DELETE FROM opportunities")
        conn.execute("DELETE FROM lp_profiles")
        conn.execute("DELETE FROM custom_field_values")
        conn.execute("DELETE FROM custom_fields")
        conn.execute("DELETE FROM feature_requests")
        conn.execute("DELETE FROM contacts")
        conn.execute("DELETE FROM organizations")

        default_grid = {
            "columns": deep_copy_json(DEFAULT_FUNDRAISING_COLUMNS),
            "rows": deep_copy_json(DEFAULT_FUNDRAISING_ROWS)
        }
        default_views = sanitize_grid_views(deep_copy_json(DEFAULT_GRID_VIEWS))
        conn.execute("""
            UPDATE fundraising_state
            SET grid_json = ?, views_json = ?, version = COALESCE(version, 1) + 1, updated_by = ?, updated_at = ?
            WHERE id = 'main'
        """, (json.dumps(default_grid), json.dumps(default_views), user['user_id'], now()))
        sync_fundraising_relational(conn, default_grid, default_views, actor_user_id=user['user_id'])

        log_audit(conn, user['user_id'], 'system', 'all-data', 'reset', {
            "pre_backup": pre_backup['filename'] if pre_backup else None
        })
        conn.commit()
    except Exception as exc:
        # Keep the exception past the except block; the name `exc` is
        # unbound once the handler exits.
        failure = exc
        conn.rollback()
    finally:
        # Always release the connection, even if rollback itself raises.
        conn.close()

    if failure is not None:
        return self.send_error_json(f"Failed to reset data: {str(failure)}", 500)

    return self.send_json({
        "data": {
            "status": "ok",
            "pre_backup": pre_backup
        }
    })
|
||||
|
||||
def handle_list_audit_log(self, user, params):
|
||||
if not require_admin(user):
|
||||
return self.send_error_json("Admin access required", 403)
|
||||
@@ -3574,6 +4007,128 @@ class CRMHandler(BaseHTTPRequestHandler):
|
||||
}
|
||||
})
|
||||
|
||||
def _cleanup_fundraising_collab(self, conn):
|
||||
now_epoch = int(time.time())
|
||||
conn.execute("DELETE FROM fundraising_presence WHERE expires_at_epoch <= ?", (now_epoch,))
|
||||
conn.execute("DELETE FROM fundraising_cell_locks WHERE expires_at_epoch <= ?", (now_epoch,))
|
||||
|
||||
def _list_fundraising_collab_state(self, conn):
    """Read the current collaboration snapshot from the database.

    Returns:
        A dict with ``presence`` (rows of ``fundraising_presence``) and
        ``locks`` (rows of ``fundraising_cell_locks``), each ordered by
        ``last_seen_at`` descending. ``is_editing`` on every presence row
        is coerced to a real bool.
    """
    presence_cursor = conn.execute("""
        SELECT user_id, username, full_name, active_view, row_id, col_id, is_editing, cell_key, last_seen_at
        FROM fundraising_presence
        ORDER BY last_seen_at DESC
    """)
    presence = rows_to_list(presence_cursor.fetchall())

    locks_cursor = conn.execute("""
        SELECT cell_key, row_id, col_id, locked_by_user_id, locked_by_username, locked_by_full_name, last_seen_at
        FROM fundraising_cell_locks
        ORDER BY last_seen_at DESC
    """)
    locks = rows_to_list(locks_cursor.fetchall())

    # The column is stored as 0/1; expose it to clients as true/false.
    for entry in presence:
        entry['is_editing'] = bool(entry.get('is_editing'))

    return {"presence": presence, "locks": locks}
|
||||
|
||||
def handle_get_fundraising_collab_state(self, user):
    """Return the current presence/lock snapshot for the fundraising grid.

    Expired presence rows and cell locks are purged first, so the snapshot
    only contains live entries. The purge is committed before responding.

    Args:
        user: Authenticated user dict (not read by this handler).

    Returns:
        The result of ``send_json`` with ``{"data": snapshot}``.
    """
    conn = get_db()
    try:
        self._cleanup_fundraising_collab(conn)
        snapshot = self._list_fundraising_collab_state(conn)
        conn.commit()
    finally:
        # Release the connection even if a query or the commit raises.
        conn.close()
    return self.send_json({"data": snapshot})
|
||||
|
||||
def handle_fundraising_collab_heartbeat(self, user, body):
    """Record a collaborator heartbeat and return the shared collab snapshot.

    The heartbeat upserts this user's presence row and, when the client
    reports an edited cell, tries to take (or refresh) the lock on that
    cell. If another user holds an unexpired lock on the cell, no lock is
    taken, the user is recorded as not editing, and the conflicting lock is
    returned under ``lock_conflict``. Locks this user holds on other cells
    are released when the lock is taken or when the user is not editing.

    Args:
        user: Authenticated user dict; ``user['user_id']`` identifies the
            collaborator.
        body: Parsed request body. Recognised keys: ``active_view``,
            ``selected`` and ``editing`` (dicts with ``row_id`` /
            ``col_id``), and ``ttl_seconds`` (clamped to 10..120, default
            25; unparseable values fall back to 25).

    Returns:
        The result of ``send_json`` with the presence/locks snapshot plus
        ``lock_conflict`` (a lock row dict or None).
    """
    if not isinstance(body, dict):
        # Tolerate a missing or non-object body: treat it as an idle heartbeat.
        body = {}

    active_view = str(body.get('active_view') or '').strip()
    selected = body.get('selected') if isinstance(body.get('selected'), dict) else {}
    editing = body.get('editing') if isinstance(body.get('editing'), dict) else {}
    selected_row_id = str(selected.get('row_id') or '').strip()
    selected_col_id = str(selected.get('col_id') or '').strip()
    editing_row_id = str(editing.get('row_id') or '').strip()
    editing_col_id = str(editing.get('col_id') or '').strip()
    is_editing = bool(editing_row_id and editing_col_id)

    # The TTL is client-supplied: never let a malformed value abort the request.
    try:
        ttl_seconds = int(body.get('ttl_seconds') or 25)
    except (TypeError, ValueError, OverflowError):
        ttl_seconds = 25
    ttl_seconds = max(10, min(120, ttl_seconds))

    now_epoch = int(time.time())
    expires_at_epoch = now_epoch + ttl_seconds
    seen_at = now()
    lock_conflict = None

    conn = get_db()
    try:
        self._cleanup_fundraising_collab(conn)

        user_row = conn.execute("SELECT username, full_name FROM users WHERE id = ?", (user['user_id'],)).fetchone()
        username = str(user_row['username']) if user_row and user_row['username'] else str(user.get('username') or '')
        full_name = str(user_row['full_name']) if user_row and user_row['full_name'] else ''
        editing_cell_key = f"{editing_row_id}:{editing_col_id}" if is_editing else None

        if is_editing and editing_cell_key:
            # Is the cell already held by someone else with a live lock?
            existing_lock = conn.execute("""
                SELECT cell_key, row_id, col_id, locked_by_user_id, locked_by_username, locked_by_full_name, last_seen_at
                FROM fundraising_cell_locks
                WHERE cell_key = ? AND locked_by_user_id != ? AND expires_at_epoch > ?
                LIMIT 1
            """, (editing_cell_key, user['user_id'], now_epoch)).fetchone()
            if existing_lock:
                # Report the conflict and record this user as not editing.
                lock_conflict = row_to_dict(existing_lock)
                is_editing = False
                editing_cell_key = None
            else:
                # Take or refresh the lock on the edited cell.
                conn.execute("""
                    INSERT INTO fundraising_cell_locks (
                        cell_key, row_id, col_id, locked_by_user_id, locked_by_username, locked_by_full_name, last_seen_at, expires_at_epoch
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT(cell_key) DO UPDATE SET
                        row_id = excluded.row_id,
                        col_id = excluded.col_id,
                        locked_by_user_id = excluded.locked_by_user_id,
                        locked_by_username = excluded.locked_by_username,
                        locked_by_full_name = excluded.locked_by_full_name,
                        last_seen_at = excluded.last_seen_at,
                        expires_at_epoch = excluded.expires_at_epoch
                """, (editing_cell_key, editing_row_id, editing_col_id, user['user_id'], username, full_name, seen_at, expires_at_epoch))
                # Release locks this user still holds on any other cell.
                conn.execute("""
                    DELETE FROM fundraising_cell_locks
                    WHERE locked_by_user_id = ? AND cell_key != ?
                """, (user['user_id'], editing_cell_key))
        else:
            # Not editing: release every lock this user holds.
            conn.execute("DELETE FROM fundraising_cell_locks WHERE locked_by_user_id = ?", (user['user_id'],))

        conn.execute("""
            INSERT INTO fundraising_presence (
                user_id, username, full_name, active_view, row_id, col_id, is_editing, cell_key, last_seen_at, expires_at_epoch
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(user_id) DO UPDATE SET
                username = excluded.username,
                full_name = excluded.full_name,
                active_view = excluded.active_view,
                row_id = excluded.row_id,
                col_id = excluded.col_id,
                is_editing = excluded.is_editing,
                cell_key = excluded.cell_key,
                last_seen_at = excluded.last_seen_at,
                expires_at_epoch = excluded.expires_at_epoch
        """, (
            user['user_id'],
            username,
            full_name,
            active_view,
            selected_row_id or editing_row_id,
            selected_col_id or editing_col_id,
            1 if is_editing else 0,
            editing_cell_key,
            seen_at,
            expires_at_epoch
        ))

        snapshot = self._list_fundraising_collab_state(conn)
        conn.commit()
    finally:
        # Release the connection even if a statement or the commit raises.
        conn.close()

    return self.send_json({
        "data": {
            **snapshot,
            "lock_conflict": lock_conflict
        }
    })
|
||||
|
||||
def handle_update_fundraising_state(self, user, body):
|
||||
grid = body.get('grid', {})
|
||||
views = body.get('views')
|
||||
@@ -3948,7 +4503,20 @@ def main():
|
||||
print("Demo data seeding disabled (set CRM_SEED_DEMO_DATA=1 to enable).")
|
||||
start_backup_scheduler()
|
||||
|
||||
server = HTTPServer((HOST, PORT), CRMHandler)
|
||||
# ─── Gmail sync scheduler (feature-flag-guarded) ─────────────────
|
||||
if os.environ.get("CRM_GMAIL_INTEGRATION_ENABLED", "").lower() in ("1", "true", "yes", "on"):
|
||||
try:
|
||||
from email_integration.scheduler import start_sync_scheduler
|
||||
start_sync_scheduler()
|
||||
print("[email_integration] Gmail sync scheduler started")
|
||||
except Exception as _e:
|
||||
print(f"[email_integration] failed to start scheduler: {_e}")
|
||||
|
||||
# ThreadingHTTPServer lets one slow request (or a wave of scanner probes)
|
||||
# not block legit users. SQLite is opened per-request via get_db(), and
|
||||
# WAL mode allows concurrent readers + a single writer, so this is safe.
|
||||
server = ThreadingHTTPServer((HOST, PORT), CRMHandler)
|
||||
server.daemon_threads = True
|
||||
print(f"\n{'='*60}")
|
||||
print(f" Venture Fund CRM Server")
|
||||
print(f" Running at http://{HOST}:{PORT}")
|
||||
|
||||
Reference in New Issue
Block a user