"""
Investor matching.

Builds an in-memory index of investor email addresses from:
  - fundraising_contacts.email
  - contacts.email
  - organizations.email + organizations.website (domain only)

For each synced email, returns a list of investor links. Exact-email matches
beat domain matches; if any exact match exists, domain matches are suppressed.

The index is rebuilt every `REFRESH_INTERVAL_SEC` or on demand via rebuild().
"""

import re
import threading
import time
from dataclasses import dataclass
from typing import Optional

# Maximum age of the index, in seconds, before rebuild_if_stale() reloads it.
REFRESH_INTERVAL_SEC = 15 * 60  # 15 minutes

# Domains we never domain-match against (personal mailboxes): sharing one of
# these domains says nothing about which organization a sender belongs to.
COMMON_PERSONAL_DOMAINS = {
    # Google
    "gmail.com",
    "googlemail.com",
    # Microsoft
    "outlook.com",
    "hotmail.com",
    "live.com",
    "msn.com",
    # Yahoo
    "yahoo.com",
    "yahoo.co.uk",
    "ymail.com",
    # Apple
    "icloud.com",
    "me.com",
    "mac.com",
    # AOL
    "aol.com",
    # Proton
    "proton.me",
    "protonmail.com",
    "pm.me",
    # Other mailbox providers
    "fastmail.com",
    "tuta.io",
    "hey.com",
    "duck.com",
    "zoho.com",
}

# Also skip matching on the team's own domain (they email each other).
# That domain is not listed here: InvestorIndex receives it through its
# `own_domain` constructor argument.
# NOTE(review): presumably the caller passes CONFIG.workspace_domain — confirm.


@dataclass
class MatchTarget:
    """Identifiers of the record(s) an address resolved to.

    All fields are optional; which ones are filled depends on the source row:
    fundraising contacts set ``fundraising_contact_id``,
    ``fundraising_investor_id`` and ``investor_name``; contacts set
    ``contact_id`` and ``organization_id``; organizations set
    ``organization_id`` and ``investor_name``.
    """

    fundraising_investor_id: Optional[str] = None
    fundraising_contact_id: Optional[str] = None
    contact_id: Optional[str] = None
    organization_id: Optional[str] = None
    investor_name: Optional[str] = None


@dataclass
class InvestorLink:
    """One match between a synced email address and an investor record."""

    matched_address: str  # normalized (lower-cased, stripped) address that hit
    # exact_email | domain_match | manual
    # ("manual" is never produced by InvestorIndex.match itself.)
    match_kind: str
    match_confidence: float  # 1.0 for exact_email, 0.6 for domain_match
    target: MatchTarget


class InvestorIndex:
    """In-memory lookup from email addresses and mail domains to investors.

    Args:
        own_domain: The team's own mail domain. Addresses on it are never
            matched and it is never added to the domain index.
    """

    def __init__(self, own_domain: Optional[str] = None):
        self._email_index: dict[str, MatchTarget] = {}
        self._domain_index: dict[str, list[MatchTarget]] = {}
        self._own_domain = (own_domain or "").strip().lower() or None
        # 0.0 means "never built", so the first rebuild_if_stale() rebuilds.
        self._last_built = 0.0
        self._lock = threading.Lock()

    # ------------------------------------------------------------------ build

    def rebuild(self, db_conn_factory) -> None:
        """Reload both indexes from the database unconditionally.

        Args:
            db_conn_factory: Zero-argument callable returning a connection
                that offers ``cursor()`` and ``close()``. Rows returned by the
                cursor must support lookup by column name (``row["email"]``).

        Raises:
            Whatever the connection/cursor raises. In that case the previous
            indexes and build timestamp are left untouched.
        """
        with self._lock:
            self._rebuild_locked(db_conn_factory)

    def rebuild_if_stale(self, db_conn_factory) -> None:
        """Rebuild only if the last build is older than REFRESH_INTERVAL_SEC.

        Args:
            db_conn_factory: See :meth:`rebuild`.
        """
        if not self._is_stale():
            return
        with self._lock:
            # Re-check under the lock: another thread may have rebuilt while
            # this one was waiting, and a second full reload would be wasted.
            if self._is_stale():
                self._rebuild_locked(db_conn_factory)

    def _is_stale(self) -> bool:
        """Return True when the index is older than REFRESH_INTERVAL_SEC."""
        return time.time() - self._last_built > REFRESH_INTERVAL_SEC

    def _rebuild_locked(self, db_conn_factory) -> None:
        """Build fresh indexes and swap them in; caller must hold the lock."""
        email_idx: dict[str, MatchTarget] = {}
        domain_idx: dict[str, list[MatchTarget]] = {}

        conn = db_conn_factory()
        try:
            cur = conn.cursor()

            # fundraising_contacts
            cur.execute(
                "SELECT fc.id, fc.email, fc.investor_id, fi.investor_name "
                "FROM fundraising_contacts fc "
                "LEFT JOIN fundraising_investors fi ON fi.id = fc.investor_id "
                "WHERE fc.email IS NOT NULL AND fc.email != ''"
            )
            for row in cur.fetchall():
                addr = _normalize_address(row["email"] or "")
                if not _valid_email(addr):
                    continue
                email_idx[addr] = MatchTarget(
                    fundraising_contact_id=row["id"],
                    fundraising_investor_id=row["investor_id"],
                    investor_name=row["investor_name"],
                )

            # contacts
            cur.execute(
                "SELECT id, email, organization_id FROM contacts "
                "WHERE email IS NOT NULL AND email != ''"
            )
            for row in cur.fetchall():
                addr = _normalize_address(row["email"] or "")
                if not _valid_email(addr):
                    continue
                # Don't overwrite a fundraising_contact match; they're higher
                # signal.
                email_idx.setdefault(
                    addr,
                    MatchTarget(
                        contact_id=row["id"],
                        organization_id=row["organization_id"],
                    ),
                )

            # organizations — domain-only match source
            cur.execute(
                "SELECT id, name, email, website FROM organizations "
                "WHERE (email IS NOT NULL AND email != '') OR (website IS NOT NULL AND website != '')"
            )
            for row in cur.fetchall():
                for domain in _domains_for_org(row):
                    if domain in COMMON_PERSONAL_DOMAINS:
                        continue
                    if self._own_domain and domain == self._own_domain:
                        continue
                    domain_idx.setdefault(domain, []).append(
                        MatchTarget(
                            organization_id=row["id"],
                            investor_name=row["name"],
                        )
                    )
        finally:
            conn.close()

        # Only reached when every query succeeded, so a failed rebuild keeps
        # serving the previous index and stays "stale" for the next attempt.
        self._email_index = email_idx
        self._domain_index = domain_idx
        self._last_built = time.time()

    # ------------------------------------------------------------------ query

    def match(
        self,
        addresses: set[str],
        *,
        exclude_addresses: Optional[set[str]] = None,
    ) -> list[InvestorLink]:
        """Resolve a set of email addresses to investor links.

        Addresses are compared lower-cased and stripped. Exact-email hits win:
        if at least one exists, only those are returned. Otherwise each
        address's domain is looked up in the domain index, skipping personal
        mailbox domains.

        Args:
            addresses: Addresses to resolve; empty/None entries are ignored.
            exclude_addresses: Addresses to ignore, normalized the same way as
                ``addresses``.

        Returns:
            Links ordered by matched address (sorted), with at most one link
            per exact target and at most one per organization.
        """
        excluded = {
            _normalize_address(a) for a in (exclude_addresses or ()) if a
        }
        candidates = {
            _normalize_address(a) for a in (addresses or ()) if a
        } - excluded
        candidates.discard("")  # whitespace-only input

        # Exclude own domain addresses (teammates emailing each other).
        if self._own_domain:
            own_suffix = "@" + self._own_domain
            candidates = {a for a in candidates if not a.endswith(own_suffix)}

        # Sets iterate in arbitrary order; sort so the result (and which
        # address a deduplicated match is attributed to) is reproducible.
        ordered = sorted(candidates)

        links: list[InvestorLink] = []
        seen_targets: set[tuple] = set()

        # Exact email matches first.
        for addr in ordered:
            target = self._email_index.get(addr)
            if target is None:
                continue
            key = (target.fundraising_contact_id, target.contact_id)
            if key in seen_targets:
                continue
            seen_targets.add(key)
            links.append(
                InvestorLink(
                    matched_address=addr,
                    match_kind="exact_email",
                    match_confidence=1.0,
                    target=target,
                )
            )

        if links:
            # exact hits short-circuit domain matching
            return links

        # Domain fallback.
        for addr in ordered:
            domain = addr.partition("@")[2]
            if not domain or domain in COMMON_PERSONAL_DOMAINS:
                continue
            for target in self._domain_index.get(domain, ()):
                key = ("org", target.organization_id)
                if key in seen_targets:
                    continue
                seen_targets.add(key)
                links.append(
                    InvestorLink(
                        matched_address=addr,
                        match_kind="domain_match",
                        match_confidence=0.6,
                        target=target,
                    )
                )

        return links


# ---------------------------------------------------------------------------- helpers

_EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")

# Optional "<scheme>://", optional "www.", then the host up to the first
# path/port/query/fragment delimiter or whitespace.
_URL_HOST_RE = re.compile(
    r"^\s*(?:[a-z][a-z0-9+.\-]*://)?(?:www\.)?([^/:?#\s]+)",
    re.IGNORECASE,
)


def _normalize_address(address: str) -> str:
    """Return *address* lower-cased with surrounding whitespace removed."""
    return address.lower().strip()


def _valid_email(s: str) -> bool:
    """Return True if *s* looks like ``local@domain.tld`` with no whitespace."""
    return bool(_EMAIL_RE.match(s))


def _domains_for_org(row) -> list[str]:
    """Return the distinct mail domains an organization row contributes.

    The email is normalized and validated the same way contact emails are, so
    padded or malformed values do not put unusable keys in the domain index.

    Args:
        row: Mapping-like row with ``"email"`` and ``"website"`` entries,
            either of which may be empty or None.

    Returns:
        Domains in a stable order: email domain first, then website domain,
        without duplicates.
    """
    domains: list[str] = []

    email = _normalize_address(row["email"] or "")
    if _valid_email(email):
        domains.append(email.partition("@")[2])

    website_domain = _domain_from_url(row["website"] or "")
    if website_domain:
        domains.append(website_domain)

    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(domains))


def _domain_from_url(url: str) -> Optional[str]:
    """Extract the lower-cased host (minus a leading ``www.``) from *url*.

    Accepts bare hosts ("acme.vc/team") as well as URLs with any scheme.

    Returns:
        The host, or None when *url* is empty or yields no dotted host name
        (e.g. ``"http://"``), since such a value cannot be a mail domain.
    """
    if not url:
        return None
    found = _URL_HOST_RE.match(url.strip())
    if not found:
        return None
    host = found.group(1).lower().rstrip(".")
    # Without a dot this is a stray scheme or token, not a domain.
    if "." not in host:
        return None
    return host