"""Redaction / re-hydration boundary — the privacy gate between Ten31's sovereign data and the Claude API. Implements docs/redaction-rehydration.md, hardened against an adversarial leak-hunt (see docs/spark-control-scrub-endpoints.md for the gateway twin). Defense in depth — NO single layer is trusted as "leak-proof": 1. MINIMIZE-FIRST (caller): a local-Qwen summary strips most identity before scrub runs. 2. PRE-NEUTRALIZE: any pre-existing [TYPE_N]-shaped string in the input is tokenized first, so every placeholder that reaches Claude is one WE minted (no injection). 3. TIER-1 DROP: labelled/structured account-wire-SSN-IBAN-passport data, separator tolerant, excised entirely (never tokenized, never in the map). 4. KNOWN-ENTITY tokenize: the LP identities we own (dictionary from the canonical layer), matched UNICODE-FOLDED (accents/case) with hyphenated-surname extension. 5. STRUCTURED-PII tokenize/bucket: emails, URLs (incl. scheme-less/social), phones (intl + extensions), amounts (currency words/codes/symbols + worded + ranges), dates (ISO + worded + numeric + quarter), street addresses, bare long digit runs. 6. NER BACKSTOP (ner_fn, on-infra local Qwen): tokenizes residual unknown person/org/ location names the dictionary can't know. Unknown names are the largest residual, so callers in production pass ner_fn and FAIL CLOSED if it is unreachable. The pseudonym map ({token: real_value}) is the de-anonymization key: local-only, NEVER sent to Claude, NEVER written to interaction_log (only counts). """ import json import re import sqlite3 import unicodedata import uuid from datetime import datetime, timezone TOKEN_TYPES = ("PERSON", "ORG", "FUND", "EMAIL", "PHONE", "URL", "ADDR", "AMOUNT", "DATE", "LOC", "MISC") _TOKEN_RE = re.compile(r"\[(?:" + "|".join(TOKEN_TYPES) + r")_\d+\]") # ── Tier-1: NEVER-SEND (dropped, not tokenized). Separator-tolerant + label-anchored. ── # Separators allow space/dot/dash/SLASH/COMMA so grouped account/SSN forms can't bypass. _SEP = r"[\s.\-/,]" _LABEL = (r"(?:acct|account|a/c|wire|routing|aba|sort\s?code|ssn|social\s?security|tax\s?id|" r"ein|policy|member|ref)") TIER1_PATTERNS = [ ("ssn", re.compile(r"\b\d{3}" + _SEP + r"\d{2}" + _SEP + r"\d{4}\b")), ("ssn", re.compile(r"(?i)\b(?:ssn|social\s?security|tax\s?id|ein)\b[^\d]{0,12}\(?\d{3}\)?" + _SEP + r"{0,3}\d{2}" + _SEP + r"{0,3}\d{4}\b")), ("iban", re.compile(r"\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]){11,30}\b")), # IBAN >=15 chars; excludes 12-char ISIN ("swift", re.compile(r"(?i)\b(?:swift|bic)\b[^A-Za-z0-9]{0,8}[A-Z]{4}[A-Z]{2}[A-Z0-9]{2,5}\b")), ("passport", re.compile(r"(?i)\bpassport\b(?:\s?(?:no|number|num|#)\.?)?[^\dA-Za-z]{0,6}[A-Za-z]{0,2}[\s\-]?\d{6,9}\b")), ("labeled_account", re.compile(r"(?i)\b" + _LABEL + r"\b[^\dA-Za-z]{0,14}[#:]?\s*[\dXx](?:[\dXx]" + _SEP + r"?){5,}\b")), # labelled identifier with a LETTER prefix or an intervening 'no/number/id/ref/to' word # (e.g. 'acct A123456789012', 'member ID: X4451200931', 'Wire to GB123456789012') — these # slip the digit-led rule above, the bare-digit catch, and the IBAN floor. ("labeled_account", re.compile(r"(?i)\b" + _LABEL + r"\b(?:[\s.:#\-]{0,3}(?:no|number|num|id|ref|to)\b)?[\s.:#\-]{0,4}[A-Za-z]{0,4}\d[\dA-Za-z]{4,}\b")), ] # ── structured PII (Tier-2) ──────────────────────────────────────────────────── _EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b") _URL_RE = re.compile( r"\bhttps?://[^\s)\]]+" r"|\bwww\.[^\s)\]]+" r"|\b(?:[a-z0-9\-]+\.)?(?:linkedin|twitter|github|facebook|instagram|x|substack|medium)\.com/[^\s)\]]+", re.IGNORECASE) # Phones: NANP (3-3-4, optional +1, optional extension) OR E.164/international (leading +). # Tightened so plain 4-4 year ranges ('2019-2024') don't match. _PHONE_RE = re.compile( r"(? reversible [MISC]. Not glued to letters (so an ISIN/ticker like # US0378331005 stays intact substance), and a trailing sentence period doesn't block it. _BARE_DIGITS_RE = re.compile(r"(?= 1_000_000_000: return f"~${round(v/1_000_000_000)}B" if v >= 1_000_000: return f"~${round(v/1_000_000)}M" if v >= 1_000: return f"~${round(v/1_000)}k" return "~$<1k" def _bucket_date(s): iso = re.match(r"((?:19|20)\d{2})-(\d{2})-\d{2}", s) if iso: return f"Q{(int(iso.group(2))-1)//3 + 1} {iso.group(1)}" q = re.search(r"(?i)Q([1-4])[\s\-]?((?:19|20)\d{2})", s) if q: return f"Q{q.group(1)} {q.group(2)}" y = re.search(r"\b((?:19|20)\d{2})\b", s) if y: return y.group(1) yy = re.search(r"[/.\-](\d{2})\b", s) # 2-digit year fallback if yy: return "19" + yy.group(1) if int(yy.group(1)) > 30 else "20" + yy.group(1) return "(period)" class ScrubState: """Local pseudonym map for ONE task: same surface string -> same token (injective). The map is the de-anon key — local-only, never sent/serialized to a third party.""" def __init__(self): self.token_map = {} self._by_value = {} self._counters = {t: 0 for t in TOKEN_TYPES} self.tier1_dropped = [] def token_for(self, ttype, surface): key = (ttype, surface) tok = self._by_value.get(key) if tok is None: self._counters[ttype] += 1 tok = f"[{ttype}_{self._counters[ttype]}]" self._by_value[key] = tok self.token_map[tok] = surface return tok def _flatten_known(known_entities): if not known_entities: return [] type_by_key = {"persons": "PERSON", "orgs": "ORG", "funds": "FUND", "emails": "EMAIL", "locations": "LOC"} out = [] for key, ttype in type_by_key.items(): for s in known_entities.get(key, []) or []: s = (s or "").strip() if s: out.append((s, ttype)) return out def _match_known(text, known_list, state): """Tokenize known entities, matched UNICODE-FOLDED + case-insensitive, longest-first, extending over hyphen/apostrophe compounds so a known half of a double-barrelled surname pulls in the whole token. Operates by span so we can fold for matching but replace the ORIGINAL surface (preserved for rehydrate).""" if not known_list: return text folded = _fold(text) pairs = sorted(((_fold(unicodedata.normalize("NFKC", s)), t) for s, t in known_list), key=lambda x: len(x[0]), reverse=True) type_by_folded = {} for fs, t in pairs: type_by_folded.setdefault(fs, t) alt = "|".join(re.escape(fs) for fs, _ in pairs if fs) if not alt: return text rx = re.compile(r"(? 1 and folded[st - 1] in "-'’" and re.match(_WORDX, folded[st - 2] or ""): k = st - 2 while k >= 0 and (re.match(_WORDX, folded[k]) or folded[k] in "-'’"): k -= 1 st = k + 1 while en < len(folded) - 1 and folded[en] in "-'’" and re.match(_WORDX, folded[en + 1] or ""): k = en + 1 while k < len(folded) and (re.match(_WORDX, folded[k]) or folded[k] in "-'’"): k += 1 en = k spans.append((st, en, ttype)) if not spans: return text # merge overlaps, replace right-to-left in the ORIGINAL spans.sort() merged = [spans[0]] for st, en, tt in spans[1:]: ps, pe, ptt = merged[-1] if st <= pe: merged[-1] = (ps, max(pe, en), ptt) else: merged.append((st, en, tt)) for st, en, tt in reversed(merged): surface = text[st:en] text = text[:st] + state.token_for(tt, surface) + text[en:] return text def scrub(text, known_entities=None, bucket=False, state=None, ner_fn=None): """De-identify `text`. Returns (outbound_text, token_map, audit). Pass ner_fn (a local-model NER callable text->[(surface,type)]) in production to catch unknown names; without it the dictionary+regex path leaves unknown free-text names as residual (callers should minimize-first and/or fail closed).""" if text is None: text = "" st = state or ScrubState() # NFKC-normalize so decomposed (NFD) names and ligatures align with the dictionary # (else 'Reyés' in NFD or 'Steffen' with a ligature would miss and leak), and strip # zero-width characters that could split a known name ('Reyes'). s = unicodedata.normalize("NFKC", str(text)) s = re.sub(r"[\u200b\u200c\u200d\u2060\ufeff]", "", s) # 1) PRE-NEUTRALIZE pre-existing [TYPE_N] strings so they can't collide with our tokens. s = _TOKEN_RE.sub(lambda m: st.token_for("MISC", m.group(0)), s) # 2) TIER-1 DROP (labelled/structured; separator tolerant). Neutral marker, no value. for label, pat in TIER1_PATTERNS: def _drop(_m, _label=label): st.tier1_dropped.append(_label) return "[redacted]" s = pat.sub(_drop, s) # 3) KNOWN ENTITIES (unicode-folded, hyphen-extended). s = _match_known(s, _flatten_known(known_entities), st) # 4) STRUCTURED PII. Order matters: emails/urls/addresses, then DATES and AMOUNTS # (so dashed ISO dates / ranges aren't swallowed by the permissive phone matcher), # then PHONES, then any bare long digit run left over. s = _EMAIL_RE.sub(lambda m: st.token_for("EMAIL", m.group(0)), s) s = _URL_RE.sub(lambda m: st.token_for("URL", m.group(0)), s) s = _ZIP_RE.sub(lambda m: st.token_for("LOC", m.group(0)), s) # state+ZIP before ADDR (which would eat the state) s = _ADDR_RE.sub(lambda m: st.token_for("ADDR", m.group(0)), s) for date_re in _DATE_RES: if bucket: s = date_re.sub(lambda m: _bucket_date(m.group(0)), s) else: s = date_re.sub(lambda m: st.token_for("DATE", m.group(0)), s) for amt_re in _AMOUNT_RES: if bucket: s = amt_re.sub(lambda m: _bucket_amount(m.group(0)), s) else: s = amt_re.sub(lambda m: st.token_for("AMOUNT", m.group(0)), s) s = _PHONE_RE.sub(lambda m: st.token_for("PHONE", m.group(0)), s) # bare long unlabeled digit runs -> reversible [MISC] (never leak digits to Claude; # don't DROP, since these may be substance like share counts / security ids). s = _BARE_DIGITS_RE.sub(lambda m: st.token_for("MISC", m.group(0)), s) # 5) NER BACKSTOP for unknown names (production: local Qwen). Tokenize what it finds. # A connection failure here propagates so the caller can FAIL CLOSED rather than # emit name-blind. Sort longest-first so a full name is tokenized before its parts. if ner_fn is not None: for surface, ntype in sorted((ner_fn(s) or []), key=lambda e: len(e[0] or ""), reverse=True): surface = (surface or "").strip() if not surface or _TOKEN_RE.search(surface): continue tt = ntype if ntype in TOKEN_TYPES else "PERSON" s = re.sub(r"(?= 2: persons.add(name) for part in re.split(r"[\s'’\-]+", name): if len(part) >= 2 and not part.isdigit(): # index every part incl. short surnames (Wu, Li) persons.add(part) def _safe(q, fn): try: for r in conn.execute(q): fn(r) except sqlite3.OperationalError: pass # No `deleted_at` filter: tokenizing a soft-deleted name is desirable, and the live # contacts/canonical schemas vary on that column — filtering on it silently zeroed the # whole dictionary (a missing-column OperationalError swallowed by _safe). _safe("SELECT display_name, primary_email FROM canonical_entities WHERE entity_kind='person'", lambda r: (_add_person(r["display_name"]), r["primary_email"] and emails.add(r["primary_email"].strip().lower()))) _safe("SELECT first_name, last_name, email FROM contacts", lambda r: (_add_person(f"{r['first_name'] or ''} {r['last_name'] or ''}"), r["email"] and emails.add(r["email"].strip().lower()))) _safe("SELECT full_name, email FROM fundraising_contacts", lambda r: (_add_person(r["full_name"]), r["email"] and emails.add(r["email"].strip().lower()))) _safe("SELECT display_name FROM canonical_entities WHERE entity_kind IN ('organization','investor','lp')", lambda r: r["display_name"] and orgs.add(r["display_name"].strip())) _safe("SELECT name FROM organizations", lambda r: r["name"] and orgs.add(r["name"].strip())) _safe("SELECT investor_name FROM fundraising_investors", lambda r: r["investor_name"] and orgs.add(r["investor_name"].strip())) _safe("SELECT fund_name FROM fundraising_funds", lambda r: r["fund_name"] and funds.add(r["fund_name"].strip())) conn.close() for e in list(emails): lp = e.split("@")[0] if len(lp) >= 3 and not lp.isdigit(): persons.add(lp) return {"persons": sorted(persons, key=len, reverse=True), "orgs": sorted(orgs, key=len, reverse=True), "funds": sorted(funds, key=len, reverse=True), "emails": sorted(emails, key=len, reverse=True)} # ── audit logging (metadata only — never the map or real values) ─────────────── def _now(): return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z" def log_scrub(conn, actor_id, audit, task=None, session_id=None, target_id=None, source="mcp"): payload = {"task": task, "session_id": session_id, "token_count": audit.get("token_count"), "tokens_by_type": audit.get("tokens_by_type"), "tier1_dropped_count": audit.get("tier1_dropped_count"), "tier1_dropped_kinds": audit.get("tier1_dropped_kinds"), "bucketed": audit.get("bucketed"), "outbound_chars": audit.get("outbound_chars")} conn.execute( """INSERT INTO interaction_log (id, ts, actor_type, actor_id, action, target_type, target_id, payload, source, created_at) VALUES (?,?, 'agent', ?, 'redaction.scrub', 'canonical_entity', ?, ?, ?, ?)""", (str(uuid.uuid4()), _now(), actor_id, target_id, json.dumps(payload), source, _now())) def log_rehydrate(conn, actor_id, tokens_rehydrated, residual, human_decision="pending", reviewer_id=None, task=None, session_id=None, source="mcp"): payload = {"task": task, "session_id": session_id, "tokens_rehydrated": tokens_rehydrated, "residual_placeholders": residual, "human_decision": human_decision, "reviewer_id": reviewer_id} conn.execute( """INSERT INTO interaction_log (id, ts, actor_type, actor_id, action, target_type, target_id, payload, source, created_at) VALUES (?,?, 'agent', ?, 'redaction.rehydrate', 'canonical_entity', NULL, ?, ?, ?)""", (str(uuid.uuid4()), _now(), actor_id, json.dumps(payload), source, _now()))