diff --git a/backend/redaction/scrub.py b/backend/redaction/scrub.py index 92fffbd..f3c3da7 100644 --- a/backend/redaction/scrub.py +++ b/backend/redaction/scrub.py @@ -70,9 +70,9 @@ _NUMWORD = (r"(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve| r"sixty|seventy|eighty|ninety|hundred|couple|few|several|half|a)") _MAG = r"(?:mm|bn|tn|thousand|million|billion|trillion|k|m|b)" # longest-first so 'MM' isn't split into 'M' _AMOUNT_RES = [ - re.compile(r"[$€£]\s?\d[\d,. ]*\d?\s?-\s?[$€£]?\s?\d[\d,. ]*\d?(?:\s?" + _MAG + r")?", re.IGNORECASE), # $3-5M range - re.compile(r"[$€£]\s?\d[\d,]*(?:\.\d+)?(?:\s?" + _MAG + r")?", re.IGNORECASE), # $5,000,000 / $5m - re.compile(r"\b(?:USD|EUR|GBP|CHF|CAD|AUD)\s?[$€£]?\s?\d[\d,]*(?:\.\d+)?(?:\s?" + _MAG + r")?", re.IGNORECASE), + re.compile(r"[$€£]\s?\d[\d,. ]*\d?\s?-\s?[$€£]?\s?\d[\d,. ]*\d?(?:\s?" + _MAG + r"\b)?", re.IGNORECASE), # $3-5M range + re.compile(r"[$€£]\s?\d[\d,]*(?:\.\d+)?(?:\s?" + _MAG + r"\b)?", re.IGNORECASE), # $5,000,000 / $5m + re.compile(r"\b(?:USD|EUR|GBP|CHF|CAD|AUD)\s?[$€£]?\s?\d[\d,]*(?:\.\d+)?(?:\s?" + _MAG + r"\b)?", re.IGNORECASE), re.compile(r"\b\d[\d,]*(?:\.\d+)?\s?(?:dollars?|euros?|pounds?)\b", re.IGNORECASE), # 5,000,000 dollars re.compile(r"(?i)\b(?:" + _NUMWORD + r"[\s\-]+){1,4}" + _MAG + r"\s+(?:dollars?|euros?|pounds?)\b"), # five million dollars ] diff --git a/backend/redaction/test_scrub_leak.py b/backend/redaction/test_scrub_leak.py index b035ab3..5e0faae 100644 --- a/backend/redaction/test_scrub_leak.py +++ b/backend/redaction/test_scrub_leak.py @@ -157,6 +157,22 @@ def main(): check("Rey​es" not in out(zw, {"persons": ["Reyes"]}) and "Reyes" not in out(zw, {"persons": ["Reyes"]}), "zero-width-split known name does not leak") + print("\n[round-5 — magnitude suffix must not eat a following word]") + # A single-letter magnitude (k/m/b) immediately before a real word must NOT be + # consumed as a suffix: '$5,000,000 but' -> the 'b' of 'but' was being eaten, + # yielding '[AMOUNT_1]ut'. A \b after the magnitude fixes it. Money still vanishes, + # the following word survives intact, and legitimate suffixes still tokenize. + for raw, word in [("$5,000,000 but he hesitates", "but he hesitates"), + ("committed $250,000 because timing", "because timing"), + ("USD 5,000,000 but capped", "but capped"), + ("between $3-5M but capped", "but capped")]: + o = out(raw) + check("[AMOUNT_1]ut" not in o and "[AMOUNT_1]ecause" not in o, f"magnitude does not bleed into next word: {raw!r}") + check(word in o, f"following word survives intact: {word!r}") + check("$" not in o and "USD 5" not in o, f"amount still tokenized: {raw!r}") + check(out("raised $5m but later") == "raised [AMOUNT_1] but later", "real 'm' suffix still tokenizes ($5m)") + check(out("about $5b in assets") == "about [AMOUNT_1] in assets", "real 'b' suffix still tokenizes ($5b)") + conn.close() print() if FAILS: diff --git a/start9/0.4/startos/utils.ts b/start9/0.4/startos/utils.ts index 29051c2..fceee2d 100644 --- a/start9/0.4/startos/utils.ts +++ b/start9/0.4/startos/utils.ts @@ -21,8 +21,9 @@ export const PACKAGE_TITLE = 'Ten31 Database' // * 0.1.0:53 (seed v5 thesis into the Architect Workshop) // * 0.1.0:54 (unification polish: LinkedIn in grid inline contact editor) // * 0.1.0:55 (Architect grounding boundary: redaction/re-hydration privacy gate) -// * Current: 0.1.0:56 (Thesis Workshop redesign: edit/choose/delete + approve-as-current) -export const PACKAGE_VERSION = '0.1.0:56' +// * 0.1.0:56 (Thesis Workshop redesign: edit/choose/delete + approve-as-current) +// * Current: 0.1.0:57 (redaction fix: magnitude regex no longer eats the word after an amount) +export const PACKAGE_VERSION = '0.1.0:57' export const DATA_MOUNT_PATH = '/data' export const WEB_PORT = 8080 diff --git a/start9/0.4/startos/versions/index.ts b/start9/0.4/startos/versions/index.ts index 2833e9e..355d0c5 100644 --- a/start9/0.4/startos/versions/index.ts +++ b/start9/0.4/startos/versions/index.ts @@ -17,8 +17,9 @@ import { v_0_1_0_53 } from './v0.1.0.53' import { v_0_1_0_54 } from './v0.1.0.54' import { v_0_1_0_55 } from './v0.1.0.55' import { v_0_1_0_56 } from './v0.1.0.56' +import { v_0_1_0_57 } from './v0.1.0.57' export const versionGraph = VersionGraph.of({ - current: v_0_1_0_56, - other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42, v_0_1_0_43, v_0_1_0_44, v_0_1_0_45, v_0_1_0_46, v_0_1_0_47, v_0_1_0_48, v_0_1_0_49, v_0_1_0_50, v_0_1_0_51, v_0_1_0_52, v_0_1_0_53, v_0_1_0_54, v_0_1_0_55], + current: v_0_1_0_57, + other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42, v_0_1_0_43, v_0_1_0_44, v_0_1_0_45, v_0_1_0_46, v_0_1_0_47, v_0_1_0_48, v_0_1_0_49, v_0_1_0_50, v_0_1_0_51, v_0_1_0_52, v_0_1_0_53, v_0_1_0_54, v_0_1_0_55, v_0_1_0_56], }) diff --git a/start9/0.4/startos/versions/v0.1.0.57.ts b/start9/0.4/startos/versions/v0.1.0.57.ts new file mode 100644 index 0000000..ca965e6 --- /dev/null +++ b/start9/0.4/startos/versions/v0.1.0.57.ts @@ -0,0 +1,18 @@ +import { VersionInfo } from '@start9labs/start-sdk' + +// Redaction engine fix: the amount/magnitude regex no longer swallows the first +// letter of a following word. '$5,000,000 but' was scrubbing to '[AMOUNT_1]ut' +// because the single-letter 'b' (billion) suffix matched the 'b' of 'but'; a word +// boundary after the magnitude restores it. Round-trips were already lossless; this +// keeps the *outbound* (to-Claude) text from losing a word. No data migration. +export const v_0_1_0_57 = VersionInfo.of({ + version: '0.1.0:57', + releaseNotes: { + en_US: [ + 'Redaction fix: a dollar amount immediately followed by a word (e.g. "$5,000,000', + 'but") no longer eats the first letter of that word when de-identifying text sent', + 'to the Architect. Real magnitude suffixes ($5m, $5b, $3-5M) still tokenize.', + ].join(' '), + }, + migrations: { up: async () => {}, down: async () => {} }, +})