redaction: \b after magnitude so amounts don't eat the next word (v0.1.0:57)
The currency-anchored amount regexes treated a single-letter magnitude suffix (k/m/b) as optional but did not require a word boundary after it, so "$5,000,000 but" scrubbed to "[AMOUNT_1]ut" — the 'b' of "but" was consumed as a 'billion' suffix. Add a word boundary after _MAG on the three currency-anchored _AMOUNT_RES patterns (range, symbol, ISO-code); the two remaining patterns (digits followed by a currency word, and number-words followed by a magnitude and a currency word) are unchanged. Money still tokenizes in every case ($5m/$5b/$3-5M/USD 5,000,000); only the OUTBOUND to-Claude text stops losing the leading letter of the following word. Round-trips were already lossless. Regression-locked by a round-5 section in test_scrub_leak.py; full redaction suite (scrub_leak + reidentification + grounding_boundary) green. Packaged as StartOS v0.1.0:57. Reported by the Spark gateway dev; gateway re-vendored scrub.py verbatim for parity (same golden-file leak test gates both sides). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -70,9 +70,9 @@ _NUMWORD = (r"(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|
|
|||||||
r"sixty|seventy|eighty|ninety|hundred|couple|few|several|half|a)")
|
r"sixty|seventy|eighty|ninety|hundred|couple|few|several|half|a)")
|
||||||
_MAG = r"(?:mm|bn|tn|thousand|million|billion|trillion|k|m|b)" # longest-first so 'MM' isn't split into 'M'
|
_MAG = r"(?:mm|bn|tn|thousand|million|billion|trillion|k|m|b)" # longest-first so 'MM' isn't split into 'M'
|
||||||
_AMOUNT_RES = [
|
_AMOUNT_RES = [
|
||||||
re.compile(r"[$€£]\s?\d[\d,. ]*\d?\s?-\s?[$€£]?\s?\d[\d,. ]*\d?(?:\s?" + _MAG + r")?", re.IGNORECASE), # $3-5M range
|
re.compile(r"[$€£]\s?\d[\d,. ]*\d?\s?-\s?[$€£]?\s?\d[\d,. ]*\d?(?:\s?" + _MAG + r"\b)?", re.IGNORECASE), # $3-5M range
|
||||||
re.compile(r"[$€£]\s?\d[\d,]*(?:\.\d+)?(?:\s?" + _MAG + r")?", re.IGNORECASE), # $5,000,000 / $5m
|
re.compile(r"[$€£]\s?\d[\d,]*(?:\.\d+)?(?:\s?" + _MAG + r"\b)?", re.IGNORECASE), # $5,000,000 / $5m
|
||||||
re.compile(r"\b(?:USD|EUR|GBP|CHF|CAD|AUD)\s?[$€£]?\s?\d[\d,]*(?:\.\d+)?(?:\s?" + _MAG + r")?", re.IGNORECASE),
|
re.compile(r"\b(?:USD|EUR|GBP|CHF|CAD|AUD)\s?[$€£]?\s?\d[\d,]*(?:\.\d+)?(?:\s?" + _MAG + r"\b)?", re.IGNORECASE),
|
||||||
re.compile(r"\b\d[\d,]*(?:\.\d+)?\s?(?:dollars?|euros?|pounds?)\b", re.IGNORECASE), # 5,000,000 dollars
|
re.compile(r"\b\d[\d,]*(?:\.\d+)?\s?(?:dollars?|euros?|pounds?)\b", re.IGNORECASE), # 5,000,000 dollars
|
||||||
re.compile(r"(?i)\b(?:" + _NUMWORD + r"[\s\-]+){1,4}" + _MAG + r"\s+(?:dollars?|euros?|pounds?)\b"), # five million dollars
|
re.compile(r"(?i)\b(?:" + _NUMWORD + r"[\s\-]+){1,4}" + _MAG + r"\s+(?:dollars?|euros?|pounds?)\b"), # five million dollars
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -157,6 +157,22 @@ def main():
|
|||||||
check("Reyes" not in out(zw, {"persons": ["Reyes"]}) and "Reyes" not in out(zw, {"persons": ["Reyes"]}),
|
check("Reyes" not in out(zw, {"persons": ["Reyes"]}) and "Reyes" not in out(zw, {"persons": ["Reyes"]}),
|
||||||
"zero-width-split known name does not leak")
|
"zero-width-split known name does not leak")
|
||||||
|
|
||||||
|
print("\n[round-5 — magnitude suffix must not eat a following word]")
|
||||||
|
# The leading k/m/b of a real word that follows an amount must NOT be
|
||||||
|
# consumed as a suffix: '$5,000,000 but' -> the 'b' of 'but' was being eaten,
|
||||||
|
# yielding '[AMOUNT_1]ut'. A \b after the magnitude fixes it. Money still vanishes,
|
||||||
|
# the following word survives intact, and legitimate suffixes still tokenize.
|
||||||
|
for raw, word in [("$5,000,000 but he hesitates", "but he hesitates"),
|
||||||
|
("committed $250,000 because timing", "because timing"),
|
||||||
|
("USD 5,000,000 but capped", "but capped"),
|
||||||
|
("between $3-5M but capped", "but capped")]:
|
||||||
|
o = out(raw)
|
||||||
|
check("[AMOUNT_1]ut" not in o and "[AMOUNT_1]ecause" not in o, f"magnitude does not bleed into next word: {raw!r}")
|
||||||
|
check(word in o, f"following word survives intact: {word!r}")
|
||||||
|
check("$" not in o and "USD 5" not in o, f"amount still tokenized: {raw!r}")
|
||||||
|
check(out("raised $5m but later") == "raised [AMOUNT_1] but later", "real 'm' suffix still tokenizes ($5m)")
|
||||||
|
check(out("about $5b in assets") == "about [AMOUNT_1] in assets", "real 'b' suffix still tokenizes ($5b)")
|
||||||
|
|
||||||
conn.close()
|
conn.close()
|
||||||
print()
|
print()
|
||||||
if FAILS:
|
if FAILS:
|
||||||
|
|||||||
@@ -21,8 +21,9 @@ export const PACKAGE_TITLE = 'Ten31 Database'
|
|||||||
// * 0.1.0:53 (seed v5 thesis into the Architect Workshop)
|
// * 0.1.0:53 (seed v5 thesis into the Architect Workshop)
|
||||||
// * 0.1.0:54 (unification polish: LinkedIn in grid inline contact editor)
|
// * 0.1.0:54 (unification polish: LinkedIn in grid inline contact editor)
|
||||||
// * 0.1.0:55 (Architect grounding boundary: redaction/re-hydration privacy gate)
|
// * 0.1.0:55 (Architect grounding boundary: redaction/re-hydration privacy gate)
|
||||||
// * Current: 0.1.0:56 (Thesis Workshop redesign: edit/choose/delete + approve-as-current)
|
// * 0.1.0:56 (Thesis Workshop redesign: edit/choose/delete + approve-as-current)
|
||||||
export const PACKAGE_VERSION = '0.1.0:56'
|
// * Current: 0.1.0:57 (redaction fix: magnitude regex no longer eats the word after an amount)
|
||||||
|
export const PACKAGE_VERSION = '0.1.0:57'
|
||||||
|
|
||||||
export const DATA_MOUNT_PATH = '/data'
|
export const DATA_MOUNT_PATH = '/data'
|
||||||
export const WEB_PORT = 8080
|
export const WEB_PORT = 8080
|
||||||
|
|||||||
@@ -17,8 +17,9 @@ import { v_0_1_0_53 } from './v0.1.0.53'
|
|||||||
import { v_0_1_0_54 } from './v0.1.0.54'
|
import { v_0_1_0_54 } from './v0.1.0.54'
|
||||||
import { v_0_1_0_55 } from './v0.1.0.55'
|
import { v_0_1_0_55 } from './v0.1.0.55'
|
||||||
import { v_0_1_0_56 } from './v0.1.0.56'
|
import { v_0_1_0_56 } from './v0.1.0.56'
|
||||||
|
import { v_0_1_0_57 } from './v0.1.0.57'
|
||||||
|
|
||||||
export const versionGraph = VersionGraph.of({
|
export const versionGraph = VersionGraph.of({
|
||||||
current: v_0_1_0_56,
|
current: v_0_1_0_57,
|
||||||
other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42, v_0_1_0_43, v_0_1_0_44, v_0_1_0_45, v_0_1_0_46, v_0_1_0_47, v_0_1_0_48, v_0_1_0_49, v_0_1_0_50, v_0_1_0_51, v_0_1_0_52, v_0_1_0_53, v_0_1_0_54, v_0_1_0_55],
|
other: [v_0_1_0_39, v_0_1_0_40, v_0_1_0_41, v_0_1_0_42, v_0_1_0_43, v_0_1_0_44, v_0_1_0_45, v_0_1_0_46, v_0_1_0_47, v_0_1_0_48, v_0_1_0_49, v_0_1_0_50, v_0_1_0_51, v_0_1_0_52, v_0_1_0_53, v_0_1_0_54, v_0_1_0_55, v_0_1_0_56],
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -0,0 +1,18 @@
|
|||||||
|
import { VersionInfo } from '@start9labs/start-sdk'
|
||||||
|
|
||||||
|
// Redaction engine fix: the amount/magnitude regex no longer swallows the first
|
||||||
|
// letter of a following word. '$5,000,000 but' was scrubbing to '[AMOUNT_1]ut'
|
||||||
|
// because the single-letter 'b' (billion) suffix matched the 'b' of 'but'; a word
|
||||||
|
// boundary after the magnitude restores it. Round-trips were already lossless; this
|
||||||
|
// keeps the *outbound* (to-Claude) text from losing a word. No data migration.
|
||||||
|
export const v_0_1_0_57 = VersionInfo.of({
|
||||||
|
version: '0.1.0:57',
|
||||||
|
releaseNotes: {
|
||||||
|
en_US: [
|
||||||
|
'Redaction fix: a dollar amount immediately followed by a word (e.g. "$5,000,000',
|
||||||
|
'but") no longer eats the first letter of that word when de-identifying text sent',
|
||||||
|
'to the Architect. Real magnitude suffixes ($5m, $5b, $3-5M) still tokenize.',
|
||||||
|
].join(' '),
|
||||||
|
},
|
||||||
|
migrations: { up: async () => {}, down: async () => {} },
|
||||||
|
})
|
||||||
Reference in New Issue
Block a user