From 4774c97ff8b24107553a5166250767faaea09cae Mon Sep 17 00:00:00 2001 From: David Xu Date: Thu, 21 Nov 2024 15:29:38 -0800 Subject: [PATCH 1/4] recompile pattern if needed before using to mask --- python/langsmith/anonymizer.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/langsmith/anonymizer.py b/python/langsmith/anonymizer.py index 02954d460..bee15eb09 100644 --- a/python/langsmith/anonymizer.py +++ b/python/langsmith/anonymizer.py @@ -90,7 +90,15 @@ class RuleNodeProcessor(StringNodeProcessor): def __init__(self, rules: List[StringNodeRule]): """Initialize the processor with a list of rules.""" - self.rules = rules + self.rules = [ + { + "pattern": rule["pattern"] + if isinstance(rule["pattern"], re.Pattern) + else re.compile(rule["pattern"]), + "replace": rule.get("replace"), + } + for rule in rules + ] def mask_nodes(self, nodes: List[StringNode]) -> List[StringNode]: """Mask nodes using the rules.""" From 9a46b6c3e9261cc6b84ef69f89117515fc6e163b Mon Sep 17 00:00:00 2001 From: David Xu Date: Thu, 21 Nov 2024 16:40:26 -0800 Subject: [PATCH 2/4] adding tests for rulenodeprocessor. 
move fallback logic for rule replace to initialization --- python/langsmith/anonymizer.py | 13 ++---- python/tests/unit_tests/test_anonymizer.py | 48 +++++++++++++++++++++- 2 files changed, 51 insertions(+), 10 deletions(-) diff --git a/python/langsmith/anonymizer.py b/python/langsmith/anonymizer.py index bee15eb09..498c63dac 100644 --- a/python/langsmith/anonymizer.py +++ b/python/langsmith/anonymizer.py @@ -95,7 +95,9 @@ def __init__(self, rules: List[StringNodeRule]): "pattern": rule["pattern"] if isinstance(rule["pattern"], re.Pattern) else re.compile(rule["pattern"]), - "replace": rule.get("replace"), + "replace": rule["replace"] + if isinstance(rule.get("replace"), str) + else "[redacted]", } for rule in rules ] @@ -106,14 +108,7 @@ def mask_nodes(self, nodes: List[StringNode]) -> List[StringNode]: for item in nodes: new_value = item["value"] for rule in self.rules: - new_value = rule["pattern"].sub( - ( - rule["replace"] - if isinstance(rule["replace"], str) - else "[redacted]" - ), - new_value, - ) + new_value = rule["pattern"].sub(rule["replace"], new_value) if new_value != item["value"]: result.append(StringNode(value=new_value, path=item["path"])) return result diff --git a/python/tests/unit_tests/test_anonymizer.py b/python/tests/unit_tests/test_anonymizer.py index 147f46d1c..17a346745 100644 --- a/python/tests/unit_tests/test_anonymizer.py +++ b/python/tests/unit_tests/test_anonymizer.py @@ -9,7 +9,7 @@ from pydantic import BaseModel from langsmith import Client, traceable, tracing_context -from langsmith.anonymizer import StringNodeRule, create_anonymizer +from langsmith.anonymizer import StringNodeRule, create_anonymizer, RuleNodeProcessor EMAIL_REGEX = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}") UUID_REGEX = re.compile( @@ -139,3 +139,49 @@ def my_func(body: str, from_: MyInput) -> MyOutput: if "inputs" in patched_data: assert patched_data["inputs"] == expected_inputs assert patched_data["outputs"] == expected_outputs + +def 
test_rule_node_processor_scrub_sensitive_info(): + rules = [ + StringNodeRule(pattern=re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), replace="[ssn]"), + StringNodeRule( + pattern=re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"), + replace="[email]", + ), + StringNodeRule( + pattern=re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"), replace="[phone]" + ), + ] + processor = RuleNodeProcessor(rules) + + nodes = [ + {"value": "My SSN is 123-45-6789.", "path": ["field1"]}, + {"value": "Contact me at john.doe@example.com.", "path": ["field2"]}, + {"value": "Call me on 123-456-7890.", "path": ["field3"]}, + ] + + expected = [ + {"value": "My SSN is [ssn].", "path": ["field1"]}, + {"value": "Contact me at [email].", "path": ["field2"]}, + {"value": "Call me on [phone].", "path": ["field3"]}, + ] + + result = processor.mask_nodes(nodes) + + assert result == expected + +def test_rule_node_processor_default_replace(): + rules = [ + StringNodeRule(pattern=re.compile(r"sensitive")), + ] + processor = RuleNodeProcessor(rules) + + nodes = [ + {"value": "This contains sensitive data", "path": ["field1"]}, + ] + + expected = [ + {"value": "This contains [redacted] data", "path": ["field1"]}, + ] + + result = processor.mask_nodes(nodes) + assert result == expected \ No newline at end of file From efae89ed0df640d92ef1b90ed5b6c720a0343b3a Mon Sep 17 00:00:00 2001 From: David Xu Date: Thu, 21 Nov 2024 16:44:43 -0800 Subject: [PATCH 3/4] fix import error --- python/tests/unit_tests/test_anonymizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/unit_tests/test_anonymizer.py b/python/tests/unit_tests/test_anonymizer.py index 17a346745..2bee0d2ba 100644 --- a/python/tests/unit_tests/test_anonymizer.py +++ b/python/tests/unit_tests/test_anonymizer.py @@ -9,7 +9,7 @@ from pydantic import BaseModel from langsmith import Client, traceable, tracing_context -from langsmith.anonymizer import StringNodeRule, create_anonymizer, RuleNodeProcessor +from 
langsmith.anonymizer import RuleNodeProcessor, StringNodeRule, create_anonymizer EMAIL_REGEX = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}") UUID_REGEX = re.compile( From 20de1f7ea0f80d4115cb3cd71f7b6369056dd6f3 Mon Sep 17 00:00:00 2001 From: David Xu Date: Thu, 21 Nov 2024 16:48:13 -0800 Subject: [PATCH 4/4] formatting --- python/langsmith/anonymizer.py | 16 ++++++++++------ python/tests/unit_tests/test_anonymizer.py | 10 ++++++---- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/python/langsmith/anonymizer.py b/python/langsmith/anonymizer.py index 498c63dac..e04ee0c37 100644 --- a/python/langsmith/anonymizer.py +++ b/python/langsmith/anonymizer.py @@ -92,12 +92,16 @@ def __init__(self, rules: List[StringNodeRule]): """Initialize the processor with a list of rules.""" self.rules = [ { - "pattern": rule["pattern"] - if isinstance(rule["pattern"], re.Pattern) - else re.compile(rule["pattern"]), - "replace": rule["replace"] - if isinstance(rule.get("replace"), str) - else "[redacted]", + "pattern": ( + rule["pattern"] + if isinstance(rule["pattern"], re.Pattern) + else re.compile(rule["pattern"]) + ), + "replace": ( + rule["replace"] + if isinstance(rule.get("replace"), str) + else "[redacted]" + ), } for rule in rules ] diff --git a/python/tests/unit_tests/test_anonymizer.py b/python/tests/unit_tests/test_anonymizer.py index 2bee0d2ba..bd6284bf5 100644 --- a/python/tests/unit_tests/test_anonymizer.py +++ b/python/tests/unit_tests/test_anonymizer.py @@ -140,16 +140,17 @@ def my_func(body: str, from_: MyInput) -> MyOutput: assert patched_data["inputs"] == expected_inputs assert patched_data["outputs"] == expected_outputs + def test_rule_node_processor_scrub_sensitive_info(): rules = [ - StringNodeRule(pattern=re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), replace="[ssn]"), + StringNodeRule(pattern=re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), replace="[ssn]"), StringNodeRule( pattern=re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"), 
replace="[email]", - ), + ), StringNodeRule( pattern=re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"), replace="[phone]" - ), + ), ] processor = RuleNodeProcessor(rules) @@ -169,6 +170,7 @@ def test_rule_node_processor_scrub_sensitive_info(): assert result == expected + def test_rule_node_processor_default_replace(): rules = [ StringNodeRule(pattern=re.compile(r"sensitive")), @@ -184,4 +186,4 @@ def test_rule_node_processor_default_replace(): ] result = processor.mask_nodes(nodes) - assert result == expected \ No newline at end of file + assert result == expected