langchain-ai · davidx33 · Nov 22, 2024 · Nov 21, 2024 · Nov 22, 2024 · Nov 22, 2024
diff --git a/python/langsmith/anonymizer.py b/python/langsmith/anonymizer.py
@@ -1,4 +1,4 @@
 import re  # noqa
 import inspect
 from abc import abstractmethod
 from collections import defaultdict
@@ -90,22 +90,29 @@
 
     def __init__(self, rules: List[StringNodeRule]):
         """Initialize the processor with a list of rules."""
-        self.rules = rules
+        self.rules = [
+            {
+                "pattern": (
+                    rule["pattern"]
+                    if isinstance(rule["pattern"], re.Pattern)
+                    else re.compile(rule["pattern"])
+                ),
+                "replace": (
+                    rule["replace"]
+                    if isinstance(rule.get("replace"), str)
+                    else "[redacted]"
+                ),
+            }
+            for rule in rules
+        ]
 
     def mask_nodes(self, nodes: List[StringNode]) -> List[StringNode]:
         """Mask nodes using the rules."""
         result = []
         for item in nodes:
             new_value = item["value"]
             for rule in self.rules:
-                new_value = rule["pattern"].sub(
-                    (
-                        rule["replace"]
-                        if isinstance(rule["replace"], str)
-                        else "[redacted]"
-                    ),
-                    new_value,
-                )
+                new_value = rule["pattern"].sub(rule["replace"], new_value)
             if new_value != item["value"]:
                 result.append(StringNode(value=new_value, path=item["path"]))
         return result

diff --git a/python/tests/unit_tests/test_anonymizer.py b/python/tests/unit_tests/test_anonymizer.py
@@ -9,7 +9,7 @@
 from pydantic import BaseModel
 
 from langsmith import Client, traceable, tracing_context
-from langsmith.anonymizer import StringNodeRule, create_anonymizer
+from langsmith.anonymizer import RuleNodeProcessor, StringNodeRule, create_anonymizer
 
 EMAIL_REGEX = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
 UUID_REGEX = re.compile(
@@ -139,3 +139,51 @@ def my_func(body: str, from_: MyInput) -> MyOutput:
     if "inputs" in patched_data:
         assert patched_data["inputs"] == expected_inputs
     assert patched_data["outputs"] == expected_outputs
+
+
+def test_rule_node_processor_scrub_sensitive_info():
+    rules = [
+        StringNodeRule(pattern=re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), replace="[ssn]"),
+        StringNodeRule(
+            pattern=re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"),
+            replace="[email]",
+        ),
+        StringNodeRule(
+            pattern=re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"), replace="[phone]"
+        ),
+    ]
+    processor = RuleNodeProcessor(rules)
+
+    nodes = [
+        {"value": "My SSN is 123-45-6789.", "path": ["field1"]},
+        {"value": "Contact me at john.doe@example.com.", "path": ["field2"]},
+        {"value": "Call me on 123-456-7890.", "path": ["field3"]},
+    ]
+
+    expected = [
+        {"value": "My SSN is [ssn].", "path": ["field1"]},
+        {"value": "Contact me at [email].", "path": ["field2"]},
+        {"value": "Call me on [phone].", "path": ["field3"]},
+    ]
+
+    result = processor.mask_nodes(nodes)
+
+    assert result == expected
+
+
+def test_rule_node_processor_default_replace():
+    rules = [
+        StringNodeRule(pattern=re.compile(r"sensitive")),
+    ]
+    processor = RuleNodeProcessor(rules)
+
+    nodes = [
+        {"value": "This contains sensitive data", "path": ["field1"]},
+    ]
+
+    expected = [
+        {"value": "This contains [redacted] data", "path": ["field1"]},
+    ]
+
+    result = processor.mask_nodes(nodes)
+    assert result == expected