Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Recompile pattern if needed before it's used to mask sensitive info #1248

Merged
merged 4 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 16 additions & 9 deletions python/langsmith/anonymizer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import re # noqa

Check notice on line 1 in python/langsmith/anonymizer.py

View workflow job for this annotation

GitHub Actions / benchmark

Benchmark results

........... WARNING: the benchmark result may be unstable * the standard deviation (94.3 ms) is 14% of the mean (665 ms) Try to rerun the benchmark with more runs, values and/or loops. Run 'python -m pyperf system tune' command to reduce the system jitter. Use pyperf stats, pyperf dump and pyperf hist to analyze results. Use --quiet option to hide these warnings. create_5_000_run_trees: Mean +- std dev: 665 ms +- 94 ms ........... WARNING: the benchmark result may be unstable * the standard deviation (187 ms) is 13% of the mean (1.46 sec) Try to rerun the benchmark with more runs, values and/or loops. Run 'python -m pyperf system tune' command to reduce the system jitter. Use pyperf stats, pyperf dump and pyperf hist to analyze results. Use --quiet option to hide these warnings. create_10_000_run_trees: Mean +- std dev: 1.46 sec +- 0.19 sec ........... WARNING: the benchmark result may be unstable * the standard deviation (175 ms) is 12% of the mean (1.46 sec) Try to rerun the benchmark with more runs, values and/or loops. Run 'python -m pyperf system tune' command to reduce the system jitter. Use pyperf stats, pyperf dump and pyperf hist to analyze results. Use --quiet option to hide these warnings. create_20_000_run_trees: Mean +- std dev: 1.46 sec +- 0.18 sec ........... dumps_class_nested_py_branch_and_leaf_200x400: Mean +- std dev: 708 us +- 10 us ........... dumps_class_nested_py_leaf_50x100: Mean +- std dev: 25.4 ms +- 0.2 ms ........... dumps_class_nested_py_leaf_100x200: Mean +- std dev: 105 ms +- 3 ms ........... dumps_dataclass_nested_50x100: Mean +- std dev: 25.7 ms +- 0.3 ms ........... WARNING: the benchmark result may be unstable * the standard deviation (19.1 ms) is 26% of the mean (74.6 ms) Try to rerun the benchmark with more runs, values and/or loops. Run 'python -m pyperf system tune' command to reduce the system jitter. Use pyperf stats, pyperf dump and pyperf hist to analyze results. Use --quiet option to hide these warnings. 
dumps_pydantic_nested_50x100: Mean +- std dev: 74.6 ms +- 19.1 ms ........... dumps_pydanticv1_nested_50x100: Mean +- std dev: 205 ms +- 3 ms

Check notice on line 1 in python/langsmith/anonymizer.py

View workflow job for this annotation

GitHub Actions / benchmark

Comparison against main

+-----------------------------------------------+----------+------------------------+ | Benchmark | main | changes | +===============================================+==========+========================+ | dumps_pydanticv1_nested_50x100 | 225 ms | 205 ms: 1.09x faster | +-----------------------------------------------+----------+------------------------+ | create_5_000_run_trees | 718 ms | 665 ms: 1.08x faster | +-----------------------------------------------+----------+------------------------+ | dumps_dataclass_nested_50x100 | 25.8 ms | 25.7 ms: 1.00x faster | +-----------------------------------------------+----------+------------------------+ | dumps_class_nested_py_leaf_100x200 | 104 ms | 105 ms: 1.01x slower | +-----------------------------------------------+----------+------------------------+ | dumps_class_nested_py_branch_and_leaf_200x400 | 704 us | 708 us: 1.01x slower | +-----------------------------------------------+----------+------------------------+ | dumps_class_nested_py_leaf_50x100 | 25.0 ms | 25.4 ms: 1.02x slower | +-----------------------------------------------+----------+------------------------+ | create_20_000_run_trees | 1.40 sec | 1.46 sec: 1.04x slower | +-----------------------------------------------+----------+------------------------+ | create_10_000_run_trees | 1.39 sec | 1.46 sec: 1.05x slower | +-----------------------------------------------+----------+------------------------+ | dumps_pydantic_nested_50x100 | 67.7 ms | 74.6 ms: 1.10x slower | +-----------------------------------------------+----------+------------------------+ | Geometric mean | (ref) | 1.00x slower | +-----------------------------------------------+----------+------------------------+
import inspect
from abc import abstractmethod
from collections import defaultdict
Expand Down Expand Up @@ -90,22 +90,29 @@

def __init__(self, rules: List[StringNodeRule]):
    """Initialize the processor with a list of rules.

    Each rule is normalized once here so that masking does not have to
    repeat the work for every node: the pattern is compiled unless it is
    already an ``re.Pattern``, and a missing or non-string replacement
    falls back to ``"[redacted]"``.

    Args:
        rules: Rules providing a ``"pattern"`` (a compiled pattern or
            something ``re.compile`` accepts) and, optionally, a
            ``"replace"`` string.
    """
    self.rules = [
        {
            "pattern": (
                rule["pattern"]
                if isinstance(rule["pattern"], re.Pattern)
                else re.compile(rule["pattern"])
            ),
            "replace": (
                rule["replace"]
                if isinstance(rule.get("replace"), str)
                else "[redacted]"
            ),
        }
        for rule in rules
    ]

def mask_nodes(self, nodes: List[StringNode]) -> List[StringNode]:
    """Mask nodes using the rules.

    Every rule is applied exactly once, in order, to each node's value.
    Only nodes whose value actually changed are returned.

    Args:
        nodes: String nodes with ``"value"`` and ``"path"`` entries.

    Returns:
        New ``StringNode`` items (masked value, original path) for the
        nodes that at least one rule modified.
    """
    result = []
    for item in nodes:
        new_value = item["value"]
        for rule in self.rules:
            # ``replace`` is always a string here: __init__ substitutes
            # "[redacted]" for anything that is not a str.
            new_value = rule["pattern"].sub(rule["replace"], new_value)
        if new_value != item["value"]:
            result.append(StringNode(value=new_value, path=item["path"]))
    return result
Expand Down
50 changes: 49 additions & 1 deletion python/tests/unit_tests/test_anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pydantic import BaseModel

from langsmith import Client, traceable, tracing_context
from langsmith.anonymizer import StringNodeRule, create_anonymizer
from langsmith.anonymizer import RuleNodeProcessor, StringNodeRule, create_anonymizer

EMAIL_REGEX = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
UUID_REGEX = re.compile(
Expand Down Expand Up @@ -139,3 +139,51 @@ def my_func(body: str, from_: MyInput) -> MyOutput:
if "inputs" in patched_data:
assert patched_data["inputs"] == expected_inputs
assert patched_data["outputs"] == expected_outputs


def test_rule_node_processor_scrub_sensitive_info():
    """Each rule masks the node that contains its kind of sensitive data."""
    rules = [
        StringNodeRule(pattern=re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), replace="[ssn]"),
        StringNodeRule(
            pattern=re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"),
            replace="[email]",
        ),
        StringNodeRule(
            pattern=re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"), replace="[phone]"
        ),
    ]
    processor = RuleNodeProcessor(rules)

    nodes = [
        {"value": "My SSN is 123-45-6789.", "path": ["field1"]},
        # Must be a real address containing "@"; otherwise the email rule
        # never matches and this node is dropped from the result.
        {"value": "Contact me at john.doe@example.com.", "path": ["field2"]},
        {"value": "Call me on 123-456-7890.", "path": ["field3"]},
    ]

    expected = [
        {"value": "My SSN is [ssn].", "path": ["field1"]},
        {"value": "Contact me at [email].", "path": ["field2"]},
        {"value": "Call me on [phone].", "path": ["field3"]},
    ]

    result = processor.mask_nodes(nodes)

    assert result == expected


def test_rule_node_processor_default_replace():
    """A rule given without ``replace`` masks its matches as "[redacted]"."""
    rule_without_replacement = StringNodeRule(pattern=re.compile(r"sensitive"))
    masker = RuleNodeProcessor([rule_without_replacement])

    masked = masker.mask_nodes(
        [{"value": "This contains sensitive data", "path": ["field1"]}]
    )

    assert masked == [
        {"value": "This contains [redacted] data", "path": ["field1"]},
    ]
Loading