langchain-ai · davidx33 · Nov 25, 2024 · Nov 21, 2024 · Nov 21, 2024 · Nov 21, 2024
diff --git a/docs/observability/how_to_guides/tracing/mask_inputs_outputs.mdx b/docs/observability/how_to_guides/tracing/mask_inputs_outputs.mdx
@@ -107,18 +107,24 @@ However, if inputs or outputs are to be sent to client, the `anonymizer` method
     python`
         from langsmith.anonymizer import create_anonymizer
         from langsmith import Client, traceable
+        import re
 
         # create anonymizer from list of regex patterns and replacement values
         anonymizer = create_anonymizer([
-            { "pattern": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "replace": "<email>" },
-            { "pattern": r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}", "replace": "<uuid>" }
+            { "pattern": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "replace": "<email-address>" },
+            { "pattern": r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}", "replace": "<UUID>" }
         ])
 
         # or create anonymizer from a function
-        anonymizer = create_anonymizer(lambda text: r"...".sub("[value]", text))
-
+        email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
+        uuid_pattern = re.compile(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
+
+        anonymizer = create_anonymizer(
+            lambda text: email_pattern.sub("<email-address>", uuid_pattern.sub("<UUID>", text))
+        )
+
         client = Client(anonymizer=anonymizer)
-        
+
         @traceable(client=client)
         def main(inputs: dict) -> dict:
             ...
@@ -127,24 +133,25 @@ However, if inputs or outputs are to be sent to client, the `anonymizer` method
         import { createAnonymizer } from "langsmith/anonymizer"
         import { traceable } from "langsmith/traceable"
         import { Client } from "langsmith"
-        
+
         // create anonymizer from list of regex patterns and replacement values
         const anonymizer = createAnonymizer([
             { pattern: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, replace: "<email>" },
             { pattern: /[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}/g, replace: "<uuid>" }
         ])
-        
+
         // or create anonymizer from a function
         const anonymizer = createAnonymizer((value) => value.replace("...", "<value>"))
-        
+
         const client = new Client({ anonymizer })
-        
+
         const main = traceable(async (inputs: any) => {
             // ...
         }, { client })
     `,
-  ]}
-  groupId="client-language"
+
+]}
+groupId="client-language"
 />
 
 Note that using the anonymizer might incur a performance hit with complex regular expressions or large payloads, as the anonymizer serializes the payload to JSON before processing.
@@ -299,6 +306,10 @@ You can combine rule-based masking with various anonymizers to scrub sensitive i
 
 ### Regex
 
+:::info
+The implementation below is not exhaustive and may miss some formats or edge cases. Test any implementation thoroughly before using it in production.
+:::
+
 You can use regex to mask inputs and outputs before they are sent to LangSmith. The implementation below masks email addresses, phone numbers, full names, credit card numbers, and SSNs.
 
 ```python
@@ -335,7 +346,6 @@ def regex_anonymize(text):
 def recursive_anonymize(data, depth=10):
     """
     Recursively traverse the data structure and anonymize sensitive information.
-
     Args:
         data (any): The input data to be anonymized.
         depth (int): The current recursion depth to prevent excessive recursion.
@@ -364,7 +374,6 @@ def recursive_anonymize(data, depth=10):
     else:
         return data
 
-# Wrap the OpenAI client
 openai_client = wrap_openai(openai.Client())
 
 # Initialize the LangSmith client with the anonymization functions
@@ -400,6 +409,10 @@ The non-anonymized run will look like this in LangSmith:
 
 ### Microsoft Presidio
 
+:::info
+The implementation below provides a general example of how to anonymize sensitive information in messages exchanged between a user and an LLM. It is not exhaustive and does not account for all cases. Test any implementation thoroughly before using it in production.
+:::
+
 Microsoft Presidio is a data protection and de-identification SDK. The implementation below uses Presidio to anonymize inputs and outputs before they are sent to LangSmith. For up-to-date information, please refer to Presidio's [official documentation](https://microsoft.github.io/presidio/).
 
 To use Presidio and its spaCy model, install the following:
@@ -426,49 +439,43 @@ from presidio_analyzer import AnalyzerEngine
 anonymizer = AnonymizerEngine()
 analyzer = AnalyzerEngine()
 
-def recursive_anonymize(data, depth=10):
+def presidio_anonymize(data):
     """
-    Recursively traverse the data structure and anonymize sensitive information.
+    Anonymize sensitive information sent by the user or returned by the model.
 
     Args:
-        data (any): The input data to be anonymized.
-        depth (int): The current recursion depth to prevent excessive recursion.
+        data (any): The data to be anonymized.
 
     Returns:
         any: The anonymized data.
     """
-    if depth == 0:
+    message_list = (
+        data.get('messages') or [data.get('choices', [{}])[0].get('message')]
+    )
+    if not message_list or not all(isinstance(msg, dict) and msg for msg in message_list):
         return data
-
-    if isinstance(data, dict):
-        anonymized_dict = {}
-        for k, v in data.items():
-            anonymized_value = recursive_anonymize(v, depth - 1)
-            anonymized_dict[k] = anonymized_value
-        return anonymized_dict
-    elif isinstance(data, list):
-        anonymized_list = []
-        for item in data:
-            anonymized_item = recursive_anonymize(item, depth - 1)
-            anonymized_list.append(anonymized_item)
-        return anonymized_list
-    elif isinstance(data, str):
+    for message in message_list:
+        content = message.get('content', '')
+        if not content.strip():
+            print("Empty content detected. Skipping anonymization.")
+            continue
         results = analyzer.analyze(
-            text=data,
+            text=content,
             entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "US_SSN"],
             language='en'
         )
-        anonymized_data = anonymizer.anonymize(text=data, analyzer_results=results)
-        return anonymized_data
-    else:
-        return data
+        anonymized_result = anonymizer.anonymize(
+            text=content,
+            analyzer_results=results
+        )
+        message['content'] = anonymized_result.text
+    return data
 
-# wrap the openai client
 openai_client = wrap_openai(openai.Client())
 
 # initialize the langsmith client with the anonymization functions
 langsmith_client = Client(
-  hide_inputs=recursive_anonymize, hide_outputs=recursive_anonymize
+  hide_inputs=presidio_anonymize, hide_outputs=presidio_anonymize
 )
 
 # The trace produced will have its metadata present, but the inputs and outputs will be anonymized
@@ -499,6 +506,10 @@ The non-anonymized run will look like this in LangSmith:
 
 ### Amazon Comprehend
 
+:::info
+The implementation below provides a general example of how to anonymize sensitive information in messages exchanged between a user and an LLM. It is not exhaustive and does not account for all cases. Test any implementation thoroughly before using it in production.
+:::
+
 Comprehend is a natural language processing service that can detect personally identifiable information. The implementation below uses Comprehend to anonymize inputs and outputs before they are sent to LangSmith. For up-to-date information, please refer to Comprehend's [official documentation](https://docs.aws.amazon.com/comprehend/latest/APIReference/API_DetectPiiEntities.html).
 
 To use Comprehend, install [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html):
@@ -548,7 +559,6 @@ def redact_pii_entities(text, entities):
 
     return redacted_text
 
-# Function to detect PII using AWS Comprehend
 def detect_pii(text):
     """
     Detect PII entities in the given text using AWS Comprehend.
@@ -570,45 +580,40 @@ def detect_pii(text):
         print(f"Error detecting PII: {e}")
         return []
 
-def recursive_anonymize(data, depth=10):
+def comprehend_anonymize(data):
     """
-    Recursively traverse the data structure and anonymize sensitive information.
+    Anonymize sensitive information sent by the user or returned by the model.
 
     Args:
         data (any): The input data to be anonymized.
-        depth (int): The current recursion depth to prevent excessive recursion.
 
     Returns:
         any: The anonymized data.
     """
-    if depth == 0:
-        return data
-
-    if isinstance(data, dict):
-        anonymized_dict = {}
-        for k, v in data.items():
-            anonymized_value = recursive_anonymize(v, depth - 1)
-            anonymized_dict[k] = anonymized_value
-        return anonymized_dict
-    elif isinstance(data, list):
-        anonymized_list = []
-        for item in data:
-            anonymized_item = recursive_anonymize(item, depth - 1)
-            anonymized_list.append(anonymized_item)
-        return anonymized_list
-    elif isinstance(data, str):
-        entities = detect_pii(data)
-        anonymized_data = redact_pii_entities(data, entities)
-        return anonymized_data
-    else:
+    message_list = (
+        data.get('messages') or [data.get('choices', [{}])[0].get('message')]
+    )
+    if not message_list or not all(isinstance(msg, dict) and msg for msg in message_list):
         return data
+    for message in message_list:
+        content = message.get('content', '')
+        if not content.strip():
+            print("Empty content detected. Skipping anonymization.")
+            continue
+        entities = detect_pii(content)
+        if entities:
+            anonymized_text = redact_pii_entities(content, entities)
+            message['content'] = anonymized_text
+        else:
+            print("No PII detected. Content remains unchanged.")
+
+    return data
 
-# wrap the openai client
 openai_client = wrap_openai(openai.Client())
 
 # initialize the langsmith client with the anonymization functions
 langsmith_client = Client(
-  hide_inputs=recursive_anonymize, hide_outputs=recursive_anonymize
+  hide_inputs=comprehend_anonymize, hide_outputs=comprehend_anonymize
 )
 
 # The trace produced will have its metadata present, but the inputs and outputs will be anonymized

diff --git a/docs/observability/how_to_guides/tracing/static/presidio-anonymized.png b/docs/observability/how_to_guides/tracing/static/presidio-anonymized.png