
Instance anonymization #17

Open · wants to merge 16 commits into master
libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py
@@ -1,6 +1,9 @@
from collections import defaultdict
from dataclasses import dataclass, field
-from typing import Dict
+from typing import Dict, List

+from presidio_analyzer import RecognizerResult
+from presidio_anonymizer.entities import EngineResult

MappingDataType = Dict[str, Dict[str, str]]

@@ -17,5 +20,75 @@ def data(self) -> MappingDataType:
        return {k: dict(v) for k, v in self.mapping.items()}

    def update(self, new_mapping: MappingDataType) -> None:
        """Update the deanonymizer mapping with new values.

        Duplicate values will not be added.
        """
        new_values_seen = set()

Reviewer comment: Why do we need this? Is it because the same value can appear under different entity types?

Author reply: In the case of default recognizers, not likely, but who knows what users will come up with when adding their own 😛


        for entity_type, values in new_mapping.items():
-            self.mapping[entity_type].update(values)
+            for k, v in values.items():
+                # Make sure it is not a duplicate value
+                if (
+                    v not in self.mapping[entity_type].values()
+                    and v not in new_values_seen
+                ):
+                    self.mapping[entity_type][k] = v
+                    new_values_seen.update({v})
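To make the dedup rule concrete, here is a minimal standalone sketch of the same logic, using a plain defaultdict and a free function rather than the actual DeanonymizerMapping class:

from collections import defaultdict
from typing import Dict

MappingDataType = Dict[str, Dict[str, str]]

mapping: MappingDataType = defaultdict(dict)

def update(new_mapping: MappingDataType) -> None:
    new_values_seen = set()
    for entity_type, values in new_mapping.items():
        for k, v in values.items():
            # Skip values already stored for this entity type,
            # or already seen earlier in this batch.
            if v not in mapping[entity_type].values() and v not in new_values_seen:
                mapping[entity_type][k] = v
                new_values_seen.add(v)

update({"PERSON": {"Slim Shady": "John Doe"}})
update({"PERSON": {"Shady Aftermath": "John Doe"}})  # duplicate value, skipped
print(dict(mapping))  # {'PERSON': {'Slim Shady': 'John Doe'}}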


def create_anonymizer_mapping(
    original_text: str,
    analyzer_results: List[RecognizerResult],
    anonymizer_results: EngineResult,
    is_reversed: bool = False,
) -> MappingDataType:
    """Creates or updates the mapping used to anonymize and/or deanonymize text.

    This method exploits the results returned by the
    analysis and anonymization processes.

    If is_reversed is False, it constructs a mapping from each original
    entity to its anonymized value.

    If is_reversed is True, it constructs a mapping from each
    anonymized entity back to its original text value.

    Example of mapping:
    {
        "PERSON": {
            "<original>": "<anonymized>",
            "John Doe": "Slim Shady"
        },
        "PHONE_NUMBER": {
            "111-111-1111": "555-555-5555"
        }
        ...
    }
    """

    # We are able to zip and loop through both lists because we expect
    # them to return corresponding entities for each identified piece
    # of analyzable data from our input.

    # We sort them by their 'start' attribute because it allows us to
    # match corresponding entities by their position in the input text.
    analyzer_results = sorted(analyzer_results, key=lambda d: d.start)
    anonymizer_results.items = sorted(anonymizer_results.items, key=lambda d: d.start)

    new_anonymizer_mapping: MappingDataType = defaultdict(dict)

    for analyzed_entity, anonymized_entity in zip(
        analyzer_results, anonymizer_results.items
    ):
        original_value = original_text[analyzed_entity.start : analyzed_entity.end]

        if is_reversed:
            new_anonymizer_mapping[anonymized_entity.entity_type][
                anonymized_entity.text
            ] = original_value
        else:
            new_anonymizer_mapping[anonymized_entity.entity_type][
                original_value
            ] = anonymized_entity.text

    return new_anonymizer_mapping
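A quick way to see the position-based matching in action is to call create_anonymizer_mapping (as defined above) with hypothetical stand-in objects in place of presidio's RecognizerResult and EngineResult; the FakeSpan and FakeEngineResult names below are for illustration only:

from dataclasses import dataclass
from typing import List

@dataclass
class FakeSpan:
    entity_type: str
    start: int
    end: int
    text: str = ""

class FakeEngineResult:
    def __init__(self, items: List[FakeSpan]) -> None:
        self.items = items

original = "John Doe called 111-111-1111."
analyzer_results = [FakeSpan("PERSON", 0, 8), FakeSpan("PHONE_NUMBER", 16, 28)]
anonymizer_results = FakeEngineResult([
    FakeSpan("PERSON", 0, 10, "Slim Shady"),
    FakeSpan("PHONE_NUMBER", 18, 30, "555-555-5555"),
])

mapping = create_anonymizer_mapping(original, analyzer_results, anonymizer_results)
# {'PERSON': {'John Doe': 'Slim Shady'},
#  'PHONE_NUMBER': {'111-111-1111': '555-555-5555'}}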
libs/experimental/langchain_experimental/data_anonymizer/presidio.py (49 additions, 60 deletions)
@@ -1,7 +1,6 @@
from __future__ import annotations

import json
-from collections import defaultdict
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union

@@ -14,6 +13,7 @@
from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
    DeanonymizerMapping,
    MappingDataType,
+    create_anonymizer_mapping,
)
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
default_matching_strategy,
@@ -43,8 +43,7 @@
) from e

if TYPE_CHECKING:
-    from presidio_analyzer import EntityRecognizer, RecognizerResult
-    from presidio_anonymizer.entities import EngineResult
+    from presidio_analyzer import EntityRecognizer

# Configuring Anonymizer for multiple languages
# Detailed description and examples can be found here:
@@ -140,6 +139,14 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str:
        Each PII entity is replaced with a fake value.
        Each time the fake values will be different, as they are generated randomly.

        PresidioAnonymizer has no built-in memory,
        so it will not remember the effects of anonymizing previous texts.

        >>> anonymizer = PresidioAnonymizer()
        >>> anonymizer.anonymize("John Doe")
        'Noah Rhodes'
        >>> anonymizer.anonymize("John Doe")
        'Brett Russell'

        Args:
            text: text to anonymize
            language: language to use for analysis of PII
@@ -156,17 +163,30 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str:
"Change your language configuration file to add more languages."
)

-        results = self._analyzer.analyze(
+        analyzer_results = self._analyzer.analyze(
            text,
            entities=self.analyzed_fields,
            language=language,
        )

-        return self._anonymizer.anonymize(
+        filtered_analyzer_results = (
+            self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
+                analyzer_results
+            )
+        )
+
+        anonymizer_results = self._anonymizer.anonymize(
            text,
-            analyzer_results=results,
+            analyzer_results=analyzer_results,
            operators=self.operators,
-        ).text
+        )
+
+        anonymizer_mapping = create_anonymizer_mapping(
+            text,
+            filtered_analyzer_results,
+            anonymizer_results,
+        )
+        return default_matching_strategy(text, anonymizer_mapping)
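The final return applies the freshly built mapping back onto the original text. As a rough intuition only (not the actual default_matching_strategy from deanonymizer_matching_strategies), an exact-match strategy boils down to string substitution per entity type:

from typing import Dict

MappingDataType = Dict[str, Dict[str, str]]

def exact_match_strategy(text: str, mapping: MappingDataType) -> str:
    # Replace every known original value with its fake counterpart.
    for entity_type in mapping:
        for original_value, fake_value in mapping[entity_type].items():
            text = text.replace(original_value, fake_value)
    return text

print(exact_match_strategy(
    "John Doe called John Doe.",
    {"PERSON": {"John Doe": "Slim Shady"}},
))  # 'Slim Shady called Slim Shady.'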


class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
@@ -185,57 +205,14 @@ def deanonymizer_mapping(self) -> MappingDataType:
"""Return the deanonymizer mapping"""
return self._deanonymizer_mapping.data

-    def _update_deanonymizer_mapping(
-        self,
-        original_text: str,
-        analyzer_results: List[RecognizerResult],
-        anonymizer_results: EngineResult,
-    ) -> None:
-        """Creates or updates the mapping used to de-anonymize text.
-
-        This method exploits the results returned by the
-        analysis and anonymization processes.
-
-        It constructs a mapping from each anonymized entity
-        back to its original text value.
-
-        Mapping will be stored as "deanonymizer_mapping" property.
-
-        Example of "deanonymizer_mapping":
-        {
-            "PERSON": {
-                "<anonymized>": "<original>",
-                "John Doe": "Slim Shady"
-            },
-            "PHONE_NUMBER": {
-                "111-111-1111": "555-555-5555"
-            }
-            ...
-        }
-        """
-
-        # We are able to zip and loop through both lists because we expect
-        # them to return corresponding entities for each identified piece
-        # of analyzable data from our input.
-
-        # We sort them by their 'start' attribute because it allows us to
-        # match corresponding entities by their position in the input text.
-        analyzer_results = sorted(analyzer_results, key=lambda d: d.start)
-        anonymizer_results.items = sorted(
-            anonymizer_results.items, key=lambda d: d.start
-        )
-
-        new_deanonymizer_mapping: MappingDataType = defaultdict(dict)
-
-        for analyzed_entity, anonymized_entity in zip(
-            analyzer_results, anonymizer_results.items
-        ):
-            original_value = original_text[analyzed_entity.start : analyzed_entity.end]
-            new_deanonymizer_mapping[anonymized_entity.entity_type][
-                anonymized_entity.text
-            ] = original_value
-
-        self._deanonymizer_mapping.update(new_deanonymizer_mapping)
+    @property
+    def anonymizer_mapping(self) -> MappingDataType:
+        """Return the anonymizer mapping
+        This is just the reverse version of the deanonymizer mapping."""
+        return {
+            key: {v: k for k, v in inner_dict.items()}
+            for key, inner_dict in self.deanonymizer_mapping.items()
+        }
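To illustrate the inversion the new anonymizer_mapping property performs, with a toy deanonymizer mapping:

deanonymizer_mapping = {"PERSON": {"Slim Shady": "John Doe"}}
anonymizer_mapping = {
    key: {v: k for k, v in inner_dict.items()}
    for key, inner_dict in deanonymizer_mapping.items()
}
print(anonymizer_mapping)  # {'PERSON': {'John Doe': 'Slim Shady'}}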

def _anonymize(self, text: str, language: Optional[str] = None) -> str:
"""Anonymize text.
@@ -244,6 +221,14 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str:
        At the same time, we will create a mapping from each anonymized entity
        back to its original text value.

        Thanks to the built-in memory, all previously anonymized entities
        will be remembered and replaced by the same fake values:

        >>> anonymizer = PresidioReversibleAnonymizer()
        >>> anonymizer.anonymize("John Doe")
        'Noah Rhodes'
        >>> anonymizer.anonymize("John Doe")
        'Noah Rhodes'

        Args:
            text: text to anonymize
            language: language to use for analysis of PII
@@ -278,11 +263,15 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str:
operators=self.operators,
)

-        self._update_deanonymizer_mapping(
-            text, filtered_analyzer_results, anonymizer_results
-        )
+        new_deanonymizer_mapping = create_anonymizer_mapping(
+            text,
+            filtered_analyzer_results,
+            anonymizer_results,
+            is_reversed=True,
+        )
+        self._deanonymizer_mapping.update(new_deanonymizer_mapping)

-        return anonymizer_results.text
+        return default_matching_strategy(text, self.anonymizer_mapping)

def _deanonymize(
self,
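Putting the reversible flow together, a hedged end-to-end usage sketch (requires presidio-analyzer, presidio-anonymizer and faker; the constructor arguments mirror the tests below, the exact fake name depends on the Faker seed, and deanonymize is the public method provided by ReversibleAnonymizerBase):

from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

anonymizer = PresidioReversibleAnonymizer(["PERSON"], faker_seed=42)
anonymized = anonymizer.anonymize("This is John Smith. John Smith works in a bakery.")
# Both mentions of 'John Smith' receive the same fake name.
print(anonymizer.deanonymizer_mapping)
# e.g. {'PERSON': {'Noah Rhodes': 'John Smith'}}
print(anonymizer.deanonymize(anonymized))
# 'This is John Smith. John Smith works in a bakery.'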
libs/experimental/tests/unit_tests/test_data_anonymizer.py (18 additions, 0 deletions)
@@ -39,6 +39,22 @@ def test_anonymize_multiple() -> None:
assert phrase not in anonymized_text


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_check_instances() -> None:
    """Test that repeated entities get one consistent fake value within a call"""
    from langchain_experimental.data_anonymizer import PresidioAnonymizer

    text = (
        "This is John Smith. John Smith works in a bakery. "
        "John Smith is a good guy"
    )
    anonymizer = PresidioAnonymizer(["PERSON"], faker_seed=42)
    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text.count("Noah Rhodes") == 3

    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text.count("Noah Rhodes") == 0


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_with_custom_operator() -> None:
"""Test anonymize a name with a custom operator"""
@@ -78,6 +94,8 @@ def test_add_recognizer_operator() -> None:
assert anonymized_text == "<TITLE> Jane Doe was here."

    # anonymizing with custom recognizer and operator
    anonymizer = PresidioAnonymizer(analyzed_fields=[])
    anonymizer.add_recognizer(custom_recognizer)
    custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})}
    anonymizer.add_operators(custom_operator)
    anonymized_text = anonymizer.anonymize(text)
libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py

@@ -40,6 +40,32 @@ def test_anonymize_multiple() -> None:
assert phrase not in anonymized_text


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_check_instances() -> None:
    """Test that repeated entities keep one fake value across anonymizer calls"""
    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

    text = (
        "This is John Smith. John Smith works in a bakery. "
        "John Smith is a good guy"
    )
    anonymizer = PresidioReversibleAnonymizer(["PERSON"], faker_seed=42)
    anonymized_text = anonymizer.anonymize(text)
    persons = list(anonymizer.deanonymizer_mapping["PERSON"].keys())
    assert len(persons) == 1

    anonymized_name = persons[0]
    assert anonymized_text.count(anonymized_name) == 3

    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text.count(anonymized_name) == 3
    assert anonymizer.deanonymizer_mapping["PERSON"][anonymized_name] == "John Smith"

    text = "This is Jane Smith"
    anonymized_text = anonymizer.anonymize(text)
    persons = list(anonymizer.deanonymizer_mapping["PERSON"].keys())
    assert len(persons) == 2


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_with_custom_operator() -> None:
"""Test anonymize a name with a custom operator"""
@@ -79,6 +105,8 @@ def test_add_recognizer_operator() -> None:
assert anonymized_text == "<TITLE> Jane Doe was here."

    # anonymizing with custom recognizer and operator
    anonymizer = PresidioReversibleAnonymizer(analyzed_fields=[])
    anonymizer.add_recognizer(custom_recognizer)
    custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})}
    anonymizer.add_operators(custom_operator)
    anonymized_text = anonymizer.anonymize(text)