
Instance anonymization #17

Open · wants to merge 16 commits into master
libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py
@@ -1,6 +1,9 @@
from collections import defaultdict
from dataclasses import dataclass, field
-from typing import Dict
+from typing import Dict, List

+from presidio_analyzer import RecognizerResult
+from presidio_anonymizer.entities import EngineResult

MappingDataType = Dict[str, Dict[str, str]]

@@ -17,5 +20,75 @@ def data(self) -> MappingDataType:
        return {k: dict(v) for k, v in self.mapping.items()}

    def update(self, new_mapping: MappingDataType) -> None:
        """Update the deanonymizer mapping with new values.

        Duplicate values will not be added.
        """
        new_values_seen = set()

Reviewer comment: Why do we need this? Is it because the same value can appear under different entity types?

Author reply: In the case of default recognizers, not likely, but who knows what users will come up with when adding their own 😛


        for entity_type, values in new_mapping.items():
-            self.mapping[entity_type].update(values)
+            for k, v in values.items():
+                # Make sure it is not a duplicate value
+                if (
+                    v not in self.mapping[entity_type].values()
+                    and v not in new_values_seen
+                ):
+                    self.mapping[entity_type][k] = v
+                    new_values_seen.update({v})
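To make the dedup rule concrete, here is a minimal standalone sketch of the same logic, using a plain defaultdict and a free function rather than the actual DeanonymizerMapping class:

from collections import defaultdict
from typing import Dict

MappingDataType = Dict[str, Dict[str, str]]

mapping: MappingDataType = defaultdict(dict)

def update(new_mapping: MappingDataType) -> None:
    new_values_seen = set()
    for entity_type, values in new_mapping.items():
        for k, v in values.items():
            # Skip values already stored for this entity type,
            # or already seen earlier in this batch.
            if v not in mapping[entity_type].values() and v not in new_values_seen:
                mapping[entity_type][k] = v
                new_values_seen.add(v)

update({"PERSON": {"Slim Shady": "John Doe"}})
update({"PERSON": {"Shady Aftermath": "John Doe"}})  # duplicate value, skipped
print(dict(mapping))  # {'PERSON': {'Slim Shady': 'John Doe'}}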


def create_anonymizer_mapping(
    original_text: str,
    analyzer_results: List[RecognizerResult],
    anonymizer_results: EngineResult,
    is_reversed: bool = False,
) -> MappingDataType:
    """Creates or updates the mapping used to anonymize and/or deanonymize text.

    This method exploits the results returned by the
    analysis and anonymization processes.

    If is_reversed is False, it constructs a mapping from each original
    entity to its anonymized value.

    If is_reversed is True, it constructs a mapping from each
    anonymized entity back to its original text value.

    Example of mapping:
    {
        "PERSON": {
            "<original>": "<anonymized>",
            "John Doe": "Slim Shady"
        },
        "PHONE_NUMBER": {
            "111-111-1111": "555-555-5555"
        }
        ...
    }
    """

    # We are able to zip and loop through both lists because we expect
    # them to return corresponding entities for each identified piece
    # of analyzable data from our input.

    # We sort them by their 'start' attribute because it allows us to
    # match corresponding entities by their position in the input text.
    analyzer_results = sorted(analyzer_results, key=lambda d: d.start)
    anonymizer_results.items = sorted(anonymizer_results.items, key=lambda d: d.start)

    new_anonymizer_mapping: MappingDataType = defaultdict(dict)

    for analyzed_entity, anonymized_entity in zip(
        analyzer_results, anonymizer_results.items
    ):
        original_value = original_text[analyzed_entity.start : analyzed_entity.end]

        if is_reversed:
            new_anonymizer_mapping[anonymized_entity.entity_type][
                anonymized_entity.text
            ] = original_value
        else:
            new_anonymizer_mapping[anonymized_entity.entity_type][
                original_value
            ] = anonymized_entity.text

    return new_anonymizer_mapping
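A quick way to see the position-based matching in action is to call create_anonymizer_mapping (as defined above) with hypothetical stand-in objects in place of presidio's RecognizerResult and EngineResult; the FakeSpan and FakeEngineResult names below are for illustration only:

from dataclasses import dataclass
from typing import List

@dataclass
class FakeSpan:
    entity_type: str
    start: int
    end: int
    text: str = ""

class FakeEngineResult:
    def __init__(self, items: List[FakeSpan]) -> None:
        self.items = items

original = "John Doe called 111-111-1111."
analyzer_results = [FakeSpan("PERSON", 0, 8), FakeSpan("PHONE_NUMBER", 16, 28)]
anonymizer_results = FakeEngineResult([
    FakeSpan("PERSON", 0, 10, "Slim Shady"),
    FakeSpan("PHONE_NUMBER", 18, 30, "555-555-5555"),
])

mapping = create_anonymizer_mapping(original, analyzer_results, anonymizer_results)
# {'PERSON': {'John Doe': 'Slim Shady'},
#  'PHONE_NUMBER': {'111-111-1111': '555-555-5555'}}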
libs/experimental/langchain_experimental/data_anonymizer/presidio.py (49 additions, 60 deletions)
@@ -1,7 +1,6 @@
from __future__ import annotations

import json
-from collections import defaultdict
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union

@@ -14,6 +13,7 @@
from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
    DeanonymizerMapping,
    MappingDataType,
+    create_anonymizer_mapping,
)
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
default_matching_strategy,
@@ -43,8 +43,7 @@
) from e

if TYPE_CHECKING:
-    from presidio_analyzer import EntityRecognizer, RecognizerResult
-    from presidio_anonymizer.entities import EngineResult
+    from presidio_analyzer import EntityRecognizer

# Configuring Anonymizer for multiple languages
# Detailed description and examples can be found here:
@@ -140,6 +139,14 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str:
        Each PII entity is replaced with a fake value.
        Each time the fake values will be different, as they are generated randomly.

        PresidioAnonymizer has no built-in memory,
        so it will not remember the effects of anonymizing previous texts.

        >>> anonymizer = PresidioAnonymizer()
        >>> anonymizer.anonymize("John Doe")
        'Noah Rhodes'
        >>> anonymizer.anonymize("John Doe")
        'Brett Russell'

        Args:
            text: text to anonymize
            language: language to use for analysis of PII
@@ -156,17 +163,30 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str:
"Change your language configuration file to add more languages."
)

-        results = self._analyzer.analyze(
+        analyzer_results = self._analyzer.analyze(
            text,
            entities=self.analyzed_fields,
            language=language,
        )

-        return self._anonymizer.anonymize(
+        filtered_analyzer_results = (
+            self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
+                analyzer_results
+            )
+        )
+
+        anonymizer_results = self._anonymizer.anonymize(
            text,
-            analyzer_results=results,
+            analyzer_results=analyzer_results,
            operators=self.operators,
-        ).text
+        )
+
+        anonymizer_mapping = create_anonymizer_mapping(
+            text,
+            filtered_analyzer_results,
+            anonymizer_results,
+        )
+        return default_matching_strategy(text, anonymizer_mapping)
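The final return applies the freshly built mapping back onto the original text. As a rough intuition only (not the actual default_matching_strategy from deanonymizer_matching_strategies), an exact-match strategy boils down to string substitution per entity type:

from typing import Dict

MappingDataType = Dict[str, Dict[str, str]]

def exact_match_strategy(text: str, mapping: MappingDataType) -> str:
    # Replace every known original value with its fake counterpart.
    for entity_type in mapping:
        for original_value, fake_value in mapping[entity_type].items():
            text = text.replace(original_value, fake_value)
    return text

print(exact_match_strategy(
    "John Doe called John Doe.",
    {"PERSON": {"John Doe": "Slim Shady"}},
))  # 'Slim Shady called Slim Shady.'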


class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
@@ -185,57 +205,14 @@ def deanonymizer_mapping(self) -> MappingDataType:
"""Return the deanonymizer mapping"""
return self._deanonymizer_mapping.data

-    def _update_deanonymizer_mapping(
-        self,
-        original_text: str,
-        analyzer_results: List[RecognizerResult],
-        anonymizer_results: EngineResult,
-    ) -> None:
-        """Creates or updates the mapping used to de-anonymize text.
-
-        This method exploits the results returned by the
-        analysis and anonymization processes.
-
-        It constructs a mapping from each anonymized entity
-        back to its original text value.
-
-        Mapping will be stored as "deanonymizer_mapping" property.
-
-        Example of "deanonymizer_mapping":
-        {
-            "PERSON": {
-                "<anonymized>": "<original>",
-                "John Doe": "Slim Shady"
-            },
-            "PHONE_NUMBER": {
-                "111-111-1111": "555-555-5555"
-            }
-            ...
-        }
-        """
-
-        # We are able to zip and loop through both lists because we expect
-        # them to return corresponding entities for each identified piece
-        # of analyzable data from our input.
-
-        # We sort them by their 'start' attribute because it allows us to
-        # match corresponding entities by their position in the input text.
-        analyzer_results = sorted(analyzer_results, key=lambda d: d.start)
-        anonymizer_results.items = sorted(
-            anonymizer_results.items, key=lambda d: d.start
-        )
-
-        new_deanonymizer_mapping: MappingDataType = defaultdict(dict)
-
-        for analyzed_entity, anonymized_entity in zip(
-            analyzer_results, anonymizer_results.items
-        ):
-            original_value = original_text[analyzed_entity.start : analyzed_entity.end]
-            new_deanonymizer_mapping[anonymized_entity.entity_type][
-                anonymized_entity.text
-            ] = original_value
-
-        self._deanonymizer_mapping.update(new_deanonymizer_mapping)
+    @property
+    def anonymizer_mapping(self) -> MappingDataType:
+        """Return the anonymizer mapping
+        This is just the reverse version of the deanonymizer mapping."""
+        return {
+            key: {v: k for k, v in inner_dict.items()}
+            for key, inner_dict in self.deanonymizer_mapping.items()
+        }
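To illustrate the inversion the new anonymizer_mapping property performs, with a toy deanonymizer mapping:

deanonymizer_mapping = {"PERSON": {"Slim Shady": "John Doe"}}
anonymizer_mapping = {
    key: {v: k for k, v in inner_dict.items()}
    for key, inner_dict in deanonymizer_mapping.items()
}
print(anonymizer_mapping)  # {'PERSON': {'John Doe': 'Slim Shady'}}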

def _anonymize(self, text: str, language: Optional[str] = None) -> str:
"""Anonymize text.
@@ -244,6 +221,14 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str:
        At the same time, we will create a mapping from each anonymized entity
        back to its original text value.

        Thanks to the built-in memory, all previously anonymized entities
        will be remembered and replaced by the same fake values:

        >>> anonymizer = PresidioReversibleAnonymizer()
        >>> anonymizer.anonymize("John Doe")
        'Noah Rhodes'
        >>> anonymizer.anonymize("John Doe")
        'Noah Rhodes'

        Args:
            text: text to anonymize
            language: language to use for analysis of PII
@@ -278,11 +263,15 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str:
operators=self.operators,
)

-        self._update_deanonymizer_mapping(
-            text, filtered_analyzer_results, anonymizer_results
-        )
+        new_deanonymizer_mapping = create_anonymizer_mapping(
+            text,
+            filtered_analyzer_results,
+            anonymizer_results,
+            is_reversed=True,
+        )
+        self._deanonymizer_mapping.update(new_deanonymizer_mapping)

-        return anonymizer_results.text
+        return default_matching_strategy(text, self.anonymizer_mapping)

def _deanonymize(
self,
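Putting the reversible flow together, a hedged end-to-end usage sketch (requires presidio-analyzer, presidio-anonymizer and faker; the constructor arguments mirror the tests below, the exact fake name depends on the Faker seed, and deanonymize is the public method provided by ReversibleAnonymizerBase):

from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

anonymizer = PresidioReversibleAnonymizer(["PERSON"], faker_seed=42)
anonymized = anonymizer.anonymize("This is John Smith. John Smith works in a bakery.")
# Both mentions of 'John Smith' receive the same fake name.
print(anonymizer.deanonymizer_mapping)
# e.g. {'PERSON': {'Noah Rhodes': 'John Smith'}}
print(anonymizer.deanonymize(anonymized))
# 'This is John Smith. John Smith works in a bakery.'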
libs/experimental/tests/unit_tests/test_data_anonymizer.py (18 additions, 0 deletions)
@@ -39,6 +39,22 @@ def test_anonymize_multiple() -> None:
assert phrase not in anonymized_text


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_check_instances() -> None:
    """Test that repeated entities get one consistent fake value within a call"""
    from langchain_experimental.data_anonymizer import PresidioAnonymizer

    text = (
        "This is John Smith. John Smith works in a bakery. "
        "John Smith is a good guy"
    )
    anonymizer = PresidioAnonymizer(["PERSON"], faker_seed=42)
    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text.count("Noah Rhodes") == 3

    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text.count("Noah Rhodes") == 0


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_with_custom_operator() -> None:
"""Test anonymize a name with a custom operator"""
@@ -78,6 +94,8 @@ def test_add_recognizer_operator() -> None:
assert anonymized_text == "<TITLE> Jane Doe was here."

    # anonymizing with custom recognizer and operator
    anonymizer = PresidioAnonymizer(analyzed_fields=[])
    anonymizer.add_recognizer(custom_recognizer)
    custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})}
    anonymizer.add_operators(custom_operator)
    anonymized_text = anonymizer.anonymize(text)
libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py

@@ -40,6 +40,32 @@ def test_anonymize_multiple() -> None:
assert phrase not in anonymized_text


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_check_instances() -> None:
    """Test that repeated entities keep one fake value across anonymizer calls"""
    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

    text = (
        "This is John Smith. John Smith works in a bakery. "
        "John Smith is a good guy"
    )
    anonymizer = PresidioReversibleAnonymizer(["PERSON"], faker_seed=42)
    anonymized_text = anonymizer.anonymize(text)
    persons = list(anonymizer.deanonymizer_mapping["PERSON"].keys())
    assert len(persons) == 1

    anonymized_name = persons[0]
    assert anonymized_text.count(anonymized_name) == 3

    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text.count(anonymized_name) == 3
    assert anonymizer.deanonymizer_mapping["PERSON"][anonymized_name] == "John Smith"

    text = "This is Jane Smith"
    anonymized_text = anonymizer.anonymize(text)
    persons = list(anonymizer.deanonymizer_mapping["PERSON"].keys())
    assert len(persons) == 2


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_with_custom_operator() -> None:
"""Test anonymize a name with a custom operator"""
@@ -79,6 +105,8 @@ def test_add_recognizer_operator() -> None:
assert anonymized_text == "<TITLE> Jane Doe was here."

    # anonymizing with custom recognizer and operator
    anonymizer = PresidioReversibleAnonymizer(analyzed_fields=[])
    anonymizer.add_recognizer(custom_recognizer)
    custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})}
    anonymizer.add_operators(custom_operator)
    anonymized_text = anonymizer.anonymize(text)