Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace emails, numbers and other NE by brackets #48

Merged
merged 4 commits into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 31 additions & 10 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def pseudonymize_ne(self, ner, sentence):
for i in range(len(ner)):
entity = ner[i]
ent_string = entity["entity_group"] # noqa
ent_word = entity["word"]
# here we could check that string is "PER"
ent_conf = entity["score"] # noqa
ent_position = entity["start"], entity["end"]
Expand All @@ -156,14 +157,16 @@ def pseudonymize_ne(self, ner, sentence):
# replace PER
if ent_string == "PER":
# add the name of this entity to list
nelist.append(entity["word"])
else:
# Locations and Organizations
new_sentence = (
new_sentence[: (ent_position[0])]
+ "x" * (ent_position[1] - ent_position[0])
+ new_sentence[(ent_position[1]) :] # noqa
)
nelist.append(ent_word)
# replace LOC
elif ent_string == "LOC":
new_sentence = new_sentence.replace(ent_word, "[location]")
# replace ORG
elif ent_string == "ORG":
new_sentence = new_sentence.replace(ent_word, "[organization]")
# replace MISC
elif ent_string == "MISC":
new_sentence = new_sentence.replace(ent_word, "[misc]")
# replace all unique PER now
new_sentence = self.pseudonymize_per(new_sentence, nelist)

Expand All @@ -172,8 +175,25 @@ def pseudonymize_ne(self, ner, sentence):

def pseudonymize_numbers(self, sentence):
    """Replace every run of consecutive digits in ``sentence`` with ``[number]``.

    Args:
        sentence: Text to scan character by character.

    Returns:
        A copy of ``sentence`` in which each maximal run of characters for
        which ``str.isdigit`` is true is collapsed into one ``[number]``
        placeholder; every other character is kept unchanged.
    """
    pieces = []
    in_number = False
    for char in sentence:
        if char.isdigit():
            # Emit the placeholder only once, at the first digit of a run.
            if not in_number:
                pieces.append("[number]")
            in_number = True
        else:
            pieces.append(char)
            in_number = False
    return "".join(pieces)

def pseudonymize_email_addresses(self, sentence):
    """Replace every whitespace-delimited token containing ``@`` with ``[email]``.

    The whole token is replaced, so punctuation glued to an address (for
    example a trailing full stop) disappears together with it. All
    whitespace between tokens is preserved, which means an address that
    follows a tab or a newline no longer swallows the neighbouring word.

    Args:
        sentence: Text that may contain e-mail addresses.

    Returns:
        ``sentence`` with each ``@``-containing token replaced by ``[email]``.
    """
    # Imported locally so the method does not rely on a module-level import.
    import re

    # "[^\s@]*" consumes the token up to its first "@"; "\S*" consumes the rest.
    return re.sub(r"[^\s@]*@\S*", "[email]", sentence)

def concatenate(self, sentences):
    """Join ``sentences`` into one string, separated by single spaces."""
    separator = " "
    joined = separator.join(sentences)
    return joined
Expand All @@ -183,6 +203,7 @@ def pseudonymize(self, text: str):
sentences = self.get_sentences(text)
pseudonymized_sentences = []
for sent in sentences:
sent = self.pseudonymize_email_addresses(sent)
ner = self.get_ner(sent)
ps_sent = " ".join(self.pseudonymize_ne(ner, sent)) if ner else sent
ps_sent = self.pseudonymize_numbers(ps_sent)
Expand Down
144 changes: 115 additions & 29 deletions mailcom/test/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,46 +114,23 @@ def test_pseudonymize_per(get_default_fr):
)


def test_pseudonymize_ne(get_default_fr):
    """PER entities are removed and a pseudonym from the "fr" list appears."""
    text = "Francois and Agathe are friends."
    persons = [
        {
            "entity_group": "PER",
            "score": 0.99,
            "word": name,
            "start": begin,
            "end": stop,
        }
        for name, begin, stop in (("Francois", 0, 8), ("Agathe", 13, 19))
    ]
    result = get_default_fr.pseudonymize_ne(persons, text)
    # Neither original name may survive in the first returned element.
    for name in ("Francois", "Agathe"):
        assert name not in result[0]
    candidates = get_default_fr.pseudo_first_names["fr"]
    assert any(pseudo in result[0] for pseudo in candidates)

def test_pseudonymize_numbers(get_default_fr):
    """Each run of digits is collapsed into a single ``[number]`` placeholder."""
    sentence = "My phone number is 123-456-7890."
    pseudonymized_sentence = get_default_fr.pseudonymize_numbers(sentence)
    assert pseudonymized_sentence == "My phone number is [number]-[number]-[number]."

    sentence = "The year 2023 is almost over."
    pseudonymized_sentence = get_default_fr.pseudonymize_numbers(sentence)
    assert pseudonymized_sentence == "The year [number] is almost over."

    # Text without digits must come back unchanged.
    sentence = "No digits here!"
    pseudonymized_sentence = get_default_fr.pseudonymize_numbers(sentence)
    assert pseudonymized_sentence == "No digits here!"

    # Empty input must yield empty output.
    sentence = ""
    pseudonymized_sentence = get_default_fr.pseudonymize_numbers(sentence)
    assert pseudonymized_sentence == ""


def test_concatenate_empty_list(get_default_fr):
sentences = []
Expand Down Expand Up @@ -200,3 +177,112 @@ def test_pseudonymize_no_entities(get_default_fr):
text = "Ceci est une phrase simple sans entités nommées ni chiffres."
pseudonymized_text = get_default_fr.pseudonymize(text)
assert pseudonymized_text == text


def test_pseudonymize_email_addresses(get_default_fr):
    """Tokens containing ``@`` are replaced by ``[email]``; other text is kept."""
    # The trailing full stop is part of the same space-delimited token as the
    # address, so it is replaced together with it.
    sentence = "My email is jane.doe@example.com."
    pseudonymized_sentence = get_default_fr.pseudonymize_email_addresses(sentence)
    assert pseudonymized_sentence == "My email is [email]"

    sentence = "Contact us at support@example.com or sales@example.org."
    pseudonymized_sentence = get_default_fr.pseudonymize_email_addresses(sentence)
    assert pseudonymized_sentence == "Contact us at [email] or [email]"

    sentence = "No email addresses here!"
    pseudonymized_sentence = get_default_fr.pseudonymize_email_addresses(sentence)
    assert pseudonymized_sentence == "No email addresses here!"

    sentence = ""
    pseudonymized_sentence = get_default_fr.pseudonymize_email_addresses(sentence)
    assert pseudonymized_sentence == ""


def test_pseudonymize_ne_with_person_entities(get_default_fr):
    """PER entities are removed and a pseudonym from the "fr" list appears."""
    sentence = "Francois et Agathe sont amis."
    ner = [
        {
            "entity_group": "PER",
            "score": 0.99,
            "word": "Francois",
            "start": 0,
            "end": 8,
        },
        {
            "entity_group": "PER",
            "score": 0.99,
            "word": "Agathe",
            "start": 12,
            "end": 18,
        },
    ]
    # Guard the fixture itself: the offsets must point at the entity text.
    assert all(sentence[ent["start"] : ent["end"]] == ent["word"] for ent in ner)
    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "Francois" not in pseudonymized_sentence[0]
    assert "Agathe" not in pseudonymized_sentence[0]
    assert any(
        pseudo in pseudonymized_sentence[0]
        for pseudo in get_default_fr.pseudo_first_names["fr"]
    )


def test_pseudonymize_ne_with_location_entities(get_default_fr):
    """LOC entities are replaced by the ``[location]`` placeholder."""
    sentence = "Paris et New York sont des villes."
    ner = [
        {
            "entity_group": "LOC",
            "score": 0.99,
            "word": "Paris",
            "start": 0,
            "end": 5,
        },
        {
            "entity_group": "LOC",
            "score": 0.99,
            "word": "New York",
            "start": 9,
            "end": 17,
        },
    ]
    # Guard the fixture itself: the offsets must point at the entity text.
    assert all(sentence[ent["start"] : ent["end"]] == ent["word"] for ent in ner)
    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "Paris" not in pseudonymized_sentence[0]
    assert "New York" not in pseudonymized_sentence[0]
    assert "[location]" in pseudonymized_sentence[0]


def test_pseudonymize_ne_with_organization_entities(get_default_fr):
    """ORG entities are replaced by the ``[organization]`` placeholder."""
    sentence = "Google et Microsoft sont des géants de la technologie."
    ner = [
        {
            "entity_group": "ORG",
            "score": 0.99,
            "word": "Google",
            "start": 0,
            "end": 6,
        },
        {
            "entity_group": "ORG",
            "score": 0.99,
            "word": "Microsoft",
            "start": 10,
            "end": 19,
        },
    ]
    # Guard the fixture itself: the offsets must point at the entity text.
    assert all(sentence[ent["start"] : ent["end"]] == ent["word"] for ent in ner)
    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "Google" not in pseudonymized_sentence[0]
    assert "Microsoft" not in pseudonymized_sentence[0]
    assert "[organization]" in pseudonymized_sentence[0]


def test_pseudonymize_ne_with_misc_entities(get_default_fr):
    """MISC entities are replaced by the ``[misc]`` placeholder."""
    sentence = "La tour Eiffel est un monument célèbre."
    ner = [
        {
            "entity_group": "MISC",
            "score": 0.99,
            "word": "tour Eiffel",
            "start": 3,
            "end": 14,
        },
    ]
    # Guard the fixture itself: the offsets must point at the entity text.
    assert all(sentence[ent["start"] : ent["end"]] == ent["word"] for ent in ner)
    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "tour Eiffel" not in pseudonymized_sentence[0]
    assert "[misc]" in pseudonymized_sentence[0]