Skip to content

Commit

Permalink
Replaced locations, organizations and misc with brackets
Browse files Browse the repository at this point in the history
  • Loading branch information
fexfl committed Oct 31, 2024
1 parent 799faf6 commit 8b53faa
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 35 deletions.
19 changes: 11 additions & 8 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def pseudonymize_ne(self, ner, sentence):
for i in range(len(ner)):
entity = ner[i]
ent_string = entity["entity_group"] # noqa
ent_word = entity["word"]
# here we could check that string is "PER"
ent_conf = entity["score"] # noqa
ent_position = entity["start"], entity["end"]
Expand All @@ -156,14 +157,16 @@ def pseudonymize_ne(self, ner, sentence):
# replace PER
if ent_string == "PER":
# add the name of this entity to list
nelist.append(entity["word"])
else:
# Locations and Organizations
new_sentence = (
new_sentence[: (ent_position[0])]
+ "x" * (ent_position[1] - ent_position[0])
+ new_sentence[(ent_position[1]) :] # noqa
)
nelist.append(ent_word)
# replace LOC
elif ent_string == "LOC":
new_sentence = new_sentence.replace(ent_word, "[location]")
# replace ORG
elif ent_string == "ORG":
new_sentence = new_sentence.replace(ent_word, "[organization]")
# replace MISC
elif ent_string == "MISC":
new_sentence = new_sentence.replace(ent_word, "[misc]")
# replace all unique PER now
new_sentence = self.pseudonymize_per(new_sentence, nelist)

Expand Down
118 changes: 91 additions & 27 deletions mailcom/test/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,33 +114,6 @@ def test_pseudonymize_per(get_default_fr):
)


def test_pseudonymize_ne(get_default_fr):
    """Person entities must be swapped for a French pseudonym first name.

    Feeds two hand-written PER entities to ``pseudonymize_ne`` and checks
    the first element of the result: neither original name may remain, and
    at least one name from ``pseudo_first_names["fr"]`` must appear.
    """
    text = "Francois and Agathe are friends."
    francois = {
        "entity_group": "PER",
        "score": 0.99,
        "word": "Francois",
        "start": 0,
        "end": 8,
    }
    agathe = {
        "entity_group": "PER",
        "score": 0.99,
        "word": "Agathe",
        "start": 13,
        "end": 19,
    }
    result = get_default_fr.pseudonymize_ne([francois, agathe], text)
    replaced_text = result[0]

    # The real names must be gone from the output text.
    assert "Francois" not in replaced_text
    assert "Agathe" not in replaced_text

    # At least one configured pseudonym must have been inserted instead.
    candidates = get_default_fr.pseudo_first_names["fr"]
    assert any(candidate in replaced_text for candidate in candidates)


def test_pseudonymize_numbers(get_default_fr):
sentence = "My phone number is 123-456-7890."
pseudonymized_sentence = get_default_fr.pseudonymize_numbers(sentence)
Expand Down Expand Up @@ -222,3 +195,94 @@ def test_pseudonymize_email_addresses(get_default_fr):
sentence = ""
pseudonymized_sentence = get_default_fr.pseudonymize_email_addresses(sentence)
assert pseudonymized_sentence == ""


def test_pseudonymize_ne_with_person_entities(get_default_fr):
    """PER entities are replaced by a pseudonym from the French name list.

    The entity dicts are hand-written with the keys ``entity_group``,
    ``score``, ``word``, ``start`` and ``end``. ``start``/``end`` are the
    character offsets of ``word`` inside ``sentence``.
    """
    sentence = "Francois et Agathe sont amis."
    ner = [
        {
            "entity_group": "PER",
            "score": 0.99,
            "word": "Francois",
            "start": 0,
            "end": 8,
        },
        {
            "entity_group": "PER",
            "score": 0.99,
            "word": "Agathe",
            # "et" is one character shorter than the "and" this fixture was
            # originally written for, so the span is 12-18, not 13-19.
            "start": 12,
            "end": 18,
        },
    ]
    # Guard the fixture itself: every span must really cover its word.
    for entity in ner:
        assert sentence[entity["start"] : entity["end"]] == entity["word"]

    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "Francois" not in pseudonymized_sentence[0]
    assert "Agathe" not in pseudonymized_sentence[0]
    assert any(
        pseudo in pseudonymized_sentence[0]
        for pseudo in get_default_fr.pseudo_first_names["fr"]
    )


def test_pseudonymize_ne_with_location_entities(get_default_fr):
    """LOC entities are replaced by the ``[location]`` placeholder.

    ``start``/``end`` in each entity dict are the character offsets of
    ``word`` inside ``sentence``.
    """
    sentence = "Paris et New York sont des villes."
    ner = [
        {
            "entity_group": "LOC",
            "score": 0.99,
            "word": "Paris",
            "start": 0,
            "end": 5,
        },
        {
            "entity_group": "LOC",
            "score": 0.99,
            "word": "New York",
            # "New York" occupies characters 9-17 (was off by one: 10-18).
            "start": 9,
            "end": 17,
        },
    ]
    # Guard the fixture itself: every span must really cover its word.
    for entity in ner:
        assert sentence[entity["start"] : entity["end"]] == entity["word"]

    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "Paris" not in pseudonymized_sentence[0]
    assert "New York" not in pseudonymized_sentence[0]
    assert "[location]" in pseudonymized_sentence[0]


def test_pseudonymize_ne_with_organization_entities(get_default_fr):
    """ORG entities are replaced by the ``[organization]`` placeholder.

    ``start``/``end`` in each entity dict are the character offsets of
    ``word`` inside ``sentence``.
    """
    sentence = "Google et Microsoft sont des géants de la technologie."
    ner = [
        {
            "entity_group": "ORG",
            "score": 0.99,
            "word": "Google",
            "start": 0,
            "end": 6,
        },
        {
            "entity_group": "ORG",
            "score": 0.99,
            "word": "Microsoft",
            # "Microsoft" occupies characters 10-19 (was off by one: 11-20).
            "start": 10,
            "end": 19,
        },
    ]
    # Guard the fixture itself: every span must really cover its word.
    for entity in ner:
        assert sentence[entity["start"] : entity["end"]] == entity["word"]

    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "Google" not in pseudonymized_sentence[0]
    assert "Microsoft" not in pseudonymized_sentence[0]
    assert "[organization]" in pseudonymized_sentence[0]


def test_pseudonymize_ne_with_misc_entities(get_default_fr):
    """MISC entities are replaced by the ``[misc]`` placeholder.

    ``start``/``end`` in the entity dict are the character offsets of
    ``word`` inside ``sentence``.
    """
    sentence = "La tour Eiffel est un monument célèbre."
    ner = [
        {
            "entity_group": "MISC",
            "score": 0.99,
            "word": "tour Eiffel",
            # "tour Eiffel" is 11 characters at offset 3, i.e. span 3-14
            # (the previous 4-16 was both shifted and one character too long).
            "start": 3,
            "end": 14,
        },
    ]
    # Guard the fixture itself: every span must really cover its word.
    for entity in ner:
        assert sentence[entity["start"] : entity["end"]] == entity["word"]

    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "tour Eiffel" not in pseudonymized_sentence[0]
    assert "[misc]" in pseudonymized_sentence[0]

0 comments on commit 8b53faa

Please sign in to comment.