diff --git a/mailcom/parse.py b/mailcom/parse.py
index 5c52e78..d44acda 100644
--- a/mailcom/parse.py
+++ b/mailcom/parse.py
@@ -145,6 +145,7 @@ def pseudonymize_ne(self, ner, sentence):
         for i in range(len(ner)):
             entity = ner[i]
             ent_string = entity["entity_group"]  # noqa
+            ent_word = entity["word"]
             # here we could check that string is "PER"
             ent_conf = entity["score"]  # noqa
             ent_position = entity["start"], entity["end"]
@@ -156,14 +157,19 @@ def pseudonymize_ne(self, ner, sentence):
             # replace PER
             if ent_string == "PER":
                 # add the name of this entity to list
-                nelist.append(entity["word"])
-            else:
-                # Locations and Organizations
-                new_sentence = (
-                    new_sentence[: (ent_position[0])]
-                    + "x" * (ent_position[1] - ent_position[0])
-                    + new_sentence[(ent_position[1]) :]  # noqa
-                )
+                nelist.append(ent_word)
+            # NOTE(review): str.replace swaps every occurrence of ent_word,
+            # including inside longer words; ent_position is not used here.
+            # Entity groups other than PER/LOC/ORG/MISC are left unchanged.
+            # replace LOC
+            elif ent_string == "LOC":
+                new_sentence = new_sentence.replace(ent_word, "[location]")
+            # replace ORG
+            elif ent_string == "ORG":
+                new_sentence = new_sentence.replace(ent_word, "[organization]")
+            # replace MISC
+            elif ent_string == "MISC":
+                new_sentence = new_sentence.replace(ent_word, "[misc]")
         # replace all unique PER now
         new_sentence = self.pseudonymize_per(new_sentence, nelist)
 
diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py
index cf82d63..fa586fb 100644
--- a/mailcom/test/test_parse.py
+++ b/mailcom/test/test_parse.py
@@ -114,33 +114,6 @@ def test_pseudonymize_per(get_default_fr):
     )
 
 
-def test_pseudonymize_ne(get_default_fr):
-    sentence = "Francois and Agathe are friends."
-    ner = [
-        {
-            "entity_group": "PER",
-            "score": 0.99,
-            "word": "Francois",
-            "start": 0,
-            "end": 8,
-        },
-        {
-            "entity_group": "PER",
-            "score": 0.99,
-            "word": "Agathe",
-            "start": 13,
-            "end": 19,
-        },
-    ]
-    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
-    assert "Francois" not in pseudonymized_sentence[0]
-    assert "Agathe" not in pseudonymized_sentence[0]
-    assert any(
-        pseudo in pseudonymized_sentence[0]
-        for pseudo in get_default_fr.pseudo_first_names["fr"]
-    )
-
-
 def test_pseudonymize_numbers(get_default_fr):
     sentence = "My phone number is 123-456-7890."
     pseudonymized_sentence = get_default_fr.pseudonymize_numbers(sentence)
@@ -222,3 +195,98 @@ def test_pseudonymize_email_addresses(get_default_fr):
     sentence = ""
     pseudonymized_sentence = get_default_fr.pseudonymize_email_addresses(sentence)
     assert pseudonymized_sentence == ""
+
+
+def test_pseudonymize_ne_with_person_entities(get_default_fr):
+    """PER words are removed and a French pseudonym first name appears."""
+    sentence = "Francois et Agathe sont amis."
+    ner = [
+        {
+            "entity_group": "PER",
+            "score": 0.99,
+            "word": "Francois",
+            "start": 0,
+            "end": 8,
+        },
+        {
+            "entity_group": "PER",
+            "score": 0.99,
+            "word": "Agathe",
+            "start": 12,
+            "end": 18,
+        },
+    ]
+    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
+    assert "Francois" not in pseudonymized_sentence[0]
+    assert "Agathe" not in pseudonymized_sentence[0]
+    assert any(
+        pseudo in pseudonymized_sentence[0]
+        for pseudo in get_default_fr.pseudo_first_names["fr"]
+    )
+
+
+def test_pseudonymize_ne_with_location_entities(get_default_fr):
+    """LOC words are replaced by the "[location]" placeholder."""
+    sentence = "Paris et New York sont des villes."
+    ner = [
+        {
+            "entity_group": "LOC",
+            "score": 0.99,
+            "word": "Paris",
+            "start": 0,
+            "end": 5,
+        },
+        {
+            "entity_group": "LOC",
+            "score": 0.99,
+            "word": "New York",
+            "start": 9,
+            "end": 17,
+        },
+    ]
+    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
+    assert "Paris" not in pseudonymized_sentence[0]
+    assert "New York" not in pseudonymized_sentence[0]
+    assert "[location]" in pseudonymized_sentence[0]
+
+
+def test_pseudonymize_ne_with_organization_entities(get_default_fr):
+    """ORG words are replaced by the "[organization]" placeholder."""
+    sentence = "Google et Microsoft sont des géants de la technologie."
+    ner = [
+        {
+            "entity_group": "ORG",
+            "score": 0.99,
+            "word": "Google",
+            "start": 0,
+            "end": 6,
+        },
+        {
+            "entity_group": "ORG",
+            "score": 0.99,
+            "word": "Microsoft",
+            "start": 10,
+            "end": 19,
+        },
+    ]
+    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
+    assert "Google" not in pseudonymized_sentence[0]
+    assert "Microsoft" not in pseudonymized_sentence[0]
+    assert "[organization]" in pseudonymized_sentence[0]
+
+
+def test_pseudonymize_ne_with_misc_entities(get_default_fr):
+    """MISC words are replaced by the "[misc]" placeholder."""
+    sentence = "La tour Eiffel est un monument célèbre."
+    ner = [
+        {
+            "entity_group": "MISC",
+            "score": 0.99,
+            "word": "tour Eiffel",
+            "start": 3,
+            "end": 14,
+        },
+    ]
+    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
+    assert "tour Eiffel" not in pseudonymized_sentence[0]
+    assert "[misc]" in pseudonymized_sentence[0]