Skip to content

Commit

Permalink
Only import PII constants during Curator import (#61)
Browse files Browse the repository at this point in the history
* Move PII constants to a seperate file that does not import presidio/spacy and other GPU dependencies

Signed-off-by: Ayush Dattagupta <[email protected]>

* Add comment around import, move constant import to global scope

Signed-off-by: Ayush Dattagupta <[email protected]>

---------

Signed-off-by: Ayush Dattagupta <[email protected]>
  • Loading branch information
ayushdg authored May 13, 2024
1 parent 06ee061 commit 38d8ce7
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 24 deletions.
4 changes: 2 additions & 2 deletions nemo_curator/modifiers/pii_modifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import pandas as pd

from nemo_curator.modifiers import DocumentModifier
from nemo_curator.pii.algorithm import DEFAULT_LANGUAGE
from nemo_curator.pii.constants import DEFAULT_LANGUAGE, DEFAULT_MAX_DOC_SIZE
from nemo_curator.utils.decorators import batched
from nemo_curator.utils.distributed_utils import load_object_on_worker

Expand Down Expand Up @@ -97,7 +97,7 @@ def load_deidentifier(self):

if self.device == "gpu":
spacy.require_gpu()
from nemo_curator.pii.algorithm import DEFAULT_MAX_DOC_SIZE, PiiDeidentifier
from nemo_curator.pii.algorithm import PiiDeidentifier

deidentifier: PiiDeidentifier = PiiDeidentifier(
language=self.language,
Expand Down
26 changes: 5 additions & 21 deletions nemo_curator/pii/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
from pathlib import Path
from typing import Any, List, Mapping, Union

# NOTE: Importing this module before cluster creation will create a primary CUDA context
# that leads to issues of all GPUs not being used when creating a cluster/client later on.
# Ensure that this module is always imported after cluster creation only when the algorithm
# needs to be executed. See: https://github.com/NVIDIA/NeMo-Curator/issues/64
import yaml
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NerModelConfiguration
Expand All @@ -30,36 +34,16 @@
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

from nemo_curator.pii.constants import DEFAULT_LANGUAGE, SUPPORTED_ENTITIES
from nemo_curator.pii.custom_batch_analyzer_engine import CustomBatchAnalyzerEngine
from nemo_curator.pii.custom_nlp_engine import CustomNlpEngine
from nemo_curator.pii.recognizers.address_recognizer import AddressRecognizer

__all__ = [
"DEFAULT_LANGUAGE",
"SUPPORTED_ENTITIES",
"DEFAULT_MAX_DOC_SIZE",
"PiiDeidentifier",
]


DEFAULT_LANGUAGE = "en"
SUPPORTED_ENTITIES = [
"ADDRESS",
"CREDIT_CARD",
"EMAIL_ADDRESS",
"DATE_TIME",
"IP_ADDRESS",
"LOCATION",
"PERSON",
"URL",
"US_SSN",
"US_PASSPORT",
"US_DRIVER_LICENSE",
"PHONE_NUMBER",
]
DEFAULT_MAX_DOC_SIZE = 2000000


class PiiDeidentifier(object):
"""Cleans PII from an unstructured text"""

Expand Down
20 changes: 20 additions & 0 deletions nemo_curator/pii/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
DEFAULT_LANGUAGE = "en"

SUPPORTED_ENTITIES = [
"ADDRESS",
"CREDIT_CARD",
"EMAIL_ADDRESS",
"DATE_TIME",
"IP_ADDRESS",
"LOCATION",
"PERSON",
"URL",
"US_SSN",
"US_PASSPORT",
"US_DRIVER_LICENSE",
"PHONE_NUMBER",
]

DEFAULT_MAX_DOC_SIZE = 2000000

__all__ = ["DEFAULT_LANGUAGE", "SUPPORTED_ENTITIES", "DEFAULT_MAX_DOC_SIZE"]
1 change: 0 additions & 1 deletion tests/test_pii_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from pathlib import Path

import pandas as pd
import pytest
from dask import dataframe as dd
from dask.distributed import Client, LocalCluster

Expand Down

0 comments on commit 38d8ce7

Please sign in to comment.