From cd41e08563e55ca481254baa7b4f4dfe68fa165b Mon Sep 17 00:00:00 2001 From: Akshay Karle <1443108+akshaykarle@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:30:21 +0100 Subject: [PATCH] update anonymize cli to support csv file anonymization as well --- src/cli.py | 30 +++++++++++++++--------------- src/text/__init__.py | 0 src/text/text.py | 21 --------------------- 3 files changed, 15 insertions(+), 36 deletions(-) delete mode 100644 src/text/__init__.py delete mode 100644 src/text/text.py diff --git a/src/cli.py b/src/cli.py index 015e8be..0ae7a7d 100644 --- a/src/cli.py +++ b/src/cli.py @@ -1,28 +1,26 @@ import argparse +from presidio_analyzer.analyzer_engine import AnalyzerEngine from presidio_anonymizer.entities.engine.result.operator_result import OperatorResult from analyzer_engine.csv_analyzer_engine import CSVAnalyzerEngine -from presidio_anonymizer import BatchAnonymizerEngine +from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine from config.nlp_engine_config import FlairNLPEngine from operators.vault import Vault -from text.text import text_analyzer, text_anonymizer NLP_ENGINE = "flair/ner-english-large" def analyze(args): analyzer_results = None + nlp_engine = FlairNLPEngine(NLP_ENGINE) if args.text: - analyzer_results = text_analyzer(args.text, args.language) + nlp_engine, registry = nlp_engine.create_nlp_engine() + engine = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine) + analyzer_results = engine.analyze(text=args.text, language=args.language) else: - nlp_engine = FlairNLPEngine(NLP_ENGINE) engine = CSVAnalyzerEngine(nlp_engine) - - analyzer_results = engine.analyze_csv( - csv_full_path=args.filepath, - language=args.language - ) + analyzer_results = engine.analyze_csv(csv_full_path=args.filepath, language=args.language) print(analyzer_results) return analyzer_results @@ -31,15 +29,17 @@ def analyze(args): def anonymize(args): analyzer_results = analyze(args) anonymized_results = None + anonymizer_engine = None + + if args.vaulturl: + anonymizer_engine = Vault(args.vaulturl, args.vaultkey, args.vaulttoken) + else: + anonymizer_engine = AnonymizerEngine() if args.text: - if args.vaulturl: - vault = Vault(args.vaulturl, args.vaultkey, args.vaulttoken) - anonymized_results = vault.anonymize(args.text, analyzer_results) - else: - anonymized_results = text_anonymizer(args.text, analyzer_results) + anonymized_results = anonymizer_engine.anonymize(args.text, analyzer_results) else: - anonymizer = BatchAnonymizerEngine() + anonymizer = BatchAnonymizerEngine(anonymizer_engine) anonymized_results = anonymizer.anonymize_dict(analyzer_results) print(anonymized_results) diff --git a/src/text/__init__.py b/src/text/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/text/text.py b/src/text/text.py deleted file mode 100644 index faf504f..0000000 --- a/src/text/text.py +++ /dev/null @@ -1,21 +0,0 @@ -from typing import Optional -from presidio_analyzer.analyzer_engine import AnalyzerEngine -from presidio_anonymizer.anonymizer_engine import AnonymizerEngine -from config.nlp_engine_config import FlairNLPEngine - -NLP_ENGINE = "flair/ner-english-large" - -def text_analyzer(text, language): - nlp_engine = FlairNLPEngine(NLP_ENGINE) - nlp_engine, registry = nlp_engine.create_nlp_engine() - engine = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine) - - return engine.analyze( - text=text, - language=language - ) - - -def text_anonymizer(text: str, analyzer_results, operators: Optional[dict] = None): - anonymizer = AnonymizerEngine() - return anonymizer.anonymize(text, analyzer_results, operators)