Skip to content

Commit

Permalink
NEW: Add PII classifier powered by presidio
Browse files Browse the repository at this point in the history
  • Loading branch information
ghisvail committed Jun 10, 2024
1 parent 29416d4 commit 22126eb
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 1 deletion.
37 changes: 37 additions & 0 deletions medkit/text/deid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Iterator

from medkit._import import import_optional
from medkit.core.text import Entity, span_utils
from medkit.core.text.operation import NEROperation

if TYPE_CHECKING:
from medkit.core.text import Segment

# presidio is loaded through medkit's optional-import helper rather than a
# plain `import`; the `extra` argument names the "deid" optional-dependency
# group (presumably so a missing install can be reported against that extra --
# confirm against medkit._import.import_optional).
presidio_analyzer = import_optional("presidio_analyzer", extra="deid")

# Public API of this module.
__all__ = ["PIIClassifier"]


class PIIClassifier(NEROperation):
    """Detect sensitive (PII) spans of text and expose them as entities.

    The text of each input segment is handed to a presidio
    ``AnalyzerEngine``. Every result the analyzer reports becomes one
    :class:`Entity` whose label is the result's entity type and whose
    metadata carries the result's score under the ``"score"`` key.
    """

    def __init__(self, uid: str | None = None, name: str | None = None, **kwargs):
        """Initialise the operation and create its presidio analyzer.

        Parameters
        ----------
        uid : str, optional
            Forwarded unchanged to the parent class initialiser.
        name : str, optional
            Forwarded unchanged to the parent class initialiser.
        **kwargs
            Any additional keyword arguments, forwarded to the parent class
            initialiser.
        """
        super().__init__(uid=uid, name=name, **kwargs)
        # A default-configured engine, built once and reused for every segment.
        self._analyzer = presidio_analyzer.AnalyzerEngine()

    def run(self, segments: list[Segment]) -> list[Entity]:
        """Analyze every segment and collect the entities that were found.

        Parameters
        ----------
        segments : list of Segment
            Segments whose text should be analyzed.

        Returns
        -------
        list of Entity
            Entities for all segments, in segment order, then in the order
            the analyzer reported them for each segment.
        """
        entities: list[Entity] = []
        for segment in segments:
            entities.extend(self._run_one(segment))
        return entities

    def _run_one(self, segment: Segment) -> Iterator[Entity]:
        """Lazily yield one entity per analyzer result for a single segment.

        Parameters
        ----------
        segment : Segment
            Segment providing the text to analyze and the spans used to
            re-anchor each result.

        Yields
        ------
        Entity
            Entity built from one analyzer result.
        """
        # The analysis language is fixed to English.
        for hit in self._analyzer.analyze(text=segment.text, language="en"):
            # The analyzer reports offsets relative to the segment text;
            # span_utils.extract turns that range into the matching text and
            # spans derived from the segment's own spans.
            char_range = (hit.start, hit.end)
            matched_text, matched_spans = span_utils.extract(
                text=segment.text,
                spans=segment.spans,
                ranges=[char_range],
            )
            entity_metadata = {"score": hit.score}
            yield Entity(
                label=hit.entity_type,
                text=matched_text,
                spans=matched_spans,
                metadata=entity_metadata,
            )
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ dependencies = [
[project.optional-dependencies]
deid = [
"presidio-analyzer >=2.2.33,<3",
"presidio-anonymizer >=2.2.33,<3",
]
edsnlp = [
"edsnlp>=0.9",
Expand Down

0 comments on commit 22126eb

Please sign in to comment.