Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issues with outdated build process #153

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ jobs:
export LD_LIBRARY_PATH=${{ env.LD_LIBRARY_PATH }}
export C_INCLUDE_PATH=${{ env.C_INCLUDE_PATH }}
export CPP_INCLUDE_PATH=${{ env.CPP_INCLUDE_PATH }}
python -m pip install --upgrade pip wheel setuptools
python -m pip install --upgrade pip wheel setuptools Cython
pip install -r requirements/python-dev

- name: Cache restore nltk data
Expand Down
4 changes: 2 additions & 2 deletions requirements/python-dev
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# needed to run the tests
flake8
coveralls
nose
pytest
mypy
tox

Expand All @@ -13,7 +13,7 @@ sphinx>=3
sphinx_rtd_theme>=0.5

# This is for the tests/benchmark_accuracy_real_data.py script
cchardet
chardet
pandas
click
python-magic
Expand Down
6 changes: 3 additions & 3 deletions scrubadub/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from .filth import Filth
from .detectors.tagged import KnownFilthItem

from typing import List, Dict, Union, Optional, Tuple, Callable, Iterable, Type, Set
from typing import List, Dict, Union, Optional, Tuple, Callable, Iterable, Type
import numpy as np
import pandas as pd
import sklearn.metrics
Expand All @@ -27,8 +27,8 @@ class TextPosition(ToStringMixin):
def __init__(self, filth: Filth, grouping_function: GroupingFunction):
self.beg = filth.beg
self.end = filth.end
self.detected = set() # type: Set[Tuple[str, ...]]
self.tagged = set() # type: Set[Tuple[str, ...]]
self.detected: set[Tuple[str, ...]] = set()
self.tagged: set[Tuple[str, ...]] = set()
self.document_name = str(filth.document_name or '') # type: str

if isinstance(filth, filth_module.TaggedEvaluationFilth):
Expand Down
14 changes: 7 additions & 7 deletions scrubadub/detectors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ class Detector(object):
```Detector.supported_local()``` function.
"""

filth_cls = Filth # type: ClassVar[Type[Filth]]
name = 'detector' # type: str
autoload = False # type: bool
filth_cls: ClassVar[Type[Filth]] = Filth
name: str = 'detector'
autoload: bool = False

def __init__(self, name: Optional[str] = None, locale: str = 'en_US'):
"""Initialise the ``Detector``.
Expand All @@ -46,7 +46,7 @@ def __init__(self, name: Optional[str] = None, locale: str = 'en_US'):
:type locale: str, optional
"""
if getattr(self, 'name', 'detector') == 'detector' and getattr(self, 'filth_cls', None) is not None:
if getattr(self.filth_cls, 'type', None) is not None and type(self) != Detector:
if getattr(self.filth_cls, 'type', None) is not None and type(self) is not Detector:
self.name = self.filth_cls.type
warnings.warn(
"Setting the detector name from the filth_cls.type is depreciated, please declare an explicit name"
Expand Down Expand Up @@ -111,8 +111,8 @@ class RegexDetector(Detector):
'This url will be found {{URL}}'
"""

regex = None # type: Optional[Pattern[str]]
filth_cls = Filth # type: ClassVar[Type[Filth]]
regex: Optional[Pattern[str]] = None
filth_cls: ClassVar[Type[Filth]] = Filth

def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
"""Yields discovered filth in the provided ``text``.
Expand Down Expand Up @@ -145,7 +145,7 @@ def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generato

class RegionLocalisedRegexDetector(RegexDetector):
"""Detector to detect ``Filth`` localised using regular expressions localised by the region"""
region_regex = {} # type: Dict[str, Pattern]
region_regex: Dict[str, Pattern] = {}

def __init__(self, **kwargs):
"""Initialise the ``Detector``.
Expand Down
3 changes: 2 additions & 1 deletion scrubadub/detectors/credit_card.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,13 @@ class CreditCardDetector(RegexDetector):
# TODO: regex doesn't match if the credit card number contains spaces or dashes

regex = re.compile((
r"(?<=\s)"
r"\b"
r"(?:4[0-9]{12}(?:[0-9]{3})?" # Visa
r"|(?:5[1-5][0-9]{2}" # MasterCard
r"|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}"
r"|3[47][0-9]{13}" # American Express
r"|3(?:0[0-5]|[68][0-9])[0-9]{11}" # Diners Club
r"|6(?:011|5[0-9]{2})[0-9]{12}" # Discover
r"|(?:2131|1800|35\d{3})\d{11})" # JCB
r"\b"
), re.VERBOSE)
4 changes: 4 additions & 0 deletions scrubadub/detectors/date_of_birth.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ def __init__(self, context_before: int = 2, context_after: int = 1, require_cont
self.context_after = context_after
self.require_context = require_context

if self.language is None:
raise ValueError("Langauge is not set.")
try:
self.context_words = self.context_words_language_map[self.language]
except KeyError:
Expand All @@ -100,6 +102,8 @@ def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generato
"""

# using the dateparser lib - locale can be set here
if self.language is None:
raise ValueError("Langauge is not set.")
try:
date_picker = search_dates(text, languages=[self.language])
except RecursionError:
Expand Down
8 changes: 4 additions & 4 deletions scrubadub/filth/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,18 @@ class Filth(object):

# this allows people to customize the output, especially for placeholder
# text and identifier replacements
prefix = u'{{' # type: ClassVar[str]
suffix = u'}}' # type: ClassVar[str]
prefix: ClassVar[str] = u'{{'
suffix: ClassVar[str] = u'}}'

# the `type` is used when filths are merged to come up with a sane label
type = 'unknown' # type: ClassVar[str]
type: ClassVar[str] = 'unknown'

# the `lookup` is used to keep track of all of the different types of filth
# that are encountered across all `Filth` types.
lookup = utils.Lookup()

# For backwards compatibility, but this is deprecated.
regex = None # type: Optional[Pattern[str]]
regex: Optional[Pattern[str]] = None

def __init__(self, beg: Optional[int] = None, end: Optional[int] = None, text: Optional[str] = None,
match: Optional[Match] = None, detector_name: Optional[str] = None,
Expand Down
3 changes: 1 addition & 2 deletions scrubadub/filth/phone.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import phonenumbers

from faker import Faker
from typing import List

from .base import Filth
from .. import utils
Expand All @@ -22,7 +21,7 @@ def generate(faker: Faker) -> str:
"""
phone_number = ''
language, region = utils.locale_split(faker._locales[0])
results = [] # type: List[phonenumbers.PhoneNumberMatch]
results = [] # type: list[phonenumbers.PhoneNumberMatch]
# Here I'm filtering for numbers that pass validation by the phonenumbers package
while len(results) < 1:
# Faker generates random numbers of the right format eg (###)###-####
Expand Down
10 changes: 6 additions & 4 deletions scrubadub/post_processors/filth_replacer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import math
import hashlib

from typing import Sequence, Optional, Union, Dict
from typing import Sequence, Optional, Union
from collections import defaultdict

from scrubadub.filth import Filth, MergedFilth, TaggedEvaluationFilth
Expand Down Expand Up @@ -43,7 +43,7 @@ class FilthReplacer(PostProcessor):
# NOTE: this is not an efficient way to store this in memory. could
# alternatively hash the type and text and do away with the overhead
# bits of storing the tuple in the lookup
typed_lookup = defaultdict(lambda: utils.Lookup(), {}) # type: Dict[str, utils.Lookup]
typed_lookup = defaultdict(lambda: utils.Lookup(), {}) # type: dict[str, utils.Lookup]

def __init__(self, include_type: bool = True, include_count: bool = False, include_hash: bool = False,
uppercase: bool = True, separator: Optional[str] = None, hash_length: Optional[int] = None,
Expand Down Expand Up @@ -101,9 +101,11 @@ def filth_label(self, filth: Filth) -> str:
replacement_pieces = []

if self.include_type:
filth_type = getattr(f, 'type', None)
if filth_type is None:
filth_type_check: Optional[str] = getattr(f, 'type', None)
if filth_type_check is None:
continue
else:
filth_type: str = filth_type_check
if filth_type == TaggedEvaluationFilth.type:
filth_comparison_type = getattr(f, 'comparison_type', None)
if filth_comparison_type is not None:
Expand Down
5 changes: 3 additions & 2 deletions scrubadub/scrubbers.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,8 @@ def _check_and_add_detector(self, detector: Detector, warn: bool = False):
) % locals())
self._detectors[name] = detector

def add_post_processor(self, post_processor: Union[PostProcessor, Type[PostProcessor], str], index: int = None):
def add_post_processor(self, post_processor: Union[PostProcessor, Type[PostProcessor], str],
index: Optional[int] = None):
"""Add a ``PostProcessor`` to a Scrubber

You can add a post-processor to a ``Scrubber`` by passing one of three objects to this function:
Expand Down Expand Up @@ -215,7 +216,7 @@ def remove_post_processor(self, post_processor: Union[PostProcessor, Type[PostPr
elif isinstance(post_processor, str):
self._post_processors = [x for x in self._post_processors if x.name != post_processor]

def _check_and_add_post_processor(self, post_processor: PostProcessor, index: int = None):
def _check_and_add_post_processor(self, post_processor: PostProcessor, index: Optional[int] = None):
"""Check the types and add the PostProcessor to the scrubber"""
if not isinstance(post_processor, PostProcessor):
raise TypeError((
Expand Down
4 changes: 1 addition & 3 deletions tests/benchmark_accuracy_real_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
import click
import magic
import dotenv
# import chardet
# try a new chardet package; it's a drop-in replacement based on a Mozilla project.
import cchardet as chardet
import chardet
import logging
import posixpath
import azure.storage.blob
Expand Down
5 changes: 2 additions & 3 deletions tests/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,10 @@
tests = [
"mypy --config-file setup.cfg scrubadub/",
"flake8 --config setup.cfg scrubadub/",
# If py3.5 then examples with spacy don't work so disable doctests
'if python3 --version | grep -Evq "Python (3\\.5\\.)" ; then nosetests --with-doctest --doctest-extension=rst ./tests/ ./scrubadub/ ./docs/ ; else nosetests ; fi',
'pytest --doctest-glob="*.rst" ./tests/ ./scrubadub/ ./docs/',
"python3 ./tests/benchmark_accuracy.py --fast",
"python3 ./tests/benchmark_time.py",
'if python3 --version | grep -Evq "Python (3\\.5\\.)" ; then cd docs && make html && cd - ; fi',
'cd docs && make html && cd -',
]


Expand Down
6 changes: 3 additions & 3 deletions tests/test_comparison_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,11 +246,11 @@ def test_filth_grouper(self):
self.assertEqual(['filth', 'detector', 'locale'], df.columns.names)
self.assertEqual(
[
('name', 'tagged', 'en_US'),
('phone', 'phone', 'en_GB'),
('phone', 'phone', 'en_US'),
('phone', 'tagged', 'en_GB'),
('phone', 'tagged', 'en_US')
('phone', 'phone', 'en_US'),
('phone', 'tagged', 'en_US'),
('name', 'tagged', 'en_US'),
],
df.columns.values.tolist(),
)
Expand Down
7 changes: 7 additions & 0 deletions tests/test_detector_credit_card.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,10 @@ def test_visa2(self):
AFTER: My credit card is {{CREDIT_CARD}}.
"""
self.compare_before_after()

def test_start_of_string(self):
"""
BEFORE: 4012888888881881.
AFTER: {{CREDIT_CARD}}.
"""
self.compare_before_after()